From b1dddddbe08282468d3ae289772f1c7fc6e16ad2 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Sat, 30 Apr 2022 21:47:11 +0800 Subject: [PATCH 01/93] add vector server, test=doc --- demos/speaker_verification/README.md | 2 +- demos/speaker_verification/README_cn.md | 6 +- .../streaming_asr_server/websocket_client.py | 9 +- paddlespeech/cli/vector/infer.py | 3 +- paddlespeech/server/README_cn.md | 20 ++ .../server/bin/paddlespeech_client.py | 99 ++++++++- paddlespeech/server/conf/application.yaml | 13 +- .../server/conf/vector_application.yaml | 32 +++ .../server/engine/asr/online/asr_engine.py | 8 + paddlespeech/server/engine/engine_factory.py | 3 + paddlespeech/server/engine/vector/__init__.py | 0 .../server/engine/vector/python/__init__.py | 0 .../engine/vector/python/vector_engine.py | 200 ++++++++++++++++++ paddlespeech/server/restful/api.py | 4 +- paddlespeech/server/restful/request.py | 39 +++- paddlespeech/server/restful/response.py | 56 +++++ paddlespeech/server/restful/vector_api.py | 151 +++++++++++++ paddlespeech/server/utils/audio_handler.py | 101 +++++++++ 18 files changed, 735 insertions(+), 11 deletions(-) create mode 100644 paddlespeech/server/conf/vector_application.yaml create mode 100644 paddlespeech/server/engine/vector/__init__.py create mode 100644 paddlespeech/server/engine/vector/python/__init__.py create mode 100644 paddlespeech/server/engine/vector/python/vector_engine.py create mode 100644 paddlespeech/server/restful/vector_api.py diff --git a/demos/speaker_verification/README.md b/demos/speaker_verification/README.md index b79f3f7a..b6a1d9bc 100644 --- a/demos/speaker_verification/README.md +++ b/demos/speaker_verification/README.md @@ -14,7 +14,7 @@ see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/doc You can choose one way from easy, meduim and hard to install paddlespeech. ### 2. Prepare Input File -The input of this demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. 
+The input of this cli demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. Here are sample files for this demo that can be downloaded: ```bash diff --git a/demos/speaker_verification/README_cn.md b/demos/speaker_verification/README_cn.md index db382f29..90bba38a 100644 --- a/demos/speaker_verification/README_cn.md +++ b/demos/speaker_verification/README_cn.md @@ -4,16 +4,16 @@ ## 介绍 声纹识别是一项用计算机程序自动提取说话人特征的技术。 -这个 demo 是一个从给定音频文件提取说话人特征,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。 +这个 demo 是从一个给定音频文件中提取说话人特征,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。 ## 使用方法 ### 1. 安装 请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。 -你可以从 easy,medium,hard 三中方式中选择一种方式安装。 +你可以从easy medium,hard 三种方式中选择一种方式安装。 ### 2. 准备输入 -这个 demo 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 +声纹cli demo 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 可以下载此 demo 的示例音频: ```bash diff --git a/demos/streaming_asr_server/websocket_client.py b/demos/streaming_asr_server/websocket_client.py index 523ef482..3cadd72a 100644 --- a/demos/streaming_asr_server/websocket_client.py +++ b/demos/streaming_asr_server/websocket_client.py @@ -28,6 +28,7 @@ def main(args): handler = ASRWsAudioHandler( args.server_ip, args.port, + endpoint=args.endpoint, punc_server_ip=args.punc_server_ip, punc_server_port=args.punc_server_port) loop = asyncio.get_event_loop() @@ -36,7 +37,7 @@ def main(args): if args.wavfile and os.path.exists(args.wavfile): logger.info(f"start to process the wavscp: {args.wavfile}") result = loop.run_until_complete(handler.run(args.wavfile)) - result = result["result"] + # result = result["result"] logger.info(f"asr websocket client finished : {result}") # support to process batch audios from wav.scp @@ -69,7 +70,11 @@ if __name__ == "__main__": default=8091, dest="punc_server_port", help='Punctuation server port') - + parser.add_argument( + "--endpoint", + type=str, + default="/paddlespeech/asr/streaming", + help="ASR websocket 
endpoint") parser.add_argument( "--wavfile", action="store", diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 37e19391..8afb0f5c 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -272,7 +272,8 @@ class VectorExecutor(BaseExecutor): model_type: str='ecapatdnn_voxceleb12', sample_rate: int=16000, cfg_path: Optional[os.PathLike]=None, - ckpt_path: Optional[os.PathLike]=None): + ckpt_path: Optional[os.PathLike]=None, + task=None): """Init the neural network from the model path Args: diff --git a/paddlespeech/server/README_cn.md b/paddlespeech/server/README_cn.md index e799bca8..010d3d51 100644 --- a/paddlespeech/server/README_cn.md +++ b/paddlespeech/server/README_cn.md @@ -63,3 +63,23 @@ paddlespeech_server start --config_file conf/tts_online_application.yaml ``` paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --input "您好,欢迎使用百度飞桨深度学习框架!" --output output.wav ``` + +## 声纹识别 + +### 启动声纹识别服务 + +``` +paddlespeech_server start --config_file conf/vector_application.yaml +``` + +### 获取说话人音频声纹 + +``` +paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8090 --input 85236145389.wav +``` + +### 两个说话人音频声纹打分 + +``` +paddlespeech_client vector --task score --server_ip 127.0.0.1 --port 8090 --enroll 123456789.wav --test 85236145389.wav +``` \ No newline at end of file diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 2f1ce385..f1f02d16 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -35,7 +35,7 @@ from paddlespeech.server.utils.util import wav2base64 __all__ = [ 'TTSClientExecutor', 'TTSOnlineClientExecutor', 'ASRClientExecutor', - 'ASROnlineClientExecutor', 'CLSClientExecutor' + 'ASROnlineClientExecutor', 'CLSClientExecutor', 'VectorClientExecutor' ] @@ -583,3 +583,100 @@ class TextClientExecutor(BaseExecutor): response_dict = res.json() 
punc_text = response_dict["result"]["punc_text"] return punc_text + + +@cli_client_register( + name='paddlespeech_client.vector', description='visit the vector service') +class VectorClientExecutor(BaseExecutor): + def __init__(self): + super(VectorClientExecutor, self).__init__() + self.parser = argparse.ArgumentParser( + prog='paddlespeech_client.vector', add_help=True) + self.parser.add_argument( + '--server_ip', type=str, default='127.0.0.1', help='server ip') + self.parser.add_argument( + '--port', type=int, default=8090, help='server port') + self.parser.add_argument( + '--input', + type=str, + default=None, + help='sentence to be process by text server.') + self.parser.add_argument( + '--task', type=str, default="spk", help="The vector service task") + self.parser.add_argument( + "--enroll", type=str, default=None, help="The enroll audio") + self.parser.add_argument( + "--test", type=str, default=None, help="The test audio") + + def execute(self, argv: List[str]) -> bool: + """Execute the request from the argv. + + Args: + argv (List): the request arguments + + Returns: + str: the request flag + """ + args = self.parser.parse_args(argv) + input_ = args.input + server_ip = args.server_ip + port = args.port + task = args.task + + try: + time_start = time.time() + res = self( + input=input_, + server_ip=server_ip, + port=port, + enroll_audio=args.enroll, + test_audio=args.test, + task=task) + time_end = time.time() + logger.info(f"The vector: {res}") + logger.info("Response time %f s." % (time_end - time_start)) + return True + except Exception as e: + logger.error("Failed to extract vector.") + logger.error(e) + return False + + @stats_wrapper + def __call__(self, + input: str, + server_ip: str="127.0.0.1", + port: int=8090, + audio_format: str="wav", + sample_rate: int=16000, + enroll_audio: str=None, + test_audio: str=None, + task="spk"): + """ + Python API to call text executor. 
+ + Args: + input (str): the request sentence text + server_ip (str, optional): the server ip. Defaults to "127.0.0.1". + port (int, optional): the server port. Defaults to 8090. + + Returns: + str: the punctuation text + """ + if task == "spk": + from paddlespeech.server.utils.audio_handler import VectorHttpHandler + logger.info("vector http client start") + logger.info(f"the input audio: {input}") + handler = VectorHttpHandler(server_ip=server_ip, port=port) + res = handler.run(input, audio_format, sample_rate) + return res + elif task == "score": + from paddlespeech.server.utils.audio_handler import VectorScoreHttpHandler + logger.info("vector score http client start") + logger.info( + f"enroll audio: {enroll_audio}, test audio: {test_audio}") + handler = VectorScoreHttpHandler(server_ip=server_ip, port=port) + res = handler.run(enroll_audio, test_audio, audio_format, + sample_rate) + logger.info(f"The vector score is: {res}") + else: + logger.error(f"Sorry, we have not support such task {task}") diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml index c8753059..b6a9942e 100644 --- a/paddlespeech/server/conf/application.yaml +++ b/paddlespeech/server/conf/application.yaml @@ -11,7 +11,7 @@ port: 8090 # protocol = ['websocket', 'http'] (only one can be selected). # http only support offline engine type. 
protocol: 'http' -engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python'] +engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python'] ################################################################################# @@ -166,4 +166,15 @@ text_python: cfg_path: # [optional] ckpt_path: # [optional] vocab_file: # [optional] + device: # set 'gpu:id' or 'cpu' + + +################################### Vector ###################################### +################### Vector task: spk; engine_type: python ####################### +vector_python: + task: spk + model_type: 'ecapatdnn_voxceleb12' + sample_rate: 16000 + cfg_path: # [optional] + ckpt_path: # [optional] device: # set 'gpu:id' or 'cpu' \ No newline at end of file diff --git a/paddlespeech/server/conf/vector_application.yaml b/paddlespeech/server/conf/vector_application.yaml new file mode 100644 index 00000000..c78659e3 --- /dev/null +++ b/paddlespeech/server/conf/vector_application.yaml @@ -0,0 +1,32 @@ +# This is the parameter configuration file for PaddleSpeech Serving. + +################################################################################# +# SERVER SETTING # +################################################################################# +host: 0.0.0.0 +port: 8090 + +# The task format in the engin_list is: _ +# protocol = ['http'] (only one can be selected). +# http only support offline engine type. 
+protocol: 'http' +engine_list: ['vector_python'] + + +################################################################################# +# ENGINE CONFIG # +################################################################################# + +################################### Vector ###################################### +################### Vector task: spk; engine_type: python ####################### +vector_python: + task: spk + model_type: 'ecapatdnn_voxceleb12' + sample_rate: 16000 + cfg_path: # [optional] + ckpt_path: # [optional] + device: # set 'gpu:id' or 'cpu' + + + + diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 990590b4..2e61bb4e 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -13,6 +13,7 @@ # limitations under the License. import copy import os +import time from typing import Optional import numpy as np @@ -153,6 +154,12 @@ class PaddleASRConnectionHanddler: self.n_shift = self.preprocess_conf.process[0]['n_shift'] def extract_feat(self, samples): + + # we compute the elapsed time of first char occuring + # and we record the start time at the first pcm sample arraving + # if self.first_char_occur_elapsed is not None: + # self.first_char_occur_elapsed = time.time() + if "deepspeech2online" in self.model_type: # self.reamined_wav stores all the samples, # include the original remained_wav and this package samples @@ -290,6 +297,7 @@ class PaddleASRConnectionHanddler: self.chunk_num = 0 self.global_frame_offset = 0 self.result_transcripts = [''] + self.first_char_occur_elapsed = None def decode(self, is_finished=False): if "deepspeech2online" in self.model_type: diff --git a/paddlespeech/server/engine/engine_factory.py b/paddlespeech/server/engine/engine_factory.py index 30e48de7..6cf95d75 100644 --- a/paddlespeech/server/engine/engine_factory.py +++ b/paddlespeech/server/engine/engine_factory.py @@ 
-49,5 +49,8 @@ class EngineFactory(object): elif engine_name.lower() == 'text' and engine_type.lower() == 'python': from paddlespeech.server.engine.text.python.text_engine import TextEngine return TextEngine() + elif engine_name.lower() == 'vector' and engine_type.lower() == 'python': + from paddlespeech.server.engine.vector.python.vector_engine import VectorEngine + return VectorEngine() else: return None diff --git a/paddlespeech/server/engine/vector/__init__.py b/paddlespeech/server/engine/vector/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/paddlespeech/server/engine/vector/python/__init__.py b/paddlespeech/server/engine/vector/python/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py new file mode 100644 index 00000000..866c2229 --- /dev/null +++ b/paddlespeech/server/engine/vector/python/vector_engine.py @@ -0,0 +1,200 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import io +from collections import OrderedDict + +import numpy as np +import paddle + +from paddleaudio.backends import load as load_audio +from paddleaudio.compliance.librosa import melspectrogram +from paddlespeech.cli.log import logger +from paddlespeech.cli.vector.infer import VectorExecutor +from paddlespeech.server.engine.base_engine import BaseEngine +from paddlespeech.vector.io.batch import feature_normalize + + +class PaddleVectorConnectionHandler: + def __init__(self, vector_engine): + """The PaddleSpeech Vector Server Connection Handler + This connection process every server request + Args: + vector_engine (VectorEngine): The Vector engine + """ + super().__init__() + logger.info( + "Create PaddleVectorConnectionHandler to process the vector request") + self.vector_engine = vector_engine + self.executor = self.vector_engine.executor + self.task = self.vector_engine.executor.task + self.model = self.vector_engine.executor.model + self.config = self.vector_engine.executor.config + + self._inputs = OrderedDict() + self._outputs = OrderedDict() + + @paddle.no_grad() + def run(self, audio_data, task="spk"): + """The connection process the http request audio + + Args: + audio_data (bytes): base64.b64decode + + Returns: + str: the punctuation text + """ + logger.info( + f"start to extract the do vector {self.task} from the http request") + if self.task == "spk" and task == "spk": + embedding = self.extract_audio_embedding(audio_data) + return embedding + else: + logger.error( + "The request task is not matched with server model task") + logger.error( + f"The server model task is: {self.task}, but the request task is: {task}" + ) + + return np.array([ + 0.0, + ]) + + @paddle.no_grad() + def get_enroll_test_score(self, enroll_audio, test_audio): + """Get the enroll and test audio score + + Args: + enroll_audio (str): the base64 format enroll audio + test_audio (str): the base64 format test audio + + Returns: + float: the score between enroll and test audio + """ 
+ logger.info("start to extract the enroll audio embedding") + enroll_emb = self.extract_audio_embedding(enroll_audio) + + logger.info("start to extract the test audio embedding") + test_emb = self.extract_audio_embedding(test_audio) + + logger.info( + "start to get the score between the enroll and test embedding") + score = self.executor.get_embeddings_score(enroll_emb, test_emb) + + logger.info(f"get the enroll vs test score: {score}") + return score + + @paddle.no_grad() + def extract_audio_embedding(self, audio: str, sample_rate: int=16000): + """extract the audio embedding + + Args: + audio (_type_): _description_ + sample_rate (int, optional): _description_. Defaults to 16000. + """ + # we can not reuse the cache io.BytesIO(audio) data, + # because the soundfile will change the io.BytesIO(audio) to the end + # thus we should convert the base64 string to io.BytesIO when we need the audio data + if not self.executor._check(io.BytesIO(audio), sample_rate): + logger.info("check the audio sample rate occurs error") + return np.array([0.0]) + + waveform, sr = load_audio(io.BytesIO(audio)) + logger.info(f"load the audio sample points, shape is: {waveform.shape}") + + # stage 2: get the audio feat + # Note: Now we only support fbank feature + try: + feats = melspectrogram( + x=waveform, + sr=self.config.sr, + n_mels=self.config.n_mels, + window_size=self.config.window_size, + hop_length=self.config.hop_size) + logger.info(f"extract the audio feats, shape is: {feats.shape}") + except Exception as e: + logger.info(f"feats occurs exception {e}") + sys.exit(-1) + + feats = paddle.to_tensor(feats).unsqueeze(0) + # in inference period, the lengths is all one without padding + lengths = paddle.ones([1]) + + # stage 3: we do feature normalize, + # Now we assume that the feats must do normalize + feats = feature_normalize(feats, mean_norm=True, std_norm=False) + + # stage 4: store the feats and length in the _inputs, + # which will be used in other function + 
logger.info(f"feats shape: {feats.shape}") + logger.info("audio extract the feats success") + + logger.info("start to extract the audio embedding") + embedding = self.model.backbone(feats, lengths).squeeze().numpy() + logger.info(f"embedding size: {embedding.shape}") + + return embedding + + +class VectorServerExecutor(VectorExecutor): + def __init__(self): + """The wrapper for TextEcutor + """ + super().__init__() + pass + + +class VectorEngine(BaseEngine): + def __init__(self): + """The Vector Engine + """ + super(VectorEngine, self).__init__() + logger.info("Create the VectorEngine Instance") + + def init(self, config: dict): + """Init the Vector Engine + + Args: + config (dict): The server configuation + + Returns: + bool: The engine instance flag + """ + logger.info("Init the vector engine") + try: + self.config = config + if self.config.device: + self.device = self.config.device + else: + self.device = paddle.get_device() + + paddle.set_device(self.device) + logger.info(f"Vector Engine set the device: {self.device}") + except BaseException as e: + logger.error( + "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" + ) + logger.error("Initialize Vector server engine Failed on device: %s." 
+ % (self.device)) + return False + + self.executor = VectorServerExecutor() + + self.executor._init_from_path( + model_type=config.model_type, + cfg_path=config.cfg_path, + ckpt_path=config.ckpt_path, + task=config.task) + + logger.info("Init the Vector engine successfully") + return True diff --git a/paddlespeech/server/restful/api.py b/paddlespeech/server/restful/api.py index d5e422e3..f1e4ffc8 100644 --- a/paddlespeech/server/restful/api.py +++ b/paddlespeech/server/restful/api.py @@ -21,7 +21,7 @@ from paddlespeech.server.restful.asr_api import router as asr_router from paddlespeech.server.restful.cls_api import router as cls_router from paddlespeech.server.restful.text_api import router as text_router from paddlespeech.server.restful.tts_api import router as tts_router - +from paddlespeech.server.restful.vector_api import router as vec_router _router = APIRouter() @@ -43,6 +43,8 @@ def setup_router(api_list: List): _router.include_router(cls_router) elif api_name == 'text': _router.include_router(text_router) + elif api_name.lower() == 'vector': + _router.include_router(vec_router) else: logger.error( f"PaddleSpeech has not support such service: {api_name}") diff --git a/paddlespeech/server/restful/request.py b/paddlespeech/server/restful/request.py index 50416627..b23ed76d 100644 --- a/paddlespeech/server/restful/request.py +++ b/paddlespeech/server/restful/request.py @@ -15,7 +15,7 @@ from typing import Optional from pydantic import BaseModel -__all__ = ['ASRRequest', 'TTSRequest', 'CLSRequest'] +__all__ = ['ASRRequest', 'TTSRequest', 'CLSRequest', 'VectorRequest'] #****************************************************************************************/ @@ -85,3 +85,40 @@ class CLSRequest(BaseModel): #****************************************************************************************/ class TextRequest(BaseModel): text: str + + +#****************************************************************************************/ 
+#************************************ Vecotr request ************************************/ +#****************************************************************************************/ +class VectorRequest(BaseModel): + """ + request body example + { + "audio": "exSI6ICJlbiIsCgkgICAgInBvc2l0aW9uIjogImZhbHNlIgoJf...", + "task": "spk", + "audio_format": "wav", + "sample_rate": 16000, + } + """ + audio: str + task: str + audio_format: str + sample_rate: int + + +class VectorScoreRequest(BaseModel): + """ + request body example + { + "enroll_audio": "exSI6ICJlbiIsCgkgICAgInBvc2l0aW9uIjogImZhbHNlIgoJf...", + "test_audio": "exSI6ICJlbiIsCgkgICAgInBvc2l0aW9uIjogImZhbHNlIgoJf...", + "task": "spk", + "audio_format": "wav", + "sample_rate": 16000, + } + """ + enroll_audio: str + test_audio: str + task: str + audio_format: str + sample_rate: int diff --git a/paddlespeech/server/restful/response.py b/paddlespeech/server/restful/response.py index 5792959e..f8cdb3cf 100644 --- a/paddlespeech/server/restful/response.py +++ b/paddlespeech/server/restful/response.py @@ -129,6 +129,11 @@ class CLSResponse(BaseModel): result: CLSResult +#****************************************************************************************/ +#************************************ Text response **************************************/ +#****************************************************************************************/ + + class TextResult(BaseModel): punc_text: str @@ -153,6 +158,57 @@ class TextResponse(BaseModel): result: TextResult +#****************************************************************************************/ +#************************************ Vector response **************************************/ +#****************************************************************************************/ + + +class VectorResult(BaseModel): + vec: list + + +class VectorResponse(BaseModel): + """ + response example + { + "success": true, + "code": 0, + "message": { + "description": "success" + 
}, + "result": { + "vec": [1.0, 1.0] + } + } + """ + success: bool + code: int + message: Message + result: VectorResult + + +class VectorScoreResult(BaseModel): + score: float + +class VectorScoreResponse(BaseModel): + """ + response example + { + "success": true, + "code": 0, + "message": { + "description": "success" + }, + "result": { + "score": 1.0 + } + } + """ + success: bool + code: int + message: Message + result: VectorScoreResult + #****************************************************************************************/ #********************************** Error response **************************************/ #****************************************************************************************/ diff --git a/paddlespeech/server/restful/vector_api.py b/paddlespeech/server/restful/vector_api.py new file mode 100644 index 00000000..6e04f48e --- /dev/null +++ b/paddlespeech/server/restful/vector_api.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import base64 +import traceback +from typing import Union + +import numpy as np +from fastapi import APIRouter + +from paddlespeech.cli.log import logger +from paddlespeech.server.engine.engine_pool import get_engine_pool +from paddlespeech.server.engine.vector.python.vector_engine import PaddleVectorConnectionHandler +from paddlespeech.server.restful.request import VectorRequest +from paddlespeech.server.restful.request import VectorScoreRequest +from paddlespeech.server.restful.response import ErrorResponse +from paddlespeech.server.restful.response import VectorResponse +from paddlespeech.server.restful.response import VectorScoreResponse +from paddlespeech.server.utils.errors import ErrorCode +from paddlespeech.server.utils.errors import failed_response +from paddlespeech.server.utils.exception import ServerBaseException +router = APIRouter() + + +@router.get('/paddlespeech/vector/help') +def help(): + """help + + Returns: + json: The /paddlespeech/vector api response content + """ + response = { + "success": "True", + "code": 200, + "message": { + "global": "success" + }, + "vector": [2.3, 3.5, 5.5, 6.2, 2.8, 1.2, 0.3, 3.6] + } + return response + + +@router.post( + "/paddlespeech/vector", response_model=Union[VectorResponse, ErrorResponse]) +def vector(request_body: VectorRequest): + """vector api + + Args: + request_body (VectorRequest): the vector request body + + Returns: + json: the vector response body + """ + try: + # 1. get the audio data + # the audio must be base64 format + audio_data = base64.b64decode(request_body.audio) + + # 2. get single engine from engine pool + # and we use the vector_engine to create an connection handler to process the request + engine_pool = get_engine_pool() + vector_engine = engine_pool['vector'] + connection_handler = PaddleVectorConnectionHandler(vector_engine) + + # 3. we use the connection handler to process the audio + audio_vec = connection_handler.run(audio_data, request_body.task) + + # 4. 
we need the result of the vector instance be numpy.ndarray + if not isinstance(audio_vec, np.ndarray): + logger.error( + f"the vector type is not numpy.array, that is: {type(audio_vec)}" + ) + error_reponse = ErrorResponse() + error_reponse.message.description = f"the vector type is not numpy.array, that is: {type(audio_vec)}" + return error_reponse + + response = { + "success": True, + "code": 200, + "message": { + "description": "success" + }, + "result": { + "vec": audio_vec.tolist() + } + } + + except ServerBaseException as e: + response = failed_response(e.error_code, e.msg) + except BaseException: + response = failed_response(ErrorCode.SERVER_UNKOWN_ERR) + traceback.print_exc() + + return response + + +@router.post( + "/paddlespeech/vector/score", + response_model=Union[VectorScoreResponse, ErrorResponse]) +def score(request_body: VectorScoreRequest): + """vector api + + Args: + request_body (VectorScoreRequest): the punctuation request body + + Returns: + json: the punctuation response body + """ + try: + # 1. get the audio data + # the audio must be base64 format + enroll_data = base64.b64decode(request_body.enroll_audio) + test_data = base64.b64decode(request_body.test_audio) + + # 2. get single engine from engine pool + # and we use the vector_engine to create an connection handler to process the request + engine_pool = get_engine_pool() + vector_engine = engine_pool['vector'] + connection_handler = PaddleVectorConnectionHandler(vector_engine) + + # 3. 
we use the connection handler to process the audio + score = connection_handler.get_enroll_test_score(enroll_data, test_data) + + response = { + "success": True, + "code": 200, + "message": { + "description": "success" + }, + "result": { + "score": score + } + } + + except ServerBaseException as e: + response = failed_response(e.error_code, e.msg) + except BaseException: + response = failed_response(ErrorCode.SERVER_UNKOWN_ERR) + traceback.print_exc() + + return response diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index f0ec0eaa..a088929f 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -142,6 +142,7 @@ class ASRWsAudioHandler: return "" # 1. send websocket handshake protocal + start_time = time.time() async with websockets.connect(self.url) as ws: # 2. server has already received handshake protocal # client start to send the command @@ -187,7 +188,14 @@ class ASRWsAudioHandler: if self.punc_server: msg["result"] = self.punc_server.run(msg["result"]) + # 6. logging the final result and comptute the statstics + elapsed_time = time.time() - start_time + audio_info = soundfile.info(wavfile_path) logger.info("client final receive msg={}".format(msg)) + logger.info( + f"audio duration: {audio_info.duration}, elapsed time: {elapsed_time}, RTF={elapsed_time/audio_info.duration}" + ) + result = msg return result @@ -456,3 +464,96 @@ class TTSHttpHandler: self.stream.stop_stream() self.stream.close() self.p.terminate() + + +class VectorHttpHandler: + def __init__(self, server_ip=None, port=None): + """The Vector client http request + + Args: + server_ip (str, optional): the http vector server ip. Defaults to "127.0.0.1". + port (int, optional): the http vector server port. Defaults to 8090. 
+ """ + super().__init__() + self.server_ip = server_ip + self.port = port + if server_ip is None or port is None: + self.url = None + else: + self.url = 'http://' + self.server_ip + ":" + str( + self.port) + '/paddlespeech/vector' + + def run(self, input, audio_format, sample_rate, task="spk"): + """Call the http asr to process the audio + + Args: + input (str): the audio file path + audio_format (str): the audio format + sample_rate (str): the audio sample rate + + Returns: + list: the audio vector + """ + if self.url is None: + logger.error("No vector server, please input valid ip and port") + return "" + + audio = wav2base64(input) + data = { + "audio": audio, + "task": task, + "audio_format": audio_format, + "sample_rate": sample_rate, + } + + logger.info(self.url) + res = requests.post(url=self.url, data=json.dumps(data)) + + return res.json() + + +class VectorScoreHttpHandler: + def __init__(self, server_ip=None, port=None): + """The Vector score client http request + + Args: + server_ip (str, optional): the http vector server ip. Defaults to "127.0.0.1". + port (int, optional): the http vector server port. Defaults to 8090. 
+ """ + super().__init__() + self.server_ip = server_ip + self.port = port + if server_ip is None or port is None: + self.url = None + else: + self.url = 'http://' + self.server_ip + ":" + str( + self.port) + '/paddlespeech/vector/score' + + def run(self, enroll_audio, test_audio, audio_format, sample_rate): + """Call the http asr to process the audio + + Args: + input (str): the audio file path + audio_format (str): the audio format + sample_rate (str): the audio sample rate + + Returns: + list: the audio vector + """ + if self.url is None: + logger.error("No vector server, please input valid ip and port") + return "" + + enroll_audio = wav2base64(enroll_audio) + test_audio = wav2base64(test_audio) + data = { + "enroll_audio": enroll_audio, + "test_audio": test_audio, + "task": "score", + "audio_format": audio_format, + "sample_rate": sample_rate, + } + + res = requests.post(url=self.url, data=json.dumps(data)) + + return res.json() From 3950557e043e239526162cec4b42d334457d2a41 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Sun, 1 May 2022 23:50:08 +0800 Subject: [PATCH 02/93] update the vector server note, test=doc --- paddlespeech/server/bin/paddlespeech_client.py | 10 +++++++--- paddlespeech/server/restful/request.py | 5 ++++- paddlespeech/server/restful/response.py | 7 ++++++- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index f1f02d16..32f78942 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -655,12 +655,16 @@ class VectorClientExecutor(BaseExecutor): Python API to call text executor. Args: - input (str): the request sentence text + input (str): the request audio data server_ip (str, optional): the server ip. Defaults to "127.0.0.1". port (int, optional): the server port. Defaults to 8090. - + audio_format (str, optional): audio format. Defaults to "wav". 
+ sample_rate (str, optional): audio sample rate. Defaults to 16000. + enroll_audio (str, optional): enroll audio data. Defaults to None. + test_audio (str, optional): test audio data. Defaults to None. + task (str, optional): the task type, "spk" or "score". Defaults to "spk" Returns: - str: the punctuation text + str: the audio embedding or score between enroll and test audio """ if task == "spk": from paddlespeech.server.utils.audio_handler import VectorHttpHandler diff --git a/paddlespeech/server/restful/request.py b/paddlespeech/server/restful/request.py index b23ed76d..4e88280a 100644 --- a/paddlespeech/server/restful/request.py +++ b/paddlespeech/server/restful/request.py @@ -15,7 +15,10 @@ from typing import Optional from pydantic import BaseModel -__all__ = ['ASRRequest', 'TTSRequest', 'CLSRequest', 'VectorRequest'] +__all__ = [ + 'ASRRequest', 'TTSRequest', 'CLSRequest', 'VectorRequest', + 'VectorScoreRequest' +] #****************************************************************************************/ diff --git a/paddlespeech/server/restful/response.py b/paddlespeech/server/restful/response.py index f8cdb3cf..c91b3899 100644 --- a/paddlespeech/server/restful/response.py +++ b/paddlespeech/server/restful/response.py @@ -15,7 +15,10 @@ from typing import List from pydantic import BaseModel -__all__ = ['ASRResponse', 'TTSResponse', 'CLSResponse'] +__all__ = [ + 'ASRResponse', 'TTSResponse', 'CLSResponse', 'TextResponse', + 'VectorResponse', 'VectorScoreResponse' +] class Message(BaseModel): @@ -190,6 +193,7 @@ class VectorResponse(BaseModel): class VectorScoreResult(BaseModel): score: float + class VectorScoreResponse(BaseModel): """ response example @@ -209,6 +213,7 @@ class VectorScoreResponse(BaseModel): message: Message result: VectorScoreResult + #****************************************************************************************/ #********************************** Error response **************************************/
#****************************************************************************************/ From c78653850b020ef54590a744eebe80b6a096af56 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Mon, 2 May 2022 20:11:34 +0800 Subject: [PATCH 03/93] join streaming asr and punc server, test=doc --- demos/streaming_asr_server/README.md | 272 +++++++++++++++++- demos/streaming_asr_server/README_cn.md | 272 ++++++++++++++++++ .../conf/punc_application.yaml | 35 +++ .../conf/ws_conformer_application.yaml | 4 +- demos/streaming_asr_server/punc_server.py | 38 +++ demos/streaming_asr_server/server.sh | 5 + .../streaming_asr_server.py | 38 +++ demos/streaming_asr_server/test.sh | 7 +- .../server/bin/paddlespeech_client.py | 42 ++- 9 files changed, 703 insertions(+), 10 deletions(-) create mode 100644 demos/streaming_asr_server/conf/punc_application.yaml create mode 100644 demos/streaming_asr_server/punc_server.py create mode 100755 demos/streaming_asr_server/server.sh create mode 100644 demos/streaming_asr_server/streaming_asr_server.py diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index 3de2f386..48cfbaf3 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -355,4 +355,274 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav [2022-04-21 15:59:08,016] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} [2022-04-21 15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - ``` \ No newline at end of file + ``` + + +## Punctuation service + +### 1. 
Server usage + +- Command Line + ``` bash + # In PaddleSpeech/demos/streaming_asr_server directory to launch punctuation service + paddlespeech_server start --config_file conf/punc_application.yaml + ``` + + + Usage: + ```bash + paddlespeech_server start --help + ``` + + Arguments: + - `config_file`: configuration file. + - `log_file`: log file. + + + Output: + ``` bash + [2022-05-02 17:59:26,285] [ INFO] - Create the TextEngine Instance + [2022-05-02 17:59:26,285] [ INFO] - Init the text engine + [2022-05-02 17:59:26,285] [ INFO] - Text Engine set the device: gpu:0 + [2022-05-02 17:59:26,286] [ INFO] - File /home/users/xiongxinlei/.paddlespeech/models/ernie_linear_p3_wudao-punc-zh/ernie_linear_p3_wudao-punc-zh.tar.gz md5 checking... + [2022-05-02 17:59:30,810] [ INFO] - Use pretrained model stored in: /home/users/xiongxinlei/.paddlespeech/models/ernie_linear_p3_wudao-punc-zh/ernie_linear_p3_wudao-punc-zh.tar + W0502 17:59:31.486552 9595 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 6.1, Driver API Version: 10.2, Runtime API Version: 10.2 + W0502 17:59:31.491360 9595 device_context.cc:465] device: 0, cuDNN Version: 7.6. + [2022-05-02 17:59:34,688] [ INFO] - Already cached /home/users/xiongxinlei/.paddlenlp/models/ernie-1.0/vocab.txt + [2022-05-02 17:59:34,701] [ INFO] - Init the text engine successfully + INFO: Started server process [9595] + [2022-05-02 17:59:34] [INFO] [server.py:75] Started server process [9595] + INFO: Waiting for application startup. + [2022-05-02 17:59:34] [INFO] [on.py:45] Waiting for application startup. + INFO: Application startup complete. + [2022-05-02 17:59:34] [INFO] [on.py:59] Application startup complete.
+ INFO: Uvicorn running on http://0.0.0.0:8190 (Press CTRL+C to quit) + [2022-05-02 17:59:34] [INFO] [server.py:206] Uvicorn running on http://0.0.0.0:8190 (Press CTRL+C to quit) + ``` + +- Python API + + ```python + # In PaddleSpeech/demos/streaming_asr_server directory + from paddlespeech.server.bin.paddlespeech_server import ServerExecutor + + server_executor = ServerExecutor() + server_executor( + config_file="./conf/punc_application.yaml", + log_file="./log/paddlespeech.log") + ``` + + Output: + ``` + [2022-05-02 18:09:02,542] [ INFO] - Create the TextEngine Instance + [2022-05-02 18:09:02,543] [ INFO] - Init the text engine + [2022-05-02 18:09:02,543] [ INFO] - Text Engine set the device: gpu:0 + [2022-05-02 18:09:02,545] [ INFO] - File /home/users/xiongxinlei/.paddlespeech/models/ernie_linear_p3_wudao-punc-zh/ernie_linear_p3_wudao-punc-zh.tar.gz md5 checking... + [2022-05-02 18:09:06,919] [ INFO] - Use pretrained model stored in: /home/users/xiongxinlei/.paddlespeech/models/ernie_linear_p3_wudao-punc-zh/ernie_linear_p3_wudao-punc-zh.tar + W0502 18:09:07.523002 22615 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 6.1, Driver API Version: 10.2, Runtime API Version: 10.2 + W0502 18:09:07.527882 22615 device_context.cc:465] device: 0, cuDNN Version: 7.6. + [2022-05-02 18:09:10,900] [ INFO] - Already cached /home/users/xiongxinlei/.paddlenlp/models/ernie-1.0/vocab.txt + [2022-05-02 18:09:10,913] [ INFO] - Init the text engine successfully + INFO: Started server process [22615] + [2022-05-02 18:09:10] [INFO] [server.py:75] Started server process [22615] + INFO: Waiting for application startup. + [2022-05-02 18:09:10] [INFO] [on.py:45] Waiting for application startup. + INFO: Application startup complete. + [2022-05-02 18:09:10] [INFO] [on.py:59] Application startup complete.
+ INFO: Uvicorn running on http://0.0.0.0:8190 (Press CTRL+C to quit) + [2022-05-02 18:09:10] [INFO] [server.py:206] Uvicorn running on http://0.0.0.0:8190 (Press CTRL+C to quit) + ``` + +### 2. Client usage +**Note:** The response time will be slightly longer when using the client for the first time. + +- Command line + ``` + paddlespeech_client text --server_ip 127.0.0.1 --port 8190 --input "我认为跑步最重要的就是给我带来了身体健康" + ``` + + Output + ``` + [2022-05-02 18:12:29,767] [ INFO] - The punc text: 我认为跑步最重要的就是给我带来了身体健康。 + [2022-05-02 18:12:29,767] [ INFO] - Response time 0.096548 s. + ``` + +- Python3 API + + ```python + from paddlespeech.server.bin.paddlespeech_client import TextClientExecutor + + textclient_executor = TextClientExecutor() + res = textclient_executor( + input="我认为跑步最重要的就是给我带来了身体健康", + server_ip="127.0.0.1", + port=8190,) + print(res) + ``` + + Output: + ``` bash + 我认为跑步最重要的就是给我带来了身体健康。 + ``` + + +## Join streaming asr and punctuation server +We use two services, `streaming_asr_server.py` and `punc_server.py`, to launch the streaming speech recognition and punctuation prediction services respectively. And the `websocket_client.py` script can be used to call streaming speech recognition and punctuation prediction services at the same time. + +### 1. Start the two servers + +``` bash +# Note: streaming speech recognition and punctuation prediction are configured on different graphics cards through configuration files +bash server.sh +``` + +### 2.
Call client +- Command line + ``` + paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav + ``` + Output: + ``` + [2022-05-02 18:57:46,961] [ INFO] - asr websocket client start + [2022-05-02 18:57:46,961] [ INFO] - endpoint: ws://127.0.0.1:8290/paddlespeech/asr/streaming + [2022-05-02 18:57:46,982] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"} + [2022-05-02 18:57:46,999] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,011] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,023] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,035] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,046] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,057] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,068] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,079] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,222] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,230] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,239] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,247] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,255] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,263] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,271] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,462] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:57:47,525] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:57:47,589] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:57:47,649] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:57:47,708] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:57:47,766] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 
18:57:47,824] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:57:47,881] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:57:48,130] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:57:48,200] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:57:48,265] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:57:48,327] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:57:48,389] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:57:48,448] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:57:48,505] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:57:48,754] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:57:48,821] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:57:48,881] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:57:48,939] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:57:49,011] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:57:49,080] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:57:49,146] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:57:49,210] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:57:49,452] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:57:49,516] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:57:49,581] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:57:49,645] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:57:49,706] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:57:49,763] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:57:49,818] [ INFO] - client receive msg={'result': 
'我认为,跑步最重要的就是给。'} + [2022-05-02 18:57:50,064] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:57:50,125] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:57:50,186] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:57:50,245] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:57:50,301] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:57:50,358] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:57:50,414] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:57:50,469] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:57:50,712] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:57:50,776] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:57:50,837] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:57:50,897] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:57:50,956] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:57:51,012] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:57:51,276] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:57:51,277] [ INFO] - asr websocket client finished + [2022-05-02 18:57:51,277] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康。 + [2022-05-02 18:57:51,277] [ INFO] - Response time 4.316903 s. 
+ ``` + +- Use script + ``` + python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav + ``` + Output: + ``` + [2022-05-02 18:29:22,039] [ INFO] - Start to do streaming asr client + [2022-05-02 18:29:22,040] [ INFO] - asr websocket client start + [2022-05-02 18:29:22,040] [ INFO] - endpoint: ws://127.0.0.1:8290/paddlespeech/asr/streaming + [2022-05-02 18:29:22,041] [ INFO] - start to process the wavscp: ./zh.wav + [2022-05-02 18:29:22,122] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"} + [2022-05-02 18:29:22,351] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,360] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,368] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,376] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,384] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,392] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,400] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,408] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,549] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,558] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,567] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,575] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,583] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,591] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,599] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,822] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:29:22,879] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:29:22,937] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:29:22,995] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:29:23,052] 
[ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:29:23,107] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:29:23,161] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:29:23,213] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:29:23,454] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:29:23,515] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:29:23,575] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:29:23,630] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:29:23,684] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:29:23,736] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:29:23,789] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:29:24,030] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:29:24,095] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:29:24,156] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:29:24,213] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:29:24,268] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:29:24,323] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:29:24,377] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:29:24,429] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:29:24,671] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:29:24,736] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:29:24,797] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:29:24,857] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:29:24,918] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:29:24,975] 
[ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:29:25,029] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:29:25,271] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:29:25,336] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:29:25,398] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:29:25,458] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:29:25,521] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:29:25,579] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:29:25,652] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:29:25,722] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:29:25,969] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:29:26,034] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:29:26,095] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:29:26,163] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:29:26,229] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:29:26,294] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:29:26,565] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:29:26,566] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康。 + ``` + + \ No newline at end of file diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index bb1d3772..67f62860 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -363,3 +363,275 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav [2022-04-21 
15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} ``` + + + +## 标点预测 + +### 1. 服务端使用方法 + +- 命令行 + ``` bash + 在 PaddleSpeech/demos/streaming_asr_server 目录下启动标点预测服务 + paddlespeech_server start --config_file conf/punc_application.yaml + ``` + + + 使用方法: + + ```bash + paddlespeech_server start --help + ``` + + 参数: + - `config_file`: 服务的配置文件。 + - `log_file`: log 文件。 + + + 输出: + ``` bash + [2022-05-02 17:59:26,285] [ INFO] - Create the TextEngine Instance + [2022-05-02 17:59:26,285] [ INFO] - Init the text engine + [2022-05-02 17:59:26,285] [ INFO] - Text Engine set the device: gpu:0 + [2022-05-02 17:59:26,286] [ INFO] - File /home/users/xiongxinlei/.paddlespeech/models/ernie_linear_p3_wudao-punc-zh/ernie_linear_p3_wudao-punc-zh.tar.gz md5 checking... + [2022-05-02 17:59:30,810] [ INFO] - Use pretrained model stored in: /home/users/xiongxinlei/.paddlespeech/models/ernie_linear_p3_wudao-punc-zh/ernie_linear_p3_wudao-punc-zh.tar + W0502 17:59:31.486552 9595 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 6.1, Driver API Version: 10.2, Runtime API Version: 10.2 + W0502 17:59:31.491360 9595 device_context.cc:465] device: 0, cuDNN Version: 7.6. + [2022-05-02 17:59:34,688] [ INFO] - Already cached /home/users/xiongxinlei/.paddlenlp/models/ernie-1.0/vocab.txt + [2022-05-02 17:59:34,701] [ INFO] - Init the text engine successfully + INFO: Started server process [9595] + [2022-05-02 17:59:34] [INFO] [server.py:75] Started server process [9595] + INFO: Waiting for application startup. + [2022-05-02 17:59:34] [INFO] [on.py:45] Waiting for application startup. + INFO: Application startup complete. + [2022-05-02 17:59:34] [INFO] [on.py:59] Application startup complete. 
+ INFO: Uvicorn running on http://0.0.0.0:8190 (Press CTRL+C to quit) + [2022-05-02 17:59:34] [INFO] [server.py:206] Uvicorn running on http://0.0.0.0:8190 (Press CTRL+C to quit) + ``` + +- Python API + + ```python + # 在 PaddleSpeech/demos/streaming_asr_server 目录 + from paddlespeech.server.bin.paddlespeech_server import ServerExecutor + + server_executor = ServerExecutor() + server_executor( + config_file="./conf/punc_application.yaml", + log_file="./log/paddlespeech.log") + ``` + + 输出 + ``` + [2022-05-02 18:09:02,542] [ INFO] - Create the TextEngine Instance + [2022-05-02 18:09:02,543] [ INFO] - Init the text engine + [2022-05-02 18:09:02,543] [ INFO] - Text Engine set the device: gpu:0 + [2022-05-02 18:09:02,545] [ INFO] - File /home/users/xiongxinlei/.paddlespeech/models/ernie_linear_p3_wudao-punc-zh/ernie_linear_p3_wudao-punc-zh.tar.gz md5 checking... + [2022-05-02 18:09:06,919] [ INFO] - Use pretrained model stored in: /home/users/xiongxinlei/.paddlespeech/models/ernie_linear_p3_wudao-punc-zh/ernie_linear_p3_wudao-punc-zh.tar + W0502 18:09:07.523002 22615 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 6.1, Driver API Version: 10.2, Runtime API Version: 10.2 + W0502 18:09:07.527882 22615 device_context.cc:465] device: 0, cuDNN Version: 7.6. + [2022-05-02 18:09:10,900] [ INFO] - Already cached /home/users/xiongxinlei/.paddlenlp/models/ernie-1.0/vocab.txt + [2022-05-02 18:09:10,913] [ INFO] - Init the text engine successfully + INFO: Started server process [22615] + [2022-05-02 18:09:10] [INFO] [server.py:75] Started server process [22615] + INFO: Waiting for application startup. + [2022-05-02 18:09:10] [INFO] [on.py:45] Waiting for application startup. + INFO: Application startup complete. + [2022-05-02 18:09:10] [INFO] [on.py:59] Application startup complete. 
+ INFO: Uvicorn running on http://0.0.0.0:8190 (Press CTRL+C to quit) + [2022-05-02 18:09:10] [INFO] [server.py:206] Uvicorn running on http://0.0.0.0:8190 (Press CTRL+C to quit) + ``` + +### 2. 标点预测客户端使用方法 +**注意:** 初次使用客户端时响应时间会略长 + +- 命令行 (推荐使用) + ``` + paddlespeech_client text --server_ip 127.0.0.1 --port 8190 --input "我认为跑步最重要的就是给我带来了身体健康" + ``` + + 输出 + ``` + [2022-05-02 18:12:29,767] [ INFO] - The punc text: 我认为跑步最重要的就是给我带来了身体健康。 + [2022-05-02 18:12:29,767] [ INFO] - Response time 0.096548 s. + ``` + +- Python3 API + + ```python + from paddlespeech.server.bin.paddlespeech_client import TextClientExecutor + + textclient_executor = TextClientExecutor() + res = textclient_executor( + input="我认为跑步最重要的就是给我带来了身体健康", + server_ip="127.0.0.1", + port=8190,) + print(res) + ``` + + 输出: + ``` bash + 我认为跑步最重要的就是给我带来了身体健康。 + ``` + + +## 联合流式语音识别和标点预测 +使用 `streaming_asr_server.py` 和 `punc_server.py` 两个服务,分别启动流式语音识别和标点预测服务。调用 `websocket_client.py` 脚本可以同时调用流式语音识别和标点预测服务。 + +### 1. 启动服务 + +``` bash +注意:流式语音识别和标点预测通过配置文件配置到不同的显卡上 +bash server.sh +``` + +### 2. 
调用服务 +- 使用命令行: + ``` + paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav + ``` + 输出: + ``` + [2022-05-02 18:57:46,961] [ INFO] - asr websocket client start + [2022-05-02 18:57:46,961] [ INFO] - endpoint: ws://127.0.0.1:8290/paddlespeech/asr/streaming + [2022-05-02 18:57:46,982] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"} + [2022-05-02 18:57:46,999] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,011] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,023] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,035] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,046] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,057] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,068] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,079] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,222] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,230] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,239] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,247] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,255] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,263] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,271] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:57:47,462] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:57:47,525] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:57:47,589] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:57:47,649] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:57:47,708] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:57:47,766] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:57:47,824] [ 
INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:57:47,881] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:57:48,130] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:57:48,200] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:57:48,265] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:57:48,327] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:57:48,389] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:57:48,448] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:57:48,505] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:57:48,754] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:57:48,821] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:57:48,881] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:57:48,939] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:57:49,011] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:57:49,080] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:57:49,146] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:57:49,210] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:57:49,452] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:57:49,516] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:57:49,581] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:57:49,645] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:57:49,706] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:57:49,763] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:57:49,818] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + 
[2022-05-02 18:57:50,064] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:57:50,125] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:57:50,186] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:57:50,245] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:57:50,301] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:57:50,358] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:57:50,414] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:57:50,469] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:57:50,712] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:57:50,776] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:57:50,837] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:57:50,897] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:57:50,956] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:57:51,012] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:57:51,276] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:57:51,277] [ INFO] - asr websocket client finished + [2022-05-02 18:57:51,277] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康。 + [2022-05-02 18:57:51,277] [ INFO] - Response time 4.316903 s. 
+ ``` + +- 使用脚本调用 + ``` + python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav + ``` + 输出: + ``` + [2022-05-02 18:29:22,039] [ INFO] - Start to do streaming asr client + [2022-05-02 18:29:22,040] [ INFO] - asr websocket client start + [2022-05-02 18:29:22,040] [ INFO] - endpoint: ws://127.0.0.1:8290/paddlespeech/asr/streaming + [2022-05-02 18:29:22,041] [ INFO] - start to process the wavscp: ./zh.wav + [2022-05-02 18:29:22,122] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"} + [2022-05-02 18:29:22,351] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,360] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,368] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,376] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,384] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,392] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,400] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,408] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,549] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,558] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,567] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,575] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,583] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,591] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,599] [ INFO] - client receive msg={'result': ''} + [2022-05-02 18:29:22,822] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:29:22,879] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:29:22,937] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:29:22,995] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:29:23,052] [ INFO] 
- client receive msg={'result': '我认为,跑'} + [2022-05-02 18:29:23,107] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:29:23,161] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:29:23,213] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-02 18:29:23,454] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:29:23,515] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:29:23,575] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:29:23,630] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:29:23,684] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:29:23,736] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:29:23,789] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-02 18:29:24,030] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:29:24,095] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:29:24,156] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:29:24,213] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:29:24,268] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:29:24,323] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:29:24,377] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:29:24,429] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-02 18:29:24,671] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:29:24,736] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:29:24,797] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:29:24,857] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:29:24,918] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:29:24,975] [ INFO] 
- client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:29:25,029] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-02 18:29:25,271] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:29:25,336] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:29:25,398] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:29:25,458] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:29:25,521] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:29:25,579] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:29:25,652] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:29:25,722] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-02 18:29:25,969] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:29:26,034] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:29:26,095] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:29:26,163] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:29:26,229] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:29:26,294] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:29:26,565] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-02 18:29:26,566] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康。 + ``` + + \ No newline at end of file diff --git a/demos/streaming_asr_server/conf/punc_application.yaml b/demos/streaming_asr_server/conf/punc_application.yaml new file mode 100644 index 00000000..e0d06871 --- /dev/null +++ b/demos/streaming_asr_server/conf/punc_application.yaml @@ -0,0 +1,35 @@ +# This is the parameter configuration file for PaddleSpeech 
Serving. + +################################################################################# +# SERVER SETTING # +################################################################################# +host: 0.0.0.0 +port: 8190 + +# The task format in the engine_list is: <task>_<engine_type> +# task choices = ['text_python'] +# protocol = ['http'] (only one can be selected). +# http only supports offline engine type. +protocol: 'http' +engine_list: ['text_python'] + + +################################################################################# +# ENGINE CONFIG # +################################################################################# + +################################### Text ######################################### +################### text task: punc; engine_type: python ####################### +text_python: + task: punc + model_type: 'ernie_linear_p3_wudao' + lang: 'zh' + sample_rate: 16000 + cfg_path: # [optional] + ckpt_path: # [optional] + vocab_file: # [optional] + device: gpu:0 # set 'gpu:id' or 'cpu' + + + + diff --git a/demos/streaming_asr_server/conf/ws_conformer_application.yaml b/demos/streaming_asr_server/conf/ws_conformer_application.yaml index 50c7a727..42473555 100644 --- a/demos/streaming_asr_server/conf/ws_conformer_application.yaml +++ b/demos/streaming_asr_server/conf/ws_conformer_application.yaml @@ -4,7 +4,7 @@ # SERVER SETTING # ################################################################################# host: 0.0.0.0 -port: 8090 +port: 8290 # The task format in the engin_list is: _ # task choices = ['asr_online'] @@ -29,7 +29,7 @@ asr_online: cfg_path: decode_method: force_yes: True - device: # cpu or gpu:id + device: gpu:3 # cpu or gpu:id am_predictor_conf: device: # set 'gpu:id' or 'cpu' switch_ir_optim: True diff --git a/demos/streaming_asr_server/punc_server.py b/demos/streaming_asr_server/punc_server.py new file mode 100644 index 00000000..eefa0fb4 --- /dev/null +++ b/demos/streaming_asr_server/punc_server.py @@ -0,0 +1,38 @@ +# Copyright 
(c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse + +from paddlespeech.cli.log import logger +from paddlespeech.server.bin.paddlespeech_server import ServerExecutor +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog='paddlespeech_server.start', add_help=True) + parser.add_argument( + "--config_file", + action="store", + help="yaml file of the app", + default=None, + required=True) + + parser.add_argument( + "--log_file", + action="store", + help="log file", + default="./log/paddlespeech.log") + logger.info("start to parse the args") + args = parser.parse_args() + + logger.info("start to launch the punctuation server") + punc_server = ServerExecutor() + punc_server(config_file=args.config_file, log_file=args.log_file) diff --git a/demos/streaming_asr_server/server.sh b/demos/streaming_asr_server/server.sh new file mode 100755 index 00000000..04858321 --- /dev/null +++ b/demos/streaming_asr_server/server.sh @@ -0,0 +1,5 @@ +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +nohup python3 punc_server.py --config_file conf/punc_application.yaml > punc.log 2>&1 & + +nohup python3 streaming_asr_server.py --config_file conf/ws_conformer_application.yaml > streaming_asr.log 2>&1 & diff --git a/demos/streaming_asr_server/streaming_asr_server.py b/demos/streaming_asr_server/streaming_asr_server.py new file mode 100644 index 00000000..011b009a --- /dev/null +++ 
b/demos/streaming_asr_server/streaming_asr_server.py @@ -0,0 +1,38 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse + +from paddlespeech.cli.log import logger +from paddlespeech.server.bin.paddlespeech_server import ServerExecutor +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog='paddlespeech_server.start', add_help=True) + parser.add_argument( + "--config_file", + action="store", + help="yaml file of the app", + default=None, + required=True) + + parser.add_argument( + "--log_file", + action="store", + help="log file", + default="./log/paddlespeech.log") + logger.info("start to parse the args") + args = parser.parse_args() + + logger.info("start to launch the streaming asr server") + streaming_asr_server = ServerExecutor() + streaming_asr_server(config_file=args.config_file, log_file=args.log_file) diff --git a/demos/streaming_asr_server/test.sh b/demos/streaming_asr_server/test.sh index fe8155cf..912d67a2 100644 --- a/demos/streaming_asr_server/test.sh +++ b/demos/streaming_asr_server/test.sh @@ -1,5 +1,8 @@ # download the test wav wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav -# read the wav and pass it to service -python3 websocket_client.py --wavfile ./zh.wav +# read the wav and pass it to only streaming asr service +python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --wavfile ./zh.wav + +# read the wav and call streaming and punc 
service +python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 2f1ce385..9d5c1b21 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -411,6 +411,18 @@ class ASROnlineClientExecutor(BaseExecutor): '--lang', type=str, default="zh_cn", help='language') self.parser.add_argument( '--audio_format', type=str, default="wav", help='audio format') + self.parser.add_argument( + '--punc.server_ip', + type=str, + default=None, + dest="punc_server_ip", + help='Punctuation server ip') + self.parser.add_argument( + '--punc.port', + type=int, + default=8190, + dest="punc_server_port", + help='Punctuation server port') def execute(self, argv: List[str]) -> bool: args = self.parser.parse_args(argv) @@ -428,7 +440,9 @@ class ASROnlineClientExecutor(BaseExecutor): port=port, sample_rate=sample_rate, lang=lang, - audio_format=audio_format) + audio_format=audio_format, + punc_server_ip=args.punc_server_ip, + punc_server_port=args.punc_server_port) time_end = time.time() logger.info(res) logger.info("Response time %f s." % (time_end - time_start)) @@ -445,12 +459,30 @@ class ASROnlineClientExecutor(BaseExecutor): port: int=8091, sample_rate: int=16000, lang: str="zh_cn", - audio_format: str="wav"): - """ - Python API to call an executor. + audio_format: str="wav", + punc_server_ip: str=None, + punc_server_port: str=None): + """Python API to call asr online executor. + + Args: + input (str): the audio file to be send to streaming asr service. + server_ip (str, optional): streaming asr server ip. Defaults to "127.0.0.1". + port (int, optional): streaming asr server port. Defaults to 8091. + sample_rate (int, optional): audio sample rate. Defaults to 16000. + lang (str, optional): audio language type. Defaults to "zh_cn". 
+ audio_format (str, optional): audio format. Defaults to "wav". + punc_server_ip (str, optional): punctuation server ip. Defaults to None. + punc_server_port (str, optional): punctuation server port. Defaults to None. + + Returns: + str: the audio text """ logger.info("asr websocket client start") - handler = ASRWsAudioHandler(server_ip, port) + handler = ASRWsAudioHandler( + server_ip, + port, + punc_server_ip=punc_server_ip, + punc_server_port=punc_server_port) loop = asyncio.get_event_loop() res = loop.run_until_complete(handler.run(input)) logger.info("asr websocket client finished") From 2ab96187aaad5f7e05788fe61b3baa2c1fc5d103 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 4 May 2022 16:42:12 +0800 Subject: [PATCH 04/93] streaming asr server add time stamp, test=doc --- .../server/engine/asr/online/asr_engine.py | 34 ++++++++ .../server/engine/asr/online/ctc_search.py | 87 ++++++++++++++++--- paddlespeech/server/ws/asr_socket.py | 4 +- 3 files changed, 110 insertions(+), 15 deletions(-) diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 990590b4..a98268f0 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -290,6 +290,7 @@ class PaddleASRConnectionHanddler: self.chunk_num = 0 self.global_frame_offset = 0 self.result_transcripts = [''] + self.word_time_stamp = None def decode(self, is_finished=False): if "deepspeech2online" in self.model_type: @@ -505,6 +506,12 @@ class PaddleASRConnectionHanddler: else: return '' + def get_word_time_stamp(self): + if self.word_time_stamp is None: + return [] + else: + return self.word_time_stamp + @paddle.no_grad() def rescoring(self): if "deepspeech2online" in self.model_type or "deepspeech2offline" in self.model_type: @@ -569,8 +576,35 @@ class PaddleASRConnectionHanddler: # update the one best result logger.info(f"best index: {best_index}") self.hyps = 
[hyps[best_index][0]] + + # update the hyps time stamp + self.time_stamp = hyps[best_index][5] if hyps[best_index][2] > hyps[ + best_index][3] else hyps[best_index][6] + logger.info(f"time stamp: {self.time_stamp}") + self.update_result() + # update each word start and end time stamp + frame_shift_in_ms = self.model.encoder.embed.subsampling_rate * self.n_shift / self.sample_rate + logger.info(f"frame shift ms: {frame_shift_in_ms}") + word_time_stamp = [] + for idx, _ in enumerate(self.time_stamp): + start = (self.time_stamp[idx - 1] + self.time_stamp[idx] + ) / 2.0 if idx > 0 else 0 + start = start * frame_shift_in_ms + + end = (self.time_stamp[idx] + self.time_stamp[idx + 1] + ) / 2.0 if idx < len(self.time_stamp) - 1 else self.offset + end = end * frame_shift_in_ms + word_time_stamp.append({ + "w": self.result_transcripts[0][idx], + "bg": start, + "ed": end + }) + # logger.info(f"{self.result_transcripts[0][idx]}, start: {start}, end: {end}") + self.word_time_stamp = word_time_stamp + logger.info(f"word time stamp: {self.word_time_stamp}") + class ASRServerExecutor(ASRExecutor): def __init__(self): diff --git a/paddlespeech/server/engine/asr/online/ctc_search.py b/paddlespeech/server/engine/asr/online/ctc_search.py index be5fb15b..3a808587 100644 --- a/paddlespeech/server/engine/asr/online/ctc_search.py +++ b/paddlespeech/server/engine/asr/online/ctc_search.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import copy from collections import defaultdict import paddle @@ -54,14 +55,24 @@ class CTCPrefixBeamSearch: assert len(ctc_probs.shape) == 2 # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - # blank_ending_score and none_blank_ending_score in ln domain + # 0. blank_ending_score, + # 1. none_blank_ending_score, + # 2. viterbi_blank ending, + # 3. viterbi_non_blank, + # 4. 
current_token_prob, + # 5. times_viterbi_blank, + # 6. times_viterbi_non_blank if self.cur_hyps is None: - self.cur_hyps = [(tuple(), (0.0, -float('inf')))] + self.cur_hyps = [(tuple(), (0.0, -float('inf'), 0.0, 0.0, + -float('inf'), [], []))] + # self.cur_hyps = [(tuple(), (0.0, -float('inf')))] # 2. CTC beam search step by step for t in range(0, maxlen): logp = ctc_probs[t] # (vocab_size,) # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) + # next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) + next_hyps = defaultdict( + lambda: (-float('inf'), -float('inf'), -float('inf'), -float('inf'), -float('inf'), [], [])) # 2.1 First beam prune: select topk best # do token passing process @@ -69,36 +80,83 @@ class CTCPrefixBeamSearch: for s in top_k_index: s = s.item() ps = logp[s].item() - for prefix, (pb, pnb) in self.cur_hyps: + for prefix, (pb, pnb, v_s, v_ns, cur_token_prob, times_s, + times_ns) in self.cur_hyps: last = prefix[-1] if len(prefix) > 0 else None if s == blank_id: # blank - n_pb, n_pnb = next_hyps[prefix] + n_pb, n_pnb, n_v_s, n_v_ns, n_cur_token_prob, n_times_s, n_times_ns = next_hyps[ + prefix] n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) + + pre_times = times_s if v_s > v_ns else times_ns + n_times_s = copy.deepcopy(pre_times) + viterbi_score = v_s if v_s > v_ns else v_ns + n_v_s = viterbi_score + ps + next_hyps[prefix] = (n_pb, n_pnb, n_v_s, n_v_ns, + n_cur_token_prob, n_times_s, + n_times_ns) elif s == last: # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] + # case1: *a + a => *a + n_pb, n_pnb, n_v_s, n_v_ns, n_cur_token_prob, n_times_s, n_times_ns = next_hyps[ + prefix] n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) + if n_v_ns < v_ns + ps: + n_v_ns = v_ns + ps + if n_cur_token_prob < ps: + n_cur_token_prob = ps + n_times_ns = copy.deepcopy(times_ns) + n_times_ns[ + -1] = self.abs_time_step # 
NOTE: the absolute time step must be used here + next_hyps[prefix] = (n_pb, n_pnb, n_v_s, n_v_ns, + n_cur_token_prob, n_times_s, + n_times_ns) + # Update *s-s -> *ss, - is for blank + # Case 2: *aε + a => *aa n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] + n_pb, n_pnb, n_v_s, n_v_ns, n_cur_token_prob, n_times_s, n_times_ns = next_hyps[ + n_prefix] + if n_v_ns < v_s + ps: + n_v_ns = v_s + ps + n_cur_token_prob = ps + n_times_ns = copy.deepcopy(times_s) + n_times_ns.append(self.abs_time_step) n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) + next_hyps[n_prefix] = (n_pb, n_pnb, n_v_s, n_v_ns, + n_cur_token_prob, n_times_s, + n_times_ns) else: + # Case 3: *a + b => *ab, *aε + b => *ab n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] + n_pb, n_pnb, n_v_s, n_v_ns, n_cur_token_prob, n_times_s, n_times_ns = next_hyps[ + n_prefix] + viterbi_score = v_s if v_s > v_ns else v_ns + pre_times = times_s if v_s > v_ns else times_ns + if n_v_ns < viterbi_score + ps: + n_v_ns = viterbi_score + ps + n_cur_token_prob = ps + n_times_ns = copy.deepcopy(pre_times) + n_times_ns.append(self.abs_time_step) + n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) + next_hyps[n_prefix] = (n_pb, n_pnb, n_v_s, n_v_ns, + n_cur_token_prob, n_times_s, + n_times_ns) # 2.2 Second beam prune next_hyps = sorted( next_hyps.items(), - key=lambda x: log_add(list(x[1])), + key=lambda x: log_add([x[1][0], x[1][1]]), reverse=True) self.cur_hyps = next_hyps[:beam_size] - self.hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in self.cur_hyps] + # 2.3 update the absolute time step + self.abs_time_step += 1 + # self.hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in self.cur_hyps] + self.hyps = [(y[0], log_add([y[1][0], y[1][1]]), y[1][2], y[1][3], + y[1][4], y[1][5], y[1][6]) for y in self.cur_hyps] + logger.info("ctc prefix search success") return self.hyps @@ -123,6 +181,7 @@ class CTCPrefixBeamSearch: """ self.cur_hyps = None self.hyps = None + 
self.abs_time_step = 0 def finalize_search(self): """do nothing in ctc_prefix_beam_search diff --git a/paddlespeech/server/ws/asr_socket.py b/paddlespeech/server/ws/asr_socket.py index 68686d3d..0f7dcddd 100644 --- a/paddlespeech/server/ws/asr_socket.py +++ b/paddlespeech/server/ws/asr_socket.py @@ -78,12 +78,14 @@ async def websocket_endpoint(websocket: WebSocket): connection_handler.decode(is_finished=True) connection_handler.rescoring() asr_results = connection_handler.get_result() + word_time_stamp = connection_handler.get_word_time_stamp() connection_handler.reset() resp = { "status": "ok", "signal": "finished", - 'result': asr_results + 'result': asr_results, + 'times': word_time_stamp } await websocket.send_json(resp) break From 69c720073fe5442704cd9b608bd39f81f33b8cce Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Thu, 5 May 2022 11:53:46 +0800 Subject: [PATCH 05/93] add copyright --- speechx/patch/README.md | 2 ++ speechx/speechx/websocket/websocket_client.cc | 2 +- speechx/speechx/websocket/websocket_client.h | 2 +- speechx/speechx/websocket/websocket_server.cc | 2 +- speechx/speechx/websocket/websocket_server.h | 3 +-- 5 files changed, 6 insertions(+), 5 deletions(-) create mode 100644 speechx/patch/README.md diff --git a/speechx/patch/README.md b/speechx/patch/README.md new file mode 100644 index 00000000..1bee5ed6 --- /dev/null +++ b/speechx/patch/README.md @@ -0,0 +1,2 @@ +reference: +this patch is from WeNet wenet/runtime/core/patch diff --git a/speechx/speechx/websocket/websocket_client.cc b/speechx/speechx/websocket/websocket_client.cc index bf3bbef2..5176dc89 100644 --- a/speechx/speechx/websocket/websocket_client.cc +++ b/speechx/speechx/websocket/websocket_client.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/speechx/speechx/websocket/websocket_client.h b/speechx/speechx/websocket/websocket_client.h index 35def076..df7395a7 100644 --- a/speechx/speechx/websocket/websocket_client.h +++ b/speechx/speechx/websocket/websocket_client.h @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/speechx/speechx/websocket/websocket_server.cc b/speechx/speechx/websocket/websocket_server.cc index 71a9e127..2a6b8990 100644 --- a/speechx/speechx/websocket/websocket_server.cc +++ b/speechx/speechx/websocket/websocket_server.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/speechx/speechx/websocket/websocket_server.h b/speechx/speechx/websocket/websocket_server.h index 469f123f..8856f5d0 100644 --- a/speechx/speechx/websocket/websocket_server.h +++ b/speechx/speechx/websocket/websocket_server.h @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
- #pragma once #include "base/common.h" From 9720e0faf70fd9ad068df1ea4cdff281a87ef122 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 5 May 2022 17:17:50 +0800 Subject: [PATCH 06/93] Update README.md --- speechx/examples/README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/speechx/examples/README.md b/speechx/examples/README.md index 18b37281..b18c88e0 100644 --- a/speechx/examples/README.md +++ b/speechx/examples/README.md @@ -1,8 +1,6 @@ # Examples for SpeechX -* ds2_ol - ds2 streaming test under `aishell-1` test dataset. -The entrypoint is `ds2_ol/aishell/run.sh` - +* `ds2_ol` - ds2 streaming test under `aishell-1` test dataset. ## How to run From 96453374106a884526093ac9c91f7dd68bf76212 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 5 May 2022 17:18:57 +0800 Subject: [PATCH 07/93] Update README.md --- speechx/examples/ds2_ol/README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/speechx/examples/ds2_ol/README.md b/speechx/examples/ds2_ol/README.md index ed88ef6b..de906924 100644 --- a/speechx/examples/ds2_ol/README.md +++ b/speechx/examples/ds2_ol/README.md @@ -1,10 +1,8 @@ # Deepspeech2 Streaming ASR -* websocket -Streaming ASR with websocket. +* `websocket` - Streaming ASR with websocket. -* aishell -Streaming Decoding under aishell dataset, for local WER test and so on. +* `aishell` - Streaming Decoding under aishell dataset, for local WER test. 
## More The below is for developing and offline testing: From 27b7f5000600863b58f1a39cfc6c5eab413806b7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 5 May 2022 17:20:31 +0800 Subject: [PATCH 08/93] Update README.md --- speechx/examples/ds2_ol/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/speechx/examples/ds2_ol/README.md b/speechx/examples/ds2_ol/README.md index de906924..f405198d 100644 --- a/speechx/examples/ds2_ol/README.md +++ b/speechx/examples/ds2_ol/README.md @@ -1,11 +1,14 @@ # Deepspeech2 Streaming ASR +## Examples + * `websocket` - Streaming ASR with websocket. * `aishell` - Streaming Decoding under aishell dataset, for local WER test. ## More -The below is for developing and offline testing: + +> The below is for developing and offline testing. Do not run it unless you know what it is. * nnet * feat * decoder From 8850955da8610ee84fa6c1effe76b8c69b1f47a2 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 5 May 2022 17:21:59 +0800 Subject: [PATCH 09/93] Update README.md --- speechx/examples/ds2_ol/aishell/README.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/speechx/examples/ds2_ol/aishell/README.md b/speechx/examples/ds2_ol/aishell/README.md index 01c89979..1ed8a67c 100644 --- a/speechx/examples/ds2_ol/aishell/README.md +++ b/speechx/examples/ds2_ol/aishell/README.md @@ -1,6 +1,14 @@ # Aishell - Deepspeech2 Streaming -## CTC Prefix Beam Search w/o LM +## How to run + +``` +bash run.sh +``` + +## Results + +### CTC Prefix Beam Search w/o LM ``` Overall -> 16.14 % N=104612 C=88190 S=16110 D=312 I=465 @@ -8,7 +16,7 @@ Mandarin -> 16.14 % N=104612 C=88190 S=16110 D=312 I=465 Other -> 0.00 % N=0 C=0 S=0 D=0 I=0 ``` -## CTC Prefix Beam Search w/ LM +### CTC Prefix Beam Search w/ LM LM: zh_giga.no_cna_cmn.prune01244.klm ``` @@ -17,7 +25,7 @@ Mandarin -> 7.86 % N=104768 C=96865 S=7573 D=330 I=327 Other -> 0.00 % N=0 C=0 S=0 D=0 I=0 ``` -## CTC WFST +### CTC WFST LM: [aishell 
train](http://paddlespeech.bj.bcebos.com/speechx/examples/ds2_ol/aishell/aishell_graph.zip) --acoustic_scale=1.2 From 10da21a77b64e39626ea4f9481e8a0c483e0ef74 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Mon, 2 May 2022 15:37:41 +0800 Subject: [PATCH 10/93] update the vector cli for server, test=doc --- demos/streaming_asr_server/websocket_client.py | 2 +- paddlespeech/cli/vector/infer.py | 3 +++ paddlespeech/server/bin/paddlespeech_client.py | 6 +++++- paddlespeech/server/engine/vector/python/vector_engine.py | 4 ++-- paddlespeech/server/restful/request.py | 2 +- 5 files changed, 12 insertions(+), 5 deletions(-) diff --git a/demos/streaming_asr_server/websocket_client.py b/demos/streaming_asr_server/websocket_client.py index 3cadd72a..3451b8d0 100644 --- a/demos/streaming_asr_server/websocket_client.py +++ b/demos/streaming_asr_server/websocket_client.py @@ -37,7 +37,7 @@ def main(args): if args.wavfile and os.path.exists(args.wavfile): logger.info(f"start to process the wavscp: {args.wavfile}") result = loop.run_until_complete(handler.run(args.wavfile)) - # result = result["result"] + result = result["result"] logger.info(f"asr websocket client finished : {result}") # support to process batch audios from wav.scp diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 8afb0f5c..3111badf 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -285,8 +285,10 @@ class VectorExecutor(BaseExecutor): Defaults to None. ckpt_path (Optional[os.PathLike], optional): the pretrained model path, which is stored in the disk. Defaults to None. 
+ task (str, optional): the model task type """ # stage 0: avoid to init the mode again + self.task = task if hasattr(self, "model"): logger.info("Model has been initialized") return @@ -435,6 +437,7 @@ class VectorExecutor(BaseExecutor): if self.sample_rate != 16000 and self.sample_rate != 8000: logger.error( "invalid sample rate, please input --sr 8000 or --sr 16000") + logger.error(f"The model sample rate: {self.sample_rate}, the external sample rate is: {sample_rate}") return False if isinstance(audio_file, (str, os.PathLike)): diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 32f78942..cd1cd51a 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -602,7 +602,11 @@ class VectorClientExecutor(BaseExecutor): default=None, help='sentence to be process by text server.') self.parser.add_argument( - '--task', type=str, default="spk", help="The vector service task") + '--task', + type=str, + default="spk", + choices=["spk", "score"], + help="The vector service task") self.parser.add_argument( "--enroll", type=str, default=None, help="The enroll audio") self.parser.add_argument( diff --git a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py index 866c2229..2fd8dec6 100644 --- a/paddlespeech/server/engine/vector/python/vector_engine.py +++ b/paddlespeech/server/engine/vector/python/vector_engine.py @@ -99,8 +99,8 @@ class PaddleVectorConnectionHandler: """extract the audio embedding Args: - audio (_type_): _description_ - sample_rate (int, optional): _description_. Defaults to 16000. + audio (str): the audio data + sample_rate (int, optional): the audio sample rate. Defaults to 16000. 
""" # we can not reuse the cache io.BytesIO(audio) data, # because the soundfile will change the io.BytesIO(audio) to the end diff --git a/paddlespeech/server/restful/request.py b/paddlespeech/server/restful/request.py index 4e88280a..b7a32481 100644 --- a/paddlespeech/server/restful/request.py +++ b/paddlespeech/server/restful/request.py @@ -115,7 +115,7 @@ class VectorScoreRequest(BaseModel): { "enroll_audio": "exSI6ICJlbiIsCgkgICAgInBvc2l0aW9uIjogImZhbHNlIgoJf...", "test_audio": "exSI6ICJlbiIsCgkgICAgInBvc2l0aW9uIjogImZhbHNlIgoJf...", - "task": "spk", + "task": "score", "audio_format": "wav", "sample_rate": 16000, } From 6b53ca7f87dd92d36d2f39bba623231c9c63b4ac Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Thu, 5 May 2022 17:39:18 +0800 Subject: [PATCH 11/93] add fbank script --- speechx/examples/ds2_ol/aishell/run_fbank.sh | 170 +++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100755 speechx/examples/ds2_ol/aishell/run_fbank.sh diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh new file mode 100755 index 00000000..8446cbca --- /dev/null +++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh @@ -0,0 +1,170 @@ +#!/bin/bash +set +x +set -e + +. path.sh + +nj=40 +stage=0 +stop_stage=5 + +. utils/parse_options.sh + +# 1. compile +if [ ! -d ${SPEECHX_EXAMPLES} ]; then + pushd ${SPEECHX_ROOT} + bash build.sh + popd +fi + +# input +mkdir -p data +data=$PWD/data + +ckpt_dir=$data/fbank_model +model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/ +vocb_dir=$ckpt_dir/data/lang_char/ + +# output +mkdir -p exp +exp=$PWD/exp + +lm=$data/zh_giga.no_cna_cmn.prune01244.klm +aishell_wav_scp=aishell_test.scp +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then + if [ ! 
-d $data/test ]; then + pushd $data + wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip + unzip aishell_test.zip + popd + + realpath $data/test/*/*.wav > $data/wavlist + awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id + paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp + fi + + if [ ! -f $ckpt_dir/data/mean_std.json ]; then + mkdir -p $ckpt_dir + pushd $ckpt_dir + wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/WIP1_asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz + tar xzfv WIP1_asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz + popd + fi + + if [ ! -f $lm ]; then + pushd $data + wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm + popd + fi +fi + +# 3. make feature +text=$data/test/text +label_file=./aishell_result_fbank +wer=./aishell_wer_fbank + +export GLOG_logtostderr=1 + + +cmvn=$data/cmvn_fbank.ark +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # 3. 
gen linear feat + cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn --binary=false + + ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj + + utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat.log \ + compute_fbank_main \ + --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ + --feature_wspecifier=ark,scp:$data/split${nj}/JOB/fbank_feat.ark,$data/split${nj}/JOB/fbank_feat.scp \ + --cmvn_file=$cmvn \ + --streaming_chunk=36 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # recognizer + utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wolm.log \ + ctc-prefix-beam-search-decoder-ol \ + --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \ + --model_path=$model_dir/avg_5.jit.pdmodel \ + --param_path=$model_dir/avg_5.jit.pdiparams \ + --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ + --model_cache_shapes="5-1-2048,5-1-2048" \ + --dict_file=$vocb_dir/vocab.txt \ + --result_wspecifier=ark,t:$data/split${nj}/JOB/result_fbank + + cat $data/split${nj}/*/result_fbank > $exp/${label_file} + utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file} > $exp/${wer} +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # decode with lm + utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.lm.log \ + ctc-prefix-beam-search-decoder-ol \ + --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \ + --model_path=$model_dir/avg_5.jit.pdmodel \ + --param_path=$model_dir/avg_5.jit.pdiparams \ + --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ + --model_cache_shapes="5-1-2048,5-1-2048" \ + --dict_file=$vocb_dir/vocab.txt \ + --lm_path=$lm \ + --result_wspecifier=ark,t:$data/split${nj}/JOB/fbank_result_lm + + cat $data/split${nj}/*/fbank_result_lm > $exp/${label_file}_lm + utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_lm > $exp/${wer}.lm +fi + +wfst=$data/wfst_fbank/ +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 
4 ]; then + mkdir -p $wfst + if [ ! -f $wfst/aishell_graph.zip ]; then + pushd $wfst + wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph2.zip + unzip aishell_graph2.zip + mv aishell_graph2/* $wfst + popd + fi +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # TLG decoder + utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wfst.log \ + wfst-decoder-ol \ + --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \ + --model_path=$model_dir/avg_5.jit.pdmodel \ + --param_path=$model_dir/avg_5.jit.pdiparams \ + --word_symbol_table=$wfst/words.txt \ + --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ + --lm_path=$lm \ + --graph_path=$wfst/TLG.fst --max_active=7500 \ + --acoustic_scale=1.2 \ + --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg + + cat $data/split${nj}/*/result_tlg > $exp/${label_file}_tlg + utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_tlg > $exp/${wer}.tlg + echo "wfst-decoder-ol have finished!!!" + echo "please checkout in ${exp}/${wer}.tlg" +fi + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + utils/run.pl JOB=1:$nj $data/split${nj}/JOB/fbank_recognizer.log \ + recognizer_test_main \ + --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ + --cmvn_file=$cmvn \ + --model_path=$model_dir/avg_5.jit.pdmodel \ + --streaming_chunk=30 \ + --use_fbank=true \ + --to_float32=false \ + --param_path=$model_dir/avg_5.jit.pdiparams \ + --word_symbol_table=$graph_dir/words.txt \ + --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ + --model_cache_shapes="5-1-2048,5-1-2048" \ + --graph_path=$graph_dir/TLG.fst --max_active=7500 \ + --acoustic_scale=1.2 \ + --result_wspecifier=ark,t:./result_fbank_recognizer + + cat $data/split${nj}/*/result_recognizer > $exp/${label_file}_recognizer + utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_recognizer > $exp/${wer}.recognizer + echo "recognizer test have finished!!!" 
+ echo "please checkout in ${exp}/${wer}.recognizer" +fi From 624ab2c57afafef0260230e707ebe849308bf82f Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 5 May 2022 11:08:00 +0000 Subject: [PATCH 12/93] update asr1 config --- examples/aishell/asr1/conf/chunk_conformer.yaml | 6 +++--- examples/aishell/asr1/conf/conformer.yaml | 2 +- examples/aishell/asr1/conf/transformer.yaml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/aishell/asr1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml index 3cfe9b1b..b389e367 100644 --- a/examples/aishell/asr1/conf/chunk_conformer.yaml +++ b/examples/aishell/asr1/conf/chunk_conformer.yaml @@ -10,7 +10,7 @@ encoder_conf: attention_heads: 4 linear_units: 2048 # the number of units of position-wise feed forward num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 + dropout_rate: 0.1 # sublayer output dropout positional_dropout_rate: 0.1 attention_dropout_rate: 0.0 input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 @@ -30,7 +30,7 @@ decoder_conf: attention_heads: 4 linear_units: 2048 num_blocks: 6 - dropout_rate: 0.1 + dropout_rate: 0.1 # sublayer output dropout positional_dropout_rate: 0.1 self_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0 @@ -39,7 +39,7 @@ model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option length_normalized_loss: false - init_type: 'kaiming_uniform' + init_type: 'kaiming_uniform' # !Warning: need to convergence ########################################### # Data # diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml index a150a04d..2419d07a 100644 --- a/examples/aishell/asr1/conf/conformer.yaml +++ b/examples/aishell/asr1/conf/conformer.yaml @@ -37,7 +37,7 @@ model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option length_normalized_loss: false - init_type: 'kaiming_uniform' + init_type: 'kaiming_uniform' # !Warning: need to 
convergence ########################################### # Data # diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml index 9e08ea0e..4e068420 100644 --- a/examples/aishell/asr1/conf/transformer.yaml +++ b/examples/aishell/asr1/conf/transformer.yaml @@ -10,7 +10,7 @@ encoder_conf: attention_heads: 4 linear_units: 2048 # the number of units of position-wise feed forward num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 + dropout_rate: 0.1 # sublayer output dropout positional_dropout_rate: 0.1 attention_dropout_rate: 0.0 input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 @@ -21,7 +21,7 @@ decoder_conf: attention_heads: 4 linear_units: 2048 num_blocks: 6 - dropout_rate: 0.1 + dropout_rate: 0.1 # sublayer output dropout positional_dropout_rate: 0.1 self_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0 From d1eb6269ff42fee7d3d8cbfbe0234e222cd40c54 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Thu, 5 May 2022 19:06:53 +0800 Subject: [PATCH 13/93] update the streaming asr and punc server to cpu device, test=doc --- demos/streaming_asr_server/README.md | 11 ++++++++--- demos/streaming_asr_server/README_cn.md | 8 ++++++-- demos/streaming_asr_server/conf/punc_application.yaml | 2 +- .../conf/ws_conformer_application.yaml | 4 ++-- demos/streaming_asr_server/test.sh | 0 5 files changed, 17 insertions(+), 8 deletions(-) mode change 100644 => 100755 demos/streaming_asr_server/test.sh diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index 48cfbaf3..d693dc41 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -29,7 +29,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ### 3. 
Server Usage - Command Line (Recommended) - + **Note:** The default deployment of the server is on the 'CPU' device, which can be deployed on the 'GPU' by modifying the 'device' parameter in the service configuration file. ```bash # in PaddleSpeech/demos/streaming_asr_server start the service paddlespeech_server start --config_file ./conf/ws_conformer_application.yaml @@ -110,6 +110,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ``` - Python API + **Note:** The default deployment of the server is on the 'CPU' device, which can be deployed on the 'GPU' by modifying the 'device' parameter in the service configuration file. ```python # in PaddleSpeech/demos/streaming_asr_server directory from paddlespeech.server.bin.paddlespeech_server import ServerExecutor @@ -361,8 +362,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ## Punctuation service ### 1. Server usage - + - Command Line + **Note:** The default deployment of the server is on the 'CPU' device, which can be deployed on the 'GPU' by modifying the 'device' parameter in the service configuration file. ``` bash In PaddleSpeech/demos/streaming_asr_server directory to lanuch punctuation service paddlespeech_server start --config_file conf/punc_application.yaml @@ -401,7 +403,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ``` - Python API - + **Note:** The default deployment of the server is on the 'CPU' device, which can be deployed on the 'GPU' by modifying the 'device' parameter in the service configuration file. 
```python # 在 PaddleSpeech/demos/streaming_asr_server 目录 from paddlespeech.server.bin.paddlespeech_server import ServerExecutor @@ -467,6 +469,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ## Join streaming asr and punctuation server + +By default, each server is deployed on the 'CPU' device and speech recognition and punctuation prediction can be deployed on different 'GPU' by modifying the' device 'parameter in the service configuration file respectively. + We use `streaming_ asr_server.py` and `punc_server.py` two services to lanuch streaming speech recognition and punctuation prediction services respectively. And the `websocket_client.py` script can be used to call streaming speech recognition and punctuation prediction services at the same time. ### 1. Start two server diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index 67f62860..db9cbb5e 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -36,7 +36,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ### 3. 服务端使用方法 - 命令行 (推荐使用) - + **注意:** 默认部署在 `cpu` 设备上,可以通过修改服务配置文件中 `device` 参数部署在 `gpu` 上。 ```bash # 在 PaddleSpeech/demos/streaming_asr_server 目录启动服务 paddlespeech_server start --config_file ./conf/ws_conformer_application.yaml @@ -117,6 +117,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ``` - Python API + **注意:** 默认部署在 `cpu` 设备上,可以通过修改服务配置文件中 `device` 参数部署在 `gpu` 上。 ```python # 在 PaddleSpeech/demos/streaming_asr_server 目录 from paddlespeech.server.bin.paddlespeech_server import ServerExecutor @@ -371,6 +372,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ### 1. 
服务端使用方法 - 命令行 + **注意:** 默认部署在 `cpu` 设备上,可以通过修改服务配置文件中 `device` 参数部署在 `gpu` 上。 ``` bash 在 PaddleSpeech/demos/streaming_asr_server 目录下启动标点预测服务 paddlespeech_server start --config_file conf/punc_application.yaml @@ -410,7 +412,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ``` - Python API - + **注意:** 默认部署在 `cpu` 设备上,可以通过修改服务配置文件中 `device` 参数部署在 `gpu` 上。 ```python # 在 PaddleSpeech/demos/streaming_asr_server 目录 from paddlespeech.server.bin.paddlespeech_server import ServerExecutor @@ -476,6 +478,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ## 联合流式语音识别和标点预测 +**注意:** By default, each server is deployed on the 'CPU' device. Voice recognition and punctuation prediction can be deployed on different 'GPUs' by modifying the' device 'parameter in the service configuration file. + 使用 `streaming_asr_server.py` 和 `punc_server.py` 两个服务,分别启动流式语音识别和标点预测服务。调用 `websocket_client.py` 脚本可以同时调用流式语音识别和标点预测服务。 ### 1. 启动服务 diff --git a/demos/streaming_asr_server/conf/punc_application.yaml b/demos/streaming_asr_server/conf/punc_application.yaml index e0d06871..f947525e 100644 --- a/demos/streaming_asr_server/conf/punc_application.yaml +++ b/demos/streaming_asr_server/conf/punc_application.yaml @@ -28,7 +28,7 @@ text_python: cfg_path: # [optional] ckpt_path: # [optional] vocab_file: # [optional] - device: gpu:0 # set 'gpu:id' or 'cpu' + device: 'cpu' # set 'gpu:id' or 'cpu' diff --git a/demos/streaming_asr_server/conf/ws_conformer_application.yaml b/demos/streaming_asr_server/conf/ws_conformer_application.yaml index 42473555..20a50008 100644 --- a/demos/streaming_asr_server/conf/ws_conformer_application.yaml +++ b/demos/streaming_asr_server/conf/ws_conformer_application.yaml @@ -29,7 +29,7 @@ asr_online: cfg_path: decode_method: force_yes: True - device: gpu:3 # cpu or gpu:id + device: 'cpu' # cpu or gpu:id am_predictor_conf: device: # set 'gpu:id' or 'cpu' switch_ir_optim: True @@ -42,4 +42,4 @@ asr_online: window_ms: 25 # ms shift_ms: 10 # ms 
sample_rate: 16000 - sample_width: 2 \ No newline at end of file + sample_width: 2 diff --git a/demos/streaming_asr_server/test.sh b/demos/streaming_asr_server/test.sh old mode 100644 new mode 100755 From 0e2372edd20fde7331b7618302babf358e7c6c5b Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Thu, 5 May 2022 19:43:12 +0800 Subject: [PATCH 14/93] update readme_cn.md, test=doc --- demos/streaming_asr_server/README_cn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index db9cbb5e..b768c435 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -478,7 +478,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ## 联合流式语音识别和标点预测 -**注意:** By default, each server is deployed on the 'CPU' device. Voice recognition and punctuation prediction can be deployed on different 'GPUs' by modifying the' device 'parameter in the service configuration file. +**注意:** 默认部署在 `cpu` 设备上,可以通过修改服务配置文件中 `device` 参数将语音识别和标点预测部署在不同的 `gpu` 上。 使用 `streaming_asr_server.py` 和 `punc_server.py` 两个服务,分别启动流式语音识别和标点预测服务。调用 `websocket_client.py` 脚本可以同时调用流式语音识别和标点预测服务。 From a93de1810d45d87a3187e8558eba68b3c3747200 Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Thu, 5 May 2022 20:51:54 +0800 Subject: [PATCH 15/93] fix typo --- speechx/examples/ds2_ol/aishell/run_fbank.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh index 8446cbca..246e3be4 100755 --- a/speechx/examples/ds2_ol/aishell/run_fbank.sh +++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh @@ -5,7 +5,7 @@ set -e . path.sh nj=40 -stage=0 +stage=4 stop_stage=5 . utils/parse_options.sh @@ -117,7 +117,7 @@ fi wfst=$data/wfst_fbank/ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then mkdir -p $wfst - if [ ! -f $wfst/aishell_graph.zip ]; then + if [ ! 
-f $wfst/aishell_graph2.zip ]; then pushd $wfst wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph2.zip unzip aishell_graph2.zip @@ -135,7 +135,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then --param_path=$model_dir/avg_5.jit.pdiparams \ --word_symbol_table=$wfst/words.txt \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ - --lm_path=$lm \ + --model_cache_shapes="5-1-2048,5-1-2048" \ --graph_path=$wfst/TLG.fst --max_active=7500 \ --acoustic_scale=1.2 \ --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg From b7a77eebcaad28ef9036b25dd0f9f6fbf030e0f5 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Thu, 5 May 2022 23:22:24 +0800 Subject: [PATCH 16/93] update the time stamp type, test=doc --- .../server/engine/asr/online/asr_engine.py | 20 ++++++++++----- .../server/engine/asr/online/ctc_search.py | 25 +++++++++---------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index a2559816..99d34a30 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -13,7 +13,6 @@ # limitations under the License. 
import copy import os -import time from typing import Optional import numpy as np @@ -297,7 +296,8 @@ class PaddleASRConnectionHanddler: self.chunk_num = 0 self.global_frame_offset = 0 self.result_transcripts = [''] - self.word_time_stamp = None + self.word_time_stamp = [] + self.time_stamp = [] self.first_char_occur_elapsed = None def decode(self, is_finished=False): @@ -515,10 +515,7 @@ class PaddleASRConnectionHanddler: return '' def get_word_time_stamp(self): - if self.word_time_stamp is None: - return [] - else: - return self.word_time_stamp + return self.word_time_stamp @paddle.no_grad() def rescoring(self): @@ -582,7 +579,18 @@ class PaddleASRConnectionHanddler: best_index = i # update the one best result + # hyps stored the beam results and each fields is: + logger.info(f"best index: {best_index}") + # logger.info(f'best result: {hyps[best_index]}') + # the field of the hyps is: + # hyps[0][0]: the sentence word-id in the vocab with a tuple + # hyps[0][1]: the sentence decoding probability with all paths + # hyps[0][2]: viterbi_blank ending probability + # hyps[0][3]: viterbi_non_blank probability + # hyps[0][4]: current_token_prob, + # hyps[0][5]: times_viterbi_blank, + # hyps[0][6]: times_titerbi_non_blank self.hyps = [hyps[best_index][0]] # update the hyps time stamp diff --git a/paddlespeech/server/engine/asr/online/ctc_search.py b/paddlespeech/server/engine/asr/online/ctc_search.py index 3a808587..4c9ac3ac 100644 --- a/paddlespeech/server/engine/asr/online/ctc_search.py +++ b/paddlespeech/server/engine/asr/online/ctc_search.py @@ -27,7 +27,7 @@ class CTCPrefixBeamSearch: """Implement the ctc prefix beam search Args: - config (yacs.config.CfgNode): _description_ + config (yacs.config.CfgNode): the ctc prefix beam search configuration """ self.config = config self.reset() @@ -69,7 +69,6 @@ class CTCPrefixBeamSearch: # 2. 
CTC beam search step by step for t in range(0, maxlen): logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) # next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) next_hyps = defaultdict( lambda: (-float('inf'), -float('inf'), -float('inf'), -float('inf'), -float('inf'), [], [])) @@ -80,7 +79,7 @@ class CTCPrefixBeamSearch: for s in top_k_index: s = s.item() ps = logp[s].item() - for prefix, (pb, pnb, v_s, v_ns, cur_token_prob, times_s, + for prefix, (pb, pnb, v_b_s, v_nb_s, cur_token_prob, times_s, times_ns) in self.cur_hyps: last = prefix[-1] if len(prefix) > 0 else None if s == blank_id: # blank @@ -88,9 +87,9 @@ class CTCPrefixBeamSearch: prefix] n_pb = log_add([n_pb, pb + ps, pnb + ps]) - pre_times = times_s if v_s > v_ns else times_ns + pre_times = times_s if v_b_s > v_nb_s else times_ns n_times_s = copy.deepcopy(pre_times) - viterbi_score = v_s if v_s > v_ns else v_ns + viterbi_score = v_b_s if v_b_s > v_nb_s else v_nb_s n_v_s = viterbi_score + ps next_hyps[prefix] = (n_pb, n_pnb, n_v_s, n_v_ns, n_cur_token_prob, n_times_s, @@ -101,8 +100,8 @@ class CTCPrefixBeamSearch: n_pb, n_pnb, n_v_s, n_v_ns, n_cur_token_prob, n_times_s, n_times_ns = next_hyps[ prefix] n_pnb = log_add([n_pnb, pnb + ps]) - if n_v_ns < v_ns + ps: - n_v_ns = v_ns + ps + if n_v_ns < v_nb_s + ps: + n_v_ns = v_nb_s + ps if n_cur_token_prob < ps: n_cur_token_prob = ps n_times_ns = copy.deepcopy(times_ns) @@ -117,8 +116,8 @@ class CTCPrefixBeamSearch: n_prefix = prefix + (s, ) n_pb, n_pnb, n_v_s, n_v_ns, n_cur_token_prob, n_times_s, n_times_ns = next_hyps[ n_prefix] - if n_v_ns < v_s + ps: - n_v_ns = v_s + ps + if n_v_ns < v_b_s + ps: + n_v_ns = v_b_s + ps n_cur_token_prob = ps n_times_ns = copy.deepcopy(times_s) n_times_ns.append(self.abs_time_step) @@ -129,10 +128,10 @@ class CTCPrefixBeamSearch: else: # Case 3: *a + b => *ab, *aε + b => *ab n_prefix = prefix + (s, ) - n_pb, n_pnb, n_v_s, n_v_ns, n_cur_token_prob, n_times_s, n_times_n = 
next_hyps[ + n_pb, n_pnb, n_v_s, n_v_ns, n_cur_token_prob, n_times_s, n_times_ns = next_hyps[ n_prefix] - viterbi_score = v_s if v_s > v_ns else v_ns - pre_times = times_s if v_s > v_ns else times_ns + viterbi_score = v_b_s if v_b_s > v_nb_s else v_nb_s + pre_times = times_s if v_b_s > v_nb_s else times_ns if n_v_ns < viterbi_score + ps: n_v_ns = viterbi_score + ps n_cur_token_prob = ps @@ -153,7 +152,7 @@ class CTCPrefixBeamSearch: # 2.3 update the absolute time step self.abs_time_step += 1 - # self.hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in self.cur_hyps] + self.hyps = [(y[0], log_add([y[1][0], y[1][1]]), y[1][2], y[1][3], y[1][4], y[1][5], y[1][6]) for y in self.cur_hyps] From 5d5266abff63a32c8f1c97351a299371b4b40abc Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 6 May 2022 02:36:37 +0000 Subject: [PATCH 17/93] rm to_float32 flags, default is fbank --- speechx/examples/ds2_ol/aishell/run.sh | 1 - .../ds2_ol/decoder/recognizer_test_main.cc | 4 ++- .../ds2_ol/feat/compute_fbank_main.cc | 1 + .../feat/linear-spectrogram-wo-db-norm-ol.cc | 3 +- .../ds2_ol/websocket/websocket_server.sh | 1 - speechx/speechx/decoder/param.h | 31 ++++++++++++------- .../speechx/frontend/audio/feature_pipeline.h | 6 ++-- 7 files changed, 28 insertions(+), 19 deletions(-) diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index b44200b0..650cb140 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -155,7 +155,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --cmvn_file=$cmvn \ --model_path=$model_dir/avg_1.jit.pdmodel \ - --to_float32=true \ --streaming_chunk=30 \ --param_path=$model_dir/avg_1.jit.pdiparams \ --word_symbol_table=$wfst/words.txt \ diff --git a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc index 00764f53..476fac05 100644 --- 
a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc +++ b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc @@ -19,6 +19,7 @@ DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); +DEFINE_int32(sample_rate, 16000, "sample rate"); int main(int argc, char* argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, false); @@ -30,7 +31,8 @@ int main(int argc, char* argv[]) { kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); - int sample_rate = 16000; + + int sample_rate = FLAGS_sample_rate; float streaming_chunk = FLAGS_streaming_chunk; int chunk_sample_size = streaming_chunk * sample_rate; LOG(INFO) << "sr: " << sample_rate; diff --git a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc index 7beaa587..67683eeb 100644 --- a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc +++ b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc @@ -69,6 +69,7 @@ int main(int argc, char* argv[]) { feat_cache_opts.frame_chunk_stride = 1; feat_cache_opts.frame_chunk_size = 1; ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn)); + LOG(INFO) << "fbank: " << true; LOG(INFO) << "feat dim: " << feature_cache.Dim(); int sample_rate = 16000; diff --git a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc index c3652ad4..bbf0e690 100644 --- a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc +++ b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc @@ -56,6 +56,7 @@ int main(int argc, char* argv[]) { opt.frame_opts.remove_dc_offset = false; opt.frame_opts.window_type = "hanning"; opt.frame_opts.preemph_coeff = 0.0; + LOG(INFO) << "linear feature: " << true; LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms; LOG(INFO) << "frame shift (ms): " << 
opt.frame_opts.frame_shift_ms; @@ -77,7 +78,7 @@ int main(int argc, char* argv[]) { int sample_rate = 16000; float streaming_chunk = FLAGS_streaming_chunk; int chunk_sample_size = streaming_chunk * sample_rate; - LOG(INFO) << "sr: " << sample_rate; + LOG(INFO) << "sample rate: " << sample_rate; LOG(INFO) << "chunk size (s): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; diff --git a/speechx/examples/ds2_ol/websocket/websocket_server.sh b/speechx/examples/ds2_ol/websocket/websocket_server.sh index 0e389f89..fc57e326 100755 --- a/speechx/examples/ds2_ol/websocket/websocket_server.sh +++ b/speechx/examples/ds2_ol/websocket/websocket_server.sh @@ -63,7 +63,6 @@ websocket_server_main \ --cmvn_file=$cmvn \ --model_path=$model_dir/avg_1.jit.pdmodel \ --streaming_chunk=0.1 \ - --to_float32=true \ --param_path=$model_dir/avg_1.jit.pdiparams \ --word_symbol_table=$wfst/words.txt \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 85de08ca..9905bc6e 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -19,23 +19,23 @@ #include "decoder/ctc_tlg_decoder.h" #include "frontend/audio/feature_pipeline.h" +// feature +DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); +// DEFINE_bool(to_float32, true, "audio convert to pcm32. 
True for linear feature, or fbank"); +DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_string(cmvn_file, "", "read cmvn"); DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size"); -DEFINE_bool(to_float32, true, "audio convert to pcm32"); -DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); -DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); -DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); -DEFINE_string(graph_path, "TLG", "decoder graph"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); -DEFINE_int32(max_active, 7500, "max active"); -DEFINE_double(beam, 15.0, "decoder beam"); -DEFINE_double(lattice_beam, 7.5, "decoder beam"); +// feature sliding window DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=5) downsampling module."); DEFINE_int32(downsampling_rate, 4, "two CNN(kernel=5) module downsampling rate."); + +// nnet +DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); +DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); DEFINE_string( model_input_names, "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box", @@ -47,8 +47,14 @@ DEFINE_string(model_cache_names, "chunk_state_h_box,chunk_state_c_box", "model cache names"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); -DEFINE_bool(use_fbank, false, "use fbank or linear feature"); -DEFINE_int32(num_bins, 161, "num bins of mel"); + +// decoder +DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); +DEFINE_string(graph_path, "TLG", "decoder graph"); +DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); +DEFINE_int32(max_active, 7500, "max active"); +DEFINE_double(beam, 15.0, "decoder beam"); +DEFINE_double(lattice_beam, 7.5, "decoder beam"); namespace ppspeech { // todo refactor later @@ -56,17 +62,18 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { FeaturePipelineOptions opts; opts.cmvn_file = 
FLAGS_cmvn_file; opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk; - opts.to_float32 = FLAGS_to_float32; kaldi::FrameExtractionOptions frame_opts; frame_opts.dither = 0.0; frame_opts.frame_shift_ms = 10; opts.use_fbank = FLAGS_use_fbank; if (opts.use_fbank) { + opts.to_float32 = false; frame_opts.window_type = "povey"; frame_opts.frame_length_ms = 25; opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; opts.fbank_opts.fbank_opts.frame_opts = frame_opts; } else { + opts.to_float32 = true; frame_opts.remove_dc_offset = false; frame_opts.frame_length_ms = 20; frame_opts.window_type = "hanning"; diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 4868d37e..1acf62a9 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -28,15 +28,15 @@ namespace ppspeech { struct FeaturePipelineOptions { std::string cmvn_file; - bool to_float32; + bool to_float32; // true, only for linear feature bool use_fbank; LinearSpectrogramOptions linear_spectrogram_opts; FbankOptions fbank_opts; FeatureCacheOptions feature_cache_opts; FeaturePipelineOptions() : cmvn_file(""), - to_float32(false), - use_fbank(false), + to_float32(false), // true, only for linear feature + use_fbank(true), linear_spectrogram_opts(), fbank_opts(), feature_cache_opts() {} From 8522b8299971e1d86ae6e474f656ea69c25f0060 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 6 May 2022 02:40:21 +0000 Subject: [PATCH 18/93] format --- demos/streaming_asr_server/README.md | 2 +- demos/streaming_asr_server/README_cn.md | 2 +- paddlespeech/cli/vector/infer.py | 4 ++- paddlespeech/server/README_cn.md | 2 +- paddlespeech/server/engine/vector/__init__.py | 13 ++++++++++ .../server/engine/vector/python/__init__.py | 13 ++++++++++ .../engine/vector/python/vector_engine.py | 2 +- .../ds2_ol/decoder/recognizer_test_main.cc | 2 +- speechx/speechx/decoder/param.h | 25 
++++++++++--------- speechx/speechx/frontend/audio/fbank.cc | 11 +++++--- .../frontend/audio/feature_pipeline.cc | 10 ++++---- .../speechx/frontend/audio/feature_pipeline.h | 6 ++--- 12 files changed, 62 insertions(+), 30 deletions(-) diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index d693dc41..6808de5e 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -630,4 +630,4 @@ bash server.sh [2022-05-02 18:29:26,566] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康。 ``` - \ No newline at end of file + diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index b768c435..5fa81d4b 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -638,4 +638,4 @@ bash server.sh [2022-05-02 18:29:26,566] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康。 ``` - \ No newline at end of file + diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 3111badf..0a169f8b 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -437,7 +437,9 @@ class VectorExecutor(BaseExecutor): if self.sample_rate != 16000 and self.sample_rate != 8000: logger.error( "invalid sample rate, please input --sr 8000 or --sr 16000") - logger.error(f"The model sample rate: {self.sample_rate}, the external sample rate is: {sample_rate}") + logger.error( + f"The model sample rate: {self.sample_rate}, the external sample rate is: {sample_rate}" + ) return False if isinstance(audio_file, (str, os.PathLike)): diff --git a/paddlespeech/server/README_cn.md b/paddlespeech/server/README_cn.md index 010d3d51..a974d40f 100644 --- a/paddlespeech/server/README_cn.md +++ b/paddlespeech/server/README_cn.md @@ -82,4 +82,4 @@ paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8090 --input ``` paddlespeech_client vector --task score --server_ip 127.0.0.1 
--port 8090 --enroll 123456789.wav --test 85236145389.wav -``` \ No newline at end of file +``` diff --git a/paddlespeech/server/engine/vector/__init__.py b/paddlespeech/server/engine/vector/__init__.py index e69de29b..97043fd7 100644 --- a/paddlespeech/server/engine/vector/__init__.py +++ b/paddlespeech/server/engine/vector/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/server/engine/vector/python/__init__.py b/paddlespeech/server/engine/vector/python/__init__.py index e69de29b..97043fd7 100644 --- a/paddlespeech/server/engine/vector/python/__init__.py +++ b/paddlespeech/server/engine/vector/python/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py index 2fd8dec6..85430370 100644 --- a/paddlespeech/server/engine/vector/python/vector_engine.py +++ b/paddlespeech/server/engine/vector/python/vector_engine.py @@ -16,9 +16,9 @@ from collections import OrderedDict import numpy as np import paddle - from paddleaudio.backends import load as load_audio from paddleaudio.compliance.librosa import melspectrogram + from paddlespeech.cli.log import logger from paddlespeech.cli.vector.infer import VectorExecutor from paddlespeech.server.engine.base_engine import BaseEngine diff --git a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc index 476fac05..7aef73f7 100644 --- a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc +++ b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc @@ -31,7 +31,7 @@ int main(int argc, char* argv[]) { kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); - + int sample_rate = FLAGS_sample_rate; float streaming_chunk = FLAGS_streaming_chunk; int chunk_sample_size = streaming_chunk * sample_rate; diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 9905bc6e..b2bf1890 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -21,7 +21,8 @@ // feature DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); -// DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear feature, or fbank"); +// DEFINE_bool(to_float32, true, "audio convert to pcm32. 
True for linear +// feature, or fbank"); DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_string(cmvn_file, "", "read cmvn"); DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size"); @@ -67,18 +68,18 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { frame_opts.frame_shift_ms = 10; opts.use_fbank = FLAGS_use_fbank; if (opts.use_fbank) { - opts.to_float32 = false; - frame_opts.window_type = "povey"; - frame_opts.frame_length_ms = 25; - opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; - opts.fbank_opts.fbank_opts.frame_opts = frame_opts; + opts.to_float32 = false; + frame_opts.window_type = "povey"; + frame_opts.frame_length_ms = 25; + opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; + opts.fbank_opts.fbank_opts.frame_opts = frame_opts; } else { - opts.to_float32 = true; - frame_opts.remove_dc_offset = false; - frame_opts.frame_length_ms = 20; - frame_opts.window_type = "hanning"; - frame_opts.preemph_coeff = 0.0; - opts.linear_spectrogram_opts.frame_opts = frame_opts; + opts.to_float32 = true; + frame_opts.remove_dc_offset = false; + frame_opts.frame_length_ms = 20; + frame_opts.window_type = "hanning"; + frame_opts.preemph_coeff = 0.0; + opts.linear_spectrogram_opts.frame_opts = frame_opts; } opts.feature_cache_opts.frame_chunk_size = FLAGS_receptive_field_length; opts.feature_cache_opts.frame_chunk_stride = FLAGS_downsampling_rate; diff --git a/speechx/speechx/frontend/audio/fbank.cc b/speechx/speechx/frontend/audio/fbank.cc index a865db59..fea9032a 100644 --- a/speechx/speechx/frontend/audio/fbank.cc +++ b/speechx/speechx/frontend/audio/fbank.cc @@ -102,13 +102,16 @@ bool Fbank::Compute(const Vector& waves, Vector* feats) { // note: this online feature-extraction code does not support VTLN. 
RealFft(&window, true); kaldi::ComputePowerSpectrum(&window); - const kaldi::MelBanks &mel_bank = *(computer_.GetMelBanks(1.0)); - SubVector power_spectrum(window, 0, window.Dim() / 2 + 1); + const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0)); + SubVector power_spectrum(window, 0, window.Dim() / 2 + 1); if (!opts_.fbank_opts.use_power) { power_spectrum.ApplyPow(0.5); } - int32 mel_offset = ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1 : 0); - SubVector mel_energies(this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins); + int32 mel_offset = + ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1 + : 0); + SubVector mel_energies( + this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins); mel_bank.Compute(power_spectrum, &mel_energies); mel_energies.ApplyFloor(1e-07); mel_energies.ApplyLog(); diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 40891871..087de0f0 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -23,13 +23,13 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) { new ppspeech::AudioCache(1000 * kint16max, opts.to_float32)); unique_ptr base_feature; - + if (opts.use_fbank) { - base_feature.reset(new ppspeech::Fbank(opts.fbank_opts, - std::move(data_source))); + base_feature.reset( + new ppspeech::Fbank(opts.fbank_opts, std::move(data_source))); } else { - base_feature.reset(new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts, - std::move(data_source))); + base_feature.reset(new ppspeech::LinearSpectrogram( + opts.linear_spectrogram_opts, std::move(data_source))); } unique_ptr cmvn( diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 1acf62a9..6b9b4795 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ 
b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -18,24 +18,24 @@ #include "frontend/audio/audio_cache.h" #include "frontend/audio/data_cache.h" +#include "frontend/audio/fbank.h" #include "frontend/audio/feature_cache.h" #include "frontend/audio/frontend_itf.h" #include "frontend/audio/linear_spectrogram.h" -#include "frontend/audio/fbank.h" #include "frontend/audio/normalizer.h" namespace ppspeech { struct FeaturePipelineOptions { std::string cmvn_file; - bool to_float32; // true, only for linear feature + bool to_float32; // true, only for linear feature bool use_fbank; LinearSpectrogramOptions linear_spectrogram_opts; FbankOptions fbank_opts; FeatureCacheOptions feature_cache_opts; FeaturePipelineOptions() : cmvn_file(""), - to_float32(false), // true, only for linear feature + to_float32(false), // true, only for linear feature use_fbank(true), linear_spectrogram_opts(), fbank_opts(), From 491f2d040b9bfc04054d80dccc3680f0ae9d21af Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Fri, 6 May 2022 11:14:30 +0800 Subject: [PATCH 19/93] fix typo --- speechx/examples/ds2_ol/aishell/run_fbank.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh index 246e3be4..88c16857 100755 --- a/speechx/examples/ds2_ol/aishell/run_fbank.sh +++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh @@ -5,7 +5,7 @@ set -e . path.sh nj=40 -stage=4 +stage=0 stop_stage=5 . 
utils/parse_options.sh @@ -156,10 +156,10 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --use_fbank=true \ --to_float32=false \ --param_path=$model_dir/avg_5.jit.pdiparams \ - --word_symbol_table=$graph_dir/words.txt \ + --word_symbol_table=$wfst/words.txt \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ --model_cache_shapes="5-1-2048,5-1-2048" \ - --graph_path=$graph_dir/TLG.fst --max_active=7500 \ + --graph_path=$wfst/TLG.fst --max_active=7500 \ --acoustic_scale=1.2 \ --result_wspecifier=ark,t:./result_fbank_recognizer From a3eaf16f848ed216eb4e3d386ca61cc2ae812cdf Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Fri, 6 May 2022 11:41:30 +0800 Subject: [PATCH 20/93] fix copyright issue --- speechx/speechx/websocket/websocket_client.cc | 4 ++-- speechx/speechx/websocket/websocket_client.h | 4 ++-- speechx/speechx/websocket/websocket_server.cc | 4 ++-- speechx/speechx/websocket/websocket_server.h | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/speechx/speechx/websocket/websocket_client.cc b/speechx/speechx/websocket/websocket_client.cc index 5176dc89..6bd930b8 100644 --- a/speechx/speechx/websocket/websocket_client.cc +++ b/speechx/speechx/websocket/websocket_client.cc @@ -1,5 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. -// +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// 2022 PaddlePaddle Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at diff --git a/speechx/speechx/websocket/websocket_client.h b/speechx/speechx/websocket/websocket_client.h index df7395a7..ac0aed31 100644 --- a/speechx/speechx/websocket/websocket_client.h +++ b/speechx/speechx/websocket/websocket_client.h @@ -1,5 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. 
-// +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// 2022 PaddlePaddle Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at diff --git a/speechx/speechx/websocket/websocket_server.cc b/speechx/speechx/websocket/websocket_server.cc index 2a6b8990..28c9eca4 100644 --- a/speechx/speechx/websocket/websocket_server.cc +++ b/speechx/speechx/websocket/websocket_server.cc @@ -1,5 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. -// +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// 2022 PaddlePaddle Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at diff --git a/speechx/speechx/websocket/websocket_server.h b/speechx/speechx/websocket/websocket_server.h index 8856f5d0..9ea88282 100644 --- a/speechx/speechx/websocket/websocket_server.h +++ b/speechx/speechx/websocket/websocket_server.h @@ -1,5 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. -// +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// 2022 PaddlePaddle Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at From fc5e2e70d7191302c272a1fbf730a9b9a3e38337 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Fri, 6 May 2022 12:57:56 +0800 Subject: [PATCH 21/93] update readme, test=doc --- demos/speech_server/README.md | 11 ++++++----- demos/speech_server/README_cn.md | 19 ++++++++++--------- demos/speech_server/conf/application.yaml | 6 +++--- demos/streaming_tts_server/README.md | 4 ++++ demos/streaming_tts_server/README_cn.md | 4 ++++ paddlespeech/server/README.md | 4 +++- paddlespeech/server/README_cn.md | 1 + 7 files changed, 31 insertions(+), 18 deletions(-) diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index 0323d398..3df93238 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -10,7 +10,7 @@ This demo is an implementation of starting the voice service and accessing the s ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -It is recommended to use **paddlepaddle 2.2.1** or above. +It is recommended to use **paddlepaddle 2.2.2** or above. You can choose one way from meduim and hard to install paddlespeech. ### 2. Prepare config File @@ -18,6 +18,7 @@ The configuration file can be found in `conf/application.yaml` . Among them, `engine_list` indicates the speech engine that will be included in the service to be started, in the format of `_`. At present, the speech tasks integrated by the service include: asr (speech recognition), tts (text to sppech) and cls (audio classification). Currently the engine type supports two forms: python and inference (Paddle Inference) +**Note:** If the service can be started normally in the container, but the client access IP is unreachable, you can try to replace the `host` address in the configuration file with the local IP address. The input of ASR client demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. 
@@ -51,8 +52,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 11:17:32] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 11:17:32] [INFO] [on.py:38] Application startup complete. - INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) ``` @@ -74,8 +75,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 14:57:56] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 14:57:56] [INFO] [on.py:38] Application startup complete. - INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) ``` diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index 4a7c7447..34dcfb1f 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -1,17 +1,17 @@ -([简体中文](./README_cn.md)|English) +(简体中文|[English](./README.md)) # 语音服务 ## 介绍 -这个demo是一个启动语音服务和访问服务的实现。 它可以通过使用`paddlespeech_server` 和 `paddlespeech_client`的单个命令或 python 的几行代码来实现。 +这个demo是一个启动离线语音服务和访问服务的实现。它可以通过使用`paddlespeech_server` 和 `paddlespeech_client`的单个命令或 python 的几行代码来实现。 ## 使用方法 ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). 
-推荐使用 **paddlepaddle 2.2.1** 或以上版本。 -你可以从 medium,hard 三中方式中选择一种方式安装 PaddleSpeech。 +推荐使用 **paddlepaddle 2.2.2** 或以上版本。 +你可以从 medium,hard 两中方式中选择一种方式安装 PaddleSpeech。 ### 2. 准备配置文件 @@ -19,9 +19,10 @@ 其中,`engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 目前服务集成的语音任务有: asr(语音识别)、tts(语音合成)以及cls(音频分类)。 目前引擎类型支持两种形式:python 及 inference (Paddle Inference) +**注意:** 如果在容器里可正常启动服务,但客户端访问 ip 不可达,可尝试将配置文件中 `host` 地址换成本地 ip 地址。 -这个 ASR client 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 +ASR client 的输入为是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 可以下载此 ASR client的示例音频: ```bash @@ -52,8 +53,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 11:17:32] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 11:17:32] [INFO] [on.py:38] Application startup complete. - INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) ``` @@ -75,8 +76,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 14:57:56] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 14:57:56] [INFO] [on.py:38] Application startup complete. 
- INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) ``` diff --git a/demos/speech_server/conf/application.yaml b/demos/speech_server/conf/application.yaml index 2b1a0599..762f4af6 100644 --- a/demos/speech_server/conf/application.yaml +++ b/demos/speech_server/conf/application.yaml @@ -1,4 +1,4 @@ -# This is the parameter configuration file for PaddleSpeech Serving. +# This is the parameter configuration file for PaddleSpeech Offline Serving. ################################################################################# # SERVER SETTING # @@ -7,8 +7,8 @@ host: 127.0.0.1 port: 8090 # The task format in the engin_list is: _ -# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference'] - +# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference'] +protocol: 'http' engine_list: ['asr_python', 'tts_python', 'cls_python'] diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md index d03b9e28..5f3b92db 100644 --- a/demos/streaming_tts_server/README.md +++ b/demos/streaming_tts_server/README.md @@ -29,6 +29,8 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - When the voc model is mb_melgan, when voc_pad=14, the synthetic audio for streaming inference is consistent with the non-streaming synthetic audio; the minimum voc_pad can be set to 7, and the synthetic audio has no abnormal hearing. If the voc_pad is less than 7, the synthetic audio sounds abnormal. 
- When the voc model is hifigan, when voc_pad=20, the streaming inference synthetic audio is consistent with the non-streaming synthetic audio; when voc_pad=14, the synthetic audio has no abnormal hearing. - Inference speed: mb_melgan > hifigan; Audio quality: mb_melgan < hifigan +- **Note:** If the service can be started normally in the container, but the client access IP is unreachable, you can try to replace the `host` address in the configuration file with the local IP address. + ### 3. Streaming speech synthesis server and client using http protocol @@ -120,6 +122,7 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0 - `output`: Output wave filepath. Default: None, which means not to save the audio to the local. - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**. + `spk_id, speed, volume, sample_rate` do not take effect in streaming speech synthesis service temporarily. Output: ```bash @@ -254,6 +257,7 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0 - `output`: Output wave filepath. Default: None, which means not to save the audio to the local. - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**. + `spk_id, speed, volume, sample_rate` do not take effect in streaming speech synthesis service temporarily. 
Output: diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index e40de11b..2567b319 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -29,6 +29,8 @@ - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 - 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan +- **注意:** 如果在容器里可正常启动服务,但客户端访问 ip 不可达,可尝试将配置文件中 `host` 地址换成本地 ip 地址。 + ### 3. 使用http协议的流式语音合成服务端及客户端使用方法 #### 3.1 服务端使用方法 @@ -119,6 +121,7 @@ - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认值:0,表示与模型采样率相同 - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。 - `play`: 是否播放音频,边合成边播放, 默认值:False,表示不播放。**播放音频需要依赖pyaudio库**。 + `spk_id, speed, volume, sample_rate` 在流式语音合成服务中暂时不生效。 输出: @@ -254,6 +257,7 @@ - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认值:0,表示与模型采样率相同 - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。 - `play`: 是否播放音频,边合成边播放, 默认值:False,表示不播放。**播放音频需要依赖pyaudio库**。 + `spk_id, speed, volume, sample_rate` 在流式语音合成服务中暂时不生效。 输出: diff --git a/paddlespeech/server/README.md b/paddlespeech/server/README.md index 98ec1e28..f3dc9224 100644 --- a/paddlespeech/server/README.md +++ b/paddlespeech/server/README.md @@ -10,7 +10,9 @@ paddlespeech_server help ``` ### Start the server - First set the service-related configuration parameters, similar to `./conf/application.yaml`. Set `engine_list`, which represents the speech tasks included in the service to be started + First set the service-related configuration parameters, similar to `./conf/application.yaml`. Set `engine_list`, which represents the speech tasks included in the service to be started. + **Note:** If the service can be started normally in the container, but the client access IP is unreachable, you can try to replace the `host` address in the configuration file with the local IP address. 
+ Then start the service: ```bash paddlespeech_server start --config_file ./conf/application.yaml diff --git a/paddlespeech/server/README_cn.md b/paddlespeech/server/README_cn.md index 010d3d51..98c43c98 100644 --- a/paddlespeech/server/README_cn.md +++ b/paddlespeech/server/README_cn.md @@ -11,6 +11,7 @@ ``` ### 启动服务 首先设置服务相关配置文件,类似于 `./conf/application.yaml`,设置 `engine_list`,该值表示即将启动的服务中包含的语音任务。 + **注意:** 如果在容器里可正常启动服务,但客户端访问 ip 不可达,可尝试将配置文件中 `host` 地址换成本地 ip 地址。 然后启动服务: ```bash paddlespeech_server start --config_file ./conf/application.yaml From 1f00e243c411f2da313f8decc71c0958d094c182 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Fri, 6 May 2022 13:04:53 +0800 Subject: [PATCH 22/93] update readme, test=doc --- demos/speech_server/README_cn.md | 2 +- demos/streaming_tts_server/README.md | 6 +++--- demos/streaming_tts_server/README_cn.md | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index 34dcfb1f..8dca59ea 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -11,7 +11,7 @@ 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). 推荐使用 **paddlepaddle 2.2.2** 或以上版本。 -你可以从 medium,hard 两中方式中选择一种方式安装 PaddleSpeech。 +你可以从 medium,hard 两种方式中选择一种方式安装 PaddleSpeech。 ### 2. 准备配置文件 diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md index 5f3b92db..299aa3d2 100644 --- a/demos/streaming_tts_server/README.md +++ b/demos/streaming_tts_server/README.md @@ -10,7 +10,7 @@ This demo is an implementation of starting the streaming speech synthesis servic ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -It is recommended to use **paddlepaddle 2.2.1** or above. +It is recommended to use **paddlepaddle 2.2.2** or above. You can choose one way from meduim and hard to install paddlespeech. 
@@ -122,7 +122,7 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0 - `output`: Output wave filepath. Default: None, which means not to save the audio to the local. - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**. - `spk_id, speed, volume, sample_rate` do not take effect in streaming speech synthesis service temporarily. + - `spk_id, speed, volume, sample_rate` do not take effect in streaming speech synthesis service temporarily. Output: ```bash @@ -257,7 +257,7 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0 - `output`: Output wave filepath. Default: None, which means not to save the audio to the local. - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**. - `spk_id, speed, volume, sample_rate` do not take effect in streaming speech synthesis service temporarily. + - `spk_id, speed, volume, sample_rate` do not take effect in streaming speech synthesis service temporarily. Output: diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index 2567b319..94fcaac6 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -10,7 +10,7 @@ ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). 
-推荐使用 **paddlepaddle 2.2.1** 或以上版本。 +推荐使用 **paddlepaddle 2.2.2** 或以上版本。 你可以从 medium,hard 两种方式中选择一种方式安装 PaddleSpeech。 @@ -121,7 +121,7 @@ - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认值:0,表示与模型采样率相同 - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。 - `play`: 是否播放音频,边合成边播放, 默认值:False,表示不播放。**播放音频需要依赖pyaudio库**。 - `spk_id, speed, volume, sample_rate` 在流式语音合成服务中暂时不生效。 + - `spk_id, speed, volume, sample_rate` 在流式语音合成服务中暂时不生效。 输出: @@ -257,7 +257,7 @@ - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认值:0,表示与模型采样率相同 - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。 - `play`: 是否播放音频,边合成边播放, 默认值:False,表示不播放。**播放音频需要依赖pyaudio库**。 - `spk_id, speed, volume, sample_rate` 在流式语音合成服务中暂时不生效。 + - `spk_id, speed, volume, sample_rate` 在流式语音合成服务中暂时不生效。 输出: From 4a1cf8547cadd8865ad14171faf594f2252778ae Mon Sep 17 00:00:00 2001 From: lym0302 Date: Fri, 6 May 2022 13:48:10 +0800 Subject: [PATCH 23/93] update readme, test=doc --- demos/speech_server/README_cn.md | 2 +- demos/streaming_tts_server/README_cn.md | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index 8dca59ea..55fc6b34 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -22,7 +22,7 @@ **注意:** 如果在容器里可正常启动服务,但客户端访问 ip 不可达,可尝试将配置文件中 `host` 地址换成本地 ip 地址。 -ASR client 的输入为是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 +ASR client 的输入是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 可以下载此 ASR client的示例音频: ```bash diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index 94fcaac6..bb159503 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -16,18 +16,18 @@ ### 2. 
准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -- `protocol`表示该流式TTS服务使用的网络协议,目前支持 **http 和 websocket** 两种。 -- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 -- 流式TTS引擎的AM模型支持:**fastspeech2 以及fastspeech2_cnndecoder**; Voc 模型支持:**hifigan, mb_melgan** -- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - fastspeech2不支持流式am推理,因此am_pad与am_block对它无效 - - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 -- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - hifigan, mb_melgan 均支持流式voc 推理 - - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 - - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 +- `protocol` 表示该流式 TTS 服务使用的网络协议,目前支持 **http 和 websocket** 两种。 +- `engine_list` 表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + - 该 demo 主要介绍流式语音合成服务,因此语音任务应设置为 tts。 + - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用 onnxruntime 进行推理的引擎。其中,online-onnx 的推理速度更快。 +- 流式 TTS 引擎的 AM 模型支持:**fastspeech2 以及fastspeech2_cnndecoder**; Voc 模型支持:**hifigan, mb_melgan** +- 流式 am 推理中,每次会对一个 chunk 的数据进行推理以达到流式的效果。其中 `am_block` 表示 chunk 中的有效帧数,`am_pad` 表示一个 chunk 中 am_block 前后各加的帧数。am_pad 的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - fastspeech2 不支持流式 am 推理,因此 am_pad 与 m_block 对它无效 + - fastspeech2_cnndecoder 支持流式推理,当 am_pad=12 时,流式推理合成音频与非流式合成音频一致 +- 流式 voc 推理中,每次会对一个 chunk 的数据进行推理以达到流式的效果。其中 `voc_block` 表示chunk中的有效帧数,`voc_pad` 表示一个 chunk 中 voc_block 前后各加的帧数。voc_pad 的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - hifigan, mb_melgan 均支持流式 voc 推理 + - 当 voc 模型为 mb_melgan,当 voc_pad=14 时,流式推理合成音频与非流式合成音频一致;voc_pad 最小可以设置为7,合成音频听感上没有异常,若 voc_pad 小于7,合成音频听感上存在异常。 + - 当 voc 模型为 hifigan,当 
voc_pad=20 时,流式推理合成音频与非流式合成音频一致;当 voc_pad=14 时,合成音频听感上没有异常。 - 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan - **注意:** 如果在容器里可正常启动服务,但客户端访问 ip 不可达,可尝试将配置文件中 `host` 地址换成本地 ip 地址。 From e87495f04562e7dfa93fc952200bef3924e95fa4 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Fri, 6 May 2022 14:13:31 +0800 Subject: [PATCH 24/93] [server] update readme (#1851) * update readme, test=doc * update readme, test=doc * update readme, test=doc --- demos/speech_server/README.md | 11 +++++---- demos/speech_server/README_cn.md | 19 +++++++------- demos/speech_server/conf/application.yaml | 6 ++--- demos/streaming_tts_server/README.md | 6 ++++- demos/streaming_tts_server/README_cn.md | 30 +++++++++++++---------- paddlespeech/server/README.md | 4 ++- paddlespeech/server/README_cn.md | 1 + 7 files changed, 45 insertions(+), 32 deletions(-) diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index 0323d398..3df93238 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -10,7 +10,7 @@ This demo is an implementation of starting the voice service and accessing the s ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -It is recommended to use **paddlepaddle 2.2.1** or above. +It is recommended to use **paddlepaddle 2.2.2** or above. You can choose one way from meduim and hard to install paddlespeech. ### 2. Prepare config File @@ -18,6 +18,7 @@ The configuration file can be found in `conf/application.yaml` . Among them, `engine_list` indicates the speech engine that will be included in the service to be started, in the format of `_`. At present, the speech tasks integrated by the service include: asr (speech recognition), tts (text to sppech) and cls (audio classification). 
Currently the engine type supports two forms: python and inference (Paddle Inference) +**Note:** If the service can be started normally in the container, but the client access IP is unreachable, you can try to replace the `host` address in the configuration file with the local IP address. The input of ASR client demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. @@ -51,8 +52,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 11:17:32] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 11:17:32] [INFO] [on.py:38] Application startup complete. - INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) ``` @@ -74,8 +75,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 14:57:56] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 14:57:56] [INFO] [on.py:38] Application startup complete. 
- INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) ``` diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index 4a7c7447..55fc6b34 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -1,17 +1,17 @@ -([简体中文](./README_cn.md)|English) +(简体中文|[English](./README.md)) # 语音服务 ## 介绍 -这个demo是一个启动语音服务和访问服务的实现。 它可以通过使用`paddlespeech_server` 和 `paddlespeech_client`的单个命令或 python 的几行代码来实现。 +这个demo是一个启动离线语音服务和访问服务的实现。它可以通过使用`paddlespeech_server` 和 `paddlespeech_client`的单个命令或 python 的几行代码来实现。 ## 使用方法 ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -推荐使用 **paddlepaddle 2.2.1** 或以上版本。 -你可以从 medium,hard 三中方式中选择一种方式安装 PaddleSpeech。 +推荐使用 **paddlepaddle 2.2.2** 或以上版本。 +你可以从 medium,hard 两种方式中选择一种方式安装 PaddleSpeech。 ### 2. 准备配置文件 @@ -19,9 +19,10 @@ 其中,`engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 目前服务集成的语音任务有: asr(语音识别)、tts(语音合成)以及cls(音频分类)。 目前引擎类型支持两种形式:python 及 inference (Paddle Inference) +**注意:** 如果在容器里可正常启动服务,但客户端访问 ip 不可达,可尝试将配置文件中 `host` 地址换成本地 ip 地址。 -这个 ASR client 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 +ASR client 的输入是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 可以下载此 ASR client的示例音频: ```bash @@ -52,8 +53,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 11:17:32] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 11:17:32] [INFO] [on.py:38] Application startup complete. 
- INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) ``` @@ -75,8 +76,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 14:57:56] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 14:57:56] [INFO] [on.py:38] Application startup complete. - INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) ``` diff --git a/demos/speech_server/conf/application.yaml b/demos/speech_server/conf/application.yaml index 2b1a0599..762f4af6 100644 --- a/demos/speech_server/conf/application.yaml +++ b/demos/speech_server/conf/application.yaml @@ -1,4 +1,4 @@ -# This is the parameter configuration file for PaddleSpeech Serving. +# This is the parameter configuration file for PaddleSpeech Offline Serving. 
################################################################################# # SERVER SETTING # @@ -7,8 +7,8 @@ host: 127.0.0.1 port: 8090 # The task format in the engin_list is: _ -# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference'] - +# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference'] +protocol: 'http' engine_list: ['asr_python', 'tts_python', 'cls_python'] diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md index d03b9e28..299aa3d2 100644 --- a/demos/streaming_tts_server/README.md +++ b/demos/streaming_tts_server/README.md @@ -10,7 +10,7 @@ This demo is an implementation of starting the streaming speech synthesis servic ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -It is recommended to use **paddlepaddle 2.2.1** or above. +It is recommended to use **paddlepaddle 2.2.2** or above. You can choose one way from meduim and hard to install paddlespeech. @@ -29,6 +29,8 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - When the voc model is mb_melgan, when voc_pad=14, the synthetic audio for streaming inference is consistent with the non-streaming synthetic audio; the minimum voc_pad can be set to 7, and the synthetic audio has no abnormal hearing. If the voc_pad is less than 7, the synthetic audio sounds abnormal. - When the voc model is hifigan, when voc_pad=20, the streaming inference synthetic audio is consistent with the non-streaming synthetic audio; when voc_pad=14, the synthetic audio has no abnormal hearing. - Inference speed: mb_melgan > hifigan; Audio quality: mb_melgan < hifigan +- **Note:** If the service can be started normally in the container, but the client access IP is unreachable, you can try to replace the `host` address in the configuration file with the local IP address. + ### 3. 
Streaming speech synthesis server and client using http protocol @@ -120,6 +122,7 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0 - `output`: Output wave filepath. Default: None, which means not to save the audio to the local. - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**. + - `spk_id, speed, volume, sample_rate` do not take effect in streaming speech synthesis service temporarily. Output: ```bash @@ -254,6 +257,7 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0 - `output`: Output wave filepath. Default: None, which means not to save the audio to the local. - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**. + - `spk_id, speed, volume, sample_rate` do not take effect in streaming speech synthesis service temporarily. Output: diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index e40de11b..bb159503 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -10,25 +10,27 @@ ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -推荐使用 **paddlepaddle 2.2.1** 或以上版本。 +推荐使用 **paddlepaddle 2.2.2** 或以上版本。 你可以从 medium,hard 两种方式中选择一种方式安装 PaddleSpeech。 ### 2. 
准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -- `protocol`表示该流式TTS服务使用的网络协议,目前支持 **http 和 websocket** 两种。 -- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 -- 流式TTS引擎的AM模型支持:**fastspeech2 以及fastspeech2_cnndecoder**; Voc 模型支持:**hifigan, mb_melgan** -- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - fastspeech2不支持流式am推理,因此am_pad与am_block对它无效 - - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 -- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - hifigan, mb_melgan 均支持流式voc 推理 - - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 - - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 +- `protocol` 表示该流式 TTS 服务使用的网络协议,目前支持 **http 和 websocket** 两种。 +- `engine_list` 表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + - 该 demo 主要介绍流式语音合成服务,因此语音任务应设置为 tts。 + - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用 onnxruntime 进行推理的引擎。其中,online-onnx 的推理速度更快。 +- 流式 TTS 引擎的 AM 模型支持:**fastspeech2 以及fastspeech2_cnndecoder**; Voc 模型支持:**hifigan, mb_melgan** +- 流式 am 推理中,每次会对一个 chunk 的数据进行推理以达到流式的效果。其中 `am_block` 表示 chunk 中的有效帧数,`am_pad` 表示一个 chunk 中 am_block 前后各加的帧数。am_pad 的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - fastspeech2 不支持流式 am 推理,因此 am_pad 与 m_block 对它无效 + - fastspeech2_cnndecoder 支持流式推理,当 am_pad=12 时,流式推理合成音频与非流式合成音频一致 +- 流式 voc 推理中,每次会对一个 chunk 的数据进行推理以达到流式的效果。其中 `voc_block` 表示chunk中的有效帧数,`voc_pad` 表示一个 chunk 中 voc_block 前后各加的帧数。voc_pad 的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - hifigan, mb_melgan 均支持流式 voc 推理 + - 当 voc 模型为 mb_melgan,当 voc_pad=14 时,流式推理合成音频与非流式合成音频一致;voc_pad 最小可以设置为7,合成音频听感上没有异常,若 voc_pad 小于7,合成音频听感上存在异常。 + - 当 voc 模型为 hifigan,当 
voc_pad=20 时,流式推理合成音频与非流式合成音频一致;当 voc_pad=14 时,合成音频听感上没有异常。 - 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan +- **注意:** 如果在容器里可正常启动服务,但客户端访问 ip 不可达,可尝试将配置文件中 `host` 地址换成本地 ip 地址。 + ### 3. 使用http协议的流式语音合成服务端及客户端使用方法 #### 3.1 服务端使用方法 @@ -119,6 +121,7 @@ - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认值:0,表示与模型采样率相同 - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。 - `play`: 是否播放音频,边合成边播放, 默认值:False,表示不播放。**播放音频需要依赖pyaudio库**。 + - `spk_id, speed, volume, sample_rate` 在流式语音合成服务中暂时不生效。 输出: @@ -254,6 +257,7 @@ - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认值:0,表示与模型采样率相同 - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。 - `play`: 是否播放音频,边合成边播放, 默认值:False,表示不播放。**播放音频需要依赖pyaudio库**。 + - `spk_id, speed, volume, sample_rate` 在流式语音合成服务中暂时不生效。 输出: diff --git a/paddlespeech/server/README.md b/paddlespeech/server/README.md index 98ec1e28..f3dc9224 100644 --- a/paddlespeech/server/README.md +++ b/paddlespeech/server/README.md @@ -10,7 +10,9 @@ paddlespeech_server help ``` ### Start the server - First set the service-related configuration parameters, similar to `./conf/application.yaml`. Set `engine_list`, which represents the speech tasks included in the service to be started + First set the service-related configuration parameters, similar to `./conf/application.yaml`. Set `engine_list`, which represents the speech tasks included in the service to be started. + **Note:** If the service can be started normally in the container, but the client access IP is unreachable, you can try to replace the `host` address in the configuration file with the local IP address. 
+ Then start the service: ```bash paddlespeech_server start --config_file ./conf/application.yaml diff --git a/paddlespeech/server/README_cn.md b/paddlespeech/server/README_cn.md index a974d40f..4bd4d873 100644 --- a/paddlespeech/server/README_cn.md +++ b/paddlespeech/server/README_cn.md @@ -11,6 +11,7 @@ ``` ### 启动服务 首先设置服务相关配置文件,类似于 `./conf/application.yaml`,设置 `engine_list`,该值表示即将启动的服务中包含的语音任务。 + **注意:** 如果在容器里可正常启动服务,但客户端访问 ip 不可达,可尝试将配置文件中 `host` 地址换成本地 ip 地址。 然后启动服务: ```bash paddlespeech_server start --config_file ./conf/application.yaml From 3a7896fc9693a1467f7b9a9b20d48168893f47d6 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 6 May 2022 06:56:57 +0000 Subject: [PATCH 25/93] update cli, test=asr --- paddlespeech/cli/asr/pretrained_models.py | 16 +++++++++++++--- tests/unit/cli/test_cli.sh | 1 + 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py index 80b04aa4..7f198ad6 100644 --- a/paddlespeech/cli/asr/pretrained_models.py +++ b/paddlespeech/cli/asr/pretrained_models.py @@ -27,6 +27,16 @@ pretrained_models = { 'ckpt_path': 'exp/conformer/checkpoints/wenetspeech', }, + "conformer_online_wenetspeech-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz', + 'md5': + 'b8c02632b04da34aca88459835be54a6', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/chunk_conformer/checkpoints/avg_10', + }, "conformer_online_multicn-zh-16k": { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.0.model.tar.gz', @@ -69,13 +79,13 @@ pretrained_models = { }, "deepspeech2online_wenetspeech-zh-16k": { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/WIP_asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz', 
'md5': - 'b3ef6fcae8c0058c3c53375341ccb209', + 'e393d4d274af0f6967db24fc146e8074', 'cfg_path': 'model.yaml', 'ckpt_path': - 'exp/deepspeech2_online/checkpoints/avg_3', + 'exp/deepspeech2_online/checkpoints/avg_10', 'lm_url': 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', 'lm_md5': diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index bdf05524..e1f1853f 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -14,6 +14,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee paddlespeech asr --input ./zh.wav paddlespeech asr --model conformer_aishell --input ./zh.wav paddlespeech asr --model conformer_online_aishell --input ./zh.wav +paddlespeech asr --model conformer_online_wenetspeech --input ./zh.wav paddlespeech asr --model conformer_online_multicn --input ./zh.wav paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav paddlespeech asr --model deepspeech2offline_aishell --input ./zh.wav From 477eaa1a74800a054c91cad8c8dee69e0174fa22 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 6 May 2022 08:22:49 +0000 Subject: [PATCH 26/93] dev using 0.0.0 as version --- audio/setup.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/audio/setup.py b/audio/setup.py index ffee6f9d..ec67c81d 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -19,7 +19,7 @@ from setuptools.command.install import install from setuptools.command.test import test # set the version here -VERSION = '1.0.0a' +VERSION = '0.0.0' # Inspired by the example at https://pytest.org/latest/goodpractises.html diff --git a/setup.py b/setup.py index 912fdd6d..252c1918 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ from setuptools.command.install import install HERE = Path(os.path.abspath(os.path.dirname(__file__))) -VERSION = '1.0.0a' +VERSION = '0.0.0' base = [ "editdistance", From 677898ab96e73f0659cbdf437a2e87ae48ff3714 Mon Sep 17 00:00:00 2001 
From: KP <109694228@qq.com> Date: Fri, 6 May 2022 16:34:46 +0800 Subject: [PATCH 27/93] Add version command in cli. --- paddlespeech/cli/base_commands.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/paddlespeech/cli/base_commands.py b/paddlespeech/cli/base_commands.py index 97d5cd7f..0a26b120 100644 --- a/paddlespeech/cli/base_commands.py +++ b/paddlespeech/cli/base_commands.py @@ -47,3 +47,29 @@ class HelpCommand: print(msg) return True + + +@cli_register( + name='paddlespeech.version', + description='Show version and commit id of current package.') +class VersionCommand: + def execute(self, argv: List[str]) -> bool: + try: + from .. import __version__ + version = __version__ + except ImportError: + version = 'Not an official release' + + try: + from .. import __commit__ + commit_id = __commit__ + except ImportError: + commit_id = 'Not found' + + msg = 'Package Version:\n' + msg += ' {}\n\n'.format(version) + msg += 'Commit ID:\n' + msg += ' {}\n\n'.format(commit_id) + + print(msg) + return True From b361a738884cb7598eb54b567c0445f78b72540b Mon Sep 17 00:00:00 2001 From: lym0302 Date: Fri, 6 May 2022 19:29:33 +0800 Subject: [PATCH 28/93] improve server code, test=doc --- .../conf/tts_online_application.yaml | 8 +- .../server/bin/paddlespeech_client.py | 32 +++- paddlespeech/server/conf/application.yaml | 24 +-- .../server/conf/tts_online_application.yaml | 8 +- .../server/engine/asr/online/asr_engine.py | 79 ++------ .../engine/asr/online/pretrained_models.py | 52 +++++ .../engine/asr/paddleinference/asr_engine.py | 40 +--- .../asr/paddleinference/pretrained_models.py | 34 ++++ .../engine/cls/paddleinference/cls_engine.py | 81 +------- .../cls/paddleinference/pretrained_models.py | 58 ++++++ .../tts/online/onnx/pretrained_models.py | 69 +++++++ .../engine/tts/online/onnx/tts_engine.py | 149 +++------------ .../tts/online/python/pretrained_models.py | 73 +++++++ .../engine/tts/online/python/tts_engine.py | 142 ++------------ 
.../tts/paddleinference/pretrained_models.py | 87 +++++++++ .../engine/tts/paddleinference/tts_engine.py | 170 ++++++----------- .../server/engine/tts/python/tts_engine.py | 32 +++- paddlespeech/server/restful/tts_api.py | 2 +- .../server/tests/tts/online/http_client.py | 25 ++- .../server/tests/tts/online/ws_client.py | 23 ++- paddlespeech/server/utils/audio_handler.py | 45 ++--- paddlespeech/server/utils/util.py | 71 +++++++ paddlespeech/server/ws/tts_socket.py | 2 +- tests/unit/server/offline/change_yaml.py | 2 +- .../unit/server/offline/conf/application.yaml | 6 +- .../unit/server/offline/test_server_client.sh | 7 +- .../tts/check_server/conf/application.yaml | 8 +- .../online/tts/check_server/test_all.sh | 1 - .../check_server/tts_online_application.yaml | 8 +- .../tts/test_server/test_http_client.py | 180 +++++++----------- 30 files changed, 812 insertions(+), 706 deletions(-) create mode 100644 paddlespeech/server/engine/asr/online/pretrained_models.py create mode 100644 paddlespeech/server/engine/asr/paddleinference/pretrained_models.py create mode 100644 paddlespeech/server/engine/cls/paddleinference/pretrained_models.py create mode 100644 paddlespeech/server/engine/tts/online/onnx/pretrained_models.py create mode 100644 paddlespeech/server/engine/tts/online/python/pretrained_models.py create mode 100644 paddlespeech/server/engine/tts/paddleinference/pretrained_models.py diff --git a/demos/streaming_tts_server/conf/tts_online_application.yaml b/demos/streaming_tts_server/conf/tts_online_application.yaml index 67d4641a..714f4a68 100644 --- a/demos/streaming_tts_server/conf/tts_online_application.yaml +++ b/demos/streaming_tts_server/conf/tts_online_application.yaml @@ -43,12 +43,12 @@ tts_online: device: 'cpu' # set 'gpu:id' or 'cpu' # am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer, # when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio - am_block: 42 + am_block: 72 am_pad: 12 # 
voc_pad and voc_block voc model to streaming voc infer, # when voc model is mb_melgan_csmsc, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal # when voc model is hifigan_csmsc, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal - voc_block: 14 + voc_block: 36 voc_pad: 14 @@ -91,12 +91,12 @@ tts_online-onnx: lang: 'zh' # am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer, # when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio - am_block: 42 + am_block: 72 am_pad: 12 # voc_pad and voc_block voc model to streaming voc infer, # when voc model is mb_melgan_csmsc_onnx, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal # when voc model is hifigan_csmsc_onnx, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal - voc_block: 14 + voc_block: 36 voc_pad: 14 # voc_upsample should be same as n_shift on voc config. 
voc_upsample: 300 diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 8677279b..19bdc10b 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -31,6 +31,7 @@ from ..util import stats_wrapper from paddlespeech.cli.log import logger from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler from paddlespeech.server.utils.audio_process import wav2pcm +from paddlespeech.server.utils.util import compute_delay from paddlespeech.server.utils.util import wav2base64 __all__ = [ @@ -221,7 +222,7 @@ class TTSOnlineClientExecutor(BaseExecutor): play = args.play try: - res = self( + self( input=input_, server_ip=server_ip, port=port, @@ -257,17 +258,42 @@ class TTSOnlineClientExecutor(BaseExecutor): logger.info("tts http client start") from paddlespeech.server.utils.audio_handler import TTSHttpHandler handler = TTSHttpHandler(server_ip, port, play) - handler.run(input, spk_id, speed, volume, sample_rate, output) + first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run( + input, spk_id, speed, volume, sample_rate, output) + delay_time_list = compute_delay(receive_time_list, + chunk_duration_list) elif protocol == "websocket": from paddlespeech.server.utils.audio_handler import TTSWsHandler logger.info("tts websocket client start") handler = TTSWsHandler(server_ip, port, play) loop = asyncio.get_event_loop() - loop.run_until_complete(handler.run(input, output)) + first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete( + handler.run(input, output)) + delay_time_list = compute_delay(receive_time_list, + chunk_duration_list) else: logger.error("Please set correct protocol, http or websocket") + return False + + logger.info(f"sentence: {input}") + logger.info(f"duration: {duration} s") + logger.info(f"first response: 
{first_response} s") + logger.info(f"final response: {final_response} s") + logger.info(f"RTF: {final_response/duration}") + if output is not None: + if save_audio_success: + logger.info(f"Audio successfully saved in {output}") + else: + logger.error("Audio save failed.") + + if delay_time_list != []: + logger.info( + f"Delay situation: total number of packages: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate:{len(delay_time_list)/len(receive_time_list)}" + ) + else: + logger.info("The sentence has no delay in streaming synthesis.") @cli_client_register( diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml index b6a9942e..31a37ef0 100644 --- a/paddlespeech/server/conf/application.yaml +++ b/paddlespeech/server/conf/application.yaml @@ -1,4 +1,4 @@ -# This is the parameter configuration file for PaddleSpeech Serving. +# This is the parameter configuration file for PaddleSpeech Offline Serving.. ################################################################################# # SERVER SETTING # @@ -7,9 +7,7 @@ host: 127.0.0.1 port: 8090 # The task format in the engin_list is: _ -# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference'] -# protocol = ['websocket', 'http'] (only one can be selected). -# http only support offline engine type. 
+# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference'] protocol: 'http' engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python'] @@ -50,24 +48,6 @@ asr_inference: summary: True # False -> do not show predictor config -################### speech task: asr; engine_type: online ####################### -asr_online: - model_type: 'deepspeech2online_aishell' - am_model: # the pdmodel file of am static model [optional] - am_params: # the pdiparams file of am static model [optional] - lang: 'zh' - sample_rate: 16000 - cfg_path: - decode_method: - force_yes: True - - am_predictor_conf: - device: # set 'gpu:id' or 'cpu' - switch_ir_optim: True - glog_info: False # True -> print glog - summary: True # False -> do not show predictor config - - ################################### TTS ######################################### ################### speech task: tts; engine_type: python ####################### tts_python: diff --git a/paddlespeech/server/conf/tts_online_application.yaml b/paddlespeech/server/conf/tts_online_application.yaml index 67d4641a..714f4a68 100644 --- a/paddlespeech/server/conf/tts_online_application.yaml +++ b/paddlespeech/server/conf/tts_online_application.yaml @@ -43,12 +43,12 @@ tts_online: device: 'cpu' # set 'gpu:id' or 'cpu' # am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer, # when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio - am_block: 42 + am_block: 72 am_pad: 12 # voc_pad and voc_block voc model to streaming voc infer, # when voc model is mb_melgan_csmsc, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal # when voc model is hifigan_csmsc, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio 
sounds normal - voc_block: 14 + voc_block: 36 voc_pad: 14 @@ -91,12 +91,12 @@ tts_online-onnx: lang: 'zh' # am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer, # when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio - am_block: 42 + am_block: 72 am_pad: 12 # voc_pad and voc_block voc model to streaming voc infer, # when voc model is mb_melgan_csmsc_onnx, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal # when voc model is hifigan_csmsc_onnx, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal - voc_block: 14 + voc_block: 36 voc_pad: 14 # voc_upsample should be same as n_shift on voc config. voc_upsample: 300 diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 99d34a30..ad1e6fa3 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -20,10 +20,9 @@ import paddle from numpy import float32 from yacs.config import CfgNode +from .pretrained_models import pretrained_models from paddlespeech.cli.asr.infer import ASRExecutor -from paddlespeech.cli.asr.infer import model_alias from paddlespeech.cli.log import logger -from paddlespeech.cli.utils import download_and_decompress from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.speech import SpeechSegment @@ -40,45 +39,6 @@ from paddlespeech.server.utils.paddle_predictor import init_predictor __all__ = ['ASREngine'] -pretrained_models = { - "deepspeech2online_aishell-zh-16k": { - 'url': - 
'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz', - 'md5': - '98b87b171b7240b7cae6e07d8d0bc9be', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/deepspeech2_online/checkpoints/avg_1', - 'model': - 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel', - 'params': - 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams', - 'lm_url': - 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', - 'lm_md5': - '29e02312deb2e59b3c8686c7966d4fe3' - }, - "conformer_online_multicn-zh-16k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz', - 'md5': - '0ac93d390552336f2a906aec9e33c5fa', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/chunk_conformer/checkpoints/multi_cn', - 'model': - 'exp/chunk_conformer/checkpoints/multi_cn.pdparams', - 'params': - 'exp/chunk_conformer/checkpoints/multi_cn.pdparams', - 'lm_url': - 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', - 'lm_md5': - '29e02312deb2e59b3c8686c7966d4fe3' - }, -} - # ASR server connection process class class PaddleASRConnectionHanddler: @@ -625,24 +585,7 @@ class PaddleASRConnectionHanddler: class ASRServerExecutor(ASRExecutor): def __init__(self): super().__init__() - pass - - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. 
- """ - support_models = list(pretrained_models.keys()) - assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( - tag, '\n\t\t'.join(support_models)) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - - return decompressed_path + self.pretrained_models = pretrained_models def _init_from_path(self, model_type: str='deepspeech2online_aishell', @@ -658,20 +601,20 @@ class ASRServerExecutor(ASRExecutor): """ self.model_type = model_type self.sample_rate = sample_rate + sample_rate_str = '16k' if sample_rate == 16000 else '8k' + tag = model_type + '-' + lang + '-' + sample_rate_str if cfg_path is None or am_model is None or am_params is None: - sample_rate_str = '16k' if sample_rate == 16000 else '8k' - tag = model_type + '-' + lang + '-' + sample_rate_str logger.info(f"Load the pretrained model, tag = {tag}") res_path = self._get_pretrained_path(tag) # wenetspeech_zh self.res_path = res_path - self.cfg_path = os.path.join(res_path, - pretrained_models[tag]['cfg_path']) + self.cfg_path = os.path.join( + res_path, self.pretrained_models[tag]['cfg_path']) self.am_model = os.path.join(res_path, - pretrained_models[tag]['model']) + self.pretrained_models[tag]['model']) self.am_params = os.path.join(res_path, - pretrained_models[tag]['params']) + self.pretrained_models[tag]['params']) logger.info(res_path) else: self.cfg_path = os.path.abspath(cfg_path) @@ -699,8 +642,8 @@ class ASRServerExecutor(ASRExecutor): self.text_feature = TextFeaturizer( unit_type=self.config.unit_type, vocab=self.vocab) - lm_url = pretrained_models[tag]['lm_url'] - lm_md5 = pretrained_models[tag]['lm_md5'] + lm_url = self.pretrained_models[tag]['lm_url'] + lm_md5 = 
self.pretrained_models[tag]['lm_md5'] logger.info(f"Start to load language model {lm_url}") self.download_lm( lm_url, @@ -773,7 +716,7 @@ class ASRServerExecutor(ASRExecutor): model_name = model_type[:model_type.rindex( '_')] # model_type: {model_name}_{dataset} logger.info(f"model name: {model_name}") - model_class = dynamic_import(model_name, model_alias) + model_class = dynamic_import(model_name, self.model_alias) model_conf = self.config model = model_class.from_config(model_conf) self.model = model diff --git a/paddlespeech/server/engine/asr/online/pretrained_models.py b/paddlespeech/server/engine/asr/online/pretrained_models.py new file mode 100644 index 00000000..005977b4 --- /dev/null +++ b/paddlespeech/server/engine/asr/online/pretrained_models.py @@ -0,0 +1,52 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +pretrained_models = { + "deepspeech2online_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz', + 'md5': + '98b87b171b7240b7cae6e07d8d0bc9be', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2_online/checkpoints/avg_1', + 'model': + 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel', + 'params': + 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, + "conformer_online_multicn-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz', + 'md5': + '0ac93d390552336f2a906aec9e33c5fa', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/chunk_conformer/checkpoints/multi_cn', + 'model': + 'exp/chunk_conformer/checkpoints/multi_cn.pdparams', + 'params': + 'exp/chunk_conformer/checkpoints/multi_cn.pdparams', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, +} diff --git a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py index 1925bf1d..e275f108 100644 --- a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py +++ b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py @@ -19,6 +19,7 @@ from typing import Optional import paddle from yacs.config import CfgNode +from .pretrained_models import pretrained_models from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger from paddlespeech.cli.utils import MODEL_HOME @@ -31,32 +32,11 @@ from paddlespeech.server.utils.paddle_predictor import run_model __all__ = ['ASREngine'] -pretrained_models = { - "deepspeech2offline_aishell-zh-16k": { - 'url': - 
'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz', - 'md5': - '932c3593d62fe5c741b59b31318aa314', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/deepspeech2/checkpoints/avg_1', - 'model': - 'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel', - 'params': - 'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams', - 'lm_url': - 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', - 'lm_md5': - '29e02312deb2e59b3c8686c7966d4fe3' - }, -} - class ASRServerExecutor(ASRExecutor): def __init__(self): super().__init__() - pass + self.pretrained_models = pretrained_models def _init_from_path(self, model_type: str='wenetspeech', @@ -71,18 +51,18 @@ class ASRServerExecutor(ASRExecutor): Init model and other resources from a specific path. """ + sample_rate_str = '16k' if sample_rate == 16000 else '8k' + tag = model_type + '-' + lang + '-' + sample_rate_str if cfg_path is None or am_model is None or am_params is None: - sample_rate_str = '16k' if sample_rate == 16000 else '8k' - tag = model_type + '-' + lang + '-' + sample_rate_str res_path = self._get_pretrained_path(tag) # wenetspeech_zh self.res_path = res_path - self.cfg_path = os.path.join(res_path, - pretrained_models[tag]['cfg_path']) + self.cfg_path = os.path.join( + res_path, self.pretrained_models[tag]['cfg_path']) self.am_model = os.path.join(res_path, - pretrained_models[tag]['model']) + self.pretrained_models[tag]['model']) self.am_params = os.path.join(res_path, - pretrained_models[tag]['params']) + self.pretrained_models[tag]['params']) logger.info(res_path) logger.info(self.cfg_path) logger.info(self.am_model) @@ -109,8 +89,8 @@ class ASRServerExecutor(ASRExecutor): self.text_feature = TextFeaturizer( unit_type=self.config.unit_type, vocab=self.vocab) - lm_url = pretrained_models[tag]['lm_url'] - lm_md5 = pretrained_models[tag]['lm_md5'] + lm_url = self.pretrained_models[tag]['lm_url'] + lm_md5 = self.pretrained_models[tag]['lm_md5'] 
self.download_lm( lm_url, os.path.dirname(self.config.decode.lang_model_path), lm_md5) diff --git a/paddlespeech/server/engine/asr/paddleinference/pretrained_models.py b/paddlespeech/server/engine/asr/paddleinference/pretrained_models.py new file mode 100644 index 00000000..c4c23e38 --- /dev/null +++ b/paddlespeech/server/engine/asr/paddleinference/pretrained_models.py @@ -0,0 +1,34 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +pretrained_models = { + "deepspeech2offline_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz', + 'md5': + '932c3593d62fe5c741b59b31318aa314', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2/checkpoints/avg_1', + 'model': + 'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel', + 'params': + 'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, +} diff --git a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py index 3982effd..0906c241 100644 --- a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py +++ b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py @@ -20,83 +20,20 @@ import numpy as np import paddle import yaml +from .pretrained_models import pretrained_models from paddlespeech.cli.cls.infer import CLSExecutor from paddlespeech.cli.log import logger -from paddlespeech.cli.utils import download_and_decompress -from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.paddle_predictor import init_predictor from paddlespeech.server.utils.paddle_predictor import run_model __all__ = ['CLSEngine'] -pretrained_models = { - "panns_cnn6-32k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz', - 'md5': - 'da087c31046d23281d8ec5188c1967da', - 'cfg_path': - 'panns.yaml', - 'model_path': - 'inference.pdmodel', - 'params_path': - 'inference.pdiparams', - 'label_file': - 'audioset_labels.txt', - }, - "panns_cnn10-32k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz', - 'md5': - '5460cc6eafbfaf0f261cc75b90284ae1', - 'cfg_path': - 'panns.yaml', - 'model_path': - 'inference.pdmodel', 
- 'params_path': - 'inference.pdiparams', - 'label_file': - 'audioset_labels.txt', - }, - "panns_cnn14-32k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz', - 'md5': - 'ccc80b194821274da79466862b2ab00f', - 'cfg_path': - 'panns.yaml', - 'model_path': - 'inference.pdmodel', - 'params_path': - 'inference.pdiparams', - 'label_file': - 'audioset_labels.txt', - }, -} - class CLSServerExecutor(CLSExecutor): def __init__(self): super().__init__() - pass - - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. - """ - support_models = list(pretrained_models.keys()) - assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( - tag, '\n\t\t'.join(support_models)) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - - return decompressed_path + self.pretrained_models = pretrained_models def _init_from_path( self, @@ -113,14 +50,14 @@ class CLSServerExecutor(CLSExecutor): if cfg_path is None or model_path is None or params_path is None or label_file is None: tag = model_type + '-' + '32k' self.res_path = self._get_pretrained_path(tag) - self.cfg_path = os.path.join(self.res_path, - pretrained_models[tag]['cfg_path']) - self.model_path = os.path.join(self.res_path, - pretrained_models[tag]['model_path']) + self.cfg_path = os.path.join( + self.res_path, self.pretrained_models[tag]['cfg_path']) + self.model_path = os.path.join( + self.res_path, self.pretrained_models[tag]['model_path']) self.params_path = os.path.join( - self.res_path, pretrained_models[tag]['params_path']) - self.label_file = os.path.join(self.res_path, - 
pretrained_models[tag]['label_file']) + self.res_path, self.pretrained_models[tag]['params_path']) + self.label_file = os.path.join( + self.res_path, self.pretrained_models[tag]['label_file']) else: self.cfg_path = os.path.abspath(cfg_path) self.model_path = os.path.abspath(model_path) diff --git a/paddlespeech/server/engine/cls/paddleinference/pretrained_models.py b/paddlespeech/server/engine/cls/paddleinference/pretrained_models.py new file mode 100644 index 00000000..e4914874 --- /dev/null +++ b/paddlespeech/server/engine/cls/paddleinference/pretrained_models.py @@ -0,0 +1,58 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +pretrained_models = { + "panns_cnn6-32k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz', + 'md5': + 'da087c31046d23281d8ec5188c1967da', + 'cfg_path': + 'panns.yaml', + 'model_path': + 'inference.pdmodel', + 'params_path': + 'inference.pdiparams', + 'label_file': + 'audioset_labels.txt', + }, + "panns_cnn10-32k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz', + 'md5': + '5460cc6eafbfaf0f261cc75b90284ae1', + 'cfg_path': + 'panns.yaml', + 'model_path': + 'inference.pdmodel', + 'params_path': + 'inference.pdiparams', + 'label_file': + 'audioset_labels.txt', + }, + "panns_cnn14-32k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz', + 'md5': + 'ccc80b194821274da79466862b2ab00f', + 'cfg_path': + 'panns.yaml', + 'model_path': + 'inference.pdmodel', + 'params_path': + 'inference.pdiparams', + 'label_file': + 'audioset_labels.txt', + }, +} diff --git a/paddlespeech/server/engine/tts/online/onnx/pretrained_models.py b/paddlespeech/server/engine/tts/online/onnx/pretrained_models.py new file mode 100644 index 00000000..789f5be7 --- /dev/null +++ b/paddlespeech/server/engine/tts/online/onnx/pretrained_models.py @@ -0,0 +1,69 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# support online model +pretrained_models = { + # fastspeech2 + "fastspeech2_csmsc_onnx-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip', + 'md5': + 'fd3ad38d83273ad51f0ea4f4abf3ab4e', + 'ckpt': ['fastspeech2_csmsc.onnx'], + 'phones_dict': + 'phone_id_map.txt', + 'sample_rate': + 24000, + }, + "fastspeech2_cnndecoder_csmsc_onnx-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip', + 'md5': + '5f70e1a6bcd29d72d54e7931aa86f266', + 'ckpt': [ + 'fastspeech2_csmsc_am_encoder_infer.onnx', + 'fastspeech2_csmsc_am_decoder.onnx', + 'fastspeech2_csmsc_am_postnet.onnx', + ], + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + 'sample_rate': + 24000, + }, + + # mb_melgan + "mb_melgan_csmsc_onnx-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip', + 'md5': + '5b83ec746e8414bc29032d954ffd07ec', + 'ckpt': + 'mb_melgan_csmsc.onnx', + 'sample_rate': + 24000, + }, + + # hifigan + "hifigan_csmsc_onnx-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip', + 'md5': + '1a7dc0385875889e46952e50c0994a6b', + 'ckpt': + 'hifigan_csmsc.onnx', + 'sample_rate': + 24000, + }, +} diff --git a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py index 22c1c960..79244206 100644 --- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py @@ -20,10 +20,9 @@ from typing import Optional import numpy as np import paddle +from .pretrained_models import pretrained_models from paddlespeech.cli.log import logger from paddlespeech.cli.tts.infer import TTSExecutor -from paddlespeech.cli.utils import download_and_decompress -from paddlespeech.cli.utils import 
MODEL_HOME from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import float2pcm from paddlespeech.server.utils.onnx_infer import get_sess @@ -34,83 +33,6 @@ from paddlespeech.t2s.frontend.zh_frontend import Frontend __all__ = ['TTSEngine'] -# support online model -pretrained_models = { - # fastspeech2 - "fastspeech2_csmsc_onnx-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip', - 'md5': - 'fd3ad38d83273ad51f0ea4f4abf3ab4e', - 'ckpt': ['fastspeech2_csmsc.onnx'], - 'phones_dict': - 'phone_id_map.txt', - 'sample_rate': - 24000, - }, - "fastspeech2_cnndecoder_csmsc_onnx-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip', - 'md5': - '5f70e1a6bcd29d72d54e7931aa86f266', - 'ckpt': [ - 'fastspeech2_csmsc_am_encoder_infer.onnx', - 'fastspeech2_csmsc_am_decoder.onnx', - 'fastspeech2_csmsc_am_postnet.onnx', - ], - 'speech_stats': - 'speech_stats.npy', - 'phones_dict': - 'phone_id_map.txt', - 'sample_rate': - 24000, - }, - - # mb_melgan - "mb_melgan_csmsc_onnx-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip', - 'md5': - '5b83ec746e8414bc29032d954ffd07ec', - 'ckpt': - 'mb_melgan_csmsc.onnx', - 'sample_rate': - 24000, - }, - - # hifigan - "hifigan_csmsc_onnx-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip', - 'md5': - '1a7dc0385875889e46952e50c0994a6b', - 'ckpt': - 'hifigan_csmsc.onnx', - 'sample_rate': - 24000, - }, -} - -model_alias = { - # acoustic model - "fastspeech2": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2", - "fastspeech2_inference": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", - - # voc - "mb_melgan": - "paddlespeech.t2s.models.melgan:MelGANGenerator", - "mb_melgan_inference": - 
"paddlespeech.t2s.models.melgan:MelGANInference", - "hifigan": - "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", - "hifigan_inference": - "paddlespeech.t2s.models.hifigan:HiFiGANInference", -} - -__all__ = ['TTSEngine'] - class TTSServerExecutor(TTSExecutor): def __init__(self, am_block, am_pad, voc_block, voc_pad, voc_upsample): @@ -122,23 +44,6 @@ class TTSServerExecutor(TTSExecutor): self.voc_upsample = voc_upsample self.pretrained_models = pretrained_models - self.model_alias = model_alias - - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - #Download and returns pretrained resources path of current task. - """ - support_models = list(pretrained_models.keys()) - assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( - tag, '\n\t\t'.join(support_models)) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - return decompressed_path def _init_from_path( self, @@ -173,10 +78,10 @@ class TTSServerExecutor(TTSExecutor): am_res_path = self._get_pretrained_path(am_tag) self.am_res_path = am_res_path self.am_ckpt = os.path.join( - am_res_path, pretrained_models[am_tag]['ckpt'][0]) + am_res_path, self.pretrained_models[am_tag]['ckpt'][0]) # must have phones_dict in acoustic self.phones_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['phones_dict']) + am_res_path, self.pretrained_models[am_tag]['phones_dict']) else: self.am_ckpt = os.path.abspath(am_ckpt[0]) @@ -192,16 +97,16 @@ class TTSServerExecutor(TTSExecutor): am_res_path = self._get_pretrained_path(am_tag) self.am_res_path = am_res_path self.am_encoder_infer = os.path.join( - am_res_path, pretrained_models[am_tag]['ckpt'][0]) + am_res_path, 
self.pretrained_models[am_tag]['ckpt'][0]) self.am_decoder = os.path.join( - am_res_path, pretrained_models[am_tag]['ckpt'][1]) + am_res_path, self.pretrained_models[am_tag]['ckpt'][1]) self.am_postnet = os.path.join( - am_res_path, pretrained_models[am_tag]['ckpt'][2]) + am_res_path, self.pretrained_models[am_tag]['ckpt'][2]) # must have phones_dict in acoustic self.phones_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['phones_dict']) + am_res_path, self.pretrained_models[am_tag]['phones_dict']) self.am_stat = os.path.join( - am_res_path, pretrained_models[am_tag]['speech_stats']) + am_res_path, self.pretrained_models[am_tag]['speech_stats']) else: self.am_encoder_infer = os.path.abspath(am_ckpt[0]) @@ -229,8 +134,8 @@ class TTSServerExecutor(TTSExecutor): if voc_ckpt is None: voc_res_path = self._get_pretrained_path(voc_tag) self.voc_res_path = voc_res_path - self.voc_ckpt = os.path.join(voc_res_path, - pretrained_models[voc_tag]['ckpt']) + self.voc_ckpt = os.path.join( + voc_res_path, self.pretrained_models[voc_tag]['ckpt']) else: self.voc_ckpt = os.path.abspath(voc_ckpt) self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_ckpt)) @@ -283,7 +188,6 @@ class TTSServerExecutor(TTSExecutor): """ Model inference and result stored in self.output. 
""" - #import pdb;pdb.set_trace() am_block = self.am_block am_pad = self.am_pad @@ -453,10 +357,21 @@ class TTSEngine(BaseEngine): self.config.am_block, self.config.am_pad, self.config.voc_block, self.config.voc_pad, self.config.voc_upsample) - if "cpu" in self.config.am_sess_conf.device or "cpu" in self.config.voc_sess_conf.device: - paddle.set_device("cpu") - else: - paddle.set_device(self.config.am_sess_conf.device) + try: + if self.config.am_sess_conf.device is not None: + self.device = self.config.am_sess_conf.device + elif self.config.voc_sess_conf.device is not None: + self.device = self.config.voc_sess_conf.device + else: + self.device = paddle.get_device() + paddle.set_device(self.device) + except BaseException as e: + logger.error( + "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" + ) + logger.error("Initialize TTS server engine Failed on device: %s." % + (self.device)) + return False try: self.executor._init_from_path( @@ -480,16 +395,17 @@ class TTSEngine(BaseEngine): (self.config.voc_sess_conf.device)) return False - logger.info("Initialize TTS server engine successfully on device: %s." % - (self.config.voc_sess_conf.device)) - # warm up try: self.warm_up() + logger.info("Warm up successfully.") except Exception as e: logger.error("Failed to warm up on tts engine.") return False + logger.info("Initialize TTS server engine successfully on device: %s." % + (self.config.voc_sess_conf.device)) + return True def warm_up(self): @@ -499,9 +415,7 @@ class TTSEngine(BaseEngine): sentence = "您好,欢迎使用语音合成服务。" if self.config.lang == 'en': sentence = "Hello and welcome to the speech synthesis service." 
- logger.info( - "*******************************warm up ********************************" - ) + logger.info("Start to warm up.") for i in range(3): for wav in self.executor.infer( text=sentence, @@ -512,9 +426,6 @@ class TTSEngine(BaseEngine): f"The first response time of the {i} warm up: {self.executor.first_response_time} s" ) break - logger.info( - "**********************************************************************" - ) def preprocess(self, text_bese64: str=None, text_bytes: bytes=None): # Convert byte to text diff --git a/paddlespeech/server/engine/tts/online/python/pretrained_models.py b/paddlespeech/server/engine/tts/online/python/pretrained_models.py new file mode 100644 index 00000000..bf6aded5 --- /dev/null +++ b/paddlespeech/server/engine/tts/online/python/pretrained_models.py @@ -0,0 +1,73 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# support online model +pretrained_models = { + # fastspeech2 + "fastspeech2_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip', + 'md5': + '637d28a5e53aa60275612ba4393d5f22', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_76000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + "fastspeech2_cnndecoder_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip', + 'md5': + '6eb28e22ace73e0ebe7845f86478f89f', + 'config': + 'cnndecoder.yaml', + 'ckpt': + 'snapshot_iter_153000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + + # mb_melgan + "mb_melgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'ee5f0604e20091f0d495b6ec4618b90d', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_1000000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + + # hifigan + "hifigan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'dd40a3d88dfcf64513fba2f0f961ada6', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, +} diff --git a/paddlespeech/server/engine/tts/online/python/tts_engine.py b/paddlespeech/server/engine/tts/online/python/tts_engine.py index 1f51586b..1fca5283 100644 --- a/paddlespeech/server/engine/tts/online/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/python/tts_engine.py @@ -22,10 +22,9 @@ import paddle import yaml from yacs.config import CfgNode +from .pretrained_models import pretrained_models from paddlespeech.cli.log import logger from paddlespeech.cli.tts.infer import TTSExecutor -from paddlespeech.cli.utils import download_and_decompress -from 
paddlespeech.cli.utils import MODEL_HOME from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import float2pcm @@ -37,87 +36,6 @@ from paddlespeech.t2s.modules.normalizer import ZScore __all__ = ['TTSEngine'] -# support online model -pretrained_models = { - # fastspeech2 - "fastspeech2_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip', - 'md5': - '637d28a5e53aa60275612ba4393d5f22', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_76000.pdz', - 'speech_stats': - 'speech_stats.npy', - 'phones_dict': - 'phone_id_map.txt', - }, - "fastspeech2_cnndecoder_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip', - 'md5': - '6eb28e22ace73e0ebe7845f86478f89f', - 'config': - 'cnndecoder.yaml', - 'ckpt': - 'snapshot_iter_153000.pdz', - 'speech_stats': - 'speech_stats.npy', - 'phones_dict': - 'phone_id_map.txt', - }, - - # mb_melgan - "mb_melgan_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip', - 'md5': - 'ee5f0604e20091f0d495b6ec4618b90d', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_1000000.pdz', - 'speech_stats': - 'feats_stats.npy', - }, - - # hifigan - "hifigan_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip', - 'md5': - 'dd40a3d88dfcf64513fba2f0f961ada6', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_2500000.pdz', - 'speech_stats': - 'feats_stats.npy', - }, -} - -model_alias = { - # acoustic model - "fastspeech2": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2", - "fastspeech2_inference": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", - - # voc - "mb_melgan": - 
"paddlespeech.t2s.models.melgan:MelGANGenerator", - "mb_melgan_inference": - "paddlespeech.t2s.models.melgan:MelGANInference", - "hifigan": - "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", - "hifigan_inference": - "paddlespeech.t2s.models.hifigan:HiFiGANInference", -} - -__all__ = ['TTSEngine'] - class TTSServerExecutor(TTSExecutor): def __init__(self, am_block, am_pad, voc_block, voc_pad): @@ -126,6 +44,7 @@ class TTSServerExecutor(TTSExecutor): self.am_pad = am_pad self.voc_block = voc_block self.voc_pad = voc_pad + self.pretrained_models = pretrained_models def get_model_info(self, field: str, @@ -146,7 +65,7 @@ class TTSServerExecutor(TTSExecutor): [Tensor]: standard deviation """ - model_class = dynamic_import(model_name, model_alias) + model_class = dynamic_import(model_name, self.model_alias) if field == "am": odim = self.am_config.n_mels @@ -169,22 +88,6 @@ class TTSServerExecutor(TTSExecutor): return model, model_mu, model_std - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. 
- """ - support_models = list(pretrained_models.keys()) - assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( - tag, '\n\t\t'.join(support_models)) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - return decompressed_path - def _init_from_path( self, am: str='fastspeech2_csmsc', @@ -210,15 +113,15 @@ class TTSServerExecutor(TTSExecutor): if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: am_res_path = self._get_pretrained_path(am_tag) self.am_res_path = am_res_path - self.am_config = os.path.join(am_res_path, - pretrained_models[am_tag]['config']) + self.am_config = os.path.join( + am_res_path, self.pretrained_models[am_tag]['config']) self.am_ckpt = os.path.join(am_res_path, - pretrained_models[am_tag]['ckpt']) + self.pretrained_models[am_tag]['ckpt']) self.am_stat = os.path.join( - am_res_path, pretrained_models[am_tag]['speech_stats']) + am_res_path, self.pretrained_models[am_tag]['speech_stats']) # must have phones_dict in acoustic self.phones_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['phones_dict']) + am_res_path, self.pretrained_models[am_tag]['phones_dict']) print("self.phones_dict:", self.phones_dict) logger.info(am_res_path) logger.info(self.am_config) @@ -239,12 +142,12 @@ class TTSServerExecutor(TTSExecutor): if voc_ckpt is None or voc_config is None or voc_stat is None: voc_res_path = self._get_pretrained_path(voc_tag) self.voc_res_path = voc_res_path - self.voc_config = os.path.join(voc_res_path, - pretrained_models[voc_tag]['config']) - self.voc_ckpt = os.path.join(voc_res_path, - pretrained_models[voc_tag]['ckpt']) + self.voc_config = os.path.join( + voc_res_path, 
self.pretrained_models[voc_tag]['config']) + self.voc_ckpt = os.path.join( + voc_res_path, self.pretrained_models[voc_tag]['ckpt']) self.voc_stat = os.path.join( - voc_res_path, pretrained_models[voc_tag]['speech_stats']) + voc_res_path, self.pretrained_models[voc_tag]['speech_stats']) logger.info(voc_res_path) logger.info(self.voc_config) logger.info(self.voc_ckpt) @@ -286,7 +189,7 @@ class TTSServerExecutor(TTSExecutor): self.am_ckpt, self.am_stat) am_normalizer = ZScore(am_mu, am_std) am_inference_class = dynamic_import(self.am_name + '_inference', - model_alias) + self.model_alias) self.am_inference = am_inference_class(am_normalizer, am) self.am_inference.eval() print("acoustic model done!") @@ -297,7 +200,7 @@ class TTSServerExecutor(TTSExecutor): self.voc_ckpt, self.voc_stat) voc_normalizer = ZScore(voc_mu, voc_std) voc_inference_class = dynamic_import(self.voc_name + '_inference', - model_alias) + self.model_alias) self.voc_inference = voc_inference_class(voc_normalizer, voc) self.voc_inference.eval() print("voc done!") @@ -477,7 +380,7 @@ class TTSEngine(BaseEngine): ), "Please set correct voc_block and voc_pad, they should be more than 0." try: - if self.config.device: + if self.config.device is not None: self.device = self.config.device else: self.device = paddle.get_device() @@ -513,16 +416,16 @@ class TTSEngine(BaseEngine): (self.device)) return False - logger.info("Initialize TTS server engine successfully on device: %s." % - (self.device)) - # warm up try: self.warm_up() + logger.info("Warm up successfully.") except Exception as e: logger.error("Failed to warm up on tts engine.") return False + logger.info("Initialize TTS server engine successfully on device: %s." % + (self.device)) return True def warm_up(self): @@ -532,9 +435,7 @@ class TTSEngine(BaseEngine): sentence = "您好,欢迎使用语音合成服务。" if self.config.lang == 'en': sentence = "Hello and welcome to the speech synthesis service." 
- logger.info( - "*******************************warm up ********************************" - ) + logger.info("Start to warm up.") for i in range(3): for wav in self.executor.infer( text=sentence, @@ -545,9 +446,6 @@ class TTSEngine(BaseEngine): f"The first response time of the {i} warm up: {self.executor.first_response_time} s" ) break - logger.info( - "**********************************************************************" - ) def preprocess(self, text_bese64: str=None, text_bytes: bytes=None): # Convert byte to text diff --git a/paddlespeech/server/engine/tts/paddleinference/pretrained_models.py b/paddlespeech/server/engine/tts/paddleinference/pretrained_models.py new file mode 100644 index 00000000..9618a7a6 --- /dev/null +++ b/paddlespeech/server/engine/tts/paddleinference/pretrained_models.py @@ -0,0 +1,87 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Static model applied on paddle inference +pretrained_models = { + # speedyspeech + "speedyspeech_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip', + 'md5': + 'f10cbdedf47dc7a9668d2264494e1823', + 'model': + 'speedyspeech_csmsc.pdmodel', + 'params': + 'speedyspeech_csmsc.pdiparams', + 'phones_dict': + 'phone_id_map.txt', + 'tones_dict': + 'tone_id_map.txt', + 'sample_rate': + 24000, + }, + # fastspeech2 + "fastspeech2_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip', + 'md5': + '9788cd9745e14c7a5d12d32670b2a5a7', + 'model': + 'fastspeech2_csmsc.pdmodel', + 'params': + 'fastspeech2_csmsc.pdiparams', + 'phones_dict': + 'phone_id_map.txt', + 'sample_rate': + 24000, + }, + # pwgan + "pwgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip', + 'md5': + 'e3504aed9c5a290be12d1347836d2742', + 'model': + 'pwgan_csmsc.pdmodel', + 'params': + 'pwgan_csmsc.pdiparams', + 'sample_rate': + 24000, + }, + # mb_melgan + "mb_melgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip', + 'md5': + 'ac6eee94ba483421d750433f4c3b8d36', + 'model': + 'mb_melgan_csmsc.pdmodel', + 'params': + 'mb_melgan_csmsc.pdiparams', + 'sample_rate': + 24000, + }, + # hifigan + "hifigan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip', + 'md5': + '7edd8c436b3a5546b3a7cb8cff9d5a0c', + 'model': + 'hifigan_csmsc.pdmodel', + 'params': + 'hifigan_csmsc.pdiparams', + 'sample_rate': + 24000, + }, +} diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py index db8813ba..f1ce8b76 100644 --- 
a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py +++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -23,10 +23,9 @@ import paddle import soundfile as sf from scipy.io import wavfile +from .pretrained_models import pretrained_models from paddlespeech.cli.log import logger from paddlespeech.cli.tts.infer import TTSExecutor -from paddlespeech.cli.utils import download_and_decompress -from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import change_speed from paddlespeech.server.utils.errors import ErrorCode @@ -38,101 +37,11 @@ from paddlespeech.t2s.frontend.zh_frontend import Frontend __all__ = ['TTSEngine'] -# Static model applied on paddle inference -pretrained_models = { - # speedyspeech - "speedyspeech_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip', - 'md5': - 'f10cbdedf47dc7a9668d2264494e1823', - 'model': - 'speedyspeech_csmsc.pdmodel', - 'params': - 'speedyspeech_csmsc.pdiparams', - 'phones_dict': - 'phone_id_map.txt', - 'tones_dict': - 'tone_id_map.txt', - 'sample_rate': - 24000, - }, - # fastspeech2 - "fastspeech2_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip', - 'md5': - '9788cd9745e14c7a5d12d32670b2a5a7', - 'model': - 'fastspeech2_csmsc.pdmodel', - 'params': - 'fastspeech2_csmsc.pdiparams', - 'phones_dict': - 'phone_id_map.txt', - 'sample_rate': - 24000, - }, - # pwgan - "pwgan_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip', - 'md5': - 'e3504aed9c5a290be12d1347836d2742', - 'model': - 'pwgan_csmsc.pdmodel', - 'params': - 'pwgan_csmsc.pdiparams', - 'sample_rate': - 24000, - }, - # mb_melgan - "mb_melgan_csmsc-zh": { - 'url': - 
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip', - 'md5': - 'ac6eee94ba483421d750433f4c3b8d36', - 'model': - 'mb_melgan_csmsc.pdmodel', - 'params': - 'mb_melgan_csmsc.pdiparams', - 'sample_rate': - 24000, - }, - # hifigan - "hifigan_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip', - 'md5': - '7edd8c436b3a5546b3a7cb8cff9d5a0c', - 'model': - 'hifigan_csmsc.pdmodel', - 'params': - 'hifigan_csmsc.pdiparams', - 'sample_rate': - 24000, - }, -} - class TTSServerExecutor(TTSExecutor): def __init__(self): super().__init__() - pass - - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. - """ - assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format( - tag) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - return decompressed_path + self.pretrained_models = pretrained_models def _init_from_path( self, @@ -161,14 +70,14 @@ class TTSServerExecutor(TTSExecutor): if am_model is None or am_params is None or phones_dict is None: am_res_path = self._get_pretrained_path(am_tag) self.am_res_path = am_res_path - self.am_model = os.path.join(am_res_path, - pretrained_models[am_tag]['model']) - self.am_params = os.path.join(am_res_path, - pretrained_models[am_tag]['params']) + self.am_model = os.path.join( + am_res_path, self.pretrained_models[am_tag]['model']) + self.am_params = os.path.join( + am_res_path, self.pretrained_models[am_tag]['params']) # must have phones_dict in acoustic self.phones_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['phones_dict']) - self.am_sample_rate = pretrained_models[am_tag]['sample_rate'] + 
am_res_path, self.pretrained_models[am_tag]['phones_dict']) + self.am_sample_rate = self.pretrained_models[am_tag]['sample_rate'] logger.info(am_res_path) logger.info(self.am_model) @@ -183,17 +92,17 @@ class TTSServerExecutor(TTSExecutor): # for speedyspeech self.tones_dict = None - if 'tones_dict' in pretrained_models[am_tag]: + if 'tones_dict' in self.pretrained_models[am_tag]: self.tones_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['tones_dict']) + am_res_path, self.pretrained_models[am_tag]['tones_dict']) if tones_dict: self.tones_dict = tones_dict # for multi speaker fastspeech2 self.speaker_dict = None - if 'speaker_dict' in pretrained_models[am_tag]: + if 'speaker_dict' in self.pretrained_models[am_tag]: self.speaker_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['speaker_dict']) + am_res_path, self.pretrained_models[am_tag]['speaker_dict']) if speaker_dict: self.speaker_dict = speaker_dict @@ -202,11 +111,12 @@ class TTSServerExecutor(TTSExecutor): if voc_model is None or voc_params is None: voc_res_path = self._get_pretrained_path(voc_tag) self.voc_res_path = voc_res_path - self.voc_model = os.path.join(voc_res_path, - pretrained_models[voc_tag]['model']) - self.voc_params = os.path.join(voc_res_path, - pretrained_models[voc_tag]['params']) - self.voc_sample_rate = pretrained_models[voc_tag]['sample_rate'] + self.voc_model = os.path.join( + voc_res_path, self.pretrained_models[voc_tag]['model']) + self.voc_params = os.path.join( + voc_res_path, self.pretrained_models[voc_tag]['params']) + self.voc_sample_rate = self.pretrained_models[voc_tag][ + 'sample_rate'] logger.info(voc_res_path) logger.info(self.voc_model) logger.info(self.voc_params) @@ -352,8 +262,24 @@ class TTSEngine(BaseEngine): def init(self, config: dict) -> bool: self.executor = TTSServerExecutor() - self.config = config + + try: + if self.config.am_predictor_conf.device is not None: + self.device = self.config.am_predictor_conf.device + elif 
self.config.voc_predictor_conf.device is not None: + self.device = self.config.voc_predictor_conf.device + else: + self.device = paddle.get_device() + paddle.set_device(self.device) + except BaseException as e: + logger.error( + "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" + ) + logger.error("Initialize TTS server engine Failed on device: %s." % + (self.device)) + return False + self.executor._init_from_path( am=self.config.am, am_model=self.config.am_model, @@ -370,9 +296,35 @@ class TTSEngine(BaseEngine): am_predictor_conf=self.config.am_predictor_conf, voc_predictor_conf=self.config.voc_predictor_conf, ) + # warm up + try: + self.warm_up() + logger.info("Warm up successfully.") + except Exception as e: + logger.error("Failed to warm up on tts engine.") + return False + logger.info("Initialize TTS server engine successfully.") return True + def warm_up(self): + """warm up + """ + if self.config.lang == 'zh': + sentence = "您好,欢迎使用语音合成服务。" + if self.config.lang == 'en': + sentence = "Hello and welcome to the speech synthesis service." 
+ logger.info("Start to warm up.") + for i in range(3): + st = time.time() + self.executor.infer( + text=sentence, + lang=self.config.lang, + am=self.config.am, + spk_id=0, ) + logger.info( + f"The response time of the {i} warm up: {time.time() - st} s") + def postprocess(self, wav, original_fs: int, diff --git a/paddlespeech/server/engine/tts/python/tts_engine.py b/paddlespeech/server/engine/tts/python/tts_engine.py index f153f60b..d0002baa 100644 --- a/paddlespeech/server/engine/tts/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/python/tts_engine.py @@ -51,15 +51,15 @@ class TTSEngine(BaseEngine): def init(self, config: dict) -> bool: self.executor = TTSServerExecutor() + self.config = config try: - self.config = config - if self.config.device: + if self.config.device is not None: self.device = self.config.device else: self.device = paddle.get_device() paddle.set_device(self.device) - except BaseException: + except BaseException as e: logger.error( "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" ) @@ -87,10 +87,36 @@ class TTSEngine(BaseEngine): (self.device)) return False + # warm up + try: + self.warm_up() + logger.info("Warm up successfully.") + except Exception as e: + logger.error("Failed to warm up on tts engine.") + return False + logger.info("Initialize TTS server engine successfully on device: %s." % (self.device)) return True + def warm_up(self): + """warm up + """ + if self.config.lang == 'zh': + sentence = "您好,欢迎使用语音合成服务。" + if self.config.lang == 'en': + sentence = "Hello and welcome to the speech synthesis service." 
+ logger.info("Start to warm up.") + for i in range(3): + st = time.time() + self.executor.infer( + text=sentence, + lang=self.config.lang, + am=self.config.am, + spk_id=0, ) + logger.info( + f"The response time of the {i} warm up: {time.time() - st} s") + def postprocess(self, wav, original_fs: int, diff --git a/paddlespeech/server/restful/tts_api.py b/paddlespeech/server/restful/tts_api.py index d1268428..15d618d9 100644 --- a/paddlespeech/server/restful/tts_api.py +++ b/paddlespeech/server/restful/tts_api.py @@ -128,7 +128,7 @@ def tts(request_body: TTSRequest): return response -@router.post("/paddlespeech/streaming/tts") +@router.post("/paddlespeech/tts/streaming") async def stream_tts(request_body: TTSRequest): text = request_body.text diff --git a/paddlespeech/server/tests/tts/online/http_client.py b/paddlespeech/server/tests/tts/online/http_client.py index 756f7b5b..47b781ed 100644 --- a/paddlespeech/server/tests/tts/online/http_client.py +++ b/paddlespeech/server/tests/tts/online/http_client.py @@ -14,6 +14,7 @@ import argparse from paddlespeech.server.utils.audio_handler import TTSHttpHandler +from paddlespeech.server.utils.util import compute_delay if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -43,5 +44,25 @@ if __name__ == "__main__": print("tts http client start") handler = TTSHttpHandler(args.server, args.port, args.play) - handler.run(args.text, args.spk_id, args.speed, args.volume, - args.sample_rate, args.output) + first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run( + args.text, args.spk_id, args.speed, args.volume, args.sample_rate, + args.output) + delay_time_list = compute_delay(receive_time_list, chunk_duration_list) + + print(f"sentence: {args.text}") + print(f"duration: {duration} s") + print(f"first response: {first_response} s") + print(f"final response: {final_response} s") + print(f"RTF: {final_response/duration}") + if args.output is not None: + if 
save_audio_success: + print(f"Audio successfully saved in {args.output}") + else: + print("Audio save failed.") + + if delay_time_list != []: + print( + f"Delay situation: total number of packages: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate:{len(delay_time_list)/len(receive_time_list)}" + ) + else: + print("The sentence has no delay in streaming synthesis.") diff --git a/paddlespeech/server/tests/tts/online/ws_client.py b/paddlespeech/server/tests/tts/online/ws_client.py index 821d82a9..0b1794c8 100644 --- a/paddlespeech/server/tests/tts/online/ws_client.py +++ b/paddlespeech/server/tests/tts/online/ws_client.py @@ -15,6 +15,7 @@ import argparse import asyncio from paddlespeech.server.utils.audio_handler import TTSWsHandler +from paddlespeech.server.utils.util import compute_delay if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -35,4 +36,24 @@ if __name__ == "__main__": print("tts websocket client start") handler = TTSWsHandler(args.server, args.port, args.play) loop = asyncio.get_event_loop() - loop.run_until_complete(handler.run(args.text, args.output)) + first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete( + handler.run(args.text, args.output)) + delay_time_list = compute_delay(receive_time_list, chunk_duration_list) + + print(f"sentence: {args.text}") + print(f"duration: {duration} s") + print(f"first response: {first_response} s") + print(f"final response: {final_response} s") + print(f"RTF: {final_response/duration}") + if args.output is not None: + if save_audio_success: + print(f"Audio successfully saved in {args.output}") + else: + print("Audio save failed.") + + if delay_time_list != []: + print( + f"Delay situation: total number of packages: 
{len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate:{len(delay_time_list)/len(receive_time_list)}" + ) + else: + print("The sentence has no delay in streaming synthesis.") diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index a088929f..75f4a10b 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -259,7 +259,8 @@ class TTSWsHandler: """ self.server = server self.port = port - self.url = "ws://" + self.server + ":" + str(self.port) + "/ws/tts" + self.url = "ws://" + self.server + ":" + str( + self.port) + "/paddlespeech/tts/streaming" self.play = play if self.play: import pyaudio @@ -295,6 +296,8 @@ class TTSWsHandler: output (str): save audio path """ all_bytes = b'' + receive_time_list = [] + chunk_duration_list = [] # 1. Send websocket handshake protocal async with websockets.connect(self.url) as ws: @@ -309,14 +312,15 @@ class TTSWsHandler: # 3. 
Process the received response message = await ws.recv() - logger.info(f"句子:{text}") - logger.info(f"首包响应:{time.time() - st} s") + first_response = time.time() - st message = json.loads(message) status = message["status"] while (status == 1): + receive_time_list.append(time.time()) audio = message["audio"] audio = base64.b64decode(audio) # bytes + chunk_duration_list.append(len(audio) / 2.0 / 24000) all_bytes += audio if self.play: self.mutex.acquire() @@ -334,15 +338,11 @@ class TTSWsHandler: if status == 2: final_response = time.time() - st duration = len(all_bytes) / 2.0 / 24000 - logger.info(f"尾包响应:{final_response} s") - logger.info(f"音频时长:{duration} s") - logger.info(f"RTF: {final_response / duration}") if output is not None: - if save_audio(all_bytes, output): - logger.info(f"音频保存至:{output}") - else: - logger.error("save audio error") + save_audio_success = save_audio(all_bytes, output) + else: + save_audio_success = False else: logger.error("infer error") @@ -352,6 +352,8 @@ class TTSWsHandler: self.stream.close() self.p.terminate() + return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list + class TTSHttpHandler: def __init__(self, server="127.0.0.1", port=8092, play: bool=False): @@ -365,7 +367,7 @@ class TTSHttpHandler: self.server = server self.port = port self.url = "http://" + str(self.server) + ":" + str( - self.port) + "/paddlespeech/streaming/tts" + self.port) + "/paddlespeech/tts/streaming" self.play = play if self.play: @@ -423,13 +425,16 @@ class TTSHttpHandler: all_bytes = b'' first_flag = 1 + receive_time_list = [] + chunk_duration_list = [] # 2. Send request st = time.time() html = requests.post(self.url, json.dumps(params), stream=True) # 3. 
Process the received response - for chunk in html.iter_content(chunk_size=1024): + for chunk in html.iter_content(chunk_size=None): + receive_time_list.append(time.time()) audio = base64.b64decode(chunk) # bytes if first_flag: first_response = time.time() - st @@ -443,21 +448,15 @@ class TTSHttpHandler: self.t.start() self.start_play = False all_bytes += audio + chunk_duration_list.append(len(audio) / 2.0 / 24000) final_response = time.time() - st duration = len(all_bytes) / 2.0 / 24000 - logger.info(f"句子:{text}") - logger.info(f"首包响应:{first_response} s") - logger.info(f"尾包响应:{final_response} s") - logger.info(f"音频时长:{duration} s") - logger.info(f"RTF: {final_response / duration}") - if output is not None: - if save_audio(all_bytes, output): - logger.info(f"音频保存至:{output}") - else: - logger.error("save audio error") + save_audio_success = save_audio(all_bytes, output) + else: + save_audio_success = False if self.play: self.t.join() @@ -465,6 +464,8 @@ class TTSHttpHandler: self.stream.close() self.p.terminate() + return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list + class VectorHttpHandler: def __init__(self, server_ip=None, port=None): diff --git a/paddlespeech/server/utils/util.py b/paddlespeech/server/utils/util.py index 72ee0060..061b213c 100644 --- a/paddlespeech/server/utils/util.py +++ b/paddlespeech/server/utils/util.py @@ -75,3 +75,74 @@ def get_chunks(data, block_size, pad_size, step): else: print("Please set correct type to get chunks, am or voc") return chunks + + +def compute_delay(receive_time_list, chunk_duration_list): + """compute delay + Args: + receive_time_list (list): Time to receive each packet + chunk_duration_list (list): The audio duration corresponding to each packet + Returns: + [list]: Delay time list + """ + assert (len(receive_time_list) == len(chunk_duration_list)) + delay_time_list = [] + play_time = receive_time_list[0] + chunk_duration_list[0] + for i in range(1, 
len(receive_time_list)): + receive_time = receive_time_list[i] + delay_time = receive_time - play_time + # 有延迟 + if delay_time > 0: + play_time = play_time + delay_time + chunk_duration_list[i] + delay_time_list.append(delay_time) + # 没有延迟 + else: + play_time = play_time + chunk_duration_list[i] + + return delay_time_list + + +def count_engine(logfile: str="./nohup.out"): + """For inference on the statistical engine side + Args: + logfile (str, optional): server log. Defaults to "./nohup.out". + """ + first_response_list = [] + final_response_list = [] + duration_list = [] + + with open(logfile, "r") as f: + for line in f.readlines(): + if "- first response time:" in line: + first_response = float(line.splie(" ")[-2]) + first_response_list.append(first_response) + elif "- final response time:" in line: + final_response = float(line.splie(" ")[-2]) + final_response_list.append(final_response) + elif "- The durations of audio is:" in line: + duration = float(line.splie(" ")[-2]) + duration_list.append(duration) + + assert (len(first_response_list) == len(final_response_list) and + len(final_response_list) == len(duration_list)) + + avg_first_response = sum(first_response_list) / len(first_response_list) + avg_final_response = sum(final_response_list) / len(final_response_list) + avg_duration = sum(duration_list) / len(duration_list) + RTF = sum(final_response_list) / sum(duration_list) + + print( + "************************* engine result ***************************************" + ) + print( + f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}" + ) + print( + f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s" + ) + print( + f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s" + ) + print( + f"max final response: {max(final_response_list)} s, min final response: 
{min(final_response_list)} s" + ) diff --git a/paddlespeech/server/ws/tts_socket.py b/paddlespeech/server/ws/tts_socket.py index 699ee412..482aeb79 100644 --- a/paddlespeech/server/ws/tts_socket.py +++ b/paddlespeech/server/ws/tts_socket.py @@ -24,7 +24,7 @@ from paddlespeech.server.engine.engine_pool import get_engine_pool router = APIRouter() -@router.websocket('/ws/tts') +@router.websocket('/paddlespeech/tts/streaming') async def websocket_endpoint(websocket: WebSocket): await websocket.accept() diff --git a/tests/unit/server/offline/change_yaml.py b/tests/unit/server/offline/change_yaml.py index cdeaebdb..d51a6259 100644 --- a/tests/unit/server/offline/change_yaml.py +++ b/tests/unit/server/offline/change_yaml.py @@ -19,7 +19,7 @@ def change_device(yamlfile: str, engine: str, device: str): if device == 'cpu': set_device = 'cpu' elif device == 'gpu': - set_device = 'gpu:0' + set_device = 'gpu:3' else: print("Please set correct device: cpu or gpu.") diff --git a/tests/unit/server/offline/conf/application.yaml b/tests/unit/server/offline/conf/application.yaml index 2b1a0599..762f4af6 100644 --- a/tests/unit/server/offline/conf/application.yaml +++ b/tests/unit/server/offline/conf/application.yaml @@ -1,4 +1,4 @@ -# This is the parameter configuration file for PaddleSpeech Serving. +# This is the parameter configuration file for PaddleSpeech Offline Serving. 
################################################################################# # SERVER SETTING # @@ -7,8 +7,8 @@ host: 127.0.0.1 port: 8090 # The task format in the engin_list is: _ -# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference'] - +# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference'] +protocol: 'http' engine_list: ['asr_python', 'tts_python', 'cls_python'] diff --git a/tests/unit/server/offline/test_server_client.sh b/tests/unit/server/offline/test_server_client.sh index e7ae7604..875008a7 100644 --- a/tests/unit/server/offline/test_server_client.sh +++ b/tests/unit/server/offline/test_server_client.sh @@ -21,6 +21,8 @@ StartService(){ } ClientTest(){ + echo "aaaaaaaaaaaaaaaaaaaaaaaaaa $server_ip" + echo "aaaaaaaaaaaaaaaaaaaaaaaaaa $port" # Client test # test asr client paddlespeech_client asr --server_ip $server_ip --port $port --input ./zh.wav @@ -39,6 +41,7 @@ ClientTest(){ ((test_times+=1)) paddlespeech_client cls --server_ip $server_ip --port $port --input ./zh.wav ((test_times+=1)) + } GetTestResult() { @@ -58,6 +61,7 @@ rm -rf log/server.log.wf rm -rf log/server.log rm -rf log/test_result.log +cp ../../../../demos/speech_server/conf/application.yaml ./conf/ config_file=./conf/application.yaml server_ip=$(cat $config_file | grep "host" | awk -F " " '{print $2}') port=$(cat $config_file | grep "port" | awk '/port:/ {print $2}') @@ -191,5 +195,4 @@ echo "***************** Here are all the test results ********************" cat ./log/test_result.log # Restoring conf is the same as demos/speech_server -rm -rf ./conf -cp ../../../demos/speech_server/conf/ ./ -rf \ No newline at end of file +cp ../../../../demos/speech_server/conf/application.yaml ./conf/ diff --git a/tests/unit/server/online/tts/check_server/conf/application.yaml b/tests/unit/server/online/tts/check_server/conf/application.yaml index 26cd325b..dd1a7e19 100644 --- 
a/tests/unit/server/online/tts/check_server/conf/application.yaml +++ b/tests/unit/server/online/tts/check_server/conf/application.yaml @@ -39,9 +39,9 @@ tts_online: # others lang: 'zh' device: 'cpu' # set 'gpu:id' or 'cpu' - am_block: 42 + am_block: 72 am_pad: 12 - voc_block: 14 + voc_block: 36 voc_pad: 14 @@ -80,9 +80,9 @@ tts_online-onnx: # others lang: 'zh' - am_block: 42 + am_block: 72 am_pad: 12 - voc_block: 14 + voc_block: 36 voc_pad: 14 voc_upsample: 300 diff --git a/tests/unit/server/online/tts/check_server/test_all.sh b/tests/unit/server/online/tts/check_server/test_all.sh index b2ea6b44..94129860 100644 --- a/tests/unit/server/online/tts/check_server/test_all.sh +++ b/tests/unit/server/online/tts/check_server/test_all.sh @@ -10,7 +10,6 @@ bash test.sh tts_online $log_all_dir/log_tts_online_cpu python change_yaml.py --change_type engine_type --target_key engine_list --target_value tts_online-onnx bash test.sh tts_online-onnx $log_all_dir/log_tts_online-onnx_cpu - python change_yaml.py --change_type device --target_key device --target_value gpu:3 bash test.sh tts_online $log_all_dir/log_tts_online_gpu diff --git a/tests/unit/server/online/tts/check_server/tts_online_application.yaml b/tests/unit/server/online/tts/check_server/tts_online_application.yaml index 26cd325b..dd1a7e19 100644 --- a/tests/unit/server/online/tts/check_server/tts_online_application.yaml +++ b/tests/unit/server/online/tts/check_server/tts_online_application.yaml @@ -39,9 +39,9 @@ tts_online: # others lang: 'zh' device: 'cpu' # set 'gpu:id' or 'cpu' - am_block: 42 + am_block: 72 am_pad: 12 - voc_block: 14 + voc_block: 36 voc_pad: 14 @@ -80,9 +80,9 @@ tts_online-onnx: # others lang: 'zh' - am_block: 42 + am_block: 72 am_pad: 12 - voc_block: 14 + voc_block: 36 voc_pad: 14 voc_upsample: 300 diff --git a/tests/unit/server/online/tts/test_server/test_http_client.py b/tests/unit/server/online/tts/test_server/test_http_client.py index 96372ab3..7fdb4e00 100644 --- 
a/tests/unit/server/online/tts/test_server/test_http_client.py +++ b/tests/unit/server/online/tts/test_server/test_http_client.py @@ -12,117 +12,35 @@ # See the License for the specific language governing permissions and # limitations under the License. import argparse -import base64 -import json +import asyncio import os -import time -import requests - -from paddlespeech.server.utils.audio_process import pcm2wav +from paddlespeech.server.utils.util import compute_delay from paddlespeech.t2s.exps.syn_utils import get_sentences -def save_audio(buffer, audio_path) -> bool: - if audio_path.endswith("pcm"): - with open(audio_path, "wb") as f: - f.write(buffer) - elif audio_path.endswith("wav"): - with open("./tmp.pcm", "wb") as f: - f.write(buffer) - pcm2wav("./tmp.pcm", audio_path, channels=1, bits=16, sample_rate=24000) - os.system("rm ./tmp.pcm") - else: - print("Only supports saved audio format is pcm or wav") - return False - - return True - - def test(args, text, utt_id): - params = { - "text": text, - "spk_id": args.spk_id, - "speed": args.speed, - "volume": args.volume, - "sample_rate": args.sample_rate, - "save_path": '' - } - - buffer = b'' - flag = 1 - url = "http://" + str(args.server) + ":" + str( - args.port) + "/paddlespeech/streaming/tts" - st = time.time() - html = requests.post(url, json.dumps(params), stream=True) - for chunk in html.iter_content(chunk_size=1024): - chunk = base64.b64decode(chunk) # bytes - if flag: - first_response = time.time() - st - print(f"首包响应:{first_response} s") - flag = 0 - buffer += chunk - - final_response = time.time() - st - duration = len(buffer) / 2.0 / 24000 - - print(f"sentence: {text}") - print(f"尾包响应:{final_response} s") - print(f"音频时长:{duration} s") - print(f"RTF: {final_response / duration}") - - save_path = str(args.output_dir + "/" + utt_id + ".wav") - save_audio(buffer, save_path) - print("音频保存至:", save_path) - - return first_response, final_response, duration - - -def count_engine(logfile: str="./nohup.out"): 
- """For inference on the statistical engine side - - Args: - logfile (str, optional): server log. Defaults to "./nohup.out". - """ - first_response_list = [] - final_response_list = [] - duration_list = [] + output = str(args.output_dir + "/" + utt_id + ".wav") + if args.protocol == "http": + print("tts http client start") + from paddlespeech.server.utils.audio_handler import TTSHttpHandler + handler = TTSHttpHandler(args.server_ip, args.port, args.play) + first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run( + text, args.spk_id, args.speed, args.volume, args.sample_rate, + output) + + elif args.protocol == "websocket": + from paddlespeech.server.utils.audio_handler import TTSWsHandler + print("tts websocket client start") + handler = TTSWsHandler(args.server_ip, args.port, args.play) + loop = asyncio.get_event_loop() + first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete( + handler.run(text, output)) - with open(logfile, "r") as f: - for line in f.readlines(): - if "- first response time:" in line: - first_response = float(line.splie(" ")[-2]) - first_response_list.append(first_response) - elif "- final response time:" in line: - final_response = float(line.splie(" ")[-2]) - final_response_list.append(final_response) - elif "- The durations of audio is:" in line: - duration = float(line.splie(" ")[-2]) - duration_list.append(duration) + else: + print("Please set correct protocol, http or websocket") - assert (len(first_response_list) == len(final_response_list) and - len(final_response_list) == len(duration_list)) - - avg_first_response = sum(first_response_list) / len(first_response_list) - avg_final_response = sum(final_response_list) / len(final_response_list) - avg_duration = sum(duration_list) / len(duration_list) - RTF = sum(final_response_list) / sum(duration_list) - - print( - "************************* engine result 
***************************************" - ) - print( - f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}" - ) - print( - f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s" - ) - print( - f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s" - ) - print( - f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s" - ) + return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list if __name__ == "__main__": @@ -142,10 +60,18 @@ if __name__ == "__main__": default=0, help='Sampling rate, the default is the same as the model') parser.add_argument( - "--server", type=str, help="server ip", default="127.0.0.1") + "--server_ip", type=str, help="server ip", default="127.0.0.1") parser.add_argument("--port", type=int, help="server port", default=8092) + parser.add_argument( + "--protocol", + type=str, + choices=['http', 'websocket'], + help="server protocol", + default="http") parser.add_argument( "--output_dir", type=str, default="./output", help="output dir") + parser.add_argument( + "--play", type=bool, help="whether to play audio", default=False) args = parser.parse_args() @@ -155,13 +81,35 @@ if __name__ == "__main__": first_response_list = [] final_response_list = [] duration_list = [] + all_delay_list = [] + packet_count = 0.0 sentences = get_sentences(text_file=args.text, lang="zh") for utt_id, sentence in sentences: - first_response, final_response, duration = test(args, sentence, utt_id) + first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = test( + args, sentence, utt_id) + delay_time_list = compute_delay(receive_time_list, chunk_duration_list) first_response_list.append(first_response) final_response_list.append(final_response) 
duration_list.append(duration) + packet_count += len(receive_time_list) + + print(f"句子:{sentence}") + print(f"首包响应时间:{first_response} s") + print(f"尾包响应时间:{final_response} s") + print(f"音频时长:{duration} s") + print(f"该句RTF:{final_response/duration}") + + if delay_time_list != []: + for t in delay_time_list: + all_delay_list.append(t) + print( + f"该句流式合成的延迟情况:总包个数:{len(receive_time_list)},延迟包个数:{len(delay_time_list)}, 最小延迟时间:{min(delay_time_list)} s, 最大延迟时间:{max(delay_time_list)} s, 平均延迟时间:{sum(delay_time_list)/len(delay_time_list)} s, 延迟率:{len(delay_time_list)/len(receive_time_list)}" + ) + else: + print("该句流式合成无延迟情况") + + packet_count += len(receive_time_list) assert (len(first_response_list) == len(final_response_list) and len(final_response_list) == len(duration_list)) @@ -170,19 +118,35 @@ if __name__ == "__main__": avg_final_response = sum(final_response_list) / len(final_response_list) avg_duration = sum(duration_list) / len(duration_list) RTF = sum(final_response_list) / sum(duration_list) + if all_delay_list != []: + delay_count = len(all_delay_list) + avg_delay = sum(all_delay_list) / len(all_delay_list) + delay_ratio = len(all_delay_list) / packet_count + min_delay = min(all_delay_list) + max_delay = max(all_delay_list) + else: + delay_count = 0.0 + avg_delay = 0.0 + delay_ratio = 0.0 + min_delay = 0.0 + max_delay = 0.0 print( "************************* server/client result ***************************************" ) print( - f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}" + f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}." 
+ ) + print( + f"test num: {len(duration_list)}, packet count: {packet_count}, delay count: {delay_count}, avg delay time: {avg_delay} s, delay ratio: {delay_ratio} " ) print( f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s" ) print( - f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s" + f"min first response: {min(first_response_list)} s, max first response: {max(first_response_list)} s." ) print( - f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s" + f"min final response: {min(final_response_list)} s, max final response: {max(final_response_list)} s." ) + print(f"min delay: {min_delay} s, max delay: {max_delay}") From 851b77c5f3c72eeaac9f93e83dae6f0aa30104e9 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Fri, 6 May 2022 19:42:29 +0800 Subject: [PATCH 29/93] remove useless code, test=doc --- .../unit/server/offline/test_server_client.sh | 2 - .../online/tts/check_server/http_client.py | 100 -------------- .../online/tts/check_server/ws_client.py | 126 ------------------ 3 files changed, 228 deletions(-) delete mode 100644 tests/unit/server/online/tts/check_server/http_client.py delete mode 100644 tests/unit/server/online/tts/check_server/ws_client.py diff --git a/tests/unit/server/offline/test_server_client.sh b/tests/unit/server/offline/test_server_client.sh index 875008a7..dc52609c 100644 --- a/tests/unit/server/offline/test_server_client.sh +++ b/tests/unit/server/offline/test_server_client.sh @@ -21,8 +21,6 @@ StartService(){ } ClientTest(){ - echo "aaaaaaaaaaaaaaaaaaaaaaaaaa $server_ip" - echo "aaaaaaaaaaaaaaaaaaaaaaaaaa $port" # Client test # test asr client paddlespeech_client asr --server_ip $server_ip --port $port --input ./zh.wav diff --git a/tests/unit/server/online/tts/check_server/http_client.py b/tests/unit/server/online/tts/check_server/http_client.py deleted file mode 100644 index cbc1f5c0..00000000 --- 
a/tests/unit/server/online/tts/check_server/http_client.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import base64 -import json -import os -import time - -import requests - -from paddlespeech.server.utils.audio_process import pcm2wav - - -def save_audio(buffer, audio_path) -> bool: - if args.save_path.endswith("pcm"): - with open(args.save_path, "wb") as f: - f.write(buffer) - elif args.save_path.endswith("wav"): - with open("./tmp.pcm", "wb") as f: - f.write(buffer) - pcm2wav("./tmp.pcm", audio_path, channels=1, bits=16, sample_rate=24000) - os.system("rm ./tmp.pcm") - else: - print("Only supports saved audio format is pcm or wav") - return False - - return True - - -def test(args): - params = { - "text": args.text, - "spk_id": args.spk_id, - "speed": args.speed, - "volume": args.volume, - "sample_rate": args.sample_rate, - "save_path": '' - } - - buffer = b'' - flag = 1 - url = "http://" + str(args.server) + ":" + str( - args.port) + "/paddlespeech/streaming/tts" - st = time.time() - html = requests.post(url, json.dumps(params), stream=True) - for chunk in html.iter_content(chunk_size=1024): - chunk = base64.b64decode(chunk) # bytes - if flag: - first_response = time.time() - st - print(f"首包响应:{first_response} s") - flag = 0 - buffer += chunk - - final_response = time.time() - st - duration = len(buffer) / 2.0 / 24000 - - 
print(f"尾包响应:{final_response} s") - print(f"音频时长:{duration} s") - print(f"RTF: {final_response / duration}") - - if args.save_path is not None: - if save_audio(buffer, args.save_path): - print("音频保存至:", args.save_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - '--text', - type=str, - default="您好,欢迎使用语音合成服务。", - help='A sentence to be synthesized') - parser.add_argument('--spk_id', type=int, default=0, help='Speaker id') - parser.add_argument('--speed', type=float, default=1.0, help='Audio speed') - parser.add_argument( - '--volume', type=float, default=1.0, help='Audio volume') - parser.add_argument( - '--sample_rate', - type=int, - default=0, - help='Sampling rate, the default is the same as the model') - parser.add_argument( - "--server", type=str, help="server ip", default="127.0.0.1") - parser.add_argument("--port", type=int, help="server port", default=8092) - parser.add_argument( - "--save_path", type=str, help="save audio path", default=None) - - args = parser.parse_args() - test(args) diff --git a/tests/unit/server/online/tts/check_server/ws_client.py b/tests/unit/server/online/tts/check_server/ws_client.py deleted file mode 100644 index eef010cf..00000000 --- a/tests/unit/server/online/tts/check_server/ws_client.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import _thread as thread -import argparse -import base64 -import json -import ssl -import time - -import websocket - -flag = 1 -st = 0.0 -all_bytes = b'' - - -class WsParam(object): - # 初始化 - def __init__(self, text, server="127.0.0.1", port=8090): - self.server = server - self.port = port - self.url = "ws://" + self.server + ":" + str(self.port) + "/ws/tts" - self.text = text - - # 生成url - def create_url(self): - return self.url - - -def on_message(ws, message): - global flag - global st - global all_bytes - - try: - message = json.loads(message) - audio = message["audio"] - audio = base64.b64decode(audio) # bytes - status = message["status"] - all_bytes += audio - - if status == 0: - print("create successfully.") - elif status == 1: - if flag: - print(f"首包响应:{time.time() - st} s") - flag = 0 - elif status == 2: - final_response = time.time() - st - duration = len(all_bytes) / 2.0 / 24000 - print(f"尾包响应:{final_response} s") - print(f"音频时长:{duration} s") - print(f"RTF: {final_response / duration}") - with open("./out.pcm", "wb") as f: - f.write(all_bytes) - print("ws is closed") - ws.close() - else: - print("infer error") - - except Exception as e: - print("receive msg,but parse exception:", e) - - -# 收到websocket错误的处理 -def on_error(ws, error): - print("### error:", error) - - -# 收到websocket关闭的处理 -def on_close(ws): - print("### closed ###") - - -# 收到websocket连接建立的处理 -def on_open(ws): - def run(*args): - global st - text_base64 = str( - base64.b64encode((wsParam.text).encode('utf-8')), "UTF8") - d = {"text": text_base64} - d = json.dumps(d) - print("Start sending text data") - st = time.time() - ws.send(d) - - thread.start_new_thread(run, ()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--text", - type=str, - help="A sentence to be synthesized", - default="您好,欢迎使用语音合成服务。") - parser.add_argument( - "--server", type=str, help="server ip", default="127.0.0.1") - parser.add_argument("--port", type=int, help="server 
port", default=8092) - args = parser.parse_args() - - print("***************************************") - print("Server ip: ", args.server) - print("Server port: ", args.port) - print("Sentence to be synthesized: ", args.text) - print("***************************************") - - wsParam = WsParam(text=args.text, server=args.server, port=args.port) - - websocket.enableTrace(False) - wsUrl = wsParam.create_url() - ws = websocket.WebSocketApp( - wsUrl, on_message=on_message, on_error=on_error, on_close=on_close) - ws.on_open = on_open - ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE}) From 5dfb72980edfa10b7af2fac41cd95d8be7e2ddc3 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 6 May 2022 11:50:00 +0000 Subject: [PATCH 30/93] add commmit to version info --- setup.py | 97 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 31 deletions(-) diff --git a/setup.py b/setup.py index 912fdd6d..8cd687d8 100644 --- a/setup.py +++ b/setup.py @@ -28,6 +28,7 @@ from setuptools.command.install import install HERE = Path(os.path.abspath(os.path.dirname(__file__))) VERSION = '1.0.0a' +COMMITID = 'none' base = [ "editdistance", @@ -97,22 +98,31 @@ requirements = { } -def write_version_py(filename='paddlespeech/__init__.py'): - import paddlespeech - if hasattr(paddlespeech, - "__version__") and paddlespeech.__version__ == VERSION: - return - with open(filename, "a") as f: - f.write(f"\n__version__ = '{VERSION}'\n") +def check_call(cmd: str, shell=False, executable=None): + try: + sp.check_call( + cmd.split(), + shell=shell, + executable="/bin/bash" if shell else executable) + except sp.CalledProcessError as e: + print( + f"{__file__}:{inspect.currentframe().f_lineno}: CMD: {cmd}, Error:", + e.output, + file=sys.stderr) + raise e -def remove_version_py(filename='paddlespeech/__init__.py'): - with open(filename, "r") as f: - lines = f.readlines() - with open(filename, "w") as f: - for line in lines: - if "__version__" not in line: - 
f.write(line) +def check_output(cmd: str, shell=False): + try: + out_bytes = sp.check_output(cmd.split()) + except sp.CalledProcessError as e: + out_bytes = e.output # Output generated before error + code = e.returncode # Return code + print( + f"{__file__}:{inspect.currentframe().f_lineno}: CMD: {cmd}, Error:", + out_bytes, + file=sys.stderr) + return out_bytes.strip().decode('utf8') @contextlib.contextmanager @@ -132,24 +142,12 @@ def read(*names, **kwargs): return fp.read() -def check_call(cmd: str, shell=False, executable=None): - try: - sp.check_call( - cmd.split(), - shell=shell, - executable="/bin/bash" if shell else executable) - except sp.CalledProcessError as e: - print( - f"{__file__}:{inspect.currentframe().f_lineno}: CMD: {cmd}, Error:", - e.output, - file=sys.stderr) - raise e - - def _remove(files: str): for f in files: f.unlink() +################################# Install ################################## + def _post_install(install_lib_dir): # tools/make @@ -202,8 +200,45 @@ class UploadCommand(Command): sys.exit() -write_version_py() +################################# Version ################################## +def write_version_py(filename='paddlespeech/__init__.py'): + import paddlespeech + if hasattr(paddlespeech, + "__version__") and paddlespeech.__version__ == VERSION: + return + with open(filename, "a") as f: + out_str = f"\n__version__ = '{VERSION}'\n" + print(out_str) + f.write(f"\n__version__ = '{VERSION}'\n") + + COMMITID = check_output("git rev-parse HEAD") + with open(filename, 'a') as f: + out_str = f"\n__commit__ = '{COMMITID}'\n" + print(out_str) + f.write(f"\n__commit__ = '{COMMITID}'\n") + + print(f"{inspect.currentframe().f_code.co_name} done") + + +def remove_version_py(filename='paddlespeech/__init__.py'): + with open(filename, "r") as f: + lines = f.readlines() + with open(filename, "w") as f: + for line in lines: + if "__version__" in line or "__commit__" in line: + continue + f.write(line) + 
print(f"{inspect.currentframe().f_code.co_name} done") + + +@contextlib.contextmanager +def version_info(): + write_version_py() + yield + remove_version_py() + +################################# Steup ################################## setup_info = dict( # Metadata name='paddlespeech', @@ -273,6 +308,6 @@ setup_info = dict( ] }) -setup(**setup_info) -remove_version_py() +with version_info(): + setup(**setup_info) From 0309a4d03273c72d9f018b1e81657c354c30ee44 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 6 May 2022 11:53:45 +0000 Subject: [PATCH 31/93] Add doc for wenetspeech model, test=doc --- docs/source/released_model.md | 2 ++ examples/wenetspeech/asr0/RESULTS.md | 7 +++++++ examples/wenetspeech/asr1/RESULTS.md | 14 ++++++++++++-- 3 files changed, 21 insertions(+), 2 deletions(-) create mode 100644 examples/wenetspeech/asr0/RESULTS.md diff --git a/docs/source/released_model.md b/docs/source/released_model.md index aee44859..f0f5a1f2 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -6,8 +6,10 @@ ### Speech Recognition Model Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: +[Ds2 Online Wenetspeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz) | Wenetspeech Dataset | Char-based | 1.2 GB | 2 Conv + 5 LSTM layers with only forward direction | 0.152 (test\_net, without LM), 0.053 (aishell, with LM) |-| 10000 h |- [Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) [Ds2 Offline Aishell ASR0 
Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) +[Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) |-| 10000 h |- [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0464 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) [Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) diff --git a/examples/wenetspeech/asr0/RESULTS.md b/examples/wenetspeech/asr0/RESULTS.md new file mode 100644 index 00000000..17c0b9e5 --- /dev/null +++ b/examples/wenetspeech/asr0/RESULTS.md @@ -0,0 +1,7 @@ +# Wenetspeech + +## Deepspeech2 Streaming + +| Model | Number of Params | Release | Config | Test set | Valid Loss | CER | +| --- | --- | --- | --- | --- | --- | --- | +| DeepSpeech2 | 1.2G | 
r1.0.0a | conf/deepspeech2\_online.yaml + spec aug + fbank161 | test\_net | 13.307 | 15.02 | diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md index 5c2b8143..2b03b3f2 100644 --- a/examples/wenetspeech/asr1/RESULTS.md +++ b/examples/wenetspeech/asr1/RESULTS.md @@ -1,9 +1,19 @@ # WenetSpeech +## Conformer online + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 123.47 M | conf/chunk_conformer.yaml | spec_aug | test net | attention | 9.329 | 0.1102 | +| conformer | 123.47 M | conf/chunk_conformer.yaml | spec_aug | test net | ctc_greedy_search | 9.329 | 0.1207 | +| conformer | 123.47 M | conf/chunk_conformer.yaml | spec_aug | test net | ctc_prefix_beam_search | 9.329 | 0.1203 | +| conformer | 123.47 M | conf/chunk_conformer.yaml | spec_aug | test net | attention_rescoring | 9.329 | 0.1100 | + + ## Conformer -| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | | --- | --- | --- | --- | --- | --- | --- | --- | | conformer | 32.52 M | conf/conformer.yaml | spec_aug | dev | attention | | | | conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | ctc_greedy_search | | | @@ -16,7 +26,7 @@ Pretrain model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wenetspeech/20211025_conformer_exp.tar.gz -| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | | --- | --- | --- | --- | --- | --- | --- | --- | | conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention | - | 0.048456 | | conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 | From 9ab63d11133ef8728bf277fd7e6b70a8eae661ff Mon Sep 17 00:00:00 2001 From: root Date: Fri, 6 May 2022 
12:13:46 +0000 Subject: [PATCH 32/93] improve the doc, test=doc --- docs/source/released_model.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index f0f5a1f2..61a3eb26 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -6,8 +6,8 @@ ### Speech Recognition Model Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: -[Ds2 Online Wenetspeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz) | Wenetspeech Dataset | Char-based | 1.2 GB | 2 Conv + 5 LSTM layers with only forward direction | 0.152 (test\_net, without LM), 0.053 (aishell, with LM) |-| 10000 h |- -[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) +[Ds2 Online Wenetspeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz) | Wenetspeech Dataset | Char-based | 1.2 GB | 2 Conv + 5 LSTM layers | 0.152 (test\_net, w/o LM), 0.053 (aishell, w/ LM) |-| 10000 h |- +[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 
3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) [Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) |-| 10000 h |- [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) From 22c3f95057d5db7953e4d3234e5661bbc14319b5 Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Fri, 6 May 2022 20:49:52 +0800 Subject: [PATCH 33/93] fix fbank result --- speechx/examples/ds2_ol/aishell/run_fbank.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh index 88c16857..3d4825ac 100755 --- a/speechx/examples/ds2_ol/aishell/run_fbank.sh +++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh @@ -161,9 +161,9 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --model_cache_shapes="5-1-2048,5-1-2048" \ --graph_path=$wfst/TLG.fst --max_active=7500 \ --acoustic_scale=1.2 \ - --result_wspecifier=ark,t:./result_fbank_recognizer + --result_wspecifier=ark,t:$data/split${nj}/JOB/result_fbank_recognizer - cat $data/split${nj}/*/result_recognizer > $exp/${label_file}_recognizer + cat $data/split${nj}/*/result_fbank_recognizer > $exp/${label_file}_recognizer utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_recognizer > $exp/${wer}.recognizer echo "recognizer test have finished!!!" 
echo "please checkout in ${exp}/${wer}.recognizer" From 02e7586394dd31d37ac43e409aed8657f8eaa426 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 6 May 2022 13:02:56 +0000 Subject: [PATCH 34/93] update readme --- README.md | 35 +++++++------ README_cn.md | 63 +++++++++++------------ paddlespeech/cli/asr/pretrained_models.py | 2 +- 3 files changed, 50 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index 9791b895..d32131c0 100644 --- a/README.md +++ b/README.md @@ -151,14 +151,24 @@ For more synthesized audios, please refer to [PaddleSpeech Text-to-Speech sample ### Features Via the easy-to-use, efficient, flexible and scalable implementation, our vision is to empower both industrial application and academic research, including training, inference & testing modules, and deployment process. To be more specific, this toolkit features at: -- 📦 **Ease of Use**: low barriers to install, and [CLI](#quick-start) is available to quick-start your journey. +- 📦 **Ease of Use**: low barriers to install, [CLI](#quick-start), [Server](#quick-start-server), and [Streaming Server](#quick-start-streaming-server) is available to quick-start your journey. - 🏆 **Align to the State-of-the-Art**: we provide high-speed and ultra-lightweight models, and also cutting-edge technology. +- 🏆 **Streaming ASR and TTS System**: we provide production ready streaming asr and streaming tts system. - 💯 **Rule-based Chinese frontend**: our frontend contains Text Normalization and Grapheme-to-Phoneme (G2P, including Polyphone and Tone Sandhi). Moreover, we use self-defined linguistic rules to adapt Chinese context. -- **Varieties of Functions that Vitalize both Industrial and Academia**: - - 🛎️ *Implementation of critical audio tasks*: this toolkit contains audio functions like Audio Classification, Speech Translation, Automatic Speech Recognition, Text-to-Speech Synthesis, etc. 
+- 📦 **Varieties of Functions that Vitalize both Industrial and Academia**: + - 🛎️ *Implementation of critical audio tasks*: this toolkit contains audio functions like Automatic Speech Recognition, Text-to-Speech Synthesis, Speaker Verfication, KeyWord Spotting, Audio Classification, and Speech Translation, etc. - 🔬 *Integration of mainstream models and datasets*: the toolkit implements modules that participate in the whole pipeline of the speech tasks, and uses mainstream datasets like LibriSpeech, LJSpeech, AIShell, CSMSC, etc. See also [model list](#model-list) for more details. - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). +### Recent Update +- 👏🏻 2022.05.06: `Streaming ASR` with `Punctuation Restoration` and `Token Timestamp`. +- 👏🏻 2022.05.06: `Server` is available for `Speaker Verification`, and `Punctuation Restoration`. +- 👏🏻 2022.04.28: `Streaming Server` is available for `Automatic Speech Recognition` and `Text-to-Speech`. +- 👏🏻 2022.03.28: `Server` is available for `Audio Classification`, `Automatic Speech Recognition` and `Text-to-Speech`. +- 👏🏻 2022.03.28: `CLI` is available for `Speaker Verification`. +- 🤗 2021.12.14: [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) and [TTS](https://huggingface.co/spaces/KPatrick/PaddleSpeechTTS) Demos on Hugging Face Spaces are available! +- 👏🏻 2021.12.10: `CLI` is available for `Audio Classification`, `Automatic Speech Recognition`, `Speech Translation (English to Chinese)` and `Text-to-Speech`. + ### 🔥 Hot Activities -- 👏🏻 2022.04.28: PaddleSpeech Streaming Server 上线! 覆盖了语音识别和语音合成。 -- 👏🏻 2022.03.28: PaddleSpeech Server 上线! 
覆盖了声音分类、语音识别、以及语音合成。 -- 👏🏻 2022.03.28: PaddleSpeech CLI 上线声纹验证。 -- 🤗 2021.12.14: Our PaddleSpeech [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) and [TTS](https://huggingface.co/spaces/KPatrick/PaddleSpeechTTS) Demos on Hugging Face Spaces are available! -- 👏🏻 2021.12.10: PaddleSpeech CLI 上线!覆盖了声音分类、语音识别、语音翻译(英译中)以及语音合成。 +- 👏🏻 2022.05.06: PaddleSpeech Streaming Server 上线! 覆盖了语音识别(标点恢复、时间戳),和语音合成。 +- 👏🏻 2022.05.06: PaddleSpeech Server 上线! 覆盖了声音分类、语音识别、语音合成、声纹识别,标点恢复。 +- 👏🏻 2022.03.28: PaddleSpeech CLI 覆盖声音分类、语音识别、语音翻译(英译中)、语音合成,声纹验证。 +- 🤗 2021.12.14: PaddleSpeech [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) and [TTS](https://huggingface.co/spaces/KPatrick/PaddleSpeechTTS) Demos on Hugging Face Spaces are available! +### 🔥 热门活动 -### 特性 +- 2021.12.21~12.24 -本项目采用了易用、高效、灵活以及可扩展的实现,旨在为工业应用、学术研究提供更好的支持,实现的功能包含训练、推断以及测试模块,以及部署过程,主要包括 -- 📦 **易用性**: 安装门槛低,可使用 [CLI](#quick-start) 快速开始。 -- 🏆 **对标 SoTA**: 提供了高速、轻量级模型,且借鉴了最前沿的技术。 -- 💯 **基于规则的中文前端**: 我们的前端包含文本正则化和字音转换(G2P)。此外,我们使用自定义语言规则来适应中文语境。 -- **多种工业界以及学术界主流功能支持**: - - 🛎️ 典型音频任务: 本工具包提供了音频任务如音频分类、语音翻译、自动语音识别、文本转语音、语音合成等任务的实现。 - - 🔬 主流模型及数据集: 本工具包实现了参与整条语音任务流水线的各个模块,并且采用了主流数据集如 LibriSpeech、LJSpeech、AIShell、CSMSC,详情请见 [模型列表](#model-list)。 - - 🧩 级联模型应用: 作为传统语音任务的扩展,我们结合了自然语言处理、计算机视觉等任务,实现更接近实际需求的产业级应用。 + 4 日直播课: 深度解读 PaddleSpeech 语音技术! 
+ + **直播回放与课件资料: https://aistudio.baidu.com/aistudio/education/group/info/25130** ### 技术交流群 @@ -328,8 +327,8 @@ PaddleSpeech 的 **语音转文本** 包含语音识别声学模型、语音识 语音转文本模块类型 数据集 - 模型种类 - 链接 + 模型类型 + 脚本 @@ -402,9 +401,9 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 语音合成模块类型 - 模型种类 + 模型类型 数据集 - 链接 + 脚本 @@ -520,8 +519,8 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 任务 数据集 - 模型种类 - 链接 + 模型类型 + 脚本 @@ -544,10 +543,10 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - - - - + + + + @@ -571,8 +570,8 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - - + + diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py index 7f198ad6..0f521884 100644 --- a/paddlespeech/cli/asr/pretrained_models.py +++ b/paddlespeech/cli/asr/pretrained_models.py @@ -27,7 +27,7 @@ pretrained_models = { 'ckpt_path': 'exp/conformer/checkpoints/wenetspeech', }, - "conformer_online_wenetspeech-zh-16k": { + "conformer_online_wenetspeech-zh-16k": { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz', 'md5': From 4d7046d2441a13d2f30100dba5fe172aed52c0c8 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 7 May 2022 02:26:48 +0000 Subject: [PATCH 35/93] updata released model info, test=doc --- docs/source/released_model.md | 4 ++-- examples/wenetspeech/asr0/RESULTS.md | 1 + examples/wenetspeech/asr1/RESULTS.md | 14 ++++++++------ paddlespeech/cli/asr/pretrained_models.py | 2 +- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 61a3eb26..435985bc 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -6,10 +6,10 @@ ### Speech Recognition Model Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: -[Ds2 Online Wenetspeech ASR0 
Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz) | Wenetspeech Dataset | Char-based | 1.2 GB | 2 Conv + 5 LSTM layers | 0.152 (test\_net, w/o LM), 0.053 (aishell, w/ LM) |-| 10000 h |- +[Ds2 Online Wenetspeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz) | Wenetspeech Dataset | Char-based | 1.2 GB | 2 Conv + 5 LSTM layers | 0.152 (test\_net, w/o LM)
0.2417 (test\_meeting, w/o LM)
0.053 (aishell, w/ LM) |-| 10000 h |- [Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) -[Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) |-| 10000 h |- +[Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 18.79 (test\_meeting) |-| 10000 h |- [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0464 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) [Transformer Aishell ASR1 
Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) diff --git a/examples/wenetspeech/asr0/RESULTS.md b/examples/wenetspeech/asr0/RESULTS.md index 17c0b9e5..0566a352 100644 --- a/examples/wenetspeech/asr0/RESULTS.md +++ b/examples/wenetspeech/asr0/RESULTS.md @@ -5,3 +5,4 @@ | Model | Number of Params | Release | Config | Test set | Valid Loss | CER | | --- | --- | --- | --- | --- | --- | --- | | DeepSpeech2 | 1.2G | r1.0.0a | conf/deepspeech2\_online.yaml + spec aug + fbank161 | test\_net | 13.307 | 15.02 | +| DeepSpeech2 | 1.2G | r1.0.0a | conf/deepspeech2\_online.yaml + spec aug + fbank161 | test\_meeting | 13.307 | 24.17 | diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md index 2b03b3f2..72b815b7 100644 --- a/examples/wenetspeech/asr1/RESULTS.md +++ b/examples/wenetspeech/asr1/RESULTS.md @@ -4,12 +4,14 @@ | Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | | --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | 123.47 M | conf/chunk_conformer.yaml | spec_aug | test net | attention | 9.329 | 0.1102 | -| conformer | 123.47 M | conf/chunk_conformer.yaml | spec_aug | test net | ctc_greedy_search | 9.329 | 0.1207 | -| conformer | 123.47 M | conf/chunk_conformer.yaml | spec_aug | test net | ctc_prefix_beam_search | 9.329 | 0.1203 | -| conformer | 123.47 M | conf/chunk_conformer.yaml | spec_aug | test net | attention_rescoring | 9.329 | 0.1100 | - - +| conformer_online | 123.47 M | conf/chunk_conformer.yaml | spec_aug | test net | attention | 9.329 | 0.1102 | +| conformer_online | 123.47 M | conf/chunk_conformer.yaml | spec_aug | test net | ctc_greedy_search | 9.329 | 0.1207 | +| conformer_online | 123.47 M | conf/chunk_conformer.yaml | 
spec_aug | test net | ctc_prefix_beam_search | 9.329 | 0.1203 | +| conformer_online | 123.47 M | conf/chunk_conformer.yaml | spec_aug | test net | attention_rescoring | 9.329 | 0.1100 | +| conformer_online | 123.47 M | conf/chunk_conformer.yaml | spec_aug | test meeting | attention | 9.329 | 0.1992 | +| conformer_online | 123.47 M | conf/chunk_conformer.yaml | spec_aug | test meeting | ctc_greedy_search | 9.329 | 0.1960 | +| conformer_online | 123.47 M | conf/chunk_conformer.yaml | spec_aug | test meeting | ctc_prefix_beam_search | 9.329 | 0.1946 | +| conformer_online | 123.47 M | conf/chunk_conformer.yaml | spec_aug | test meeting | attention_rescoring | 9.329 | 0.1879| ## Conformer diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py index 7f198ad6..0f521884 100644 --- a/paddlespeech/cli/asr/pretrained_models.py +++ b/paddlespeech/cli/asr/pretrained_models.py @@ -27,7 +27,7 @@ pretrained_models = { 'ckpt_path': 'exp/conformer/checkpoints/wenetspeech', }, - "conformer_online_wenetspeech-zh-16k": { + "conformer_online_wenetspeech-zh-16k": { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz', 'md5': From bb0db29d7e1ab83378d4ee2fae6f02bebd8bfbd8 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Sat, 7 May 2022 11:31:07 +0800 Subject: [PATCH 36/93] update the streaming asr readme, test=doc --- demos/streaming_asr_server/README.md | 522 +++++++++--------- demos/streaming_asr_server/README_cn.md | 522 +++++++++--------- .../server/bin/paddlespeech_server.py | 47 +- 3 files changed, 573 insertions(+), 518 deletions(-) diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index 6808de5e..8423e5d0 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -212,69 +212,73 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav Output: ```bash - [2022-04-21 15:59:03,904] [ INFO] - 
receive msg={"status": "ok", "signal": "server_ready"} - [2022-04-21 15:59:03,960] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:03,973] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:03,987] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,000] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,012] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,024] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,036] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,047] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,607] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,620] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,633] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,645] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,657] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,669] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,680] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:05,176] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,185] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,192] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,200] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,208] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,216] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,224] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,232] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,724] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,732] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,740] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,747] [ INFO] - receive msg={'asr_results': 
'我认为跑步最重要的'} - [2022-04-21 15:59:05,755] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,763] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,770] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:06,271] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,279] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,287] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,294] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,302] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,310] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,318] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,326] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,833] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,842] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,850] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,858] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,866] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,874] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,882] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:07,400] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,408] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,416] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,424] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,432] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,440] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - 
[2022-04-21 15:59:07,447] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,455] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,984] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:07,992] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,001] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,008] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,016] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,884] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康 - [2022-04-21 15:59:12,884] [ INFO] - Response time 9.051567 s. + [2022-05-06 21:10:35,598] [ INFO] - Start to do streaming asr client + [2022-05-06 21:10:35,600] [ INFO] - asr websocket client start + [2022-05-06 21:10:35,600] [ INFO] - endpoint: ws://127.0.0.1:8390/paddlespeech/asr/streaming + [2022-05-06 21:10:35,600] [ INFO] - start to process the wavscp: ./zh.wav + [2022-05-06 21:10:35,670] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"} + [2022-05-06 21:10:35,699] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:35,713] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:35,726] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:35,738] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:35,750] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:35,762] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:35,774] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:35,786] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:36,387] [ INFO] - client 
receive msg={'result': ''} + [2022-05-06 21:10:36,398] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:36,407] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:36,416] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:36,425] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:36,434] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:36,442] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:36,930] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:10:36,938] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:10:36,946] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:10:36,954] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:10:36,962] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:10:36,970] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:10:36,977] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:10:36,985] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:10:37,484] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:10:37,492] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:10:37,500] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:10:37,508] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:10:37,517] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:10:37,525] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:10:37,532] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:10:38,050] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:10:38,058] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:10:38,066] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:10:38,073] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:10:38,081] [ 
INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:10:38,089] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:10:38,097] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:10:38,105] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:10:38,630] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:10:38,639] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:10:38,647] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:10:38,655] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:10:38,663] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:10:38,671] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:10:38,679] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:10:39,216] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:10:39,224] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:10:39,232] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:10:39,240] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:10:39,248] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:10:39,256] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:10:39,264] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:10:39,272] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:10:39,885] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:10:39,896] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:10:39,905] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:10:39,915] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:10:39,924] [ INFO] - 
client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:10:39,934] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:10:44,827] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '我认为跑步最重要的就是给我带来了身体健康', 'times': [{'w': '我', 'bg': 0.0, 'ed': 0.7000000000000001}, {'w': '认', 'bg': 0.7000000000000001, 'ed': 0.84}, {'w': '为', 'bg': 0.84, 'ed': 1.0}, {'w': '跑', 'bg': 1.0, 'ed': 1.18}, {'w': '步', 'bg': 1.18, 'ed': 1.36}, {'w': '最', 'bg': 1.36, 'ed': 1.5}, {'w': '重', 'bg': 1.5, 'ed': 1.6400000000000001}, {'w': '要', 'bg': 1.6400000000000001, 'ed': 1.78}, {'w': '的', 'bg': 1.78, 'ed': 1.9000000000000001}, {'w': '就', 'bg': 1.9000000000000001, 'ed': 2.06}, {'w': '是', 'bg': 2.06, 'ed': 2.62}, {'w': '给', 'bg': 2.62, 'ed': 3.16}, {'w': '我', 'bg': 3.16, 'ed': 3.3200000000000003}, {'w': '带', 'bg': 3.3200000000000003, 'ed': 3.48}, {'w': '来', 'bg': 3.48, 'ed': 3.62}, {'w': '了', 'bg': 3.62, 'ed': 3.7600000000000002}, {'w': '身', 'bg': 3.7600000000000002, 'ed': 3.9}, {'w': '体', 'bg': 3.9, 'ed': 4.0600000000000005}, {'w': '健', 'bg': 4.0600000000000005, 'ed': 4.26}, {'w': '康', 'bg': 4.26, 'ed': 4.96}]} + [2022-05-06 21:10:44,827] [ INFO] - audio duration: 4.9968125, elapsed time: 9.225094079971313, RTF=1.846195765794957 + [2022-05-06 21:10:44,828] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康 ``` @@ -295,67 +299,71 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav Output: ```bash - [2022-04-21 15:59:03,904] [ INFO] - receive msg={"status": "ok", "signal": "server_ready"} - [2022-04-21 15:59:03,960] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:03,973] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:03,987] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,000] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,012] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,024] [ INFO] - receive msg={'asr_results': 
''} - [2022-04-21 15:59:04,036] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,047] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,607] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,620] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,633] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,645] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,657] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,669] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,680] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:05,176] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,185] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,192] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,200] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,208] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,216] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,224] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,232] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,724] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,732] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,740] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,747] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,755] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,763] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,770] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:06,271] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,279] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,287] [ INFO] - receive 
msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,294] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,302] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,310] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,318] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,326] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,833] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,842] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,850] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,858] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,866] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,874] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,882] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:07,400] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,408] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,416] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,424] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,432] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,440] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,447] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,455] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,984] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:07,992] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,001] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,008] 
[ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,016] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:14:03,137] [ INFO] - asr websocket client start + [2022-05-06 21:14:03,137] [ INFO] - endpoint: ws://127.0.0.1:8390/paddlespeech/asr/streaming + [2022-05-06 21:14:03,149] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"} + [2022-05-06 21:14:03,167] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,181] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,194] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,207] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,219] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,230] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,241] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,252] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,768] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,776] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,784] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,792] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,800] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,807] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,815] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:04,301] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:14:04,309] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:14:04,317] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:14:04,325] [ INFO] - client receive 
msg={'result': '我认为跑'} + [2022-05-06 21:14:04,333] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:14:04,341] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:14:04,349] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:14:04,356] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:14:04,855] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:14:04,864] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:14:04,871] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:14:04,879] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:14:04,887] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:14:04,894] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:14:04,902] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:14:05,418] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:14:05,426] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:14:05,434] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:14:05,442] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:14:05,449] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:14:05,457] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:14:05,465] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:14:05,473] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:14:05,996] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:14:06,006] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:14:06,013] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:14:06,021] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:14:06,029] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 
21:14:06,037] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:14:06,045] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:14:06,581] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:14:06,589] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:14:06,597] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:14:06,605] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:14:06,613] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:14:06,621] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:14:06,628] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:14:06,636] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:14:07,188] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:14:07,196] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:14:07,203] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:14:07,211] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:14:07,219] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:14:07,226] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:14:12,158] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '我认为跑步最重要的就是给我带来了身体健康', 'times': [{'w': '我', 'bg': 0.0, 'ed': 0.7000000000000001}, {'w': '认', 'bg': 0.7000000000000001, 'ed': 0.84}, {'w': '为', 'bg': 0.84, 'ed': 1.0}, {'w': '跑', 'bg': 1.0, 'ed': 1.18}, {'w': '步', 'bg': 1.18, 'ed': 1.36}, {'w': '最', 'bg': 1.36, 'ed': 1.5}, {'w': '重', 'bg': 1.5, 'ed': 1.6400000000000001}, {'w': '要', 'bg': 1.6400000000000001, 'ed': 1.78}, {'w': '的', 'bg': 1.78, 'ed': 1.9000000000000001}, {'w': '就', 'bg': 1.9000000000000001, 'ed': 2.06}, 
{'w': '是', 'bg': 2.06, 'ed': 2.62}, {'w': '给', 'bg': 2.62, 'ed': 3.16}, {'w': '我', 'bg': 3.16, 'ed': 3.3200000000000003}, {'w': '带', 'bg': 3.3200000000000003, 'ed': 3.48}, {'w': '来', 'bg': 3.48, 'ed': 3.62}, {'w': '了', 'bg': 3.62, 'ed': 3.7600000000000002}, {'w': '身', 'bg': 3.7600000000000002, 'ed': 3.9}, {'w': '体', 'bg': 3.9, 'ed': 4.0600000000000005}, {'w': '健', 'bg': 4.0600000000000005, 'ed': 4.26}, {'w': '康', 'bg': 4.26, 'ed': 4.96}]} + [2022-05-06 21:14:12,159] [ INFO] - audio duration: 4.9968125, elapsed time: 9.019973039627075, RTF=1.8051453881103354 + [2022-05-06 21:14:12,160] [ INFO] - asr websocket client finished ``` @@ -488,72 +496,73 @@ bash server.sh ``` Output: ``` - [2022-05-02 18:57:46,961] [ INFO] - asr websocket client start - [2022-05-02 18:57:46,961] [ INFO] - endpoint: ws://127.0.0.1:8290/paddlespeech/asr/streaming - [2022-05-02 18:57:46,982] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"} - [2022-05-02 18:57:46,999] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,011] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,023] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,035] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,046] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,057] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,068] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,079] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,222] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,230] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,239] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,247] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,255] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,263] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,271] [ INFO] - 
client receive msg={'result': ''} - [2022-05-02 18:57:47,462] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:57:47,525] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:57:47,589] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:57:47,649] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:57:47,708] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:57:47,766] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:57:47,824] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:57:47,881] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:57:48,130] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:57:48,200] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:57:48,265] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:57:48,327] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:57:48,389] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:57:48,448] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:57:48,505] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:57:48,754] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:57:48,821] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:57:48,881] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:57:48,939] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:57:49,011] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:57:49,080] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:57:49,146] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:57:49,210] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:57:49,452] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - 
[2022-05-02 18:57:49,516] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:57:49,581] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:57:49,645] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:57:49,706] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:57:49,763] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:57:49,818] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:57:50,064] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:57:50,125] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:57:50,186] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:57:50,245] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:57:50,301] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:57:50,358] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:57:50,414] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:57:50,469] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:57:50,712] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:57:50,776] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:57:50,837] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:57:50,897] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:57:50,956] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:57:51,012] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:57:51,276] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:57:51,277] [ INFO] - asr websocket client finished - 
[2022-05-02 18:57:51,277] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康。 - [2022-05-02 18:57:51,277] [ INFO] - Response time 4.316903 s. + [2022-05-07 11:21:47,060] [ INFO] - asr websocket client start + [2022-05-07 11:21:47,060] [ INFO] - endpoint: ws://127.0.0.1:8490/paddlespeech/asr/streaming + [2022-05-07 11:21:47,080] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"} + [2022-05-07 11:21:47,096] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,108] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,120] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,131] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,142] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,152] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,163] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,173] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,705] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,713] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,721] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,728] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,736] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,743] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,751] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:48,459] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:21:48,572] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:21:48,681] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:21:48,790] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:21:48,898] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:21:49,005] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:21:49,112] [ INFO] - client receive msg={'result': 
'我认为,跑'} + [2022-05-07 11:21:49,219] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:21:49,935] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:21:50,062] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:21:50,186] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:21:50,310] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:21:50,435] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:21:50,560] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:21:50,686] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:21:51,444] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:21:51,606] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:21:51,744] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:21:51,882] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:21:52,020] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:21:52,159] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:21:52,298] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:21:52,437] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:21:53,298] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:21:53,450] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:21:53,589] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:21:53,728] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:21:53,867] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:21:54,007] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:21:54,146] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:21:55,002] [ INFO] - client 
receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:21:55,148] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:21:55,292] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:21:55,437] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:21:55,584] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:21:55,731] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:21:55,877] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:21:56,021] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:21:56,842] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:21:57,013] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:21:57,174] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:21:57,336] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:21:57,497] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:21:57,659] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:22:03,035] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '我认为跑步最重要的就是给我带来了身体健康。', 'times': [{'w': '我', 'bg': 0.0, 'ed': 0.7000000000000001}, {'w': '认', 'bg': 0.7000000000000001, 'ed': 0.84}, {'w': '为', 'bg': 0.84, 'ed': 1.0}, {'w': '跑', 'bg': 1.0, 'ed': 1.18}, {'w': '步', 'bg': 1.18, 'ed': 1.36}, {'w': '最', 'bg': 1.36, 'ed': 1.5}, {'w': '重', 'bg': 1.5, 'ed': 1.6400000000000001}, {'w': '要', 'bg': 1.6400000000000001, 'ed': 1.78}, {'w': '的', 'bg': 1.78, 'ed': 1.9000000000000001}, {'w': '就', 'bg': 1.9000000000000001, 'ed': 2.06}, {'w': '是', 'bg': 2.06, 'ed': 2.62}, {'w': '给', 'bg': 2.62, 'ed': 3.16}, {'w': '我', 'bg': 3.16, 'ed': 3.3200000000000003}, {'w': '带', 'bg': 3.3200000000000003, 'ed': 3.48}, {'w': '来', 'bg': 
3.48, 'ed': 3.62}, {'w': '了', 'bg': 3.62, 'ed': 3.7600000000000002}, {'w': '身', 'bg': 3.7600000000000002, 'ed': 3.9}, {'w': '体', 'bg': 3.9, 'ed': 4.0600000000000005}, {'w': '健', 'bg': 4.0600000000000005, 'ed': 4.26}, {'w': '康', 'bg': 4.26, 'ed': 4.96}]} + [2022-05-07 11:22:03,035] [ INFO] - audio duration: 4.9968125, elapsed time: 15.974023818969727, RTF=3.1968427510477384 + [2022-05-07 11:22:03,037] [ INFO] - asr websocket client finished + [2022-05-07 11:22:03,037] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康。 + [2022-05-07 11:22:03,037] [ INFO] - Response time 15.977116 s. ``` - Use script @@ -562,72 +571,73 @@ bash server.sh ``` Output: ``` - [2022-05-02 18:29:22,039] [ INFO] - Start to do streaming asr client - [2022-05-02 18:29:22,040] [ INFO] - asr websocket client start - [2022-05-02 18:29:22,040] [ INFO] - endpoint: ws://127.0.0.1:8290/paddlespeech/asr/streaming - [2022-05-02 18:29:22,041] [ INFO] - start to process the wavscp: ./zh.wav - [2022-05-02 18:29:22,122] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"} - [2022-05-02 18:29:22,351] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,360] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,368] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,376] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,384] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,392] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,400] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,408] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,549] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,558] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,567] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,575] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,583] [ INFO] - client receive msg={'result': ''} - [2022-05-02 
18:29:22,591] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,599] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,822] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:29:22,879] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:29:22,937] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:29:22,995] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:29:23,052] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:29:23,107] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:29:23,161] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:29:23,213] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:29:23,454] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:29:23,515] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:29:23,575] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:29:23,630] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:29:23,684] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:29:23,736] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:29:23,789] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:29:24,030] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:29:24,095] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:29:24,156] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:29:24,213] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:29:24,268] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:29:24,323] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:29:24,377] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:29:24,429] [ INFO] - client receive msg={'result': 
'我认为,跑步最重要的就是。'} - [2022-05-02 18:29:24,671] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:29:24,736] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:29:24,797] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:29:24,857] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:29:24,918] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:29:24,975] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:29:25,029] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:29:25,271] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:29:25,336] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:29:25,398] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:29:25,458] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:29:25,521] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:29:25,579] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:29:25,652] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:29:25,722] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:29:25,969] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:29:26,034] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:29:26,095] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:29:26,163] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:29:26,229] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:29:26,294] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:29:26,565] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 
'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:29:26,566] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康。 + [2022-05-07 11:11:02,984] [ INFO] - Start to do streaming asr client + [2022-05-07 11:11:02,985] [ INFO] - asr websocket client start + [2022-05-07 11:11:02,985] [ INFO] - endpoint: ws://127.0.0.1:8490/paddlespeech/asr/streaming + [2022-05-07 11:11:02,986] [ INFO] - start to process the wavscp: ./zh.wav + [2022-05-07 11:11:03,006] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"} + [2022-05-07 11:11:03,021] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,034] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,046] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,058] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,070] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,081] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,092] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,102] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,629] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,638] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,645] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,653] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,661] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,668] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,676] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:04,402] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:11:04,510] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:11:04,619] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:11:04,743] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:11:04,849] [ INFO] - client receive msg={'result': '我认为,跑'} 
+ [2022-05-07 11:11:04,956] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:11:05,063] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:11:05,170] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:11:05,876] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:11:06,019] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:11:06,184] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:11:06,342] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:11:06,537] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:11:06,727] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:11:06,871] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:11:07,617] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:11:07,769] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:11:07,905] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:11:08,043] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:11:08,186] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:11:08,326] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:11:08,466] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:11:08,611] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:11:09,431] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:11:09,571] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:11:09,714] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:11:09,853] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:11:09,992] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:11:10,129] [ INFO] - client receive msg={'result': 
'我认为,跑步最重要的就是给。'} + [2022-05-07 11:11:10,266] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:11:11,113] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:11:11,296] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:11:11,439] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:11:11,582] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:11:11,727] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:11:11,869] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:11:12,011] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:11:12,153] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:11:12,969] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:11:13,137] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:11:13,297] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:11:13,456] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:11:13,615] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:11:13,776] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:11:18,915] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '我认为跑步最重要的就是给我带来了身体健康。', 'times': [{'w': '我', 'bg': 0.0, 'ed': 0.7000000000000001}, {'w': '认', 'bg': 0.7000000000000001, 'ed': 0.84}, {'w': '为', 'bg': 0.84, 'ed': 1.0}, {'w': '跑', 'bg': 1.0, 'ed': 1.18}, {'w': '步', 'bg': 1.18, 'ed': 1.36}, {'w': '最', 'bg': 1.36, 'ed': 1.5}, {'w': '重', 'bg': 1.5, 'ed': 1.6400000000000001}, {'w': '要', 'bg': 1.6400000000000001, 'ed': 1.78}, {'w': '的', 'bg': 1.78, 'ed': 1.9000000000000001}, {'w': '就', 'bg': 1.9000000000000001, 'ed': 2.06}, {'w': '是', 'bg': 2.06, 'ed': 2.62}, 
{'w': '给', 'bg': 2.62, 'ed': 3.16}, {'w': '我', 'bg': 3.16, 'ed': 3.3200000000000003}, {'w': '带', 'bg': 3.3200000000000003, 'ed': 3.48}, {'w': '来', 'bg': 3.48, 'ed': 3.62}, {'w': '了', 'bg': 3.62, 'ed': 3.7600000000000002}, {'w': '身', 'bg': 3.7600000000000002, 'ed': 3.9}, {'w': '体', 'bg': 3.9, 'ed': 4.0600000000000005}, {'w': '健', 'bg': 4.0600000000000005, 'ed': 4.26}, {'w': '康', 'bg': 4.26, 'ed': 4.96}]} + [2022-05-07 11:11:18,915] [ INFO] - audio duration: 4.9968125, elapsed time: 15.928460597991943, RTF=3.187724293835709 + [2022-05-07 11:11:18,916] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康 ``` diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index 5fa81d4b..bda545dd 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -220,69 +220,73 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav 输出: ```bash - [2022-04-21 15:59:03,904] [ INFO] - receive msg={"status": "ok", "signal": "server_ready"} - [2022-04-21 15:59:03,960] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:03,973] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:03,987] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,000] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,012] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,024] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,036] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,047] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,607] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,620] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,633] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,645] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,657] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,669] [ INFO] - receive 
msg={'asr_results': ''} - [2022-04-21 15:59:04,680] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:05,176] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,185] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,192] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,200] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,208] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,216] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,224] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,232] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,724] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,732] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,740] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,747] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,755] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,763] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,770] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:06,271] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,279] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,287] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,294] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,302] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,310] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,318] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,326] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,833] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,842] [ 
INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,850] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,858] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,866] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,874] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,882] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:07,400] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,408] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,416] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,424] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,432] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,440] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,447] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,455] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,984] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:07,992] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,001] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,008] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,016] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,884] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康 - [2022-04-21 15:59:12,884] [ INFO] - Response time 9.051567 s. 
+ [2022-05-06 21:10:35,598] [ INFO] - Start to do streaming asr client + [2022-05-06 21:10:35,600] [ INFO] - asr websocket client start + [2022-05-06 21:10:35,600] [ INFO] - endpoint: ws://127.0.0.1:8390/paddlespeech/asr/streaming + [2022-05-06 21:10:35,600] [ INFO] - start to process the wavscp: ./zh.wav + [2022-05-06 21:10:35,670] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"} + [2022-05-06 21:10:35,699] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:35,713] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:35,726] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:35,738] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:35,750] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:35,762] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:35,774] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:35,786] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:36,387] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:36,398] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:36,407] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:36,416] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:36,425] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:36,434] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:36,442] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:10:36,930] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:10:36,938] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:10:36,946] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:10:36,954] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:10:36,962] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:10:36,970] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:10:36,977] [ INFO] - client receive 
msg={'result': '我认为跑'} + [2022-05-06 21:10:36,985] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:10:37,484] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:10:37,492] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:10:37,500] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:10:37,508] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:10:37,517] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:10:37,525] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:10:37,532] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:10:38,050] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:10:38,058] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:10:38,066] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:10:38,073] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:10:38,081] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:10:38,089] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:10:38,097] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:10:38,105] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:10:38,630] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:10:38,639] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:10:38,647] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:10:38,655] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:10:38,663] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:10:38,671] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:10:38,679] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:10:39,216] [ INFO] - client receive msg={'result': 
'我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:10:39,224] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:10:39,232] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:10:39,240] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:10:39,248] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:10:39,256] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:10:39,264] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:10:39,272] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:10:39,885] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:10:39,896] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:10:39,905] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:10:39,915] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:10:39,924] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:10:39,934] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:10:44,827] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '我认为跑步最重要的就是给我带来了身体健康', 'times': [{'w': '我', 'bg': 0.0, 'ed': 0.7000000000000001}, {'w': '认', 'bg': 0.7000000000000001, 'ed': 0.84}, {'w': '为', 'bg': 0.84, 'ed': 1.0}, {'w': '跑', 'bg': 1.0, 'ed': 1.18}, {'w': '步', 'bg': 1.18, 'ed': 1.36}, {'w': '最', 'bg': 1.36, 'ed': 1.5}, {'w': '重', 'bg': 1.5, 'ed': 1.6400000000000001}, {'w': '要', 'bg': 1.6400000000000001, 'ed': 1.78}, {'w': '的', 'bg': 1.78, 'ed': 1.9000000000000001}, {'w': '就', 'bg': 1.9000000000000001, 'ed': 2.06}, {'w': '是', 'bg': 2.06, 'ed': 2.62}, {'w': '给', 'bg': 2.62, 'ed': 3.16}, {'w': '我', 'bg': 3.16, 'ed': 3.3200000000000003}, {'w': '带', 'bg': 3.3200000000000003, 'ed': 3.48}, {'w': '来', 'bg': 3.48, 'ed': 3.62}, {'w': '了', 'bg': 
3.62, 'ed': 3.7600000000000002}, {'w': '身', 'bg': 3.7600000000000002, 'ed': 3.9}, {'w': '体', 'bg': 3.9, 'ed': 4.0600000000000005}, {'w': '健', 'bg': 4.0600000000000005, 'ed': 4.26}, {'w': '康', 'bg': 4.26, 'ed': 4.96}]} + [2022-05-06 21:10:44,827] [ INFO] - audio duration: 4.9968125, elapsed time: 9.225094079971313, RTF=1.846195765794957 + [2022-05-06 21:10:44,828] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康 ``` - Python API @@ -302,67 +306,71 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav 输出: ```bash - [2022-04-21 15:59:03,904] [ INFO] - receive msg={"status": "ok", "signal": "server_ready"} - [2022-04-21 15:59:03,960] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:03,973] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:03,987] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,000] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,012] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,024] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,036] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,047] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,607] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,620] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,633] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,645] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,657] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,669] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,680] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:05,176] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,185] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,192] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,200] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 
15:59:05,208] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,216] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,224] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,232] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,724] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,732] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,740] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,747] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,755] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,763] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,770] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:06,271] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,279] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,287] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,294] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,302] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,310] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,318] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,326] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,833] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,842] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,850] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,858] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,866] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,874] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 
15:59:06,882] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:07,400] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,408] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,416] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,424] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,432] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,440] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,447] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,455] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,984] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:07,992] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,001] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,008] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,016] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:14:03,137] [ INFO] - asr websocket client start + [2022-05-06 21:14:03,137] [ INFO] - endpoint: ws://127.0.0.1:8390/paddlespeech/asr/streaming + [2022-05-06 21:14:03,149] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"} + [2022-05-06 21:14:03,167] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,181] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,194] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,207] [ INFO] - client receive msg={'result': ''} + [2022-05-06 
21:14:03,219] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,230] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,241] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,252] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,768] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,776] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,784] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,792] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,800] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,807] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:03,815] [ INFO] - client receive msg={'result': ''} + [2022-05-06 21:14:04,301] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:14:04,309] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:14:04,317] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:14:04,325] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:14:04,333] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:14:04,341] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:14:04,349] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:14:04,356] [ INFO] - client receive msg={'result': '我认为跑'} + [2022-05-06 21:14:04,855] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:14:04,864] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:14:04,871] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:14:04,879] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:14:04,887] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:14:04,894] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:14:04,902] [ INFO] - client receive msg={'result': '我认为跑步最重要的'} + [2022-05-06 21:14:05,418] [ INFO] - client 
receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:14:05,426] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:14:05,434] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:14:05,442] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:14:05,449] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:14:05,457] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:14:05,465] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:14:05,473] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是'} + [2022-05-06 21:14:05,996] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:14:06,006] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:14:06,013] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:14:06,021] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:14:06,029] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:14:06,037] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:14:06,045] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给'} + [2022-05-06 21:14:06,581] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:14:06,589] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:14:06,597] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:14:06,605] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:14:06,613] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:14:06,621] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:14:06,628] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:14:06,636] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了'} + [2022-05-06 21:14:07,188] [ INFO] - client receive msg={'result': 
'我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:14:07,196] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:14:07,203] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:14:07,211] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:14:07,219] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:14:07,226] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-05-06 21:14:12,158] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '我认为跑步最重要的就是给我带来了身体健康', 'times': [{'w': '我', 'bg': 0.0, 'ed': 0.7000000000000001}, {'w': '认', 'bg': 0.7000000000000001, 'ed': 0.84}, {'w': '为', 'bg': 0.84, 'ed': 1.0}, {'w': '跑', 'bg': 1.0, 'ed': 1.18}, {'w': '步', 'bg': 1.18, 'ed': 1.36}, {'w': '最', 'bg': 1.36, 'ed': 1.5}, {'w': '重', 'bg': 1.5, 'ed': 1.6400000000000001}, {'w': '要', 'bg': 1.6400000000000001, 'ed': 1.78}, {'w': '的', 'bg': 1.78, 'ed': 1.9000000000000001}, {'w': '就', 'bg': 1.9000000000000001, 'ed': 2.06}, {'w': '是', 'bg': 2.06, 'ed': 2.62}, {'w': '给', 'bg': 2.62, 'ed': 3.16}, {'w': '我', 'bg': 3.16, 'ed': 3.3200000000000003}, {'w': '带', 'bg': 3.3200000000000003, 'ed': 3.48}, {'w': '来', 'bg': 3.48, 'ed': 3.62}, {'w': '了', 'bg': 3.62, 'ed': 3.7600000000000002}, {'w': '身', 'bg': 3.7600000000000002, 'ed': 3.9}, {'w': '体', 'bg': 3.9, 'ed': 4.0600000000000005}, {'w': '健', 'bg': 4.0600000000000005, 'ed': 4.26}, {'w': '康', 'bg': 4.26, 'ed': 4.96}]} + [2022-05-06 21:14:12,159] [ INFO] - audio duration: 4.9968125, elapsed time: 9.019973039627075, RTF=1.8051453881103354 + [2022-05-06 21:14:12,160] [ INFO] - asr websocket client finished ``` @@ -496,72 +504,73 @@ bash server.sh ``` 输出: ``` - [2022-05-02 18:57:46,961] [ INFO] - asr websocket client start - [2022-05-02 18:57:46,961] [ INFO] - endpoint: ws://127.0.0.1:8290/paddlespeech/asr/streaming - [2022-05-02 18:57:46,982] [ INFO] - client receive msg={"status": "ok", "signal": 
"server_ready"} - [2022-05-02 18:57:46,999] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,011] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,023] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,035] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,046] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,057] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,068] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,079] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,222] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,230] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,239] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,247] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,255] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,263] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,271] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:57:47,462] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:57:47,525] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:57:47,589] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:57:47,649] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:57:47,708] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:57:47,766] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:57:47,824] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:57:47,881] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:57:48,130] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:57:48,200] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:57:48,265] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:57:48,327] [ INFO] - 
client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:57:48,389] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:57:48,448] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:57:48,505] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:57:48,754] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:57:48,821] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:57:48,881] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:57:48,939] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:57:49,011] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:57:49,080] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:57:49,146] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:57:49,210] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:57:49,452] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:57:49,516] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:57:49,581] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:57:49,645] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:57:49,706] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:57:49,763] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:57:49,818] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:57:50,064] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:57:50,125] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:57:50,186] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:57:50,245] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:57:50,301] [ INFO] - client receive msg={'result': 
'我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:57:50,358] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:57:50,414] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:57:50,469] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:57:50,712] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:57:50,776] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:57:50,837] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:57:50,897] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:57:50,956] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:57:51,012] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:57:51,276] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:57:51,277] [ INFO] - asr websocket client finished - [2022-05-02 18:57:51,277] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康。 - [2022-05-02 18:57:51,277] [ INFO] - Response time 4.316903 s. 
+ [2022-05-07 11:21:47,060] [ INFO] - asr websocket client start + [2022-05-07 11:21:47,060] [ INFO] - endpoint: ws://127.0.0.1:8490/paddlespeech/asr/streaming + [2022-05-07 11:21:47,080] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"} + [2022-05-07 11:21:47,096] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,108] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,120] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,131] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,142] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,152] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,163] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,173] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,705] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,713] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,721] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,728] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,736] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,743] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:47,751] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:21:48,459] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:21:48,572] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:21:48,681] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:21:48,790] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:21:48,898] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:21:49,005] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:21:49,112] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:21:49,219] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:21:49,935] [ INFO] - 
client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:21:50,062] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:21:50,186] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:21:50,310] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:21:50,435] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:21:50,560] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:21:50,686] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:21:51,444] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:21:51,606] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:21:51,744] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:21:51,882] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:21:52,020] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:21:52,159] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:21:52,298] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:21:52,437] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:21:53,298] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:21:53,450] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:21:53,589] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:21:53,728] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:21:53,867] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:21:54,007] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:21:54,146] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:21:55,002] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:21:55,148] [ INFO] - client receive msg={'result': 
'我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:21:55,292] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:21:55,437] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:21:55,584] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:21:55,731] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:21:55,877] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:21:56,021] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:21:56,842] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:21:57,013] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:21:57,174] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:21:57,336] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:21:57,497] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:21:57,659] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:22:03,035] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '我认为跑步最重要的就是给我带来了身体健康。', 'times': [{'w': '我', 'bg': 0.0, 'ed': 0.7000000000000001}, {'w': '认', 'bg': 0.7000000000000001, 'ed': 0.84}, {'w': '为', 'bg': 0.84, 'ed': 1.0}, {'w': '跑', 'bg': 1.0, 'ed': 1.18}, {'w': '步', 'bg': 1.18, 'ed': 1.36}, {'w': '最', 'bg': 1.36, 'ed': 1.5}, {'w': '重', 'bg': 1.5, 'ed': 1.6400000000000001}, {'w': '要', 'bg': 1.6400000000000001, 'ed': 1.78}, {'w': '的', 'bg': 1.78, 'ed': 1.9000000000000001}, {'w': '就', 'bg': 1.9000000000000001, 'ed': 2.06}, {'w': '是', 'bg': 2.06, 'ed': 2.62}, {'w': '给', 'bg': 2.62, 'ed': 3.16}, {'w': '我', 'bg': 3.16, 'ed': 3.3200000000000003}, {'w': '带', 'bg': 3.3200000000000003, 'ed': 3.48}, {'w': '来', 'bg': 3.48, 'ed': 3.62}, {'w': '了', 'bg': 3.62, 'ed': 3.7600000000000002}, {'w': '身', 'bg': 3.7600000000000002, 'ed': 
3.9}, {'w': '体', 'bg': 3.9, 'ed': 4.0600000000000005}, {'w': '健', 'bg': 4.0600000000000005, 'ed': 4.26}, {'w': '康', 'bg': 4.26, 'ed': 4.96}]} + [2022-05-07 11:22:03,035] [ INFO] - audio duration: 4.9968125, elapsed time: 15.974023818969727, RTF=3.1968427510477384 + [2022-05-07 11:22:03,037] [ INFO] - asr websocket client finished + [2022-05-07 11:22:03,037] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康。 + [2022-05-07 11:22:03,037] [ INFO] - Response time 15.977116 s. ``` - 使用脚本调用 @@ -570,72 +579,73 @@ bash server.sh ``` 输出: ``` - [2022-05-02 18:29:22,039] [ INFO] - Start to do streaming asr client - [2022-05-02 18:29:22,040] [ INFO] - asr websocket client start - [2022-05-02 18:29:22,040] [ INFO] - endpoint: ws://127.0.0.1:8290/paddlespeech/asr/streaming - [2022-05-02 18:29:22,041] [ INFO] - start to process the wavscp: ./zh.wav - [2022-05-02 18:29:22,122] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"} - [2022-05-02 18:29:22,351] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,360] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,368] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,376] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,384] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,392] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,400] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,408] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,549] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,558] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,567] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,575] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,583] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,591] [ INFO] - client receive msg={'result': ''} - [2022-05-02 18:29:22,599] [ INFO] - client receive 
msg={'result': ''} - [2022-05-02 18:29:22,822] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:29:22,879] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:29:22,937] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:29:22,995] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:29:23,052] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:29:23,107] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:29:23,161] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:29:23,213] [ INFO] - client receive msg={'result': '我认为,跑'} - [2022-05-02 18:29:23,454] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:29:23,515] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:29:23,575] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:29:23,630] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:29:23,684] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:29:23,736] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:29:23,789] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} - [2022-05-02 18:29:24,030] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:29:24,095] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:29:24,156] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:29:24,213] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:29:24,268] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:29:24,323] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:29:24,377] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:29:24,429] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} - [2022-05-02 18:29:24,671] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 
18:29:24,736] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:29:24,797] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:29:24,857] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:29:24,918] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:29:24,975] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:29:25,029] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} - [2022-05-02 18:29:25,271] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:29:25,336] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:29:25,398] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:29:25,458] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:29:25,521] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:29:25,579] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:29:25,652] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:29:25,722] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} - [2022-05-02 18:29:25,969] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:29:26,034] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:29:26,095] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:29:26,163] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:29:26,229] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:29:26,294] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:29:26,565] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '我认为跑步最重要的就是给我带来了身体健康。'} - [2022-05-02 18:29:26,566] [ INFO] - asr websocket client finished : 
我认为跑步最重要的就是给我带来了身体健康。 + [2022-05-07 11:11:02,984] [ INFO] - Start to do streaming asr client + [2022-05-07 11:11:02,985] [ INFO] - asr websocket client start + [2022-05-07 11:11:02,985] [ INFO] - endpoint: ws://127.0.0.1:8490/paddlespeech/asr/streaming + [2022-05-07 11:11:02,986] [ INFO] - start to process the wavscp: ./zh.wav + [2022-05-07 11:11:03,006] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"} + [2022-05-07 11:11:03,021] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,034] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,046] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,058] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,070] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,081] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,092] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,102] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,629] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,638] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,645] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,653] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,661] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,668] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:03,676] [ INFO] - client receive msg={'result': ''} + [2022-05-07 11:11:04,402] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:11:04,510] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:11:04,619] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:11:04,743] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:11:04,849] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:11:04,956] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:11:05,063] 
[ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:11:05,170] [ INFO] - client receive msg={'result': '我认为,跑'} + [2022-05-07 11:11:05,876] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:11:06,019] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:11:06,184] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:11:06,342] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:11:06,537] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:11:06,727] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:11:06,871] [ INFO] - client receive msg={'result': '我认为,跑步最重要的。'} + [2022-05-07 11:11:07,617] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:11:07,769] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:11:07,905] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:11:08,043] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:11:08,186] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:11:08,326] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:11:08,466] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:11:08,611] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是。'} + [2022-05-07 11:11:09,431] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:11:09,571] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:11:09,714] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:11:09,853] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:11:09,992] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:11:10,129] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + [2022-05-07 11:11:10,266] [ INFO] - client receive msg={'result': '我认为,跑步最重要的就是给。'} + 
[2022-05-07 11:11:11,113] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:11:11,296] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:11:11,439] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:11:11,582] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:11:11,727] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:11:11,869] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:11:12,011] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:11:12,153] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了。'} + [2022-05-07 11:11:12,969] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:11:13,137] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:11:13,297] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:11:13,456] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:11:13,615] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:11:13,776] [ INFO] - client receive msg={'result': '我认为跑步最重要的就是给我带来了身体健康。'} + [2022-05-07 11:11:18,915] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '我认为跑步最重要的就是给我带来了身体健康。', 'times': [{'w': '我', 'bg': 0.0, 'ed': 0.7000000000000001}, {'w': '认', 'bg': 0.7000000000000001, 'ed': 0.84}, {'w': '为', 'bg': 0.84, 'ed': 1.0}, {'w': '跑', 'bg': 1.0, 'ed': 1.18}, {'w': '步', 'bg': 1.18, 'ed': 1.36}, {'w': '最', 'bg': 1.36, 'ed': 1.5}, {'w': '重', 'bg': 1.5, 'ed': 1.6400000000000001}, {'w': '要', 'bg': 1.6400000000000001, 'ed': 1.78}, {'w': '的', 'bg': 1.78, 'ed': 1.9000000000000001}, {'w': '就', 'bg': 1.9000000000000001, 'ed': 2.06}, {'w': '是', 'bg': 2.06, 'ed': 2.62}, {'w': '给', 'bg': 2.62, 'ed': 3.16}, {'w': '我', 'bg': 3.16, 'ed': 3.3200000000000003}, {'w': '带', 'bg': 
3.3200000000000003, 'ed': 3.48}, {'w': '来', 'bg': 3.48, 'ed': 3.62}, {'w': '了', 'bg': 3.62, 'ed': 3.7600000000000002}, {'w': '身', 'bg': 3.7600000000000002, 'ed': 3.9}, {'w': '体', 'bg': 3.9, 'ed': 4.0600000000000005}, {'w': '健', 'bg': 4.0600000000000005, 'ed': 4.26}, {'w': '康', 'bg': 4.26, 'ed': 4.96}]} + [2022-05-07 11:11:18,915] [ INFO] - audio duration: 4.9968125, elapsed time: 15.928460597991943, RTF=3.187724293835709 + [2022-05-07 11:11:18,916] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康 ``` diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index 474a8b79..9e3b0ed0 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -109,14 +109,16 @@ class ServerStatsExecutor(): '--task', type=str, default=None, - choices=['asr', 'tts', 'cls'], + choices=['asr', 'tts', 'cls', 'text', 'vector'], help='Choose speech task.', required=True) - self.task_choices = ['asr', 'tts', 'cls'] + self.task_choices = ['asr', 'tts', 'cls', 'text', 'vector'] self.model_name_format = { 'asr': 'Model-Language-Sample Rate', 'tts': 'Model-Language', - 'cls': 'Model-Sample Rate' + 'cls': 'Model-Sample Rate', + 'text': 'Model-Task-Language', + 'vector': 'Model-Sample Rate' } def show_support_models(self, pretrained_models: dict): @@ -137,7 +139,7 @@ class ServerStatsExecutor(): "Please input correct speech task, choices = ['asr', 'tts']") return False - elif self.task == 'asr': + elif self.task.lower() == 'asr': try: from paddlespeech.cli.asr.infer import pretrained_models logger.info( @@ -159,7 +161,7 @@ class ServerStatsExecutor(): ) return False - elif self.task == 'tts': + elif self.task.lower() == 'tts': try: from paddlespeech.cli.tts.infer import pretrained_models logger.info( @@ -181,7 +183,7 @@ class ServerStatsExecutor(): ) return False - elif self.task == 'cls': + elif self.task.lower() == 'cls': try: from paddlespeech.cli.cls.infer import 
pretrained_models logger.info( @@ -202,3 +204,36 @@ class ServerStatsExecutor(): "Failed to get the table of CLS pretrained models supported in the service." ) return False + elif self.task.lower() == 'text': + try: + from paddlespeech.cli.text.infer import pretrained_models + logger.info( + "Here is the table of Text pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + return True + except BaseException: + logger.error( + "Failed to get the table of Text pretrained models supported in the service." + ) + return False + elif self.task.lower() == 'vector': + try: + from paddlespeech.cli.vector.infer import pretrained_models + logger.info( + "Here is the table of Vector pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + return True + except BaseException: + logger.error( + "Failed to get the table of Vector pretrained models supported in the service." + ) + return False + else: + logger.error( + f"Failed to get the table of {self.task} pretrained models supported in the service." 
+ ) + return False From 7be6b0e8cf46315ba09c7ca7d49247ab4862d9b6 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 7 May 2022 07:08:42 +0000 Subject: [PATCH 37/93] unify name style & frame with abs timestamp --- paddlespeech/server/bin/main.py | 77 --------------- paddlespeech/server/restful/api.py | 10 +- paddlespeech/server/utils/audio_handler.py | 10 +- paddlespeech/server/utils/audio_process.py | 2 +- paddlespeech/server/utils/buffer.py | 21 ++-- paddlespeech/server/ws/api.py | 4 +- paddlespeech/server/ws/asr_socket.py | 110 --------------------- paddlespeech/server/ws/tts_socket.py | 61 ------------ 8 files changed, 31 insertions(+), 264 deletions(-) delete mode 100644 paddlespeech/server/bin/main.py delete mode 100644 paddlespeech/server/ws/asr_socket.py delete mode 100644 paddlespeech/server/ws/tts_socket.py diff --git a/paddlespeech/server/bin/main.py b/paddlespeech/server/bin/main.py deleted file mode 100644 index 81824c85..00000000 --- a/paddlespeech/server/bin/main.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse - -import uvicorn -from fastapi import FastAPI - -from paddlespeech.server.engine.engine_pool import init_engine_pool -from paddlespeech.server.restful.api import setup_router as setup_http_router -from paddlespeech.server.utils.config import get_config -from paddlespeech.server.ws.api import setup_router as setup_ws_router - -app = FastAPI( - title="PaddleSpeech Serving API", description="Api", version="0.0.1") - - -def init(config): - """system initialization - - Args: - config (CfgNode): config object - - Returns: - bool: - """ - # init api - api_list = list(engine.split("_")[0] for engine in config.engine_list) - if config.protocol == "websocket": - api_router = setup_ws_router(api_list) - elif config.protocol == "http": - api_router = setup_http_router(api_list) - else: - raise Exception("unsupported protocol") - app.include_router(api_router) - - if not init_engine_pool(config): - return False - - return True - - -def main(args): - """main function""" - - config = get_config(args.config_file) - - if init(config): - uvicorn.run(app, host=config.host, port=config.port, debug=True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--config_file", - action="store", - help="yaml file of the app", - default="./conf/application.yaml") - - parser.add_argument( - "--log_file", - action="store", - help="log file", - default="./log/paddlespeech.log") - args = parser.parse_args() - - main(args) diff --git a/paddlespeech/server/restful/api.py b/paddlespeech/server/restful/api.py index f1e4ffc8..63f865e8 100644 --- a/paddlespeech/server/restful/api.py +++ b/paddlespeech/server/restful/api.py @@ -29,19 +29,19 @@ def setup_router(api_list: List): """setup router for fastapi Args: - api_list (List): [asr, tts, cls] + api_list (List): [asr, tts, cls, text, vecotr] Returns: APIRouter """ for api_name in api_list: - if api_name == 'asr': + if api_name.lower() == 'asr': _router.include_router(asr_router) - elif 
api_name == 'tts': + elif api_name.lower() == 'tts': _router.include_router(tts_router) - elif api_name == 'cls': + elif api_name.lower() == 'cls': _router.include_router(cls_router) - elif api_name == 'text': + elif api_name.lower() == 'text': _router.include_router(text_router) elif api_name.lower() == 'vector': _router.include_router(vec_router) diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index 75f4a10b..2bce28e3 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -43,6 +43,7 @@ class TextHttpHandler: else: self.url = 'http://' + self.server_ip + ":" + str( self.port) + '/paddlespeech/text' + logger.info(f"endpoint: {self.url}") def run(self, text): """Call the text server to process the specific text @@ -107,8 +108,10 @@ class ASRWsAudioHandler: """ samples, sample_rate = soundfile.read(wavfile_path, dtype='int16') x_len = len(samples) + assert sample_rate == 16000 + + chunk_size = int(85 * sample_rate / 1000) # 85ms, sample_rate = 16kHz - chunk_size = 85 * 16 #80ms, sample_rate = 16kHz if x_len % chunk_size != 0: padding_len_x = chunk_size - x_len % chunk_size else: @@ -217,6 +220,7 @@ class ASRHttpHandler: else: self.url = 'http://' + self.server_ip + ":" + str( self.port) + '/paddlespeech/asr' + logger.info(f"endpoint: {self.url}") def run(self, input, audio_format, sample_rate, lang): """Call the http asr to process the audio @@ -275,6 +279,7 @@ class TTSWsHandler: self.start_play = True self.t = threading.Thread(target=self.play_audio) self.max_fail = 50 + logger.info(f"endpoint: {self.url}") def play_audio(self): while True: @@ -383,6 +388,7 @@ class TTSHttpHandler: self.start_play = True self.t = threading.Thread(target=self.play_audio) self.max_fail = 50 + logger.info(f"endpoint: {self.url}") def play_audio(self): while True: @@ -483,6 +489,7 @@ class VectorHttpHandler: else: self.url = 'http://' + self.server_ip + ":" + str( self.port) + 
'/paddlespeech/vector' + logger.info(f"endpoint: {self.url}") def run(self, input, audio_format, sample_rate, task="spk"): """Call the http asr to process the audio @@ -529,6 +536,7 @@ class VectorScoreHttpHandler: else: self.url = 'http://' + self.server_ip + ":" + str( self.port) + '/paddlespeech/vector/score' + logger.info(f"endpoint: {self.url}") def run(self, enroll_audio, test_audio, audio_format, sample_rate): """Call the http asr to process the audio diff --git a/paddlespeech/server/utils/audio_process.py b/paddlespeech/server/utils/audio_process.py index 6fb5bb83..bb02d664 100644 --- a/paddlespeech/server/utils/audio_process.py +++ b/paddlespeech/server/utils/audio_process.py @@ -107,7 +107,7 @@ def change_speed(sample_raw, speed_rate, sample_rate): def float2pcm(sig, dtype='int16'): - """Convert floating point signal with a range from -1 to 1 to PCM. + """Convert floating point signal with a range from -1 to 1 to PCM16. Args: sig (array): Input array, must have floating point type. diff --git a/paddlespeech/server/utils/buffer.py b/paddlespeech/server/utils/buffer.py index d4e6cd49..f56db752 100644 --- a/paddlespeech/server/utils/buffer.py +++ b/paddlespeech/server/utils/buffer.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- class Frame(object): """Represents a "frame" of audio data.""" @@ -46,8 +45,7 @@ class ChunkBuffer(object): self.shift_ms = shift_ms self.sample_rate = sample_rate self.sample_width = sample_width # int16 = 2; float32 = 4 - self.remained_audio = b'' - + self.window_sec = float((self.window_n - 1) * self.shift_ms + self.window_ms) / 1000.0 self.shift_sec = float(self.shift_n * self.shift_ms / 1000.0) @@ -57,22 +55,31 @@ class ChunkBuffer(object): self.shift_bytes = int(self.shift_sec * self.sample_rate * self.sample_width) + self.remained_audio = b'' + # abs timestamp from `start` or latest `reset` + self.timestamp = 0.0 + + def reset(self): + """ + reset buffer state. + """ + self.timestamp = 0.0 + self.remained_audio = b'' + def frame_generator(self, audio): """Generates audio frames from PCM audio data. Takes the desired frame duration in milliseconds, the PCM data, and the sample rate. Yields Frames of the requested duration. """ - audio = self.remained_audio + audio self.remained_audio = b'' offset = 0 - timestamp = 0.0 while offset + self.window_bytes <= len(audio): - yield Frame(audio[offset:offset + self.window_bytes], timestamp, + yield Frame(audio[offset:offset + self.window_bytes], self.timestamp, self.window_sec) - timestamp += self.shift_sec + self.timestamp += self.shift_sec offset += self.shift_bytes self.remained_audio += audio[offset:] diff --git a/paddlespeech/server/ws/api.py b/paddlespeech/server/ws/api.py index 313fd16f..83d542a1 100644 --- a/paddlespeech/server/ws/api.py +++ b/paddlespeech/server/ws/api.py @@ -15,8 +15,8 @@ from typing import List from fastapi import APIRouter -from paddlespeech.server.ws.asr_socket import router as asr_router -from paddlespeech.server.ws.tts_socket import router as tts_router +from paddlespeech.server.ws.asr_api import router as asr_router +from paddlespeech.server.ws.tts_api import router as tts_router _router = APIRouter() diff --git a/paddlespeech/server/ws/asr_socket.py 
b/paddlespeech/server/ws/asr_socket.py deleted file mode 100644 index 0f7dcddd..00000000 --- a/paddlespeech/server/ws/asr_socket.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import json - -from fastapi import APIRouter -from fastapi import WebSocket -from fastapi import WebSocketDisconnect -from starlette.websockets import WebSocketState as WebSocketState - -from paddlespeech.server.engine.asr.online.asr_engine import PaddleASRConnectionHanddler -from paddlespeech.server.engine.engine_pool import get_engine_pool - -router = APIRouter() - - -@router.websocket('/paddlespeech/asr/streaming') -async def websocket_endpoint(websocket: WebSocket): - """PaddleSpeech Online ASR Server api - - Args: - websocket (WebSocket): the websocket instance - """ - - #1. the interface wait to accept the websocket protocal header - # and only we receive the header, it establish the connection with specific thread - await websocket.accept() - - #2. if we accept the websocket headers, we will get the online asr engine instance - engine_pool = get_engine_pool() - asr_engine = engine_pool['asr'] - - #3. 
each websocket connection, we will create an PaddleASRConnectionHanddler to process such audio - # and each connection has its own connection instance to process the request - # and only if client send the start signal, we create the PaddleASRConnectionHanddler instance - connection_handler = None - - try: - #4. we do a loop to process the audio package by package according the protocal - # and only if the client send finished signal, we will break the loop - while True: - # careful here, changed the source code from starlette.websockets - # 4.1 we wait for the client signal for the specific action - assert websocket.application_state == WebSocketState.CONNECTED - message = await websocket.receive() - websocket._raise_on_disconnect(message) - - #4.2 text for the action command and bytes for pcm data - if "text" in message: - # we first parse the specific command - message = json.loads(message["text"]) - if 'signal' not in message: - resp = {"status": "ok", "message": "no valid json data"} - await websocket.send_json(resp) - - # start command, we create the PaddleASRConnectionHanddler instance to process the audio data - # end command, we process the all the last audio pcm and return the final result - # and we break the loop - if message['signal'] == 'start': - resp = {"status": "ok", "signal": "server_ready"} - # do something at begining here - # create the instance to process the audio - connection_handler = PaddleASRConnectionHanddler(asr_engine) - await websocket.send_json(resp) - elif message['signal'] == 'end': - # reset single engine for an new connection - # and we will destroy the connection - connection_handler.decode(is_finished=True) - connection_handler.rescoring() - asr_results = connection_handler.get_result() - word_time_stamp = connection_handler.get_word_time_stamp() - connection_handler.reset() - - resp = { - "status": "ok", - "signal": "finished", - 'result': asr_results, - 'times': word_time_stamp - } - await websocket.send_json(resp) - break - 
else: - resp = {"status": "ok", "message": "no valid json data"} - await websocket.send_json(resp) - elif "bytes" in message: - # bytes for the pcm data - message = message["bytes"] - - # we extract the remained audio pcm - # and decode for the result in this package data - connection_handler.extract_feat(message) - connection_handler.decode(is_finished=False) - asr_results = connection_handler.get_result() - - # return the current period result - # if the engine create the vad instance, this connection will have many period results - resp = {'result': asr_results} - await websocket.send_json(resp) - except WebSocketDisconnect: - pass diff --git a/paddlespeech/server/ws/tts_socket.py b/paddlespeech/server/ws/tts_socket.py deleted file mode 100644 index 482aeb79..00000000 --- a/paddlespeech/server/ws/tts_socket.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import json - -from fastapi import APIRouter -from fastapi import WebSocket -from fastapi import WebSocketDisconnect -from starlette.websockets import WebSocketState as WebSocketState - -from paddlespeech.cli.log import logger -from paddlespeech.server.engine.engine_pool import get_engine_pool - -router = APIRouter() - - -@router.websocket('/paddlespeech/tts/streaming') -async def websocket_endpoint(websocket: WebSocket): - await websocket.accept() - - try: - # careful here, changed the source code from starlette.websockets - assert websocket.application_state == WebSocketState.CONNECTED - message = await websocket.receive() - websocket._raise_on_disconnect(message) - - # get engine - engine_pool = get_engine_pool() - tts_engine = engine_pool['tts'] - - # 获取 message 并转文本 - message = json.loads(message["text"]) - text_bese64 = message["text"] - sentence = tts_engine.preprocess(text_bese64=text_bese64) - - # run - wav_generator = tts_engine.run(sentence) - - while True: - try: - tts_results = next(wav_generator) - resp = {"status": 1, "audio": tts_results} - await websocket.send_json(resp) - except StopIteration as e: - resp = {"status": 2, "audio": ''} - await websocket.send_json(resp) - logger.info("Complete the transmission of audio streams") - break - - except WebSocketDisconnect: - pass From 175c67b75ea34274e0f1162a7fc6e0e0139cde22 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 7 May 2022 07:11:22 +0000 Subject: [PATCH 38/93] asr socket to asr api --- paddlespeech/server/ws/asr_api.py | 110 ++++++++++++++++++++++++++++++ paddlespeech/server/ws/tts_api.py | 61 +++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 paddlespeech/server/ws/asr_api.py create mode 100644 paddlespeech/server/ws/tts_api.py diff --git a/paddlespeech/server/ws/asr_api.py b/paddlespeech/server/ws/asr_api.py new file mode 100644 index 00000000..0f7dcddd --- /dev/null +++ b/paddlespeech/server/ws/asr_api.py @@ -0,0 +1,110 @@ +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json + +from fastapi import APIRouter +from fastapi import WebSocket +from fastapi import WebSocketDisconnect +from starlette.websockets import WebSocketState as WebSocketState + +from paddlespeech.server.engine.asr.online.asr_engine import PaddleASRConnectionHanddler +from paddlespeech.server.engine.engine_pool import get_engine_pool + +router = APIRouter() + + +@router.websocket('/paddlespeech/asr/streaming') +async def websocket_endpoint(websocket: WebSocket): + """PaddleSpeech Online ASR Server api + + Args: + websocket (WebSocket): the websocket instance + """ + + #1. the interface wait to accept the websocket protocal header + # and only we receive the header, it establish the connection with specific thread + await websocket.accept() + + #2. if we accept the websocket headers, we will get the online asr engine instance + engine_pool = get_engine_pool() + asr_engine = engine_pool['asr'] + + #3. each websocket connection, we will create an PaddleASRConnectionHanddler to process such audio + # and each connection has its own connection instance to process the request + # and only if client send the start signal, we create the PaddleASRConnectionHanddler instance + connection_handler = None + + try: + #4. 
we do a loop to process the audio package by package according the protocal + # and only if the client send finished signal, we will break the loop + while True: + # careful here, changed the source code from starlette.websockets + # 4.1 we wait for the client signal for the specific action + assert websocket.application_state == WebSocketState.CONNECTED + message = await websocket.receive() + websocket._raise_on_disconnect(message) + + #4.2 text for the action command and bytes for pcm data + if "text" in message: + # we first parse the specific command + message = json.loads(message["text"]) + if 'signal' not in message: + resp = {"status": "ok", "message": "no valid json data"} + await websocket.send_json(resp) + + # start command, we create the PaddleASRConnectionHanddler instance to process the audio data + # end command, we process the all the last audio pcm and return the final result + # and we break the loop + if message['signal'] == 'start': + resp = {"status": "ok", "signal": "server_ready"} + # do something at begining here + # create the instance to process the audio + connection_handler = PaddleASRConnectionHanddler(asr_engine) + await websocket.send_json(resp) + elif message['signal'] == 'end': + # reset single engine for an new connection + # and we will destroy the connection + connection_handler.decode(is_finished=True) + connection_handler.rescoring() + asr_results = connection_handler.get_result() + word_time_stamp = connection_handler.get_word_time_stamp() + connection_handler.reset() + + resp = { + "status": "ok", + "signal": "finished", + 'result': asr_results, + 'times': word_time_stamp + } + await websocket.send_json(resp) + break + else: + resp = {"status": "ok", "message": "no valid json data"} + await websocket.send_json(resp) + elif "bytes" in message: + # bytes for the pcm data + message = message["bytes"] + + # we extract the remained audio pcm + # and decode for the result in this package data + 
connection_handler.extract_feat(message) + connection_handler.decode(is_finished=False) + asr_results = connection_handler.get_result() + + # return the current period result + # if the engine create the vad instance, this connection will have many period results + resp = {'result': asr_results} + await websocket.send_json(resp) + except WebSocketDisconnect: + pass diff --git a/paddlespeech/server/ws/tts_api.py b/paddlespeech/server/ws/tts_api.py new file mode 100644 index 00000000..699ee412 --- /dev/null +++ b/paddlespeech/server/ws/tts_api.py @@ -0,0 +1,61 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import json + +from fastapi import APIRouter +from fastapi import WebSocket +from fastapi import WebSocketDisconnect +from starlette.websockets import WebSocketState as WebSocketState + +from paddlespeech.cli.log import logger +from paddlespeech.server.engine.engine_pool import get_engine_pool + +router = APIRouter() + + +@router.websocket('/ws/tts') +async def websocket_endpoint(websocket: WebSocket): + await websocket.accept() + + try: + # careful here, changed the source code from starlette.websockets + assert websocket.application_state == WebSocketState.CONNECTED + message = await websocket.receive() + websocket._raise_on_disconnect(message) + + # get engine + engine_pool = get_engine_pool() + tts_engine = engine_pool['tts'] + + # 获取 message 并转文本 + message = json.loads(message["text"]) + text_bese64 = message["text"] + sentence = tts_engine.preprocess(text_bese64=text_bese64) + + # run + wav_generator = tts_engine.run(sentence) + + while True: + try: + tts_results = next(wav_generator) + resp = {"status": 1, "audio": tts_results} + await websocket.send_json(resp) + except StopIteration as e: + resp = {"status": 2, "audio": ''} + await websocket.send_json(resp) + logger.info("Complete the transmission of audio streams") + break + + except WebSocketDisconnect: + pass From 12ae137c83f5b3d804dffd6c40ac9d5e128bd883 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 7 May 2022 07:12:57 +0000 Subject: [PATCH 39/93] update tts_api for ws --- paddlespeech/server/ws/tts_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/server/ws/tts_api.py b/paddlespeech/server/ws/tts_api.py index 699ee412..20a63d4c 100644 --- a/paddlespeech/server/ws/tts_api.py +++ b/paddlespeech/server/ws/tts_api.py @@ -24,7 +24,7 @@ from paddlespeech.server.engine.engine_pool import get_engine_pool router = APIRouter() -@router.websocket('/ws/tts') +@router.websocket('/paddlespeech/tts/streaming') async def websocket_endpoint(websocket: WebSocket): await 
websocket.accept() @@ -58,4 +58,4 @@ async def websocket_endpoint(websocket: WebSocket): break except WebSocketDisconnect: - pass + pass \ No newline at end of file From 9947380898cb7f1aa043d3cdf8015892a09aa9c9 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 7 May 2022 07:56:46 +0000 Subject: [PATCH 40/93] fix the doc, test=doc --- docs/source/released_model.md | 2 +- examples/wenetspeech/asr0/RESULTS.md | 4 ++-- examples/wenetspeech/asr1/RESULTS.md | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 435985bc..74435ae1 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -9,7 +9,7 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | [Ds2 Online Wenetspeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz) | Wenetspeech Dataset | Char-based | 1.2 GB | 2 Conv + 5 LSTM layers | 0.152 (test\_net, w/o LM)
0.2417 (test\_meeting, w/o LM)
0.053 (aishell, w/ LM) |-| 10000 h |- [Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) -[Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 18.79 (test\_meeting) |-| 10000 h |- +[Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 0.1879 (test\_meeting) |-| 10000 h |- [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0464 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) [Transformer Aishell ASR1 
Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) diff --git a/examples/wenetspeech/asr0/RESULTS.md b/examples/wenetspeech/asr0/RESULTS.md index 0566a352..0796b7bc 100644 --- a/examples/wenetspeech/asr0/RESULTS.md +++ b/examples/wenetspeech/asr0/RESULTS.md @@ -4,5 +4,5 @@ | Model | Number of Params | Release | Config | Test set | Valid Loss | CER | | --- | --- | --- | --- | --- | --- | --- | -| DeepSpeech2 | 1.2G | r1.0.0a | conf/deepspeech2\_online.yaml + spec aug + fbank161 | test\_net | 13.307 | 15.02 | -| DeepSpeech2 | 1.2G | r1.0.0a | conf/deepspeech2\_online.yaml + spec aug + fbank161 | test\_meeting | 13.307 | 24.17 | +| DeepSpeech2 | 1.2G | r1.0.0a | conf/deepspeech2\_online.yaml + spec aug + fbank161, w/o LM | test\_net | 13.307 | 15.02 | +| DeepSpeech2 | 1.2G | r1.0.0a | conf/deepspeech2\_online.yaml + spec aug + fbank161, w/o LM | test\_meeting | 13.307 | 24.17 | diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md index 72b815b7..2f7ad659 100644 --- a/examples/wenetspeech/asr1/RESULTS.md +++ b/examples/wenetspeech/asr1/RESULTS.md @@ -2,7 +2,7 @@ ## Conformer online -| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | +| Model | Params | Config | Augmentation| Test set | Decode method | Valid Loss | CER | | --- | --- | --- | --- | --- | --- | --- | --- | | conformer_online | 123.47 M | conf/chunk_conformer.yaml | spec_aug | test net | attention | 9.329 | 0.1102 | | conformer_online | 123.47 M | conf/chunk_conformer.yaml | spec_aug | test net | ctc_greedy_search | 9.329 | 0.1207 | From da08f1c1afe856c8ac53d36f73cfb8c729526088 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Sat, 7 May 2022 16:40:32 +0800 Subject: [PATCH 41/93] 
Add RFT for asr task. --- paddlespeech/cli/asr/infer.py | 29 +++++++++++++++++++++++++++-- paddlespeech/cli/executor.py | 16 ++++++++++++++++ paddlespeech/cli/utils.py | 10 +++++++++- 3 files changed, 52 insertions(+), 3 deletions(-) diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 0fb54868..9218cfa5 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -14,6 +14,7 @@ import argparse import os import sys +import time from collections import OrderedDict from typing import List from typing import Optional @@ -29,8 +30,10 @@ from ..download import get_path_from_url from ..executor import BaseExecutor from ..log import logger from ..utils import cli_register +from ..utils import CLI_TIMER from ..utils import MODEL_HOME from ..utils import stats_wrapper +from ..utils import timer_register from .pretrained_models import model_alias from .pretrained_models import pretrained_models from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer @@ -41,6 +44,7 @@ from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ['ASRExecutor'] +@timer_register @cli_register( name='paddlespeech.asr', description='Speech to text infer command.') class ASRExecutor(BaseExecutor): @@ -99,6 +103,11 @@ class ASRExecutor(BaseExecutor): default=False, help='No additional parameters required. 
Once set this parameter, it means accepting the request of the program by default, which includes transforming the audio sample rate' ) + self.parser.add_argument( + '--rtf', + action="store_true", + default=False, + help='Show Real-time Factor(RTF).') self.parser.add_argument( '--device', type=str, @@ -407,6 +416,7 @@ class ASRExecutor(BaseExecutor): ckpt_path = parser_args.ckpt_path decode_method = parser_args.decode_method force_yes = parser_args.yes + rtf = parser_args.rtf device = parser_args.device if not parser_args.verbose: @@ -419,12 +429,15 @@ class ASRExecutor(BaseExecutor): for id_, input_ in task_source.items(): try: res = self(input_, model, lang, sample_rate, config, ckpt_path, - decode_method, force_yes, device) + decode_method, force_yes, rtf, device) task_results[id_] = res except Exception as e: has_exceptions = True task_results[id_] = f'{e.__class__.__name__}: {e}' + if rtf: + self.show_rtf(CLI_TIMER[self.__class__.__name__]) + self.process_task_results(parser_args.input, task_results, parser_args.job_dump_result) @@ -443,6 +456,7 @@ class ASRExecutor(BaseExecutor): ckpt_path: os.PathLike=None, decode_method: str='attention_rescoring', force_yes: bool=False, + rtf: bool=False, device=paddle.get_device()): """ Python API to call an executor. @@ -454,7 +468,18 @@ class ASRExecutor(BaseExecutor): self._init_from_path(model, lang, sample_rate, config, decode_method, ckpt_path) self.preprocess(model, audio_file) - self.infer(model) + + if rtf: + k = self.__class__.__name__ + CLI_TIMER[k]['start'].append(time.time()) + self.infer(model) + CLI_TIMER[k]['end'].append(time.time()) + audio, audio_sample_rate = soundfile.read( + audio_file, dtype="int16", always_2d=True) + CLI_TIMER[k]['extra'].append(audio.shape[0] / audio_sample_rate) + else: + self.infer(model) + res = self.postprocess() # Retrieve result of asr. 
return res diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py index df0b6783..4a631c7f 100644 --- a/paddlespeech/cli/executor.py +++ b/paddlespeech/cli/executor.py @@ -235,3 +235,19 @@ class BaseExecutor(ABC): 'Use pretrained model stored in: {}'.format(decompressed_path)) return decompressed_path + + def show_rtf(self, info: Dict[str, List[float]]): + """ + Calculate rft of current task and show results. + """ + num_samples = 0 + task_duration = 0.0 + wav_duration = 0.0 + + for start, end, dur in zip(info['start'], info['end'], info['extra']): + num_samples += 1 + task_duration += end - start + wav_duration += dur + + logger.info('Sample Count: {}'.format(num_samples)) + logger.info('Avg RTF: {}'.format(task_duration / wav_duration)) diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py index 8e094894..82d40c8b 100644 --- a/paddlespeech/cli/utils.py +++ b/paddlespeech/cli/utils.py @@ -24,11 +24,11 @@ from typing import Any from typing import Dict import paddle -import paddleaudio import requests import yaml from paddle.framework import load +import paddleaudio from . import download from .entry import commands try: @@ -39,6 +39,7 @@ except ImportError: requests.adapters.DEFAULT_RETRIES = 3 __all__ = [ + 'timer_register', 'cli_register', 'get_command', 'download_and_decompress', @@ -46,6 +47,13 @@ __all__ = [ 'stats_wrapper', ] +CLI_TIMER = {} + + +def timer_register(command): + CLI_TIMER[command.__name__] = {'start': [], 'end': [], 'extra': []} + return command + def cli_register(name: str, description: str='') -> Any: def _warpper(command): From 19d015b60ae82cfd1ed842f00074db7e2944547c Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Sat, 7 May 2022 17:00:13 +0800 Subject: [PATCH 42/93] Add RFT for asr task. 
--- paddlespeech/cli/asr/infer.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 9218cfa5..23029cfb 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -467,19 +467,18 @@ class ASRExecutor(BaseExecutor): paddle.set_device(device) self._init_from_path(model, lang, sample_rate, config, decode_method, ckpt_path) - self.preprocess(model, audio_file) - if rtf: k = self.__class__.__name__ CLI_TIMER[k]['start'].append(time.time()) - self.infer(model) + + self.preprocess(model, audio_file) + self.infer(model) + res = self.postprocess() # Retrieve result of asr. + + if rtf: CLI_TIMER[k]['end'].append(time.time()) audio, audio_sample_rate = soundfile.read( audio_file, dtype="int16", always_2d=True) CLI_TIMER[k]['extra'].append(audio.shape[0] / audio_sample_rate) - else: - self.infer(model) - - res = self.postprocess() # Retrieve result of asr. return res From 864041085fac428c65ac310c776197b31eef1649 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 9 May 2022 07:56:24 +0000 Subject: [PATCH 43/93] replace dist.spawn with dist.launch, test=asr --- examples/aishell/asr0/local/train.sh | 3 +-- examples/aishell/asr1/local/train.sh | 3 +-- examples/callcenter/asr1/local/train.sh | 3 +-- examples/librispeech/asr0/local/train.sh | 3 +-- examples/librispeech/asr1/local/train.sh | 3 +-- examples/librispeech/asr2/local/train.sh | 3 +-- examples/timit/asr1/local/train.sh | 3 +-- examples/tiny/asr0/local/train.sh | 3 +-- examples/tiny/asr1/local/train.sh | 3 +-- paddlespeech/s2t/exps/deepspeech2/bin/train.py | 5 +---- paddlespeech/s2t/exps/u2/bin/train.py | 5 +---- paddlespeech/s2t/exps/u2_kaldi/bin/train.py | 5 +---- paddlespeech/s2t/exps/u2_st/bin/train.py | 5 +---- 13 files changed, 13 insertions(+), 34 deletions(-) diff --git a/examples/aishell/asr0/local/train.sh b/examples/aishell/asr0/local/train.sh index 54c642b6..6cf6855d 100755 --- 
a/examples/aishell/asr0/local/train.sh +++ b/examples/aishell/asr0/local/train.sh @@ -20,8 +20,7 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi -python3 -u ${BIN_DIR}/train.py \ ---ngpu ${ngpu} \ +python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ diff --git a/examples/aishell/asr1/local/train.sh b/examples/aishell/asr1/local/train.sh index 1c8593bd..643a7b68 100755 --- a/examples/aishell/asr1/local/train.sh +++ b/examples/aishell/asr1/local/train.sh @@ -27,9 +27,8 @@ ckpt_name=$2 mkdir -p exp -python3 -u ${BIN_DIR}/train.py \ +python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ --seed ${seed} \ ---ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --profiler-options "${profiler_options}" \ diff --git a/examples/callcenter/asr1/local/train.sh b/examples/callcenter/asr1/local/train.sh index 3e92fd16..e919e0d2 100755 --- a/examples/callcenter/asr1/local/train.sh +++ b/examples/callcenter/asr1/local/train.sh @@ -21,8 +21,7 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi -python3 -u ${BIN_DIR}/train.py \ ---ngpu ${ngpu} \ +python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} diff --git a/examples/librispeech/asr0/local/train.sh b/examples/librispeech/asr0/local/train.sh index 0479398f..55130fc1 100755 --- a/examples/librispeech/asr0/local/train.sh +++ b/examples/librispeech/asr0/local/train.sh @@ -20,8 +20,7 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi -python3 -u ${BIN_DIR}/train.py \ ---ngpu ${ngpu} \ +python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ diff --git 
a/examples/librispeech/asr1/local/train.sh b/examples/librispeech/asr1/local/train.sh index 275d3a49..b3a4160e 100755 --- a/examples/librispeech/asr1/local/train.sh +++ b/examples/librispeech/asr1/local/train.sh @@ -22,8 +22,7 @@ fi # export FLAGS_cudnn_exhaustive_search=true # export FLAGS_conv_workspace_size_limit=4000 -python3 -u ${BIN_DIR}/train.py \ ---ngpu ${ngpu} \ +python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} diff --git a/examples/librispeech/asr2/local/train.sh b/examples/librispeech/asr2/local/train.sh index 898391f4..06c7a0e2 100755 --- a/examples/librispeech/asr2/local/train.sh +++ b/examples/librispeech/asr2/local/train.sh @@ -19,9 +19,8 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi -python3 -u ${BIN_DIR}/train.py \ +python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ --model-name u2_kaldi \ ---ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} diff --git a/examples/timit/asr1/local/train.sh b/examples/timit/asr1/local/train.sh index 9b3fa177..c4648c3e 100755 --- a/examples/timit/asr1/local/train.sh +++ b/examples/timit/asr1/local/train.sh @@ -19,8 +19,7 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi -python3 -u ${BIN_DIR}/train.py \ ---ngpu ${ngpu} \ +python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} diff --git a/examples/tiny/asr0/local/train.sh b/examples/tiny/asr0/local/train.sh index a69b6ddb..33d8a238 100755 --- a/examples/tiny/asr0/local/train.sh +++ b/examples/tiny/asr0/local/train.sh @@ -26,8 +26,7 @@ model_type=$3 mkdir -p exp -python3 -u ${BIN_DIR}/train.py \ ---ngpu ${ngpu} \ +python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ --config ${config_path} \ --output 
exp/${ckpt_name} \ --model_type ${model_type} \ diff --git a/examples/tiny/asr1/local/train.sh b/examples/tiny/asr1/local/train.sh index 1c8593bd..643a7b68 100755 --- a/examples/tiny/asr1/local/train.sh +++ b/examples/tiny/asr1/local/train.sh @@ -27,9 +27,8 @@ ckpt_name=$2 mkdir -p exp -python3 -u ${BIN_DIR}/train.py \ +python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ --seed ${seed} \ ---ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --profiler-options "${profiler_options}" \ diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/train.py b/paddlespeech/s2t/exps/deepspeech2/bin/train.py index 09e8662f..9c7c61a5 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/train.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/train.py @@ -27,10 +27,7 @@ def main_sp(config, args): def main(config, args): - if args.ngpu > 1: - dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) - else: - main_sp(config, args) + main_sp(config, args) if __name__ == "__main__": diff --git a/paddlespeech/s2t/exps/u2/bin/train.py b/paddlespeech/s2t/exps/u2/bin/train.py index 53c22328..cfd7be81 100644 --- a/paddlespeech/s2t/exps/u2/bin/train.py +++ b/paddlespeech/s2t/exps/u2/bin/train.py @@ -32,10 +32,7 @@ def main_sp(config, args): def main(config, args): - if args.ngpu > 1: - dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) - else: - main_sp(config, args) + main_sp(config, args) if __name__ == "__main__": diff --git a/paddlespeech/s2t/exps/u2_kaldi/bin/train.py b/paddlespeech/s2t/exps/u2_kaldi/bin/train.py index fcfc05a8..f802e53d 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/bin/train.py +++ b/paddlespeech/s2t/exps/u2_kaldi/bin/train.py @@ -36,10 +36,7 @@ def main_sp(config, args): def main(config, args): - if args.ngpu > 1: - dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) - else: - main_sp(config, args) + main_sp(config, args) if __name__ == "__main__": diff --git a/paddlespeech/s2t/exps/u2_st/bin/train.py 
b/paddlespeech/s2t/exps/u2_st/bin/train.py index 4dec9ec8..8d4bfca8 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/train.py +++ b/paddlespeech/s2t/exps/u2_st/bin/train.py @@ -30,10 +30,7 @@ def main_sp(config, args): def main(config, args): - if args.ngpu > 1: - dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) - else: - main_sp(config, args) + main_sp(config, args) if __name__ == "__main__": From 9f389a7a33b4b1fb54976d1ba7ca9bed3b4190f1 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 9 May 2022 09:07:59 +0000 Subject: [PATCH 44/93] support cpu, test=asr --- demos/audio_searching/src/operations/load.py | 5 +++-- examples/aishell/asr0/local/train.sh | 9 +++++++++ examples/aishell/asr0/run.sh | 2 +- examples/aishell/asr1/local/train.sh | 11 +++++++++++ examples/callcenter/asr1/local/train.sh | 8 ++++++++ examples/librispeech/asr0/local/train.sh | 9 +++++++++ examples/librispeech/asr1/local/train.sh | 8 ++++++++ examples/librispeech/asr2/local/train.sh | 9 +++++++++ examples/tiny/asr0/local/train.sh | 10 ++++++++++ examples/tiny/asr1/local/train.sh | 11 +++++++++++ paddlespeech/s2t/exps/deepspeech2/bin/train.py | 1 - paddlespeech/s2t/exps/u2/bin/train.py | 1 - paddlespeech/s2t/exps/u2_kaldi/bin/train.py | 1 - paddlespeech/s2t/exps/u2_st/bin/train.py | 1 - paddlespeech/s2t/io/sampler.py | 2 +- paddlespeech/t2s/modules/transformer/repeat.py | 2 +- 16 files changed, 81 insertions(+), 9 deletions(-) diff --git a/demos/audio_searching/src/operations/load.py b/demos/audio_searching/src/operations/load.py index 0d9edb78..d1ea0057 100644 --- a/demos/audio_searching/src/operations/load.py +++ b/demos/audio_searching/src/operations/load.py @@ -26,8 +26,9 @@ def get_audios(path): """ supported_formats = [".wav", ".mp3", ".ogg", ".flac", ".m4a"] return [ - item for sublist in [[os.path.join(dir, file) for file in files] - for dir, _, files in list(os.walk(path))] + item + for sublist in [[os.path.join(dir, file) for file in files] + for dir, _, files in list(os.walk(path))] 
for item in sublist if os.path.splitext(item)[1] in supported_formats ] diff --git a/examples/aishell/asr0/local/train.sh b/examples/aishell/asr0/local/train.sh index 6cf6855d..7c0ad075 100755 --- a/examples/aishell/asr0/local/train.sh +++ b/examples/aishell/asr0/local/train.sh @@ -20,11 +20,20 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi +if [ ${ngpu} == 0 ]; then +python3 -u ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--config ${config_path} \ +--output exp/${ckpt_name} \ +--model_type ${model_type} \ +--seed ${seed} +else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ --seed ${seed} +fi if [ ${seed} != 0 ]; then unset FLAGS_cudnn_deterministic diff --git a/examples/aishell/asr0/run.sh b/examples/aishell/asr0/run.sh index 114af5a9..0542e361 100755 --- a/examples/aishell/asr0/run.sh +++ b/examples/aishell/asr0/run.sh @@ -2,7 +2,7 @@ set -e source path.sh -gpus=0,1,2,3 +gpus=1,2,3 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeech2_online.yaml diff --git a/examples/aishell/asr1/local/train.sh b/examples/aishell/asr1/local/train.sh index 643a7b68..eb9f4f69 100755 --- a/examples/aishell/asr1/local/train.sh +++ b/examples/aishell/asr1/local/train.sh @@ -27,6 +27,16 @@ ckpt_name=$2 mkdir -p exp +if [ ${ngpu} == 0 ]; then +python3 -u ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--seed ${seed} \ +--config ${config_path} \ +--output exp/${ckpt_name} \ +--profiler-options "${profiler_options}" \ +--benchmark-batch-size ${benchmark_batch_size} \ +--benchmark-max-step ${benchmark_max_step} +else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ --seed ${seed} \ --config ${config_path} \ @@ -34,6 +44,7 @@ python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/t --profiler-options "${profiler_options}" \ --benchmark-batch-size 
${benchmark_batch_size} \ --benchmark-max-step ${benchmark_max_step} +fi if [ ${seed} != 0 ]; then diff --git a/examples/callcenter/asr1/local/train.sh b/examples/callcenter/asr1/local/train.sh index e919e0d2..3d7d35f1 100755 --- a/examples/callcenter/asr1/local/train.sh +++ b/examples/callcenter/asr1/local/train.sh @@ -21,10 +21,18 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi +if [ ${ngpu} == 0 ]; then +python3 -u ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--config ${config_path} \ +--output exp/${ckpt_name} \ +--seed ${seed} +else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} +fi if [ ${seed} != 0 ]; then unset FLAGS_cudnn_deterministic diff --git a/examples/librispeech/asr0/local/train.sh b/examples/librispeech/asr0/local/train.sh index 55130fc1..788f9bf8 100755 --- a/examples/librispeech/asr0/local/train.sh +++ b/examples/librispeech/asr0/local/train.sh @@ -20,11 +20,20 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi +if [ ${ngpu} == 0 ]; then +python3 -u ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--config ${config_path} \ +--output exp/${ckpt_name} \ +--model_type ${model_type} \ +--seed ${seed} +else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ --seed ${seed} +fi if [ ${seed} != 0 ]; then unset FLAGS_cudnn_deterministic diff --git a/examples/librispeech/asr1/local/train.sh b/examples/librispeech/asr1/local/train.sh index b3a4160e..b21c1017 100755 --- a/examples/librispeech/asr1/local/train.sh +++ b/examples/librispeech/asr1/local/train.sh @@ -22,10 +22,18 @@ fi # export FLAGS_cudnn_exhaustive_search=true # export FLAGS_conv_workspace_size_limit=4000 +if [ ${ngpu} == 0 ]; then +python3 -u ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--config ${config_path} \ +--output exp/${ckpt_name} 
\ +--seed ${seed} +else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} +fi if [ ${seed} != 0 ]; then unset FLAGS_cudnn_deterministic diff --git a/examples/librispeech/asr2/local/train.sh b/examples/librispeech/asr2/local/train.sh index 06c7a0e2..a3ad5fd7 100755 --- a/examples/librispeech/asr2/local/train.sh +++ b/examples/librispeech/asr2/local/train.sh @@ -19,11 +19,20 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi +if [ ${ngpu} == 0 ]; then +python3 -u ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--model-name u2_kaldi \ +--config ${config_path} \ +--output exp/${ckpt_name} \ +--seed ${seed} +else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ --model-name u2_kaldi \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} +fi if [ ${seed} != 0 ]; then unset FLAGS_cudnn_deterministic diff --git a/examples/tiny/asr0/local/train.sh b/examples/tiny/asr0/local/train.sh index 33d8a238..5fecf3de 100755 --- a/examples/tiny/asr0/local/train.sh +++ b/examples/tiny/asr0/local/train.sh @@ -26,12 +26,22 @@ model_type=$3 mkdir -p exp +if [ ${ngpu} == 0 ]; then +python3 -u ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--config ${config_path} \ +--output exp/${ckpt_name} \ +--model_type ${model_type} \ +--profiler-options "${profiler_options}" \ +--seed ${seed} +else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ --profiler-options "${profiler_options}" \ --seed ${seed} +fi if [ ${seed} != 0 ]; then unset FLAGS_cudnn_deterministic diff --git a/examples/tiny/asr1/local/train.sh b/examples/tiny/asr1/local/train.sh index 643a7b68..eb9f4f69 100755 --- a/examples/tiny/asr1/local/train.sh +++ b/examples/tiny/asr1/local/train.sh @@ -27,6 +27,16 @@ ckpt_name=$2 mkdir -p exp +if [ 
${ngpu} == 0 ]; then +python3 -u ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--seed ${seed} \ +--config ${config_path} \ +--output exp/${ckpt_name} \ +--profiler-options "${profiler_options}" \ +--benchmark-batch-size ${benchmark_batch_size} \ +--benchmark-max-step ${benchmark_max_step} +else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ --seed ${seed} \ --config ${config_path} \ @@ -34,6 +44,7 @@ python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/t --profiler-options "${profiler_options}" \ --benchmark-batch-size ${benchmark_batch_size} \ --benchmark-max-step ${benchmark_max_step} +fi if [ ${seed} != 0 ]; then diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/train.py b/paddlespeech/s2t/exps/deepspeech2/bin/train.py index 9c7c61a5..e2c68d4b 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/train.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/train.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Trainer for DeepSpeech2 model.""" -from paddle import distributed as dist from yacs.config import CfgNode from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer diff --git a/paddlespeech/s2t/exps/u2/bin/train.py b/paddlespeech/s2t/exps/u2/bin/train.py index cfd7be81..dc3a87c1 100644 --- a/paddlespeech/s2t/exps/u2/bin/train.py +++ b/paddlespeech/s2t/exps/u2/bin/train.py @@ -15,7 +15,6 @@ import cProfile import os -from paddle import distributed as dist from yacs.config import CfgNode from paddlespeech.s2t.exps.u2.model import U2Trainer as Trainer diff --git a/paddlespeech/s2t/exps/u2_kaldi/bin/train.py b/paddlespeech/s2t/exps/u2_kaldi/bin/train.py index f802e53d..b11da715 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/bin/train.py +++ b/paddlespeech/s2t/exps/u2_kaldi/bin/train.py @@ -15,7 +15,6 @@ import cProfile import os -from paddle import distributed as dist from yacs.config import CfgNode from paddlespeech.s2t.training.cli import default_argument_parser diff --git a/paddlespeech/s2t/exps/u2_st/bin/train.py b/paddlespeech/s2t/exps/u2_st/bin/train.py index 8d4bfca8..574942e5 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/train.py +++ b/paddlespeech/s2t/exps/u2_st/bin/train.py @@ -15,7 +15,6 @@ import cProfile import os -from paddle import distributed as dist from yacs.config import CfgNode from paddlespeech.s2t.exps.u2_st.model import U2STTrainer as Trainer diff --git a/paddlespeech/s2t/io/sampler.py b/paddlespeech/s2t/io/sampler.py index ac55af12..89752bb9 100644 --- a/paddlespeech/s2t/io/sampler.py +++ b/paddlespeech/s2t/io/sampler.py @@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False): """ rng = np.random.RandomState(epoch) shift_len = rng.randint(0, batch_size - 1) - batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size)) + batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size)) rng.shuffle(batch_indices) batch_indices = [item for batch in batch_indices for item in batch] assert 
clipped is False diff --git a/paddlespeech/t2s/modules/transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py index 1e946adf..2073a78b 100644 --- a/paddlespeech/t2s/modules/transformer/repeat.py +++ b/paddlespeech/t2s/modules/transformer/repeat.py @@ -36,4 +36,4 @@ def repeat(N, fn): Returns: MultiSequential: Repeated model instance. """ - return MultiSequential(* [fn(n) for n in range(N)]) + return MultiSequential(*[fn(n) for n in range(N)]) From 8a3c88d42e24cf3b57c50387f3c9691307e1fcf6 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 9 May 2022 09:25:03 +0000 Subject: [PATCH 45/93] add launch for st, test=asr --- examples/aishell/asr0/local/train.sh | 1 + examples/aishell/asr0/run.sh | 2 +- examples/aishell/asr1/local/train.sh | 1 + examples/callcenter/asr1/local/train.sh | 1 + examples/librispeech/asr0/local/train.sh | 1 + examples/librispeech/asr1/local/train.sh | 1 + examples/librispeech/asr2/local/train.sh | 1 + examples/ted_en_zh/st0/local/train.sh | 8 ++++++++ examples/ted_en_zh/st1/local/train.sh | 11 ++++++++++- examples/timit/asr1/local/train.sh | 9 +++++++++ examples/tiny/asr0/local/train.sh | 1 + examples/tiny/asr1/local/train.sh | 1 + 12 files changed, 36 insertions(+), 2 deletions(-) diff --git a/examples/aishell/asr0/local/train.sh b/examples/aishell/asr0/local/train.sh index 7c0ad075..102c051c 100755 --- a/examples/aishell/asr0/local/train.sh +++ b/examples/aishell/asr0/local/train.sh @@ -29,6 +29,7 @@ python3 -u ${BIN_DIR}/train.py \ --seed ${seed} else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ diff --git a/examples/aishell/asr0/run.sh b/examples/aishell/asr0/run.sh index 0542e361..114af5a9 100755 --- a/examples/aishell/asr0/run.sh +++ b/examples/aishell/asr0/run.sh @@ -2,7 +2,7 @@ set -e source path.sh -gpus=1,2,3 +gpus=0,1,2,3 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml 
#conf/deepspeech2.yaml or conf/deepspeech2_online.yaml diff --git a/examples/aishell/asr1/local/train.sh b/examples/aishell/asr1/local/train.sh index eb9f4f69..5617f7ef 100755 --- a/examples/aishell/asr1/local/train.sh +++ b/examples/aishell/asr1/local/train.sh @@ -38,6 +38,7 @@ python3 -u ${BIN_DIR}/train.py \ --benchmark-max-step ${benchmark_max_step} else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ --seed ${seed} \ --config ${config_path} \ --output exp/${ckpt_name} \ diff --git a/examples/callcenter/asr1/local/train.sh b/examples/callcenter/asr1/local/train.sh index 3d7d35f1..03b4588e 100755 --- a/examples/callcenter/asr1/local/train.sh +++ b/examples/callcenter/asr1/local/train.sh @@ -29,6 +29,7 @@ python3 -u ${BIN_DIR}/train.py \ --seed ${seed} else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} diff --git a/examples/librispeech/asr0/local/train.sh b/examples/librispeech/asr0/local/train.sh index 788f9bf8..50d1d192 100755 --- a/examples/librispeech/asr0/local/train.sh +++ b/examples/librispeech/asr0/local/train.sh @@ -29,6 +29,7 @@ python3 -u ${BIN_DIR}/train.py \ --seed ${seed} else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ diff --git a/examples/librispeech/asr1/local/train.sh b/examples/librispeech/asr1/local/train.sh index b21c1017..3860d85c 100755 --- a/examples/librispeech/asr1/local/train.sh +++ b/examples/librispeech/asr1/local/train.sh @@ -30,6 +30,7 @@ python3 -u ${BIN_DIR}/train.py \ --seed ${seed} else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} diff --git 
a/examples/librispeech/asr2/local/train.sh b/examples/librispeech/asr2/local/train.sh index a3ad5fd7..560424ea 100755 --- a/examples/librispeech/asr2/local/train.sh +++ b/examples/librispeech/asr2/local/train.sh @@ -28,6 +28,7 @@ python3 -u ${BIN_DIR}/train.py \ --seed ${seed} else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ --model-name u2_kaldi \ --config ${config_path} \ --output exp/${ckpt_name} \ diff --git a/examples/ted_en_zh/st0/local/train.sh b/examples/ted_en_zh/st0/local/train.sh index e366376b..ad00653b 100755 --- a/examples/ted_en_zh/st0/local/train.sh +++ b/examples/ted_en_zh/st0/local/train.sh @@ -19,11 +19,19 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi +if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} +else +python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--config ${config_path} \ +--output exp/${ckpt_name} \ +--seed ${seed} +fi if [ ${seed} != 0 ]; then unset FLAGS_cudnn_deterministic diff --git a/examples/ted_en_zh/st1/local/train.sh b/examples/ted_en_zh/st1/local/train.sh index a8e4acaa..5da64e99 100755 --- a/examples/ted_en_zh/st1/local/train.sh +++ b/examples/ted_en_zh/st1/local/train.sh @@ -20,12 +20,21 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi +if [ ${ngpu} == 0 ]; then python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --checkpoint_path "${ckpt_path}" \ --seed ${seed} +else +python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--config ${config_path} \ +--output exp/${ckpt_name} \ +--checkpoint_path "${ckpt_path}" \ +--seed ${seed} +fi if [ ${seed} != 0 ]; then unset FLAGS_cudnn_deterministic @@ -36,4 +45,4 @@ if [ $? 
-ne 0 ]; then exit 1 fi -exit 0 \ No newline at end of file +exit 0 diff --git a/examples/timit/asr1/local/train.sh b/examples/timit/asr1/local/train.sh index c4648c3e..66140758 100755 --- a/examples/timit/asr1/local/train.sh +++ b/examples/timit/asr1/local/train.sh @@ -19,10 +19,19 @@ if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi +if [ ${ngpu} == 0 ]; then +python3 -u ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--config ${config_path} \ +--output exp/${ckpt_name} \ +--seed ${seed} +else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} +fi if [ ${seed} != 0 ]; then unset FLAGS_cudnn_deterministic diff --git a/examples/tiny/asr0/local/train.sh b/examples/tiny/asr0/local/train.sh index 5fecf3de..9060be67 100755 --- a/examples/tiny/asr0/local/train.sh +++ b/examples/tiny/asr0/local/train.sh @@ -36,6 +36,7 @@ python3 -u ${BIN_DIR}/train.py \ --seed ${seed} else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ diff --git a/examples/tiny/asr1/local/train.sh b/examples/tiny/asr1/local/train.sh index eb9f4f69..5617f7ef 100755 --- a/examples/tiny/asr1/local/train.sh +++ b/examples/tiny/asr1/local/train.sh @@ -38,6 +38,7 @@ python3 -u ${BIN_DIR}/train.py \ --benchmark-max-step ${benchmark_max_step} else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ --seed ${seed} \ --config ${config_path} \ --output exp/${ckpt_name} \ From e55177c3db5e35a59db87c8b61addac88ae0dda0 Mon Sep 17 00:00:00 2001 From: QingshuChen Date: Mon, 9 May 2022 11:05:17 +0000 Subject: [PATCH 46/93] speedyspeech support kunlun --- paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py | 9 +++++++-- paddlespeech/t2s/exps/speedyspeech/train.py | 9 +++++++-- 2 files 
changed, 14 insertions(+), 4 deletions(-) diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py index cb742c59..252ac932 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py @@ -174,12 +174,17 @@ def main(): parser.add_argument( "--inference-dir", type=str, help="dir to save inference models") parser.add_argument( - "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.") + parser.add_argument( + "--nxpu", type=int, default=0, help="if nxpu == 0 and ngpu == 0, use cpu.") args, _ = parser.parse_known_args() if args.ngpu == 0: - paddle.set_device("cpu") + if args.nxpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("xpu") elif args.ngpu > 0: paddle.set_device("gpu") else: diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index bda5370c..d4cfe348 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -46,7 +46,10 @@ def train_sp(args, config): # setup running environment correctly world_size = paddle.distributed.get_world_size() if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: - paddle.set_device("cpu") + if (not paddle.is_compiled_with_xpu()) or args.nxpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("xpu") else: paddle.set_device("gpu") if world_size > 1: @@ -185,7 +188,9 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + "--nxpu", type=int, default=0, help="if nxpu == 0 and ngpu == 0, use cpu.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu") parser.add_argument( "--use-relative-path", 
From 06bea5f03d35f7e5fb5dfc1dbb51ff982b875a7e Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Mon, 9 May 2022 19:15:14 +0800 Subject: [PATCH 47/93] update the vector and text readme, test=doc --- demos/speech_server/README.md | 166 ++++++++++++++++++ demos/speech_server/README_cn.md | 160 +++++++++++++++++ demos/speech_server/conf/application.yaml | 25 ++- .../ws_conformer_wenetspeech_application.yaml | 46 +++++ paddlespeech/server/README.md | 21 +++ .../engine/asr/online/pretrained_models.py | 18 ++ 6 files changed, 435 insertions(+), 1 deletion(-) create mode 100644 demos/streaming_asr_server/conf/ws_conformer_wenetspeech_application.yaml diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index 3df93238..2334ec22 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -236,6 +236,166 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ``` +### 7. Speaker Verification Client Usage + +#### 7.1 Extract speaker embedding +**Note:** The response time will be slightly longer when using the client for the first time. +- Command Line (Recommended) + +``` bash +paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8590 --input 85236145389.wav +``` + + * Usage: + + ``` bash + paddlespeech_client vector --help + ``` + + * Arguments: + * server_ip: server ip. Default: 127.0.0.1 + * port: server port. Default: 8090 + * input(required): Input audio file.
+ * task: the vector task to run; can be 'spk' or 'score'. Default is 'spk'. + * enroll: enroll audio + * test: test audio + + * Output: + + ``` bash + [2022-05-08 00:18:44,249] [ INFO] - vector http client start + [2022-05-08 00:18:44,250] [ INFO] - the input audio: 85236145389.wav + [2022-05-08 00:18:44,250] [ INFO] - endpoint: http://127.0.0.1:8590/paddlespeech/vector + [2022-05-08 00:18:44,250] [ INFO] - http://127.0.0.1:8590/paddlespeech/vector + [2022-05-08 00:18:44,406] [ INFO] - The vector: {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'vec': [1.421751856803894, 5.626245498657227, -5.342077255249023, 1.1773887872695923, 3.3080549240112305, 1.7565933465957642, 5.167886257171631, 10.806358337402344, -3.8226819038391113, -5.614140033721924, 2.6238479614257812, -0.8072972893714905, 1.9635076522827148, -7.312870025634766, 0.011035939678549767, -9.723129272460938, 0.6619706153869629, -6.976806163787842, 10.213476181030273, 7.494769096374512, 2.9105682373046875, 3.8949244022369385, 3.799983501434326, 7.106168746948242, 16.90532875061035, -7.149388313293457, 8.733108520507812, 3.423006296157837, -4.831653594970703, -11.403363227844238, 11.232224464416504, 7.127461910247803, -4.282842636108398, 2.452359437942505, -5.130749702453613, -18.17766761779785, -2.6116831302642822, -11.000344276428223, -6.731433391571045, 1.6564682722091675, 0.7618281245231628, 1.125300407409668, -2.0838370323181152, 4.725743293762207, -8.782588005065918, -3.5398752689361572, 3.8142364025115967, 5.142068862915039, 2.1620609760284424, 4.09643030166626, -6.416214942932129, 12.747446060180664, 1.9429892301559448, -15.15294361114502, 6.417416095733643, 16.09701156616211, -9.716667175292969, -1.9920575618743896, -3.36494779586792, -1.8719440698623657, 11.567351341247559, 3.6978814601898193, 11.258262634277344, 7.442368507385254, 9.183408737182617, 4.528149127960205, -1.2417854070663452, 4.395912170410156, 6.6727728843688965, 5.88988733291626, 7.627128601074219,
-0.6691966652870178, -11.889698028564453, -9.20886516571045, -7.42740535736084, -3.777663230895996, 6.917238712310791, -9.848755836486816, -2.0944676399230957, -5.1351165771484375, 0.4956451654434204, 9.317537307739258, -5.914181232452393, -1.809860348701477, -0.11738915741443634, -7.1692705154418945, -1.057827353477478, -5.721670627593994, -5.117385387420654, 16.13765525817871, -4.473617076873779, 7.6624321937561035, -0.55381840467453, 9.631585121154785, -6.470459461212158, -8.548508644104004, 4.371616840362549, -0.7970245480537415, 4.4789886474609375, -2.975860834121704, 3.2721822261810303, 2.838287830352783, 5.134591102600098, -9.19079875946045, -0.5657302737236023, -4.8745832443237305, 2.3165574073791504, -5.984319686889648, -2.1798853874206543, 0.3554139733314514, -0.3178512752056122, 9.493552207946777, 2.1144471168518066, 4.358094692230225, -12.089824676513672, 8.451693534851074, -7.925466537475586, 4.624246597290039, 4.428936958312988, 18.69200897216797, -2.6204581260681152, -5.14918851852417, -0.3582090139389038, 8.488558769226074, 4.98148775100708, -9.326835632324219, -2.2544219493865967, 6.641760349273682, 1.2119598388671875, 10.977124214172363, 16.555034637451172, 3.3238420486450195, 9.551861763000488, -1.6676981449127197, -0.7953944206237793, -8.605667114257812, -0.4735655188560486, 2.674196243286133, -5.359177112579346, -2.66738224029541, 0.6660683155059814, 15.44322681427002, 4.740593433380127, -3.472534418106079, 11.592567443847656, -2.0544962882995605, 1.736127495765686, -8.265326499938965, -9.30447769165039, 5.406829833984375, -1.518022894859314, -7.746612548828125, -6.089611053466797, 0.07112743705511093, -0.3490503430366516, -8.64989185333252, -9.998957633972168, -2.564845085144043, -0.5399947762489319, 2.6018123626708984, -0.3192799389362335, -1.8815255165100098, -2.0721492767333984, -3.410574436187744, -8.29980754852295, 1.483638048171997, -15.365986824035645, -8.288211822509766, 3.884779930114746, -3.4876468181610107, 7.362999439239502, 
0.4657334089279175, 3.1326050758361816, 12.438895225524902, -1.8337041139602661, 4.532927989959717, 2.7264339923858643, 10.14534854888916, -6.521963596343994, 2.897155523300171, -3.392582654953003, 5.079153060913086, 7.7597246170043945, 4.677570819854736, 5.845779895782471, 2.402411460876465, 7.7071051597595215, 3.9711380004882812, -6.39003849029541, 6.12687873840332, -3.776029348373413, -11.118121147155762]}} + [2022-05-08 00:18:44,406] [ INFO] - Response time 0.156481 s. + ``` + +* Python API + +``` python +from paddlespeech.server.bin.paddlespeech_client import VectorClientExecutor + +vectorclient_executor = VectorClientExecutor() +res = vectorclient_executor( + input="85236145389.wav", + server_ip="127.0.0.1", + port=8590, + task="spk") +print(res) +``` + +* Output: + + ``` bash + {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'vec': [1.421751856803894, 5.626245498657227, -5.342077255249023, 1.1773887872695923, 3.3080549240112305, 1.7565933465957642, 5.167886257171631, 10.806358337402344, -3.8226819038391113, -5.614140033721924, 2.6238479614257812, -0.8072972893714905, 1.9635076522827148, -7.312870025634766, 0.011035939678549767, -9.723129272460938, 0.6619706153869629, -6.976806163787842, 10.213476181030273, 7.494769096374512, 2.9105682373046875, 3.8949244022369385, 3.799983501434326, 7.106168746948242, 16.90532875061035, -7.149388313293457, 8.733108520507812, 3.423006296157837, -4.831653594970703, -11.403363227844238, 11.232224464416504, 7.127461910247803, -4.282842636108398, 2.452359437942505, -5.130749702453613, -18.17766761779785, -2.6116831302642822, -11.000344276428223, -6.731433391571045, 1.6564682722091675, 0.7618281245231628, 1.125300407409668, -2.0838370323181152, 4.725743293762207, -8.782588005065918, -3.5398752689361572, 3.8142364025115967, 5.142068862915039, 2.1620609760284424, 4.09643030166626, -6.416214942932129, 12.747446060180664, 1.9429892301559448, -15.15294361114502, 6.417416095733643, 16.09701156616211, 
-9.716667175292969, -1.9920575618743896, -3.36494779586792, -1.8719440698623657, 11.567351341247559, 3.6978814601898193, 11.258262634277344, 7.442368507385254, 9.183408737182617, 4.528149127960205, -1.2417854070663452, 4.395912170410156, 6.6727728843688965, 5.88988733291626, 7.627128601074219, -0.6691966652870178, -11.889698028564453, -9.20886516571045, -7.42740535736084, -3.777663230895996, 6.917238712310791, -9.848755836486816, -2.0944676399230957, -5.1351165771484375, 0.4956451654434204, 9.317537307739258, -5.914181232452393, -1.809860348701477, -0.11738915741443634, -7.1692705154418945, -1.057827353477478, -5.721670627593994, -5.117385387420654, 16.13765525817871, -4.473617076873779, 7.6624321937561035, -0.55381840467453, 9.631585121154785, -6.470459461212158, -8.548508644104004, 4.371616840362549, -0.7970245480537415, 4.4789886474609375, -2.975860834121704, 3.2721822261810303, 2.838287830352783, 5.134591102600098, -9.19079875946045, -0.5657302737236023, -4.8745832443237305, 2.3165574073791504, -5.984319686889648, -2.1798853874206543, 0.3554139733314514, -0.3178512752056122, 9.493552207946777, 2.1144471168518066, 4.358094692230225, -12.089824676513672, 8.451693534851074, -7.925466537475586, 4.624246597290039, 4.428936958312988, 18.69200897216797, -2.6204581260681152, -5.14918851852417, -0.3582090139389038, 8.488558769226074, 4.98148775100708, -9.326835632324219, -2.2544219493865967, 6.641760349273682, 1.2119598388671875, 10.977124214172363, 16.555034637451172, 3.3238420486450195, 9.551861763000488, -1.6676981449127197, -0.7953944206237793, -8.605667114257812, -0.4735655188560486, 2.674196243286133, -5.359177112579346, -2.66738224029541, 0.6660683155059814, 15.44322681427002, 4.740593433380127, -3.472534418106079, 11.592567443847656, -2.0544962882995605, 1.736127495765686, -8.265326499938965, -9.30447769165039, 5.406829833984375, -1.518022894859314, -7.746612548828125, -6.089611053466797, 0.07112743705511093, -0.3490503430366516, -8.64989185333252, 
-9.998957633972168, -2.564845085144043, -0.5399947762489319, 2.6018123626708984, -0.3192799389362335, -1.8815255165100098, -2.0721492767333984, -3.410574436187744, -8.29980754852295, 1.483638048171997, -15.365986824035645, -8.288211822509766, 3.884779930114746, -3.4876468181610107, 7.362999439239502, 0.4657334089279175, 3.1326050758361816, 12.438895225524902, -1.8337041139602661, 4.532927989959717, 2.7264339923858643, 10.14534854888916, -6.521963596343994, 2.897155523300171, -3.392582654953003, 5.079153060913086, 7.7597246170043945, 4.677570819854736, 5.845779895782471, 2.402411460876465, 7.7071051597595215, 3.9711380004882812, -6.39003849029541, 6.12687873840332, -3.776029348373413, -11.118121147155762]}} + ``` + +#### 7.2 Get the score between speaker audio embedding + +**Note:** The response time will be slightly longer when using the client for the first time + +- Command Line (Recommended) + + ``` bash + paddlespeech_client vector --task score --server_ip 127.0.0.1 --port 8590 --enroll 85236145389.wav --test 123456789.wav + ``` + + * Usage: + + ``` bash + paddlespeech_client vector --help + ``` + + * Arguments: + * server_ip: server ip. Default: 127.0.0.1 + * port: server port. Default: 8090 + * input(required): Input text to generate. + * task: the task of vector, can be use 'spk' or 'score。If get the score, this must be 'score' parameter. + * enroll: enroll audio + * test: test audio + +* Output: + +``` bash + [2022-05-09 10:28:40,556] [ INFO] - vector score http client start + [2022-05-09 10:28:40,556] [ INFO] - enroll audio: 85236145389.wav, test audio: 123456789.wav + [2022-05-09 10:28:40,556] [ INFO] - endpoint: http://127.0.0.1:8590/paddlespeech/vector/score + [2022-05-09 10:28:40,731] [ INFO] - The vector score is: {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'score': 0.4292638897895813}} + [2022-05-09 10:28:40,731] [ INFO] - The vector: None + [2022-05-09 10:28:40,731] [ INFO] - Response time 0.175514 s. 
+``` + +* Python API + +``` python +from paddlespeech.server.bin.paddlespeech_client import VectorClientExecutor + +vectorclient_executor = VectorClientExecutor() +res = vectorclient_executor( + input=None, + enroll_audio="85236145389.wav", + test_audio="123456789.wav", + server_ip="127.0.0.1", + port=8590, + task="score") +print(res) +``` + +* Output: + +``` bash +[2022-05-09 10:34:54,769] [ INFO] - vector score http client start +[2022-05-09 10:34:54,771] [ INFO] - enroll audio: 85236145389.wav, test audio: 123456789.wav +[2022-05-09 10:34:54,771] [ INFO] - endpoint: http://127.0.0.1:8590/paddlespeech/vector/score +[2022-05-09 10:34:55,026] [ INFO] - The vector score is: {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'score': 0.4292638897895813}} +``` + + +### 8. Punctuation prediction + +**Note:** The response time will be slightly longer when using the client for the first time + +- Command Line (Recommended) + ``` bash + paddlespeech_client text --server_ip 127.0.0.1 --port 8090 --input "我认为跑步最重要的就是给我带来了身体健康" + ``` + + Usage: + + ```bash + paddlespeech_client text --help + ``` + Arguments: + - `server_ip`: server ip. Default: 127.0.0.1 + - `port`: server port. Default: 8090 + - `input`(required): Input text to get punctuation. + + Output: + ```bash + [2022-05-09 18:19:04,397] [ INFO] - The punc text: 我认为跑步最重要的就是给我带来了身体健康。 + [2022-05-09 18:19:04,397] [ INFO] - Response time 0.092407 s. + ``` + +- Python API + ```python + from paddlespeech.server.bin.paddlespeech_client import TextClientExecutor + + textclient_executor = TextClientExecutor() + res = textclient_executor( + input="我认为跑步最重要的就是给我带来了身体健康", + server_ip="127.0.0.1", + port=8390,) + print(res) + + ``` + + Output: + ```bash + 我认为跑步最重要的就是给我带来了身体健康。 + ``` + + ## Models supported by the service ### ASR model Get all models supported by the ASR service via `paddlespeech_server stats --task asr`, where static models can be used for paddle inference inference. 
@@ -245,3 +405,9 @@ Get all models supported by the TTS service via `paddlespeech_server stats --tas ### CLS model Get all models supported by the CLS service via `paddlespeech_server stats --task cls`, where static models can be used for paddle inference inference. + +### Vector model +Get all models supported by the Vector service via `paddlespeech_server stats --task vector`, where static models can be used for paddle inference inference. + +### Text model +Get all models supported by the Text service via `paddlespeech_server stats --task text`, where static models can be used for paddle inference inference. diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index 55fc6b34..5f24239d 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -194,6 +194,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ``` ### 6. CLS 客户端使用方法 + **注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) ``` @@ -240,6 +241,159 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ``` +### 7. 
声纹客户端使用方法 + +#### 7.1 提取声纹特征 +注意: 初次使用客户端时响应时间会略长 +* 命令行 (推荐使用) + +``` bash +paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8590 --input 85236145389.wav +``` + +* 使用帮助: + +``` bash +paddlespeech_client vector --help +``` +* 参数: + * server_ip: 服务端ip地址,默认: 127.0.0.1。 + * port: 服务端口,默认: 8090。 + * input(必须输入): 用于识别的音频文件。 + * task: vector 的任务,可选spk或者score。默认是 spk。 + * enroll: 注册音频;。 + * test: 测试音频。 +* 输出: + +``` bash + [2022-05-08 00:18:44,249] [ INFO] - vector http client start + [2022-05-08 00:18:44,250] [ INFO] - the input audio: 85236145389.wav + [2022-05-08 00:18:44,250] [ INFO] - endpoint: http://127.0.0.1:8590/paddlespeech/vector + [2022-05-08 00:18:44,250] [ INFO] - http://127.0.0.1:8590/paddlespeech/vector + [2022-05-08 00:18:44,406] [ INFO] - The vector: {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'vec': [1.421751856803894, 5.626245498657227, -5.342077255249023, 1.1773887872695923, 3.3080549240112305, 1.7565933465957642, 5.167886257171631, 10.806358337402344, -3.8226819038391113, -5.614140033721924, 2.6238479614257812, -0.8072972893714905, 1.9635076522827148, -7.312870025634766, 0.011035939678549767, -9.723129272460938, 0.6619706153869629, -6.976806163787842, 10.213476181030273, 7.494769096374512, 2.9105682373046875, 3.8949244022369385, 3.799983501434326, 7.106168746948242, 16.90532875061035, -7.149388313293457, 8.733108520507812, 3.423006296157837, -4.831653594970703, -11.403363227844238, 11.232224464416504, 7.127461910247803, -4.282842636108398, 2.452359437942505, -5.130749702453613, -18.17766761779785, -2.6116831302642822, -11.000344276428223, -6.731433391571045, 1.6564682722091675, 0.7618281245231628, 1.125300407409668, -2.0838370323181152, 4.725743293762207, -8.782588005065918, -3.5398752689361572, 3.8142364025115967, 5.142068862915039, 2.1620609760284424, 4.09643030166626, -6.416214942932129, 12.747446060180664, 1.9429892301559448, -15.15294361114502, 6.417416095733643, 16.09701156616211, 
-9.716667175292969, -1.9920575618743896, -3.36494779586792, -1.8719440698623657, 11.567351341247559, 3.6978814601898193, 11.258262634277344, 7.442368507385254, 9.183408737182617, 4.528149127960205, -1.2417854070663452, 4.395912170410156, 6.6727728843688965, 5.88988733291626, 7.627128601074219, -0.6691966652870178, -11.889698028564453, -9.20886516571045, -7.42740535736084, -3.777663230895996, 6.917238712310791, -9.848755836486816, -2.0944676399230957, -5.1351165771484375, 0.4956451654434204, 9.317537307739258, -5.914181232452393, -1.809860348701477, -0.11738915741443634, -7.1692705154418945, -1.057827353477478, -5.721670627593994, -5.117385387420654, 16.13765525817871, -4.473617076873779, 7.6624321937561035, -0.55381840467453, 9.631585121154785, -6.470459461212158, -8.548508644104004, 4.371616840362549, -0.7970245480537415, 4.4789886474609375, -2.975860834121704, 3.2721822261810303, 2.838287830352783, 5.134591102600098, -9.19079875946045, -0.5657302737236023, -4.8745832443237305, 2.3165574073791504, -5.984319686889648, -2.1798853874206543, 0.3554139733314514, -0.3178512752056122, 9.493552207946777, 2.1144471168518066, 4.358094692230225, -12.089824676513672, 8.451693534851074, -7.925466537475586, 4.624246597290039, 4.428936958312988, 18.69200897216797, -2.6204581260681152, -5.14918851852417, -0.3582090139389038, 8.488558769226074, 4.98148775100708, -9.326835632324219, -2.2544219493865967, 6.641760349273682, 1.2119598388671875, 10.977124214172363, 16.555034637451172, 3.3238420486450195, 9.551861763000488, -1.6676981449127197, -0.7953944206237793, -8.605667114257812, -0.4735655188560486, 2.674196243286133, -5.359177112579346, -2.66738224029541, 0.6660683155059814, 15.44322681427002, 4.740593433380127, -3.472534418106079, 11.592567443847656, -2.0544962882995605, 1.736127495765686, -8.265326499938965, -9.30447769165039, 5.406829833984375, -1.518022894859314, -7.746612548828125, -6.089611053466797, 0.07112743705511093, -0.3490503430366516, -8.64989185333252, 
-9.998957633972168, -2.564845085144043, -0.5399947762489319, 2.6018123626708984, -0.3192799389362335, -1.8815255165100098, -2.0721492767333984, -3.410574436187744, -8.29980754852295, 1.483638048171997, -15.365986824035645, -8.288211822509766, 3.884779930114746, -3.4876468181610107, 7.362999439239502, 0.4657334089279175, 3.1326050758361816, 12.438895225524902, -1.8337041139602661, 4.532927989959717, 2.7264339923858643, 10.14534854888916, -6.521963596343994, 2.897155523300171, -3.392582654953003, 5.079153060913086, 7.7597246170043945, 4.677570819854736, 5.845779895782471, 2.402411460876465, 7.7071051597595215, 3.9711380004882812, -6.39003849029541, 6.12687873840332, -3.776029348373413, -11.118121147155762]}} + [2022-05-08 00:18:44,406] [ INFO] - Response time 0.156481 s. +``` + +* Python API + +``` python +from paddlespeech.server.bin.paddlespeech_client import VectorClientExecutor + +vectorclient_executor = VectorClientExecutor() +res = vectorclient_executor( + input="85236145389.wav", + server_ip="127.0.0.1", + port=8590, + task="spk") +print(res) +``` + +* 输出: + +``` bash + {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'vec': [1.421751856803894, 5.626245498657227, -5.342077255249023, 1.1773887872695923, 3.3080549240112305, 1.7565933465957642, 5.167886257171631, 10.806358337402344, -3.8226819038391113, -5.614140033721924, 2.6238479614257812, -0.8072972893714905, 1.9635076522827148, -7.312870025634766, 0.011035939678549767, -9.723129272460938, 0.6619706153869629, -6.976806163787842, 10.213476181030273, 7.494769096374512, 2.9105682373046875, 3.8949244022369385, 3.799983501434326, 7.106168746948242, 16.90532875061035, -7.149388313293457, 8.733108520507812, 3.423006296157837, -4.831653594970703, -11.403363227844238, 11.232224464416504, 7.127461910247803, -4.282842636108398, 2.452359437942505, -5.130749702453613, -18.17766761779785, -2.6116831302642822, -11.000344276428223, -6.731433391571045, 1.6564682722091675, 0.7618281245231628, 
1.125300407409668, -2.0838370323181152, 4.725743293762207, -8.782588005065918, -3.5398752689361572, 3.8142364025115967, 5.142068862915039, 2.1620609760284424, 4.09643030166626, -6.416214942932129, 12.747446060180664, 1.9429892301559448, -15.15294361114502, 6.417416095733643, 16.09701156616211, -9.716667175292969, -1.9920575618743896, -3.36494779586792, -1.8719440698623657, 11.567351341247559, 3.6978814601898193, 11.258262634277344, 7.442368507385254, 9.183408737182617, 4.528149127960205, -1.2417854070663452, 4.395912170410156, 6.6727728843688965, 5.88988733291626, 7.627128601074219, -0.6691966652870178, -11.889698028564453, -9.20886516571045, -7.42740535736084, -3.777663230895996, 6.917238712310791, -9.848755836486816, -2.0944676399230957, -5.1351165771484375, 0.4956451654434204, 9.317537307739258, -5.914181232452393, -1.809860348701477, -0.11738915741443634, -7.1692705154418945, -1.057827353477478, -5.721670627593994, -5.117385387420654, 16.13765525817871, -4.473617076873779, 7.6624321937561035, -0.55381840467453, 9.631585121154785, -6.470459461212158, -8.548508644104004, 4.371616840362549, -0.7970245480537415, 4.4789886474609375, -2.975860834121704, 3.2721822261810303, 2.838287830352783, 5.134591102600098, -9.19079875946045, -0.5657302737236023, -4.8745832443237305, 2.3165574073791504, -5.984319686889648, -2.1798853874206543, 0.3554139733314514, -0.3178512752056122, 9.493552207946777, 2.1144471168518066, 4.358094692230225, -12.089824676513672, 8.451693534851074, -7.925466537475586, 4.624246597290039, 4.428936958312988, 18.69200897216797, -2.6204581260681152, -5.14918851852417, -0.3582090139389038, 8.488558769226074, 4.98148775100708, -9.326835632324219, -2.2544219493865967, 6.641760349273682, 1.2119598388671875, 10.977124214172363, 16.555034637451172, 3.3238420486450195, 9.551861763000488, -1.6676981449127197, -0.7953944206237793, -8.605667114257812, -0.4735655188560486, 2.674196243286133, -5.359177112579346, -2.66738224029541, 0.6660683155059814, 
15.44322681427002, 4.740593433380127, -3.472534418106079, 11.592567443847656, -2.0544962882995605, 1.736127495765686, -8.265326499938965, -9.30447769165039, 5.406829833984375, -1.518022894859314, -7.746612548828125, -6.089611053466797, 0.07112743705511093, -0.3490503430366516, -8.64989185333252, -9.998957633972168, -2.564845085144043, -0.5399947762489319, 2.6018123626708984, -0.3192799389362335, -1.8815255165100098, -2.0721492767333984, -3.410574436187744, -8.29980754852295, 1.483638048171997, -15.365986824035645, -8.288211822509766, 3.884779930114746, -3.4876468181610107, 7.362999439239502, 0.4657334089279175, 3.1326050758361816, 12.438895225524902, -1.8337041139602661, 4.532927989959717, 2.7264339923858643, 10.14534854888916, -6.521963596343994, 2.897155523300171, -3.392582654953003, 5.079153060913086, 7.7597246170043945, 4.677570819854736, 5.845779895782471, 2.402411460876465, 7.7071051597595215, 3.9711380004882812, -6.39003849029541, 6.12687873840332, -3.776029348373413, -11.118121147155762]}} +``` + +#### 7.2 音频声纹打分 + +注意: 初次使用客户端时响应时间会略长 +* 命令行 (推荐使用) + +``` bash +paddlespeech_client vector --task score --server_ip 127.0.0.1 --port 8590 --enroll 85236145389.wav --test 123456789.wav +``` + +* 使用帮助: + +``` bash +paddlespeech_client vector --help +``` + +* 参数: + * server_ip: 服务端ip地址,默认: 127.0.0.1。 + * port: 服务端口,默认: 8090。 + * input(必须输入): 用于识别的音频文件。 + * task: vector 的任务,可选spk或者score。默认是 spk。 + * enroll: 注册音频;。 + * test: 测试音频。 +* 输出: + +``` bash + [2022-05-09 10:28:40,556] [ INFO] - vector score http client start + [2022-05-09 10:28:40,556] [ INFO] - enroll audio: 85236145389.wav, test audio: 123456789.wav + [2022-05-09 10:28:40,556] [ INFO] - endpoint: http://127.0.0.1:8590/paddlespeech/vector/score + [2022-05-09 10:28:40,731] [ INFO] - The vector score is: {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'score': 0.4292638897895813}} + [2022-05-09 10:28:40,731] [ INFO] - The vector: None + [2022-05-09 10:28:40,731] [ INFO] - 
Response time 0.175514 s. +``` + +* Python API + +``` python +from paddlespeech.server.bin.paddlespeech_client import VectorClientExecutor + +vectorclient_executor = VectorClientExecutor() +res = vectorclient_executor( + input=None, + enroll_audio="85236145389.wav", + test_audio="123456789.wav", + server_ip="127.0.0.1", + port=8590, + task="score") +print(res) +``` + +* 输出: + +``` bash +[2022-05-09 10:34:54,769] [ INFO] - vector score http client start +[2022-05-09 10:34:54,771] [ INFO] - enroll audio: 85236145389.wav, test audio: 123456789.wav +[2022-05-09 10:34:54,771] [ INFO] - endpoint: http://127.0.0.1:8590/paddlespeech/vector/score +[2022-05-09 10:34:55,026] [ INFO] - The vector score is: {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'score': 0.4292638897895813}} +``` + + +### 8. 标点预测 + + **注意:** 初次使用客户端时响应时间会略长 + - 命令行 (推荐使用) + ``` bash + paddlespeech_client text --server_ip 127.0.0.1 --port 8090 --input "我认为跑步最重要的就是给我带来了身体健康" + ``` + + 使用帮助: + + ```bash + paddlespeech_client text --help + ``` + 参数: + - `server_ip`: 服务端ip地址,默认: 127.0.0.1。 + - `port`: 服务端口,默认: 8090。 + - `input`(必须输入): 用于标点预测的文本内容。 + + 输出: + ```bash + [2022-05-09 18:19:04,397] [ INFO] - The punc text: 我认为跑步最重要的就是给我带来了身体健康。 + [2022-05-09 18:19:04,397] [ INFO] - Response time 0.092407 s. 
+ ``` + +- Python API + ```python + from paddlespeech.server.bin.paddlespeech_client import TextClientExecutor + + textclient_executor = TextClientExecutor() + res = textclient_executor( + input="我认为跑步最重要的就是给我带来了身体健康", + server_ip="127.0.0.1", + port=8390,) + print(res) + + ``` + + 输出: + ```bash + 我认为跑步最重要的就是给我带来了身体健康。 + ``` ## 服务支持的模型 ### ASR支持的模型 @@ -250,3 +404,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ### CLS支持的模型 通过 `paddlespeech_server stats --task cls` 获取CLS服务支持的所有模型,其中静态模型可用于 paddle inference 推理。 + +### Vector支持的模型 +通过 `paddlespeech_server stats --task vector` 获取Vector服务支持的所有模型。 + +### Text支持的模型 +通过 `paddlespeech_server stats --task text` 获取Text服务支持的所有模型。 \ No newline at end of file diff --git a/demos/speech_server/conf/application.yaml b/demos/speech_server/conf/application.yaml index 762f4af6..14a9195a 100644 --- a/demos/speech_server/conf/application.yaml +++ b/demos/speech_server/conf/application.yaml @@ -9,7 +9,7 @@ port: 8090 # The task format in the engin_list is: _ # task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference'] protocol: 'http' -engine_list: ['asr_python', 'tts_python', 'cls_python'] +engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python'] ################################################################################# @@ -135,3 +135,26 @@ cls_inference: glog_info: False # True -> print glog summary: True # False -> do not show predictor config + +################################### Text ######################################### +################### text task: punc; engine_type: python ####################### +text_python: + task: punc + model_type: 'ernie_linear_p3_wudao' + lang: 'zh' + sample_rate: 16000 + cfg_path: # [optional] + ckpt_path: # [optional] + vocab_file: # [optional] + device: # set 'gpu:id' or 'cpu' + + +################################### Vector ###################################### 
+################### Vector task: spk; engine_type: python ####################### +vector_python: + task: spk + model_type: 'ecapatdnn_voxceleb12' + sample_rate: 16000 + cfg_path: # [optional] + ckpt_path: # [optional] + device: # set 'gpu:id' or 'cpu' \ No newline at end of file diff --git a/demos/streaming_asr_server/conf/ws_conformer_wenetspeech_application.yaml b/demos/streaming_asr_server/conf/ws_conformer_wenetspeech_application.yaml new file mode 100644 index 00000000..e9a89c19 --- /dev/null +++ b/demos/streaming_asr_server/conf/ws_conformer_wenetspeech_application.yaml @@ -0,0 +1,46 @@ +# This is the parameter configuration file for PaddleSpeech Serving. + +################################################################################# +# SERVER SETTING # +################################################################################# +host: 0.0.0.0 +port: 8090 + +# The task format in the engine_list is: <speech task>_<engine type> +# task choices = ['asr_online'] +# protocol = ['websocket'] (only one can be selected). +# websocket only supports online engine type. 
+protocol: 'websocket' +engine_list: ['asr_online'] + + +################################################################################# +# ENGINE CONFIG # +################################################################################# + +################################### ASR ######################################### +################### speech task: asr; engine_type: online ####################### +asr_online: + model_type: 'conformer_online_wenetspeech' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + lang: 'zh' + sample_rate: 16000 + cfg_path: + decode_method: + force_yes: True + device: 'cpu' # cpu or gpu:id + decode_method: "attention_rescoring" + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + chunk_buffer_conf: + window_n: 7 # frame + shift_n: 4 # frame + window_ms: 25 # ms + shift_ms: 10 # ms + sample_rate: 16000 + sample_width: 2 diff --git a/paddlespeech/server/README.md b/paddlespeech/server/README.md index f3dc9224..34b7fc2a 100644 --- a/paddlespeech/server/README.md +++ b/paddlespeech/server/README.md @@ -63,3 +63,24 @@ paddlespeech_server start --config_file conf/tts_online_application.yaml ``` paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --input "您好,欢迎使用百度飞桨深度学习框架!" 
--output output.wav ``` + + +## Speaker Verification + +### Launch speaker verification server + +``` +paddlespeech_server start --config_file conf/vector_application.yaml +``` + +### Extract speaker embedding from audio + +``` +paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8090 --input 85236145389.wav +``` + +### Get score with speaker audio embedding + +``` +paddlespeech_client vector --task score --server_ip 127.0.0.1 --port 8090 --enroll 123456789.wav --test 85236145389.wav +``` diff --git a/paddlespeech/server/engine/asr/online/pretrained_models.py b/paddlespeech/server/engine/asr/online/pretrained_models.py index 005977b4..ff377865 100644 --- a/paddlespeech/server/engine/asr/online/pretrained_models.py +++ b/paddlespeech/server/engine/asr/online/pretrained_models.py @@ -49,4 +49,22 @@ pretrained_models = { 'lm_md5': '29e02312deb2e59b3c8686c7966d4fe3' }, + "conformer_online_wenetspeech-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz', + 'md5': + 'b8c02632b04da34aca88459835be54a6', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/chunk_conformer/checkpoints/avg_10', + 'model': + 'exp/chunk_conformer/checkpoints/avg_10.pdparams', + 'params': + 'exp/chunk_conformer/checkpoints/avg_10.pdparams', + 'lm_url': + '', + 'lm_md5': + '', + }, } From 597d601dec340f19de634f2430de6f4270c79e53 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Mon, 9 May 2022 20:06:21 +0800 Subject: [PATCH 48/93] update the port to 8090, test=doc --- demos/speech_server/README.md | 16 ++++++++-------- demos/speech_server/README_cn.md | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index 2334ec22..bb974c97 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -243,7 +243,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - 
Command Line (Recommended) ``` bash -paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8590 --input 85236145389.wav +paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8090 --input 85236145389.wav ``` * Usage: @@ -265,7 +265,7 @@ paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8590 --input ``` bash [2022-05-08 00:18:44,249] [ INFO] - vector http client start [2022-05-08 00:18:44,250] [ INFO] - the input audio: 85236145389.wav - [2022-05-08 00:18:44,250] [ INFO] - endpoint: http://127.0.0.1:8590/paddlespeech/vector + [2022-05-08 00:18:44,250] [ INFO] - endpoint: http://127.0.0.1:8090/paddlespeech/vector [2022-05-08 00:18:44,250] [ INFO] - http://127.0.0.1:8590/paddlespeech/vector [2022-05-08 00:18:44,406] [ INFO] - The vector: {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'vec': [1.421751856803894, 5.626245498657227, -5.342077255249023, 1.1773887872695923, 3.3080549240112305, 1.7565933465957642, 5.167886257171631, 10.806358337402344, -3.8226819038391113, -5.614140033721924, 2.6238479614257812, -0.8072972893714905, 1.9635076522827148, -7.312870025634766, 0.011035939678549767, -9.723129272460938, 0.6619706153869629, -6.976806163787842, 10.213476181030273, 7.494769096374512, 2.9105682373046875, 3.8949244022369385, 3.799983501434326, 7.106168746948242, 16.90532875061035, -7.149388313293457, 8.733108520507812, 3.423006296157837, -4.831653594970703, -11.403363227844238, 11.232224464416504, 7.127461910247803, -4.282842636108398, 2.452359437942505, -5.130749702453613, -18.17766761779785, -2.6116831302642822, -11.000344276428223, -6.731433391571045, 1.6564682722091675, 0.7618281245231628, 1.125300407409668, -2.0838370323181152, 4.725743293762207, -8.782588005065918, -3.5398752689361572, 3.8142364025115967, 5.142068862915039, 2.1620609760284424, 4.09643030166626, -6.416214942932129, 12.747446060180664, 1.9429892301559448, -15.15294361114502, 6.417416095733643, 16.09701156616211, 
-9.716667175292969, -1.9920575618743896, -3.36494779586792, -1.8719440698623657, 11.567351341247559, 3.6978814601898193, 11.258262634277344, 7.442368507385254, 9.183408737182617, 4.528149127960205, -1.2417854070663452, 4.395912170410156, 6.6727728843688965, 5.88988733291626, 7.627128601074219, -0.6691966652870178, -11.889698028564453, -9.20886516571045, -7.42740535736084, -3.777663230895996, 6.917238712310791, -9.848755836486816, -2.0944676399230957, -5.1351165771484375, 0.4956451654434204, 9.317537307739258, -5.914181232452393, -1.809860348701477, -0.11738915741443634, -7.1692705154418945, -1.057827353477478, -5.721670627593994, -5.117385387420654, 16.13765525817871, -4.473617076873779, 7.6624321937561035, -0.55381840467453, 9.631585121154785, -6.470459461212158, -8.548508644104004, 4.371616840362549, -0.7970245480537415, 4.4789886474609375, -2.975860834121704, 3.2721822261810303, 2.838287830352783, 5.134591102600098, -9.19079875946045, -0.5657302737236023, -4.8745832443237305, 2.3165574073791504, -5.984319686889648, -2.1798853874206543, 0.3554139733314514, -0.3178512752056122, 9.493552207946777, 2.1144471168518066, 4.358094692230225, -12.089824676513672, 8.451693534851074, -7.925466537475586, 4.624246597290039, 4.428936958312988, 18.69200897216797, -2.6204581260681152, -5.14918851852417, -0.3582090139389038, 8.488558769226074, 4.98148775100708, -9.326835632324219, -2.2544219493865967, 6.641760349273682, 1.2119598388671875, 10.977124214172363, 16.555034637451172, 3.3238420486450195, 9.551861763000488, -1.6676981449127197, -0.7953944206237793, -8.605667114257812, -0.4735655188560486, 2.674196243286133, -5.359177112579346, -2.66738224029541, 0.6660683155059814, 15.44322681427002, 4.740593433380127, -3.472534418106079, 11.592567443847656, -2.0544962882995605, 1.736127495765686, -8.265326499938965, -9.30447769165039, 5.406829833984375, -1.518022894859314, -7.746612548828125, -6.089611053466797, 0.07112743705511093, -0.3490503430366516, -8.64989185333252, 
-9.998957633972168, -2.564845085144043, -0.5399947762489319, 2.6018123626708984, -0.3192799389362335, -1.8815255165100098, -2.0721492767333984, -3.410574436187744, -8.29980754852295, 1.483638048171997, -15.365986824035645, -8.288211822509766, 3.884779930114746, -3.4876468181610107, 7.362999439239502, 0.4657334089279175, 3.1326050758361816, 12.438895225524902, -1.8337041139602661, 4.532927989959717, 2.7264339923858643, 10.14534854888916, -6.521963596343994, 2.897155523300171, -3.392582654953003, 5.079153060913086, 7.7597246170043945, 4.677570819854736, 5.845779895782471, 2.402411460876465, 7.7071051597595215, 3.9711380004882812, -6.39003849029541, 6.12687873840332, -3.776029348373413, -11.118121147155762]}} [2022-05-08 00:18:44,406] [ INFO] - Response time 0.156481 s. @@ -280,7 +280,7 @@ vectorclient_executor = VectorClientExecutor() res = vectorclient_executor( input="85236145389.wav", server_ip="127.0.0.1", - port=8590, + port=8090, task="spk") print(res) ``` @@ -298,7 +298,7 @@ print(res) - Command Line (Recommended) ``` bash - paddlespeech_client vector --task score --server_ip 127.0.0.1 --port 8590 --enroll 85236145389.wav --test 123456789.wav + paddlespeech_client vector --task score --server_ip 127.0.0.1 --port 8090 --enroll 85236145389.wav --test 123456789.wav ``` * Usage: @@ -320,7 +320,7 @@ print(res) ``` bash [2022-05-09 10:28:40,556] [ INFO] - vector score http client start [2022-05-09 10:28:40,556] [ INFO] - enroll audio: 85236145389.wav, test audio: 123456789.wav - [2022-05-09 10:28:40,556] [ INFO] - endpoint: http://127.0.0.1:8590/paddlespeech/vector/score + [2022-05-09 10:28:40,556] [ INFO] - endpoint: http://127.0.0.1:8090/paddlespeech/vector/score [2022-05-09 10:28:40,731] [ INFO] - The vector score is: {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'score': 0.4292638897895813}} [2022-05-09 10:28:40,731] [ INFO] - The vector: None [2022-05-09 10:28:40,731] [ INFO] - Response time 0.175514 s. 
@@ -337,7 +337,7 @@ res = vectorclient_executor( enroll_audio="85236145389.wav", test_audio="123456789.wav", server_ip="127.0.0.1", - port=8590, + port=8090, task="score") print(res) ``` @@ -347,7 +347,7 @@ print(res) ``` bash [2022-05-09 10:34:54,769] [ INFO] - vector score http client start [2022-05-09 10:34:54,771] [ INFO] - enroll audio: 85236145389.wav, test audio: 123456789.wav -[2022-05-09 10:34:54,771] [ INFO] - endpoint: http://127.0.0.1:8590/paddlespeech/vector/score +[2022-05-09 10:34:54,771] [ INFO] - endpoint: http://127.0.0.1:8090/paddlespeech/vector/score [2022-05-09 10:34:55,026] [ INFO] - The vector score is: {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'score': 0.4292638897895813}} ``` @@ -385,7 +385,7 @@ print(res) res = textclient_executor( input="我认为跑步最重要的就是给我带来了身体健康", server_ip="127.0.0.1", - port=8390,) + port=8090,) print(res) ``` diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index 5f24239d..8fa67c0d 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -17,7 +17,7 @@ ### 2. 
准备配置文件 配置文件可参见 `conf/application.yaml` 。 其中,`engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 -目前服务集成的语音任务有: asr(语音识别)、tts(语音合成)以及cls(音频分类)。 +目前服务集成的语音任务有: asr(语音识别)、tts(语音合成)、cls(音频分类)、vector(声纹识别)以及text(文本处理)。 目前引擎类型支持两种形式:python 及 inference (Paddle Inference) **注意:** 如果在容器里可正常启动服务,但客户端访问 ip 不可达,可尝试将配置文件中 `host` 地址换成本地 ip 地址。 @@ -248,7 +248,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee * 命令行 (推荐使用) ``` bash -paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8590 --input 85236145389.wav +paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8090 --input 85236145389.wav ``` * 使用帮助: @@ -268,7 +268,7 @@ paddlespeech_client vector --help ``` bash [2022-05-08 00:18:44,249] [ INFO] - vector http client start [2022-05-08 00:18:44,250] [ INFO] - the input audio: 85236145389.wav - [2022-05-08 00:18:44,250] [ INFO] - endpoint: http://127.0.0.1:8590/paddlespeech/vector + [2022-05-08 00:18:44,250] [ INFO] - endpoint: http://127.0.0.1:8090/paddlespeech/vector [2022-05-08 00:18:44,250] [ INFO] - http://127.0.0.1:8590/paddlespeech/vector [2022-05-08 00:18:44,406] [ INFO] - The vector: {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'vec': [1.421751856803894, 5.626245498657227, -5.342077255249023, 1.1773887872695923, 3.3080549240112305, 1.7565933465957642, 5.167886257171631, 10.806358337402344, -3.8226819038391113, -5.614140033721924, 2.6238479614257812, -0.8072972893714905, 1.9635076522827148, -7.312870025634766, 0.011035939678549767, -9.723129272460938, 0.6619706153869629, -6.976806163787842, 10.213476181030273, 7.494769096374512, 2.9105682373046875, 3.8949244022369385, 3.799983501434326, 7.106168746948242, 16.90532875061035, -7.149388313293457, 8.733108520507812, 3.423006296157837, -4.831653594970703, -11.403363227844238, 11.232224464416504, 7.127461910247803, -4.282842636108398, 2.452359437942505, -5.130749702453613, -18.17766761779785, -2.6116831302642822, 
-11.000344276428223, -6.731433391571045, 1.6564682722091675, 0.7618281245231628, 1.125300407409668, -2.0838370323181152, 4.725743293762207, -8.782588005065918, -3.5398752689361572, 3.8142364025115967, 5.142068862915039, 2.1620609760284424, 4.09643030166626, -6.416214942932129, 12.747446060180664, 1.9429892301559448, -15.15294361114502, 6.417416095733643, 16.09701156616211, -9.716667175292969, -1.9920575618743896, -3.36494779586792, -1.8719440698623657, 11.567351341247559, 3.6978814601898193, 11.258262634277344, 7.442368507385254, 9.183408737182617, 4.528149127960205, -1.2417854070663452, 4.395912170410156, 6.6727728843688965, 5.88988733291626, 7.627128601074219, -0.6691966652870178, -11.889698028564453, -9.20886516571045, -7.42740535736084, -3.777663230895996, 6.917238712310791, -9.848755836486816, -2.0944676399230957, -5.1351165771484375, 0.4956451654434204, 9.317537307739258, -5.914181232452393, -1.809860348701477, -0.11738915741443634, -7.1692705154418945, -1.057827353477478, -5.721670627593994, -5.117385387420654, 16.13765525817871, -4.473617076873779, 7.6624321937561035, -0.55381840467453, 9.631585121154785, -6.470459461212158, -8.548508644104004, 4.371616840362549, -0.7970245480537415, 4.4789886474609375, -2.975860834121704, 3.2721822261810303, 2.838287830352783, 5.134591102600098, -9.19079875946045, -0.5657302737236023, -4.8745832443237305, 2.3165574073791504, -5.984319686889648, -2.1798853874206543, 0.3554139733314514, -0.3178512752056122, 9.493552207946777, 2.1144471168518066, 4.358094692230225, -12.089824676513672, 8.451693534851074, -7.925466537475586, 4.624246597290039, 4.428936958312988, 18.69200897216797, -2.6204581260681152, -5.14918851852417, -0.3582090139389038, 8.488558769226074, 4.98148775100708, -9.326835632324219, -2.2544219493865967, 6.641760349273682, 1.2119598388671875, 10.977124214172363, 16.555034637451172, 3.3238420486450195, 9.551861763000488, -1.6676981449127197, -0.7953944206237793, -8.605667114257812, -0.4735655188560486, 
2.674196243286133, -5.359177112579346, -2.66738224029541, 0.6660683155059814, 15.44322681427002, 4.740593433380127, -3.472534418106079, 11.592567443847656, -2.0544962882995605, 1.736127495765686, -8.265326499938965, -9.30447769165039, 5.406829833984375, -1.518022894859314, -7.746612548828125, -6.089611053466797, 0.07112743705511093, -0.3490503430366516, -8.64989185333252, -9.998957633972168, -2.564845085144043, -0.5399947762489319, 2.6018123626708984, -0.3192799389362335, -1.8815255165100098, -2.0721492767333984, -3.410574436187744, -8.29980754852295, 1.483638048171997, -15.365986824035645, -8.288211822509766, 3.884779930114746, -3.4876468181610107, 7.362999439239502, 0.4657334089279175, 3.1326050758361816, 12.438895225524902, -1.8337041139602661, 4.532927989959717, 2.7264339923858643, 10.14534854888916, -6.521963596343994, 2.897155523300171, -3.392582654953003, 5.079153060913086, 7.7597246170043945, 4.677570819854736, 5.845779895782471, 2.402411460876465, 7.7071051597595215, 3.9711380004882812, -6.39003849029541, 6.12687873840332, -3.776029348373413, -11.118121147155762]}} [2022-05-08 00:18:44,406] [ INFO] - Response time 0.156481 s. 
@@ -283,7 +283,7 @@ vectorclient_executor = VectorClientExecutor() res = vectorclient_executor( input="85236145389.wav", server_ip="127.0.0.1", - port=8590, + port=8090, task="spk") print(res) ``` @@ -300,7 +300,7 @@ print(res) * 命令行 (推荐使用) ``` bash -paddlespeech_client vector --task score --server_ip 127.0.0.1 --port 8590 --enroll 85236145389.wav --test 123456789.wav +paddlespeech_client vector --task score --server_ip 127.0.0.1 --port 8090 --enroll 85236145389.wav --test 123456789.wav ``` * 使用帮助: @@ -321,7 +321,7 @@ paddlespeech_client vector --help ``` bash [2022-05-09 10:28:40,556] [ INFO] - vector score http client start [2022-05-09 10:28:40,556] [ INFO] - enroll audio: 85236145389.wav, test audio: 123456789.wav - [2022-05-09 10:28:40,556] [ INFO] - endpoint: http://127.0.0.1:8590/paddlespeech/vector/score + [2022-05-09 10:28:40,556] [ INFO] - endpoint: http://127.0.0.1:8090/paddlespeech/vector/score [2022-05-09 10:28:40,731] [ INFO] - The vector score is: {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'score': 0.4292638897895813}} [2022-05-09 10:28:40,731] [ INFO] - The vector: None [2022-05-09 10:28:40,731] [ INFO] - Response time 0.175514 s. 
@@ -338,7 +338,7 @@ res = vectorclient_executor( enroll_audio="85236145389.wav", test_audio="123456789.wav", server_ip="127.0.0.1", - port=8590, + port=8090, task="score") print(res) ``` @@ -385,7 +385,7 @@ print(res) res = textclient_executor( input="我认为跑步最重要的就是给我带来了身体健康", server_ip="127.0.0.1", - port=8390,) + port=8090,) print(res) ``` From 5656768c9508894965ccfde4e6374b0a8d6364be Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 10 May 2022 10:10:04 +0800 Subject: [PATCH 49/93] streaming asr server using paddlespeech_server/client --- demos/streaming_asr_server/server.sh | 7 +++++-- demos/streaming_asr_server/test.sh | 6 ++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/demos/streaming_asr_server/server.sh b/demos/streaming_asr_server/server.sh index 04858321..4266f8c6 100755 --- a/demos/streaming_asr_server/server.sh +++ b/demos/streaming_asr_server/server.sh @@ -1,5 +1,8 @@ export CUDA_VISIBLE_DEVICE=0,1,2,3 + export CUDA_VISIBLE_DEVICE=0,1,2,3 -nohup python3 punc_server.py --config_file conf/punc_application.yaml > punc.log 2>&1 & +# nohup python3 punc_server.py --config_file conf/punc_application.yaml > punc.log 2>&1 & +paddlespeech_server start --config_file conf/punc_application.yaml &> punc.log & -nohup python3 streaming_asr_server.py --config_file conf/ws_conformer_application.yaml > streaming_asr.log 2>&1 & +# nohup python3 streaming_asr_server.py --config_file conf/ws_conformer_application.yaml > streaming_asr.log 2>&1 & +paddlespeech_server start --config_file conf/ws_conformer_application.yaml &> streaming_asr.log & \ No newline at end of file diff --git a/demos/streaming_asr_server/test.sh b/demos/streaming_asr_server/test.sh index 912d67a2..c7b57e9b 100755 --- a/demos/streaming_asr_server/test.sh +++ b/demos/streaming_asr_server/test.sh @@ -2,7 +2,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav # read the wav and pass it to only streaming asr service -python3 websocket_client.py --server_ip 127.0.0.1 --port 
8290 --wavfile ./zh.wav +# python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --wavfile ./zh.wav +paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --input ./zh.wav # read the wav and call streaming and punc service -python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav +# python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav +paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav \ No newline at end of file From 1eab2b86983a942733f07e1752f226bb8f0d6ee3 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 10 May 2022 10:19:21 +0800 Subject: [PATCH 50/93] update asr port --- demos/streaming_asr_server/conf/application.yaml | 4 ++-- demos/streaming_asr_server/conf/ws_application.yaml | 1 + demos/streaming_asr_server/conf/ws_conformer_application.yaml | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/demos/streaming_asr_server/conf/application.yaml b/demos/streaming_asr_server/conf/application.yaml index 50c7a727..f576d704 100644 --- a/demos/streaming_asr_server/conf/application.yaml +++ b/demos/streaming_asr_server/conf/application.yaml @@ -21,7 +21,7 @@ engine_list: ['asr_online'] ################################### ASR ######################################### ################### speech task: asr; engine_type: online ####################### asr_online: - model_type: 'conformer_online_multicn' + model_type: 'conformer_online_wenetspeech' am_model: # the pdmodel file of am static model [optional] am_params: # the pdiparams file of am static model [optional] lang: 'zh' @@ -29,7 +29,7 @@ asr_online: cfg_path: decode_method: force_yes: True - device: # cpu or gpu:id + device: cpu # cpu or gpu:id am_predictor_conf: device: # set 'gpu:id' or 'cpu' switch_ir_optim: True diff --git 
a/demos/streaming_asr_server/conf/ws_application.yaml b/demos/streaming_asr_server/conf/ws_application.yaml index fc02f2ca..f2ea6330 100644 --- a/demos/streaming_asr_server/conf/ws_application.yaml +++ b/demos/streaming_asr_server/conf/ws_application.yaml @@ -29,6 +29,7 @@ asr_online: cfg_path: decode_method: force_yes: True + device: 'cpu' # cpu or gpu:id am_predictor_conf: device: # set 'gpu:id' or 'cpu' diff --git a/demos/streaming_asr_server/conf/ws_conformer_application.yaml b/demos/streaming_asr_server/conf/ws_conformer_application.yaml index 20a50008..2affde07 100644 --- a/demos/streaming_asr_server/conf/ws_conformer_application.yaml +++ b/demos/streaming_asr_server/conf/ws_conformer_application.yaml @@ -4,7 +4,7 @@ # SERVER SETTING # ################################################################################# host: 0.0.0.0 -port: 8290 +port: 8090 # The task format in the engin_list is: _ # task choices = ['asr_online'] From 2381d2b078ad31cdbc866a7d08bcd412c3a820ae Mon Sep 17 00:00:00 2001 From: root Date: Tue, 10 May 2022 11:00:52 +0000 Subject: [PATCH 51/93] add cacu rtf scripts, test=doc --- tests/unit/cli/aishell_test_prepare.py | 142 +++++++++++++++++++++++++ tests/unit/cli/cacu_rtf_by_aishell.sh | 27 +++++ 2 files changed, 169 insertions(+) create mode 100644 tests/unit/cli/aishell_test_prepare.py create mode 100644 tests/unit/cli/cacu_rtf_by_aishell.sh diff --git a/tests/unit/cli/aishell_test_prepare.py b/tests/unit/cli/aishell_test_prepare.py new file mode 100644 index 00000000..288de62a --- /dev/null +++ b/tests/unit/cli/aishell_test_prepare.py @@ -0,0 +1,142 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare Aishell mandarin dataset + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +import argparse +import codecs +import json +import os +from pathlib import Path + +import soundfile + +from utils.utility import download +from utils.utility import unpack + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL_ROOT = 'http://www.openslr.org/resources/33' +# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33' +DATA_URL = URL_ROOT + '/data_aishell.tgz' +MD5_DATA = '2f494334227864a8a8fec932999db9d8' +RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz' +MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5' + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/Aishell", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path_prefix): + print("Creating manifest %s ..." 
% manifest_path_prefix) + json_lines = [] + transcript_path = os.path.join(data_dir, 'transcript', + 'aishell_transcript_v0.8.txt') + transcript_dict = {} + for line in codecs.open(transcript_path, 'r', 'utf-8'): + line = line.strip() + if line == '': + continue + audio_id, text = line.split(' ', 1) + # remove withespace, charactor text + text = ''.join(text.split()) + transcript_dict[audio_id] = text + + data_types = ['test'] + for dtype in data_types: + del json_lines[:] + total_sec = 0.0 + total_text = 0.0 + total_num = 0 + + audio_dir = os.path.join(data_dir, 'wav', dtype) + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + for fname in filelist: + audio_path = os.path.abspath(os.path.join(subfolder, fname)) + audio_id = os.path.basename(fname)[:-4] + # if no transcription for audio then skipped + if audio_id not in transcript_dict: + continue + + utt2spk = Path(audio_path).parent.name + audio_data, samplerate = soundfile.read(audio_path) + duration = float(len(audio_data) / samplerate) + text = transcript_dict[audio_id] + json_lines.append(audio_path) + + total_sec += duration + total_text += len(text) + total_num += 1 + + manifest_path = manifest_path_prefix + '.' + dtype + with codecs.open(manifest_path, 'w', 'utf-8') as fout: + for line in json_lines: + fout.write(line + '\n') + + manifest_dir = os.path.dirname(manifest_path_prefix) + +def prepare_dataset(url, md5sum, target_dir, manifest_path=None): + """Download, unpack and create manifest file.""" + data_dir = os.path.join(target_dir, 'data_aishell') + if not os.path.exists(data_dir): + filepath = download(url, md5sum, target_dir) + unpack(filepath, target_dir) + # unpack all audio tar files + audio_dir = os.path.join(data_dir, 'wav') + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + for ftar in filelist: + unpack(os.path.join(subfolder, ftar), subfolder, True) + else: + print("Skip downloading and unpacking. Data already exists in %s." 
% + target_dir) + + if manifest_path: + create_manifest(data_dir, manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset( + url=DATA_URL, + md5sum=MD5_DATA, + target_dir=args.target_dir, + manifest_path=args.manifest_prefix) + + prepare_dataset( + url=RESOURCE_URL, + md5sum=MD5_RESOURCE, + target_dir=args.target_dir, + manifest_path=None) + + print("Data download and manifest prepare done!") + + +if __name__ == '__main__': + main() diff --git a/tests/unit/cli/cacu_rtf_by_aishell.sh b/tests/unit/cli/cacu_rtf_by_aishell.sh new file mode 100644 index 00000000..b9d68352 --- /dev/null +++ b/tests/unit/cli/cacu_rtf_by_aishell.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +stage=-1 +stop_stage=100 +MAIN_ROOT=../../.. + +. ${MAIN_ROOT}/utils/parse_options.sh || exit -1; +TARGET_DIR=${MAIN_ROOT}/dataset +mkdir -p ${TARGET_DIR} +mkdir -p data + +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + # download data, generate manifests + python3 aishell_test_prepare.py \ + --manifest_prefix="data/manifest" \ + --target_dir="${TARGET_DIR}/aishell" + + if [ $? -ne 0 ]; then + echo "Prepare Aishell failed. Terminated." 
+ exit 1 + fi + +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + cat data/manifest.test | paddlespeech asr --model conformer_online_aishell --rtf -v +fi From d4f863dc9730d357273da5a8740cfb3f06d4cf40 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Tue, 10 May 2022 21:08:39 +0800 Subject: [PATCH 52/93] improve, test=doc --- .../server/bin/paddlespeech_client.py | 52 +++++++++++- .../server/bin/paddlespeech_server.py | 11 ++- paddlespeech/server/utils/audio_handler.py | 30 +++++-- paddlespeech/server/utils/util.py | 20 +++++ paddlespeech/server/ws/tts_api.py | 84 +++++++++++++------ 5 files changed, 161 insertions(+), 36 deletions(-) diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 19bdc10b..f0a206b7 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -18,6 +18,7 @@ import io import json import os import random +import sys import time from typing import List @@ -32,6 +33,7 @@ from paddlespeech.cli.log import logger from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler from paddlespeech.server.utils.audio_process import wav2pcm from paddlespeech.server.utils.util import compute_delay +from paddlespeech.server.utils.util import network_reachable from paddlespeech.server.utils.util import wav2base64 __all__ = [ @@ -128,6 +130,7 @@ class TTSClientExecutor(BaseExecutor): return True except Exception as e: logger.error("Failed to synthesized audio.") + logger.error(e) return False @stats_wrapper @@ -154,6 +157,12 @@ class TTSClientExecutor(BaseExecutor): "save_path": output } + # Check if the network is reachable + network = 'http://' + server_ip + ":" + str(port) + if network_reachable(network) is not True: + logger.error(f"{network} unreachable, please check the ip address.") + sys.exit(-1) + res = requests.post(url, json.dumps(request)) response_dict = res.json() if output is not None: @@ -236,6 +245,7 @@ class 
TTSOnlineClientExecutor(BaseExecutor): return True except Exception as e: logger.error("Failed to synthesized audio.") + logger.error(e) return False @stats_wrapper @@ -254,6 +264,12 @@ class TTSOnlineClientExecutor(BaseExecutor): Python API to call an executor. """ + # Check if the network is reachable + network = 'http://' + server_ip + ":" + str(port) + if network_reachable(network) is not True: + logger.error(f"{network} unreachable, please check the ip address.") + sys.exit(-1) + if protocol == "http": logger.info("tts http client start") from paddlespeech.server.utils.audio_handler import TTSHttpHandler @@ -275,7 +291,7 @@ class TTSOnlineClientExecutor(BaseExecutor): else: logger.error("Please set correct protocol, http or websocket") - return False + sys.exit(-1) logger.info(f"sentence: {input}") logger.info(f"duration: {duration} s") @@ -399,6 +415,13 @@ class ASRClientExecutor(BaseExecutor): # and paddlespeech_client asr only support http protocol protocol = "http" if protocol.lower() == "http": + # Check if the network is reachable + network = 'http://' + server_ip + ":" + str(port) + if network_reachable(network) is not True: + logger.error( + f"{network} unreachable, please check the ip address.") + sys.exit(-1) + from paddlespeech.server.utils.audio_handler import ASRHttpHandler logger.info("asr http client start") handler = ASRHttpHandler(server_ip=server_ip, port=port) @@ -503,6 +526,13 @@ class ASROnlineClientExecutor(BaseExecutor): Returns: str: the audio text """ + + # Check if the network is reachable + network = 'http://' + server_ip + ":" + str(port) + if network_reachable(network) is not True: + logger.error(f"{network} unreachable, please check the ip address.") + sys.exit(-1) + logger.info("asr websocket client start") handler = ASRWsAudioHandler( server_ip, @@ -555,6 +585,7 @@ class CLSClientExecutor(BaseExecutor): return True except Exception as e: logger.error("Failed to speech classification.") + logger.error(e) return False 
@stats_wrapper @@ -567,6 +598,12 @@ class CLSClientExecutor(BaseExecutor): Python API to call an executor. """ + # Check if the network is reachable + network = 'http://' + server_ip + ":" + str(port) + if network_reachable(network) is not True: + logger.error(f"{network} unreachable, please check the ip address.") + sys.exit(-1) + url = 'http://' + server_ip + ":" + str(port) + '/paddlespeech/cls' audio = wav2base64(input) data = {"audio": audio, "topk": topk} @@ -632,6 +669,12 @@ class TextClientExecutor(BaseExecutor): str: the punctuation text """ + # Check if the network is reachable + network = 'http://' + server_ip + ":" + str(port) + if network_reachable(network) is not True: + logger.error(f"{network} unreachable, please check the ip address.") + sys.exit(-1) + url = 'http://' + server_ip + ":" + str(port) + '/paddlespeech/text' request = { "text": input, @@ -728,6 +771,13 @@ class VectorClientExecutor(BaseExecutor): Returns: str: the audio embedding or score between enroll and test audio """ + + # Check if the network is reachable + network = 'http://' + server_ip + ":" + str(port) + if network_reachable(network) is not True: + logger.error(f"{network} unreachable, please check the ip address.") + sys.exit(-1) + if task == "spk": from paddlespeech.server.utils.audio_handler import VectorHttpHandler logger.info("vector http client start") diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index 9e3b0ed0..db92f179 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import argparse +import sys from typing import List import uvicorn @@ -79,10 +80,12 @@ class ServerExecutor(BaseExecutor): def execute(self, argv: List[str]) -> bool: args = self.parser.parse_args(argv) - config = get_config(args.config_file) - - if self.init(config): - uvicorn.run(app, host=config.host, port=config.port, debug=True) + try: + self(args.config_file, args.log_file) + except Exception as e: + logger.error("Failed to start server.") + logger.error(e) + sys.exit(-1) @stats_wrapper def __call__(self, diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index 2bce28e3..93d44626 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -304,18 +304,24 @@ class TTSWsHandler: receive_time_list = [] chunk_duration_list = [] - # 1. Send websocket handshake protocal + # 1. Send websocket handshake request async with websockets.connect(self.url) as ws: - # 2. Server has already received handshake protocal - # send text to engine + # 2. Server has already received handshake response, send start request + start_request = json.dumps({"task": "tts", "signal": "start"}) + await ws.send(start_request) + msg = await ws.recv() + logger.info(f"client receive msg={msg}") + msg = json.loads(msg) + session = msg["session"] + + # 3. send speech synthesis request text_base64 = str(base64.b64encode((text).encode('utf-8')), "UTF8") - d = {"text": text_base64} - d = json.dumps(d) + request = json.dumps({"text": text_base64}) st = time.time() - await ws.send(d) + await ws.send(request) logging.info("send a message to the server") - # 3. Process the received response + # Process the received response message = await ws.recv() first_response = time.time() - st message = json.loads(message) @@ -348,6 +354,15 @@ class TTSWsHandler: save_audio_success = save_audio(all_bytes, output) else: save_audio_success = False + + # 5. 
send end request + end_request = json.dumps({ + "task": "tts", + "signal": "end", + "session": session + }) + await ws.send(end_request) + else: logger.error("infer error") @@ -458,6 +473,7 @@ class TTSHttpHandler: final_response = time.time() - st duration = len(all_bytes) / 2.0 / 24000 + html.close() # when stream=True if output is not None: save_audio_success = save_audio(all_bytes, output) diff --git a/paddlespeech/server/utils/util.py b/paddlespeech/server/utils/util.py index 061b213c..3f2603b4 100644 --- a/paddlespeech/server/utils/util.py +++ b/paddlespeech/server/utils/util.py @@ -13,6 +13,8 @@ import base64 import math +import requests + def wav2base64(wav_file: str): """ @@ -146,3 +148,21 @@ def count_engine(logfile: str="./nohup.out"): print( f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s" ) + + +def network_reachable(url: str, timeout: int=5) -> bool: + """Check if the network is reachable + + Args: + url (str): http://server_ip:port or ws://server_ip:port + timeout (int, optional): timeout. Defaults to 5. + + Returns: + bool: Whether the network is reachable. + """ + try: + request = requests.get(url, timeout=timeout) + return True + except (requests.ConnectionError, requests.Timeout) as exception: + print(exception) + return False diff --git a/paddlespeech/server/ws/tts_api.py b/paddlespeech/server/ws/tts_api.py index 20a63d4c..33b30ce4 100644 --- a/paddlespeech/server/ws/tts_api.py +++ b/paddlespeech/server/ws/tts_api.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import json +import uuid from fastapi import APIRouter from fastapi import WebSocket @@ -26,36 +27,71 @@ router = APIRouter() @router.websocket('/paddlespeech/tts/streaming') async def websocket_endpoint(websocket: WebSocket): + """PaddleSpeech Online TTS Server api + + Args: + websocket (WebSocket): the websocket instance + """ + + #1. 
the interface wait to accept the websocket protocal header + # and only we receive the header, it establish the connection with specific thread await websocket.accept() + #2. if we accept the websocket headers, we will get the online tts engine instance + engine_pool = get_engine_pool() + tts_engine = engine_pool['tts'] + try: - # careful here, changed the source code from starlette.websockets - assert websocket.application_state == WebSocketState.CONNECTED - message = await websocket.receive() - websocket._raise_on_disconnect(message) + while True: + # careful here, changed the source code from starlette.websockets + assert websocket.application_state == WebSocketState.CONNECTED + message = await websocket.receive() + websocket._raise_on_disconnect(message) + message = json.loads(message["text"]) - # get engine - engine_pool = get_engine_pool() - tts_engine = engine_pool['tts'] + if 'signal' in message: + # start request + if message['signal'] == 'start': + session = uuid.uuid1().hex + resp = { + "status": 0, + "signal": "server ready", + "session": session + } + await websocket.send_json(resp) - # 获取 message 并转文本 - message = json.loads(message["text"]) - text_bese64 = message["text"] - sentence = tts_engine.preprocess(text_bese64=text_bese64) + # end request + elif message['signal'] == 'end': + resp = { + "status": 0, + "signal": "connection will be closed", + "session": session + } + await websocket.send_json(resp) - # run - wav_generator = tts_engine.run(sentence) + # speech synthesis request + elif 'text' in message: + text_bese64 = message["text"] + sentence = tts_engine.preprocess(text_bese64=text_bese64) - while True: - try: - tts_results = next(wav_generator) - resp = {"status": 1, "audio": tts_results} - await websocket.send_json(resp) - except StopIteration as e: - resp = {"status": 2, "audio": ''} - await websocket.send_json(resp) - logger.info("Complete the transmission of audio streams") - break + # run + wav_generator = tts_engine.run(sentence) + + 
while True: + try: + tts_results = next(wav_generator) + resp = {"status": 1, "audio": tts_results} + await websocket.send_json(resp) + except StopIteration as e: + resp = {"status": 2, "audio": ''} + await websocket.send_json(resp) + logger.info( + "Complete the transmission of audio streams") + break + + else: + logger.error( + "Invalid request, please check if the request is correct.") except WebSocketDisconnect: - pass \ No newline at end of file + pass From 347af638e28301cfefe1e0608ba28ebf02bbb5ad Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 11 May 2022 09:43:40 +0800 Subject: [PATCH 53/93] changet vector train.py local_rank to rank, test=doc --- paddlespeech/vector/exps/ecapa_tdnn/train.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/paddlespeech/vector/exps/ecapa_tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py index aad148a9..bf014045 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/train.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py @@ -54,7 +54,7 @@ def main(args, config): # stage1: we must call the paddle.distributed.init_parallel_env() api at the begining paddle.distributed.init_parallel_env() nranks = paddle.distributed.get_world_size() - local_rank = paddle.distributed.get_rank() + rank = paddle.distributed.get_rank() # set the random seed, it is the necessary measures for multiprocess training seed_everything(config.seed) @@ -112,10 +112,10 @@ def main(args, config): state_dict = paddle.load( os.path.join(args.load_checkpoint, 'model.pdopt')) optimizer.set_state_dict(state_dict) - if local_rank == 0: + if rank == 0: logger.info(f'Checkpoint loaded from {args.load_checkpoint}') except FileExistsError: - if local_rank == 0: + if rank == 0: logger.info('Train from scratch.') try: @@ -219,7 +219,7 @@ def main(args, config): timer.count() # step plus one in timer # stage 9-10: print the log information only on 0-rank per log-freq batchs - if (batch_idx + 1) % config.log_interval == 0 and 
local_rank == 0: + if (batch_idx + 1) % config.log_interval == 0 and rank == 0: lr = optimizer.get_lr() avg_loss /= config.log_interval avg_acc = num_corrects / num_samples @@ -250,7 +250,7 @@ def main(args, config): # stage 9-11: save the model parameters only on 0-rank per save-freq batchs if epoch % config.save_interval == 0 and batch_idx + 1 == steps_per_epoch: - if local_rank != 0: + if rank != 0: paddle.distributed.barrier( ) # Wait for valid step in main process continue # Resume trainning on other process @@ -317,7 +317,7 @@ def main(args, config): paddle.distributed.barrier() # Main process # stage 10: create the final trained model.pdparams with soft link - if local_rank == 0: + if rank == 0: final_model = os.path.join(args.checkpoint_dir, "model.pdparams") logger.info(f"we will create the final model: {final_model}") if os.path.islink(final_model): From b1f9b8016dbe4a975cff6e51e951ce6a0f1c297e Mon Sep 17 00:00:00 2001 From: lym0302 Date: Wed, 11 May 2022 10:41:19 +0800 Subject: [PATCH 54/93] add start and end request on ws tts, test=doc --- .../server/bin/paddlespeech_client.py | 44 ------------------- paddlespeech/server/utils/util.py | 20 --------- paddlespeech/server/ws/tts_api.py | 8 +++- 3 files changed, 7 insertions(+), 65 deletions(-) diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index f0a206b7..6d730959 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -33,7 +33,6 @@ from paddlespeech.cli.log import logger from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler from paddlespeech.server.utils.audio_process import wav2pcm from paddlespeech.server.utils.util import compute_delay -from paddlespeech.server.utils.util import network_reachable from paddlespeech.server.utils.util import wav2base64 __all__ = [ @@ -157,12 +156,6 @@ class TTSClientExecutor(BaseExecutor): "save_path": output } - # Check if the network 
is reachable - network = 'http://' + server_ip + ":" + str(port) - if network_reachable(network) is not True: - logger.error(f"{network} unreachable, please check the ip address.") - sys.exit(-1) - res = requests.post(url, json.dumps(request)) response_dict = res.json() if output is not None: @@ -264,12 +257,6 @@ class TTSOnlineClientExecutor(BaseExecutor): Python API to call an executor. """ - # Check if the network is reachable - network = 'http://' + server_ip + ":" + str(port) - if network_reachable(network) is not True: - logger.error(f"{network} unreachable, please check the ip address.") - sys.exit(-1) - if protocol == "http": logger.info("tts http client start") from paddlespeech.server.utils.audio_handler import TTSHttpHandler @@ -415,13 +402,6 @@ class ASRClientExecutor(BaseExecutor): # and paddlespeech_client asr only support http protocol protocol = "http" if protocol.lower() == "http": - # Check if the network is reachable - network = 'http://' + server_ip + ":" + str(port) - if network_reachable(network) is not True: - logger.error( - f"{network} unreachable, please check the ip address.") - sys.exit(-1) - from paddlespeech.server.utils.audio_handler import ASRHttpHandler logger.info("asr http client start") handler = ASRHttpHandler(server_ip=server_ip, port=port) @@ -527,12 +507,6 @@ class ASROnlineClientExecutor(BaseExecutor): str: the audio text """ - # Check if the network is reachable - network = 'http://' + server_ip + ":" + str(port) - if network_reachable(network) is not True: - logger.error(f"{network} unreachable, please check the ip address.") - sys.exit(-1) - logger.info("asr websocket client start") handler = ASRWsAudioHandler( server_ip, @@ -598,12 +572,6 @@ class CLSClientExecutor(BaseExecutor): Python API to call an executor. 
""" - # Check if the network is reachable - network = 'http://' + server_ip + ":" + str(port) - if network_reachable(network) is not True: - logger.error(f"{network} unreachable, please check the ip address.") - sys.exit(-1) - url = 'http://' + server_ip + ":" + str(port) + '/paddlespeech/cls' audio = wav2base64(input) data = {"audio": audio, "topk": topk} @@ -669,12 +637,6 @@ class TextClientExecutor(BaseExecutor): str: the punctuation text """ - # Check if the network is reachable - network = 'http://' + server_ip + ":" + str(port) - if network_reachable(network) is not True: - logger.error(f"{network} unreachable, please check the ip address.") - sys.exit(-1) - url = 'http://' + server_ip + ":" + str(port) + '/paddlespeech/text' request = { "text": input, @@ -772,12 +734,6 @@ class VectorClientExecutor(BaseExecutor): str: the audio embedding or score between enroll and test audio """ - # Check if the network is reachable - network = 'http://' + server_ip + ":" + str(port) - if network_reachable(network) is not True: - logger.error(f"{network} unreachable, please check the ip address.") - sys.exit(-1) - if task == "spk": from paddlespeech.server.utils.audio_handler import VectorHttpHandler logger.info("vector http client start") diff --git a/paddlespeech/server/utils/util.py b/paddlespeech/server/utils/util.py index 3f2603b4..061b213c 100644 --- a/paddlespeech/server/utils/util.py +++ b/paddlespeech/server/utils/util.py @@ -13,8 +13,6 @@ import base64 import math -import requests - def wav2base64(wav_file: str): """ @@ -148,21 +146,3 @@ def count_engine(logfile: str="./nohup.out"): print( f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s" ) - - -def network_reachable(url: str, timeout: int=5) -> bool: - """Check if the network is reachable - - Args: - url (str): http://server_ip:port or ws://server_ip:port - timeout (int, optional): timeout. Defaults to 5. - - Returns: - bool: Whether the network is reachable. 
- """ - try: - request = requests.get(url, timeout=timeout) - return True - except (requests.ConnectionError, requests.Timeout) as exception: - print(exception) - return False diff --git a/paddlespeech/server/ws/tts_api.py b/paddlespeech/server/ws/tts_api.py index 33b30ce4..b43c7445 100644 --- a/paddlespeech/server/ws/tts_api.py +++ b/paddlespeech/server/ws/tts_api.py @@ -68,6 +68,10 @@ async def websocket_endpoint(websocket: WebSocket): "session": session } await websocket.send_json(resp) + break + else: + resp = {"status": 0, "signal": "no valid json data"} + await websocket.send_json(resp) # speech synthesis request elif 'text' in message: @@ -83,10 +87,12 @@ async def websocket_endpoint(websocket: WebSocket): resp = {"status": 1, "audio": tts_results} await websocket.send_json(resp) except StopIteration as e: + import pdb + pdb.set_trace() resp = {"status": 2, "audio": ''} await websocket.send_json(resp) logger.info( - "Complete the transmission of audio streams") + "Complete the synthesis of the audio streams") break else: From be21aed09ba9709ce1248c41bfe59c76670f1c6c Mon Sep 17 00:00:00 2001 From: lym0302 Date: Wed, 11 May 2022 13:15:37 +0800 Subject: [PATCH 55/93] trans remove file way, test=doc --- .../server/bin/paddlespeech_client.py | 2 +- .../server/tests/tts/offline/http_client.py | 2 +- paddlespeech/server/utils/audio_handler.py | 87 +++++++++++-------- paddlespeech/server/utils/audio_process.py | 2 +- paddlespeech/server/ws/tts_api.py | 11 +-- tests/unit/server/offline/change_yaml.py | 9 +- .../online/tts/check_server/change_yaml.py | 5 +- .../tts/test_server/test_http_client.py | 4 +- 8 files changed, 69 insertions(+), 53 deletions(-) diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 6d730959..3adf8015 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -92,7 +92,7 @@ class TTSClientExecutor(BaseExecutor): temp_wav = 
str(random.getrandbits(128)) + ".wav" soundfile.write(temp_wav, samples, sample_rate) wav2pcm(temp_wav, outfile, data_type=np.int16) - os.system("rm %s" % (temp_wav)) + os.remove(temp_wav) else: logger.error("The format for saving audio only supports wav or pcm") diff --git a/paddlespeech/server/tests/tts/offline/http_client.py b/paddlespeech/server/tests/tts/offline/http_client.py index 1bdee4c1..24109a0e 100644 --- a/paddlespeech/server/tests/tts/offline/http_client.py +++ b/paddlespeech/server/tests/tts/offline/http_client.py @@ -61,7 +61,7 @@ def tts_client(args): temp_wav = str(random.getrandbits(128)) + ".wav" soundfile.write(temp_wav, samples, sample_rate) wav2pcm(temp_wav, outfile, data_type=np.int16) - os.system("rm %s" % (temp_wav)) + os.remove(temp_wav) else: print("The format for saving audio only supports wav or pcm") diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index 93d44626..b85cf485 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -321,50 +321,63 @@ class TTSWsHandler: await ws.send(request) logging.info("send a message to the server") - # Process the received response + # 4. 
Process the received response message = await ws.recv() first_response = time.time() - st message = json.loads(message) status = message["status"] + while True: + # When throw an exception + if status == -1: + # send end request + end_request = json.dumps({ + "task": "tts", + "signal": "end", + "session": session + }) + await ws.send(end_request) + break - while (status == 1): - receive_time_list.append(time.time()) - audio = message["audio"] - audio = base64.b64decode(audio) # bytes - chunk_duration_list.append(len(audio) / 2.0 / 24000) - all_bytes += audio - if self.play: - self.mutex.acquire() - self.buffer += audio - self.mutex.release() - if self.start_play: - self.t.start() - self.start_play = False - - message = await ws.recv() - message = json.loads(message) - status = message["status"] - - # 4. Last packet, no audio information - if status == 2: - final_response = time.time() - st - duration = len(all_bytes) / 2.0 / 24000 - - if output is not None: - save_audio_success = save_audio(all_bytes, output) - else: - save_audio_success = False + # Rerutn last packet normally, no audio information + elif status == 2: + final_response = time.time() - st + duration = len(all_bytes) / 2.0 / 24000 + + if output is not None: + save_audio_success = save_audio(all_bytes, output) + else: + save_audio_success = False + + # send end request + end_request = json.dumps({ + "task": "tts", + "signal": "end", + "session": session + }) + await ws.send(end_request) + break - # 5. 
send end request - end_request = json.dumps({ - "task": "tts", - "signal": "end", - "session": session - }) - await ws.send(end_request) + # Return the audio stream normally + elif status == 1: + receive_time_list.append(time.time()) + audio = message["audio"] + audio = base64.b64decode(audio) # bytes + chunk_duration_list.append(len(audio) / 2.0 / 24000) + all_bytes += audio + if self.play: + self.mutex.acquire() + self.buffer += audio + self.mutex.release() + if self.start_play: + self.t.start() + self.start_play = False + + message = await ws.recv() + message = json.loads(message) + status = message["status"] - else: - logger.error("infer error") + else: + logger.error("infer error, return status is invalid.") if self.play: self.t.join() diff --git a/paddlespeech/server/utils/audio_process.py b/paddlespeech/server/utils/audio_process.py index bb02d664..416d77ac 100644 --- a/paddlespeech/server/utils/audio_process.py +++ b/paddlespeech/server/utils/audio_process.py @@ -167,7 +167,7 @@ def save_audio(bytes_data, audio_path, sample_rate: int=24000) -> bool: channels=1, bits=16, sample_rate=sample_rate) - os.system("rm ./tmp.pcm") + os.remove("./tmp.pcm") else: print("Only supports saved audio format is pcm or wav") return False diff --git a/paddlespeech/server/ws/tts_api.py b/paddlespeech/server/ws/tts_api.py index b43c7445..a3a4c4d4 100644 --- a/paddlespeech/server/ws/tts_api.py +++ b/paddlespeech/server/ws/tts_api.py @@ -16,7 +16,6 @@ import uuid from fastapi import APIRouter from fastapi import WebSocket -from fastapi import WebSocketDisconnect from starlette.websockets import WebSocketState as WebSocketState from paddlespeech.cli.log import logger @@ -87,17 +86,19 @@ async def websocket_endpoint(websocket: WebSocket): resp = {"status": 1, "audio": tts_results} await websocket.send_json(resp) except StopIteration as e: - import pdb - pdb.set_trace() resp = {"status": 2, "audio": ''} await websocket.send_json(resp) logger.info( "Complete the synthesis of the 
audio streams") break + except Exception as e: + resp = {"status": -1, "audio": ''} + await websocket.send_json(resp) + break else: logger.error( "Invalid request, please check if the request is correct.") - except WebSocketDisconnect: - pass + except Exception as e: + logger.error(e) diff --git a/tests/unit/server/offline/change_yaml.py b/tests/unit/server/offline/change_yaml.py index d51a6259..ded7e3b4 100644 --- a/tests/unit/server/offline/change_yaml.py +++ b/tests/unit/server/offline/change_yaml.py @@ -1,6 +1,7 @@ #!/usr/bin/python import argparse import os +import shutil import yaml @@ -14,7 +15,7 @@ def change_device(yamlfile: str, engine: str, device: str): model_type (dict): change model type """ tmp_yamlfile = yamlfile.split(".yaml")[0] + "_tmp.yaml" - os.system("cp %s %s" % (yamlfile, tmp_yamlfile)) + shutil.copyfile(yamlfile, tmp_yamlfile) if device == 'cpu': set_device = 'cpu' @@ -41,7 +42,7 @@ def change_device(yamlfile: str, engine: str, device: str): print(yaml.dump(y, default_flow_style=False, sort_keys=False)) yaml.dump(y, fw, allow_unicode=True) - os.system("rm %s" % (tmp_yamlfile)) + os.remove(tmp_yamlfile) print("Change %s successfully." % (yamlfile)) @@ -52,7 +53,7 @@ def change_engine_type(yamlfile: str, engine_type): task (str): asr or tts """ tmp_yamlfile = yamlfile.split(".yaml")[0] + "_tmp.yaml" - os.system("cp %s %s" % (yamlfile, tmp_yamlfile)) + shutil.copyfile(yamlfile, tmp_yamlfile) speech_task = engine_type.split("_")[0] with open(tmp_yamlfile) as f, open(yamlfile, "w+", encoding="utf-8") as fw: @@ -65,7 +66,7 @@ def change_engine_type(yamlfile: str, engine_type): y['engine_list'] = engine_list print(yaml.dump(y, default_flow_style=False, sort_keys=False)) yaml.dump(y, fw, allow_unicode=True) - os.system("rm %s" % (tmp_yamlfile)) + os.remove(tmp_yamlfile) print("Change %s successfully." 
% (yamlfile)) diff --git a/tests/unit/server/online/tts/check_server/change_yaml.py b/tests/unit/server/online/tts/check_server/change_yaml.py index 01351df0..b04ad0a8 100644 --- a/tests/unit/server/online/tts/check_server/change_yaml.py +++ b/tests/unit/server/online/tts/check_server/change_yaml.py @@ -1,6 +1,7 @@ #!/usr/bin/python import argparse import os +import shutil import yaml @@ -13,7 +14,7 @@ def change_value(args): target_value = args.target_value tmp_yamlfile = yamlfile.split(".yaml")[0] + "_tmp.yaml" - os.system("cp %s %s" % (yamlfile, tmp_yamlfile)) + shutil.copyfile(yamlfile, tmp_yamlfile) with open(tmp_yamlfile) as f, open(yamlfile, "w+", encoding="utf-8") as fw: y = yaml.safe_load(f) @@ -51,7 +52,7 @@ def change_value(args): print(yaml.dump(y, default_flow_style=False, sort_keys=False)) yaml.dump(y, fw, allow_unicode=True) - os.system("rm %s" % (tmp_yamlfile)) + os.remove(tmp_yamlfile) print(f"Change key: {target_key} to value: {target_value} successfully.") diff --git a/tests/unit/server/online/tts/test_server/test_http_client.py b/tests/unit/server/online/tts/test_server/test_http_client.py index 7fdb4e00..3174e85e 100644 --- a/tests/unit/server/online/tts/test_server/test_http_client.py +++ b/tests/unit/server/online/tts/test_server/test_http_client.py @@ -75,8 +75,8 @@ if __name__ == "__main__": args = parser.parse_args() - os.system("rm -rf %s" % (args.output_dir)) - os.mkdir(args.output_dir) + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) first_response_list = [] final_response_list = [] From e5b40b03c53cc60a56a109feb9b05bd83a48731f Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 11 May 2022 07:35:08 +0000 Subject: [PATCH 56/93] add streaming tts demo, test=doc --- docs/source/index.rst | 1 + docs/source/streaming_tts_demo_video.rst | 12 ++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 docs/source/streaming_tts_demo_video.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 
7f9c87bd..7741f17f 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -55,6 +55,7 @@ Contents demo_video tts_demo_video + streaming_tts_demo_video .. toctree:: diff --git a/docs/source/streaming_tts_demo_video.rst b/docs/source/streaming_tts_demo_video.rst new file mode 100644 index 00000000..3ad9ca6c --- /dev/null +++ b/docs/source/streaming_tts_demo_video.rst @@ -0,0 +1,12 @@ +Streaming TTS Demo Video +================== + +.. raw:: html + + + From 167aaa65b97471fe39e6bd3b0075d1c362ff4617 Mon Sep 17 00:00:00 2001 From: Jerryuhoo Date: Thu, 12 May 2022 14:39:50 +0800 Subject: [PATCH 57/93] normalize wav max value to 1 in preprocess, test=tts --- paddlespeech/t2s/exps/fastspeech2/preprocess.py | 5 ++++- paddlespeech/t2s/exps/gan_vocoder/preprocess.py | 5 ++++- paddlespeech/t2s/exps/speedyspeech/preprocess.py | 5 ++++- paddlespeech/t2s/exps/tacotron2/preprocess.py | 5 ++++- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py index db1842b2..5fc51365 100644 --- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py +++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py @@ -55,8 +55,11 @@ def process_sentence(config: Dict[str, Any], if utt_id in sentences: # reading, resampling may occur wav, _ = librosa.load(str(fp), sr=config.fs) - if len(wav.shape) != 1 or np.abs(wav).max() > 1.0: + if len(wav.shape) != 1: return record + max_value = np.abs(wav).max() + if max_value > 1.0: + wav = wav / max_value assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio." assert np.abs(wav).max( ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." 
diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index 4871bca7..8adab0fe 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -47,8 +47,11 @@ def process_sentence(config: Dict[str, Any], if utt_id in sentences: # reading, resampling may occur y, _ = librosa.load(str(fp), sr=config.fs) - if len(y.shape) != 1 or np.abs(y).max() > 1.0: + if len(y.shape) != 1: return record + max_value = np.abs(y).max() + if max_value > 1.0: + y = y / max_value assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio." assert np.abs(y).max( ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py index e833d139..6c6b443f 100644 --- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py +++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py @@ -47,8 +47,11 @@ def process_sentence(config: Dict[str, Any], if utt_id in sentences: # reading, resampling may occur wav, _ = librosa.load(str(fp), sr=config.fs) - if len(wav.shape) != 1 or np.abs(wav).max() > 1.0: + if len(wav.shape) != 1: return record + max_value = np.abs(wav).max() + if max_value > 1.0: + wav = wav / max_value assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio." assert np.abs(wav).max( ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." 
diff --git a/paddlespeech/t2s/exps/tacotron2/preprocess.py b/paddlespeech/t2s/exps/tacotron2/preprocess.py index 14a0d7ea..95349d59 100644 --- a/paddlespeech/t2s/exps/tacotron2/preprocess.py +++ b/paddlespeech/t2s/exps/tacotron2/preprocess.py @@ -51,8 +51,11 @@ def process_sentence(config: Dict[str, Any], if utt_id in sentences: # reading, resampling may occur wav, _ = librosa.load(str(fp), sr=config.fs) - if len(wav.shape) != 1 or np.abs(wav).max() > 1.0: + if len(wav.shape) != 1: return record + max_value = np.abs(wav).max() + if max_value > 1.0: + wav = wav / max_value assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio." assert np.abs(wav).max( ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." From 88501fc32a50b129156e2075be6b420bc9364e7b Mon Sep 17 00:00:00 2001 From: root Date: Thu, 12 May 2022 07:27:39 +0000 Subject: [PATCH 58/93] fix server doc and decode_method --- demos/streaming_asr_server/README.md | 72 ------------------- demos/streaming_asr_server/README_cn.md | 72 ------------------- .../server/engine/asr/online/asr_engine.py | 2 +- 3 files changed, 1 insertion(+), 145 deletions(-) diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index 8423e5d0..909f5a4c 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -57,42 +57,6 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/exp/chunk_conformer/checkpoints/multi_cn.pdparams [2022-04-21 15:52:19,446] [ INFO] - start to create the stream conformer asr engine [2022-04-21 15:52:19,473] [ INFO] - model name: conformer_online - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set 
kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform [2022-04-21 15:52:21,731] [ INFO] - create the transformer like model success [2022-04-21 15:52:21,733] [ INFO] - Initialize ASR server engine successfully. INFO: Started server process [11173] @@ -134,42 +98,6 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/exp/chunk_conformer/checkpoints/multi_cn.pdparams [2022-04-21 15:52:19,446] [ INFO] - start to create the stream conformer asr engine [2022-04-21 15:52:19,473] [ INFO] - model name: conformer_online - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform [2022-04-21 15:52:21,731] [ INFO] - create the transformer like model 
success [2022-04-21 15:52:21,733] [ INFO] - Initialize ASR server engine successfully. INFO: Started server process [11173] diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index bda545dd..0f1ae1c1 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -64,42 +64,6 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/exp/chunk_conformer/checkpoints/multi_cn.pdparams [2022-04-21 15:52:19,446] [ INFO] - start to create the stream conformer asr engine [2022-04-21 15:52:19,473] [ INFO] - model name: conformer_online - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform [2022-04-21 15:52:21,731] [ INFO] - create the transformer like model success [2022-04-21 15:52:21,733] [ INFO] - Initialize ASR server engine successfully. 
INFO: Started server process [11173] @@ -141,42 +105,6 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/exp/chunk_conformer/checkpoints/multi_cn.pdparams [2022-04-21 15:52:19,446] [ INFO] - start to create the stream conformer asr engine [2022-04-21 15:52:19,473] [ INFO] - model name: conformer_online - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform - set kaiming_uniform [2022-04-21 15:52:21,731] [ INFO] - create the transformer like model success [2022-04-21 15:52:21,733] [ INFO] - Initialize ASR server engine successfully. 
INFO: Started server process [11173] diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index ad1e6fa3..79b0ddb7 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -669,7 +669,7 @@ class ASRServerExecutor(ASRExecutor): ]: logger.info( "we set the decoding_method to attention_rescoring") - self.config.decode.decoding = "attention_rescoring" + self.config.decode.decoding_method = "attention_rescoring" assert self.config.decode.decoding_method in [ "ctc_prefix_beam_search", "attention_rescoring" ], f"we only support ctc_prefix_beam_search and attention_rescoring dedoding method, current decoding method is {self.config.decode.decoding_method}" From 387629052889017f485e5d59b416c08c7c3f741c Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 11 May 2022 08:24:57 +0000 Subject: [PATCH 59/93] add pptts readme, test=doc --- docs/source/tts/PPTTS.md | 74 ++++++++++++++++++++++++++++++++++++ docs/source/tts/PPTTS_cn.md | 76 +++++++++++++++++++++++++++++++++++++ 2 files changed, 150 insertions(+) create mode 100644 docs/source/tts/PPTTS.md create mode 100644 docs/source/tts/PPTTS_cn.md diff --git a/docs/source/tts/PPTTS.md b/docs/source/tts/PPTTS.md new file mode 100644 index 00000000..c8534cd3 --- /dev/null +++ b/docs/source/tts/PPTTS.md @@ -0,0 +1,74 @@ +([简体中文](./PPTTS_cn.md)|English) + +- [1. Introduction](#1) +- [2. Characteristic](#2) +- [3. Benchmark](#3) +- [4. Demo](#4) +- [5. Tutorials](#5) + - [5.1 Training and Inference Optimization](#51) + - [5.2 Characteristic APPs of TTS](#52) + - [5.3 TTS Server](#53) + + +## 1. Introduction + +PP-TTS is a streaming speech synthesis system developed by PaddleSpeech. 
Based on the implementation of [SOTA Algorithms](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/released_model.md#text-to-speech-models), a faster inference engine is used to realize streaming speech synthesis technology to meet the needs of commercial speech interaction scenarios. + +#### PP-TTS +Pipeline of TTS: +
+ +PP-TTS provides a Chinese streaming speech synthesis system based on FastSpeech2 and HiFiGAN by default: + +- Text Frontend: The rule-based Chinese text frontend system is adopted to optimize Chinese text such as text normalization, polyphony, and tone sandhi. +- Acoustic Model: The decoder of FastSpeech2 is improved so that it supports streaming synthesis +- Vocoder: Streaming synthesis of GAN vocoder is supported +- Inference Engine: Using ONNXRuntime to optimize the inference of TTS models, so that the TTS system can also achieve RTF < 1 on low-voltage CPUs, meeting the requirements of streaming synthesis + + +## 2. Characteristic +- Open source leading Chinese TTS system +- Using ONNXRuntime to optimize the inference of TTS models +- The only open-source streaming TTS system +- Easy disassembly: Developers can easily replace different acoustic models and vocoders in different languages, use different inference engines (Paddle dynamic graph, PaddleInference, ONNXRuntime, etc.), and use different network services (HTTP, WebSocket) + + +## 3. Benchmark +PaddleSpeech TTS models' benchmark: [TTS-Benchmark](https://github.com/PaddlePaddle/PaddleSpeech/wiki/TTS-Benchmark). + + +## 4. Demo +See: [Streaming TTS Demo Video](https://paddlespeech.readthedocs.io/en/latest/streaming_tts_demo_video.html) + + +## 5. 
Tutorials + + +### 5.1 Training and Inference Optimization + +Default FastSpeech2: [tts3/run.sh](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/csmsc/tts3/run.sh) + +Streaming FastSpeech2: [tts3/run_cnndecoder.sh](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/csmsc/tts3/run_cnndecoder.sh) + +HiFiGAN:[voc5/run.sh](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/csmsc/voc5/run.sh) + + +### 5.2 Characteristic APPs of TTS +text_to_speech - convert text into speech: [text_to_speech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/text_to_speech) + +style_fs2 - multi style control for FastSpeech2 model: [style_fs2](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/style_fs2) + +story talker - book reader based on OCR and TTS: [story_talker](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/story_talker) + +metaverse - 2D AR with TTS: [metaverse](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/metaverse) + + +### 5.3 TTS Server + +Non-streaming TTS Server: [speech_server](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_server) + +Streaming TTS Server: [streaming_tts_server](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/streaming_tts_server) + + +For more tutorials please see: [PP-TTS:流式语音合成原理及服务部署 +](https://aistudio.baidu.com/aistudio/projectdetail/3885352) diff --git a/docs/source/tts/PPTTS_cn.md b/docs/source/tts/PPTTS_cn.md new file mode 100644 index 00000000..2b650d62 --- /dev/null +++ b/docs/source/tts/PPTTS_cn.md @@ -0,0 +1,76 @@ +(简体中文|[English](./PPTTS.md)) + +# PP-TTS + +- [1. 简介](#1) +- [2. 特性](#2) +- [3. Benchmark](#3) +- [4. 效果展示](#4) +- [5. 使用教程](#5) + - [5.1 模型训练与推理优化](#51) + - [5.2 语音合成特色应用](#52) + - [5.3 语音合成服务搭建](#53) + + +## 1. 
简介 + +PP-TTS 是 PaddleSpeech 自研的流式语音合成系统。在实现[前沿算法](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/released_model.md#text-to-speech-models)的基础上,使用了更快的推理引擎,实现了流式语音合成技术,使其满足商业语音交互场景的需求。 + +#### PP-TTS +语音合成基本流程如下图所示: +
+ +PP-TTS 默认提供基于 FastSpeech2 声学模型和 HiFiGAN 声码器的中文流式语音合成系统: + +- 文本前端:采用基于规则的中文文本前端系统,对文本正则、多音字、变调等中文文本场景进行了优化。 +- 声学模型:对 FastSpeech2 模型的 Decoder 进行改进,使其可以流式合成 +- 声码器:支持对 GAN Vocoder 的流式合成 +- 推理引擎:使用 ONNXRuntime 推理引擎优化模型推理性能,使得语音合成系统在低压 CPU 上也能达到 RTF<1,满足流式合成的要求 + + +## 2. 特性 +- 开源领先的中文语音合成系统 +- 使用 ONNXRuntime 推理引擎优化模型推理性能 +- 唯一开源的流式语音合成系统 +- 易拆卸性:可以很方便地更换不同语种上的不同声学模型和声码器、使用不同的推理引擎(Paddle 动态图、PaddleInference 和 ONNXRuntime 等)、使用不同的网络服务(HTTP、Websocket) + + +## 3. Benchmark +PaddleSpeech TTS 模型之间的性能对比,请查看 [TTS-Benchmark](https://github.com/PaddlePaddle/PaddleSpeech/wiki/TTS-Benchmark)。 + + +## 4. 效果展示 +请参考:[Streaming TTS Demo Video](https://paddlespeech.readthedocs.io/en/latest/streaming_tts_demo_video.html) + + +## 5. 使用教程 + + +### 5.1 模型训练与推理优化 + +Default FastSpeech2:[tts3/run.sh](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/csmsc/tts3/run.sh) + +流式 FastSpeech2:[tts3/run_cnndecoder.sh](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/csmsc/tts3/run_cnndecoder.sh) + +HiFiGAN:[voc5/run.sh](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/csmsc/voc5/run.sh) + + +### 5.2 语音合成特色应用 +一键式实现语音合成:[text_to_speech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/text_to_speech) + +个性化语音合成 - 基于 FastSpeech2 模型的个性化语音合成:[style_fs2](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/style_fs2) + +会说话的故事书 - 基于 OCR 和语音合成的会说话的故事书:[story_talker](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/story_talker) + +元宇宙 - 基于语音合成的 2D 增强现实:[metaverse](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/metaverse) + + +### 5.3 语音合成服务搭建 + +一键式搭建非流式语音合成服务:[speech_server](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_server) + +一键式搭建流式语音合成服务:[streaming_tts_server](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/streaming_tts_server) + + +更多教程,包括模型设计、模型训练、推理部署等,请参考 AIStudio 教程:[PP-TTS:流式语音合成原理及服务部署 
+](https://aistudio.baidu.com/aistudio/projectdetail/3885352) From 8d19c056d76b88abebeb0902cf4cd98ecb62de7a Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Thu, 12 May 2022 19:55:17 +0800 Subject: [PATCH 60/93] add custom_streaming_asr --- demos/custom_streaming_asr/README.md | 1 + demos/custom_streaming_asr/README_cn.md | 0 demos/custom_streaming_asr/path.sh | 2 ++ demos/custom_streaming_asr/setup_docker.sh | 1 + .../custom_streaming_asr/websocket_client.sh | 18 ++++++++++ .../custom_streaming_asr/websocket_server.sh | 33 +++++++++++++++++++ 6 files changed, 55 insertions(+) create mode 100644 demos/custom_streaming_asr/README.md create mode 100644 demos/custom_streaming_asr/README_cn.md create mode 100644 demos/custom_streaming_asr/path.sh create mode 100644 demos/custom_streaming_asr/setup_docker.sh create mode 100755 demos/custom_streaming_asr/websocket_client.sh create mode 100755 demos/custom_streaming_asr/websocket_server.sh diff --git a/demos/custom_streaming_asr/README.md b/demos/custom_streaming_asr/README.md new file mode 100644 index 00000000..16df311c --- /dev/null +++ b/demos/custom_streaming_asr/README.md @@ -0,0 +1 @@ +run setup_docker.sh first and then run websocket_server.sh diff --git a/demos/custom_streaming_asr/README_cn.md b/demos/custom_streaming_asr/README_cn.md new file mode 100644 index 00000000..e69de29b diff --git a/demos/custom_streaming_asr/path.sh b/demos/custom_streaming_asr/path.sh new file mode 100644 index 00000000..47462324 --- /dev/null +++ b/demos/custom_streaming_asr/path.sh @@ -0,0 +1,2 @@ +export LD_LIBRARY_PATH=$PWD/resource/lib +export PATH=$PATH:$PWD/resource/bin diff --git a/demos/custom_streaming_asr/setup_docker.sh b/demos/custom_streaming_asr/setup_docker.sh new file mode 100644 index 00000000..329a75db --- /dev/null +++ b/demos/custom_streaming_asr/setup_docker.sh @@ -0,0 +1 @@ +sudo nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/paddle --name=paddle_demo_docker 
registry.baidubce.com/paddlepaddle/paddle:2.2.2 /bin/bash diff --git a/demos/custom_streaming_asr/websocket_client.sh b/demos/custom_streaming_asr/websocket_client.sh new file mode 100755 index 00000000..ede076ca --- /dev/null +++ b/demos/custom_streaming_asr/websocket_client.sh @@ -0,0 +1,18 @@ +#!/bin/bash +set +x +set -e + +. path.sh +# input +data=$PWD/data + +# output +wav_scp=wav.scp + +export GLOG_logtostderr=1 + +# websocket client +websocket_client_main \ + --wav_rspecifier=scp:$data/$wav_scp \ + --streaming_chunk=0.36 \ + --port=8881 diff --git a/demos/custom_streaming_asr/websocket_server.sh b/demos/custom_streaming_asr/websocket_server.sh new file mode 100755 index 00000000..041c345b --- /dev/null +++ b/demos/custom_streaming_asr/websocket_server.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set +x +set -e + +export GLOG_logtostderr=1 + +. path.sh +#test websocket server + +model_dir=./resource/model +graph_dir=./resource/graph +cmvn=./data/cmvn.ark + + +#paddle_asr_online/resource.tar.gz +if [ ! -f $cmvn ]; then + wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/resource.tar.gz + tar xzfv resource.tar.gz + ln -s ./resource/data . 
+fi + +websocket_server_main \ + --cmvn_file=$cmvn \ + --streaming_chunk=0.1 \ + --use_fbank=true \ + --model_path=$model_dir/avg_10.jit.pdmodel \ + --param_path=$model_dir/avg_10.jit.pdiparams \ + --model_cache_shapes="5-1-2048,5-1-2048" \ + --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ + --word_symbol_table=$graph_dir/words.txt \ + --graph_path=$graph_dir/TLG.fst --max_active=7500 \ + --port=8881 \ + --acoustic_scale=12 From 5e1b974dc093b5aee653edf725550163d59d560a Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Fri, 13 May 2022 11:21:19 +0800 Subject: [PATCH 61/93] fix compile error --- speechx/examples/ds2_ol/feat/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speechx/examples/ds2_ol/feat/CMakeLists.txt b/speechx/examples/ds2_ol/feat/CMakeLists.txt index 2d9bcb4c..632f22e8 100644 --- a/speechx/examples/ds2_ol/feat/CMakeLists.txt +++ b/speechx/examples/ds2_ol/feat/CMakeLists.txt @@ -13,4 +13,4 @@ target_link_libraries(${bin_name} frontend kaldi-util kaldi-feat-common gflags g set(bin_name cmvn-json2kaldi) add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog ${DEPS}) +target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog) From b86ac883feec8ff8e9a3db86117ab6e747edfd95 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Fri, 13 May 2022 17:41:14 +0800 Subject: [PATCH 62/93] add PPASR --- docs/source/asr/PPASR_cn.md | 74 +++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 docs/source/asr/PPASR_cn.md diff --git a/docs/source/asr/PPASR_cn.md b/docs/source/asr/PPASR_cn.md new file mode 100644 index 00000000..ee65c278 --- /dev/null +++ b/docs/source/asr/PPASR_cn.md @@ -0,0 +1,74 @@ +## 目录 +- [1. 简介](#1) +- [2. 特点](#2) +- [3. 
使用教程](#3) + - [3.1 预训练模型](#31) + - [3.2 模型训练](#32) + - [3.3 模型推理](#33) + - [3.4 服务部署](#33) + - [3.5 支持个性化场景部署](#33) +- [4. 快速开始](#4) + + +## 1. 简介 + +PPASR 是一个 提供 ASR 功能的工具。其提供了多种中文和英文的模型,支持模型的训练,并且支持使用命令行的方式进行模型的推理。 PPASR也支持流式模型的部署,以及个性化场景的部署。 + + +## 2. 特点 +PPASR 的主要特点如下: +- 提供在中英文开源数据集 aishell (中文),wenetspeech(中文),librispeech (英文)上的预训练模型。模型包含 deepspeech2 模型以及 conformer/transformer 模型。 +- 支持中英文的模型训练功能。 +- 支持命令行方式的模型推理, `paddlespeech asr --input xxx.wav` 方式调用各个预训练模型进行推理。 +- 支持流式 ASR 的服务部署,也支持输出时间戳。 +- 支持个性化场景的部署。 + +## 3. 使用教程 + + +## 3.1 预训练模型 +支持的预训练模型列表:[released_model.md](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/released_model.md)。 +其中效果较好的模型为 Ds2 Online Wenetspeech ASR0 Model 以及 Conformer Online Wenetspeech ASR1 Model。 两个模型都支持流式 ASR。 + + + +## 3.2 模型训练 + +模型的训练的参考脚本存放在 examples 中,并按照 `examples/数据集/模型` 存放,数据集主要支持 aishell 和 librispeech,模型支持 deepspeech2 模型和 u2 (conformer/transformer) 模型。 +具体的执行脚本的步骤记录在 run.sh 当中。具体可参考[这里](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1) + + + +## 3.3 模型推理 + +PPASR 支持在使用`pip install paddlespeech`后 使用命令行的方式来使用预训练模型进行推理。 + +具体支持的功能包括: + +- 对单条音频进行预测 +- 使用管道的方式对多条音频进行预测 +- 支持 RTF 的计算 + +具体的使用方式可以参考[这里](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_recognition/README_cn.md) + + + +## 3.4 服务部署 + +PPASR 支持流式ASR的服务部署。支持 语音识别 + 标点处理两个功能同时使用。 + +server 的 demo [链接](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/streaming_asr_server) + +![image](https://user-images.githubusercontent.com/87408988/168255342-1fc790c0-16f4-4540-a861-db239076727c.png) + + +## 3.5 支持个性化场景部署 + +针对个性化场景部署,提供了 特征提取(fbank) => 推理模型(打分库)=> TLG(WFST, token, lexicon, grammar)的 C++ 程序。具体参考[这里](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/speechx) + + +## 4. 
快速开始 + +关于如何使用 PPASR,可以看这里的[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md),其中提供了 **简单**、**中等**、**困难 ** 三种安装方式。如果想体验paddlespeech 的推理功能,可以用 **简单** 安装方式。 + + From 8c9182b2031076dffff0ff890fcd324209dfab03 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Fri, 13 May 2022 17:41:31 +0800 Subject: [PATCH 63/93] Update PPASR_cn.md --- docs/source/asr/PPASR_cn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/asr/PPASR_cn.md b/docs/source/asr/PPASR_cn.md index ee65c278..526db47a 100644 --- a/docs/source/asr/PPASR_cn.md +++ b/docs/source/asr/PPASR_cn.md @@ -69,6 +69,6 @@ server 的 demo [链接](https://github.com/PaddlePaddle/PaddleSpeech/tree/devel ## 4. 快速开始 -关于如何使用 PPASR,可以看这里的[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md),其中提供了 **简单**、**中等**、**困难 ** 三种安装方式。如果想体验paddlespeech 的推理功能,可以用 **简单** 安装方式。 +关于如何使用 PPASR,可以看这里的[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md),其中提供了 **简单**、**中等**、**困难** 三种安装方式。如果想体验paddlespeech 的推理功能,可以用 **简单** 安装方式。 From a5f52d6d8e054c5e7c6ebc6d28975bcfee3d4f1c Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Fri, 13 May 2022 17:56:50 +0800 Subject: [PATCH 64/93] update readme --- demos/custom_streaming_asr/README.md | 17 ++++++++++++++++- demos/custom_streaming_asr/README_cn.md | 18 ++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/demos/custom_streaming_asr/README.md b/demos/custom_streaming_asr/README.md index 16df311c..c84b4dd4 100644 --- a/demos/custom_streaming_asr/README.md +++ b/demos/custom_streaming_asr/README.md @@ -1 +1,16 @@ -run setup_docker.sh first and then run websocket_server.sh +([简体中文](./README_cn.md)|English) + +# Customized Auto Speech Recognition + +## introduction +In some cases, we need to recognize the specific sentence with high accuracy.
e.g., customized keyword spotting, address recognition in navigation apps. Customized ASR can solve those issues. + +This demo is customized for taxi expense accounts, which need to recognize rare addresses. + +## Usage +### 1. Installation +Install docker by running the script setup_docker.sh. And then, install tmux (apt-get install tmux). + +### 2. demo +* bash websocket_server.sh. This script will download resources and libs, and then set up the server. +* In the other terminal of docker, run script websocket_client.sh, the client will send data and get the results. \ No newline at end of file diff --git a/demos/custom_streaming_asr/README_cn.md b/demos/custom_streaming_asr/README_cn.md index e69de29b..d2a09ce9 100644 --- a/demos/custom_streaming_asr/README_cn.md +++ b/demos/custom_streaming_asr/README_cn.md @@ -0,0 +1,18 @@ +(简体中文|[English](./README.md)) + +# 定制化语音识别演示 +## 介绍 +定制化的语音识别是满足一些特定场景的语句识别的技术。 + +可以参见简单的教程: +https://aistudio.baidu.com/aistudio/projectdetail/3986429 + +这个 demo 是打车报销单的场景识别,定制化了地点。 + +## 使用方法 +### 1. 配置环境 +请通过 setup_docker.sh 安装镜像。进入镜像后,安装tmux (apt-get install tmux),方便后续演示。 + +### 2. 演示 +* bash websocket_server.sh, 完成相关资源和库的下载。这时候服务已经启动。 +* 在镜像另一个终端中,bash websocket_client.sh, 通过client发送数据,得到结果。 From 999f16739ab48d0d812fa16e30a7ee6810b0c431 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Fri, 13 May 2022 17:59:39 +0800 Subject: [PATCH 65/93] add pipeline, test=asr --- docs/source/asr/PPASR_cn.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/asr/PPASR_cn.md b/docs/source/asr/PPASR_cn.md index 526db47a..8cc700b0 100644 --- a/docs/source/asr/PPASR_cn.md +++ b/docs/source/asr/PPASR_cn.md @@ -16,6 +16,10 @@ PPASR 是一个 提供 ASR 功能的工具。其提供了多种中文和英文 ## 2. 特点 +语音识别的基本流程如下图所示: +
+ + PPASR 的主要特点如下: - 提供在中英文开源数据集 aishell (中文),wenetspeech(中文),librispeech (英文)上的预训练模型。模型包含 deepspeech2 模型以及 conformer/transformer 模型。 - 支持中英文的模型训练功能。 @@ -23,6 +27,7 @@ PPASR 的主要特点如下: - 支持流式 ASR 的服务部署,也支持输出时间戳。 - 支持个性化场景的部署。 + ## 3. 使用教程 From a043c79c514eb769f18928ed958d2914f27ad8e1 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Fri, 13 May 2022 19:33:05 +0800 Subject: [PATCH 66/93] create streaming_asr_demo.rst, test=doc --- docs/source/streaming_asr_demo_video.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 docs/source/streaming_asr_demo_video.rst diff --git a/docs/source/streaming_asr_demo_video.rst b/docs/source/streaming_asr_demo_video.rst new file mode 100644 index 00000000..50c7689f --- /dev/null +++ b/docs/source/streaming_asr_demo_video.rst @@ -0,0 +1,11 @@ +Streaming ASR Demo Video +================== + +.. raw:: html + + From c5d2224d6d51a93b23737ef51463a23a6259e907 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Fri, 13 May 2022 19:18:57 +0800 Subject: [PATCH 67/93] fix cors, test=doc --- demos/speech_server/README.md | 30 +++- demos/speech_server/README_cn.md | 153 ++++++++++-------- demos/speech_server/asr_client.sh | 2 + demos/speech_server/cls_client.sh | 2 + demos/speech_server/conf/application.yaml | 4 +- demos/speech_server/tts_client.sh | 1 + demos/streaming_asr_server/README.md | 26 ++- demos/streaming_asr_server/README_cn.md | 12 ++ demos/streaming_asr_server/test.sh | 2 + demos/streaming_tts_server/README.md | 20 ++- demos/streaming_tts_server/README_cn.md | 20 ++- .../conf/tts_online_application.yaml | 2 +- demos/streaming_tts_server/test_client.sh | 4 +- .../server/bin/paddlespeech_client.py | 2 + .../server/bin/paddlespeech_server.py | 10 ++ paddlespeech/server/conf/application.yaml | 4 +- .../server/conf/tts_online_application.yaml | 2 +- .../unit/server/offline/conf/application.yaml | 2 +- .../tts/check_server/conf/application.yaml | 2 +- 
.../check_server/tts_online_application.yaml | 2 +- 20 files changed, 193 insertions(+), 109 deletions(-) diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index bb974c97..5a3de0cc 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -52,8 +52,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 11:17:32] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 11:17:32] [INFO] [on.py:38] Application startup complete. - INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) - [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) ``` @@ -75,8 +75,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 14:57:56] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 14:57:56] [INFO] [on.py:38] Application startup complete. - INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) - [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) ``` @@ -84,6 +84,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ### 4. ASR Client Usage **Note:** The response time will be slightly longer when using the client for the first time - Command Line (Recommended) + + If `127.0.0.1` is not accessible, you need to use the actual service IP address. 
+ ``` paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav ``` @@ -132,6 +135,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ### 5. TTS Client Usage **Note:** The response time will be slightly longer when using the client for the first time - Command Line (Recommended) + + If `127.0.0.1` is not accessible, you need to use the actual service IP address + ```bash paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav ``` @@ -192,6 +198,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ### 6. CLS Client Usage **Note:** The response time will be slightly longer when using the client for the first time - Command Line (Recommended) + + If `127.0.0.1` is not accessible, you need to use the actual service IP address. + ``` paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input ./zh.wav ``` @@ -242,9 +251,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee **Note:** The response time will be slightly longer when using the client for the first time - Command Line (Recommended) -``` bash -paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8090 --input 85236145389.wav -``` + If `127.0.0.1` is not accessible, you need to use the actual service IP address. + + ``` bash + paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8090 --input 85236145389.wav + ``` * Usage: @@ -297,6 +308,8 @@ print(res) - Command Line (Recommended) + If `127.0.0.1` is not accessible, you need to use the actual service IP address. 
+ ``` bash paddlespeech_client vector --task score --server_ip 127.0.0.1 --port 8090 --enroll 85236145389.wav --test 123456789.wav ``` @@ -357,6 +370,9 @@ print(res) **Note:** The response time will be slightly longer when using the client for the first time - Command Line (Recommended) + + If `127.0.0.1` is not accessible, you need to use the actual service IP address. + ``` bash paddlespeech_client text --server_ip 127.0.0.1 --port 8090 --input "我认为跑步最重要的就是给我带来了身体健康" ``` diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index 8fa67c0d..51b6caa4 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -53,8 +53,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 11:17:32] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 11:17:32] [INFO] [on.py:38] Application startup complete. - INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) - [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) ``` @@ -76,39 +76,42 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 14:57:56] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 14:57:56] [INFO] [on.py:38] Application startup complete. - INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) - [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) ``` ### 4. 
ASR 客户端使用方法 **注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) - ``` - paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav - ``` + 若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址 - 使用帮助: - - ```bash - paddlespeech_client asr --help - ``` + ``` + paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav - 参数: - - `server_ip`: 服务端ip地址,默认: 127.0.0.1。 - - `port`: 服务端口,默认: 8090。 - - `input`(必须输入): 用于识别的音频文件。 - - `sample_rate`: 音频采样率,默认值:16000。 - - `lang`: 模型语言,默认值:zh_cn。 - - `audio_format`: 音频格式,默认值:wav。 + ``` - 输出: + 使用帮助: + + ```bash + paddlespeech_client asr --help + ``` + + 参数: + - `server_ip`: 服务端ip地址,默认: 127.0.0.1。 + - `port`: 服务端口,默认: 8090。 + - `input`(必须输入): 用于识别的音频文件。 + - `sample_rate`: 音频采样率,默认值:16000。 + - `lang`: 模型语言,默认值:zh_cn。 + - `audio_format`: 音频格式,默认值:wav。 + + 输出: - ```bash - [2022-02-23 18:11:22,819] [ INFO] - {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'transcription': '我认为跑步最重要的就是给我带来了身体健康'}} - [2022-02-23 18:11:22,820] [ INFO] - time cost 0.689145 s. - ``` + ```bash + [2022-02-23 18:11:22,819] [ INFO] - {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'transcription': '我认为跑步最重要的就是给我带来了身体健康'}} + [2022-02-23 18:11:22,820] [ INFO] - time cost 0.689145 s. + ``` - Python API ```python @@ -135,33 +138,35 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ### 5. 
TTS 客户端使用方法 **注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) - - ```bash - paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav - ``` - 使用帮助: - ```bash - paddlespeech_client tts --help - ``` - - 参数: - - `server_ip`: 服务端ip地址,默认: 127.0.0.1。 - - `port`: 服务端口,默认: 8090。 - - `input`(必须输入): 待合成的文本。 - - `spk_id`: 说话人 id,用于多说话人语音合成,默认值: 0。 - - `speed`: 音频速度,该值应设置在 0 到 3 之间。 默认值:1.0 - - `volume`: 音频音量,该值应设置在 0 到 3 之间。 默认值: 1.0 - - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认与模型相同。 默认值:0 - - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。 - - 输出: - ```bash - [2022-02-23 15:20:37,875] [ INFO] - {'description': 'success.'} - [2022-02-23 15:20:37,875] [ INFO] - Save synthesized audio successfully on output.wav. - [2022-02-23 15:20:37,875] [ INFO] - Audio duration: 3.612500 s. - [2022-02-23 15:20:37,875] [ INFO] - Response time: 0.348050 s. - ``` + 若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址 + + ```bash + paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav + ``` + 使用帮助: + + ```bash + paddlespeech_client tts --help + ``` + + 参数: + - `server_ip`: 服务端ip地址,默认: 127.0.0.1。 + - `port`: 服务端口,默认: 8090。 + - `input`(必须输入): 待合成的文本。 + - `spk_id`: 说话人 id,用于多说话人语音合成,默认值: 0。 + - `speed`: 音频速度,该值应设置在 0 到 3 之间。 默认值:1.0 + - `volume`: 音频音量,该值应设置在 0 到 3 之间。 默认值: 1.0 + - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认与模型相同。 默认值:0 + - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。 + + 输出: + ```bash + [2022-02-23 15:20:37,875] [ INFO] - {'description': 'success.'} + [2022-02-23 15:20:37,875] [ INFO] - Save synthesized audio successfully on output.wav. + [2022-02-23 15:20:37,875] [ INFO] - Audio duration: 3.612500 s. + [2022-02-23 15:20:37,875] [ INFO] - Response time: 0.348050 s. 
+ ``` - Python API ```python @@ -197,9 +202,12 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee **注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) - ``` - paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input ./zh.wav - ``` + + 若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址 + + ``` + paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input ./zh.wav + ``` 使用帮助: @@ -247,15 +255,17 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee 注意: 初次使用客户端时响应时间会略长 * 命令行 (推荐使用) -``` bash -paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8090 --input 85236145389.wav -``` + 若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址 + + ``` bash + paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8090 --input 85236145389.wav + ``` * 使用帮助: -``` bash -paddlespeech_client vector --help -``` + ``` bash + paddlespeech_client vector --help + ``` * 参数: * server_ip: 服务端ip地址,默认: 127.0.0.1。 * port: 服务端口,默认: 8090。 @@ -299,15 +309,17 @@ print(res) 注意: 初次使用客户端时响应时间会略长 * 命令行 (推荐使用) -``` bash -paddlespeech_client vector --task score --server_ip 127.0.0.1 --port 8090 --enroll 85236145389.wav --test 123456789.wav -``` + 若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址 + + ``` bash + paddlespeech_client vector --task score --server_ip 127.0.0.1 --port 8090 --enroll 85236145389.wav --test 123456789.wav + ``` * 使用帮助: -``` bash -paddlespeech_client vector --help -``` + ``` bash + paddlespeech_client vector --help + ``` * 参数: * server_ip: 服务端ip地址,默认: 127.0.0.1。 @@ -357,9 +369,12 @@ print(res) **注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) - ``` bash - paddlespeech_client text --server_ip 127.0.0.1 --port 8090 --input "我认为跑步最重要的就是给我带来了身体健康" - ``` + + 若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址 + + ``` bash + paddlespeech_client text --server_ip 127.0.0.1 --port 8090 --input "我认为跑步最重要的就是给我带来了身体健康" + ``` 使用帮助: @@ -409,4 +424,4 @@ print(res) 通过 `paddlespeech_server stats --task vector` 获取Vector服务支持的所有模型。 ### Text支持的模型 -通过 `paddlespeech_server stats --task text` 
获取Text服务支持的所有模型。 \ No newline at end of file +通过 `paddlespeech_server stats --task text` 获取Text服务支持的所有模型。 diff --git a/demos/speech_server/asr_client.sh b/demos/speech_server/asr_client.sh index afe2f821..37a7ab0b 100644 --- a/demos/speech_server/asr_client.sh +++ b/demos/speech_server/asr_client.sh @@ -1,4 +1,6 @@ #!/bin/bash wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav + +# If `127.0.0.1` is not accessible, you need to use the actual service IP address. paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav diff --git a/demos/speech_server/cls_client.sh b/demos/speech_server/cls_client.sh index 5797aa20..67012648 100644 --- a/demos/speech_server/cls_client.sh +++ b/demos/speech_server/cls_client.sh @@ -1,4 +1,6 @@ #!/bin/bash wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav + +# If `127.0.0.1` is not accessible, you need to use the actual service IP address. 
paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input ./zh.wav --topk 1 diff --git a/demos/speech_server/conf/application.yaml b/demos/speech_server/conf/application.yaml index 14a9195a..c6588ce8 100644 --- a/demos/speech_server/conf/application.yaml +++ b/demos/speech_server/conf/application.yaml @@ -3,7 +3,7 @@ ################################################################################# # SERVER SETTING # ################################################################################# -host: 127.0.0.1 +host: 0.0.0.0 port: 8090 # The task format in the engin_list is: _ @@ -157,4 +157,4 @@ vector_python: sample_rate: 16000 cfg_path: # [optional] ckpt_path: # [optional] - device: # set 'gpu:id' or 'cpu' \ No newline at end of file + device: # set 'gpu:id' or 'cpu' diff --git a/demos/speech_server/tts_client.sh b/demos/speech_server/tts_client.sh index a756dfd3..a443a0a9 100644 --- a/demos/speech_server/tts_client.sh +++ b/demos/speech_server/tts_client.sh @@ -1,3 +1,4 @@ #!/bin/bash +# If `127.0.0.1` is not accessible, you need to use the actual service IP address. paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index 909f5a4c..cfc89f7b 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -119,9 +119,12 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav **Note:** The response time will be slightly longer when using the client for the first time - Command Line (Recommended) - ``` - paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input ./zh.wav - ``` + + If `127.0.0.1` is not accessible, you need to use the actual service IP address. + + ``` + paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input ./zh.wav + ``` Usage: @@ -374,10 +377,13 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ### 2. 
Client usage **Note** The response time will be slightly longer when using the client for the first time -- Command line - ``` - paddlespeech_client text --server_ip 127.0.0.1 --port 8190 --input "我认为跑步最重要的就是给我带来了身体健康" - ``` +- Command line: + + If `127.0.0.1` is not accessible, you need to use the actual service IP address. + + ``` + paddlespeech_client text --server_ip 127.0.0.1 --port 8190 --input "我认为跑步最重要的就是给我带来了身体健康" + ``` Output ``` @@ -419,6 +425,9 @@ bash server.sh ### 2. Call client - Command line + + If `127.0.0.1` is not accessible, you need to use the actual service IP address. + ``` paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav ``` @@ -494,6 +503,9 @@ bash server.sh ``` - Use script + + If `127.0.0.1` is not accessible, you need to use the actual service IP address. + ``` python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav ``` diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index 0f1ae1c1..1660cef7 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -125,6 +125,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav **注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) + + 若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址 + ``` paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input ./zh.wav ``` @@ -384,6 +387,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav **注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) + + 若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址 + ``` paddlespeech_client text --server_ip 127.0.0.1 --port 8190 --input "我认为跑步最重要的就是给我带来了身体健康" ``` @@ -427,6 +433,9 @@ bash server.sh ### 2. 
调用服务 - 使用命令行: + + 若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址 + ``` paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav ``` @@ -502,6 +511,9 @@ bash server.sh ``` - 使用脚本调用 + + 若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址 + ``` python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav ``` diff --git a/demos/streaming_asr_server/test.sh b/demos/streaming_asr_server/test.sh index c7b57e9b..4f43c653 100755 --- a/demos/streaming_asr_server/test.sh +++ b/demos/streaming_asr_server/test.sh @@ -2,9 +2,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav # read the wav and pass it to only streaming asr service +# If `127.0.0.1` is not accessible, you need to use the actual service IP address. # python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --wavfile ./zh.wav paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --input ./zh.wav # read the wav and call streaming and punc service +# If `127.0.0.1` is not accessible, you need to use the actual service IP address. # python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav \ No newline at end of file diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md index 299aa3d2..775cd908 100644 --- a/demos/streaming_tts_server/README.md +++ b/demos/streaming_tts_server/README.md @@ -63,8 +63,8 @@ The configuration file can be found in `conf/tts_online_application.yaml`. [2022-04-24 20:05:28] [INFO] [on.py:45] Waiting for application startup. INFO: Application startup complete. [2022-04-24 20:05:28] [INFO] [on.py:59] Application startup complete. 
- INFO: Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) - [2022-04-24 20:05:28] [INFO] [server.py:211] Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + INFO: Uvicorn running on http://0.0.0.0:8092 (Press CTRL+C to quit) + [2022-04-24 20:05:28] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8092 (Press CTRL+C to quit) ``` @@ -90,8 +90,8 @@ The configuration file can be found in `conf/tts_online_application.yaml`. [2022-04-24 21:00:17] [INFO] [on.py:45] Waiting for application startup. INFO: Application startup complete. [2022-04-24 21:00:17] [INFO] [on.py:59] Application startup complete. - INFO: Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) - [2022-04-24 21:00:17] [INFO] [server.py:211] Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + INFO: Uvicorn running on http://0.0.0.0:8092 (Press CTRL+C to quit) + [2022-04-24 21:00:17] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8092 (Press CTRL+C to quit) ``` @@ -101,6 +101,8 @@ The configuration file can be found in `conf/tts_online_application.yaml`. Access http streaming TTS service: + If `127.0.0.1` is not accessible, you need to use the actual service IP address. + ```bash paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol http --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav ``` @@ -198,8 +200,8 @@ The configuration file can be found in `conf/tts_online_application.yaml`. [2022-04-27 10:18:09] [INFO] [on.py:45] Waiting for application startup. INFO: Application startup complete. [2022-04-27 10:18:09] [INFO] [on.py:59] Application startup complete. 
- INFO: Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) - [2022-04-27 10:18:09] [INFO] [server.py:211] Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + INFO: Uvicorn running on http://0.0.0.0:8092 (Press CTRL+C to quit) + [2022-04-27 10:18:09] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8092 (Press CTRL+C to quit) ``` @@ -226,8 +228,8 @@ The configuration file can be found in `conf/tts_online_application.yaml`. [2022-04-27 10:20:16] [INFO] [on.py:45] Waiting for application startup. INFO: Application startup complete. [2022-04-27 10:20:16] [INFO] [on.py:59] Application startup complete. - INFO: Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) - [2022-04-27 10:20:16] [INFO] [server.py:211] Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + INFO: Uvicorn running on http://0.0.0.0:8092 (Press CTRL+C to quit) + [2022-04-27 10:20:16] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8092 (Press CTRL+C to quit) ``` @@ -236,6 +238,8 @@ The configuration file can be found in `conf/tts_online_application.yaml`. Access websocket streaming TTS service: + If `127.0.0.1` is not accessible, you need to use the actual service IP address. + ```bash paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol websocket --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav ``` diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index bb159503..9c2cc50e 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -62,8 +62,8 @@ [2022-04-24 20:05:28] [INFO] [on.py:45] Waiting for application startup. INFO: Application startup complete. [2022-04-24 20:05:28] [INFO] [on.py:59] Application startup complete. 
- INFO: Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) - [2022-04-24 20:05:28] [INFO] [server.py:211] Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + INFO: Uvicorn running on http://0.0.0.0:8092 (Press CTRL+C to quit) + [2022-04-24 20:05:28] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8092 (Press CTRL+C to quit) ``` @@ -89,8 +89,8 @@ [2022-04-24 21:00:17] [INFO] [on.py:45] Waiting for application startup. INFO: Application startup complete. [2022-04-24 21:00:17] [INFO] [on.py:59] Application startup complete. - INFO: Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) - [2022-04-24 21:00:17] [INFO] [server.py:211] Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + INFO: Uvicorn running on http://0.0.0.0:8092 (Press CTRL+C to quit) + [2022-04-24 21:00:17] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8092 (Press CTRL+C to quit) ``` @@ -100,6 +100,8 @@ 访问 http 流式TTS服务: + 若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址 + ```bash paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol http --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav ``` @@ -198,8 +200,8 @@ [2022-04-27 10:18:09] [INFO] [on.py:45] Waiting for application startup. INFO: Application startup complete. [2022-04-27 10:18:09] [INFO] [on.py:59] Application startup complete. - INFO: Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) - [2022-04-27 10:18:09] [INFO] [server.py:211] Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + INFO: Uvicorn running on http://0.0.0.0:8092 (Press CTRL+C to quit) + [2022-04-27 10:18:09] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8092 (Press CTRL+C to quit) ``` @@ -226,8 +228,8 @@ [2022-04-27 10:20:16] [INFO] [on.py:45] Waiting for application startup. INFO: Application startup complete. [2022-04-27 10:20:16] [INFO] [on.py:59] Application startup complete. 
- INFO: Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) - [2022-04-27 10:20:16] [INFO] [server.py:211] Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + INFO: Uvicorn running on http://0.0.0.0:8092 (Press CTRL+C to quit) + [2022-04-27 10:20:16] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8092 (Press CTRL+C to quit) ``` @@ -236,6 +238,8 @@ 访问 websocket 流式TTS服务: + 若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址 + ```bash paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol websocket --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav ``` diff --git a/demos/streaming_tts_server/conf/tts_online_application.yaml b/demos/streaming_tts_server/conf/tts_online_application.yaml index 714f4a68..964e85ef 100644 --- a/demos/streaming_tts_server/conf/tts_online_application.yaml +++ b/demos/streaming_tts_server/conf/tts_online_application.yaml @@ -3,7 +3,7 @@ ################################################################################# # SERVER SETTING # ################################################################################# -host: 127.0.0.1 +host: 0.0.0.0 port: 8092 # The task format in the engin_list is: _ diff --git a/demos/streaming_tts_server/test_client.sh b/demos/streaming_tts_server/test_client.sh index 86982095..bd88f20b 100644 --- a/demos/streaming_tts_server/test_client.sh +++ b/demos/streaming_tts_server/test_client.sh @@ -1,7 +1,9 @@ #!/bin/bash # http client test +# If `127.0.0.1` is not accessible, you need to use the actual service IP address. paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol http --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav # websocket client test -#paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol websocket --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav +# If `127.0.0.1` is not accessible, you need to use the actual service IP address. 
+# paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol websocket --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 3adf8015..c3ecaa71 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -20,6 +20,7 @@ import os import random import sys import time +import warnings from typing import List import numpy as np @@ -34,6 +35,7 @@ from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler from paddlespeech.server.utils.audio_process import wav2pcm from paddlespeech.server.utils.util import compute_delay from paddlespeech.server.utils.util import wav2base64 +warnings.filterwarnings("ignore") __all__ = [ 'TTSClientExecutor', 'TTSOnlineClientExecutor', 'ASRClientExecutor', diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index db92f179..0a0c8637 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -13,11 +13,13 @@ # limitations under the License. 
import argparse import sys +import warnings from typing import List import uvicorn from fastapi import FastAPI from prettytable import PrettyTable +from starlette.middleware.cors import CORSMiddleware from ..executor import BaseExecutor from ..util import cli_server_register @@ -27,12 +29,20 @@ from paddlespeech.server.engine.engine_pool import init_engine_pool from paddlespeech.server.restful.api import setup_router as setup_http_router from paddlespeech.server.utils.config import get_config from paddlespeech.server.ws.api import setup_router as setup_ws_router +warnings.filterwarnings("ignore") __all__ = ['ServerExecutor', 'ServerStatsExecutor'] app = FastAPI( title="PaddleSpeech Serving API", description="Api", version="0.0.1") +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"]) + @cli_server_register( name='paddlespeech_server.start', description='Start the service') diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml index 31a37ef0..8650154e 100644 --- a/paddlespeech/server/conf/application.yaml +++ b/paddlespeech/server/conf/application.yaml @@ -3,7 +3,7 @@ ################################################################################# # SERVER SETTING # ################################################################################# -host: 127.0.0.1 +host: 0.0.0.0 port: 8090 # The task format in the engin_list is: _ @@ -157,4 +157,4 @@ vector_python: sample_rate: 16000 cfg_path: # [optional] ckpt_path: # [optional] - device: # set 'gpu:id' or 'cpu' \ No newline at end of file + device: # set 'gpu:id' or 'cpu' diff --git a/paddlespeech/server/conf/tts_online_application.yaml b/paddlespeech/server/conf/tts_online_application.yaml index 714f4a68..964e85ef 100644 --- a/paddlespeech/server/conf/tts_online_application.yaml +++ b/paddlespeech/server/conf/tts_online_application.yaml @@ -3,7 +3,7 @@ 
################################################################################# # SERVER SETTING # ################################################################################# -host: 127.0.0.1 +host: 0.0.0.0 port: 8092 # The task format in the engin_list is: _ diff --git a/tests/unit/server/offline/conf/application.yaml b/tests/unit/server/offline/conf/application.yaml index 762f4af6..ce399e28 100644 --- a/tests/unit/server/offline/conf/application.yaml +++ b/tests/unit/server/offline/conf/application.yaml @@ -3,7 +3,7 @@ ################################################################################# # SERVER SETTING # ################################################################################# -host: 127.0.0.1 +host: 0.0.0.0 port: 8090 # The task format in the engin_list is: _ diff --git a/tests/unit/server/online/tts/check_server/conf/application.yaml b/tests/unit/server/online/tts/check_server/conf/application.yaml index dd1a7e19..9bf66396 100644 --- a/tests/unit/server/online/tts/check_server/conf/application.yaml +++ b/tests/unit/server/online/tts/check_server/conf/application.yaml @@ -3,7 +3,7 @@ ################################################################################# # SERVER SETTING # ################################################################################# -host: 127.0.0.1 +host: 0.0.0.0 port: 8092 # The task format in the engin_list is: _ diff --git a/tests/unit/server/online/tts/check_server/tts_online_application.yaml b/tests/unit/server/online/tts/check_server/tts_online_application.yaml index dd1a7e19..9bf66396 100644 --- a/tests/unit/server/online/tts/check_server/tts_online_application.yaml +++ b/tests/unit/server/online/tts/check_server/tts_online_application.yaml @@ -3,7 +3,7 @@ ################################################################################# # SERVER SETTING # ################################################################################# -host: 127.0.0.1 +host: 0.0.0.0 port: 8092 # The task 
format in the engin_list is: _ From 81ae5ffd7267917e99ddbeedc52b0b987288a8eb Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Fri, 13 May 2022 19:49:44 +0800 Subject: [PATCH 68/93] add readme --- demos/custom_streaming_asr/README.md | 56 ++++++++++++++++++++++--- demos/custom_streaming_asr/README_cn.md | 47 ++++++++++++++++++--- 2 files changed, 93 insertions(+), 10 deletions(-) diff --git a/demos/custom_streaming_asr/README.md b/demos/custom_streaming_asr/README.md index c84b4dd4..550682df 100644 --- a/demos/custom_streaming_asr/README.md +++ b/demos/custom_streaming_asr/README.md @@ -3,14 +3,60 @@ # Customized Auto Speech Recognition ## introduction -In some cases, we need to recognize the specific sentence with high accuracy. eg: customized keyword spotting, address recognition in navigation apps . customized ASR can slove those issues. +In some cases, we need to recognize the specific rare words with high accuracy. e.g. address recognition in navigation apps. Customized ASR can solve those issues. -this demo is customized for expense account of taxi, which need to recognize rare address. +this demo is customized for expense account, which need to recognize rare address. + +* G with slot: 打车到 "address_slot"。 +![](https://ai-studio-static-online.cdn.bcebos.com/28d9ef132a7f47a895a65ae9e5c4f55b8f472c9f3dd24be8a2e66e0b88b173a4) + +* this is address slot wfst, you can add the address which want to recognize. +![](https://ai-studio-static-online.cdn.bcebos.com/47c89100ef8c465bac733605ffc53d76abefba33d62f4d818d351f8cea3c8fe2) + +* after replace operation, G = fstreplace(G_with_slot, address_slot), we will get the customized graph. +![](https://ai-studio-static-online.cdn.bcebos.com/60a3095293044f10b73039ab10c7950d139a6717580a44a3ba878c6e74de402b) ## Usage ### 1. Installation -Install docker by runing script setup_docker.sh. And then, install tmux (apt-get install tmux). +install paddle:2.2.2 docker.
+``` +sudo nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/paddle --name=paddle_demo_docker registry.baidubce.com/paddlepaddle/paddle:2.2.2 /bin/bash +``` ### 2. demo -* bash websocket_server.sh. This script will download resources and libs, and then setup the server. -* In the other terminal of docker, run script websocket_client.sh, the client will send data and get the results. \ No newline at end of file +* run websocket_server.sh. This script will download resources and libs, and launch the service. +``` +bash websocket_server.sh +``` +this script run in two steps: +1. download the resources.tar.gz, those direcotries will be found in resource directory. +model: acustic model +graph: the decoder graph (TLG.fst) +lib: some libs +bin: binary +data: audio and wav.scp + +2. websocket_server_main launch the service. +some params: +port: the service port +graph_path: the decoder graph path +model_path: acustic model path +please refer to those files for other params: +PaddleSpeech/speechx/speechx/decoder/param.h +PaddleSpeech/speechx/examples/ds2_ol/websocket/websocket_server_main.cc + +* In another terminal, run script websocket_client.sh, the client will send data and get the results. +``` +bash websocket_client.sh +``` +websocket_client_main will launch the client; wav_scp is the set of wav files to send, and port is the server's service port.
+ +* result: +In the log of client, you will see the message below: +``` +0513 10:58:13.827821 41768 recognizer_test_main.cc:56] wav len (sample): 70208 +I0513 10:58:13.884493 41768 feature_cache.h:52] set finished +I0513 10:58:24.247171 41768 paddle_nnet.h:76] Tensor neml: 10240 +I0513 10:58:24.247249 41768 paddle_nnet.h:76] Tensor neml: 10240 +LOG ([5.5.544~2-f21d7]:main():decoder/recognizer_test_main.cc:90) the result of case_10 is 五月十二日二十二点三十六分加班打车回家四十一元 +``` \ No newline at end of file diff --git a/demos/custom_streaming_asr/README_cn.md b/demos/custom_streaming_asr/README_cn.md index d2a09ce9..e0b95c3f 100644 --- a/demos/custom_streaming_asr/README_cn.md +++ b/demos/custom_streaming_asr/README_cn.md @@ -1,18 +1,55 @@ -(简体中文|[English](./README.md) +(简体中文|[English](./README.md)) # 定制化语音识别演示 ## 介绍 定制化的语音识别是满足一些特定场景的语句识别的技术。 -可以参见简单的教程: +可以参见简单的原理教程: https://aistudio.baidu.com/aistudio/projectdetail/3986429 这个 demo 是打车报销单的场景识别,定制化了地点。 ## 使用方法 ### 1. 配置环境 -请通过 setup_docker.sh 安装镜像。进入镜像后,安装tmux (apt-get install tmux),方便后续演示。 +安装paddle:2.2.2 docker镜像。 +``` +sudo nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/paddle --name=paddle_demo_docker registry.baidubce.com/paddlepaddle/paddle:2.2.2 /bin/bash +``` ### 2. 演示 -* bash websocket_server.sh, 完成相关资源和库的下载。这时候服务已经启动。 -* 在镜像另一个终端中,bash websocket_client.sh, 通过client发送数据,得到结果。 +* 运行如下命令,完成相关资源和库的下载和服务启动。 +``` +bash websocket_server.sh +``` +上面脚本完成了如下两个功能: +1. 完成resource.tar.gz下载,解压后,会在resource中发现如下目录: +model: 声学模型 +graph: 解码构图 +lib: 相关库 +bin: 运行程序 +data: 语音数据 + +2. 
通过websocket_server_main来启动服务。 +这里简单的介绍几个参数: +port是服务端口, +graph_path用来指定解码图文件, +model相关参数用来指定声学模型文件。 +其他参数说明可参见代码: +PaddleSpeech/speechx/speechx/decoder/param.h +PaddleSpeech/speechx/examples/ds2_ol/websocket/websocket_server_main.cc + +* 在另一个终端中, 通过client发送数据,得到结果。运行如下命令: +``` +bash websocket_client.sh +``` +通过websocket_client_main来启动client服务,其中$wav_scp是发送的语音句子集合,port为服务端口。 + +* 结果: +client的log中可以看到如下类似的结果 +``` +0513 10:58:13.827821 41768 recognizer_test_main.cc:56] wav len (sample): 70208 +I0513 10:58:13.884493 41768 feature_cache.h:52] set finished +I0513 10:58:24.247171 41768 paddle_nnet.h:76] Tensor neml: 10240 +I0513 10:58:24.247249 41768 paddle_nnet.h:76] Tensor neml: 10240 +LOG ([5.5.544~2-f21d7]:main():decoder/recognizer_test_main.cc:90) the result of case_10 is 五月十二日二十二点三十六分加班打车回家四十一元 +``` From 53932eea88021f1ed67fd0c3c5ef2484b22cee27 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Fri, 13 May 2022 19:53:20 +0800 Subject: [PATCH 69/93] test=doc --- docs/source/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 7741f17f..fc1649eb 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -54,6 +54,7 @@ Contents :caption: Demos demo_video + streaming_asr_demo_video tts_demo_video streaming_tts_demo_video From 8126ae726b22a17b7703b49302c5989d5bc9f95d Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Fri, 13 May 2022 20:03:24 +0800 Subject: [PATCH 70/93] add docker pull --- demos/custom_streaming_asr/README.md | 2 ++ demos/custom_streaming_asr/README_cn.md | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/demos/custom_streaming_asr/README.md b/demos/custom_streaming_asr/README.md index 550682df..5d94856f 100644 --- a/demos/custom_streaming_asr/README.md +++ b/demos/custom_streaming_asr/README.md @@ -20,6 +20,8 @@ this demo is customized for expense account, which need to recognize rare addres ### 1. 
Installation install paddle:2.2.2 docker. ``` +sudo nvidia-docker pull registry.baidubce.com/paddlepaddle/paddle:2.2.2 + sudo nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/paddle --name=paddle_demo_docker registry.baidubce.com/paddlepaddle/paddle:2.2.2 /bin/bash ``` diff --git a/demos/custom_streaming_asr/README_cn.md b/demos/custom_streaming_asr/README_cn.md index e0b95c3f..209b882e 100644 --- a/demos/custom_streaming_asr/README_cn.md +++ b/demos/custom_streaming_asr/README_cn.md @@ -2,17 +2,25 @@ # 定制化语音识别演示 ## 介绍 -定制化的语音识别是满足一些特定场景的语句识别的技术。 +在一些场景中,识别系统需要高精度的识别一些稀有词,例如导航软件中地名识别。而通过定制化识别可以满足这一需求。 -可以参见简单的原理教程: -https://aistudio.baidu.com/aistudio/projectdetail/3986429 +这个 demo 是打车报销单的场景识别,需要识别一些稀有的地名,可以通过如下操作实现。 -这个 demo 是打车报销单的场景识别,定制化了地点。 +* G with slot: 打车到 "address_slot"。 +![](https://ai-studio-static-online.cdn.bcebos.com/28d9ef132a7f47a895a65ae9e5c4f55b8f472c9f3dd24be8a2e66e0b88b173a4) + +* 这是address slot wfst, 可以添加一些需要识别的地名. +![](https://ai-studio-static-online.cdn.bcebos.com/47c89100ef8c465bac733605ffc53d76abefba33d62f4d818d351f8cea3c8fe2) + +* 通过replace 操作, G = fstreplace(G_with_slot, address_slot), 最终可以得到定制化的解码图。 +![](https://ai-studio-static-online.cdn.bcebos.com/60a3095293044f10b73039ab10c7950d139a6717580a44a3ba878c6e74de402b) ## 使用方法 ### 1. 
配置环境 安装paddle:2.2.2 docker镜像。 ``` +sudo nvidia-docker pull registry.baidubce.com/paddlepaddle/paddle:2.2.2 + sudo nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/paddle --name=paddle_demo_docker registry.baidubce.com/paddlepaddle/paddle:2.2.2 /bin/bash ``` From 3d81e95ce6a161fdcf921e7bd96ac24580667396 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Fri, 13 May 2022 20:04:02 +0800 Subject: [PATCH 71/93] test=asr --- docs/source/asr/PPASR_cn.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/asr/PPASR_cn.md b/docs/source/asr/PPASR_cn.md index 8cc700b0..1f72f1b9 100644 --- a/docs/source/asr/PPASR_cn.md +++ b/docs/source/asr/PPASR_cn.md @@ -12,7 +12,7 @@ ## 1. 简介 -PPASR 是一个 提供 ASR 功能的工具。其提供了多种中文和英文的模型,支持模型的训练,并且支持使用命令行的方式进行模型的推理。 PPASR也支持流式模型的部署,以及个性化场景的部署。 +PP-ASR 是一个 提供 ASR 功能的工具。其提供了多种中文和英文的模型,支持模型的训练,并且支持使用命令行的方式进行模型的推理。 PP-ASR也支持流式模型的部署,以及个性化场景的部署。 ## 2. 特点 @@ -20,9 +20,9 @@ PPASR 是一个 提供 ASR 功能的工具。其提供了多种中文和英文
-PPASR 的主要特点如下: -- 提供在中英文开源数据集 aishell (中文),wenetspeech(中文),librispeech (英文)上的预训练模型。模型包含 deepspeech2 模型以及 conformer/transformer 模型。 -- 支持中英文的模型训练功能。 +PP-ASR 的主要特点如下: +- 提供在中/英文开源数据集 aishell (中文),wenetspeech(中文),librispeech (英文)上的预训练模型。模型包含 deepspeech2 模型以及 conformer/transformer 模型。 +- 支持中/英文的模型训练功能。 - 支持命令行方式的模型推理, `paddlespeech asr --input xxx.wav` 方式调用各个预训练模型进行推理。 - 支持流式 ASR 的服务部署,也支持输出时间戳。 - 支持个性化场景的部署。 From 4b14c7ee342fab03dba3648999bfe92141bc475f Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Fri, 13 May 2022 20:11:03 +0800 Subject: [PATCH 72/93] test=doc --- docs/source/streaming_asr_demo_video.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/source/streaming_asr_demo_video.rst b/docs/source/streaming_asr_demo_video.rst index 50c7689f..6c96fea0 100644 --- a/docs/source/streaming_asr_demo_video.rst +++ b/docs/source/streaming_asr_demo_video.rst @@ -5,7 +5,6 @@ Streaming ASR Demo Video From 06a01b03a2555d8d34664b335a71ab4d39c3d597 Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Sat, 14 May 2022 12:23:15 +0800 Subject: [PATCH 73/93] format --- demos/custom_streaming_asr/README.md | 12 ++++++------ demos/custom_streaming_asr/README_cn.md | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/demos/custom_streaming_asr/README.md b/demos/custom_streaming_asr/README.md index 5d94856f..40f696dd 100644 --- a/demos/custom_streaming_asr/README.md +++ b/demos/custom_streaming_asr/README.md @@ -20,9 +20,9 @@ this demo is customized for expense account, which need to recognize rare addres ### 1. Installation install paddle:2.2.2 docker. 
``` -sudo nvidia-docker pull registry.baidubce.com/paddlepaddle/paddle:2.2.2 +sudo docker pull registry.baidubce.com/paddlepaddle/paddle:2.2.2 -sudo nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/paddle --name=paddle_demo_docker registry.baidubce.com/paddlepaddle/paddle:2.2.2 /bin/bash +sudo docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/paddle --name=paddle_demo_docker registry.baidubce.com/paddlepaddle/paddle:2.2.2 /bin/bash ``` ### 2. demo @@ -30,16 +30,16 @@ sudo nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/pad ``` bash websocket_server.sh ``` -this script run in two steps: -1. download the resources.tar.gz, those direcotries will be found in resource directory. +this script run in two steps: +1. download the resources.tar.gz, those direcotries will be found in resource directory. model: acustic model graph: the decoder graph (TLG.fst) lib: some libs bin: binary data: audio and wav.scp -2. websocket_server_main launch the service. -some params: +2. websocket_server_main launch the service. +some params: port: the service port graph_path: the decoder graph path model_path: acustic model path diff --git a/demos/custom_streaming_asr/README_cn.md b/demos/custom_streaming_asr/README_cn.md index 209b882e..5a0d1c30 100644 --- a/demos/custom_streaming_asr/README_cn.md +++ b/demos/custom_streaming_asr/README_cn.md @@ -30,7 +30,7 @@ sudo nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/pad bash websocket_server.sh ``` 上面脚本完成了如下两个功能: -1. 完成resource.tar.gz下载,解压后,会在resource中发现如下目录: +1. 
完成resource.tar.gz下载,解压后,会在resource中发现如下目录: model: 声学模型 graph: 解码构图 lib: 相关库 From 86de61250e189e206ea8ed4c242e73aee104a506 Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Sat, 14 May 2022 12:27:04 +0800 Subject: [PATCH 74/93] format --- demos/custom_streaming_asr/README.md | 7 ++++--- demos/custom_streaming_asr/README_cn.md | 11 ++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/demos/custom_streaming_asr/README.md b/demos/custom_streaming_asr/README.md index 40f696dd..b4777c2b 100644 --- a/demos/custom_streaming_asr/README.md +++ b/demos/custom_streaming_asr/README.md @@ -7,13 +7,13 @@ In some cases, we need to recognize the specific rare words with high accuracy. this demo is customized for expense account, which need to recognize rare address. -* G with slot: 打车到 "address_slot"。 +* G with slot: 打车到 "address_slot"。 ![](https://ai-studio-static-online.cdn.bcebos.com/28d9ef132a7f47a895a65ae9e5c4f55b8f472c9f3dd24be8a2e66e0b88b173a4) -* this is address slot wfst, you can add the address which want to recognize. +* this is address slot wfst, you can add the address which want to recognize. ![](https://ai-studio-static-online.cdn.bcebos.com/47c89100ef8c465bac733605ffc53d76abefba33d62f4d818d351f8cea3c8fe2) -* after replace operation, G = fstreplace(G_with_slot, address_slot), we will get the customized graph. +* after replace operation, G = fstreplace(G_with_slot, address_slot), we will get the customized graph. ![](https://ai-studio-static-online.cdn.bcebos.com/60a3095293044f10b73039ab10c7950d139a6717580a44a3ba878c6e74de402b) ## Usage @@ -28,6 +28,7 @@ sudo docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/paddle --n ### 2. demo * run websocket_server.sh. This script will download resources and libs, and launch the service. 
``` +cd /paddle bash websocket_server.sh ``` this script run in two steps: diff --git a/demos/custom_streaming_asr/README_cn.md b/demos/custom_streaming_asr/README_cn.md index 5a0d1c30..82f59e91 100644 --- a/demos/custom_streaming_asr/README_cn.md +++ b/demos/custom_streaming_asr/README_cn.md @@ -6,27 +6,28 @@ 这个 demo 是打车报销单的场景识别,需要识别一些稀有的地名,可以通过如下操作实现。 -* G with slot: 打车到 "address_slot"。 +* G with slot: 打车到 "address_slot"。 ![](https://ai-studio-static-online.cdn.bcebos.com/28d9ef132a7f47a895a65ae9e5c4f55b8f472c9f3dd24be8a2e66e0b88b173a4) -* 这是address slot wfst, 可以添加一些需要识别的地名. +* 这是address slot wfst, 可以添加一些需要识别的地名. ![](https://ai-studio-static-online.cdn.bcebos.com/47c89100ef8c465bac733605ffc53d76abefba33d62f4d818d351f8cea3c8fe2) -* 通过replace 操作, G = fstreplace(G_with_slot, address_slot), 最终可以得到定制化的解码图。 +* 通过replace 操作, G = fstreplace(G_with_slot, address_slot), 最终可以得到定制化的解码图。 ![](https://ai-studio-static-online.cdn.bcebos.com/60a3095293044f10b73039ab10c7950d139a6717580a44a3ba878c6e74de402b) ## 使用方法 ### 1. 配置环境 安装paddle:2.2.2 docker镜像。 ``` -sudo nvidia-docker pull registry.baidubce.com/paddlepaddle/paddle:2.2.2 +sudo docker pull registry.baidubce.com/paddlepaddle/paddle:2.2.2 -sudo nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/paddle --name=paddle_demo_docker registry.baidubce.com/paddlepaddle/paddle:2.2.2 /bin/bash +sudo docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/paddle --name=paddle_demo_docker registry.baidubce.com/paddlepaddle/paddle:2.2.2 /bin/bash ``` ### 2. 
演示 * 运行如下命令,完成相关资源和库的下载和服务启动。 ``` +cd /paddle bash websocket_server.sh ``` 上面脚本完成了如下两个功能: From b917b1a7f28fe5ff1b210a41b411e0d11d738d98 Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Sat, 14 May 2022 12:28:22 +0800 Subject: [PATCH 75/93] format --- demos/custom_streaming_asr/README.md | 4 ++-- demos/custom_streaming_asr/README_cn.md | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/demos/custom_streaming_asr/README.md b/demos/custom_streaming_asr/README.md index b4777c2b..aa28d502 100644 --- a/demos/custom_streaming_asr/README.md +++ b/demos/custom_streaming_asr/README.md @@ -33,11 +33,11 @@ bash websocket_server.sh ``` this script run in two steps: 1. download the resources.tar.gz, those direcotries will be found in resource directory. -model: acustic model +model: acustic model graph: the decoder graph (TLG.fst) lib: some libs bin: binary -data: audio and wav.scp +data: audio and wav.scp 2. websocket_server_main launch the service. some params: diff --git a/demos/custom_streaming_asr/README_cn.md b/demos/custom_streaming_asr/README_cn.md index 82f59e91..8aa5cc1e 100644 --- a/demos/custom_streaming_asr/README_cn.md +++ b/demos/custom_streaming_asr/README_cn.md @@ -32,11 +32,11 @@ bash websocket_server.sh ``` 上面脚本完成了如下两个功能: 1. 完成resource.tar.gz下载,解压后,会在resource中发现如下目录: -model: 声学模型 -graph: 解码构图 -lib: 相关库 -bin: 运行程序 -data: 语音数据 +model: 声学模型 +graph: 解码构图 +lib: 相关库 +bin: 运行程序 +data: 语音数据 2. 
通过websocket_server_main来启动服务。 这里简单的介绍几个参数: From d40dd7bc31859e26d8c1b5fe2729133af4b63030 Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Sat, 14 May 2022 12:29:04 +0800 Subject: [PATCH 76/93] format --- demos/custom_streaming_asr/README_cn.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/demos/custom_streaming_asr/README_cn.md b/demos/custom_streaming_asr/README_cn.md index 8aa5cc1e..11a72484 100644 --- a/demos/custom_streaming_asr/README_cn.md +++ b/demos/custom_streaming_asr/README_cn.md @@ -42,10 +42,10 @@ data: 语音数据 这里简单的介绍几个参数: port是服务端口, graph_path用来指定解码图文件, -model相关参数用来指定声学模型文件。 -其他参数说明可参见代码: -PaddleSpeech/speechx/speechx/decoder/param.h -PaddleSpeech/speechx/examples/ds2_ol/websocket/websocket_server_main.cc +model相关参数用来指定声学模型文件。 +其他参数说明可参见代码: +PaddleSpeech/speechx/speechx/decoder/param.h +PaddleSpeech/speechx/examples/ds2_ol/websocket/websocket_server_main.cc * 在另一个终端中, 通过client发送数据,得到结果。运行如下命令: ``` From 226823e57a33d11c1e81adcd795a2b30d5854a7a Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Sat, 14 May 2022 12:30:53 +0800 Subject: [PATCH 77/93] format --- demos/custom_streaming_asr/README_cn.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/demos/custom_streaming_asr/README_cn.md b/demos/custom_streaming_asr/README_cn.md index 11a72484..51f3387d 100644 --- a/demos/custom_streaming_asr/README_cn.md +++ b/demos/custom_streaming_asr/README_cn.md @@ -39,10 +39,14 @@ bin: 运行程序 data: 语音数据 2. 
通过websocket_server_main来启动服务。 -这里简单的介绍几个参数: -port是服务端口, -graph_path用来指定解码图文件, +这里简单的介绍几个参数: + +port是服务端口, + +graph_path用来指定解码图文件, + model相关参数用来指定声学模型文件。 + 其他参数说明可参见代码: PaddleSpeech/speechx/speechx/decoder/param.h PaddleSpeech/speechx/examples/ds2_ol/websocket/websocket_server_main.cc From fd7822493718df29aa316a91e0819bdb227eb600 Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Sat, 14 May 2022 12:31:49 +0800 Subject: [PATCH 78/93] format --- demos/custom_streaming_asr/README_cn.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/demos/custom_streaming_asr/README_cn.md b/demos/custom_streaming_asr/README_cn.md index 51f3387d..55e3b88e 100644 --- a/demos/custom_streaming_asr/README_cn.md +++ b/demos/custom_streaming_asr/README_cn.md @@ -40,11 +40,8 @@ data: 语音数据 2. 通过websocket_server_main来启动服务。 这里简单的介绍几个参数: - port是服务端口, - graph_path用来指定解码图文件, - model相关参数用来指定声学模型文件。 其他参数说明可参见代码: From 92b20aa327937198e6d695940ac87032981c885d Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Sat, 14 May 2022 12:53:38 +0800 Subject: [PATCH 79/93] format --- demos/custom_streaming_asr/README_cn.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/demos/custom_streaming_asr/README_cn.md b/demos/custom_streaming_asr/README_cn.md index 55e3b88e..cd8893af 100644 --- a/demos/custom_streaming_asr/README_cn.md +++ b/demos/custom_streaming_asr/README_cn.md @@ -41,9 +41,7 @@ data: 语音数据 2. 
通过websocket_server_main来启动服务。 这里简单的介绍几个参数: port是服务端口, -graph_path用来指定解码图文件, -model相关参数用来指定声学模型文件。 - +graph_path用来指定解码图文件, 其他参数说明可参见代码: PaddleSpeech/speechx/speechx/decoder/param.h PaddleSpeech/speechx/examples/ds2_ol/websocket/websocket_server_main.cc From dbbbd2f9f3a2ada15142caf35bd1bdd9e1e8a2de Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Sat, 14 May 2022 13:17:13 +0800 Subject: [PATCH 80/93] test=doc --- demos/streaming_asr_server/README_cn.md | 100 +++++++++++------------- 1 file changed, 46 insertions(+), 54 deletions(-) diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index 1660cef7..7aba3de4 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -19,11 +19,11 @@ 流式ASR的服务启动脚本和服务测试脚本存放在 `PaddleSpeech/demos/streaming_asr_server` 目录。 下载好 `PaddleSpeech` 之后,进入到 `PaddleSpeech/demos/streaming_asr_server` 目录。 -配置文件可参见该目录下 `conf/ws_application.yaml` 和 `conf/ws_conformer_application.yaml` 。 +配置文件可参见该目录下 `conf/ws_application.yaml` 和 `conf/ws_conformer_wenetspeech_application.yaml` 。 目前服务集成的模型有: DeepSpeech2和 conformer模型,对应的配置文件如下: * DeepSpeech: `conf/ws_application.yaml` -* conformer: `conf/ws_conformer_application.yaml` +* conformer: `conf/ws_conformer_wenetspeech_application.yaml` @@ -39,7 +39,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav **注意:** 默认部署在 `cpu` 设备上,可以通过修改服务配置文件中 `device` 参数部署在 `gpu` 上。 ```bash # 在 PaddleSpeech/demos/streaming_asr_server 目录启动服务 - paddlespeech_server start --config_file ./conf/ws_conformer_application.yaml + paddlespeech_server start --config_file ./conf/ws_conformer_wenetspeech_application.yaml ``` 使用方法: @@ -53,31 +53,27 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav 输出: ```bash - [2022-04-21 15:52:18,126] [ INFO] - create the online asr engine instance - [2022-04-21 15:52:18,127] [ INFO] - paddlespeech_server set the device: cpu - [2022-04-21 15:52:18,128] 
[ INFO] - Load the pretrained model, tag = conformer_online_multicn-zh-16k - [2022-04-21 15:52:18,128] [ INFO] - File /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz md5 checking... - [2022-04-21 15:52:18,727] [ INFO] - Use pretrained model stored in: /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k - [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k - [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/model.yaml - [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/exp/chunk_conformer/checkpoints/multi_cn.pdparams - [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/exp/chunk_conformer/checkpoints/multi_cn.pdparams - [2022-04-21 15:52:19,446] [ INFO] - start to create the stream conformer asr engine - [2022-04-21 15:52:19,473] [ INFO] - model name: conformer_online - [2022-04-21 15:52:21,731] [ INFO] - create the transformer like model success - [2022-04-21 15:52:21,733] [ INFO] - Initialize ASR server engine successfully. - INFO: Started server process [11173] - [2022-04-21 15:52:21] [INFO] [server.py:75] Started server process [11173] - INFO: Waiting for application startup. - [2022-04-21 15:52:21] [INFO] [on.py:45] Waiting for application startup. - INFO: Application startup complete. - [2022-04-21 15:52:21] [INFO] [on.py:59] Application startup complete. - /home/users/xiongxinlei/.conda/envs/paddlespeech/lib/python3.9/asyncio/base_events.py:1460: DeprecationWarning: The loop argument is deprecated since Python 3.8, and scheduled for removal in Python 3.10. 
- infos = await tasks.gather(*fs, loop=self) - /home/users/xiongxinlei/.conda/envs/paddlespeech/lib/python3.9/asyncio/base_events.py:1518: DeprecationWarning: The loop argument is deprecated since Python 3.8, and scheduled for removal in Python 3.10. - await tasks.sleep(0, loop=self) - INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-04-21 15:52:21] [INFO] [server.py:206] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + [2022-05-14 04:56:13,086] [ INFO] - create the online asr engine instance + [2022-05-14 04:56:13,086] [ INFO] - paddlespeech_server set the device: cpu + [2022-05-14 04:56:13,087] [ INFO] - Load the pretrained model, tag = conformer_online_wenetspeech-zh-16k + [2022-05-14 04:56:13,087] [ INFO] - File /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz md5 checking... + [2022-05-14 04:56:17,542] [ INFO] - Use pretrained model stored in: /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1. 
0.0a.model.tar + [2022-05-14 04:56:17,543] [ INFO] - /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar + [2022-05-14 04:56:17,543] [ INFO] - /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar/model.yaml + [2022-05-14 04:56:17,543] [ INFO] - /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar/exp/ chunk_conformer/checkpoints/avg_10.pdparams + [2022-05-14 04:56:17,543] [ INFO] - /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar/exp/ chunk_conformer/checkpoints/avg_10.pdparams + [2022-05-14 04:56:17,852] [ INFO] - start to create the stream conformer asr engine + [2022-05-14 04:56:17,863] [ INFO] - model name: conformer_online + [2022-05-14 04:56:22,756] [ INFO] - create the transformer like model success + [2022-05-14 04:56:22,758] [ INFO] - Initialize ASR server engine successfully. + INFO: Started server process [4242] + [2022-05-14 04:56:22] [INFO] [server.py:75] Started server process [4242] + INFO: Waiting for application startup. + [2022-05-14 04:56:22] [INFO] [on.py:45] Waiting for application startup. + INFO: Application startup complete. + [2022-05-14 04:56:22] [INFO] [on.py:59] Application startup complete. 
+ INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + [2022-05-14 04:56:22] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8091 (Press CTRL+C to quit) ``` - Python API @@ -88,37 +84,33 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav server_executor = ServerExecutor() server_executor( - config_file="./conf/ws_conformer_application.yaml", + config_file="./conf/ws_conformer_wenetspeech_application.yaml", log_file="./log/paddlespeech.log") ``` 输出: ```bash - [2022-04-21 15:52:18,126] [ INFO] - create the online asr engine instance - [2022-04-21 15:52:18,127] [ INFO] - paddlespeech_server set the device: cpu - [2022-04-21 15:52:18,128] [ INFO] - Load the pretrained model, tag = conformer_online_multicn-zh-16k - [2022-04-21 15:52:18,128] [ INFO] - File /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz md5 checking... - [2022-04-21 15:52:18,727] [ INFO] - Use pretrained model stored in: /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k - [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k - [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/model.yaml - [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/exp/chunk_conformer/checkpoints/multi_cn.pdparams - [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/exp/chunk_conformer/checkpoints/multi_cn.pdparams - [2022-04-21 15:52:19,446] [ INFO] - start to create the stream conformer asr engine - [2022-04-21 15:52:19,473] [ INFO] - model name: conformer_online - [2022-04-21 15:52:21,731] [ INFO] - create the transformer like model success - [2022-04-21 15:52:21,733] [ INFO] - Initialize ASR server engine successfully. 
- INFO: Started server process [11173] - [2022-04-21 15:52:21] [INFO] [server.py:75] Started server process [11173] - INFO: Waiting for application startup. - [2022-04-21 15:52:21] [INFO] [on.py:45] Waiting for application startup. - INFO: Application startup complete. - [2022-04-21 15:52:21] [INFO] [on.py:59] Application startup complete. - /home/users/xiongxinlei/.conda/envs/paddlespeech/lib/python3.9/asyncio/base_events.py:1460: DeprecationWarning: The loop argument is deprecated since Python 3.8, and scheduled for removal in Python 3.10. - infos = await tasks.gather(*fs, loop=self) - /home/users/xiongxinlei/.conda/envs/paddlespeech/lib/python3.9/asyncio/base_events.py:1518: DeprecationWarning: The loop argument is deprecated since Python 3.8, and scheduled for removal in Python 3.10. - await tasks.sleep(0, loop=self) - INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-04-21 15:52:21] [INFO] [server.py:206] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + [2022-05-14 04:56:13,086] [ INFO] - create the online asr engine instance + [2022-05-14 04:56:13,086] [ INFO] - paddlespeech_server set the device: cpu + [2022-05-14 04:56:13,087] [ INFO] - Load the pretrained model, tag = conformer_online_wenetspeech-zh-16k + [2022-05-14 04:56:13,087] [ INFO] - File /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz md5 checking... + [2022-05-14 04:56:17,542] [ INFO] - Use pretrained model stored in: /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1. 
0.0a.model.tar + [2022-05-14 04:56:17,543] [ INFO] - /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar + [2022-05-14 04:56:17,543] [ INFO] - /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar/model.yaml + [2022-05-14 04:56:17,543] [ INFO] - /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar/exp/ chunk_conformer/checkpoints/avg_10.pdparams + [2022-05-14 04:56:17,543] [ INFO] - /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar/exp/ chunk_conformer/checkpoints/avg_10.pdparams + [2022-05-14 04:56:17,852] [ INFO] - start to create the stream conformer asr engine + [2022-05-14 04:56:17,863] [ INFO] - model name: conformer_online + [2022-05-14 04:56:22,756] [ INFO] - create the transformer like model success + [2022-05-14 04:56:22,758] [ INFO] - Initialize ASR server engine successfully. + INFO: Started server process [4242] + [2022-05-14 04:56:22] [INFO] [server.py:75] Started server process [4242] + INFO: Waiting for application startup. + [2022-05-14 04:56:22] [INFO] [on.py:45] Waiting for application startup. + INFO: Application startup complete. + [2022-05-14 04:56:22] [INFO] [on.py:59] Application startup complete. + INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + [2022-05-14 04:56:22] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8091 (Press CTRL+C to quit) ``` ### 4. 
ASR 客户端使用方法 From 0a862b2458fd8d05cc7def358c5af4218a9c3921 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Sat, 14 May 2022 13:20:32 +0800 Subject: [PATCH 81/93] test=doc --- demos/streaming_asr_server/README.md | 98 +++++++++++++--------------- 1 file changed, 45 insertions(+), 53 deletions(-) diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index cfc89f7b..86b8a973 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -15,7 +15,7 @@ It is recommended to use **paddlepaddle 2.2.1** or above. You can choose one way from meduim and hard to install paddlespeech. ### 2. Prepare config File -The configuration file can be found in `conf/ws_application.yaml` 和 `conf/ws_conformer_application.yaml`. +The configuration file can be found in `conf/ws_application.yaml` and `conf/ws_conformer_wenetspeech_application.yaml`. At present, the speech tasks integrated by the model include: DeepSpeech2 and conformer. @@ -32,7 +32,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav **Note:** The default deployment of the server is on the 'CPU' device, which can be deployed on the 'GPU' by modifying the 'device' parameter in the service configuration file. 
```bash # in PaddleSpeech/demos/streaming_asr_server start the service - paddlespeech_server start --config_file ./conf/ws_conformer_application.yaml + paddlespeech_server start --config_file ./conf/ws_conformer_wenetspeech_application.yaml ``` Usage: @@ -46,31 +46,27 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav Output: ```bash - [2022-04-21 15:52:18,126] [ INFO] - create the online asr engine instance - [2022-04-21 15:52:18,127] [ INFO] - paddlespeech_server set the device: cpu - [2022-04-21 15:52:18,128] [ INFO] - Load the pretrained model, tag = conformer_online_multicn-zh-16k - [2022-04-21 15:52:18,128] [ INFO] - File /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz md5 checking... - [2022-04-21 15:52:18,727] [ INFO] - Use pretrained model stored in: /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k - [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k - [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/model.yaml - [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/exp/chunk_conformer/checkpoints/multi_cn.pdparams - [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/exp/chunk_conformer/checkpoints/multi_cn.pdparams - [2022-04-21 15:52:19,446] [ INFO] - start to create the stream conformer asr engine - [2022-04-21 15:52:19,473] [ INFO] - model name: conformer_online - [2022-04-21 15:52:21,731] [ INFO] - create the transformer like model success - [2022-04-21 15:52:21,733] [ INFO] - Initialize ASR server engine successfully. - INFO: Started server process [11173] - [2022-04-21 15:52:21] [INFO] [server.py:75] Started server process [11173] - INFO: Waiting for application startup. 
- [2022-04-21 15:52:21] [INFO] [on.py:45] Waiting for application startup. - INFO: Application startup complete. - [2022-04-21 15:52:21] [INFO] [on.py:59] Application startup complete. - /home/users/xiongxinlei/.conda/envs/paddlespeech/lib/python3.9/asyncio/base_events.py:1460: DeprecationWarning: The loop argument is deprecated since Python 3.8, and scheduled for removal in Python 3.10. - infos = await tasks.gather(*fs, loop=self) - /home/users/xiongxinlei/.conda/envs/paddlespeech/lib/python3.9/asyncio/base_events.py:1518: DeprecationWarning: The loop argument is deprecated since Python 3.8, and scheduled for removal in Python 3.10. - await tasks.sleep(0, loop=self) - INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-04-21 15:52:21] [INFO] [server.py:206] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + [2022-05-14 04:56:13,086] [ INFO] - create the online asr engine instance + [2022-05-14 04:56:13,086] [ INFO] - paddlespeech_server set the device: cpu + [2022-05-14 04:56:13,087] [ INFO] - Load the pretrained model, tag = conformer_online_wenetspeech-zh-16k + [2022-05-14 04:56:13,087] [ INFO] - File /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz md5 checking... + [2022-05-14 04:56:17,542] [ INFO] - Use pretrained model stored in: /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1. 
0.0a.model.tar + [2022-05-14 04:56:17,543] [ INFO] - /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar + [2022-05-14 04:56:17,543] [ INFO] - /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar/model.yaml + [2022-05-14 04:56:17,543] [ INFO] - /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar/exp/ chunk_conformer/checkpoints/avg_10.pdparams + [2022-05-14 04:56:17,543] [ INFO] - /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar/exp/ chunk_conformer/checkpoints/avg_10.pdparams + [2022-05-14 04:56:17,852] [ INFO] - start to create the stream conformer asr engine + [2022-05-14 04:56:17,863] [ INFO] - model name: conformer_online + [2022-05-14 04:56:22,756] [ INFO] - create the transformer like model success + [2022-05-14 04:56:22,758] [ INFO] - Initialize ASR server engine successfully. + INFO: Started server process [4242] + [2022-05-14 04:56:22] [INFO] [server.py:75] Started server process [4242] + INFO: Waiting for application startup. + [2022-05-14 04:56:22] [INFO] [on.py:45] Waiting for application startup. + INFO: Application startup complete. + [2022-05-14 04:56:22] [INFO] [on.py:59] Application startup complete. 
+ INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + [2022-05-14 04:56:22] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8091 (Press CTRL+C to quit) ``` - Python API @@ -81,37 +77,33 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav server_executor = ServerExecutor() server_executor( - config_file="./conf/ws_conformer_application.yaml", + config_file="./conf/ws_conformer_wenetspeech_application.yaml", log_file="./log/paddlespeech.log") ``` Output: ```bash - [2022-04-21 15:52:18,126] [ INFO] - create the online asr engine instance - [2022-04-21 15:52:18,127] [ INFO] - paddlespeech_server set the device: cpu - [2022-04-21 15:52:18,128] [ INFO] - Load the pretrained model, tag = conformer_online_multicn-zh-16k - [2022-04-21 15:52:18,128] [ INFO] - File /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz md5 checking... - [2022-04-21 15:52:18,727] [ INFO] - Use pretrained model stored in: /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k - [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k - [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/model.yaml - [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/exp/chunk_conformer/checkpoints/multi_cn.pdparams - [2022-04-21 15:52:18,727] [ INFO] - /home/users/xiongxinlei/.paddlespeech/models/conformer_online_multicn-zh-16k/exp/chunk_conformer/checkpoints/multi_cn.pdparams - [2022-04-21 15:52:19,446] [ INFO] - start to create the stream conformer asr engine - [2022-04-21 15:52:19,473] [ INFO] - model name: conformer_online - [2022-04-21 15:52:21,731] [ INFO] - create the transformer like model success - [2022-04-21 15:52:21,733] [ INFO] - Initialize ASR server engine successfully. 
- INFO: Started server process [11173] - [2022-04-21 15:52:21] [INFO] [server.py:75] Started server process [11173] - INFO: Waiting for application startup. - [2022-04-21 15:52:21] [INFO] [on.py:45] Waiting for application startup. - INFO: Application startup complete. - [2022-04-21 15:52:21] [INFO] [on.py:59] Application startup complete. - /home/users/xiongxinlei/.conda/envs/paddlespeech/lib/python3.9/asyncio/base_events.py:1460: DeprecationWarning: The loop argument is deprecated since Python 3.8, and scheduled for removal in Python 3.10. - infos = await tasks.gather(*fs, loop=self) - /home/users/xiongxinlei/.conda/envs/paddlespeech/lib/python3.9/asyncio/base_events.py:1518: DeprecationWarning: The loop argument is deprecated since Python 3.8, and scheduled for removal in Python 3.10. - await tasks.sleep(0, loop=self) - INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-04-21 15:52:21] [INFO] [server.py:206] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + [2022-05-14 04:56:13,086] [ INFO] - create the online asr engine instance + [2022-05-14 04:56:13,086] [ INFO] - paddlespeech_server set the device: cpu + [2022-05-14 04:56:13,087] [ INFO] - Load the pretrained model, tag = conformer_online_wenetspeech-zh-16k + [2022-05-14 04:56:13,087] [ INFO] - File /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz md5 checking... + [2022-05-14 04:56:17,542] [ INFO] - Use pretrained model stored in: /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1. 
0.0a.model.tar + [2022-05-14 04:56:17,543] [ INFO] - /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar + [2022-05-14 04:56:17,543] [ INFO] - /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar/model.yaml + [2022-05-14 04:56:17,543] [ INFO] - /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar/exp/ chunk_conformer/checkpoints/avg_10.pdparams + [2022-05-14 04:56:17,543] [ INFO] - /root/.paddlespeech/models/conformer_online_wenetspeech-zh-16k/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar/exp/ chunk_conformer/checkpoints/avg_10.pdparams + [2022-05-14 04:56:17,852] [ INFO] - start to create the stream conformer asr engine + [2022-05-14 04:56:17,863] [ INFO] - model name: conformer_online + [2022-05-14 04:56:22,756] [ INFO] - create the transformer like model success + [2022-05-14 04:56:22,758] [ INFO] - Initialize ASR server engine successfully. + INFO: Started server process [4242] + [2022-05-14 04:56:22] [INFO] [server.py:75] Started server process [4242] + INFO: Waiting for application startup. + [2022-05-14 04:56:22] [INFO] [on.py:45] Waiting for application startup. + INFO: Application startup complete. + [2022-05-14 04:56:22] [INFO] [on.py:59] Application startup complete. 
+ INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + [2022-05-14 04:56:22] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8091 (Press CTRL+C to quit) ``` From a695c2cd369305e6c27d9e7b181f2162fa97a838 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Sat, 14 May 2022 13:22:16 +0800 Subject: [PATCH 82/93] test=doc --- demos/streaming_asr_server/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index 86b8a973..4dd534f7 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -66,7 +66,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav INFO: Application startup complete. [2022-05-14 04:56:22] [INFO] [on.py:59] Application startup complete. INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-05-14 04:56:22] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8091 (Press CTRL+C to quit) + [2022-05-14 04:56:22] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) ``` - Python API @@ -103,7 +103,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav INFO: Application startup complete. [2022-05-14 04:56:22] [INFO] [on.py:59] Application startup complete. 
INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-05-14 04:56:22] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8091 (Press CTRL+C to quit) + [2022-05-14 04:56:22] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) ``` From e7cb0365dd7773f4e994f3bf3c64d624ea6c5c0d Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Sat, 14 May 2022 13:23:09 +0800 Subject: [PATCH 83/93] test=doc --- demos/streaming_asr_server/README_cn.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index 7aba3de4..72ea9377 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -73,7 +73,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav INFO: Application startup complete. [2022-05-14 04:56:22] [INFO] [on.py:59] Application startup complete. INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-05-14 04:56:22] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8091 (Press CTRL+C to quit) + [2022-05-14 04:56:22] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) ``` - Python API @@ -110,7 +110,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav INFO: Application startup complete. [2022-05-14 04:56:22] [INFO] [on.py:59] Application startup complete. INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-05-14 04:56:22] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8091 (Press CTRL+C to quit) + [2022-05-14 04:56:22] [INFO] [server.py:211] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) ``` ### 4. 
ASR 客户端使用方法 From 67939d0d6691f7be48e496ddfb92c19bffd8c39a Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Sat, 14 May 2022 12:52:35 +0800 Subject: [PATCH 84/93] add check asr server model type, test=doc --- .../conf/application.yaml | 5 ++- .../server/engine/asr/online/asr_engine.py | 40 +++++++++++++------ 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/demos/streaming_asr_server/conf/application.yaml b/demos/streaming_asr_server/conf/application.yaml index f576d704..e9a89c19 100644 --- a/demos/streaming_asr_server/conf/application.yaml +++ b/demos/streaming_asr_server/conf/application.yaml @@ -29,7 +29,8 @@ asr_online: cfg_path: decode_method: force_yes: True - device: cpu # cpu or gpu:id + device: 'cpu' # cpu or gpu:id + decode_method: "attention_rescoring" am_predictor_conf: device: # set 'gpu:id' or 'cpu' switch_ir_optim: True @@ -42,4 +43,4 @@ asr_online: window_ms: 25 # ms shift_ms: 10 # ms sample_rate: 16000 - sample_width: 2 \ No newline at end of file + sample_width: 2 diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 79b0ddb7..6280093f 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -13,6 +13,7 @@ # limitations under the License. import copy import os +import sys from typing import Optional import numpy as np @@ -588,7 +589,7 @@ class ASRServerExecutor(ASRExecutor): self.pretrained_models = pretrained_models def _init_from_path(self, - model_type: str='deepspeech2online_aishell', + model_type: str=None, am_model: Optional[os.PathLike]=None, am_params: Optional[os.PathLike]=None, lang: str='zh', @@ -599,6 +600,12 @@ class ASRServerExecutor(ASRExecutor): """ Init model and other resources from a specific path. 
""" + if not model_type or not lang or not sample_rate: + logger.error( + "The model type or lang or sample rate is None, please input an valid server parameter yaml" + ) + return False + self.model_type = model_type self.sample_rate = sample_rate sample_rate_str = '16k' if sample_rate == 16000 else '8k' @@ -1028,20 +1035,27 @@ class ASREngine(BaseEngine): self.device = paddle.get_device() logger.info(f"paddlespeech_server set the device: {self.device}") paddle.set_device(self.device) - except BaseException: + except BaseException as e: logger.error( - "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" + f"Set device failed, please check if device '{self.device}' is already used and the parameter 'device' in the yaml file" ) - - self.executor._init_from_path( - model_type=self.config.model_type, - am_model=self.config.am_model, - am_params=self.config.am_params, - lang=self.config.lang, - sample_rate=self.config.sample_rate, - cfg_path=self.config.cfg_path, - decode_method=self.config.decode_method, - am_predictor_conf=self.config.am_predictor_conf) + logger.error( + "If all GPU or XPU is used, you can set the server to 'cpu'") + sys.exit(-1) + + if not self.executor._init_from_path( + model_type=self.config.model_type, + am_model=self.config.am_model, + am_params=self.config.am_params, + lang=self.config.lang, + sample_rate=self.config.sample_rate, + cfg_path=self.config.cfg_path, + decode_method=self.config.decode_method, + am_predictor_conf=self.config.am_predictor_conf): + logger.error( + "Init the ASR server occurs error, please check the server configuration yaml" + ) + return False logger.info("Initialize ASR server engine successfully.") return True From f57fff24fb10609809010bd671f26bf3a6b2812f Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Sat, 14 May 2022 16:20:14 +0800 Subject: [PATCH 85/93] update the init flag, test=doc --- paddlespeech/server/engine/asr/online/asr_engine.py | 2 ++ 1 file changed, 
2 insertions(+) diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 6280093f..fd57a3d5 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -737,6 +737,8 @@ class ASRServerExecutor(ASRExecutor): # update the ctc decoding self.searcher = CTCPrefixBeamSearch(self.config.decode) self.transformer_decode_reset() + + return True def reset_decoder_and_chunk(self): """reset decoder and chunk state for an new audio From 760b60e7cc3789f06015290c5c32bd8dda024db2 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Mon, 16 May 2022 10:55:48 +0800 Subject: [PATCH 86/93] test=doc --- demos/custom_streaming_asr/README_cn.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/demos/custom_streaming_asr/README_cn.md b/demos/custom_streaming_asr/README_cn.md index cd8893af..ffbf682f 100644 --- a/demos/custom_streaming_asr/README_cn.md +++ b/demos/custom_streaming_asr/README_cn.md @@ -9,10 +9,10 @@ * G with slot: 打车到 "address_slot"。 ![](https://ai-studio-static-online.cdn.bcebos.com/28d9ef132a7f47a895a65ae9e5c4f55b8f472c9f3dd24be8a2e66e0b88b173a4) -* 这是address slot wfst, 可以添加一些需要识别的地名. +* 这是 address slot wfst, 可以添加一些需要识别的地名. ![](https://ai-studio-static-online.cdn.bcebos.com/47c89100ef8c465bac733605ffc53d76abefba33d62f4d818d351f8cea3c8fe2) -* 通过replace 操作, G = fstreplace(G_with_slot, address_slot), 最终可以得到定制化的解码图。 +* 通过 replace 操作, G = fstreplace(G_with_slot, address_slot), 最终可以得到定制化的解码图。 ![](https://ai-studio-static-online.cdn.bcebos.com/60a3095293044f10b73039ab10c7950d139a6717580a44a3ba878c6e74de402b) ## 使用方法 @@ -31,29 +31,29 @@ cd /paddle bash websocket_server.sh ``` 上面脚本完成了如下两个功能: -1. 完成resource.tar.gz下载,解压后,会在resource中发现如下目录: +1. 完成 resource.tar.gz 下载,解压后,会在 resource 中发现如下目录: model: 声学模型 graph: 解码构图 lib: 相关库 bin: 运行程序 data: 语音数据 -2. 
通过websocket_server_main来启动服务。 +2. 通过 websocket_server_main 来启动服务。 这里简单的介绍几个参数: -port是服务端口, -graph_path用来指定解码图文件, +port 是服务端口, +graph_path 用来指定解码图文件, 其他参数说明可参见代码: PaddleSpeech/speechx/speechx/decoder/param.h PaddleSpeech/speechx/examples/ds2_ol/websocket/websocket_server_main.cc -* 在另一个终端中, 通过client发送数据,得到结果。运行如下命令: +* 在另一个终端中, 通过 client 发送数据,得到结果。运行如下命令: ``` bash websocket_client.sh ``` -通过websocket_client_main来启动client服务,其中$wav_scp是发送的语音句子集合,port为服务端口。 +通过 websocket_client_main 来启动 client 服务,其中 wav_scp 是发送的语音句子集合,port 为服务端口。 * 结果: -client的log中可以看到如下类似的结果 +client 的 log 中可以看到如下类似的结果 ``` 0513 10:58:13.827821 41768 recognizer_test_main.cc:56] wav len (sample): 70208 I0513 10:58:13.884493 41768 feature_cache.h:52] set finished From 3ae013959b9e06f419fd98a74de54c90ec887c17 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Mon, 16 May 2022 15:55:14 +0800 Subject: [PATCH 87/93] Updata PPASR_cn.md, test=doc --- docs/source/asr/PPASR_cn.md | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/docs/source/asr/PPASR_cn.md b/docs/source/asr/PPASR_cn.md index 1f72f1b9..6f04d104 100644 --- a/docs/source/asr/PPASR_cn.md +++ b/docs/source/asr/PPASR_cn.md @@ -1,3 +1,6 @@ +(简体中文|[English](./PPASR.md)) +# PP-ASR + ## 目录 - [1. 简介](#1) - [2. 特点](#2) @@ -12,7 +15,7 @@ ## 1. 简介 -PP-ASR 是一个 提供 ASR 功能的工具。其提供了多种中文和英文的模型,支持模型的训练,并且支持使用命令行的方式进行模型的推理。 PP-ASR也支持流式模型的部署,以及个性化场景的部署。 +PP-ASR 是一个 提供 ASR 功能的工具。其提供了多种中文和英文的模型,支持模型的训练,并且支持使用命令行的方式进行模型的推理。 PP-ASR 也支持流式模型的部署,以及个性化场景的部署。 ## 2. 
特点 @@ -32,21 +35,23 @@ PP-ASR 的主要特点如下: ## 3.1 预训练模型 -支持的预训练模型列表:[released_model.md](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/released_model.md)。 +支持的预训练模型列表:[released_model](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/released_model.md)。 其中效果较好的模型为 Ds2 Online Wenetspeech ASR0 Model 以及 Conformer Online Wenetspeech ASR1 Model。 两个模型都支持流式 ASR。 - +关于模型设计的部分,可以参考 AIStudio 教程: +- [Deepspeech2](https://aistudio.baidu.com/aistudio/projectdetail/3866807) +- [Transformer](https://aistudio.baidu.com/aistudio/projectdetail/3470110) ## 3.2 模型训练 模型的训练的参考脚本存放在 examples 中,并按照 `examples/数据集/模型` 存放,数据集主要支持 aishell 和 librispeech,模型支持 deepspeech2 模型和 u2 (conformer/transformer) 模型。 -具体的执行脚本的步骤记录在 run.sh 当中。具体可参考[这里](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1) +具体的执行脚本的步骤记录在 run.sh 当中。具体可参考: [asr1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1) ## 3.3 模型推理 -PPASR 支持在使用`pip install paddlespeech`后 使用命令行的方式来使用预训练模型进行推理。 +PP-ASR 支持在使用`pip install paddlespeech`后 使用命令行的方式来使用预训练模型进行推理。 具体支持的功能包括: @@ -54,26 +59,37 @@ PPASR 支持在使用`pip install paddlespeech`后 使用命令行的方式来 - 使用管道的方式对多条音频进行预测 - 支持 RTF 的计算 -具体的使用方式可以参考[这里](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_recognition/README_cn.md) +具体的使用方式可以参考: [speech_recognition](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_recognition/README_cn.md) ## 3.4 服务部署 -PPASR 支持流式ASR的服务部署。支持 语音识别 + 标点处理两个功能同时使用。 +PP-ASR 支持流式ASR的服务部署。支持 语音识别 + 标点处理两个功能同时使用。 -server 的 demo [链接](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/streaming_asr_server) +server 的 demo: [streaming_asr_server](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/streaming_asr_server) ![image](https://user-images.githubusercontent.com/87408988/168255342-1fc790c0-16f4-4540-a861-db239076727c.png) +网页上使用 asr server 
的效果展示:[streaming_asr_demo_video](https://paddlespeech.readthedocs.io/en/latest/streaming_asr_demo_video.html) + +关于服务部署方面的更多资料,可以参考 AIStudio 教程: +- [流式服务-模型部分](https://aistudio.baidu.com/aistudio/projectdetail/3839884) +- [流式服务](https://aistudio.baidu.com/aistudio/projectdetail/4017905) + ## 3.5 支持个性化场景部署 -针对个性化场景部署,提供了 特征提取(fbank) => 推理模型(打分库)=> TLG(WFST, token, lexion, grammer)的 C++ 程序。具体参考[这里](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/speechx) +针对个性化场景部署,提供了特征提取(fbank) => 推理模型(打分库)=> TLG(WFST, token, lexicon, grammar)的 C++ 程序。具体参考 [speechx](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/speechx)。 如果想快速了解和使用,可以参考: [custom_streaming_asr](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/custom_streaming_asr/README_cn.md) + +关于支持个性化场景部署的更多资料,可以参考 AIStudio 教程: +- [定制化识别](https://aistudio.baidu.com/aistudio/projectdetail/4021561) + + ## 4. 快速开始 -关于如果使用 PPASR,可以看这里的[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md),其中提供了 **简单**、**中等**、**困难** 三种安装方式。如果想体验paddlespeech 的推理功能,可以用 **简单** 安装方式。 +关于如何使用 PP-ASR,可以看这里的 [install](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md),其中提供了 **简单**、**中等**、**困难** 三种安装方式。如果想体验 paddlespeech 的推理功能,可以用 **简单** 安装方式。 From 4228de6f75f891318a691d049e82af6d3d2a752b Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Mon, 16 May 2022 18:13:07 +0800 Subject: [PATCH 88/93] test=asr --- docs/source/asr/PPASR.md | 96 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 docs/source/asr/PPASR.md diff --git a/docs/source/asr/PPASR.md b/docs/source/asr/PPASR.md new file mode 100644 index 00000000..ef22954a --- /dev/null +++ b/docs/source/asr/PPASR.md @@ -0,0 +1,96 @@ +([简体中文](./PPASR_cn.md)|English) +# PP-ASR + +## Catalogue +- [1. Introduction](#1) +- [2. Characteristic](#2) +- [3. 
Tutorials](#3) + - [3.1 Pre-trained Models](#31) + - [3.2 Training](#32) + - [3.3 Inference](#33) + - [3.4 Service Deployment](#34) + - [3.5 Customized Auto Speech Recognition and Deployment](#35) +- [4. Quick Start](#4) + + +## 1. Introduction + +PP-ASR is a tool to provide ASR(Automatic speech recognition) function. It provides a variety of Chinese and English models and supports model training. It also supports model inference using the command line. In addition, PP-ASR supports the deployment of streaming models and customized ASR. + + +## 2. Characteristic +The basic process of ASR is shown in the figure below: +
+ + +The main characteristics of PP-ASR are shown below: +- Provides pre-trained models on Chinese/English open source datasets: aishell(Chinese), wenetspeech(Chinese) and librispeech(English). The models includes deepspeech2 and conformer/transformer. +- Support model training on Chinese/English datasets. +- Support model inference using the command line. You can use to use `paddlespeech asr --model xxx --input xxx.wav` to use pre-trained model to do model inference. +- Support deployment of streaming ASR server. Besides ASR function, the server supports timestamp function. +- Support customized auto speech recognition and deployment. + + +## 3. Tutorials + + +## 3.1 Pre-trained Models +The support pre-trained model list: [released_model](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/released_model.md). +The model with good effect are Ds2 Online Wenetspeech ASR0 Model and Conformer Online Wenetspeech ASR1 Model. Both two models support streaming ASR. +For more information about model design, you can refer to the aistudio tutorial: +- [Deepspeech2](https://aistudio.baidu.com/aistudio/projectdetail/3866807) +- [Transformer](https://aistudio.baidu.com/aistudio/projectdetail/3470110) + + +## 3.2 Training +The reference script for model training is stored in [examples](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples) and stored according to "examples/dataset/model". The dataset mainly supports aishell and librispeech. The model supports deepspeech2 and u2(conformer/transformer). +The specific steps of executing the script are recorded in `run.sh`. + +For more information, you can refer to: [asr1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1) + + + +## 3.3 Inference + +PP-ASR supports use `paddlespeech asr --model xxx --input xxx.wav` to use pre-trained model to do model inference after install `paddlespeech` by `pip install paddlespeech`. 
+ +Specific supported functions include: + +- Prediction of single audio +- Use pipe to predict multiple audio +- Support RTF calculation + +For specific usage, please refer to: [speech_recognition](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_recognition/README_cn.md) + + + +## 3.4 Service Deployment + +PP-ASR supports the service deployment of streaming ASR. Support the simultaneous use of speech recognition and punctuation processing. + +Demo of ASR Server: [streaming_asr_server](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/streaming_asr_server) + +![image](https://user-images.githubusercontent.com/87408988/168255342-1fc790c0-16f4-4540-a861-db239076727c.png) + +Display of using ASR server on Web page: [streaming_asr_demo_video](https://paddlespeech.readthedocs.io/en/latest/streaming_asr_demo_video.html) + + +For more information about service deployment, you can refer to the aistudio tutorial: +- [Streaming service - model part](https://aistudio.baidu.com/aistudio/projectdetail/3839884) +- [Streaming service](https://aistudio.baidu.com/aistudio/projectdetail/4017905) + + +## 3.5 Customized Auto Speech Recognition and Deployment + +For customized auto speech recognition and deployment, PP-ASR provides feature extraction(fbank) => Inference model(Scoring Library)=> C++ program of TLG(WFST, token, lexion, grammer). For specific usage, please refer to: [speechx](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/speechx) +If you want to quickly use it, you can refer to: [custom_streaming_asr](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/custom_streaming_asr/README_cn.md) + +For more information about customized auto speech recognition and deployment, you can refer to the aistudio tutorial: +- [Customized Auto Speech Recognition](https://aistudio.baidu.com/aistudio/projectdetail/4021561) + + + + +## 4. 
Quick Start + +To use PP-ASR, see the [install](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md) guide. It provides three ways to install `paddlespeech`: **Easy**, **Medium** and **Hard**. If you want to try the inference function of paddlespeech, you can use the **Easy** installation method. From ff8b487f47f41aedc9a204c9f3a0613d6b88003d Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Mon, 16 May 2022 18:13:18 +0800 Subject: [PATCH 89/93] test=asr --- docs/source/asr/PPASR_cn.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/source/asr/PPASR_cn.md b/docs/source/asr/PPASR_cn.md index 6f04d104..82b1c1d3 100644 --- a/docs/source/asr/PPASR_cn.md +++ b/docs/source/asr/PPASR_cn.md @@ -26,7 +26,7 @@ PP-ASR 是一个 提供 ASR 功能的工具。其提供了多种中文和英文 PP-ASR 的主要特点如下: - 提供在中/英文开源数据集 aishell (中文),wenetspeech(中文),librispeech (英文)上的预训练模型。模型包含 deepspeech2 模型以及 conformer/transformer 模型。 - 支持中/英文的模型训练功能。 -- 支持命令行方式的模型推理, `paddlespeech asr --input xxx.wav` 方式调用各个预训练模型进行推理。 +- 支持命令行方式的模型推理,可使用 `paddlespeech asr --model xxx --input xxx.wav` 方式调用各个预训练模型进行推理。 - 支持流式 ASR 的服务部署,也支持输出时间戳。 - 支持个性化场景的部署。 @@ -37,15 +37,15 @@ PP-ASR 的主要特点如下: ## 3.1 预训练模型 支持的预训练模型列表:[released_model](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/released_model.md)。 其中效果较好的模型为 Ds2 Online Wenetspeech ASR0 Model 以及 Conformer Online Wenetspeech ASR1 Model。 两个模型都支持流式 ASR。 -关于模型设计的部分,可以参考 AIStudio 教程: +更多关于模型设计的部分,可以参考 AIStudio 教程: - [Deepspeech2](https://aistudio.baidu.com/aistudio/projectdetail/3866807) - [Transformer](https://aistudio.baidu.com/aistudio/projectdetail/3470110) ## 3.2 模型训练 -模型的训练的参考脚本存放在 examples 中,并按照 `examples/数据集/模型` 存放,数据集主要支持 aishell 和 librispeech,模型支持 deepspeech2 模型和 u2 (conformer/transformer) 模型。 -具体的执行脚本的步骤记录在 run.sh 当中。具体可参考: [asr1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1) +模型的训练的参考脚本存放在 
[examples](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples) 中,并按照 `examples/数据集/模型` 存放,数据集主要支持 aishell 和 librispeech,模型支持 deepspeech2 模型和 u2 (conformer/transformer) 模型。 +具体的执行脚本的步骤记录在 `run.sh` 当中。具体可参考: [asr1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1) @@ -80,7 +80,8 @@ server 的 demo: [streaming_asr_server](https://github.com/PaddlePaddle/Paddle ## 3.5 支持个性化场景部署 -针对个性化场景部署,提供了特征提取(fbank) => 推理模型(打分库)=> TLG(WFST, token, lexion, grammer)的 C++ 程序。具体参考 [speechx](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/speechx)。 如果想快速了解和使用,可以参考: [custom_streaming_asr](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/custom_streaming_asr/README_cn.md) +针对个性化场景部署,提供了特征提取(fbank) => 推理模型(打分库)=> TLG(WFST, token, lexion, grammer)的 C++ 程序。具体参考 [speechx](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/speechx)。 +如果想快速了解和使用,可以参考: [custom_streaming_asr](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/custom_streaming_asr/README_cn.md) 关于支持个性化场景部署的更多资料,可以参考 AIStudio 教程: - [定制化识别](https://aistudio.baidu.com/aistudio/projectdetail/4021561) From 80352edb21bea1b1d3fba8b1c9ffdffcf2aab570 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 16 May 2022 10:57:37 +0000 Subject: [PATCH 90/93] fix test_rtf, test=doc --- tests/unit/cli/cacu_rtf_by_aishell.sh | 1 + tests/unit/cli/path.sh | 11 +++++++++++ 2 files changed, 12 insertions(+) create mode 100644 tests/unit/cli/path.sh diff --git a/tests/unit/cli/cacu_rtf_by_aishell.sh b/tests/unit/cli/cacu_rtf_by_aishell.sh index b9d68352..55aa9054 100644 --- a/tests/unit/cli/cacu_rtf_by_aishell.sh +++ b/tests/unit/cli/cacu_rtf_by_aishell.sh @@ -1,5 +1,6 @@ #!/bin/bash +source path.sh stage=-1 stop_stage=100 MAIN_ROOT=../../.. 
diff --git a/tests/unit/cli/path.sh b/tests/unit/cli/path.sh new file mode 100644 index 00000000..38a242a4 --- /dev/null +++ b/tests/unit/cli/path.sh @@ -0,0 +1,11 @@ +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ From bff52147dd1189f0c835093b9468b25c707e1704 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Mon, 16 May 2022 19:20:53 +0800 Subject: [PATCH 91/93] test=doc --- docs/source/asr/PPASR.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/asr/PPASR.md b/docs/source/asr/PPASR.md index ef22954a..3779434e 100644 --- a/docs/source/asr/PPASR.md +++ b/docs/source/asr/PPASR.md @@ -1,4 +1,4 @@ -([简体中文](./PPASR.md)|English) +([简体中文](./PPASR_cn.md)|English) # PP-ASR ## Catalogue @@ -24,9 +24,9 @@ The basic process of ASR is shown in the figure below: The main characteristics of PP-ASR are shown below: -- Provides pre-trained models on Chinese/English open source datasets: aishell(Chinese), wenetspeech(Chinese) and librispeech(English). The models includes deepspeech2 and conformer/transformer. +- Provides pre-trained models on Chinese/English open source datasets: aishell(Chinese), wenetspeech(Chinese) and librispeech(English). The models include deepspeech2 and conformer/transformer. - Support model training on Chinese/English datasets. -- Support model inference using the command line. You can use to use `paddlespeech asr --model xxx --input xxx.wav` to use pre-trained model to do model inference. +- Support model inference using the command line. You can use `paddlespeech asr --model xxx --input xxx.wav` to run inference with a pre-trained model. 
- Support deployment of streaming ASR server. Besides ASR function, the server supports timestamp function. - Support customized auto speech recognition and deployment. @@ -43,21 +43,21 @@ For more information about model design, you can refer to the aistudio tutorial: ## 3.2 Training -The reference script for model training is stored in [examples](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples) and stored according to "examples/dataset/model". The dataset mainly supports aishell and librispeech. The model supports deepspeech2 and u2(conformer/transformer). +The referenced script for model training is stored in [examples](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples) and stored according to "examples/dataset/model". The dataset mainly supports aishell and librispeech. The model supports deepspeech2 and u2(conformer/transformer). The specific steps of executing the script are recorded in `run.sh`. -For more information, you can refer to: [asr1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1) +For more information, you can refer to [asr1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1) ## 3.3 Inference -PP-ASR supports use `paddlespeech asr --model xxx --input xxx.wav` to use pre-trained model to do model inference after install `paddlespeech` by `pip install paddlespeech`. +PP-ASR supports using `paddlespeech asr --model xxx --input xxx.wav` to run inference with the pre-trained model after installing `paddlespeech` via `pip install paddlespeech`. 
Specific supported functions include: - Prediction of single audio -- Use pipe to predict multiple audio +- Use the pipe to predict multiple audio - Support RTF calculation For specific usage, please refer to: [speech_recognition](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_recognition/README_cn.md) @@ -83,7 +83,7 @@ For more information about service deployment, you can refer to the aistudio tut ## 3.5 Customized Auto Speech Recognition and Deployment For customized auto speech recognition and deployment, PP-ASR provides feature extraction(fbank) => Inference model(Scoring Library)=> C++ program of TLG(WFST, token, lexion, grammer). For specific usage, please refer to: [speechx](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/speechx) -If you want to quickly use it, you can refer to: [custom_streaming_asr](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/custom_streaming_asr/README_cn.md) +If you want to quickly use it, you can refer to [custom_streaming_asr](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/custom_streaming_asr/README_cn.md) For more information about customized auto speech recognition and deployment, you can refer to the aistudio tutorial: - [Customized Auto Speech Recognition](https://aistudio.baidu.com/aistudio/projectdetail/4021561) From 8e5f825641b83dc6f943660a55c8602bf0bf2c76 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Mon, 16 May 2022 19:27:09 +0800 Subject: [PATCH 92/93] test=doc --- docs/source/tts/PPTTS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/tts/PPTTS.md b/docs/source/tts/PPTTS.md index c8534cd3..ef0baa07 100644 --- a/docs/source/tts/PPTTS.md +++ b/docs/source/tts/PPTTS.md @@ -1,5 +1,7 @@ ([简体中文](./PPTTS_cn.md)|English) +# PPTTS + - [1. Introduction](#1) - [2. Characteristic](#2) - [3. 
Benchmark](#3) From e61757dbf784863f1f61fb57489a0e0a2305459c Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 16 May 2022 13:01:11 +0000 Subject: [PATCH 93/93] fix yao, test=tts --- paddlespeech/t2s/frontend/tone_sandhi.py | 3 ++- paddlespeech/t2s/frontend/zh_normalization/num.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py index 07f7fa2b..e3102b9b 100644 --- a/paddlespeech/t2s/frontend/tone_sandhi.py +++ b/paddlespeech/t2s/frontend/tone_sandhi.py @@ -63,7 +63,8 @@ class ToneSandhi(): '扫把', '惦记' } self.must_not_neural_tone_words = { - "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎" + "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎", + "幺幺" } self.punc = ":,;。?!“”‘’':,;.?!" diff --git a/paddlespeech/t2s/frontend/zh_normalization/num.py b/paddlespeech/t2s/frontend/zh_normalization/num.py index a83b42a4..ec136773 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/num.py +++ b/paddlespeech/t2s/frontend/zh_normalization/num.py @@ -103,7 +103,7 @@ def replace_default_num(match): str """ number = match.group(0) - return verbalize_digit(number) + return verbalize_digit(number, alt_one=True) # 数字表达式
Task Dataset Model Type Link 任务 数据集 模型类型 脚本
任务 数据集 模型种类 链接 模型类型 脚本