From 0ffe1f91143b0489fd38be90747afcbb5e61fedc Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 28 Mar 2022 03:35:55 +0000 Subject: [PATCH 01/15] replace kaidi_fbank with paddleaudio --- examples/aishell/asr1/conf/preprocess.yaml | 9 ++-- paddlespeech/s2t/transform/spectrogram.py | 45 ++++++++++++++++++++ paddlespeech/s2t/transform/transformation.py | 1 + 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml index f7f4c58d5..a20ff2ab3 100644 --- a/examples/aishell/asr1/conf/preprocess.yaml +++ b/examples/aishell/asr1/conf/preprocess.yaml @@ -3,8 +3,9 @@ process: - type: fbank_kaldi fs: 16000 n_mels: 80 - n_shift: 160 - win_length: 400 + n_frame_length: 25 + n_frame_shift: 10 + energy_floor: 0.0 dither: 0.1 - type: cmvn_json cmvn_path: data/mean_std.json @@ -23,7 +24,3 @@ process: n_mask: 2 inplace: true replace_with_zero: false - - - - diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index 889cd349d..f779b07d4 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -14,8 +14,11 @@ # Modified from espnet(https://github.com/espnet/espnet) import librosa import numpy as np +import paddle from python_speech_features import logfbank +import paddleaudio.compliance.kaldi as kaldi + def stft(x, n_fft, @@ -309,6 +312,48 @@ class IStft(): class LogMelSpectrogramKaldi(): + def __init__(self, + fs=16000, + n_mels=80, + n_frame_length=25, + n_frame_shift=10, + energy_floor=0.0, + dither=0.1): + self.fs = fs + self.n_mels = n_mels + self.n_frame_length = n_frame_length + self.n_frame_shift = n_frame_shift + self.energy_floor = energy_floor + self.dither = dither + + def __repr__(self): + return ( + "{name}(fs={fs}, n_mels={n_mels}, " + "n_frame_shift={n_frame_shift}, n_frame_length={n_frame_length}, " + "dither={dither}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_frame_shift=self.n_frame_shift, + n_frame_length=self.n_frame_length, + dither=self.dither, )) + + def __call__(self, x, train): + dither = self.dither if train else 0.0 + waveform = paddle.to_tensor(np.expand_dims(x, 0), dtype=paddle.float32) + mat = kaldi.fbank( + waveform, + n_mels=self.n_mels, + frame_length=self.n_frame_length, + frame_shift=self.n_frame_shift, + dither=dither, + energy_floor=self.energy_floor, + sr=self.fs) + mat = np.squeeze(mat.numpy()) + return mat + + +class LogMelSpectrogramKaldi_decay(): def __init__( self, fs=16000, diff --git a/paddlespeech/s2t/transform/transformation.py b/paddlespeech/s2t/transform/transformation.py index 381b0cdc9..3b433cb0b 100644 --- a/paddlespeech/s2t/transform/transformation.py +++ b/paddlespeech/s2t/transform/transformation.py @@ -31,6 +31,7 @@ import_alias = dict( freq_mask="paddlespeech.s2t.transform.spec_augment:FreqMask", spec_augment="paddlespeech.s2t.transform.spec_augment:SpecAugment", speed_perturbation="paddlespeech.s2t.transform.perturb:SpeedPerturbation", + speed_perturbation_sox="paddlespeech.s2t.transform.perturb:SpeedPerturbationSox", volume_perturbation="paddlespeech.s2t.transform.perturb:VolumePerturbation", noise_injection="paddlespeech.s2t.transform.perturb:NoiseInjection", bandpass_perturbation="paddlespeech.s2t.transform.perturb:BandpassPerturbation", From bc5ae43d3a5ecdb3fbcea92d18a054ddc260816a Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 28 Mar 2022 04:49:10 +0000 Subject: [PATCH 02/15] restructure expand in 
length_regulator.py for paddle2onnx, test=tts --- examples/csmsc/tts3/README.md | 4 +++- .../t2s/modules/predictor/length_regulator.py | 24 ++++++++++++------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index 7b803526f..a31203f8a 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -227,7 +227,9 @@ Pretrained FastSpeech2 model with no silence in the edge of audios: - [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) - [fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip) -The static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip). +The static model can be downloaded here: +[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip). +[fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip) Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py index 2472c413b..be788e6ed 100644 --- a/paddlespeech/t2s/modules/predictor/length_regulator.py +++ b/paddlespeech/t2s/modules/predictor/length_regulator.py @@ -73,15 +73,21 @@ class LengthRegulator(nn.Layer): batch_size, t_enc = paddle.shape(durations) slens = paddle.sum(durations, -1) t_dec = paddle.max(slens) - M = paddle.zeros([batch_size, t_dec, t_enc]) - for i in range(batch_size): - k = 0 - for j in range(t_enc): - d = durations[i, j] - # If the d == 0, slice action is meaningless and not supported in paddle - if d >= 1: - M[i, k:k + d, j] = 1 - k += d + t_dec_1 = t_dec + 1 + flatten_duration = paddle.cumsum( + paddle.reshape(durations, [batch_size * t_enc])) + 1 + init = paddle.zeros(t_dec_1) + m_batch = batch_size * t_enc + M = paddle.zeros([t_dec_1, m_batch]) + for i in range(m_batch): + d = flatten_duration[i] + m = paddle.concat( + [paddle.ones(d), paddle.zeros(t_dec_1 - d)], axis=0) + M[:, i] = m - init + init = m + M = paddle.reshape(M, shape=[t_dec_1, batch_size, t_enc]) + M = M[1:, :, :] + M = paddle.transpose(M, (1, 0, 2)) encodings = paddle.matmul(M, encodings) return encodings From e52fc08c586e1da29bb5ccf1ebaf6b639e21412d Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 28 Mar 2022 05:53:31 +0000 Subject: [PATCH 03/15] update readme, test=doc --- examples/csmsc/tts3/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index a31203f8a..08bf2ee91 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -228,8 +228,8 @@ Pretrained FastSpeech2 model with no silence in the edge of audios: - [fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip) The static model can be downloaded here: 
-[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip). -[fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip) +- [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip) +- [fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip) Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: From ed490b66cb052c1308117e5e9703d94d8e43239a Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 29 Mar 2022 03:20:07 +0000 Subject: [PATCH 04/15] update spectrogram, test=asr --- examples/aishell/asr1/conf/preprocess.yaml | 5 ++-- paddlespeech/s2t/transform/spectrogram.py | 34 ++++++++++++++++------ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml index a20ff2ab3..d3992cb9f 100644 --- a/examples/aishell/asr1/conf/preprocess.yaml +++ b/examples/aishell/asr1/conf/preprocess.yaml @@ -3,9 +3,8 @@ process: - type: fbank_kaldi fs: 16000 n_mels: 80 - n_frame_length: 25 - n_frame_shift: 10 - energy_floor: 0.0 + n_shift: 160 + win_length: 400 dither: 0.1 - type: cmvn_json cmvn_path: data/mean_std.json diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index f779b07d4..75787d92b 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -312,17 +312,33 @@ class IStft(): class LogMelSpectrogramKaldi(): - def __init__(self, - fs=16000, - n_mels=80, - n_frame_length=25, - n_frame_shift=10, - energy_floor=0.0, - dither=0.1): + def __init__( + self, + fs=16000, + n_mels=80, + n_shift=160, # unit:sample, 10ms + win_length=400, # unit:sample, 25ms + energy_floor=0.0, + dither=0.1): + """ + The Kaldi implementation of LogMelSpectrogram + Args: + fs (int): sample rate of the audio + n_mels (int): number of mel filter banks + n_shift (int): number of points in a frame shift + win_length (int): number of points in a frame windows + energy_floor (float): Floor on energy in Spectrogram computation (absolute) + dither (float): Dithering constant + + Returns: + LogMelSpectrogramKaldi + """ + self.fs = fs self.n_mels = n_mels - self.n_frame_length = n_frame_length - self.n_frame_shift = n_frame_shift + num_point_ms = fs / 1000 + self.n_frame_length = win_length / num_point_ms + self.n_frame_shift = n_shift / num_point_ms self.energy_floor = energy_floor self.dither = dither From 284a5f26c20aa21401f0720ac594d7c5800ef629 Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Tue, 29 Mar 2022 17:17:32 +0800 Subject: [PATCH 05/15] fix speechx doc typo --- speechx/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/speechx/README.md b/speechx/README.md index 07449f9f9..bb256712e 100644 --- a/speechx/README.md +++ b/speechx/README.md @@ -5,7 +5,7 @@ We develop under: * docker - registry.baidubce.com/paddlepaddle/paddle:2.1.1-gpu-cuda10.2-cudnn7 * os - Ubuntu 16.04.7 LTS -* ** gcc/g++/gfortran - 8.2.0 ** +* gcc/g++/gfortran - 8.2.0 * cmake - 3.16.0 > We make sure all things work fun under docker, and recommend using it to develop 
and deploy. @@ -24,7 +24,7 @@ nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspac * More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html). -* If you want only work under cpu, please download corresponded [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and using `docker` instead `nviida-docker`. +* If you want only work under cpu, please download corresponded [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and using `docker` instead `nvida-docker`. 2. Build `speechx` and `examples`. From 678c8369aa6e01063496d5a9992e893ad444012b Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Tue, 29 Mar 2022 17:38:50 +0800 Subject: [PATCH 06/15] fix typo --- speechx/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speechx/README.md b/speechx/README.md index bb256712e..610b88a8f 100644 --- a/speechx/README.md +++ b/speechx/README.md @@ -24,7 +24,7 @@ nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspac * More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html). -* If you want only work under cpu, please download corresponded [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and using `docker` instead `nvida-docker`. +* If you want only work under cpu, please download corresponded [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and using `docker` instead `nvidia-docker`. 2. Build `speechx` and `examples`. From f47146af494f510428e9d14702f4b735c88843aa Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 29 Mar 2022 12:09:54 +0000 Subject: [PATCH 07/15] add docstring, test=asr --- paddlespeech/s2t/transform/spectrogram.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index 75787d92b..4a65548fe 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -355,7 +355,20 @@ class LogMelSpectrogramKaldi(): dither=self.dither, )) def __call__(self, x, train): + """ + Args: + x (np.ndarray): shape (Ti,) + train (bool): True, train mode. 
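+                False, eval mode, dither is disabled (set to 0.0).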
+ + Raises: + ValueError: not support (Ti, C) + + Returns: + np.ndarray: (T, D) + """ dither = self.dither if train else 0.0 + if x.ndim != 1: + raise ValueError("Not support x: [Time, Channel]") waveform = paddle.to_tensor(np.expand_dims(x, 0), dtype=paddle.float32) mat = kaldi.fbank( waveform, From fa160d86e326e7f2a8260f07ccbaa68c28c9defc Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 30 Mar 2022 07:57:52 +0000 Subject: [PATCH 08/15] update version of ctcdecoders, test=asr --- third_party/ctc_decoders/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ctc_decoders/setup.py b/third_party/ctc_decoders/setup.py index 4a11b890d..ce2787e3f 100644 --- a/third_party/ctc_decoders/setup.py +++ b/third_party/ctc_decoders/setup.py @@ -127,7 +127,7 @@ decoders_module = [ setup( name='paddlespeech_ctcdecoders', - version='0.1.1', + version='0.2.0', description="CTC decoders in paddlespeech", author="PaddlePaddle Speech and Language Team", author_email="paddlesl@baidu.com", From d847fe29cfc83afbe5e4fc6f3717240516600eb7 Mon Sep 17 00:00:00 2001 From: WilliamZhang06 Date: Wed, 30 Mar 2022 16:44:04 +0800 Subject: [PATCH 09/15] added online asr engine , test=doc --- paddlespeech/s2t/frontend/audio.py | 12 + paddlespeech/s2t/frontend/speech.py | 16 + paddlespeech/server/bin/main.py | 10 +- paddlespeech/server/conf/application.yaml | 27 +- .../server/engine/asr/online/__init__.py | 13 + .../server/engine/asr/online/asr_engine.py | 355 ++++++++++++++++++ paddlespeech/server/engine/engine_factory.py | 3 + .../tests/asr/online/microphone_client.py | 154 ++++++++ .../tests/asr/online/websocket_client.py | 115 ++++++ paddlespeech/server/utils/buffer.py | 59 +++ paddlespeech/server/utils/vad.py | 79 ++++ paddlespeech/server/ws/__init__.py | 13 + paddlespeech/server/ws/api.py | 38 ++ paddlespeech/server/ws/asr_socket.py | 106 ++++++ 14 files changed, 996 insertions(+), 4 deletions(-) create mode 100644 paddlespeech/server/engine/asr/online/__init__.py create mode 100644 paddlespeech/server/engine/asr/online/asr_engine.py create mode 100644 paddlespeech/server/tests/asr/online/microphone_client.py create mode 100644 paddlespeech/server/tests/asr/online/websocket_client.py create mode 100644 paddlespeech/server/utils/buffer.py create mode 100644 paddlespeech/server/utils/vad.py create mode 100644 paddlespeech/server/ws/__init__.py create mode 100644 paddlespeech/server/ws/api.py create mode 100644 paddlespeech/server/ws/asr_socket.py diff --git a/paddlespeech/s2t/frontend/audio.py b/paddlespeech/s2t/frontend/audio.py index d0368cc8d..7f71e5dd9 100644 --- a/paddlespeech/s2t/frontend/audio.py +++ b/paddlespeech/s2t/frontend/audio.py @@ -208,6 +208,18 @@ class AudioSegment(): io.BytesIO(bytes), dtype='float32') return cls(samples, sample_rate) + @classmethod + def from_pcm(cls, samples, sample_rate): + """Create audio segment from a byte string containing audio samples. + :param samples: Audio samples [num_samples x num_channels]. + :type samples: numpy.ndarray + :param sample_rate: Audio sample rate. + :type sample_rate: int + :return: Audio segment instance. + :rtype: AudioSegment + """ + return cls(samples, sample_rate) + @classmethod def concatenate(cls, *segments): """Concatenate an arbitrary number of audio segments together. 
diff --git a/paddlespeech/s2t/frontend/speech.py b/paddlespeech/s2t/frontend/speech.py index 8fd661c92..0340831a6 100644 --- a/paddlespeech/s2t/frontend/speech.py +++ b/paddlespeech/s2t/frontend/speech.py @@ -107,6 +107,22 @@ class SpeechSegment(AudioSegment): return cls(audio.samples, audio.sample_rate, transcript, tokens, token_ids) + @classmethod + def from_pcm(cls, samples, sample_rate, transcript, tokens=None, token_ids=None): + """Create speech segment from pcm on online mode + Args: + samples (numpy.ndarray): Audio samples [num_samples x num_channels]. + sample_rate (int): Audio sample rate. + transcript (str): Transcript text for the speech. + tokens (List[str], optional): text tokens. Defaults to None. + token_ids (List[int], optional): text token ids. Defaults to None. + Returns: + SpeechSegment: Speech segment instance. + """ + audio = AudioSegment.from_pcm(samples, sample_rate) + return cls(audio.samples, audio.sample_rate, transcript, tokens, + token_ids) + @classmethod def concatenate(cls, *segments): """Concatenate an arbitrary number of speech segments together, both diff --git a/paddlespeech/server/bin/main.py b/paddlespeech/server/bin/main.py index de5282993..45ded33d8 100644 --- a/paddlespeech/server/bin/main.py +++ b/paddlespeech/server/bin/main.py @@ -17,7 +17,8 @@ import uvicorn from fastapi import FastAPI from paddlespeech.server.engine.engine_pool import init_engine_pool -from paddlespeech.server.restful.api import setup_router +from paddlespeech.server.restful.api import setup_router as setup_http_router +from paddlespeech.server.ws.api import setup_router as setup_ws_router from paddlespeech.server.utils.config import get_config app = FastAPI( @@ -35,7 +36,12 @@ def init(config): """ # init api api_list = list(engine.split("_")[0] for engine in config.engine_list) - api_router = setup_router(api_list) + if config.protocol == "websocket": + api_router = setup_ws_router(api_list) + elif config.protocol == "http": + api_router = setup_http_router(api_list) + else: + raise Exception("unsupported protocol") app.include_router(api_router) if not init_engine_pool(config): diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml index 2b1a05998..40de8e3b2 100644 --- a/paddlespeech/server/conf/application.yaml +++ b/paddlespeech/server/conf/application.yaml @@ -3,13 +3,18 @@ ################################################################################# # SERVER SETTING # ################################################################################# -host: 127.0.0.1 +host: 0.0.0.0 port: 8090 # The task format in the engin_list is: _ # task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference'] +# protocol: 'http' +# engine_list: ['asr_python', 'tts_python', 'cls_python'] -engine_list: ['asr_python', 'tts_python', 'cls_python'] + +# websocket, http (only choose one). websocket only support online engine type. 
+protocol: 'websocket' +engine_list: ['asr_online'] ################################################################################# @@ -48,6 +53,24 @@ asr_inference: summary: True # False -> do not show predictor config +################### speech task: asr; engine_type: online ####################### +asr_online: + model_type: 'deepspeech2online_aishell' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + lang: 'zh' + sample_rate: 16000 + cfg_path: + decode_method: + force_yes: True + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + ################################### TTS ######################################### ################### speech task: tts; engine_type: python ####################### tts_python: diff --git a/paddlespeech/server/engine/asr/online/__init__.py b/paddlespeech/server/engine/asr/online/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/paddlespeech/server/engine/asr/online/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py new file mode 100644 index 000000000..d5c1aa7bd --- /dev/null +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -0,0 +1,355 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import io +import os +import time +from typing import Optional +import pickle +import numpy as np +from numpy import float32 +import soundfile + +import paddle +from yacs.config import CfgNode + +from paddlespeech.s2t.frontend.speech import SpeechSegment +from paddlespeech.cli.asr.infer import ASRExecutor +from paddlespeech.cli.log import logger +from paddlespeech.cli.utils import MODEL_HOME +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.modules.ctc import CTCDecoder +from paddlespeech.s2t.utils.utility import UpdateConfig +from paddlespeech.server.engine.base_engine import BaseEngine +from paddlespeech.server.utils.config import get_config +from paddlespeech.server.utils.paddle_predictor import init_predictor +from paddlespeech.server.utils.paddle_predictor import run_model + +__all__ = ['ASREngine'] + +pretrained_models = { + "deepspeech2online_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.1.1.model.tar.gz', + 'md5': + 'd5e076217cf60486519f72c217d21b9b', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2_online/checkpoints/avg_1', + 'model': + 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel', + 'params': + 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, +} + + +class ASRServerExecutor(ASRExecutor): + def __init__(self): + super().__init__() + pass + + def _init_from_path(self, + model_type: str='wenetspeech', + am_model: Optional[os.PathLike]=None, + am_params: Optional[os.PathLike]=None, + lang: str='zh', + sample_rate: int=16000, + cfg_path: Optional[os.PathLike]=None, + decode_method: str='attention_rescoring', + am_predictor_conf: dict=None): + """ + Init model and other resources from a specific path. + """ + + if cfg_path is None or am_model is None or am_params is None: + sample_rate_str = '16k' if sample_rate == 16000 else '8k' + tag = model_type + '-' + lang + '-' + sample_rate_str + res_path = self._get_pretrained_path(tag) # wenetspeech_zh + self.res_path = res_path + self.cfg_path = os.path.join(res_path, + pretrained_models[tag]['cfg_path']) + + self.am_model = os.path.join(res_path, + pretrained_models[tag]['model']) + self.am_params = os.path.join(res_path, + pretrained_models[tag]['params']) + logger.info(res_path) + logger.info(self.cfg_path) + logger.info(self.am_model) + logger.info(self.am_params) + else: + self.cfg_path = os.path.abspath(cfg_path) + self.am_model = os.path.abspath(am_model) + self.am_params = os.path.abspath(am_params) + self.res_path = os.path.dirname( + os.path.dirname(os.path.abspath(self.cfg_path))) + + #Init body. 
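+        # Load the model yaml config; for DeepSpeech2 models also build the
+        # speech collator and text featurizer and download the n-gram language
+        # model used by the CTC beam search scorer.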
+ self.config = CfgNode(new_allowed=True) + self.config.merge_from_file(self.cfg_path) + + with UpdateConfig(self.config): + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: + from paddlespeech.s2t.io.collator import SpeechCollator + self.vocab = self.config.vocab_filepath + self.config.decode.lang_model_path = os.path.join( + MODEL_HOME, 'language_model', + self.config.decode.lang_model_path) + self.collate_fn_test = SpeechCollator.from_config(self.config) + self.text_feature = TextFeaturizer( + unit_type=self.config.unit_type, vocab=self.vocab) + + lm_url = pretrained_models[tag]['lm_url'] + lm_md5 = pretrained_models[tag]['lm_md5'] + self.download_lm( + lm_url, + os.path.dirname(self.config.decode.lang_model_path), lm_md5) + elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: + raise Exception("wrong type") + else: + raise Exception("wrong type") + + # AM predictor + self.am_predictor_conf = am_predictor_conf + self.am_predictor = init_predictor( + model_file=self.am_model, + params_file=self.am_params, + predictor_conf=self.am_predictor_conf) + + # decoder + self.decoder = CTCDecoder( + odim=self.config.output_dim, # is in vocab + enc_n_units=self.config.rnn_layer_size * 2, + blank_id=self.config.blank_id, + dropout_rate=0.0, + reduction=True, # sum + batch_average=True, # sum / batch_size + grad_norm_type=self.config.get('ctc_grad_norm_type', None)) + + # init decoder + cfg = self.config.decode + decode_batch_size = 1 # for online + self.decoder.init_decoder( + decode_batch_size, self.text_feature.vocab_list, + cfg.decoding_method, cfg.lang_model_path, cfg.alpha, cfg.beta, + cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n, + cfg.num_proc_bsearch) + + # init state box + self.chunk_state_h_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=float32) + self.chunk_state_c_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=float32) + + def reset_decoder_and_chunk(self): + """reset decoder and chunk state for an new audio + """ + self.decoder.reset_decoder(batch_size=1) + # init state box, for new audio request + self.chunk_state_h_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=float32) + self.chunk_state_c_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=float32) + + def decode_one_chunk(self, x_chunk, x_chunk_lens, model_type: str): + """decode one chunk + + Args: + x_chunk (numpy.array): shape[B, T, D] + x_chunk_lens (numpy.array): shape[B] + model_type (str): online model type + + Returns: + [type]: [description] + """ + if "deepspeech2online" in model_type : + input_names = self.am_predictor.get_input_names() + audio_handle = self.am_predictor.get_input_handle(input_names[0]) + audio_len_handle = self.am_predictor.get_input_handle(input_names[1]) + h_box_handle = self.am_predictor.get_input_handle(input_names[2]) + c_box_handle = self.am_predictor.get_input_handle(input_names[3]) + + audio_handle.reshape(x_chunk.shape) + audio_handle.copy_from_cpu(x_chunk) + + audio_len_handle.reshape(x_chunk_lens.shape) + audio_len_handle.copy_from_cpu(x_chunk_lens) + + h_box_handle.reshape(self.chunk_state_h_box.shape) + h_box_handle.copy_from_cpu(self.chunk_state_h_box) + + c_box_handle.reshape(self.chunk_state_c_box.shape) + c_box_handle.copy_from_cpu(self.chunk_state_c_box) + + output_names = self.am_predictor.get_output_names() + output_handle = 
self.am_predictor.get_output_handle(output_names[0]) + output_lens_handle = self.am_predictor.get_output_handle(output_names[1]) + output_state_h_handle = self.am_predictor.get_output_handle( + output_names[2]) + output_state_c_handle = self.am_predictor.get_output_handle( + output_names[3]) + + self.am_predictor.run() + + output_chunk_probs = output_handle.copy_to_cpu() + output_chunk_lens = output_lens_handle.copy_to_cpu() + self.chunk_state_h_box = output_state_h_handle.copy_to_cpu() + self.chunk_state_c_box = output_state_c_handle.copy_to_cpu() + + self.decoder.next(output_chunk_probs, output_chunk_lens) + trans_best, trans_beam = self.decoder.decode() + + return trans_best[0] + + elif "conformer" in model_type or "transformer" in model_type: + raise Exception("invalid model name") + else: + raise Exception("invalid model name") + + def _pcm16to32(self, audio): + """pcm int16 to float32 + + Args: + audio(numpy.array): numpy.int16 + + Returns: + audio(numpy.array): numpy.float32 + """ + if audio.dtype == np.int16: + audio = audio.astype("float32") + bits = np.iinfo(np.int16).bits + audio = audio / (2**(bits - 1)) + return audio + + def extract_feat(self, samples, sample_rate): + """extract feat + + Args: + samples (numpy.array): numpy.float32 + sample_rate (int): sample rate + + Returns: + x_chunk (numpy.array): shape[B, T, D] + x_chunk_lens (numpy.array): shape[B] + """ + # pcm16 -> pcm 32 + samples = self._pcm16to32(samples) + + # read audio + speech_segment = SpeechSegment.from_pcm( + samples, sample_rate, transcript=" ") + # audio augment + self.collate_fn_test.augmentation.transform_audio(speech_segment) + + # extract speech feature + spectrum, transcript_part = self.collate_fn_test._speech_featurizer.featurize( + speech_segment, self.collate_fn_test.keep_transcription_text) + # CMVN spectrum + if self.collate_fn_test._normalizer: + spectrum = self.collate_fn_test._normalizer.apply(spectrum) + + # spectrum augment + audio = self.collate_fn_test.augmentation.transform_feature(spectrum) + + audio_len = audio.shape[0] + audio = paddle.to_tensor(audio, dtype='float32') + # audio_len = paddle.to_tensor(audio_len) + audio = paddle.unsqueeze(audio, axis=0) + + x_chunk = audio.numpy() + x_chunk_lens = np.array([audio_len]) + + return x_chunk, x_chunk_lens + + +class ASREngine(BaseEngine): + """ASR server engine + + Args: + metaclass: Defaults to Singleton. 
+ """ + + def __init__(self): + super(ASREngine, self).__init__() + + def init(self, config: dict) -> bool: + """init engine resource + + Args: + config_file (str): config file + + Returns: + bool: init failed or success + """ + self.input = None + self.output = "" + self.executor = ASRServerExecutor() + self.config = config + + self.executor._init_from_path( + model_type=self.config.model_type, + am_model=self.config.am_model, + am_params=self.config.am_params, + lang=self.config.lang, + sample_rate=self.config.sample_rate, + cfg_path=self.config.cfg_path, + decode_method=self.config.decode_method, + am_predictor_conf=self.config.am_predictor_conf) + + logger.info("Initialize ASR server engine successfully.") + return True + + def preprocess(self, samples, sample_rate): + """preprocess + + Args: + samples (numpy.array): numpy.float32 + sample_rate (int): sample rate + + Returns: + x_chunk (numpy.array): shape[B, T, D] + x_chunk_lens (numpy.array): shape[B] + """ + x_chunk, x_chunk_lens = self.executor.extract_feat(samples, sample_rate) + return x_chunk, x_chunk_lens + + def run(self, x_chunk, x_chunk_lens, decoder_chunk_size=1): + """run online engine + + Args: + x_chunk (numpy.array): shape[B, T, D] + x_chunk_lens (numpy.array): shape[B] + decoder_chunk_size(int) + """ + self.output = self.executor.decode_one_chunk(x_chunk, x_chunk_lens, self.config.model_type) + + def postprocess(self): + """postprocess + """ + return self.output + + def reset(self): + """reset engine decoder and inference state + """ + self.executor.reset_decoder_and_chunk() + self.output = "" diff --git a/paddlespeech/server/engine/engine_factory.py b/paddlespeech/server/engine/engine_factory.py index c39c44cae..2a39fb79b 100644 --- a/paddlespeech/server/engine/engine_factory.py +++ b/paddlespeech/server/engine/engine_factory.py @@ -25,6 +25,9 @@ class EngineFactory(object): elif engine_name == 'asr' and engine_type == 'python': from paddlespeech.server.engine.asr.python.asr_engine import ASREngine return ASREngine() + elif engine_name == 'asr' and engine_type == 'online': + from paddlespeech.server.engine.asr.online.asr_engine import ASREngine + return ASREngine() elif engine_name == 'tts' and engine_type == 'inference': from paddlespeech.server.engine.tts.paddleinference.tts_engine import TTSEngine return TTSEngine() diff --git a/paddlespeech/server/tests/asr/online/microphone_client.py b/paddlespeech/server/tests/asr/online/microphone_client.py new file mode 100644 index 000000000..74d457c56 --- /dev/null +++ b/paddlespeech/server/tests/asr/online/microphone_client.py @@ -0,0 +1,154 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +record wave from the mic +""" + +import threading +import pyaudio +import wave +import logging +import asyncio +import websockets +import json +from signal import SIGINT, SIGTERM + + +class ASRAudioHandler(threading.Thread): + def __init__(self, + url="127.0.0.1", + port=8090): + threading.Thread.__init__(self) + self.url = url + self.port = port + self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr" + self.fileName = "./output.wav" + self.chunk = 5120 + self.format = pyaudio.paInt16 + self.channels = 1 + self.rate = 16000 + self._running = True + self._frames = [] + self.data_backup = [] + + def startrecord(self): + """ + start a new thread to record wave + """ + threading._start_new_thread(self.recording, ()) + + def recording(self): + """ + recording wave + """ + self._running = True + self._frames = [] + p = pyaudio.PyAudio() + stream = p.open(format=self.format, + channels=self.channels, + rate=self.rate, + input=True, + frames_per_buffer=self.chunk) + while(self._running): + data = stream.read(self.chunk) + self._frames.append(data) + self.data_backup.append(data) + + stream.stop_stream() + stream.close() + p.terminate() + + def save(self): + """ + save wave data + """ + p = pyaudio.PyAudio() + wf = wave.open(self.fileName, 'wb') + wf.setnchannels(self.channels) + wf.setsampwidth(p.get_sample_size(self.format)) + wf.setframerate(self.rate) + wf.writeframes(b''.join(self.data_backup)) + wf.close() + p.terminate() + + def stoprecord(self): + """ + stop recording + """ + self._running = False + + async def run(self): + aa = input("是否开始录音? (y/n)") + if aa.strip() == "y": + self.startrecord() + logging.info("*" * 10 + "开始录音,请输入语音") + + async with websockets.connect(self.url) as ws: + # 发送开始指令 + audio_info = json.dumps({ + "name": "test.wav", + "signal": "start", + "nbest": 5 + }, sort_keys=True, indent=4, separators=(',', ': ')) + await ws.send(audio_info) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + # send bytes data + logging.info("结束录音请: Ctrl + c。继续请按回车。") + try: + while True: + while len(self._frames) > 0: + await ws.send(self._frames.pop(0)) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + except asyncio.CancelledError: + # quit + # send finished + audio_info = json.dumps({ + "name": "test.wav", + "signal": "end", + "nbest": 5 + }, sort_keys=True, indent=4, separators=(',', ': ')) + await ws.send(audio_info) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + self.stoprecord() + logging.info("*" * 10 + "录音结束") + self.save() + elif aa.strip() == "n": + exit() + else: + print("无效输入!") + exit() + + +if __name__ == "__main__": + + logging.basicConfig(level=logging.INFO) + logging.info("asr websocket client start") + + handler = ASRAudioHandler("127.0.0.1", 8090) + loop = asyncio.get_event_loop() + main_task = asyncio.ensure_future(handler.run()) + for signal in [SIGINT, SIGTERM]: + loop.add_signal_handler(signal, main_task.cancel) + try: + loop.run_until_complete(main_task) + finally: + loop.close() + + logging.info("asr websocket client finished") diff --git a/paddlespeech/server/tests/asr/online/websocket_client.py b/paddlespeech/server/tests/asr/online/websocket_client.py new file mode 100644 index 000000000..d849ffeac --- /dev/null +++ b/paddlespeech/server/tests/asr/online/websocket_client.py @@ -0,0 +1,115 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/python +# -*- coding: UTF-8 -*- + +import argparse +import logging +import time +import os +import json +import wave +import numpy as np +import asyncio +import websockets +import soundfile + + +class ASRAudioHandler: + def __init__(self, + url="127.0.0.1", + port=8090): + self.url = url + self.port = port + self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr" + + def read_wave(self, wavfile_path: str): + samples, sample_rate = soundfile.read(wavfile_path, dtype='int16') + x_len = len(samples) + chunk_stride = 40 * 16 #40ms, sample_rate = 16kHz + chunk_size = 80 * 16 #80ms, sample_rate = 16kHz + + if (x_len - chunk_size) % chunk_stride != 0: + padding_len_x = chunk_stride - (x_len - chunk_size + ) % chunk_stride + else: + padding_len_x = 0 + + padding = np.zeros( + (padding_len_x), dtype=samples.dtype) + padded_x = np.concatenate([samples, padding], axis=0) + + num_chunk = (x_len + padding_len_x - chunk_size) / chunk_stride + 1 + num_chunk = int(num_chunk) + + for i in range(0, num_chunk): + start = i * chunk_stride + end = start + chunk_size + x_chunk = padded_x[start:end] + yield x_chunk + + async def run(self, wavfile_path: str): + logging.info("send a message to the server") + # 读取音频 + # self.read_wave() + # 发送 websocket 的 handshake 协议头 + async with websockets.connect(self.url) as ws: + # server 端已经接收到 handshake 协议头 + # 发送开始指令 + audio_info = json.dumps({ + "name": "test.wav", + "signal": "start", + "nbest": 5 + }, sort_keys=True, indent=4, separators=(',', ': ')) + await ws.send(audio_info) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + # send chunk audio data to engine + for chunk_data in self.read_wave(wavfile_path): + await ws.send(chunk_data.tobytes()) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + # finished + audio_info = json.dumps({ + "name": "test.wav", + "signal": "end", + "nbest": 5 + }, sort_keys=True, indent=4, separators=(',', ': ')) + await ws.send(audio_info) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + +def main(args): + logging.basicConfig(level=logging.INFO) + logging.info("asr websocket client start") + handler = ASRAudioHandler("127.0.0.1", 8090) + loop = asyncio.get_event_loop() + loop.run_until_complete(handler.run(args.wavfile)) + logging.info("asr websocket client finished") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--wavfile", + action="store", + help="wav file path ", + default="./16_audio.wav") + args = parser.parse_args() + + main(args) diff --git a/paddlespeech/server/utils/buffer.py b/paddlespeech/server/utils/buffer.py new file mode 100644 index 000000000..4c1a3958a --- /dev/null +++ b/paddlespeech/server/utils/buffer.py @@ -0,0 +1,59 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Frame(object): + """Represents a "frame" of audio data.""" + + def __init__(self, bytes, timestamp, duration): + self.bytes = bytes + self.timestamp = timestamp + self.duration = duration + + +class ChunkBuffer(object): + def __init__(self, + frame_duration_ms=80, + shift_ms=40, + sample_rate=16000, + sample_width=2): + self.sample_rate = sample_rate + self.frame_duration_ms = frame_duration_ms + self.shift_ms = shift_ms + self.remained_audio = b'' + self.sample_width = sample_width # int16 = 2; float32 = 4 + + def frame_generator(self, audio): + """Generates audio frames from PCM audio data. + Takes the desired frame duration in milliseconds, the PCM data, and + the sample rate. + Yields Frames of the requested duration. + """ + audio = self.remained_audio + audio + self.remained_audio = b'' + + n = int(self.sample_rate * + (self.frame_duration_ms / 1000.0) * self.sample_width) + shift_n = int(self.sample_rate * + (self.shift_ms / 1000.0) * self.sample_width) + offset = 0 + timestamp = 0.0 + duration = (float(n) / self.sample_rate) / self.sample_width + shift_duration = (float(shift_n) / self.sample_rate) / self.sample_width + while offset + n <= len(audio): + yield Frame(audio[offset:offset + n], timestamp, duration) + timestamp += shift_duration + offset += shift_n + + self.remained_audio += audio[offset:] diff --git a/paddlespeech/server/utils/vad.py b/paddlespeech/server/utils/vad.py new file mode 100644 index 000000000..e9b557171 --- /dev/null +++ b/paddlespeech/server/utils/vad.py @@ -0,0 +1,79 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
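+# webrtcvad-based voice activity detection for the streaming ASR server.
+#
+# VADAudio buffers raw PCM bytes, cuts them into fixed-length frames, and
+# vad_collector() yields only the frames classified as speech, emitting None
+# between utterances once the ratio of unvoiced frames in the ring buffer
+# exceeds the padding threshold.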
+import collections +import logging + +import webrtcvad + + +class VADAudio(): + def __init__(self, + aggressiveness, + rate, + frame_duration_ms, + sample_width=2, + padding_ms=200, + padding_ratio=0.9): + """Initializes VAD with given aggressivenes and sets up internal queues""" + self.vad = webrtcvad.Vad(aggressiveness) + self.rate = rate + self.sample_width = sample_width + self.frame_duration_ms = frame_duration_ms + self._frame_length = int(rate * (frame_duration_ms / 1000.0) * + self.sample_width) + self._buffer_queue = collections.deque() + self.ring_buffer = collections.deque(maxlen=padding_ms // + frame_duration_ms) + self._ratio = padding_ratio + self.triggered = False + + def add_audio(self, audio): + """Adds new audio to internal queue""" + for x in audio: + self._buffer_queue.append(x) + + def frame_generator(self): + """Generator that yields audio frames of frame_duration_ms""" + while len(self._buffer_queue) > self._frame_length: + frame = bytearray() + for _ in range(self._frame_length): + frame.append(self._buffer_queue.popleft()) + yield bytes(frame) + + def vad_collector(self): + """Generator that yields series of consecutive audio frames comprising each utterence, separated by yielding a single None. + Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered. + Example: (frame, ..., frame, None, frame, ..., frame, None, ...) + |---utterence---| |---utterence---| + """ + for frame in self.frame_generator(): + is_speech = self.vad.is_speech(frame, self.rate) + if not self.triggered: + self.ring_buffer.append((frame, is_speech)) + num_voiced = len( + [f for f, speech in self.ring_buffer if speech]) + if num_voiced > self._ratio * self.ring_buffer.maxlen: + self.triggered = True + for f, s in self.ring_buffer: + yield f + self.ring_buffer.clear() + else: + yield frame + self.ring_buffer.append((frame, is_speech)) + num_unvoiced = len( + [f for f, speech in self.ring_buffer if not speech]) + if num_unvoiced > self._ratio * self.ring_buffer.maxlen: + self.triggered = False + yield None + self.ring_buffer.clear() diff --git a/paddlespeech/server/ws/__init__.py b/paddlespeech/server/ws/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/paddlespeech/server/ws/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/server/ws/api.py b/paddlespeech/server/ws/api.py new file mode 100644 index 000000000..10664d114 --- /dev/null +++ b/paddlespeech/server/ws/api.py @@ -0,0 +1,38 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List + +from fastapi import APIRouter + +from paddlespeech.server.ws.asr_socket import router as asr_router + +_router = APIRouter() + + +def setup_router(api_list: List): + """setup router for fastapi + Args: + api_list (List): [asr, tts] + Returns: + APIRouter + """ + for api_name in api_list: + if api_name == 'asr': + _router.include_router(asr_router) + elif api_name == 'tts': + pass + else: + pass + + return _router diff --git a/paddlespeech/server/ws/asr_socket.py b/paddlespeech/server/ws/asr_socket.py new file mode 100644 index 000000000..5cc9472c8 --- /dev/null +++ b/paddlespeech/server/ws/asr_socket.py @@ -0,0 +1,106 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import base64 +import traceback +from typing import Union +import random +import numpy as np +import json + +from fastapi import APIRouter +from fastapi import WebSocket +from fastapi import WebSocketDisconnect +from starlette.websockets import WebSocketState as WebSocketState + +from paddlespeech.server.engine.asr.online.asr_engine import ASREngine +from paddlespeech.server.engine.engine_pool import get_engine_pool +from paddlespeech.server.utils.buffer import ChunkBuffer +from paddlespeech.server.utils.vad import VADAudio + + +router = APIRouter() + +@router.websocket('/ws/asr') +async def websocket_endpoint(websocket: WebSocket): + + await websocket.accept() + + # init buffer + chunk_buffer = ChunkBuffer(sample_width=2) + # init vad + vad = VADAudio(2, 16000, 20) + + try: + while True: + # careful here, changed the source code from starlette.websockets + assert websocket.application_state == WebSocketState.CONNECTED + message = await websocket.receive() + websocket._raise_on_disconnect(message) + if "text" in message: + message = json.loads(message["text"]) + if 'signal' not in message: + resp = { + "status": "ok", + "message": "no valid json data" + } + await websocket.send_json(resp) + + if message['signal'] == 'start': + resp = { + "status": "ok", + "signal": "server_ready" + } + # do something at begining here + await websocket.send_json(resp) + elif message['signal'] == 'end': + engine_pool = get_engine_pool() + asr_engine = engine_pool['asr'] + # reset single engine for an new connection + asr_engine.reset() + resp = { + "status": "ok", + "signal": "finished" + } + await websocket.send_json(resp) + break + else: + resp = { + "status": "ok", + "message": "no valid json data" + } + await websocket.send_json(resp) + elif "bytes" in message: + message = message["bytes"] + + # vad for input bytes audio + 
vad.add_audio(message) + message = b''.join(f for f in vad.vad_collector() if f is not None) + + engine_pool = get_engine_pool() + asr_engine = engine_pool['asr'] + asr_results = "" + frames = chunk_buffer.frame_generator(message) + for frame in frames: + samples = np.frombuffer(frame.bytes, dtype=np.int16) + sample_rate = asr_engine.config.sample_rate + x_chunk, x_chunk_lens = asr_engine.preprocess(samples, sample_rate) + asr_engine.run(x_chunk, x_chunk_lens) + asr_results = asr_engine.postprocess() + + asr_results = asr_engine.postprocess() + resp = {'asr_results': asr_results} + + await websocket.send_json(resp) + except WebSocketDisconnect: + pass From 9602749a3cb166cf57e04af81c069f387364905a Mon Sep 17 00:00:00 2001 From: qingen Date: Wed, 30 Mar 2022 17:29:22 +0800 Subject: [PATCH 10/15] [vec][search] update client image url, test=doc fix #1608 --- demos/README.md | 1 + demos/README_cn.md | 1 + demos/audio_searching/docker-compose.yaml | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/demos/README.md b/demos/README.md index 4482aa191..36e93dbf1 100644 --- a/demos/README.md +++ b/demos/README.md @@ -4,6 +4,7 @@ The directory containes many speech applications in multi scenarios. +* audio searching - mass audio similarity retrieval * audio tagging - multi-label tagging of an audio file * automatic_video_subtitiles - generate subtitles from a video * metaverse - 2D AR with TTS diff --git a/demos/README_cn.md b/demos/README_cn.md index 242b4f070..add6e25f5 100644 --- a/demos/README_cn.md +++ b/demos/README_cn.md @@ -4,6 +4,7 @@ 该目录包含基于 PaddleSpeech 开发的不同场景的语音应用 Demo: +* 声音检索 - 海量音频相似性检索。 * 声音分类 - 基于 AudioSet 的 527 类标签的音频多标签分类。 * 视频字幕生成 - 识别视频中语音的文本,并进行文本后处理。 * 元宇宙 - 基于语音合成的 2D 增强现实。 diff --git a/demos/audio_searching/docker-compose.yaml b/demos/audio_searching/docker-compose.yaml index 8916e76fd..16ac054d6 100644 --- a/demos/audio_searching/docker-compose.yaml +++ b/demos/audio_searching/docker-compose.yaml @@ -64,7 +64,7 @@ services: webclient: container_name: audio-webclient - image: qingen1/paddlespeech-audio-search-client:2.3 + image: paddlepaddle/paddlespeech-audio-search-client:2.3 networks: app_net: ipv4_address: 172.16.23.13 From aa0cb236e2f78f3cb1b494650fca4c3a867591bf Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 31 Mar 2022 03:20:05 +0000 Subject: [PATCH 11/15] fix openfst patch --- speechx/cmake/external/openfst.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speechx/cmake/external/openfst.cmake b/speechx/cmake/external/openfst.cmake index dc9cdff6c..9acf530a1 100644 --- a/speechx/cmake/external/openfst.cmake +++ b/speechx/cmake/external/openfst.cmake @@ -13,7 +13,7 @@ ExternalProject_Add(openfst "CPPFLAGS=-I${gflags_BINARY_DIR}/include -I${glog_SOURCE_DIR}/src -I${glog_BINARY_DIR}" "LDFLAGS=-L${gflags_BINARY_DIR} -L${glog_BINARY_DIR}" "LIBS=-lgflags_nothreads -lglog -lpthread" - COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR} BUILD_COMMAND make -j 4 ) link_directories(${openfst_PREFIX_DIR}/lib) From 21c4132edacd436ceb2709f64ae5cdf2f39b3bd9 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Thu, 31 Mar 2022 15:16:32 +0800 Subject: [PATCH 12/15] Update paddlespeech_client.py --- paddlespeech/server/bin/paddlespeech_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 40f17c63c..413f00872 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -150,7 +150,7 @@ class TTSClientExecutor(BaseExecutor): res = requests.post(url, json.dumps(request)) response_dict = res.json() - if not output: + if output is not None: self.postprocess(response_dict["result"]["audio"], output) return res From f17347f454fd1000ce235e74713c443777c7a79a Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 31 Mar 2022 07:21:28 +0000 Subject: [PATCH 13/15] glog to stderr --- demos/audio_searching/src/encode.py | 4 ---- speechx/examples/CMakeLists.txt | 2 ++ speechx/examples/README.md | 3 ++- speechx/examples/decoder/offline_decoder_main.cc | 1 + speechx/examples/decoder/run.sh | 5 ++++- speechx/examples/feat/linear_spectrogram_main.cc | 10 ++++++++++ speechx/examples/feat/run.sh | 1 + 7 files changed, 20 insertions(+), 6 deletions(-) diff --git a/demos/audio_searching/src/encode.py b/demos/audio_searching/src/encode.py index cf5f29a40..358057841 100644 --- a/demos/audio_searching/src/encode.py +++ b/demos/audio_searching/src/encode.py @@ -11,11 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os - -import librosa import numpy as np -from config import DEFAULT_TABLE from logs import LOGGER from paddlespeech.cli import VectorExecutor diff --git a/speechx/examples/CMakeLists.txt b/speechx/examples/CMakeLists.txt index ef0a72b88..7f1543c25 100644 --- a/speechx/examples/CMakeLists.txt +++ b/speechx/examples/CMakeLists.txt @@ -3,3 +3,5 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR) add_subdirectory(feat) add_subdirectory(nnet) add_subdirectory(decoder) + +add_subdirectory(glog) \ No newline at end of file diff --git a/speechx/examples/README.md b/speechx/examples/README.md index 941c4272d..705ca2006 100644 --- a/speechx/examples/README.md +++ b/speechx/examples/README.md @@ -1,8 +1,9 @@ # Examples -* decoder - online decoder to work as offline +* glog - glog usage * feat - mfcc, linear * nnet - ds2 nn +* decoder - online decoder to work as offline ## How to run diff --git a/speechx/examples/decoder/offline_decoder_main.cc b/speechx/examples/decoder/offline_decoder_main.cc index 44127c73b..eccd7c099 100644 --- a/speechx/examples/decoder/offline_decoder_main.cc +++ b/speechx/examples/decoder/offline_decoder_main.cc @@ -63,6 +63,7 @@ int main(int argc, char* argv[]) { int32 chunk_size = 35; decoder.InitDecoder(); + LOG(INFO) << "chunk size: " << chunk_size; for (; !feature_reader.Done(); feature_reader.Next()) { string utt = feature_reader.Key(); diff --git a/speechx/examples/decoder/run.sh b/speechx/examples/decoder/run.sh index fc5e91824..1e5a678c2 100755 --- a/speechx/examples/decoder/run.sh +++ b/speechx/examples/decoder/run.sh @@ -25,6 +25,9 @@ model_dir=../paddle_asr_model feat_wspecifier=./feats.ark cmvn=./cmvn.ark + +export GLOG_logtostderr=1 + # 3. 
 # 3. run feat
 linear_spectrogram_main \
     --wav_rspecifier=scp:$model_dir/wav.scp \
@@ -37,4 +40,4 @@ offline_decoder_main \
     --model_path=$model_dir/avg_1.jit.pdmodel \
     --param_path=$model_dir/avg_1.jit.pdparams \
     --dict_file=$model_dir/vocab.txt \
-    --lm_path=$model_dir/avg_1.jit.klm
\ No newline at end of file
+    --lm_path=$model_dir/avg_1.jit.klm
diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc
index 9ed4d6f93..a27db56fd 100644
--- a/speechx/examples/feat/linear_spectrogram_main.cc
+++ b/speechx/examples/feat/linear_spectrogram_main.cc
@@ -25,6 +25,8 @@
 #include "kaldi/util/kaldi-io.h"
 #include "kaldi/util/table-types.h"
 
+#include
+
 DEFINE_string(wav_rspecifier, "", "test wav scp path");
 DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
 DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
@@ -172,6 +174,9 @@ int main(int argc, char* argv[]) {
     ppspeech::LinearSpectrogramOptions opt;
     opt.frame_opts.frame_length_ms = 20;
     opt.frame_opts.frame_shift_ms = 10;
+    LOG(INFO) << "frame length (ms):" << opt.frame_opts.frame_length_ms;
+    LOG(INFO) << "frame shift (ms):" << opt.frame_opts.frame_shift_ms;
+
     ppspeech::DecibelNormalizerOptions db_norm_opt;
     std::unique_ptr base_feature_extractor(
         new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
@@ -190,6 +195,11 @@ int main(int argc, char* argv[]) {
     int sample_rate = 16000;
     int chunk_sample_size = streaming_chunk * sample_rate;
 
+    LOG(INFO) << "sr:" << sample_rate;
+    LOG(INFO) << "chunk size (s):" << streaming_chunk;
+    LOG(INFO) << "chunk size (sample):" << chunk_sample_size;
+
+
     for (; !wav_reader.Done(); wav_reader.Next()) {
         std::string utt = wav_reader.Key();
         const kaldi::WaveData& wave_data = wav_reader.Value();
diff --git a/speechx/examples/feat/run.sh b/speechx/examples/feat/run.sh
index bd21bd7f4..29c49d325 100755
--- a/speechx/examples/feat/run.sh
+++ b/speechx/examples/feat/run.sh
@@ -25,6 +25,7 @@ feat_wspecifier=./feats.ark
 cmvn=./cmvn.ark
 
 # 3. run feat
+export GLOG_logtostderr=1
 linear_spectrogram_main \
     --wav_rspecifier=scp:$model_dir/wav.scp \
     --feature_wspecifier=ark,t:$feat_wspecifier \

From 2ec8d608bf1ad5b0be9c36dc4339702271c27a6e Mon Sep 17 00:00:00 2001
From: WilliamZhang06
Date: Thu, 31 Mar 2022 16:06:16 +0800
Subject: [PATCH 14/15] fixed comments, test=doc

---
 paddlespeech/server/conf/application.yaml    | 13 ++--
 paddlespeech/server/conf/ws_application.yaml | 51 ++++++++++++++++
 .../tests/asr/online/microphone_client.py    | 61 +++++++++++--------
 .../tests/asr/online/websocket_client.py     | 56 ++++++++---------
 paddlespeech/server/utils/vad.py             |  7 +--
 paddlespeech/server/ws/asr_socket.py         | 48 +++++++--------
 6 files changed, 142 insertions(+), 94 deletions(-)
 create mode 100644 paddlespeech/server/conf/ws_application.yaml

diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml
index 40de8e3b2..849349c2d 100644
--- a/paddlespeech/server/conf/application.yaml
+++ b/paddlespeech/server/conf/application.yaml
@@ -3,18 +3,15 @@
 #################################################################################
 #                             SERVER SETTING                                    #
 #################################################################################
-host: 0.0.0.0
+host: 127.0.0.1
 port: 8090
 
 # The task format in the engin_list is: _
 # task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
-# protocol: 'http'
-# engine_list: ['asr_python', 'tts_python', 'cls_python']
-
-
-# websocket, http (only choose one). websocket only support online engine type.
-protocol: 'websocket'
-engine_list: ['asr_online']
+# protocol = ['websocket', 'http'] (only one can be selected).
+# http only support offline engine type.
+protocol: 'http'
+engine_list: ['asr_python', 'tts_python', 'cls_python']
 
 
 #################################################################################
diff --git a/paddlespeech/server/conf/ws_application.yaml b/paddlespeech/server/conf/ws_application.yaml
new file mode 100644
index 000000000..ef23593ed
--- /dev/null
+++ b/paddlespeech/server/conf/ws_application.yaml
@@ -0,0 +1,51 @@
+# This is the parameter configuration file for PaddleSpeech Serving.
+
+#################################################################################
+#                             SERVER SETTING                                    #
+#################################################################################
+host: 0.0.0.0
+port: 8091
+
+# The task format in the engin_list is: _
+# task choices = ['asr_online', 'tts_online']
+# protocol = ['websocket', 'http'] (only one can be selected).
+# websocket only support online engine type.
+protocol: 'websocket'
+engine_list: ['asr_online']
+
+
+#################################################################################
+#                                ENGINE CONFIG                                  #
+#################################################################################
+
+################################### ASR #########################################
+################### speech task: asr; engine_type: online #######################
+asr_online:
+    model_type: 'deepspeech2online_aishell'
+    am_model: # the pdmodel file of am static model [optional]
+    am_params: # the pdiparams file of am static model [optional]
+    lang: 'zh'
+    sample_rate: 16000
+    cfg_path:
+    decode_method:
+    force_yes: True
+
+    am_predictor_conf:
+        device: # set 'gpu:id' or 'cpu'
+        switch_ir_optim: True
+        glog_info: False  # True -> print glog
+        summary: True  # False -> do not show predictor config
+
+    chunk_buffer_conf:
+        frame_duration_ms: 80
+        shift_ms: 40
+        sample_rate: 16000
+        sample_width: 2
+
+    vad_conf:
+        aggressiveness: 2
+        sample_rate: 16000
+        frame_duration_ms: 20
+        sample_width: 2
+        padding_ms: 200
+        padding_ratio: 0.9
diff --git a/paddlespeech/server/tests/asr/online/microphone_client.py b/paddlespeech/server/tests/asr/online/microphone_client.py
index 74d457c56..2ceaf6d03 100644
--- a/paddlespeech/server/tests/asr/online/microphone_client.py
+++ b/paddlespeech/server/tests/asr/online/microphone_client.py
@@ -11,25 +11,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """
 record wave from the mic
 """
-
+import asyncio
+import json
+import logging
 import threading
-import pyaudio
 import wave
-import logging
-import asyncio
+from signal import SIGINT
+from signal import SIGTERM
+
+import pyaudio
 import websockets
-import json
-from signal import SIGINT, SIGTERM
 
 
 class ASRAudioHandler(threading.Thread):
-    def __init__(self,
-                 url="127.0.0.1",
-                 port=8090):
+    def __init__(self, url="127.0.0.1", port=8091):
         threading.Thread.__init__(self)
         self.url = url
         self.port = port
@@ -56,12 +54,13 @@ class ASRAudioHandler(threading.Thread):
         self._running = True
         self._frames = []
         p = pyaudio.PyAudio()
-        stream = p.open(format=self.format,
-                        channels=self.channels,
-                        rate=self.rate,
-                        input=True,
-                        frames_per_buffer=self.chunk)
-        while(self._running):
+        stream = p.open(
+            format=self.format,
+            channels=self.channels,
+            rate=self.rate,
+            input=True,
+            frames_per_buffer=self.chunk)
+        while (self._running):
             data = stream.read(self.chunk)
             self._frames.append(data)
             self.data_backup.append(data)
@@ -97,11 +96,15 @@ class ASRAudioHandler(threading.Thread):
 
         async with websockets.connect(self.url) as ws:
             # 发送开始指令
-            audio_info = json.dumps({
-                "name": "test.wav",
-                "signal": "start",
-                "nbest": 5
-            }, sort_keys=True, indent=4, separators=(',', ': '))
+            audio_info = json.dumps(
+                {
+                    "name": "test.wav",
+                    "signal": "start",
+                    "nbest": 5
+                },
+                sort_keys=True,
+                indent=4,
+                separators=(',', ': '))
             await ws.send(audio_info)
             msg = await ws.recv()
             logging.info("receive msg={}".format(msg))
@@ -117,11 +120,15 @@ class ASRAudioHandler(threading.Thread):
             except asyncio.CancelledError:
                 # quit
                 # send finished
-                audio_info = json.dumps({
-                    "name": "test.wav",
-                    "signal": "end",
-                    "nbest": 5
-                }, sort_keys=True, indent=4, separators=(',', ': '))
+                audio_info = json.dumps(
+                    {
+                        "name": "test.wav",
+                        "signal": "end",
+                        "nbest": 5
+                    },
+                    sort_keys=True,
+                    indent=4,
+                    separators=(',', ': '))
                 await ws.send(audio_info)
                 msg = await ws.recv()
                 logging.info("receive msg={}".format(msg))
@@ -141,7 +148,7 @@ if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO)
     logging.info("asr websocket client start")
 
-    handler = ASRAudioHandler("127.0.0.1", 8090)
+    handler = ASRAudioHandler("127.0.0.1", 8091)
     loop = asyncio.get_event_loop()
     main_task = asyncio.ensure_future(handler.run())
    for signal in [SIGINT, SIGTERM]:
diff --git a/paddlespeech/server/tests/asr/online/websocket_client.py b/paddlespeech/server/tests/asr/online/websocket_client.py
index d849ffeac..58b1a452c 100644
--- a/paddlespeech/server/tests/asr/online/websocket_client.py
+++ b/paddlespeech/server/tests/asr/online/websocket_client.py
@@ -11,26 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 #!/usr/bin/python
 # -*- coding: UTF-8 -*-
-
 import argparse
-import logging
-import time
-import os
+import asyncio
 import json
-import wave
+import logging
+
 import numpy as np
-import asyncio
-import websockets
 import soundfile
+import websockets
 
 
 class ASRAudioHandler:
-    def __init__(self,
-                 url="127.0.0.1",
-                 port=8090):
+    def __init__(self, url="127.0.0.1", port=8090):
         self.url = url
         self.port = port
         self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr"
@@ -38,17 +32,15 @@ class ASRAudioHandler:
     def read_wave(self, wavfile_path: str):
         samples, sample_rate = soundfile.read(wavfile_path, dtype='int16')
         x_len = len(samples)
-        chunk_stride = 40 * 16 #40ms, sample_rate = 16kHz
-        chunk_size = 80 * 16 #80ms, sample_rate = 16kHz
+        chunk_stride = 40 * 16  #40ms, sample_rate = 16kHz
+        chunk_size = 80 * 16  #80ms, sample_rate = 16kHz
 
         if (x_len - chunk_size) % chunk_stride != 0:
-            padding_len_x = chunk_stride - (x_len - chunk_size
-                                            ) % chunk_stride
+            padding_len_x = chunk_stride - (x_len - chunk_size) % chunk_stride
         else:
            padding_len_x = 0
 
-        padding = np.zeros(
-            (padding_len_x), dtype=samples.dtype)
+        padding = np.zeros((padding_len_x), dtype=samples.dtype)
         padded_x = np.concatenate([samples, padding], axis=0)
 
         num_chunk = (x_len + padding_len_x - chunk_size) / chunk_stride + 1
@@ -68,11 +60,15 @@ class ASRAudioHandler:
         async with websockets.connect(self.url) as ws:
             # server 端已经接收到 handshake 协议头
             # 发送开始指令
-            audio_info = json.dumps({
-                "name": "test.wav",
-                "signal": "start",
-                "nbest": 5
-            }, sort_keys=True, indent=4, separators=(',', ': '))
+            audio_info = json.dumps(
+                {
+                    "name": "test.wav",
+                    "signal": "start",
+                    "nbest": 5
+                },
+                sort_keys=True,
+                indent=4,
+                separators=(',', ': '))
             await ws.send(audio_info)
             msg = await ws.recv()
             logging.info("receive msg={}".format(msg))
@@ -84,11 +80,15 @@ class ASRAudioHandler:
                 logging.info("receive msg={}".format(msg))
 
             # finished
-            audio_info = json.dumps({
-                "name": "test.wav",
-                "signal": "end",
-                "nbest": 5
-            }, sort_keys=True, indent=4, separators=(',', ': '))
+            audio_info = json.dumps(
+                {
+                    "name": "test.wav",
+                    "signal": "end",
+                    "nbest": 5
+                },
+                sort_keys=True,
+                indent=4,
+                separators=(',', ': '))
             await ws.send(audio_info)
             msg = await ws.recv()
             logging.info("receive msg={}".format(msg))
@@ -97,7 +97,7 @@ def main(args):
     logging.basicConfig(level=logging.INFO)
     logging.info("asr websocket client start")
-    handler = ASRAudioHandler("127.0.0.1", 8090)
+    handler = ASRAudioHandler("127.0.0.1", 8091)
     loop = asyncio.get_event_loop()
     loop.run_until_complete(handler.run(args.wavfile))
     logging.info("asr websocket client finished")
diff --git a/paddlespeech/server/utils/vad.py b/paddlespeech/server/utils/vad.py
index e9b557171..a2dcf68b8 100644
--- a/paddlespeech/server/utils/vad.py
+++ b/paddlespeech/server/utils/vad.py
@@ -12,16 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import collections
-import logging
 
 import webrtcvad
 
 
 class VADAudio():
     def __init__(self,
-                 aggressiveness,
-                 rate,
-                 frame_duration_ms,
+                 aggressiveness=2,
+                 rate=16000,
+                 frame_duration_ms=20,
                  sample_width=2,
                  padding_ms=200,
                  padding_ratio=0.9):
diff --git a/paddlespeech/server/ws/asr_socket.py b/paddlespeech/server/ws/asr_socket.py
index 5cc9472c8..ea19816b6 100644
--- a/paddlespeech/server/ws/asr_socket.py
+++ b/paddlespeech/server/ws/asr_socket.py
@@ -11,35 +11,39 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import base64
-import traceback
-from typing import Union
-import random
-import numpy as np
 import json
+import numpy as np
 from fastapi import APIRouter
 from fastapi import WebSocket
 from fastapi import WebSocketDisconnect
 from starlette.websockets import WebSocketState as WebSocketState
-from paddlespeech.server.engine.asr.online.asr_engine import ASREngine
 from paddlespeech.server.engine.engine_pool import get_engine_pool
 from paddlespeech.server.utils.buffer import ChunkBuffer
 from paddlespeech.server.utils.vad import VADAudio
-
 router = APIRouter()
+
 
 @router.websocket('/ws/asr')
 async def websocket_endpoint(websocket: WebSocket):
     await websocket.accept()
+    engine_pool = get_engine_pool()
+    asr_engine = engine_pool['asr']
     # init buffer
-    chunk_buffer = ChunkBuffer(sample_width=2)
+    chunk_buffer_conf = asr_engine.config.chunk_buffer_conf
+    chunk_buffer = ChunkBuffer(
+        sample_rate=chunk_buffer_conf['sample_rate'],
+        sample_width=chunk_buffer_conf['sample_width'])
     # init vad
-    vad = VADAudio(2, 16000, 20)
+    vad_conf = asr_engine.config.vad_conf
+    vad = VADAudio(
+        aggressiveness=vad_conf['aggressiveness'],
+        rate=vad_conf['sample_rate'],
+        frame_duration_ms=vad_conf['frame_duration_ms'])
 
     try:
         while True:
@@ -50,17 +54,11 @@ async def websocket_endpoint(websocket: WebSocket):
             if "text" in message:
                 message = json.loads(message["text"])
                 if 'signal' not in message:
-                    resp = {
-                        "status": "ok",
-                        "message": "no valid json data"
-                    }
+                    resp = {"status": "ok", "message": "no valid json data"}
                     await websocket.send_json(resp)
 
                 if message['signal'] == 'start':
-                    resp = {
-                        "status": "ok",
-                        "signal": "server_ready"
-                    }
+                    resp = {"status": "ok", "signal": "server_ready"}
                     # do something at begining here
                     await websocket.send_json(resp)
                 elif message['signal'] == 'end':
@@ -68,24 +66,19 @@ async def websocket_endpoint(websocket: WebSocket):
                     asr_engine = engine_pool['asr']
                     # reset single engine for an new connection
                     asr_engine.reset()
-                    resp = {
-                        "status": "ok",
-                        "signal": "finished"
-                    }
+                    resp = {"status": "ok", "signal": "finished"}
                     await websocket.send_json(resp)
                     break
                 else:
-                    resp = {
-                        "status": "ok",
-                        "message": "no valid json data"
-                    }
+                    resp = {"status": "ok", "message": "no valid json data"}
                     await websocket.send_json(resp)
 
             elif "bytes" in message:
                 message = message["bytes"]

                 # vad for input bytes audio
                 vad.add_audio(message)
-                message = b''.join(f for f in vad.vad_collector() if f is not None)
+                message = b''.join(f for f in vad.vad_collector()
+                                   if f is not None)
 
                 engine_pool = get_engine_pool()
                 asr_engine = engine_pool['asr']
@@ -94,7 +87,8 @@ async def websocket_endpoint(websocket: WebSocket):
                 for frame in frames:
                     samples = np.frombuffer(frame.bytes, dtype=np.int16)
                     sample_rate = asr_engine.config.sample_rate
-                    x_chunk, x_chunk_lens = asr_engine.preprocess(samples, sample_rate)
+                    x_chunk, x_chunk_lens = asr_engine.preprocess(samples,
+                                                                  sample_rate)
                     asr_engine.run(x_chunk, x_chunk_lens)
                     asr_results = asr_engine.postprocess()
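Before the next patch, a quick worked example (not part of any patch in this series) of how the streaming parameters above fit together: the `chunk_buffer_conf` values in `ws_application.yaml` map directly onto the `chunk_size = 80 * 16` and `chunk_stride = 40 * 16` constants hard-coded in the test clients.

```python
# Worked example: samples and bytes per websocket audio chunk, derived from
# chunk_buffer_conf (frame_duration_ms: 80, shift_ms: 40, sample_rate: 16000,
# sample_width: 2). Values match the constants in websocket_client.py.
SAMPLE_RATE = 16000      # Hz
SAMPLE_WIDTH = 2         # bytes per int16 sample
FRAME_DURATION_MS = 80   # chunk_buffer_conf.frame_duration_ms
SHIFT_MS = 40            # chunk_buffer_conf.shift_ms

chunk_size = SAMPLE_RATE * FRAME_DURATION_MS // 1000   # 1280 samples == 80 * 16
chunk_stride = SAMPLE_RATE * SHIFT_MS // 1000          # 640 samples  == 40 * 16
bytes_per_chunk = chunk_size * SAMPLE_WIDTH            # 2560 bytes per send

assert chunk_size == 80 * 16 and chunk_stride == 40 * 16
print(chunk_size, chunk_stride, bytes_per_chunk)       # 1280 640 2560
```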
From cb66b742ab75287e35915e13b59cf71bf42d4146 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Thu, 31 Mar 2022 09:03:01 +0000
Subject: [PATCH 15/15] more comment

---
 .../examples/decoder/offline_decoder_main.cc | 41 ++++++++++++++-----
 speechx/examples/decoder/run.sh              |  2 +-
 speechx/examples/feat/feature-mfcc-test.cc   |  1 -
 .../examples/feat/linear_spectrogram_main.cc | 38 +++++++++--------
 speechx/speechx/frontend/feature_cache.h     |  4 +-
 speechx/speechx/frontend/raw_audio.h         |  3 +-
 6 files changed, 57 insertions(+), 32 deletions(-)

diff --git a/speechx/examples/decoder/offline_decoder_main.cc b/speechx/examples/decoder/offline_decoder_main.cc
index eccd7c099..3a94cc947 100644
--- a/speechx/examples/decoder/offline_decoder_main.cc
+++ b/speechx/examples/decoder/offline_decoder_main.cc
@@ -22,11 +22,12 @@
 #include "nnet/decodable.h"
 #include "nnet/paddle_nnet.h"
 
-DEFINE_string(feature_respecifier, "", "test feature rspecifier");
+DEFINE_string(feature_respecifier, "", "feature matrix rspecifier");
 DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
 DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
 DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
 DEFINE_string(lm_path, "lm.klm", "language model");
+DEFINE_int32(chunk_size, 35, "feat chunk size");
 
 using kaldi::BaseFloat;
@@ -43,14 +44,16 @@ int main(int argc, char* argv[]) {
     std::string model_params = FLAGS_param_path;
     std::string dict_file = FLAGS_dict_file;
     std::string lm_path = FLAGS_lm_path;
+    int32 chunk_size = FLAGS_chunk_size;
+    LOG(INFO) << "model path: " << model_graph;
+    LOG(INFO) << "model param: " << model_params;
+    LOG(INFO) << "dict path: " << dict_file;
+    LOG(INFO) << "lm path: " << lm_path;
+    LOG(INFO) << "chunk size (frame): " << chunk_size;
 
     int32 num_done = 0, num_err = 0;
 
-    ppspeech::CTCBeamSearchOptions opts;
-    opts.dict_file = dict_file;
-    opts.lm_path = lm_path;
-    ppspeech::CTCBeamSearch decoder(opts);
-
+    // frontend + nnet is decodable
     ppspeech::ModelOptions model_opts;
     model_opts.model_path = model_graph;
     model_opts.params_path = model_params;
@@ -60,34 +63,50 @@ int main(int argc, char* argv[]) {
         new ppspeech::RawDataCache());
     std::shared_ptr decodable(
         new ppspeech::Decodable(nnet, raw_data));
+    LOG(INFO) << "Init decodeable.";
 
-    int32 chunk_size = 35;
+    // init decoder
+    ppspeech::CTCBeamSearchOptions opts;
+    opts.dict_file = dict_file;
+    opts.lm_path = lm_path;
+    ppspeech::CTCBeamSearch decoder(opts);
+    LOG(INFO) << "Init decoder.";
+
     decoder.InitDecoder();
-    LOG(INFO) << "chunk size: " << chunk_size;
 
     for (; !feature_reader.Done(); feature_reader.Next()) {
         string utt = feature_reader.Key();
         const kaldi::Matrix feature = feature_reader.Value();
+        LOG(INFO) << "utt: " << utt;
+
+        // feat dim
         raw_data->SetDim(feature.NumCols());
+        LOG(INFO) << "dim: " << raw_data->Dim();
+
         int32 row_idx = 0;
         int32 num_chunks = feature.NumRows() / chunk_size;
+        LOG(INFO) << "n chunks: " << num_chunks;
         for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
+            // feat chunk
             kaldi::Vector feature_chunk(chunk_size *
                                         feature.NumCols());
             for (int row_id = 0; row_id < chunk_size; ++row_id) {
-                kaldi::SubVector tmp(feature, row_idx);
+                kaldi::SubVector feat_one_row(feature,
+                                              row_idx);
                 kaldi::SubVector f_chunk_tmp(
                     feature_chunk.Data() + row_id * feature.NumCols(),
                     feature.NumCols());
-                f_chunk_tmp.CopyFromVec(tmp);
+                f_chunk_tmp.CopyFromVec(feat_one_row);
                 row_idx++;
             }
+            // feed to raw cache
             raw_data->Accept(feature_chunk);
             if (chunk_idx == num_chunks - 1) {
                 raw_data->SetFinished();
             }
+            // decode step
             decoder.AdvanceDecode(decodable);
         }
+
         std::string result;
         result = decoder.GetFinalBestPath();
         KALDI_LOG << " the result of " << utt << " is " << result;
diff --git a/speechx/examples/decoder/run.sh b/speechx/examples/decoder/run.sh
index 1e5a678c2..ddda89702 100755
--- a/speechx/examples/decoder/run.sh
+++ b/speechx/examples/decoder/run.sh
@@ -28,7 +28,7 @@ cmvn=./cmvn.ark
 
 export GLOG_logtostderr=1
 
-# 3. run feat
+# 3. gen linear feat
 linear_spectrogram_main \
     --wav_rspecifier=scp:$model_dir/wav.scp \
diff --git a/speechx/examples/feat/feature-mfcc-test.cc b/speechx/examples/feat/feature-mfcc-test.cc
index ae32aba9e..48a9e1c29 100644
--- a/speechx/examples/feat/feature-mfcc-test.cc
+++ b/speechx/examples/feat/feature-mfcc-test.cc
@@ -41,7 +41,6 @@
 
 using namespace kaldi;
 
-
 static void UnitTestReadWave() {
     std::cout << "=== UnitTestReadWave() ===\n";
diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc
index a27db56fd..cde78c4d3 100644
--- a/speechx/examples/feat/linear_spectrogram_main.cc
+++ b/speechx/examples/feat/linear_spectrogram_main.cc
@@ -151,7 +151,7 @@ void WriteMatrix() {
         cmvn_stats(1, idx) = variance_[idx];
     }
     cmvn_stats(0, mean_.size()) = count_;
-    kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, true);
+    kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, false);
 }
 
 int main(int argc, char* argv[]) {
@@ -163,51 +163,56 @@ int main(int argc, char* argv[]) {
     kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
     WriteMatrix();
 
-    // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning
-    // window -->linear_spectrogram --> cmvn
+    int32 num_done = 0, num_err = 0;
+
+    // feature pipeline: wave cache --> decibel_normalizer --> hanning
+    // window -->linear_spectrogram --> global cmvn -> feat cache
+
     // std::unique_ptr data_source(new
     // ppspeech::RawDataCache());
     std::unique_ptr data_source(
         new ppspeech::RawAudioCache());
 
+    ppspeech::DecibelNormalizerOptions db_norm_opt;
+    std::unique_ptr db_norm(
+        new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
+
     ppspeech::LinearSpectrogramOptions opt;
     opt.frame_opts.frame_length_ms = 20;
     opt.frame_opts.frame_shift_ms = 10;
-    LOG(INFO) << "frame length (ms):" << opt.frame_opts.frame_length_ms;
-    LOG(INFO) << "frame shift (ms):" << opt.frame_opts.frame_shift_ms;
-
-    ppspeech::DecibelNormalizerOptions db_norm_opt;
-    std::unique_ptr base_feature_extractor(
-        new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
+    LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms;
+    LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms;
 
     std::unique_ptr linear_spectrogram(
-        new ppspeech::LinearSpectrogram(opt,
-                                        std::move(base_feature_extractor)));
+        new ppspeech::LinearSpectrogram(opt, std::move(db_norm)));
 
     std::unique_ptr cmvn(
         new ppspeech::CMVN(FLAGS_cmvn_write_path,
                            std::move(linear_spectrogram)));
 
     ppspeech::FeatureCache feature_cache(kint16max, std::move(cmvn));
+    LOG(INFO) << "feat dim: " << feature_cache.Dim();
 
-    float streaming_chunk = 0.36;
     int sample_rate = 16000;
+    float streaming_chunk = 0.36;
     int chunk_sample_size = streaming_chunk * sample_rate;
-
-    LOG(INFO) << "sr:" << sample_rate;
-    LOG(INFO) << "chunk size (s):" << streaming_chunk;
-    LOG(INFO) << "chunk size (sample):" << chunk_sample_size;
+    LOG(INFO) << "sr: " << sample_rate;
+    LOG(INFO) << "chunk size (s): " << streaming_chunk;
+    LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
 
     for (; !wav_reader.Done(); wav_reader.Next()) {
         std::string utt = wav_reader.Key();
         const kaldi::WaveData& wave_data = wav_reader.Value();
+        LOG(INFO) << "process utt: " << utt;
 
         int32 this_channel = 0;
         kaldi::SubVector waveform(wave_data.Data(), this_channel);
         int tot_samples = waveform.Dim();
+        LOG(INFO) << "wav len (sample): " << tot_samples;
+
         int sample_offset = 0;
         std::vector> feats;
         int feature_rows = 0;
@@ -219,6 +224,7 @@ int main(int argc, char* argv[]) {
             for (int i = 0; i < cur_chunk_size; ++i) {
                 wav_chunk(i) = waveform(sample_offset + i);
             }
+
             kaldi::Vector features;
             feature_cache.Accept(wav_chunk);
             if (cur_chunk_size < chunk_sample_size) {
diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h
index b6bbdf3cb..f52b9b0f6 100644
--- a/speechx/speechx/frontend/feature_cache.h
+++ b/speechx/speechx/frontend/feature_cache.h
@@ -28,10 +28,10 @@ class FeatureCache : public FeatureExtractorInterface {
     // Feed feats or waves
     virtual void Accept(const kaldi::VectorBase& inputs);
 
-    // feats dim = num_frames * feature_dim
+    // feats size = num_frames * feat_dim
     virtual bool Read(kaldi::Vector* feats);
 
-    // feature cache only cache feature which from base extractor
+    // feat dim
     virtual size_t Dim() const { return base_extractor_->Dim(); }
 
     virtual void SetFinished() {
diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h
index ce75c137c..7a28f2c98 100644
--- a/speechx/speechx/frontend/raw_audio.h
+++ b/speechx/speechx/frontend/raw_audio.h
@@ -68,9 +68,10 @@ class RawDataCache : public FeatureExtractorInterface {
         data_.Resize(0);
         return true;
     }
-    virtual size_t Dim() const { return dim_; }
+
     virtual void SetFinished() { finished_ = true; }
     virtual bool IsFinished() const { return finished_; }
+    virtual size_t Dim() const { return dim_; }
     void SetDim(int32 dim) { dim_ = dim; }
     virtual void Reset() { finished_ = true; }
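Finally, a rough Python sketch (not part of the patch series; the NumPy array stands in for the Kaldi feature matrix and all names are illustrative) of the chunked decoding loop that offline_decoder_main.cc implements: features are consumed chunk_size frames at a time, and the cache is marked finished on the last chunk so the decoder can flush its state.

```python
import numpy as np

def feed_in_chunks(feature: np.ndarray, chunk_size: int = 35):
    """Yield chunks of `chunk_size` frames, flagging the last one."""
    num_chunks = feature.shape[0] // chunk_size   # leftover frames are dropped,
    for chunk_idx in range(num_chunks):           # just like NumRows() / chunk_size
        chunk = feature[chunk_idx * chunk_size:(chunk_idx + 1) * chunk_size]
        yield chunk, chunk_idx == num_chunks - 1

feats = np.zeros((100, 80), dtype=np.float32)     # e.g. 100 frames of 80-dim features
for chunk, is_last in feed_in_chunks(feats):
    # here the C++ loop calls raw_data->Accept(feature_chunk),
    # raw_data->SetFinished() when is_last, and decoder.AdvanceDecode(decodable)
    pass
```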