diff --git a/demos/speech_recognition/run.sh b/demos/speech_recognition/run.sh new file mode 100644 index 00000000..19ce0ebb --- /dev/null +++ b/demos/speech_recognition/run.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav + +# asr +paddlespeech asr --input ./zh.wav + + +# asr + punc +paddlespeech asr --input ./zh.wav | paddlespeech text --task punc diff --git a/paddlespeech/server/tests/asr/online/web/app.py b/paddlespeech/server/tests/asr/online/web/app.py index b880cf7f..22993c08 100644 --- a/paddlespeech/server/tests/asr/online/web/app.py +++ b/paddlespeech/server/tests/asr/online/web/app.py @@ -1,12 +1,11 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- - # Copyright 2021 Mobvoi Inc. All Rights Reserved. # Author: zhendong.peng@mobvoi.com (Zhendong Peng) - import argparse -from flask import Flask, render_template +from flask import Flask +from flask import render_template parser = argparse.ArgumentParser(description='training your network') parser.add_argument('--port', default=19999, type=int, help='port id') @@ -14,9 +13,11 @@ args = parser.parse_args() app = Flask(__name__) + @app.route('/') def index(): return render_template('index.html') + if __name__ == '__main__': app.run(host='0.0.0.0', port=args.port, debug=True) diff --git a/paddlespeech/server/tests/asr/online/websocket_client.py b/paddlespeech/server/tests/asr/online/websocket_client.py index 661eb4dd..01b19405 100644 --- a/paddlespeech/server/tests/asr/online/websocket_client.py +++ b/paddlespeech/server/tests/asr/online/websocket_client.py @@ -15,10 +15,11 @@ # -*- coding: UTF-8 -*- import argparse import asyncio +import codecs import json import logging import os -import codecs + import numpy as np import soundfile import websockets @@ -35,17 +36,17 @@ class ASRAudioHandler: x_len = len(samples) # chunk_stride = 40 * 16 #40ms, sample_rate = 16kHz chunk_size = 80 * 16 #80ms, sample_rate = 16kHz - + if x_len % chunk_size != 0: - padding_len_x = chunk_size - x_len % chunk_size + padding_len_x = chunk_size - x_len % chunk_size else: padding_len_x = 0 padding = np.zeros((padding_len_x), dtype=samples.dtype) padded_x = np.concatenate([samples, padding], axis=0) - assert ( x_len + padding_len_x ) % chunk_size == 0 - num_chunk = (x_len + padding_len_x ) / chunk_size + assert (x_len + padding_len_x) % chunk_size == 0 + num_chunk = (x_len + padding_len_x) / chunk_size num_chunk = int(num_chunk) for i in range(0, num_chunk): @@ -56,12 +57,7 @@ class ASRAudioHandler: async def run(self, wavfile_path: str): logging.info("send a message to the server") - # 读取音频 - # self.read_wave() - # 发送 websocket 的 handshake 协议头 async with websockets.connect(self.url) as ws: - # server 端已经接收到 handshake 协议头 - # 发送开始指令 audio_info = json.dumps( { "name": "test.wav", @@ -97,7 +93,6 @@ class ASRAudioHandler: msg = await ws.recv() msg = json.loads(msg) logging.info("receive msg={}".format(msg)) - return result diff --git a/paddlespeech/server/utils/buffer.py b/paddlespeech/server/utils/buffer.py index 06eff964..12b1f0e5 100644 --- a/paddlespeech/server/utils/buffer.py +++ b/paddlespeech/server/utils/buffer.py @@ -24,12 +24,22 @@ class Frame(object): class ChunkBuffer(object): def __init__(self, - window_n=7, # frame - shift_n=4, # frame - window_ms=20, # ms - shift_ms=10, # ms + window_n=7, + shift_n=4, + window_ms=20, + shift_ms=10, sample_rate=16000, sample_width=2): + """audio sample data point buffer + + Args: + window_n (int, optional): decode window frame length. Defaults to 7 frame. + shift_n (int, optional): decode shift frame length. Defaults to 4 frame. + window_ms (int, optional): frame length, ms. Defaults to 20 ms. + shift_ms (int, optional): shift length, ms. Defaults to 10 ms. + sample_rate (int, optional): audio sample rate. Defaults to 16000. + sample_width (int, optional): sample point bytes. Defaults to 2 bytes. + """ self.window_n = window_n self.shift_n = shift_n self.window_ms = window_ms @@ -38,11 +48,14 @@ class ChunkBuffer(object): self.sample_width = sample_width # int16 = 2; float32 = 4 self.remained_audio = b'' - self.window_sec = float((self.window_n - 1) * self.shift_ms + self.window_ms) / 1000.0 + self.window_sec = float((self.window_n - 1) * self.shift_ms + + self.window_ms) / 1000.0 self.shift_sec = float(self.shift_n * self.shift_ms / 1000.0) - self.window_bytes = int(self.window_sec * self.sample_rate * self.sample_width) - self.shift_bytes = int(self.shift_sec * self.sample_rate * self.sample_width) + self.window_bytes = int(self.window_sec * self.sample_rate * + self.sample_width) + self.shift_bytes = int(self.shift_sec * self.sample_rate * + self.sample_width) def frame_generator(self, audio): """Generates audio frames from PCM audio data. @@ -57,7 +70,8 @@ class ChunkBuffer(object): timestamp = 0.0 while offset + self.window_bytes <= len(audio): - yield Frame(audio[offset:offset + self.window_bytes], timestamp, self.window_sec) + yield Frame(audio[offset:offset + self.window_bytes], timestamp, + self.window_sec) timestamp += self.shift_sec offset += self.shift_bytes diff --git a/paddlespeech/server/ws/asr_socket.py b/paddlespeech/server/ws/asr_socket.py index ad4a1124..03a49b48 100644 --- a/paddlespeech/server/ws/asr_socket.py +++ b/paddlespeech/server/ws/asr_socket.py @@ -79,11 +79,6 @@ async def websocket_endpoint(websocket: WebSocket): elif "bytes" in message: message = message["bytes"] - # vad for input bytes audio - # vad.add_audio(message) - # message = b''.join(f for f in vad.vad_collector() - # if f is not None) - engine_pool = get_engine_pool() asr_engine = engine_pool['asr'] asr_results = "" @@ -95,6 +90,7 @@ async def websocket_endpoint(websocket: WebSocket): sample_rate) asr_engine.run(x_chunk, x_chunk_lens) asr_results = asr_engine.postprocess() + asr_results = asr_engine.postprocess() resp = {'asr_results': asr_results}