# PaddleSpeech/demos/speech_web/speech_server/main.py

import argparse
import base64
import datetime
import json
import os
from typing import List

import aiofiles
import librosa
import soundfile as sf
import uvicorn
from fastapi import FastAPI
from fastapi import File
from fastapi import Form
from fastapi import UploadFile
from fastapi import WebSocket
from fastapi import WebSocketDisconnect
from fastapi.responses import JSONResponse
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from src.AudioManeger import AudioMannger
from src.robot import Robot
from src.SpeechBase.vpr import VPR
from src.util import *
from src.WebsocketManeger import ConnectionManager
from starlette.middleware.cors import CORSMiddleware
from starlette.requests import Request
from starlette.responses import FileResponse
from starlette.websockets import WebSocketState as WebSocketState

from paddlespeech.cli.tts.infer import TTSExecutor
from paddlespeech.server.engine.asr.online.python.asr_engine import PaddleASRConnectionHanddler
from paddlespeech.server.utils.audio_process import float2pcm

# Parse the command-line configuration
parser = argparse.ArgumentParser(prog='PaddleSpeechDemo', add_help=True)

# Optional TCP port for the server (defaults to 8010); the parsed value is
# handed to uvicorn.run() in the __main__ guard at the bottom of this file.
parser.add_argument(
    "--port",
    action="store",
    type=int,
    help="port of the app",
    default=8010,
    required=False)

# NOTE: parsing happens at module import time, so merely importing this module
# reads the current process's command-line arguments.
args = parser.parse_args()
port = args.port

# Configuration files and model/resource locations (relative paths)
tts_config = "conf/tts_online_application.yaml"
asr_config = "conf/ws_conformer_wenetspeech_application_faster.yaml"
asr_init_path = "source/demo/demo.wav"
db_path = "source/db/vpr.sqlite"
ie_model_path = "source/model"
# CLI TTS executor; used for the start-up synthesis and by /tts/offline
tts_model = TTSExecutor()
# Path configuration
# UPLOAD_PATH receives the voice-print enroll/query uploads,
# WAV_PATH receives the ASR uploads and the synthesized TTS files.
UPLOAD_PATH = "source/vpr"
WAV_PATH = "source/wav"

# Make sure both storage directories exist before any request is served
base_sources = [UPLOAD_PATH, WAV_PATH]
for path in base_sources:
    os.makedirs(path, exist_ok=True)

# Initialization of the application and the shared service objects
app = FastAPI()
chatbot = Robot(
    asr_config, tts_config, asr_init_path, ie_model_path=ie_model_path)
manager = ConnectionManager()
aumanager = AudioMannger(chatbot)
aumanager.init()
vpr = VPR(db_path, dim=192, top_k=5)
# Initial model download (original note): run one synthesis at start-up,
# writing the audio to test.wav. The am / spk_id / voc / lang settings are the
# same ones the /tts/offline endpoint uses.
tts_model(
    text="今天天气准不错",
    output="test.wav",
    am='fastspeech2_mix',
    spk_id=174,
    voc='hifigan_csmsc',
    lang='mix', )


# Service configuration: request-body models
# JSON body accepted by the NLP endpoints (/nlp/chat and /nlp/ie).
class NlpBase(BaseModel):
    # Input text; both NLP handlers reject an empty value.
    chat: str


# JSON body accepted by the HTTP TTS endpoints (/tts/offline and /tts/online).
class TtsBase(BaseModel):
    # Text to synthesize.
    text: str


class Audios:
    """Mutable holder for raw audio bytes accumulated across requests."""

    def __init__(self) -> None:
        # Start out with an empty byte buffer.
        self.audios = bytes()


# Module-wide buffer: appended to by /asr/online1, cleared by /asr/stopRecord.
audios = Audios()

######################################################################
########################### ASR 服务 #################################
#####################################################################


# Receive an uploaded file and return its ASR result
@app.post("/asr/offline")
async def speech2textOffline(files: List[UploadFile]):
    """Run offline ASR on an uploaded audio file.

    Only the first uploaded file is used; any further files are ignored.
    The upload is stored under WAV_PATH with a timestamped name and the saved
    file is passed to ``chatbot.speech2text``.

    Args:
        files: Uploaded files from the multipart request.

    Returns:
        ``SuccessRequest`` carrying the recognition result, or ``ErrorRequest``
        when no file was uploaded.
    """
    if not files:
        return ErrorRequest(message="上传文件为空")

    upload = files[0]

    # Timestamp plus random suffix keeps the stored file names distinct.
    stamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    saved_path = os.path.join(WAV_PATH,
                              "asr_offline_" + stamp + randName() + ".wav")

    async with aiofiles.open(saved_path, 'wb') as sink:
        payload = await upload.read()
        await sink.write(payload)

    # Return the ASR recognition result
    recognized = chatbot.speech2text(saved_path)
    return SuccessRequest(result=recognized)


# Receive an uploaded file and force the wav to 16 kHz / int16
@app.post("/asr/offlinefile")
async def speech2textOfflineFile(files: List[UploadFile]):
    """Resample an uploaded audio file to 16 kHz int16 and run ASR on it.

    Only the first uploaded file is used. The raw upload and the resampled
    copy (``..._16k.wav``) are both stored under WAV_PATH.

    Args:
        files: Uploaded files from the multipart request.

    Returns:
        ``SuccessRequest`` whose result holds ``asr_result`` and ``wav_base64``
        (base64 of the resampled int16 samples), or ``ErrorRequest`` when no
        file was uploaded.
    """
    if not files:
        return ErrorRequest(message="上传文件为空")

    upload = files[0]

    # Shared stem for the raw and the resampled file names.
    stamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    stem = "asr_offline_" + stamp + randName()

    raw_path = os.path.join(WAV_PATH, stem + ".wav")
    async with aiofiles.open(raw_path, 'wb') as sink:
        payload = await upload.read()
        await sink.write(payload)

    # Load at 16 kHz, then convert float32 samples to int16 PCM.
    samples, _ = librosa.load(raw_path, sr=16000)
    pcm = float2pcm(samples)
    encoded = base64.b64encode(pcm.tobytes()).decode('utf8')

    # Write the converted audio back out next to the original upload.
    resampled_path = os.path.join(WAV_PATH, stem + "_16k" + ".wav")
    sf.write(resampled_path, pcm, 16000)

    # Return the ASR recognition result together with the encoded audio.
    recognized = chatbot.speech2text(resampled_path)
    return SuccessRequest(
        result={"asr_result": recognized,
                "wav_base64": encoded})


# Streaming-receive test endpoint
@app.post("/asr/online1")
async def speech2textOnlineRecive(files: List[UploadFile]):
    """Append the bytes of every uploaded file to the shared audio buffer.

    Args:
        files: Uploaded files; their contents are concatenated in order.

    Returns:
        ``SuccessRequest`` acknowledging receipt.
    """
    chunks = [await upload.read() for upload in files]
    audios.audios += b"".join(chunks)
    print(f"audios长度变化: {len(audios.audios)}")
    return SuccessRequest(message="接收成功")


# Measure the ambient-noise level
@app.post("/asr/collectEnv")
async def collectEnv(files: List[UploadFile]):
    """Estimate the ambient-noise level from an uploaded wav recording.

    Only the first uploaded file is used. Its audio payload is handed to
    ``aumanager.compute_env_volume`` and the resulting ``vad_threshold`` is
    returned.

    Args:
        files: Uploaded files from the multipart request.

    Returns:
        ``SuccessRequest`` carrying the VAD threshold, or ``ErrorRequest`` when
        no file was uploaded (previously this case fell through and returned
        ``None``, unlike the other upload endpoints).
    """
    if not files:
        return ErrorRequest(message="上传文件为空")

    content = await files[0].read()  # async read
    # Skip the first 44 bytes, which the original code treats as the wav
    # header. NOTE(review): assumes a canonical 44-byte PCM header - confirm
    # that clients never send wav files with extra chunks.
    aumanager.compute_env_volume(content[44:])
    vad_ = aumanager.vad_threshold
    return SuccessRequest(result=vad_, message="采集环境噪音成功")


# Stop recording
@app.get("/asr/stopRecord")
async def stopRecord():
    """Clear the shared audio buffer and stop the audio manager.

    Returns:
        ``SuccessRequest`` confirming that recording was stopped.
    """
    # Drop whatever audio has been buffered so far, then stop the manager.
    audios.audios = bytes()
    aumanager.stop()
    print("Online录音暂停")
    return SuccessRequest(message="停止成功")


# Resume recording
@app.get("/asr/resumeRecord")
async def resumeRecord():
    """Resume the audio manager after a previous stop.

    Returns:
        ``SuccessRequest`` confirming that recording was resumed.
    """
    aumanager.resume()
    # The same text is both logged and returned to the client.
    notice = "Online录音恢复"
    print(notice)
    return SuccessRequest(message=notice)


# ASR used by the chat feature
@app.websocket("/ws/asr/offlineStream")
async def websocket_endpoint(websocket: WebSocket):
    """Feed binary audio frames from the client to the audio manager.

    Every binary frame is passed to ``aumanager.stream_asr`` unless the
    manager is paused. Whenever that call yields a non-empty result, it is
    sent back to this client and the manager's ASR state is cleared.

    Args:
        websocket: The client connection; registered with ``manager`` on entry
            and unregistered when the client disconnects.
    """
    await manager.connect(websocket)
    try:
        while True:
            chunk = await websocket.receive_bytes()

            # While paused, incoming audio is read but discarded.
            if aumanager.is_pause:
                print("录音暂停")
                continue

            recognized = aumanager.stream_asr(chunk)
            if not recognized:
                continue

            await manager.send_personal_message(recognized, websocket)
            aumanager.clear_asr()
    except WebSocketDisconnect:
        manager.disconnect(websocket)


# Streaming-recognition ASR
@app.websocket('/ws/asr/onlineStream')
async def websocket_endpoint_online(websocket: WebSocket):
    """PaddleSpeech Online ASR Server api

    Protocol handled by this endpoint:

    * text frame ``{"signal": "start"}``: create a fresh
      ``PaddleASRConnectionHanddler`` and reply ``server_ready``;
    * binary frame: audio data for the current handler; replies with the
      current partial result as ``{'result': ...}``;
    * text frame ``{"signal": "end"}``: finish decoding, rescore, reply with
      the final result and leave the receive loop.

    Text frames that are not a JSON object with a ``signal`` key, or that
    carry an unknown signal, are answered with ``"no valid json data"``.
    An ``end`` signal or audio data arriving before ``start`` is answered with
    an error message instead of crashing the connection.

    Args:
        websocket (WebSocket): the websocket instance
    """

    # 1. accept the websocket handshake
    await websocket.accept()

    # 2. fetch the online asr engine instance shared by all connections
    engine = chatbot.asr.engine

    # 3. each connection gets its own PaddleASRConnectionHanddler, created
    #    only once the client sends the start signal
    connection_handler = None

    invalid_resp = {"status": "ok", "message": "no valid json data"}
    not_started_resp = {
        "status": "error",
        "message": "no active recognition, send the start signal first"
    }

    try:
        # 4. process the stream frame by frame until the client sends the
        #    end signal or disconnects
        while True:
            # 4.1 wait for the next frame; receive() is used directly (instead
            # of receive_text/receive_bytes) because both text and binary
            # frames are expected, so the disconnect check is done by hand
            assert websocket.application_state == WebSocketState.CONNECTED
            message = await websocket.receive()
            websocket._raise_on_disconnect(message)

            # 4.2 text carries an action command, bytes carry pcm data.
            # A key that is present but None counts as absent.
            text_payload = message.get("text")
            bytes_payload = message.get("bytes")

            if text_payload is not None:
                try:
                    command = json.loads(text_payload)
                except json.JSONDecodeError:
                    command = None

                if not isinstance(command, dict) or 'signal' not in command:
                    await websocket.send_json(invalid_resp)
                    # nothing to dispatch on; wait for the next frame
                    continue

                signal = command['signal']
                if signal == 'start':
                    resp = {"status": "ok", "signal": "server_ready"}
                    # create the instance that processes this stream's audio
                    connection_handler = PaddleASRConnectionHanddler(engine)
                    await websocket.send_json(resp)
                elif signal == 'end':
                    if connection_handler is None:
                        await websocket.send_json(not_started_resp)
                        continue

                    # process the remaining audio, produce the final result
                    # and reset the handler before leaving the loop
                    connection_handler.decode(is_finished=True)
                    connection_handler.rescoring()
                    asr_results = connection_handler.get_result()
                    connection_handler.reset()

                    resp = {
                        "status": "ok",
                        "signal": "finished",
                        'result': asr_results
                    }
                    await websocket.send_json(resp)
                    break
                else:
                    await websocket.send_json(invalid_resp)
            elif bytes_payload is not None:
                if connection_handler is None:
                    await websocket.send_json(not_started_resp)
                    continue

                print("###############")
                print("len message: ", len(bytes_payload))
                print("###############")

                # extract features from this package and decode it
                connection_handler.extract_feat(bytes_payload)
                connection_handler.decode(is_finished=False)
                asr_results = connection_handler.get_result()

                # return the result for the current period
                resp = {'result': asr_results}
                print(resp)
                await websocket.send_json(resp)
    except WebSocketDisconnect:
        # the client went away; nothing to clean up for this connection
        pass


######################################################################
########################### NLP 服务 #################################
#####################################################################


@app.post("/nlp/chat")
async def chatOffline(nlp_base: NlpBase):
    """Answer a chat message through ``chatbot.chat``.

    Args:
        nlp_base: Request body; its ``chat`` field holds the user text.

    Returns:
        ``SuccessRequest`` with the chatbot reply, or ``ErrorRequest`` when the
        text is empty.
    """
    user_text = nlp_base.chat
    if user_text:
        reply = chatbot.chat(user_text)
        return SuccessRequest(result=reply)
    return ErrorRequest(message="传入文本为空")


@app.post("/nlp/ie")
async def ieOffline(nlp_base: NlpBase):
    """Pass the submitted text to ``chatbot.ie`` and return its output.

    Args:
        nlp_base: Request body; its ``chat`` field holds the text to analyse.

    Returns:
        ``SuccessRequest`` with the ``chatbot.ie`` result, or ``ErrorRequest``
        when the text is empty.
    """
    source_text = nlp_base.chat
    if source_text:
        extracted = chatbot.ie(source_text)
        return SuccessRequest(result=extracted)
    return ErrorRequest(message="传入文本为空")


######################################################################
########################### TTS 服务 #################################
#####################################################################


# End-to-end synthesis
@app.post("/tts/offline")
async def text2speechOffline(tts_base: TtsBase):
    """Synthesize the given text to a wav file and return it base64-encoded.

    The audio is written under WAV_PATH with a timestamped name, read back and
    returned as a base64 *string*, matching the other endpoints in this file
    that return base64 audio (the payload used to be passed on as ``bytes``).

    Args:
        tts_base: Request body; its ``text`` field holds the text to speak.

    Returns:
        ``SuccessRequest`` with the base64-encoded wav file, or
        ``ErrorRequest`` when the text is empty.
    """
    text = tts_base.text
    if not text:
        return ErrorRequest(message="文本为空")

    now_name = "tts_" + datetime.datetime.strftime(
        datetime.datetime.now(), '%Y%m%d%H%M%S') + randName() + ".wav"
    out_file_path = os.path.join(WAV_PATH, now_name)
    # Use the mixed Chinese/English CLI model
    tts_model(
        text=text,
        output=out_file_path,
        am='fastspeech2_mix',
        spk_id=174,
        voc='hifigan_csmsc',
        lang='mix')
    with open(out_file_path, "rb") as f:
        data_bin = f.read()
    # JSON has no bytes type, so decode the base64 payload explicitly.
    base_str = base64.b64encode(data_bin).decode('utf8')
    return SuccessRequest(result=base_str)


# HTTP streaming TTS
@app.post("/tts/online")
async def stream_tts(request_body: TtsBase):
    """Stream the output of ``chatbot.text2speechStreamBytes`` for the text.

    Args:
        request_body: Request body; its ``text`` field holds the text to speak.

    Returns:
        A ``StreamingResponse`` wrapping the chatbot's streaming output.
    """
    audio_stream = chatbot.text2speechStreamBytes(text=request_body.text)
    return StreamingResponse(audio_stream)


# WebSocket streaming TTS
@app.websocket("/ws/tts/online")
async def stream_ttsWS(websocket: WebSocket):
    """Stream synthesized audio chunks to the client over a WebSocket.

    For every non-empty text frame received, each chunk produced by
    ``chatbot.text2speechStream`` is sent as ``{"wav": chunk, "done": False}``.
    A closing ``{"wav": <last chunk>, "done": True}`` message follows.

    If the synthesis yields no chunk at all, the closing message carries
    ``"wav": None``. Previously that case raised ``UnboundLocalError`` on the
    first request, or re-sent the last chunk of the previous request.

    Args:
        websocket: The client connection; registered with ``manager`` on entry
            and unregistered when the client disconnects.
    """
    await manager.connect(websocket)
    try:
        while True:
            text = await websocket.receive_text()
            if not text:
                # empty frame: nothing to synthesize
                continue

            # Track the last chunk per request so the closing message never
            # depends on a previous request.
            last_wav = None
            for sub_wav in chatbot.text2speechStream(text=text):
                last_wav = sub_wav
                res = {"wav": sub_wav, "done": False}
                await websocket.send_json(res)

            # End of stream: as before, the final message repeats the last
            # chunk with done=True.
            res = {"wav": last_wav, "done": True}
            await websocket.send_json(res)

    except WebSocketDisconnect:
        manager.disconnect(websocket)


######################################################################
########################### VPR 服务 #################################
#####################################################################

# CORS configuration. It is registered on `app`, so it applies to every route
# in this file, not only to the VPR endpoints defined below.
# NOTE(review): any origin, method and header is allowed together with
# credentials; this looks intended for the local demo front end - confirm
# before exposing the server beyond a trusted network.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"])


@app.post('/vpr/enroll')
async def vpr_enroll(table_name: str=None,
                     spk_id: str=Form(...),
                     audio: UploadFile=File(...)):
    """Enroll an uploaded recording under the given speaker id.

    The upload is saved under UPLOAD_PATH with a timestamped name and then
    registered through ``vpr.vpr_enroll``.

    Args:
        table_name: Accepted for compatibility; not used by this handler.
        spk_id: Speaker identifier from the form data.
        audio: Uploaded audio file.

    Returns:
        ``{'status': True, ...}`` on success, otherwise ``{'status': False,
        'msg': <reason>}`` where the reason is the error text.
    """
    try:
        if not spk_id:
            return {'status': False, 'msg': "spk_id can not be None"}
        # Save the upload data to server.
        content = await audio.read()
        now_name = "vpr_enroll_" + datetime.datetime.strftime(
            datetime.datetime.now(), '%Y%m%d%H%M%S') + randName() + ".wav"
        audio_path = os.path.join(UPLOAD_PATH, now_name)

        with open(audio_path, "wb+") as f:
            f.write(content)
        vpr.vpr_enroll(username=spk_id, wav_path=audio_path)
        return {'status': True, 'msg': "Successfully enroll data!"}
    except Exception as e:
        # An exception object is not JSON data; send its text so the client
        # actually sees the failure reason.
        return {'status': False, 'msg': str(e)}


@app.post('/vpr/recog')
async def vpr_recog(request: Request,
                    table_name: str=None,
                    audio: UploadFile=File(...)):
    """Search the enrolled voice prints for the uploaded recording.

    The upload is saved under UPLOAD_PATH and passed to ``vpr.do_search_vpr``.

    Args:
        request: Incoming request; not used by this handler.
        table_name: Accepted for compatibility; not used by this handler.
        audio: Uploaded query audio.

    Returns:
        A list of ``(spk_id, (path, score))`` pairs, one per distinct speaker
        id, ordered by score from highest to lowest.
    """
    # Save the upload data to server.
    payload = await audio.read()
    stamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    query_audio_path = os.path.join(UPLOAD_PATH,
                                    "vpr_query_" + stamp + randName() + ".wav")
    with open(query_audio_path, "wb+") as sink:
        sink.write(payload)

    spk_ids, paths, scores = vpr.do_search_vpr(query_audio_path)

    # Keyed by speaker id, so a repeated id keeps only its last hit.
    by_speaker = {
        spk: (path, score)
        for spk, path, score in zip(spk_ids, paths, scores)
    }

    # Highest score first.
    ranked = list(by_speaker.items())
    ranked.sort(key=lambda entry: entry[1][1], reverse=True)
    return ranked


@app.post('/vpr/del')
async def vpr_del(spk_id: dict=None):
    """Delete the voice-print record of a speaker.

    Args:
        spk_id: JSON body expected to look like ``{"spk_id": <name>}``.

    Returns:
        ``{'status': True, ...}`` on success, or ``{'status': False, ...}``
        when the id is empty. Any failure, including a missing body or a
        missing ``spk_id`` key, produces an HTTP 400 response with
        ``{'status': False, 'msg': <error text>}``.
    """
    try:
        username = spk_id['spk_id']
        if not username:
            return {'status': False, 'msg': "spk_id can not be None"}
        vpr.vpr_del(username=username)
        return {'status': True, 'msg': "Successfully delete data!"}
    except Exception as e:
        # Returning ``body, 400`` would be sent as a two-element JSON array
        # with status 200; use an explicit response for a real 400.
        return JSONResponse(
            status_code=400, content={'status': False,
                                      'msg': str(e)})


@app.get('/vpr/list')
async def vpr_list():
    """List all enrolled voice-print records.

    Returns:
        The pair ``(spk_ids, vpr_ids)`` from ``vpr.do_list`` on success. On
        failure, an HTTP 400 response with ``{'status': False, 'msg': <error
        text>}``.
    """
    try:
        spk_ids, vpr_ids = vpr.do_list()
        return spk_ids, vpr_ids
    except Exception as e:
        # Returning ``body, 400`` would be sent as a two-element JSON array
        # with status 200; use an explicit response for a real 400.
        return JSONResponse(
            status_code=400, content={'status': False,
                                      'msg': str(e)})


@app.get('/vpr/database64')
async def vpr_database64(vprId: int):
    """Return the stored audio of a voice-print record as base64.

    The file located through ``vpr.do_get_wav`` is loaded at 16 kHz, converted
    to int16 PCM and base64-encoded (raw samples, without a wav header).

    Args:
        vprId: Identifier of the voice-print record; a falsy value (0) is
            rejected.

    Returns:
        ``SuccessRequest`` with the base64 string on success. On failure, an
        HTTP 400 response with ``{'status': False, 'msg': <error text>}``.
    """
    try:
        if not vprId:
            return {'status': False, 'msg': "vpr_id can not be None"}
        audio_path = vpr.do_get_wav(vprId)

        # Convert the file to 16 kHz / 16-bit samples and encode as base64
        wav, sr = librosa.load(audio_path, sr=16000)
        wav = float2pcm(wav)  # float32 to int16
        wav_bytes = wav.tobytes()  # to bytes
        wav_base64 = base64.b64encode(wav_bytes).decode('utf8')

        return SuccessRequest(result=wav_base64)
    except Exception as e:
        # Returning ``body, 400`` would be sent as a two-element JSON array
        # with status 200; use an explicit response for a real 400.
        return JSONResponse(
            status_code=400, content={'status': False,
                                      'msg': str(e)})


@app.get('/vpr/data')
async def vpr_data(vprId: int):
    """Return the stored audio file of a voice-print record.

    Args:
        vprId: Identifier of the voice-print record; a falsy value (0) is
            rejected.

    Returns:
        A ``FileResponse`` for the path given by ``vpr.do_get_wav`` on
        success. On failure, an HTTP 400 response with ``{'status': False,
        'msg': <error text>}``.
    """
    try:
        if not vprId:
            return {'status': False, 'msg': "vpr_id can not be None"}
        audio_path = vpr.do_get_wav(vprId)
        return FileResponse(audio_path)
    except Exception as e:
        # Returning ``body, 400`` would be sent as a two-element JSON array
        # with status 200; use an explicit response for a real 400.
        return JSONResponse(
            status_code=400, content={'status': False,
                                      'msg': str(e)})


# Script entry point: serve the app with uvicorn on all network interfaces
# (0.0.0.0), using the port taken from the --port command-line argument.
if __name__ == '__main__':
    uvicorn.run(app=app, host='0.0.0.0', port=port)