import argparse import base64 import datetime import json import os from typing import List import aiofiles import librosa import soundfile as sf import uvicorn from fastapi import FastAPI from fastapi import File from fastapi import Form from fastapi import UploadFile from fastapi import WebSocket from fastapi import WebSocketDisconnect from fastapi.responses import StreamingResponse from pydantic import BaseModel from src.AudioManeger import AudioMannger from src.robot import Robot from src.SpeechBase.vpr import VPR from src.util import * from src.WebsocketManeger import ConnectionManager from starlette.middleware.cors import CORSMiddleware from starlette.requests import Request from starlette.responses import FileResponse from starlette.websockets import WebSocketState as WebSocketState from paddlespeech.cli.tts.infer import TTSExecutor from paddlespeech.server.engine.asr.online.python.asr_engine import PaddleASRConnectionHanddler from paddlespeech.server.utils.audio_process import float2pcm # 解析配置 parser = argparse.ArgumentParser(prog='PaddleSpeechDemo', add_help=True) parser.add_argument( "--port", action="store", type=int, help="port of the app", default=8010, required=False) args = parser.parse_args() port = args.port # 配置文件 tts_config = "conf/tts_online_application.yaml" asr_config = "conf/ws_conformer_wenetspeech_application_faster.yaml" asr_init_path = "source/demo/demo.wav" db_path = "source/db/vpr.sqlite" ie_model_path = "source/model" tts_model = TTSExecutor() # 路径配置 UPLOAD_PATH = "source/vpr" WAV_PATH = "source/wav" base_sources = [UPLOAD_PATH, WAV_PATH] for path in base_sources: os.makedirs(path, exist_ok=True) # 初始化 app = FastAPI() chatbot = Robot( asr_config, tts_config, asr_init_path, ie_model_path=ie_model_path) manager = ConnectionManager() aumanager = AudioMannger(chatbot) aumanager.init() vpr = VPR(db_path, dim=192, top_k=5) # 初始化下载模型 tts_model( text="今天天气准不错", output="test.wav", am='fastspeech2_mix', spk_id=174, voc='hifigan_csmsc', lang='mix', ) # 服务配置 class NlpBase(BaseModel): chat: str class TtsBase(BaseModel): text: str class Audios: def __init__(self) -> None: self.audios = b"" audios = Audios() ###################################################################### ########################### ASR 服务 ################################# ##################################################################### # 接收文件,返回ASR结果 # 上传文件 @app.post("/asr/offline") async def speech2textOffline(files: List[UploadFile]): # 只有第一个有效 asr_res = "" for file in files[:1]: # 生成时间戳 now_name = "asr_offline_" + datetime.datetime.strftime( datetime.datetime.now(), '%Y%m%d%H%M%S') + randName() + ".wav" out_file_path = os.path.join(WAV_PATH, now_name) async with aiofiles.open(out_file_path, 'wb') as out_file: content = await file.read() # async read await out_file.write(content) # async write # 返回ASR识别结果 asr_res = chatbot.speech2text(out_file_path) return SuccessRequest(result=asr_res) return ErrorRequest(message="上传文件为空") # 接收文件,同时将wav强制转成16k, int16类型 @app.post("/asr/offlinefile") async def speech2textOfflineFile(files: List[UploadFile]): # 只有第一个有效 asr_res = "" for file in files[:1]: # 生成时间戳 now_name = "asr_offline_" + datetime.datetime.strftime( datetime.datetime.now(), '%Y%m%d%H%M%S') + randName() + ".wav" out_file_path = os.path.join(WAV_PATH, now_name) async with aiofiles.open(out_file_path, 'wb') as out_file: content = await file.read() # async read await out_file.write(content) # async write # 将文件转成16k, 16bit类型的wav文件 wav, sr = librosa.load(out_file_path, sr=16000) wav = float2pcm(wav) # float32 to int16 wav_bytes = wav.tobytes() # to bytes wav_base64 = base64.b64encode(wav_bytes).decode('utf8') # 将文件重新写入 now_name = now_name[:-4] + "_16k" + ".wav" out_file_path = os.path.join(WAV_PATH, now_name) sf.write(out_file_path, wav, 16000) # 返回ASR识别结果 asr_res = chatbot.speech2text(out_file_path) response_res = {"asr_result": asr_res, "wav_base64": wav_base64} return SuccessRequest(result=response_res) return ErrorRequest(message="上传文件为空") # 流式接收测试 @app.post("/asr/online1") async def speech2textOnlineRecive(files: List[UploadFile]): audio_bin = b'' for file in files: content = await file.read() audio_bin += content audios.audios += audio_bin print(f"audios长度变化: {len(audios.audios)}") return SuccessRequest(message="接收成功") # 采集环境噪音大小 @app.post("/asr/collectEnv") async def collectEnv(files: List[UploadFile]): for file in files[:1]: content = await file.read() # async read # 初始化, wav 前44字节是头部信息 aumanager.compute_env_volume(content[44:]) vad_ = aumanager.vad_threshold return SuccessRequest(result=vad_, message="采集环境噪音成功") # 停止录音 @app.get("/asr/stopRecord") async def stopRecord(): audios.audios = b"" aumanager.stop() print("Online录音暂停") return SuccessRequest(message="停止成功") # 恢复录音 @app.get("/asr/resumeRecord") async def resumeRecord(): aumanager.resume() print("Online录音恢复") return SuccessRequest(message="Online录音恢复") # 聊天用的 ASR @app.websocket("/ws/asr/offlineStream") async def websocket_endpoint(websocket: WebSocket): await manager.connect(websocket) try: while True: asr_res = None # websocket 不接收,只推送 data = await websocket.receive_bytes() if not aumanager.is_pause: asr_res = aumanager.stream_asr(data) else: print("录音暂停") if asr_res: await manager.send_personal_message(asr_res, websocket) aumanager.clear_asr() except WebSocketDisconnect: manager.disconnect(websocket) # await manager.broadcast(f"用户-{user}-离开") # print(f"用户-{user}-离开") # 流式识别的 ASR @app.websocket('/ws/asr/onlineStream') async def websocket_endpoint_online(websocket: WebSocket): """PaddleSpeech Online ASR Server api Args: websocket (WebSocket): the websocket instance """ #1. the interface wait to accept the websocket protocal header # and only we receive the header, it establish the connection with specific thread await websocket.accept() #2. if we accept the websocket headers, we will get the online asr engine instance engine = chatbot.asr.engine #3. each websocket connection, we will create an PaddleASRConnectionHanddler to process such audio # and each connection has its own connection instance to process the request # and only if client send the start signal, we create the PaddleASRConnectionHanddler instance connection_handler = None try: #4. we do a loop to process the audio package by package according the protocal # and only if the client send finished signal, we will break the loop while True: # careful here, changed the source code from starlette.websockets # 4.1 we wait for the client signal for the specific action assert websocket.application_state == WebSocketState.CONNECTED message = await websocket.receive() websocket._raise_on_disconnect(message) #4.2 text for the action command and bytes for pcm data if "text" in message: # we first parse the specific command message = json.loads(message["text"]) if 'signal' not in message: resp = {"status": "ok", "message": "no valid json data"} await websocket.send_json(resp) # start command, we create the PaddleASRConnectionHanddler instance to process the audio data # end command, we process the all the last audio pcm and return the final result # and we break the loop if message['signal'] == 'start': resp = {"status": "ok", "signal": "server_ready"} # do something at beginning here # create the instance to process the audio # connection_handler = chatbot.asr.connection_handler connection_handler = PaddleASRConnectionHanddler(engine) await websocket.send_json(resp) elif message['signal'] == 'end': # reset single engine for an new connection # and we will destroy the connection connection_handler.decode(is_finished=True) connection_handler.rescoring() asr_results = connection_handler.get_result() connection_handler.reset() resp = { "status": "ok", "signal": "finished", 'result': asr_results } await websocket.send_json(resp) break else: resp = {"status": "ok", "message": "no valid json data"} await websocket.send_json(resp) elif "bytes" in message: # bytes for the pcm data message = message["bytes"] print("###############") print("len message: ", len(message)) print("###############") # we extract the remained audio pcm # and decode for the result in this package data connection_handler.extract_feat(message) connection_handler.decode(is_finished=False) asr_results = connection_handler.get_result() # return the current period result # if the engine create the vad instance, this connection will have many period results resp = {'result': asr_results} print(resp) await websocket.send_json(resp) except WebSocketDisconnect: pass ###################################################################### ########################### NLP 服务 ################################# ##################################################################### @app.post("/nlp/chat") async def chatOffline(nlp_base: NlpBase): chat = nlp_base.chat if not chat: return ErrorRequest(message="传入文本为空") else: res = chatbot.chat(chat) return SuccessRequest(result=res) @app.post("/nlp/ie") async def ieOffline(nlp_base: NlpBase): nlp_text = nlp_base.chat if not nlp_text: return ErrorRequest(message="传入文本为空") else: res = chatbot.ie(nlp_text) return SuccessRequest(result=res) ###################################################################### ########################### TTS 服务 ################################# ##################################################################### # 端到端合成 @app.post("/tts/offline") async def text2speechOffline(tts_base: TtsBase): text = tts_base.text if not text: return ErrorRequest(message="文本为空") else: now_name = "tts_" + datetime.datetime.strftime( datetime.datetime.now(), '%Y%m%d%H%M%S') + randName() + ".wav" out_file_path = os.path.join(WAV_PATH, now_name) # 使用中英混合CLI tts_model( text=text, output=out_file_path, am='fastspeech2_mix', spk_id=174, voc='hifigan_csmsc', lang='mix') with open(out_file_path, "rb") as f: data_bin = f.read() base_str = base64.b64encode(data_bin) return SuccessRequest(result=base_str) # http流式TTS @app.post("/tts/online") async def stream_tts(request_body: TtsBase): text = request_body.text return StreamingResponse(chatbot.text2speechStreamBytes(text=text)) # ws流式TTS @app.websocket("/ws/tts/online") async def stream_ttsWS(websocket: WebSocket): await manager.connect(websocket) try: while True: text = await websocket.receive_text() # 用 websocket 流式接收音频数据 if text: for sub_wav in chatbot.text2speechStream(text=text): # print("发送sub wav: ", len(sub_wav)) res = {"wav": sub_wav, "done": False} await websocket.send_json(res) # 输送结束 res = {"wav": sub_wav, "done": True} await websocket.send_json(res) # manager.disconnect(websocket) except WebSocketDisconnect: manager.disconnect(websocket) ###################################################################### ########################### VPR 服务 ################################# ##################################################################### app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]) @app.post('/vpr/enroll') async def vpr_enroll(table_name: str=None, spk_id: str=Form(...), audio: UploadFile=File(...)): # Enroll the uploaded audio with spk-id into MySQL try: if not spk_id: return {'status': False, 'msg': "spk_id can not be None"} # Save the upload data to server. content = await audio.read() now_name = "vpr_enroll_" + datetime.datetime.strftime( datetime.datetime.now(), '%Y%m%d%H%M%S') + randName() + ".wav" audio_path = os.path.join(UPLOAD_PATH, now_name) with open(audio_path, "wb+") as f: f.write(content) vpr.vpr_enroll(username=spk_id, wav_path=audio_path) return {'status': True, 'msg': "Successfully enroll data!"} except Exception as e: return {'status': False, 'msg': e} @app.post('/vpr/recog') async def vpr_recog(request: Request, table_name: str=None, audio: UploadFile=File(...)): # Voice print recognition online # try: # Save the upload data to server. content = await audio.read() now_name = "vpr_query_" + datetime.datetime.strftime( datetime.datetime.now(), '%Y%m%d%H%M%S') + randName() + ".wav" query_audio_path = os.path.join(UPLOAD_PATH, now_name) with open(query_audio_path, "wb+") as f: f.write(content) spk_ids, paths, scores = vpr.do_search_vpr(query_audio_path) res = dict(zip(spk_ids, zip(paths, scores))) # Sort results by distance metric, closest distances first res = sorted(res.items(), key=lambda item: item[1][1], reverse=True) return res @app.post('/vpr/del') async def vpr_del(spk_id: dict=None): # Delete a record by spk_id in MySQL try: spk_id = spk_id['spk_id'] if not spk_id: return {'status': False, 'msg': "spk_id can not be None"} vpr.vpr_del(username=spk_id) return {'status': True, 'msg': "Successfully delete data!"} except Exception as e: return {'status': False, 'msg': e}, 400 @app.get('/vpr/list') async def vpr_list(): # Get all records in MySQL try: spk_ids, vpr_ids = vpr.do_list() return spk_ids, vpr_ids except Exception as e: return {'status': False, 'msg': e}, 400 @app.get('/vpr/database64') async def vpr_database64(vprId: int): # Get the audio file from path by spk_id in MySQL try: if not vprId: return {'status': False, 'msg': "vpr_id can not be None"} audio_path = vpr.do_get_wav(vprId) # 返回base64 # 将文件转成16k, 16bit类型的wav文件 wav, sr = librosa.load(audio_path, sr=16000) wav = float2pcm(wav) # float32 to int16 wav_bytes = wav.tobytes() # to bytes wav_base64 = base64.b64encode(wav_bytes).decode('utf8') return SuccessRequest(result=wav_base64) except Exception as e: return {'status': False, 'msg': e}, 400 @app.get('/vpr/data') async def vpr_data(vprId: int): # Get the audio file from path by spk_id in MySQL try: if not vprId: return {'status': False, 'msg': "vpr_id can not be None"} audio_path = vpr.do_get_wav(vprId) return FileResponse(audio_path) except Exception as e: return {'status': False, 'msg': e}, 400 if __name__ == '__main__': uvicorn.run(app=app, host='0.0.0.0', port=port)