diff --git a/demos/speech_web/.gitignore b/demos/speech_web/.gitignore
index 54418e60..1e961a38 100644
--- a/demos/speech_web/.gitignore
+++ b/demos/speech_web/.gitignore
@@ -13,4 +13,7 @@
 *.pdmodel
 */source/*
 */PaddleSpeech/*
+*/tmp*/*
+*/duration.txt
+*/oov_info.txt
diff --git a/demos/speech_web/README.md b/demos/speech_web/README.md
index 3b2da6e9..e8c59ea8 100644
--- a/demos/speech_web/README.md
+++ b/demos/speech_web/README.md
@@ -1,55 +1,79 @@
 # Paddle Speech Demo
 
-PaddleSpeechDemo is a demo project built around the speech-interaction features of PaddleSpeech. It is meant to help you get started with PaddleSpeech and build your own applications on top of it.
+## Introduction
+Paddle Speech Demo is a demo project built around the speech-interaction features of PaddleSpeech. It is meant to help you get started with PaddleSpeech and build your own applications on top of it.
 
-Speech interaction is powered by PaddleSpeech, dialogue and information extraction by PaddleNLP, and the web frontend is built with Vue3
+Speech interaction is powered by PaddleSpeech, dialogue and information extraction by PaddleNLP, and the web frontend is built with Vue3.
 
 Main features:
+Features in `main.py`
 + Voice chat: PaddleSpeech ASR plus TTS, with the chitchat part powered by PaddleNLP
 + Speaker verification: a demo of PaddleSpeech speaker verification
 + Speech recognition: three modes are supported: streaming ASR, end-to-end recognition, and audio-file recognition
 + Speech synthesis: both streaming and end-to-end synthesis are supported
 + Voice commands: smart reimbursement of travel expenses, built on PaddleSpeech ASR and PaddleNLP information extraction
 
+Features in `vc.py`
++ One-shot voice cloning: GE2E- and ECAPA-TDNN-based cloning that synthesizes speech mimicking the timbre of the input audio
+  + for the GE2E cloning recipe, see [【FastSpeech2 + AISHELL-3 Voice Cloning】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc1)
+  + for the ECAPA-TDNN cloning recipe, see [【FastSpeech2 + AISHELL-3 Voice Cloning (ECAPA-TDNN)】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc2)
+
++ Small-data finetuning: a finetuning recipe for small datasets, shipped with an example that finetunes on 12 utterances of the DataBaker Chinese female voice. You can also reset with one click and record your own voice; recording in a quiet environment gives better results. To finetune on your own dataset, try [【Finetune your own AM based on FastSpeech2 with AISHELL-3】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/tts_finetune/tts3).
+
++ ERNIE-SAT: a visual demo of ERNIE-SAT, the cross-modal language-speech large model. It supports personalized synthesis, cross-lingual synthesis (for Chinese audio, enter English text to synthesize), and speech editing (change part of the transcript and regenerate the corresponding audio). For more implementation details of ERNIE-SAT, see:
+  + [【ERNIE-SAT with AISHELL-3 dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/ernie_sat)
+  + [【ERNIE-SAT with AISHELL-3 and VCTK datasets】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat)
+  + [【ERNIE-SAT with VCTK dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/ernie_sat)
+
 Demo:
 
- ![demo](docs/效果展示.png)
+ ![demo](https://user-images.githubusercontent.com/30135920/191188766-12e7ca15-f7b4-45f8-9da5-0c0b0bbe5fcb.png)
 
-## Installation
-### Backend setup
-```
-# set up the environment
-cd speech_server
-pip install -r requirements.txt
+## Base environment setup
 
-# download the IE model, which is finetuned for locations and works better;
-# without it, a generic version is used, which does not perform as well
-cd source
-mkdir model
-cd model
-wget https://bj.bcebos.com/paddlenlp/applications/speech-cmd-analysis/finetune/model_state.pdparams
+### Backend setup
+```bash
+cd speech_server
+pip install -r requirements.txt -i https://mirror.baidu.com/pypi/simple
+cd ../
 ```
 
 ### Frontend setup
 The frontend depends on `node.js`, which must be installed beforehand with `npm` available. The tested `npm` version is `8.3.1`; the stable `node.js` from the [official site](https://nodejs.org/en/) is recommended.
 
-```
+```bash
 # enter the frontend directory
 cd web_client
-
 # install `yarn` (skip if already installed)
 npm install -g yarn
-
 # install the frontend dependencies with yarn
 yarn install
+cd ../
 ```
 
+## Starting the services
+[Note] Only one of the two backends, `main.py` or `vc.py`, can be running at a time.
+
+### Starting the `main.py` backend
+
+#### Download the models
+
+Only the model used by the voice-command feature has to be downloaded by hand; the other models are downloaded automatically.
 
-### Start the backend
+```bash
+cd speech_server
+mkdir -p source/model
+cd source/model
+# download the IE model
+wget https://bj.bcebos.com/paddlenlp/applications/speech-cmd-analysis/finetune/model_state.pdparams
+cd ../../
+
+```
+#### Start the backend
 
 ```
 cd speech_server
@@ -57,7 +81,91 @@ cd speech_server
 python main.py --port 8010
 ```
 
-### Start the frontend
+
+### Starting the `vc.py` backend
+
+#### Download the models and audio
+
+```bash
+cd speech_server
+
+# skip if it already exists
+mkdir -p source/model
+cd source
+# download & unzip wav (includes the VC test audio)
+wget https://paddlespeech.bj.bcebos.com/demos/speech_web/wav_vc.zip
+unzip wav_vc.zip
+
+cd model
+# download the GE2E models
+wget https://bj.bcebos.com/paddlespeech/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip
+unzip ge2e_ckpt_0.3.zip
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip
+unzip pwg_aishell3_ckpt_0.5.zip
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip
+unzip fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip
+
+# download the ECAPA-TDNN models
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_vc2_1.2.0.zip
+unzip fastspeech2_aishell3_ckpt_vc2_1.2.0.zip
+
+# download the ERNIE-SAT models
+# aishell3 ERNIE-SAT
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_aishell3_ckpt_1.2.0.zip
+unzip erniesat_aishell3_ckpt_1.2.0.zip
+
+# vctk ERNIE-SAT
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_vctk_ckpt_1.2.0.zip
+unzip erniesat_vctk_ckpt_1.2.0.zip
+
+# aishell3_vctk ERNIE-SAT
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_aishell3_vctk_ckpt_1.2.0.zip
+unzip erniesat_aishell3_vctk_ckpt_1.2.0.zip
+
+# download the finetune models
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_1.1.0.zip
+unzip fastspeech2_aishell3_ckpt_1.1.0.zip
+
+# download the vocoders
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip
+unzip hifigan_aishell3_ckpt_0.2.0.zip
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip
+unzip hifigan_vctk_ckpt_0.2.0.zip
+
+cd ../../../
+```
+
+#### ERNIE-SAT environment setup
+
+The ERNIE-SAT demo depends on the environment of [examples/aishell3_vctk/ernie_sat](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat). Follow the `README.md` under `examples/aishell3_vctk/ernie_sat` and make sure the example code driven by `run.sh` there works.
+
+Once `examples/aishell3_vctk/ernie_sat` runs, come back to this directory and set up the environment:
+```bash
+cd speech_server
+ln -snf ../../../examples/aishell3_vctk/ernie_sat/download .
+ln -snf ../../../examples/aishell3_vctk/ernie_sat/tools .
+cd ../
+```
+
+#### finetune environment setup
+
+`finetune` requires `aishell3_model.zip` under `tools/aligner` to be unpacked; the finetuning process uses the `tools/aligner/aishell3_model/meta.yaml` file.
+
+```bash
+cd speech_server/tools/aligner
+unzip aishell3_model.zip
+cd -
+```
+
+#### Start the backend
+
+```
+cd speech_server
+# port 8010 by default
+python vc.py --port 8010
+```
+
+### Start the frontend
 
 ```
 cd web_client
@@ -65,6 +173,9 @@
 yarn dev --port 8011
 ```
 
 With the default configuration, the backend address configured in the frontend is localhost, so the backend server and the browser opening the page must be on the same machine. If they are not, see the FAQ below: "How do I reconfigure when the backend is deployed on another machine or a different port".
+
+
+
 ## FAQ
 
 #### Q: How do I install node.js
 
 A: See the [runoob tutorial](https://www.runoob.com/nod
 
 #### Q: How do I reconfigure when the backend is deployed on another machine or a different port
 
 A: The backend address is spread across two files
 
-Change the first file, `PaddleSpeechWebClient/vite.config.js`
+Change the first file, `./web_client/vite.config.js`
 
 ```
 server: {
@@ -90,7 +201,7 @@ server: {
 }
 ```
 
-Change the second file, `PaddleSpeechWebClient/src/api/API.js` (the WebSocket proxy setup does not work, so it has to be changed in this file)
+Change the second file, `./web_client/src/api/API.js` (the WebSocket proxy setup does not work, so it has to be changed in this file)
 
 ```
 // websocket (change this to the address the backend listens on)
diff --git a/demos/speech_web/docs/效果展示.png b/demos/speech_web/docs/效果展示.png
deleted file mode 100644
index 5f7997c1..00000000
Binary files a/demos/speech_web/docs/效果展示.png and /dev/null differ
diff --git a/examples/other/tts_finetune/tts3/finetune.yaml b/demos/speech_web/speech_server/conf/tts3_finetune.yaml
similarity index 86%
rename from examples/other/tts_finetune/tts3/finetune.yaml
rename to demos/speech_web/speech_server/conf/tts3_finetune.yaml
index 374a69f3..4f708bd7 100644
--- a/examples/other/tts_finetune/tts3/finetune.yaml
+++ b/demos/speech_web/speech_server/conf/tts3_finetune.yaml
@@ -3,10 +3,10 @@
 ###########################################################
 # Set to -1 to indicate that the parameter is the same as the pretrained model configuration
-batch_size: -1
+batch_size: 10
 learning_rate: 0.0001    # learning rate
 num_snapshots: -1
 
 # frozen_layers should be a list
 # if you don't need to freeze, set frozen_layers to []
-frozen_layers: ["encoder", "duration_predictor"]
+frozen_layers: ["encoder"]
diff --git a/demos/speech_web/speech_server/main.py b/demos/speech_web/speech_server/main.py
index d4750d59..03e7e599 100644
--- a/demos/speech_web/speech_server/main.py
+++ b/demos/speech_web/speech_server/main.py
@@ -1,8 +1,3 @@
-# todo:
-# 1. start the service
-# 2. receive recorded audio, return the ASR result
-# 3. receive the ASR result, return the NLP dialogue reply
-# 4. receive the NLP reply, return the TTS audio
 import argparse
 import base64
 import datetime
@@ -32,6 +27,7 @@ from starlette.requests import Request
 from starlette.responses import FileResponse
 from starlette.websockets import WebSocketState as WebSocketState
 
+from paddlespeech.cli.tts.infer import TTSExecutor
 from paddlespeech.server.engine.asr.online.python.asr_engine import PaddleASRConnectionHanddler
 from paddlespeech.server.utils.audio_process import float2pcm
 
@@ -55,7 +51,7 @@
 asr_config = "conf/ws_conformer_wenetspeech_application_faster.yaml"
 asr_init_path = "source/demo/demo.wav"
 db_path = "source/db/vpr.sqlite"
 ie_model_path = "source/model"
-
+tts_model = TTSExecutor()
 # path configuration
 UPLOAD_PATH = "source/vpr"
 WAV_PATH = "source/wav"
@@ -72,6 +68,14 @@
 manager = ConnectionManager()
 aumanager = AudioMannger(chatbot)
 aumanager.init()
 vpr = VPR(db_path, dim=192, top_k=5)
+# warm up the TTS model, which also triggers the model download
+tts_model(
+    text="今天天气准不错",
+    output="test.wav",
+    am='fastspeech2_mix',
+    spk_id=174,
+    voc='hifigan_csmsc',
+    lang='mix', )
 
 # service configuration
@@ -331,6 +335,7 @@ async def ieOffline(nlp_base: NlpBase):
 
 #####################################################################
+# end-to-end synthesis
 @app.post("/tts/offline")
 async def text2speechOffline(tts_base: TtsBase):
     text = tts_base.text
@@ -340,8 +345,14 @@
     now_name = "tts_" + datetime.datetime.strftime(
         datetime.datetime.now(), '%Y%m%d%H%M%S') + randName() + ".wav"
     out_file_path = os.path.join(WAV_PATH, now_name)
-    # save to a file, then send it as base64
-    chatbot.text2speech(text, outpath=out_file_path)
+    # use the Chinese-English mixed TTS CLI
+    tts_model(
+        text=text,
+        output=out_file_path,
+        am='fastspeech2_mix',
+        spk_id=174,
+        voc='hifigan_csmsc',
+        lang='mix')
     with open(out_file_path, "rb") as f:
         data_bin = f.read()
     base_str = base64.b64encode(data_bin)
diff --git a/demos/speech_web/speech_server/requirements.txt b/demos/speech_web/speech_server/requirements.txt
index 607f0d4d..cdc65465 100644
--- a/demos/speech_web/speech_server/requirements.txt
+++ b/demos/speech_web/speech_server/requirements.txt
@@ -1,13 +1,8 @@
 aiofiles
 faiss-cpu
-fastapi
-librosa
-numpy
-paddlenlp
-paddlepaddle
-paddlespeech
+praatio==5.0.0
 pydantic
-python-multipartscikit_learn
-SoundFile
+python-multipart
+scikit_learn
 starlette
 uvicorn
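For context, a minimal sketch of a client for the reworked `/tts/offline` endpoint above (the `main.py` server on its default port 8010 is assumed; the response envelope is also an assumption, since the diff truncates the handler's return statement — the `SuccessRequest` wrapper from `src/util.py` is assumed here):

```python
# Hypothetical client for POST /tts/offline; the endpoint and port come
# from main.py above, the {code, result, message} envelope is assumed.
import base64

import requests

resp = requests.post(
    "http://localhost:8010/tts/offline", json={"text": "今天天气准不错"})
body = resp.json()
assert body["code"] == 0, body["message"]
# "result" is assumed to carry the base64-encoded wav file produced above
with open("tts_offline.wav", "wb") as f:
    f.write(base64.b64decode(body["result"]))
```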
diff --git a/demos/speech_web/speech_server/src/ernie_sat.py b/demos/speech_web/speech_server/src/ernie_sat.py
new file mode 100644
index 00000000..b74dd8e3
--- /dev/null
+++ b/demos/speech_web/speech_server/src/ernie_sat.py
@@ -0,0 +1,195 @@
+import os
+
+from .util import MAIN_ROOT
+from .util import run_cmd
+
+
+class SAT:
+    def __init__(self):
+        # pretrained acoustic model paths
+        self.zh_pretrain_model_path = os.path.realpath(
+            "source/model/erniesat_aishell3_ckpt_1.2.0")
+        self.en_pretrain_model_path = os.path.realpath(
+            "source/model/erniesat_vctk_ckpt_1.2.0")
+        self.cross_pretrain_model_path = os.path.realpath(
+            "source/model/erniesat_aishell3_vctk_ckpt_1.2.0")
+
+        # vocoder paths
+        self.zh_voc_model_path = os.path.realpath(
+            "source/model/hifigan_aishell3_ckpt_0.2.0")
+        self.en_voc_model_path = os.path.realpath(
+            "source/model/hifigan_vctk_ckpt_0.2.0")
+        self.cross_voc_model_path = os.path.realpath(
+            "source/model/hifigan_aishell3_ckpt_0.2.0")
+
+        self.BIN_DIR = os.path.join(MAIN_ROOT,
+                                    "paddlespeech/t2s/exps/ernie_sat")
+
+    def zh_synthesize_edit(self,
+                           old_str: str,
+                           new_str: str,
+                           input_name: os.PathLike,
+                           output_name: os.PathLike,
+                           task_name: str="synthesize",
+                           erniesat_ckpt_name: str="snapshot_iter_289500.pdz"):
+
+        if task_name not in ['synthesize', 'edit']:
+            print("task name only in ['edit', 'synthesize']")
+            return None
+
+        # inference configuration
+        config_path = os.path.join(self.zh_pretrain_model_path, "default.yaml")
+        phones_dict = os.path.join(self.zh_pretrain_model_path,
+                                   "phone_id_map.txt")
+        erniesat_ckpt = os.path.join(self.zh_pretrain_model_path,
+                                     erniesat_ckpt_name)
+        erniesat_stat = os.path.join(self.zh_pretrain_model_path,
+                                     "speech_stats.npy")
+
+        voc = "hifigan_aishell3"
+        voc_config = os.path.join(self.zh_voc_model_path, "default.yaml")
+        voc_ckpt = os.path.join(self.zh_voc_model_path,
+                                "snapshot_iter_2500000.pdz")
+        voc_stat = os.path.join(self.zh_voc_model_path, "feats_stats.npy")
+
+        cmd = self.get_cmd(
+            task_name=task_name,
+            input_name=input_name,
+            old_str=old_str,
+            new_str=new_str,
+            config_path=config_path,
+            phones_dict=phones_dict,
+            erniesat_ckpt=erniesat_ckpt,
+            erniesat_stat=erniesat_stat,
+            voc=voc,
+            voc_config=voc_config,
+            voc_ckpt=voc_ckpt,
+            voc_stat=voc_stat,
+            output_name=output_name,
+            source_lang="zh",
+            target_lang="zh")
+
+        return run_cmd(cmd, output_name)
+
+    def crossclone(self,
+                   old_str: str,
+                   new_str: str,
+                   input_name: os.PathLike,
+                   output_name: os.PathLike,
+                   source_lang: str,
+                   target_lang: str,
+                   erniesat_ckpt_name: str="snapshot_iter_489000.pdz"):
+        # inference configuration
+        config_path = os.path.join(self.cross_pretrain_model_path,
+                                   "default.yaml")
+        phones_dict = os.path.join(self.cross_pretrain_model_path,
+                                   "phone_id_map.txt")
+        erniesat_ckpt = os.path.join(self.cross_pretrain_model_path,
+                                     erniesat_ckpt_name)
+        erniesat_stat = os.path.join(self.cross_pretrain_model_path,
+                                     "speech_stats.npy")
+
+        voc = "hifigan_aishell3"
+        voc_config = os.path.join(self.cross_voc_model_path, "default.yaml")
+        voc_ckpt = os.path.join(self.cross_voc_model_path,
+                                "snapshot_iter_2500000.pdz")
+        voc_stat = os.path.join(self.cross_voc_model_path, "feats_stats.npy")
+        task_name = "synthesize"
+        cmd = self.get_cmd(
+            task_name=task_name,
+            input_name=input_name,
+            old_str=old_str,
+            new_str=new_str,
+            config_path=config_path,
+            phones_dict=phones_dict,
+            erniesat_ckpt=erniesat_ckpt,
+            erniesat_stat=erniesat_stat,
+            voc=voc,
+            voc_config=voc_config,
+            voc_ckpt=voc_ckpt,
+            voc_stat=voc_stat,
+            output_name=output_name,
+            source_lang=source_lang,
+            target_lang=target_lang)
+
+        return run_cmd(cmd, output_name)
+
+    def en_synthesize_edit(self,
+                           old_str: str,
+                           new_str: str,
+                           input_name: os.PathLike,
+                           output_name: os.PathLike,
+                           task_name: str="synthesize",
+                           erniesat_ckpt_name: str="snapshot_iter_199500.pdz"):
+
+        # inference configuration
+        config_path = os.path.join(self.en_pretrain_model_path, "default.yaml")
+        phones_dict = os.path.join(self.en_pretrain_model_path,
+                                   "phone_id_map.txt")
+        erniesat_ckpt = os.path.join(self.en_pretrain_model_path,
+                                     erniesat_ckpt_name)
+        erniesat_stat = os.path.join(self.en_pretrain_model_path,
+                                     "speech_stats.npy")
+
+        # use the VCTK vocoder downloaded in the README for English
+        voc = "hifigan_vctk"
+        voc_config = os.path.join(self.en_voc_model_path, "default.yaml")
+        voc_ckpt = os.path.join(self.en_voc_model_path,
+                                "snapshot_iter_2500000.pdz")
+        voc_stat = os.path.join(self.en_voc_model_path, "feats_stats.npy")
+
+        cmd = self.get_cmd(
+            task_name=task_name,
+            input_name=input_name,
+            old_str=old_str,
+            new_str=new_str,
+            config_path=config_path,
+            phones_dict=phones_dict,
+            erniesat_ckpt=erniesat_ckpt,
+            erniesat_stat=erniesat_stat,
+            voc=voc,
+            voc_config=voc_config,
+            voc_ckpt=voc_ckpt,
+            voc_stat=voc_stat,
+            output_name=output_name,
+            source_lang="en",
+            target_lang="en")
+
+        return run_cmd(cmd, output_name)
+
+    def get_cmd(self,
+                task_name: str,
+                input_name: str,
+                old_str: str,
+                new_str: str,
+                config_path: str,
+                phones_dict: str,
+                erniesat_ckpt: str,
+                erniesat_stat: str,
+                voc: str,
+                voc_config: str,
+                voc_ckpt: str,
+                voc_stat: str,
+                output_name: str,
+                source_lang: str,
+                target_lang: str):
+        cmd = f"""
+            FLAGS_allocator_strategy=naive_best_fit \
+            FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+            python3 {self.BIN_DIR}/synthesize_e2e.py \
+                --task_name={task_name} \
+                --wav_path={input_name} \
+                --old_str='{old_str}' \
+                --new_str='{new_str}' \
+                --source_lang={source_lang} \
+                --target_lang={target_lang} \
+                --erniesat_config={config_path} \
+                --phones_dict={phones_dict} \
+                --erniesat_ckpt={erniesat_ckpt} \
+                --erniesat_stat={erniesat_stat} \
+                --voc={voc} \
+                --voc_config={voc_config} \
+                --voc_ckpt={voc_ckpt} \
+                --voc_stat={voc_stat} \
+                --output_name={output_name}
+        """
+
+        return cmd
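As a usage sketch (not part of the patch), the `SAT` wrapper above can be driven directly; it assumes the ERNIE-SAT checkpoints from the README are unpacked under `source/model`, and the wav path and transcripts are placeholders:

```python
# Hypothetical driver for the SAT wrapper defined above; run from
# speech_server so the relative source/model paths resolve.
from src.ernie_sat import SAT

sat = SAT()
out = sat.zh_synthesize_edit(
    old_str="今天天气很好",  # transcript of the reference audio (placeholder)
    new_str="今天心情很好",  # edited transcript to regenerate (placeholder)
    input_name="source/wav/SAT/upload/demo.wav",
    output_name="source/wav/SAT/out/demo_edit.wav",
    task_name="edit")
print(out)  # path of the generated wav, or None if the command failed
```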
diff --git a/demos/speech_web/speech_server/src/finetune.py b/demos/speech_web/speech_server/src/finetune.py
new file mode 100644
index 00000000..d7a440f9
--- /dev/null
+++ b/demos/speech_web/speech_server/src/finetune.py
@@ -0,0 +1,125 @@
+import os
+
+from .util import MAIN_ROOT
+from .util import run_cmd
+
+
+def find_max_ckpt(model_path):
+    # checkpoints are named like snapshot_iter_99.pdz; return the largest
+    # iteration number found under model_path
+    max_ckpt = 0
+    for filename in os.listdir(model_path):
+        if filename.endswith('.pdz'):
+            files = filename[:-4]
+            a1, a2, it = files.split("_")
+            if int(it) > max_ckpt:
+                max_ckpt = int(it)
+    return max_ckpt
+
+
+class FineTune:
+    def __init__(self):
+        self.now_file_path = os.path.dirname(__file__)
+        self.PYTHONPATH = os.path.join(MAIN_ROOT,
+                                       "examples/other/tts_finetune/tts3")
+        self.BIN_DIR = os.path.join(MAIN_ROOT,
+                                    "paddlespeech/t2s/exps/fastspeech2")
+        self.pretrained_model_dir = os.path.realpath(
+            "source/model/fastspeech2_aishell3_ckpt_1.1.0")
+        self.voc_model_dir = os.path.realpath(
+            "source/model/hifigan_aishell3_ckpt_0.2.0")
+        self.finetune_config = "conf/tts3_finetune.yaml"
+
+    def finetune(self, input_dir, exp_dir='temp', epoch=100):
+        """
+        Run the same pipeline as examples/other/tts_finetune/tts3/run.sh
+        through shell commands.
+        """
+        newdir_name = "newdir"
+        new_dir = os.path.join(input_dir, newdir_name)
+        mfa_dir = os.path.join(exp_dir, 'mfa_result')
+        dump_dir = os.path.join(exp_dir, 'dump')
+        output_dir = os.path.join(exp_dir, 'exp')
+        lang = "zh"
+        ngpu = 1
+
+        cmd = f"""
+            # check oov
+            python3 {self.PYTHONPATH}/local/check_oov.py \
+                --input_dir={input_dir} \
+                --pretrained_model_dir={self.pretrained_model_dir} \
+                --newdir_name={newdir_name} \
+                --lang={lang}
+
+            # get mfa result
+            python3 {self.PYTHONPATH}/local/get_mfa_result.py \
+                --input_dir={new_dir} \
+                --mfa_dir={mfa_dir} \
+                --lang={lang}
+
+            # generate durations.txt
+            python3 {self.PYTHONPATH}/local/generate_duration.py \
+                --mfa_dir={mfa_dir}
+
+            # extract feature
+            python3 {self.PYTHONPATH}/local/extract_feature.py \
+                --duration_file="./durations.txt" \
+                --input_dir={new_dir} \
+                --dump_dir={dump_dir} \
+                --pretrained_model_dir={self.pretrained_model_dir}
+
+            # create finetune env
+            python3 {self.PYTHONPATH}/local/prepare_env.py \
+                --pretrained_model_dir={self.pretrained_model_dir} \
+                --output_dir={output_dir}
+
+            # finetune
+            python3 {self.PYTHONPATH}/local/finetune.py \
+                --pretrained_model_dir={self.pretrained_model_dir} \
+                --dump_dir={dump_dir} \
+                --output_dir={output_dir} \
+                --ngpu={ngpu} \
+                --epoch={epoch} \
+                --finetune_config={self.finetune_config}
+        """
+
+        print(cmd)
+
+        return run_cmd(cmd, exp_dir)
+
+    def synthesize(self, text, wav_name, out_wav_dir, exp_dir='temp'):
+
+        voc = "hifigan_aishell3"
+        dump_dir = os.path.join(exp_dir, 'dump')
+        output_dir = os.path.join(exp_dir, 'exp')
+        text_path = os.path.join(exp_dir, 'sentences.txt')
+        lang = "zh"
+        ngpu = 1
+
+        model_path = f"{output_dir}/checkpoints"
+        ckpt = find_max_ckpt(model_path)
+
+        # write the sentence to synthesize
+        with open(text_path, "w", encoding='utf8') as f:
+            f.write(wav_name + " " + text)
+
+        cmd = f"""
+            FLAGS_allocator_strategy=naive_best_fit \
+            FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+            python3 {self.BIN_DIR}/../synthesize_e2e.py \
+                --am=fastspeech2_aishell3 \
+                --am_config={self.pretrained_model_dir}/default.yaml \
+                --am_ckpt={output_dir}/checkpoints/snapshot_iter_{ckpt}.pdz \
+                --am_stat={self.pretrained_model_dir}/speech_stats.npy \
+                --voc={voc} \
+                --voc_config={self.voc_model_dir}/default.yaml \
+                --voc_ckpt={self.voc_model_dir}/snapshot_iter_2500000.pdz \
+                --voc_stat={self.voc_model_dir}/feats_stats.npy \
+                --lang={lang} \
+                --text={text_path} \
+                --output_dir={out_wav_dir} \
+                --phones_dict={dump_dir}/phone_id_map.txt \
+                --speaker_dict={dump_dir}/speaker_id_map.txt \
+                --spk_id=0
+        """
+
+        out_path = os.path.join(out_wav_dir, f"{wav_name}.wav")
+
+        return run_cmd(cmd, out_path)
diff --git a/demos/speech_web/speech_server/src/ge2e_clone.py b/demos/speech_web/speech_server/src/ge2e_clone.py
new file mode 100644
index 00000000..d90013b9
--- /dev/null
+++ b/demos/speech_web/speech_server/src/ge2e_clone.py
@@ -0,0 +1,57 @@
+import os
+import shutil
+
+from .util import MAIN_ROOT
+from .util import run_cmd
+
+
+class VoiceCloneGE2E():
+    def __init__(self):
+        # locate the inference scripts inside the repo
+        self.BIN_DIR = os.path.join(MAIN_ROOT, "paddlespeech/t2s/exps")
+        # am
+        self.am = "fastspeech2_aishell3"
+        self.am_config = "source/model/fastspeech2_nosil_aishell3_vc1_ckpt_0.5/default.yaml"
+        self.am_ckpt = "source/model/fastspeech2_nosil_aishell3_vc1_ckpt_0.5/snapshot_iter_96400.pdz"
+        self.am_stat = "source/model/fastspeech2_nosil_aishell3_vc1_ckpt_0.5/speech_stats.npy"
+        self.phones_dict = "source/model/fastspeech2_nosil_aishell3_vc1_ckpt_0.5/phone_id_map.txt"
+        # voc
+        self.voc = "pwgan_aishell3"
+        self.voc_config = "source/model/pwg_aishell3_ckpt_0.5/default.yaml"
+        self.voc_ckpt = "source/model/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz"
+        self.voc_stat = "source/model/pwg_aishell3_ckpt_0.5/feats_stats.npy"
+        # ge2e
+        self.ge2e_params_path = "source/model/ge2e_ckpt_0.3/step-3000000.pdparams"
+
+    def vc(self, text, input_wav, out_wav):
+
+        # the reference wav must sit alone in a temporary directory
+        _, full_file_name = os.path.split(input_wav)
+        ref_audio_dir = os.path.realpath("tmp_dir/ge2e")
+        if os.path.exists(ref_audio_dir):
+            shutil.rmtree(ref_audio_dir)
+        # recreate the directory in all cases, then copy the reference in
+        os.makedirs(ref_audio_dir, exist_ok=True)
+        shutil.copy(input_wav, ref_audio_dir)
+
+        output_dir = os.path.dirname(out_wav)
+
+        cmd = f"""
+            python3 {self.BIN_DIR}/voice_cloning.py \
+                --am={self.am} \
+                --am_config={self.am_config} \
+                --am_ckpt={self.am_ckpt} \
+                --am_stat={self.am_stat} \
+                --voc={self.voc} \
+                --voc_config={self.voc_config} \
+                --voc_ckpt={self.voc_ckpt} \
+                --voc_stat={self.voc_stat} \
+                --ge2e_params_path={self.ge2e_params_path} \
+                --text="{text}" \
+                --input-dir={ref_audio_dir} \
+                --output-dir={output_dir} \
+                --phones-dict={self.phones_dict}
+        """
+
+        output_name = os.path.join(output_dir, full_file_name)
+        return run_cmd(cmd, output_name=output_name)
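A corresponding usage sketch for the GE2E wrapper (again not part of the patch; it assumes the GE2E, FastSpeech2 and PWGAN checkpoints from the README are unpacked under `source/model`, and the wav paths are placeholders):

```python
# Hypothetical usage of VoiceCloneGE2E; run from speech_server so the
# relative source/model paths resolve.
from src.ge2e_clone import VoiceCloneGE2E

vc = VoiceCloneGE2E()
out = vc.vc(
    text="欢迎使用语音克隆系统",
    input_wav="source/wav/vc/upload/demo.wav",
    out_wav="source/wav/vc/out/demo.wav")
print(out)  # output wav path on success, None on failure
```

Note that the generated file keeps the reference wav's basename: `voice_cloning.py` writes into `--output-dir`, and `run_cmd` checks for `output_dir/<input basename>`.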
diff --git a/demos/speech_web/speech_server/src/tdnn_clone.py b/demos/speech_web/speech_server/src/tdnn_clone.py
new file mode 100644
index 00000000..c24b9b07
--- /dev/null
+++ b/demos/speech_web/speech_server/src/tdnn_clone.py
@@ -0,0 +1,54 @@
+import os
+import shutil
+
+from .util import MAIN_ROOT
+from .util import run_cmd
+
+
+class VoiceCloneTDNN():
+    def __init__(self):
+        # locate the inference scripts inside the repo
+        self.BIN_DIR = os.path.join(MAIN_ROOT, "paddlespeech/t2s/exps")
+
+        self.am = "fastspeech2_aishell3"
+        self.am_config = "source/model/fastspeech2_aishell3_ckpt_vc2_1.2.0/default.yaml"
+        self.am_ckpt = "source/model/fastspeech2_aishell3_ckpt_vc2_1.2.0/snapshot_iter_96400.pdz"
+        self.am_stat = "source/model/fastspeech2_aishell3_ckpt_vc2_1.2.0/speech_stats.npy"
+        self.phones_dict = "source/model/fastspeech2_aishell3_ckpt_vc2_1.2.0/phone_id_map.txt"
+        # voc
+        self.voc = "pwgan_aishell3"
+        self.voc_config = "source/model/pwg_aishell3_ckpt_0.5/default.yaml"
+        self.voc_ckpt = "source/model/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz"
+        self.voc_stat = "source/model/pwg_aishell3_ckpt_0.5/feats_stats.npy"
+
+    def vc(self, text, input_wav, out_wav):
+        # the reference wav must sit alone in a temporary directory
+        _, full_file_name = os.path.split(input_wav)
+        ref_audio_dir = os.path.realpath("tmp_dir/tdnn")
+        if os.path.exists(ref_audio_dir):
+            shutil.rmtree(ref_audio_dir)
+        # recreate the directory in all cases, then copy the reference in
+        os.makedirs(ref_audio_dir, exist_ok=True)
+        shutil.copy(input_wav, ref_audio_dir)
+
+        output_dir = os.path.dirname(out_wav)
+
+        cmd = f"""
+            python3 {self.BIN_DIR}/voice_cloning.py \
+                --am={self.am} \
+                --am_config={self.am_config} \
+                --am_ckpt={self.am_ckpt} \
+                --am_stat={self.am_stat} \
+                --voc={self.voc} \
+                --voc_config={self.voc_config} \
+                --voc_ckpt={self.voc_ckpt} \
+                --voc_stat={self.voc_stat} \
+                --text="{text}" \
+                --input-dir={ref_audio_dir} \
+                --output-dir={output_dir} \
+                --phones-dict={self.phones_dict} \
+                --use_ecapa=True
+        """
+
+        output_name = os.path.join(output_dir, full_file_name)
+        return run_cmd(cmd, output_name=output_name)
diff --git a/demos/speech_web/speech_server/src/util.py b/demos/speech_web/speech_server/src/util.py
index 4a566b6e..a69e6c42 100644
--- a/demos/speech_web/speech_server/src/util.py
+++ b/demos/speech_web/speech_server/src/util.py
@@ -1,4 +1,9 @@
+import os
 import random
+import subprocess
+
+NOW_FILE_PATH = os.path.dirname(__file__)
+MAIN_ROOT = os.path.realpath(os.path.join(NOW_FILE_PATH, "../../../../"))
 
 
 def randName(n=5):
@@ -11,3 +16,20 @@ def SuccessRequest(result=None, message="ok"):
 
 def ErrorRequest(result=None, message="error"):
     return {"code": -1, "result": result, "message": message}
+
+
+def run_cmd(cmd, output_name):
+    p = subprocess.Popen(cmd, shell=True)
+    res = p.wait()
+    print(cmd)
+    print("return code:", res)
+    if res == 0:
+        # the command succeeded
+        if os.path.exists(output_name):
+            return output_name
+        else:
+            # the expected output file was not produced
+            return None
+    else:
+        # the command failed
+        return None
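To illustrate the contract these helpers establish: `run_cmd` runs a shell command and treats an existing output file as the success signal (illustrative only; the copied path is a placeholder):

```python
# Illustrative: run_cmd returns output_name only when the command exits 0
# and the expected artifact exists on disk; otherwise it returns None.
from src.util import MAIN_ROOT, run_cmd

print(MAIN_ROOT)  # resolves four levels up, i.e. the PaddleSpeech repo root

out = run_cmd("cp source/demo/demo.wav /tmp/demo_copy.wav",
              output_name="/tmp/demo_copy.wav")
print(out)  # "/tmp/demo_copy.wav" on success, None on failure
```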
diff --git a/demos/speech_web/speech_server/vc.py b/demos/speech_web/speech_server/vc.py
new file mode 100644
index 00000000..99e56b40
--- /dev/null
+++ b/demos/speech_web/speech_server/vc.py
@@ -0,0 +1,547 @@
+import argparse
+import base64
+import datetime
+import json
+import os
+from typing import List
+
+import aiofiles
+import librosa
+import soundfile as sf
+import uvicorn
+from fastapi import FastAPI
+from fastapi import UploadFile
+from pydantic import BaseModel
+from src.ernie_sat import SAT
+from src.finetune import FineTune
+from src.ge2e_clone import VoiceCloneGE2E
+from src.tdnn_clone import VoiceCloneTDNN
+from src.util import ErrorRequest
+from src.util import randName
+from src.util import SuccessRequest
+from starlette.responses import FileResponse
+
+from paddlespeech.server.utils.audio_process import float2pcm
+
+# parse arguments
+parser = argparse.ArgumentParser(prog='PaddleSpeechDemo', add_help=True)
+
+parser.add_argument(
+    "--port",
+    action="store",
+    type=int,
+    help="port of the app",
+    default=8010,
+    required=False)
+
+args = parser.parse_args()
+port = args.port
+
+# loading these models in-process would interfere with finetuning,
+# so finetuning is driven through shell commands instead
+vc_model = VoiceCloneGE2E()
+vc_model_tdnn = VoiceCloneTDNN()
+
+sat_model = SAT()
+ft_model = FineTune()
+
+# configuration files
+tts_config = "conf/tts_online_application.yaml"
+asr_config = "conf/ws_conformer_wenetspeech_application_faster.yaml"
+asr_init_path = "source/demo/demo.wav"
+db_path = "source/db/vc.sqlite"
+ie_model_path = "source/model"
+
+# path configuration
+VC_UPLOAD_PATH = "source/wav/vc/upload"
+VC_OUT_PATH = "source/wav/vc/out"
+
+FT_UPLOAD_PATH = "source/wav/finetune/upload"
+FT_OUT_PATH = "source/wav/finetune/out"
+FT_LABEL_PATH = "source/wav/finetune/label.json"
+FT_LABEL_TXT_PATH = "source/wav/finetune/labels.txt"
+FT_DEFAULT_PATH = "source/wav/finetune/default"
+FT_EXP_BASE_PATH = "tmp_dir/finetune"
+
+SAT_UPLOAD_PATH = "source/wav/SAT/upload"
+SAT_OUT_PATH = "source/wav/SAT/out"
+SAT_LABEL_PATH = "source/wav/SAT/label.json"
+
+# initialize the SAT transcript labels
+if os.path.exists(SAT_LABEL_PATH):
+    with open(SAT_LABEL_PATH, "r", encoding='utf8') as f:
+        sat_label_dic = json.load(f)
+else:
+    sat_label_dic = {}
+
+# initialize the finetune labels
+if os.path.exists(FT_LABEL_PATH):
+    with open(FT_LABEL_PATH, "r", encoding='utf8') as f:
+        ft_label_dic = json.load(f)
+else:
+    ft_label_dic = {}
+
+# create the working directories
+base_sources = [
+    VC_UPLOAD_PATH,
+    VC_OUT_PATH,
+    FT_UPLOAD_PATH,
+    FT_OUT_PATH,
+    FT_DEFAULT_PATH,
+    SAT_UPLOAD_PATH,
+    SAT_OUT_PATH,
+]
+for path in base_sources:
+    os.makedirs(path, exist_ok=True)
+#####################################################################
+########################### APP initialization #####################
+#####################################################################
+app = FastAPI()
+
+######################################################################
+########################### request schemas #########################
+#####################################################################
+
+
+# request structures
+class VcBase(BaseModel):
+    wavName: str
+    wavPath: str
+
+
+class VcBaseText(BaseModel):
+    wavName: str
+    wavPath: str
+    text: str
+    func: str
+
+
+class VcBaseSAT(BaseModel):
+    old_str: str
+    new_str: str
+    language: str
+    function: str
+    wav: str  # server-side path of the reference wav
+    filename: str
+
+
+class FTPath(BaseModel):
+    dataPath: str
+
+
+class VcBaseFT(BaseModel):
+    wav: str  # base64-encoded wav data
+    filename: str
+    wav_path: str
+
+
+class VcBaseFTModel(BaseModel):
+    wav_path: str
+
+
+class VcBaseFTSyn(BaseModel):
+    exp_path: str
+    text: str
+
+
+######################################################################
+################### file listing and saving helpers #################
+#####################################################################
+
+
+def getVCList(path):
+    VC_FileDict = []
+    # collect the wav files under the given path
+    for root, dirs, files in os.walk(path, topdown=False):
+        for name in files:
+            VC_FileDict.append({'name': name, 'path': os.path.join(root, name)})
+    VC_FileDict = sorted(VC_FileDict, key=lambda x: x['name'], reverse=True)
+    return VC_FileDict
+
+
+async def saveFiles(files, SavePath):
+    right = 0
+    error = 0
+    error_info = "failed files: "
+    for file in files:
+        try:
+            if 'blob' in file.filename:
+                out_file_path = os.path.join(
+                    SavePath,
+                    datetime.datetime.strftime(datetime.datetime.now(),
+                                               '%H%M') + randName(3) + ".wav")
+            else:
+                out_file_path = os.path.join(SavePath, file.filename)
+
+            print("uploaded file:", out_file_path)
+            async with aiofiles.open(out_file_path, 'wb') as out_file:
+                content = await file.read()  # async read
+                await out_file.write(content)  # async write
+            # convert the file to 16 kHz, 16-bit wav
+            wav, sr = librosa.load(out_file_path, sr=16000)
+            sf.write(out_file_path, data=wav, samplerate=sr)
+            right += 1
+        except Exception as e:
+            error += 1
+            error_info = error_info + file.filename + " " + str(e) + "\n"
+            continue
+    return f"succeeded: {right}, failed: {error}, reasons: {error_info}"
+
+
+# audio download
+@app.post("/vc/download")
+async def VcDownload(base: VcBase):
+    if os.path.exists(base.wavPath):
+        return FileResponse(base.wavPath)
+    else:
+        return ErrorRequest(message="download failed: file does not exist")
+
+
+# audio download as base64
+@app.post("/vc/download_base64")
+async def VcDownloadBase64(base: VcBase):
+    if os.path.exists(base.wavPath):
+        # convert the file to 16 kHz, 16-bit samples
+        wav, sr = librosa.load(base.wavPath, sr=16000)
+        wav = float2pcm(wav)  # float32 to int16
+        wav_bytes = wav.tobytes()  # to bytes
+        wav_base64 = base64.b64encode(wav_bytes).decode('utf8')
+        return SuccessRequest(result=wav_base64)
+    else:
+        return ErrorRequest(message="playback failed: file does not exist")
+
+
+######################################################################
+########################### VC service ###############################
+#####################################################################
+
+
+# file upload
+@app.post("/vc/upload")
+async def VcUpload(files: List[UploadFile]):
+    res = await saveFiles(files, VC_UPLOAD_PATH)
+    return SuccessRequest(result=res)
+
+
+# file list
+@app.get("/vc/list")
+async def VcList():
+    res = getVCList(VC_UPLOAD_PATH)
+    return SuccessRequest(result=res)
+
+
+# fetch an audio file
+@app.post("/vc/file")
+async def VcFileGet(base: VcBase):
+    if os.path.exists(base.wavPath):
+        return FileResponse(base.wavPath)
+    else:
+        return ErrorRequest(result="failed to fetch the file")
+
+
+# delete an audio file
+@app.post("/vc/del")
+async def VcFileDel(base: VcBase):
+    if os.path.exists(base.wavPath):
+        os.remove(base.wavPath)
+        return SuccessRequest(result="deleted")
+    else:
+        return ErrorRequest(result="delete failed")
+
+
+# voice cloning (GE2E / ECAPA-TDNN)
+@app.post("/vc/clone_g2p")
+async def VcCloneG2P(base: VcBaseText):
+    if os.path.exists(base.wavPath):
+        try:
+            wavName = base.wavName
+            wavPath = os.path.join(VC_OUT_PATH, wavName)
+            if base.func == 'ge2e':
+                vc_model.vc(
+                    text=base.text, input_wav=base.wavPath, out_wav=wavPath)
+            else:
+                vc_model_tdnn.vc(
+                    text=base.text, input_wav=base.wavPath, out_wav=wavPath)
+            res = {"wavName": wavName, "wavPath": wavPath}
+            return SuccessRequest(result=res)
+        except Exception as e:
+            print(e)
+            return ErrorRequest(message="cloning failed: error during synthesis")
+    else:
+        return ErrorRequest(message="cloning failed: audio does not exist")
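A sketch of a client for the VC endpoints above (server on port 8010 assumed; note that `wavPath` must be a path valid on the server, e.g. one returned by `/vc/list`, and that `/vc/download_base64` returns raw 16 kHz int16 samples without a wav header, so the client adds one):

```python
# Hypothetical client for /vc/clone_g2p and /vc/download_base64;
# names and paths are placeholders.
import base64
import wave

import requests

BASE = "http://localhost:8010"

r = requests.post(BASE + "/vc/clone_g2p", json={
    "wavName": "demo.wav",                       # placeholder name
    "wavPath": "source/wav/vc/upload/demo.wav",  # server-side path
    "text": "欢迎使用语音克隆系统",
    "func": "ge2e",                              # or "tdnn"
}).json()
assert r["code"] == 0, r["message"]

pcm = requests.post(BASE + "/vc/download_base64", json={
    "wavName": r["result"]["wavName"],
    "wavPath": r["result"]["wavPath"],
}).json()
raw = base64.b64decode(pcm["result"])
with wave.open("cloned.wav", "wb") as w:
    w.setnchannels(1)      # mono
    w.setsampwidth(2)      # int16
    w.setframerate(16000)  # matches the server-side resampling
    w.writeframes(raw)
```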
+
+
+######################################################################
+########################### SAT service ##############################
+#####################################################################
+# SAT voice cloning
+@app.post("/vc/clone_sat")
+async def VcCloneSAT(base: VcBaseSAT):
+    # keep sat_label_dic in sync with the submitted transcript
+    if base.filename not in sat_label_dic or sat_label_dic[
+            base.filename] != base.old_str:
+        sat_label_dic[base.filename] = base.old_str
+        with open(SAT_LABEL_PATH, "w", encoding='utf8') as f:
+            json.dump(sat_label_dic, f, ensure_ascii=False, indent=4)
+
+    input_file_path = base.wav
+
+    # select the task
+    if base.language == "zh":
+        if base.function == "synthesize":
+            output_file_path = os.path.join(SAT_OUT_PATH,
+                                            "sat_syn_zh_" + base.filename)
+            # Chinese personalized synthesis
+            sat_result = sat_model.zh_synthesize_edit(
+                old_str=base.old_str,
+                new_str=base.new_str,
+                input_name=os.path.realpath(input_file_path),
+                output_name=os.path.realpath(output_file_path),
+                task_name="synthesize")
+        elif base.function == "edit":
+            output_file_path = os.path.join(SAT_OUT_PATH,
+                                            "sat_edit_zh_" + base.filename)
+            # Chinese speech editing
+            sat_result = sat_model.zh_synthesize_edit(
+                old_str=base.old_str,
+                new_str=base.new_str,
+                input_name=os.path.realpath(input_file_path),
+                output_name=os.path.realpath(output_file_path),
+                task_name="edit")
+        elif base.function == "crossclone":
+            output_file_path = os.path.join(SAT_OUT_PATH,
+                                            "sat_cross_zh_" + base.filename)
+            # Chinese-to-English cross-lingual synthesis
+            sat_result = sat_model.crossclone(
+                old_str=base.old_str,
+                new_str=base.new_str,
+                input_name=os.path.realpath(input_file_path),
+                output_name=os.path.realpath(output_file_path),
+                source_lang="zh",
+                target_lang="en")
+        else:
+            return ErrorRequest(
+                message="invalid function: only synthesize, edit and crossclone are supported")
+    elif base.language == "en":
+        if base.function == "synthesize":
+            output_file_path = os.path.join(SAT_OUT_PATH,
+                                            "sat_syn_en_" + base.filename)
+            # English personalized synthesis
+            sat_result = sat_model.en_synthesize_edit(
+                old_str=base.old_str,
+                new_str=base.new_str,
+                input_name=os.path.realpath(input_file_path),
+                output_name=os.path.realpath(output_file_path),
+                task_name="synthesize")
+        elif base.function == "edit":
+            output_file_path = os.path.join(SAT_OUT_PATH,
+                                            "sat_edit_en_" + base.filename)
+            # English speech editing
+            sat_result = sat_model.en_synthesize_edit(
+                old_str=base.old_str,
+                new_str=base.new_str,
+                input_name=os.path.realpath(input_file_path),
+                output_name=os.path.realpath(output_file_path),
+                task_name="edit")
+        elif base.function == "crossclone":
+            output_file_path = os.path.join(SAT_OUT_PATH,
+                                            "sat_cross_en_" + base.filename)
+            # English-to-Chinese cross-lingual synthesis
+            sat_result = sat_model.crossclone(
+                old_str=base.old_str,
+                new_str=base.new_str,
+                input_name=os.path.realpath(input_file_path),
+                output_name=os.path.realpath(output_file_path),
+                source_lang="en",
+                target_lang="zh")
+        else:
+            return ErrorRequest(
+                message="invalid function: only synthesize, edit and crossclone are supported")
+    else:
+        return ErrorRequest(message="invalid language: only zh and en are supported")
+
+    if sat_result:
+        return SuccessRequest(result=sat_result, message="SAT synthesis succeeded")
+    else:
+        return ErrorRequest(message="SAT synthesis failed; check the backend logs")
+
+
+# SAT file list
+@app.get("/sat/list")
+async def SatList():
+    res = []
+    filelist = getVCList(SAT_UPLOAD_PATH)
+    for fileitem in filelist:
+        if fileitem['name'] in sat_label_dic:
+            fileitem['label'] = sat_label_dic[fileitem['name']]
+        else:
+            fileitem['label'] = ""
+        res.append(fileitem)
+    return SuccessRequest(result=res)
+
+
+# upload SAT audio
+@app.post("/sat/upload")
+async def SATUpload(files: List[UploadFile]):
+    res = await saveFiles(files, SAT_UPLOAD_PATH)
+    return SuccessRequest(result=res)
+
+
+######################################################################
+########################### FineTune service #########################
+#####################################################################
+
+
+# finetune file list
+@app.post("/finetune/list")
+async def FineTuneList(Path: FTPath):
+    dataPath = Path.dataPath
+    if dataPath == "default":
+        # default directory
+        FT_PATH = FT_DEFAULT_PATH
+    else:
+        FT_PATH = dataPath
+
+    res = []
+    for name, value in ft_label_dic.items():
+        wav_path = os.path.join(FT_PATH, name)
+        if not os.path.exists(wav_path):
+            wav_path = ""
+        d = {'text': value['text'], 'name': name, 'path': wav_path}
+        res.append(d)
+    return SuccessRequest(result=res)
+
+
+# one-click reset: create a fresh upload directory
+@app.get('/finetune/newdir')
+async def FTGetNewDir():
+    new_path = os.path.join(FT_UPLOAD_PATH, randName(3))
+    if not os.path.exists(new_path):
+        os.makedirs(new_path, exist_ok=True)
+    # copy labels.txt into it
+    cmd = f"cp {FT_LABEL_TXT_PATH} {new_path}"
+    os.system(cmd)
+    return SuccessRequest(result=new_path)
+
+
+# finetune: upload a file
+@app.post("/finetune/upload")
+async def FTUpload(base: VcBaseFT):
+    try:
+        # make sure the directory exists
+        if not os.path.exists(base.wav_path):
+            os.makedirs(base.wav_path)
+        # save the audio file
+        out_file_path = os.path.join(base.wav_path, base.filename)
+        wav_b = base64.b64decode(base.wav)
+        async with aiofiles.open(out_file_path, 'wb') as out_file:
+            await out_file.write(wav_b)  # async write
+
+        return SuccessRequest(result="upload succeeded")
+    except Exception as e:
+        return ErrorRequest(result="upload failed")
+
+
+# finetune: train
+@app.post("/finetune/clone_finetune")
+async def FTModel(base: VcBaseFTModel):
+    # check that wav_path is valid first
+    if base.wav_path == 'default':
+        data_path = FT_DEFAULT_PATH
+    else:
+        data_path = base.wav_path
+    if not os.path.exists(data_path):
+        return ErrorRequest(message="data directory does not exist")
+
+    data_base = data_path.split(os.sep)[-1]
+    exp_dir = os.path.join(FT_EXP_BASE_PATH, data_base)
+    try:
+        exp_dir = ft_model.finetune(
+            input_dir=os.path.realpath(data_path),
+            exp_dir=os.path.realpath(exp_dir))
+        if exp_dir:
+            return SuccessRequest(result=exp_dir)
+        else:
+            return ErrorRequest(message="finetuning failed")
+    except Exception as e:
+        print(e)
+        return ErrorRequest(message="finetuning failed")
+
+
+# finetune: synthesize
+@app.post("/finetune/clone_finetune_syn")
+async def FTSyn(base: VcBaseFTSyn):
+    try:
+        if not os.path.exists(base.exp_path):
+            return ErrorRequest(result="model path does not exist")
+        wav_name = randName(5)
+        wav_path = ft_model.synthesize(
+            text=base.text,
+            wav_name=wav_name,
+            out_wav_dir=os.path.realpath(FT_OUT_PATH),
+            exp_dir=os.path.realpath(base.exp_path))
+        if wav_path:
+            res = {"wavName": wav_name + ".wav", "wavPath": wav_path}
+            return SuccessRequest(result=res)
+        else:
+            return ErrorRequest(message="synthesis failed")
+    except Exception as e:
+        return ErrorRequest(message="synthesis failed")
+
+
+if __name__ == '__main__':
+    uvicorn.run(app=app, host='0.0.0.0', port=port)
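Finally, a sketch of the full finetune round trip over the endpoints above (port 8010 assumed; the recording name is a placeholder and is assumed to match an entry in the `labels.txt` that `/finetune/newdir` seeds into the directory):

```python
# Hypothetical walk through the finetune API: reserve a directory,
# upload a recording, finetune, then synthesize with the new model.
import base64

import requests

BASE = "http://localhost:8010"

# 1. one-click reset: get a fresh upload directory (pre-seeded with labels.txt)
new_dir = requests.get(BASE + "/finetune/newdir").json()["result"]

# 2. upload one recording as base64
with open("000001.wav", "rb") as f:  # placeholder local recording
    wav_b64 = base64.b64encode(f.read()).decode("utf8")
requests.post(BASE + "/finetune/upload", json={
    "wav": wav_b64, "filename": "000001.wav", "wav_path": new_dir})

# 3. finetune on the uploaded data; the experiment directory comes back
exp = requests.post(BASE + "/finetune/clone_finetune",
                    json={"wav_path": new_dir}).json()["result"]

# 4. synthesize with the finetuned model
res = requests.post(BASE + "/finetune/clone_finetune_syn",
                    json={"exp_path": exp, "text": "今天天气准不错"}).json()
print(res)
```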
diff --git a/demos/speech_web/web_client/package.json b/demos/speech_web/web_client/package.json
index 7f28d4c9..d8c213e4 100644
--- a/demos/speech_web/web_client/package.json
+++ b/demos/speech_web/web_client/package.json
@@ -8,6 +8,7 @@
     "preview": "vite preview"
   },
   "dependencies": {
+    "@element-plus/icons-vue": "^2.0.9",
     "ant-design-vue": "^2.2.8",
     "axios": "^0.26.1",
     "element-plus": "^2.1.9",
@@ -18,6 +19,7 @@
   },
   "devDependencies": {
     "@vitejs/plugin-vue": "^2.3.0",
-    "vite": "^2.9.0"
+    "vite": "^2.9.13",
+    "@vue/compiler-sfc": "^3.1.0"
   }
 }
diff --git a/demos/speech_web/web_client/src/api/API.js b/demos/speech_web/web_client/src/api/API.js
index 0feaa63f..5adca362 100644
--- a/demos/speech_web/web_client/src/api/API.js
+++ b/demos/speech_web/web_client/src/api/API.js
@@ -19,6 +19,26 @@ export const apiURL = {
     CHAT_SOCKET_RECORD: 'ws://localhost:8010/ws/asr/offlineStream', // ChatBot websocket endpoint
     ASR_SOCKET_RECORD: 'ws://localhost:8010/ws/asr/onlineStream', // streaming ASR endpoint
     TTS_SOCKET_RECORD: 'ws://localhost:8010/ws/tts/online', // streaming TTS endpoint
+
+    // Voice Clone
+    VC_List: '/api/vc/list',
+    SAT_List: '/api/sat/list',
+    FineTune_List: '/api/finetune/list',
+
+    VC_Upload: '/api/vc/upload',
+    SAT_Upload: '/api/sat/upload',
+    FineTune_Upload: '/api/finetune/upload',
+    FineTune_NewDir: '/api/finetune/newdir',
+
+    VC_Download: '/api/vc/download',
+    VC_Download_Base64: '/api/vc/download_base64',
+    VC_Del: '/api/vc/del',
+
+    VC_CloneG2p: '/api/vc/clone_g2p',
+    VC_CloneSAT: '/api/vc/clone_sat',
+    VC_CloneFineTune: '/api/finetune/clone_finetune',
+    VC_CloneFineTuneSyn: '/api/finetune/clone_finetune_syn',
 }
diff --git a/demos/speech_web/web_client/src/api/ApiVC.js b/demos/speech_web/web_client/src/api/ApiVC.js
new file mode 100644
index 00000000..0dc0f683
--- /dev/null
+++ b/demos/speech_web/web_client/src/api/ApiVC.js
@@ -0,0 +1,88 @@
+import axios from 'axios'
+import {apiURL} from "./API.js"
+
+// upload audio - VC
+export async function vcUpload(params){
+    const result = await axios.post(apiURL.VC_Upload, params);
+    return result
+}
+
+// upload audio - SAT
+export async function satUpload(params){
+    const result = await axios.post(apiURL.SAT_Upload, params);
+    return result
+}
+
+// upload audio - finetune
+export async function fineTuneUpload(params){
+    const result = await axios.post(apiURL.FineTune_Upload, params);
+    return result
+}
+
+// delete audio
+export async function vcDel(params){
+    const result = await axios.post(apiURL.VC_Del, params);
+    return result
+}
+
+// list audio - VC
+export async function vcList(){
+    const result = await axios.get(apiURL.VC_List);
+    return result
+}
+// list audio - SAT
+export async function satList(){
+    const result = await axios.get(apiURL.SAT_List);
+    return result
+}
+
+// list audio - finetune
+export async function fineTuneList(params){
+    const result = await axios.post(apiURL.FineTune_List, params);
+    return result
+}
+
+// finetune one-click reset: get a fresh directory
+export async function fineTuneNewDir(){
+    const result = await axios.get(apiURL.FineTune_NewDir);
+    return result
+}
+
+// fetch audio data
+export async function vcDownload(params){
+    const result = await axios.post(apiURL.VC_Download, params);
+    return result
+}
+
+// fetch audio data as base64
+export async function vcDownloadBase64(params){
+    const result = await axios.post(apiURL.VC_Download_Base64, params);
+    return result
+}
+
+
+// clone with GE2E / ECAPA-TDNN
+export async function vcCloneG2P(params){
+    const result = await axios.post(apiURL.VC_CloneG2p, params);
+    return result
+}
+
+// clone with SAT
+export async function vcCloneSAT(params){
+    const result = await axios.post(apiURL.VC_CloneSAT, params);
+    return result
+}
+
+// clone - finetune training
+export async function vcCloneFineTune(params){
+    const result = await axios.post(apiURL.VC_CloneFineTune, params);
+    return result
+}
+
+// clone - finetune synthesis
+export async function vcCloneFineTuneSyn(params){
+    const result = await axios.post(apiURL.VC_CloneFineTuneSyn, params);
+    return result
+}
+
diff --git a/demos/speech_web/web_client/src/components/Content/Header/Header.vue b/demos/speech_web/web_client/src/components/Content/Header/Header.vue
index 8135a2bf..c20f3366 100644
--- a/demos/speech_web/web_client/src/components/Content/Header/Header.vue
+++ b/demos/speech_web/web_client/src/components/Content/Header/Header.vue
@@ -4,7 +4,7 @@
             飞桨-PaddleSpeech
-            PaddleSpeech is an open-source speech toolkit based on PaddlePaddle, covering a variety of key tasks in speech and audio; star it on GitHub to support the project
+            PaddleSpeech is an open-source speech toolkit based on PaddlePaddle, covering a variety of key tasks in speech and audio. It supports speech recognition, speech synthesis, speaker verification, audio classification, keyword spotting, speech translation and more, and won the NAACL 2022 Best Demo Award. If you like this demo, please star it on GitHub.
diff --git a/demos/speech_web/web_client/src/components/Content/Header/style.less b/demos/speech_web/web_client/src/components/Content/Header/style.less index 9d026137..cc97c741 100644 --- a/demos/speech_web/web_client/src/components/Content/Header/style.less +++ b/demos/speech_web/web_client/src/components/Content/Header/style.less @@ -43,6 +43,7 @@ margin-bottom: 40px; display: flex; align-items: center; + margin-top: 40px; }; .speech_header_link { display: block; diff --git a/demos/speech_web/web_client/src/components/Experience.vue b/demos/speech_web/web_client/src/components/Experience.vue index 5620d6af..4f32faf9 100644 --- a/demos/speech_web/web_client/src/components/Experience.vue +++ b/demos/speech_web/web_client/src/components/Experience.vue @@ -6,6 +6,10 @@ import TTST from './SubMenu/TTS/TTST.vue' import VPRT from './SubMenu/VPR/VPRT.vue' import IET from './SubMenu/IE/IET.vue' +import VoiceCloneT from './SubMenu/VoiceClone/VoiceClone.vue' +import ENIRE_SATT from './SubMenu/ENIRE_SAT/ENIRE_SAT.vue' +import FineTuneT from './SubMenu/FineTune/FineTune.vue' +