update vc and tts mix

3 years ago · 63700756ec
parent b5f376e63b
commit 63700756ec
2 changed files with 101 additions and 4 deletions
--- a/demos/speech_web/README.md
+++ b/demos/speech_web/README.md
@ -6,12 +6,18 @@ PaddleSpeechDemo 是一个以 PaddleSpeech 的语音交互功能为主体开发
 主要功能：
 `main.py` 中包含功能
 + 语音聊天：PaddleSpeech 的语音识别能力+语音合成能力，对话部分基于 PaddleNLP 的闲聊功能
 + 声纹识别：PaddleSpeech 的声纹识别功能展示
 + 语音识别：支持【实时语音识别】，【端到端识别】，【音频文件识别】三种模式
 + 语音合成：支持【流式合成】与【端到端合成】两种方式
 + 语音指令：基于 PaddleSpeech 的语音识别能力与 PaddleNLP 的信息抽取，实现交通费的智能报销
 `vc.py` 中包含功能
 + 一句话合成：基于 GE2E 和 ECAPA-TDNN 模型的一句话合成方案，可以模仿输入的音频的音色进行合成任务
 + 小数据微调：基于小数据集的微调方案，内置用10句话标贝中文女声微调示例，你也可以通过一键重置，录制自己的声音，注意在安静环境下录制，效果会更好，你可以在[finetune](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/tts_finetune/tts3)中，使用自己的小数据集，训练音色
 + ERNIE-SAT：语言-语音跨模态大模型 ERNIE-SAT 可视化展示示例，支持个性化合成，跨语言语音合成（输入音频为中文则合成），语音编辑功能
 运行效果：
 ![效果](docs/效果展示.png)
@ -25,11 +31,59 @@ PaddleSpeechDemo 是一个以 PaddleSpeech 的语音交互功能为主体开发
 cd speech_server
 pip install -r requirements.txt
-# 下载 ie 模型，针对地点进行微调，效果更好，不下载的话会使用其它版本，效果没有这个好
+mkdir source
 cd source
 # 下载 tools 
 wget https://paddlespeech.bj.bcebos.com/demos/speech_web/tools.zip
 unzip tools.zip
 # 下载 wav
 wget https://paddlespeech.bj.bcebos.com/demos/speech_web/wav.zip
 unzip wav.zip
 # 下载 ie 模型，针对地点进行微调
 mkdir model
 cd model
 # 下载IE模型
 wget https://bj.bcebos.com/paddlenlp/applications/speech-cmd-analysis/finetune/model_state.pdparams
 # 下载 GE2E 相关模型
 wget https://bj.bcebos.com/paddlespeech/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip
 unzip ge2e_ckpt_0.3.zip
 wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip
 unzip pwg_aishell3_ckpt_0.5.zip
 wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip
 unzip fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip
 # 下载 SAT 相关模型
 # fastspeech2
 wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip
 wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip
 unzip fastspeech2_conformer_baker_ckpt_0.5.zip
 unzip fastspeech2_nosil_ljspeech_ckpt_0.5.zip
 # aishell3
 wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip
 unzip hifigan_aishell3_ckpt_0.2.0.zip
 wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_aishell3_ckpt_1.2.0.zip
 unzip erniesat_aishell3_ckpt_1.2.0.zip
 # vctk
 wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip
 unzip hifigan_vctk_ckpt_0.2.0.zip
 wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_vctk_ckpt_1.2.0.zip
 unzip erniesat_vctk_ckpt_1.2.0.zip
 # aishell3_vctk
 wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_aishell3_vctk_ckpt_1.2.0.zip
 unzip erniesat_aishell3_vctk_ckpt_1.2.0.zip
 # 下载 finetune 相关模型
 wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_1.1.0.zip
 unzip fastspeech2_aishell3_ckpt_1.1.0.zip
 ```
 ### 前端环境安装
@ -51,12 +105,35 @@ yarn install
 ### 开启后端服务
 #### `main.py`
 【语音聊天】【声纹识别】【语音识别】【语音合成】【语音指令】功能体验，可直接使用下面的代码
 ```
 cd speech_server
 # 默认8010端口
 python main.py --port 8010
 ```
 #### `vc.py`
 【一句话合成】【小数据微调】【ERNIE-SAT】体验都依赖于MFA，体验前先确保 MFA 可用，项目兼容 mfa v1 和 v2 ，source tools中已包含 v1.02版本编译好的工具，如果你是linux系统且mfa可使用，可以将`vc.py`中
 ```python
 sat_model = SAT(mfa_version='v2')
 ft_model = FineTune(mfa_version='v2')
 ```
 更改为
 ```python
 sat_model = SAT(mfa_version='v1')
 ft_model = FineTune(mfa_version='v1')
 ```
 如果你是其它的系统，可以使用 conda 安装 mfa v2 进行体验，安装请参考 [Montreal Forced Aligner](https://montreal-forced-aligner.readthedocs.io/en/latest/getting_started.html)，确保自己环境中 MFA 可用
 ```
 cd speech_server
 # 默认8010端口
 python vc.py --port 8010
 ```
 ### 开启前端服务
 ```
--- a/demos/speech_web/speech_server/main.py
+++ b/demos/speech_web/speech_server/main.py
@ -34,6 +34,7 @@ from starlette.websockets import WebSocketState as WebSocketState
 from paddlespeech.server.engine.asr.online.python.asr_engine import PaddleASRConnectionHanddler
 from paddlespeech.server.utils.audio_process import float2pcm
 from paddlespeech.cli.tts.infer import TTSExecutor
 # 解析配置
 parser = argparse.ArgumentParser(prog='PaddleSpeechDemo', add_help=True)
@ -55,7 +56,7 @@ asr_config = "conf/ws_conformer_wenetspeech_application_faster.yaml"
 asr_init_path = "source/demo/demo.wav"
 db_path = "source/db/vpr.sqlite"
 ie_model_path = "source/model"
-
+tts_model = TTSExecutor()
 # 路径配置
 UPLOAD_PATH = "source/vpr"
 WAV_PATH = "source/wav"
@ -72,6 +73,15 @@ manager = ConnectionManager()
 aumanager = AudioMannger(chatbot)
 aumanager.init()
 vpr = VPR(db_path, dim=192, top_k=5)
 # 初始化下载模型
 tts_model(
        text="今天天气准不错",
        output="test.wav",
        am='fastspeech2_mix',
        spk_id=174,
        voc='hifigan_csmsc',
        lang='mix',
    )
 # 服务配置
@ -330,7 +340,7 @@ async def ieOffline(nlp_base: NlpBase):
 ########################### TTS 服务 #################################
 #####################################################################
-
+# 端到端合成
@app.post("/tts/offline")
 async def text2speechOffline(tts_base: TtsBase):
    text = tts_base.text
@ -341,13 +351,23 @@ async def text2speechOffline(tts_base: TtsBase):
            datetime.datetime.now(), '%Y%m%d%H%M%S') + randName() + ".wav"
        out_file_path = os.path.join(WAV_PATH, now_name)
        # 保存为文件，再转成base64传输
-        chatbot.text2speech(text, outpath=out_file_path)
+        # chatbot.text2speech(text, outpath=out_file_path)
        # 使用中英混合CLI
        tts_model(
                text=text,
                output=out_file_path,
                am='fastspeech2_mix',
                spk_id=174,
                voc='hifigan_csmsc',
                lang='mix',
            ) 
        with open(out_file_path, "rb") as f:
            data_bin = f.read()
        base_str = base64.b64encode(data_bin)
        return SuccessRequest(result=base_str)
 # http流式TTS
@app.post("/tts/online")
 async def stream_tts(request_body: TtsBase):