From 2da51752d2afa04ecc94346a5cd2b163bb0dcce9 Mon Sep 17 00:00:00 2001 From: Echo-Nie <157974576+Echo-Nie@users.noreply.github.com> Date: Wed, 16 Apr 2025 15:41:30 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90PaddleSpeech=20No.5=E3=80=91=20(#4042)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * run.sh修改:为 synthesize 和 synthesize_e2e 添加 --stage 参数控制 vocoder 模型选择,README.md修改:补充 stage 参数说明,明确 vocoder 选择逻辑 * 添加run.sh中stage参数相关的注释 * HiFiGAN改为MultiBand MelGAN * cmsc文件改回原位(No.15不修改),这里只对No.6做修改 * fix Speech No.5 * update README.md * fix the README.md --- examples/aishell3_vctk/ernie_sat/README.md | 6 ++++-- examples/aishell3_vctk/ernie_sat/run.sh | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/aishell3_vctk/ernie_sat/README.md b/examples/aishell3_vctk/ernie_sat/README.md index 57dcd472f..e73291add 100644 --- a/examples/aishell3_vctk/ernie_sat/README.md +++ b/examples/aishell3_vctk/ernie_sat/README.md @@ -15,7 +15,7 @@ In ERNIE-SAT, we propose two innovations: Download all datasets and extract it to `~/datasets`: - The aishell3 dataset is in the directory `~/datasets/data_aishell3` - The vctk dataset is in the directory `~/datasets/VCTK-Corpus-0.92` - + ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for the fastspeech2 training. You can download from here: @@ -97,6 +97,8 @@ hifigan_aishell3_ckpt_0.2.0 ```bash CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` +synthesize, vocoder is `hifigan` + ## Speech Synthesis and Speech Editing ### Prepare @@ -150,7 +152,7 @@ You can check the text of downloaded wavs in `source/README.md`. ```bash ./run.sh --stage 3 --stop-stage 3 --gpus 0 ``` -`stage 3` of `run.sh` calls `local/synthesize_e2e.sh`. +`stage 3` of `run.sh` calls `local/synthesize_e2e.sh`. 
`synthesize_e2e.sh` is a script for end-to-end speech synthesis, supporting cross-language speech synthesis tasks, including English-to-Chinese (en → zh) and Chinese-to-English (zh → en). You can modify `--wav_path`, `--old_str` and `--new_str` yourself, `--old_str` should be the text corresponding to the audio of `--wav_path`, `--new_str` should be designed according to `--task_name`, `--source_lang` and `--target_lang` should be different in this example. ## Pretrained Model diff --git a/examples/aishell3_vctk/ernie_sat/run.sh b/examples/aishell3_vctk/ernie_sat/run.sh index 8cd9d8d1b..d29f0b6e8 100755 --- a/examples/aishell3_vctk/ernie_sat/run.sh +++ b/examples/aishell3_vctk/ernie_sat/run.sh @@ -27,10 +27,11 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # synthesize, vocoder is pwgan + # synthesize, vocoder is hifigan CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize_e2e, default speech synthesis from Chinese to English, use --stage 1 to switch from English to Chinese + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi