From 9e25663b95cc470da705b33ca9faf29fd97b015f Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Tue, 13 May 2025 09:20:54 +0000 Subject: [PATCH] fix --- examples/aishell3/tts3/README.md | 8 ++++---- examples/aishell3/tts3/local/synthesize.sh | 4 ++-- examples/aishell3/tts3/local/synthesize_e2e.sh | 4 ++-- examples/aishell3/tts3/run.sh | 8 ++++---- examples/aishell3_vctk/ernie_sat/run.sh | 4 ++-- examples/canton/tts3/local/synthesize_e2e.sh | 4 ++-- examples/canton/tts3/run.sh | 8 ++++---- examples/csmsc/tts2/README.md | 8 ++++---- examples/csmsc/tts2/local/synthesize.sh | 4 ++-- examples/csmsc/tts2/local/synthesize_e2e.sh | 4 ++-- examples/csmsc/tts2/run.sh | 12 ++++++------ examples/csmsc/tts3/local/synthesize.sh | 1 + examples/csmsc/tts3_rhy/local/synthesize_e2e.sh | 4 ++-- examples/csmsc/tts3_rhy/run.sh | 12 ++++++------ examples/ljspeech/tts3/README.md | 8 ++++---- examples/ljspeech/tts3/local/synthesize.sh | 4 ++-- examples/ljspeech/tts3/local/synthesize_e2e.sh | 4 ++-- examples/ljspeech/tts3/run.sh | 8 ++++---- examples/opencpop/svs1/README.md | 4 ++-- examples/opencpop/svs1/README_cn.md | 4 ++-- examples/opencpop/svs1/local/synthesize_e2e.sh | 4 ++-- examples/opencpop/svs1/run.sh | 4 ++-- examples/vctk/ernie_sat/README.md | 4 +--- examples/vctk/ernie_sat/run.sh | 6 +++--- examples/vctk/tts3/README.md | 8 ++++---- examples/vctk/tts3/local/synthesize.sh | 4 ++-- examples/vctk/tts3/local/synthesize_e2e.sh | 4 ++-- examples/vctk/tts3/run.sh | 8 ++++---- 28 files changed, 79 insertions(+), 80 deletions(-) diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index ee501fe2d..32d99ce47 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -109,9 +109,9 @@ pwg_aishell3_ckpt_0.5 ``` `./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 ``` -`--stage` controls the vocoder model during synthesis, which can be `0` or `1`, use `pwgan` or `hifigan` model as vocoder. +The last number controls the vocoder model during synthesis, which can be `0` or `1`, use `pwgan` or `hifigan` model as vocoder. ```text usage: synthesize.py [-h] [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] @@ -158,9 +158,9 @@ optional arguments: ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file. ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 ``` -`--stage` controls the vocoder model during synthesis, which can be `0` or `1`, use `pwgan` or `hifigan` model as vocoder. +The last number controls the vocoder model during synthesis, which can be `0` or `1`, use `pwgan` or `hifigan` model as vocoder. ```text usage: synthesize_e2e.py [-h] [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] diff --git a/examples/aishell3/tts3/local/synthesize.sh b/examples/aishell3/tts3/local/synthesize.sh index 9134e0426..aec9af03b 100755 --- a/examples/aishell3/tts3/local/synthesize.sh +++ b/examples/aishell3/tts3/local/synthesize.sh @@ -4,8 +4,8 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -stage=0 -stop_stage=0 +stage=${4:-0} +stop_stage=${4:-0} # pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then diff --git a/examples/aishell3/tts3/local/synthesize_e2e.sh b/examples/aishell3/tts3/local/synthesize_e2e.sh index 2cc22ede2..807714c9a 100755 --- a/examples/aishell3/tts3/local/synthesize_e2e.sh +++ b/examples/aishell3/tts3/local/synthesize_e2e.sh @@ -4,8 +4,8 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -stage=0 -stop_stage=0 +stage=${4:-0} +stop_stage=${4:-0} # pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then diff --git a/examples/aishell3/tts3/run.sh b/examples/aishell3/tts3/run.sh index 3fd5d73c6..265f9054e 100755 --- a/examples/aishell3/tts3/run.sh +++ b/examples/aishell3/tts3/run.sh @@ -27,13 +27,13 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # synthesize, vocoder is pwgan by default stage 0, stage 1 will use hifigan as vocoder - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize, vocoder is pwgan by default 0, use 1 will use hifigan as vocoder + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 || exit -1 fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # synthesize_e2e, vocoder is pwgan by default stage 0, stage 1 will use hifigan as vocoder - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize_e2e, vocoder is pwgan by default 0, use 1 will use hifigan as vocoder + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/aishell3_vctk/ernie_sat/run.sh b/examples/aishell3_vctk/ernie_sat/run.sh index d29f0b6e8..3aedc9c61 100755 --- a/examples/aishell3_vctk/ernie_sat/run.sh +++ b/examples/aishell3_vctk/ernie_sat/run.sh @@ -32,6 +32,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # synthesize_e2e, default speech synthesis from Chinese to English, use stage1 to switch from English to Chinese - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize_e2e, run both speech synthesis from Chinese to English, and English to Chinese + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi diff --git a/examples/canton/tts3/local/synthesize_e2e.sh b/examples/canton/tts3/local/synthesize_e2e.sh index 38b7e1af0..5d21aa9f0 100755 --- a/examples/canton/tts3/local/synthesize_e2e.sh +++ b/examples/canton/tts3/local/synthesize_e2e.sh @@ -4,8 +4,8 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -stage=0 -stop_stage=0 +stage=${4:-0} +stop_stage=${4:-0} # pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then diff --git a/examples/canton/tts3/run.sh b/examples/canton/tts3/run.sh index 0e1f52a1c..297091a7c 100755 --- a/examples/canton/tts3/run.sh +++ b/examples/canton/tts3/run.sh @@ -28,13 +28,13 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # synthesize, vocoder is pwgan by default stage 0, stage 1 will use hifigan as vocoder - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize, vocoder is pwgan by default 0, use 1 will use hifigan as vocoder + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 || exit -1 fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # synthesize_e2e, vocoder is pwgan by default stage 0, stage 1 will use hifigan as vocoder - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize_e2e, vocoder is pwgan by default 0, use 1 will use hifigan as vocoder + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 3c6e7d96c..1b40ded27 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -116,9 +116,9 @@ pwg_baker_ckpt_0.4 ``` `./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 ``` -`--stage` controls the vocoder model during synthesis, which can use stage `0-4` to select the vocoder to use {`pwgan`, `multi band melgan`, `style melgan`, `hifigan`, `wavernn`} +The last number controls the vocoder model during synthesis, which can use `0-4` to select the vocoder in {`pwgan`, `multi band melgan`, `style melgan`, `hifigan`, `wavernn`} ```text usage: synthesize.py [-h] @@ -166,9 +166,9 @@ optional arguments: ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file. ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 ``` -`--stage` controls the vocoder model during synthesis, which can use stage `0,1,3,4` to select the vocoder to use {`pwgan`, `multi band melgan`, `hifigan`, `wavernn`} +The last number controls the vocoder model during synthesis, which can use `0,1,3,4` to select the vocoder in {`pwgan`, `multi band melgan`, `hifigan`, `wavernn`} ```text usage: synthesize_e2e.py [-h] diff --git a/examples/csmsc/tts2/local/synthesize.sh b/examples/csmsc/tts2/local/synthesize.sh index b8982a16d..2489942bd 100755 --- a/examples/csmsc/tts2/local/synthesize.sh +++ b/examples/csmsc/tts2/local/synthesize.sh @@ -3,8 +3,8 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -stage=0 -stop_stage=0 +stage=${4:-0} +stop_stage=${4:-0} # pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh index 2b2787295..ec994644a 100755 --- a/examples/csmsc/tts2/local/synthesize_e2e.sh +++ b/examples/csmsc/tts2/local/synthesize_e2e.sh @@ -4,8 +4,8 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -stage=0 -stop_stage=0 +stage=${4:-0} +stop_stage=${4:-0} # pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then diff --git a/examples/csmsc/tts2/run.sh b/examples/csmsc/tts2/run.sh index 6f62bc95b..b19e7bbd2 100755 --- a/examples/csmsc/tts2/run.sh +++ b/examples/csmsc/tts2/run.sh @@ -27,15 +27,15 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # synthesize, vocoder is pwgan by default stage 0 - # use stage 1-4 to select the vocoder to use {multi band melgan, style melgan, hifigan, wavernn} - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize, vocoder is pwgan by default 0 + # use 1-4 to select the vocoder to use {multi band melgan, style melgan, hifigan, wavernn} + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 || exit -1 fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # synthesize_e2e, vocoder is pwgan by default stage 0 - # use stage 1,3,4 to select the vocoder to use {multi band melgan, hifigan, wavernn} - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize_e2e, vocoder is pwgan by default 0 + # use 1,3,4 to select the vocoder to use {multi band melgan, hifigan, wavernn} + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/csmsc/tts3/local/synthesize.sh b/examples/csmsc/tts3/local/synthesize.sh index dc6cfdd54..1d1e7c1d4 100755 --- a/examples/csmsc/tts3/local/synthesize.sh +++ b/examples/csmsc/tts3/local/synthesize.sh @@ -3,6 +3,7 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 + stage=${4:-0} stop_stage=${4:-0} diff --git a/examples/csmsc/tts3_rhy/local/synthesize_e2e.sh b/examples/csmsc/tts3_rhy/local/synthesize_e2e.sh index bf7229e13..39ced6985 100755 --- a/examples/csmsc/tts3_rhy/local/synthesize_e2e.sh +++ b/examples/csmsc/tts3_rhy/local/synthesize_e2e.sh @@ -4,8 +4,8 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -stage=0 -stop_stage=0 +stage=${4:-0} +stop_stage=${4:-0} # pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then diff --git a/examples/csmsc/tts3_rhy/run.sh b/examples/csmsc/tts3_rhy/run.sh index 294ceded5..49d7aeeae 100755 --- a/examples/csmsc/tts3_rhy/run.sh +++ b/examples/csmsc/tts3_rhy/run.sh @@ -28,13 +28,13 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # synthesize, vocoder is pwgan by default stage 0 - # use stage 1-4 to select the vocoder to use {multi band melgan, style melgan, hifigan, wavernn} - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize, vocoder is pwgan by default 0 + # use 1-4 to select the vocoder in {multi band melgan, style melgan, hifigan, wavernn} + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 || exit -1 fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # synthesize_e2e, vocoder is pwgan by default stage 0 - # use stage 1,3,4 to select the vocoder to use {multi band melgan, hifigan, wavernn} - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize_e2e, vocoder is pwgan by default 0 + # use 1,3,4 to select the vocoder in {multi band melgan, hifigan, wavernn} + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 || exit -1 fi diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index 12bd73777..90133e307 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -105,9 +105,9 @@ pwg_ljspeech_ckpt_0.5 ``` `./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 ``` -`--stage` controls the vocoder model during synthesis, which can be `0` or `1`, use `pwgan` or `hifigan` model as vocoder. +The last number controls the vocoder model during synthesis, which can be `0` or `1`, use `pwgan` or `hifigan` model as vocoder. ```text usage: synthesize.py [-h] @@ -155,9 +155,9 @@ optional arguments: ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file. ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 ``` -`--stage` controls the vocoder model during synthesis, which can be `0` or `1`, use `pwgan` or `hifigan` model as vocoder. +The last number controls the vocoder model during synthesis, which can be `0` or `1`, use `pwgan` or `hifigan` model as vocoder. ```text usage: synthesize_e2e.py [-h] diff --git a/examples/ljspeech/tts3/local/synthesize.sh b/examples/ljspeech/tts3/local/synthesize.sh index 0733e96fa..32e9affbf 100755 --- a/examples/ljspeech/tts3/local/synthesize.sh +++ b/examples/ljspeech/tts3/local/synthesize.sh @@ -4,8 +4,8 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -stage=0 -stop_stage=0 +stage=${4:-0} +stop_stage=${4:-0} # pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then diff --git a/examples/ljspeech/tts3/local/synthesize_e2e.sh b/examples/ljspeech/tts3/local/synthesize_e2e.sh index 3f2340808..da44c99df 100755 --- a/examples/ljspeech/tts3/local/synthesize_e2e.sh +++ b/examples/ljspeech/tts3/local/synthesize_e2e.sh @@ -4,8 +4,8 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -stage=0 -stop_stage=0 +stage=${4:-0} +stop_stage=${4:-0} # pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then diff --git a/examples/ljspeech/tts3/run.sh b/examples/ljspeech/tts3/run.sh index b02126e6e..5a089ae9b 100755 --- a/examples/ljspeech/tts3/run.sh +++ b/examples/ljspeech/tts3/run.sh @@ -27,13 +27,13 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # synthesize, vocoder is pwgan by default stage 0, stage 1 will use hifigan as vocoder - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize, vocoder is pwgan by default 0, use 1 will use hifigan as vocoder + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 || exit -1 fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # synthesize_e2e, vocoder is pwgan by default stage 0, stage 1 will use hifigan as vocoder - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize_e2e, vocoder is pwgan by default 0, use 1 will use hifigan as vocoder + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/opencpop/svs1/README.md b/examples/opencpop/svs1/README.md index 534a741fd..afc83e82b 100644 --- a/examples/opencpop/svs1/README.md +++ b/examples/opencpop/svs1/README.md @@ -172,9 +172,9 @@ optional arguments: `local/pinyin_to_phone.txt` comes from the readme of the opencpop dataset, indicating the mapping from pinyin to phonemes in opencpop. ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 ``` -`--stage` controls the vocoder model during synthesis, which can be `0` or `1`, use `pwgan` or `hifigan` model as vocoder. +The last number controls the vocoder model during synthesis, which can be `0` or `1`, use `pwgan` or `hifigan` model as vocoder. ```text usage: synthesize_e2e.py [-h] diff --git a/examples/opencpop/svs1/README_cn.md b/examples/opencpop/svs1/README_cn.md index e35967d71..35005aade 100644 --- a/examples/opencpop/svs1/README_cn.md +++ b/examples/opencpop/svs1/README_cn.md @@ -175,9 +175,9 @@ optional arguments: `local/pinyin_to_phone.txt`来源于opencpop数据集中的README,表示opencpop中拼音到音素的映射。 ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 ``` -`--stage` 用于选择合成时使用的声码器模型,取值为 `0` 或 `1`,分别对应使用 `pwgan` 或 `hifigan` 模型作为声码器。 +最后一位参数 `0` 用于选择合成时使用的声码器模型,取值为 `0` 或 `1`,分别对应使用 `pwgan` 或 `hifigan` 模型作为声码器。 ```text usage: synthesize_e2e.py [-h] diff --git a/examples/opencpop/svs1/local/synthesize_e2e.sh b/examples/opencpop/svs1/local/synthesize_e2e.sh index e8d0cc45a..f3e2dc9f4 100755 --- a/examples/opencpop/svs1/local/synthesize_e2e.sh +++ b/examples/opencpop/svs1/local/synthesize_e2e.sh @@ -4,8 +4,8 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -stage=0 -stop_stage=0 +stage=${4:-0} +stop_stage=${4:-0} # pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then diff --git a/examples/opencpop/svs1/run.sh b/examples/opencpop/svs1/run.sh index 6c6688b2f..9f780a3da 100755 --- a/examples/opencpop/svs1/run.sh +++ b/examples/opencpop/svs1/run.sh @@ -32,6 +32,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # synthesize_e2e, vocoder is pwgan by default, stage 1 will use hifigan as vocoder - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize_e2e, vocoder is pwgan by default, use 1 will use hifigan as vocoder + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 || exit -1 fi diff --git a/examples/vctk/ernie_sat/README.md b/examples/vctk/ernie_sat/README.md index 3fe99172b..8efd94a6b 100644 --- a/examples/vctk/ernie_sat/README.md +++ b/examples/vctk/ernie_sat/README.md @@ -85,9 +85,8 @@ hifigan_vctk_ckpt_0.2.0 ``` `./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` -`--stage` controls the vocoder model during synthesis, which can be `0` , use`hifigan` model as vocoder. ## Speech Synthesis and Speech Editing @@ -142,7 +141,6 @@ You can check the text of downloaded wavs in `source/README.md`. ```bash ./run.sh --stage 3 --stop-stage 3 --gpus 0 ``` -`stage 3` of `run.sh` calls `local/synthesize_e2e.sh`, `stage 0` of it is **Speech Synthesis** and `stage 1` of it is **Speech Editing**. You can modify `--wav_path`、`--old_str` and `--new_str` yourself, `--old_str` should be the text corresponding to the audio of `--wav_path`, `--new_str` should be designed according to `--task_name`, both `--source_lang` and `--target_lang` should be `en` for model trained with VCTK dataset. ## Pretrained Model diff --git a/examples/vctk/ernie_sat/run.sh b/examples/vctk/ernie_sat/run.sh index c9bdfde79..628557847 100755 --- a/examples/vctk/ernie_sat/run.sh +++ b/examples/vctk/ernie_sat/run.sh @@ -28,10 +28,10 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # synthesize, vocoder is hifigan by default - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # synthesize, task_name is speech synthesize by default stage 0, stage 1 will use speech edit as taskname - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize, run both speech synthesize and speech edit + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index a9d568ebf..72169c8f3 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -108,9 +108,9 @@ pwg_vctk_ckpt_0.1.1 ``` `./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 ``` -`--stage` controls the vocoder model during synthesis, which can be `0` or `1`, use `pwgan` or `hifigan` model as vocoder. +The last number controls the vocoder model during synthesis, which can be `0` or `1`, use `pwgan` or `hifigan` model as vocoder. ```text usage: synthesize.py [-h] @@ -158,9 +158,9 @@ optional arguments: ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file. ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 ``` -`--stage` controls the vocoder model during synthesis, which can be `0` or `1`, use `pwgan` or `hifigan` model as vocoder. +The last number controls the vocoder model during synthesis, which can be `0` or `1`, use `pwgan` or `hifigan` model as vocoder. ```text usage: synthesize_e2e.py [-h] diff --git a/examples/vctk/tts3/local/synthesize.sh b/examples/vctk/tts3/local/synthesize.sh index 87145959f..9c1302ace 100755 --- a/examples/vctk/tts3/local/synthesize.sh +++ b/examples/vctk/tts3/local/synthesize.sh @@ -4,8 +4,8 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -stage=0 -stop_stage=0 +stage=${4:-0} +stop_stage=${4:-0} # pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then diff --git a/examples/vctk/tts3/local/synthesize_e2e.sh b/examples/vctk/tts3/local/synthesize_e2e.sh index 971c83853..09506edcd 100755 --- a/examples/vctk/tts3/local/synthesize_e2e.sh +++ b/examples/vctk/tts3/local/synthesize_e2e.sh @@ -4,8 +4,8 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -stage=0 -stop_stage=0 +stage=${4:-0} +stop_stage=${4:-0} # pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then diff --git a/examples/vctk/tts3/run.sh b/examples/vctk/tts3/run.sh index 8ce3b707d..d37068c22 100755 --- a/examples/vctk/tts3/run.sh +++ b/examples/vctk/tts3/run.sh @@ -27,13 +27,13 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # synthesize, vocoder is pwgan by default stage 0, stage 1 will use hifigan as vocoder - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize, vocoder is pwgan by default 0, use 1 will use hifigan as vocoder + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 || exit -1 fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # synthesize_e2e, vocoder is pwgan by default 0, stage 1 will use hifigan as vocoder - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize_e2e, vocoder is pwgan by 0, use 1 will use hifigan as vocoder + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} 0 || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then