[TTS]soft link for shell in example, add skip_copy_wave in norm stage of G… (#2851)

soft link for shell in example, add skip_copy_wave in norm stage of GANVocoders to save disk
3 years ago · 2b01e40525
parent 140aed4b54
commit 2b01e40525
55 changed files with 90 additions and 979 deletions
--- a/examples/aishell3/tts3/path.sh
+++ b/examples/aishell3/tts3/path.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 MODEL=fastspeech2
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
--- a/examples/aishell3/tts3/path.sh
+++ b/examples/aishell3/tts3/path.sh
@ -0,0 +1 @@
 ../../csmsc/tts3/path.sh
--- a/examples/aishell3/vc0/path.sh
+++ b/examples/aishell3/vc0/path.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 MODEL=tacotron2
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
--- a/examples/aishell3/vc0/path.sh
+++ b/examples/aishell3/vc0/path.sh
@ -0,0 +1 @@
 ../../csmsc/tts0/path.sh
--- a/examples/aishell3/vc1/local/train.sh
+++ b/examples/aishell3/vc1/local/train.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=2 \
    --phones-dict=dump/phone_id_map.txt \
    --voice-cloning=True
--- a/examples/aishell3/vc1/local/train.sh
+++ b/examples/aishell3/vc1/local/train.sh
@ -0,0 +1 @@
 ../../vc0/local/train.sh
--- a/examples/aishell3/vc1/path.sh
+++ b/examples/aishell3/vc1/path.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 MODEL=fastspeech2
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
--- a/examples/aishell3/vc1/path.sh
+++ b/examples/aishell3/vc1/path.sh
@ -0,0 +1 @@
 ../../csmsc/tts3/path.sh
--- a/examples/aishell3/vc2/local/synthesize.sh
+++ b/examples/aishell3/vc2/local/synthesize.sh
@ -1,20 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
 python3 ${BIN_DIR}/../synthesize.py \
    --am=fastspeech2_aishell3 \
    --am_config=${config_path} \
    --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
    --am_stat=dump/train/speech_stats.npy \
    --voc=pwgan_aishell3 \
    --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
    --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
    --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
    --test_metadata=dump/test/norm/metadata.jsonl \
    --output_dir=${train_output_path}/test \
    --phones_dict=dump/phone_id_map.txt \
    --speaker_dict=dump/speaker_id_map.txt \
    --voice-cloning=True
--- a/examples/aishell3/vc2/local/synthesize.sh
+++ b/examples/aishell3/vc2/local/synthesize.sh
@ -0,0 +1 @@
 ../../vc1/local/synthesize.sh
--- a/examples/aishell3/vc2/local/train.sh
+++ b/examples/aishell3/vc2/local/train.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=2 \
    --phones-dict=dump/phone_id_map.txt \
    --voice-cloning=True
--- a/examples/aishell3/vc2/local/train.sh
+++ b/examples/aishell3/vc2/local/train.sh
@ -0,0 +1 @@
 ../../vc0/local/train.sh
--- a/examples/aishell3/vc2/path.sh
+++ b/examples/aishell3/vc2/path.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 MODEL=fastspeech2
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
--- a/examples/aishell3/vc2/path.sh
+++ b/examples/aishell3/vc2/path.sh
@ -0,0 +1 @@
 ../../csmsc/tts3/path.sh
--- a/examples/aishell3/voc1/local/preprocess.sh
+++ b/examples/aishell3/voc1/local/preprocess.sh
@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
        --skip-wav-copy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
        --skip-wav-copy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
        --skip-wav-copy
 fi
--- a/examples/aishell3/voc1/local/synthesize.sh
+++ b/examples/aishell3/voc1/local/synthesize.sh
@ -1,14 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../synthesize.py \
    --config=${config_path} \
    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=${train_output_path}/test \
    --generator-type=pwgan
--- a/examples/aishell3/voc1/local/synthesize.sh
+++ b/examples/aishell3/voc1/local/synthesize.sh
@ -0,0 +1 @@
 ../../../csmsc/voc1/local/synthesize.sh
--- a/examples/aishell3/voc1/local/train.sh
+++ b/examples/aishell3/voc1/local/train.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 FLAGS_cudnn_exhaustive_search=true \
 FLAGS_conv_workspace_size_limit=4000 \
 python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1
--- a/examples/aishell3/voc1/local/train.sh
+++ b/examples/aishell3/voc1/local/train.sh
@ -0,0 +1 @@
 ../../../csmsc/voc1/local/train.sh
--- a/examples/aishell3/voc1/path.sh
+++ b/examples/aishell3/voc1/path.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 MODEL=parallelwave_gan
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
--- a/examples/aishell3/voc1/path.sh
+++ b/examples/aishell3/voc1/path.sh
@ -0,0 +1 @@
 ../../csmsc/voc1/path.sh
--- a/examples/aishell3/voc5/local/preprocess.sh
+++ b/examples/aishell3/voc5/local/preprocess.sh
@ -1,55 +0,0 @@
 #!/bin/bash
 stage=0
 stop_stage=100
 config_path=$1
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./aishell3_alignment_tone \
        --output=durations.txt \
        --config=${config_path}
 fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/../preprocess.py \
        --rootdir=~/datasets/data_aishell3/ \
        --dataset=aishell3 \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --cut-sil=True \
        --num-cpu=20
 fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats(mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="feats"
 fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize, dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy
 fi
--- a/examples/aishell3/voc5/local/preprocess.sh
+++ b/examples/aishell3/voc5/local/preprocess.sh
@ -0,0 +1 @@
 ../../voc1/local/preprocess.sh
--- a/examples/aishell3/voc5/local/synthesize.sh
+++ b/examples/aishell3/voc5/local/synthesize.sh
@ -1,14 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../synthesize.py \
    --config=${config_path} \
    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=${train_output_path}/test \
    --generator-type=hifigan
--- a/examples/aishell3/voc5/local/synthesize.sh
+++ b/examples/aishell3/voc5/local/synthesize.sh
@ -0,0 +1 @@
 ../../../csmsc/voc5/local/synthesize.sh
--- a/examples/aishell3/voc5/local/train.sh
+++ b/examples/aishell3/voc5/local/train.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 FLAGS_cudnn_exhaustive_search=true \
 FLAGS_conv_workspace_size_limit=4000 \
 python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1
--- a/examples/aishell3/voc5/local/train.sh
+++ b/examples/aishell3/voc5/local/train.sh
@ -0,0 +1 @@
 ../../../csmsc/voc1/local/train.sh
--- a/examples/aishell3/voc5/path.sh
+++ b/examples/aishell3/voc5/path.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 MODEL=hifigan
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
--- a/examples/aishell3/voc5/path.sh
+++ b/examples/aishell3/voc5/path.sh
@ -0,0 +1 @@
 ../../csmsc/voc5/path.sh
--- a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh
+++ b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh
@ -1,25 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
 stage=0
 stop_stage=0
 # hifigan
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/synthesize.py \
        --erniesat_config=${config_path} \
        --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --erniesat_stat=dump/train/speech_stats.npy \
        --voc=hifigan_aishell3 \
        --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \
        --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt
 fi
--- a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh
+++ b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh
@ -0,0 +1 @@
 ../../../aishell3/ernie_sat/local/synthesize.sh
--- a/examples/aishell3_vctk/ernie_sat/local/train.sh
+++ b/examples/aishell3_vctk/ernie_sat/local/train.sh
@ -1,12 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=8 \
    --phones-dict=dump/phone_id_map.txt
--- a/examples/aishell3_vctk/ernie_sat/local/train.sh
+++ b/examples/aishell3_vctk/ernie_sat/local/train.sh
@ -0,0 +1 @@
 ../../../aishell3/ernie_sat/local/train.sh
--- a/examples/aishell3_vctk/ernie_sat/path.sh
+++ b/examples/aishell3_vctk/ernie_sat/path.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 MODEL=ernie_sat
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
--- a/examples/aishell3_vctk/ernie_sat/path.sh
+++ b/examples/aishell3_vctk/ernie_sat/path.sh
@ -0,0 +1 @@
 ../../aishell3/ernie_sat/path.sh
--- a/examples/csmsc/voc1/local/preprocess.sh
+++ b/examples/csmsc/voc1/local/preprocess.sh
@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
        --skip-wav-copy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
        --skip-wav-copy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
        --skip-wav-copy
 fi
--- a/examples/csmsc/voc3/finetune.sh
+++ b/examples/csmsc/voc3/finetune.sh
@ -1,64 +0,0 @@
 #!/bin/bash
 source path.sh
 gpus=0
 stage=0
 stop_stage=100
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \
        --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
        --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
        --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
        --dur-file=durations.txt \
        --output-dir=dump_finetune \
        --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \
        --dataset=baker \
        --rootdir=~/datasets/BZNSYP/
 fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${MAIN_ROOT}/utils/link_wav.py \
        --old-dump-dir=dump \
        --dump-dir=dump_finetune
 fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats(mean and std)
    echo "Get features' stats ..."
    cp dump/train/feats_stats.npy dump_finetune/train/
 fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize, dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/train/raw/metadata.jsonl \
        --dumpdir=dump_finetune/train/norm \
        --stats=dump_finetune/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/dev/raw/metadata.jsonl \
        --dumpdir=dump_finetune/dev/norm \
        --stats=dump_finetune/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/test/raw/metadata.jsonl \
        --dumpdir=dump_finetune/test/norm \
        --stats=dump_finetune/train/feats_stats.npy
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} \
    FLAGS_cudnn_exhaustive_search=true \
    FLAGS_conv_workspace_size_limit=4000 \
    python ${BIN_DIR}/train.py \
        --train-metadata=dump_finetune/train/norm/metadata.jsonl \
        --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \
        --config=conf/finetune.yaml \
        --output-dir=exp/finetune \
        --ngpu=1
 fi 
--- a/examples/csmsc/voc3/finetune.sh
+++ b/examples/csmsc/voc3/finetune.sh
@ -0,0 +1 @@
 ../voc5/finetune.sh
--- a/examples/csmsc/voc3/local/preprocess.sh
+++ b/examples/csmsc/voc3/local/preprocess.sh
@ -1,55 +0,0 @@
 #!/bin/bash
 stage=0
 stop_stage=100
 config_path=$1
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./baker_alignment_tone \
        --output=durations.txt \
        --config=${config_path}
 fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/../preprocess.py \
        --rootdir=~/datasets/BZNSYP/ \
        --dataset=baker \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --cut-sil=True \
        --num-cpu=20
 fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats(mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="feats"
 fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize, dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy
 fi
--- a/examples/csmsc/voc3/local/preprocess.sh
+++ b/examples/csmsc/voc3/local/preprocess.sh
@ -0,0 +1 @@
 ../../voc1/local/preprocess.sh
--- a/examples/csmsc/voc3/local/train.sh
+++ b/examples/csmsc/voc3/local/train.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 FLAGS_cudnn_exhaustive_search=true \
 FLAGS_conv_workspace_size_limit=4000 \
 python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1
--- a/examples/csmsc/voc3/local/train.sh
+++ b/examples/csmsc/voc3/local/train.sh
@ -0,0 +1 @@
 ../../voc1/local/train.sh
--- a/examples/csmsc/voc4/local/preprocess.sh
+++ b/examples/csmsc/voc4/local/preprocess.sh
@ -1,55 +0,0 @@
 #!/bin/bash
 stage=0
 stop_stage=100
 config_path=$1
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./baker_alignment_tone \
        --output=durations.txt \
        --config=${config_path}
 fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/../preprocess.py \
        --rootdir=~/datasets/BZNSYP/ \
        --dataset=baker \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --cut-sil=True \
        --num-cpu=20
 fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats(mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="feats"
 fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize, dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy
 fi
--- a/examples/csmsc/voc4/local/preprocess.sh
+++ b/examples/csmsc/voc4/local/preprocess.sh
@ -0,0 +1 @@
 ../../voc1/local/preprocess.sh
--- a/examples/csmsc/voc4/local/train.sh
+++ b/examples/csmsc/voc4/local/train.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 FLAGS_cudnn_exhaustive_search=true \
 FLAGS_conv_workspace_size_limit=4000 \
 python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1
--- a/examples/csmsc/voc4/local/train.sh
+++ b/examples/csmsc/voc4/local/train.sh
@ -0,0 +1 @@
 ../../voc1/local/train.sh
--- a/examples/csmsc/voc5/finetune.sh
+++ b/examples/csmsc/voc5/finetune.sh
@ -39,16 +39,19 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/train/raw/metadata.jsonl \
        --dumpdir=dump_finetune/train/norm \
-        --stats=dump_finetune/train/feats_stats.npy
+        --stats=dump_finetune/train/feats_stats.npy \
        --skip-wav-copy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/dev/raw/metadata.jsonl \
        --dumpdir=dump_finetune/dev/norm \
-        --stats=dump_finetune/train/feats_stats.npy
+        --stats=dump_finetune/train/feats_stats.npy \
        --skip-wav-copy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/test/raw/metadata.jsonl \
        --dumpdir=dump_finetune/test/norm \
-        --stats=dump_finetune/train/feats_stats.npy
+        --stats=dump_finetune/train/feats_stats.npy \
        --skip-wav-copy
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--- a/examples/csmsc/voc5/local/preprocess.sh
+++ b/examples/csmsc/voc5/local/preprocess.sh
@ -1,55 +0,0 @@
 #!/bin/bash
 stage=0
 stop_stage=100
 config_path=$1
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./baker_alignment_tone \
        --output=durations.txt \
        --config=${config_path}
 fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/../preprocess.py \
        --rootdir=~/datasets/BZNSYP/ \
        --dataset=baker \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --cut-sil=True \
        --num-cpu=20
 fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats(mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="feats"
 fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize, dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy
 fi
--- a/examples/csmsc/voc5/local/preprocess.sh
+++ b/examples/csmsc/voc5/local/preprocess.sh
@ -0,0 +1 @@
 ../../voc1/local/preprocess.sh
--- a/examples/csmsc/voc5/local/train.sh
+++ b/examples/csmsc/voc5/local/train.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 FLAGS_cudnn_exhaustive_search=true \
 FLAGS_conv_workspace_size_limit=4000 \
 python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1
--- a/examples/csmsc/voc5/local/train.sh
+++ b/examples/csmsc/voc5/local/train.sh
@ -0,0 +1 @@
 ../../voc1/local/train.sh
--- a/examples/csmsc/voc6/local/preprocess.sh
+++ b/examples/csmsc/voc6/local/preprocess.sh
@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
        --skip-wav-copy
    python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
        --skip-wav-copy
    python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
        --skip-wav-copy
 fi
--- a/examples/csmsc/voc6/local/train.sh
+++ b/examples/csmsc/voc6/local/train.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 FLAGS_cudnn_exhaustive_search=true \
 FLAGS_conv_workspace_size_limit=4000 \
 python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1
--- a/examples/csmsc/voc6/local/train.sh
+++ b/examples/csmsc/voc6/local/train.sh
@ -0,0 +1 @@
 ../../voc1/local/train.sh
--- a/examples/ljspeech/tts0/local/train.sh
+++ b/examples/ljspeech/tts0/local/train.sh
@ -1,12 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1 \
    --phones-dict=dump/phone_id_map.txt
--- a/examples/ljspeech/tts0/local/train.sh
+++ b/examples/ljspeech/tts0/local/train.sh
@ -0,0 +1 @@
 ../../../csmsc/tts0/local/train.sh
--- a/examples/ljspeech/tts0/path.sh
+++ b/examples/ljspeech/tts0/path.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 MODEL=tacotron2
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
--- a/examples/ljspeech/tts0/path.sh
+++ b/examples/ljspeech/tts0/path.sh
@ -0,0 +1 @@
 ../../csmsc/tts0/path.sh
--- a/examples/ljspeech/tts3/local/train.sh
+++ b/examples/ljspeech/tts3/local/train.sh
@ -1,12 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1 \
    --phones-dict=dump/phone_id_map.txt
--- a/examples/ljspeech/tts3/local/train.sh
+++ b/examples/ljspeech/tts3/local/train.sh
@ -0,0 +1 @@
 ../../../csmsc/tts3/local/train.sh
--- a/examples/ljspeech/tts3/path.sh
+++ b/examples/ljspeech/tts3/path.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 MODEL=fastspeech2
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
--- a/examples/ljspeech/tts3/path.sh
+++ b/examples/ljspeech/tts3/path.sh
@ -0,0 +1 @@
 ../../csmsc/tts3/path.sh
--- a/examples/ljspeech/voc1/local/preprocess.sh
+++ b/examples/ljspeech/voc1/local/preprocess.sh
@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
        --skip-wav-copy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
        --skip-wav-copy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
        --skip-wav-copy
 fi
--- a/examples/ljspeech/voc1/local/synthesize.sh
+++ b/examples/ljspeech/voc1/local/synthesize.sh
@ -1,14 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../synthesize.py \
    --config=${config_path} \
    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=${train_output_path}/test \
    --generator-type=pwgan
--- a/examples/ljspeech/voc1/local/synthesize.sh
+++ b/examples/ljspeech/voc1/local/synthesize.sh
@ -0,0 +1 @@
 ../../../csmsc/voc1/local/synthesize.sh
--- a/examples/ljspeech/voc1/local/train.sh
+++ b/examples/ljspeech/voc1/local/train.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 FLAGS_cudnn_exhaustive_search=true \
 FLAGS_conv_workspace_size_limit=4000 \
 python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1
--- a/examples/ljspeech/voc1/local/train.sh
+++ b/examples/ljspeech/voc1/local/train.sh
@ -0,0 +1 @@
 ../../../csmsc/voc1/local/train.sh
--- a/examples/ljspeech/voc1/path.sh
+++ b/examples/ljspeech/voc1/path.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 MODEL=parallelwave_gan
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
--- a/examples/ljspeech/voc1/path.sh
+++ b/examples/ljspeech/voc1/path.sh
@ -0,0 +1 @@
 ../../csmsc/voc1/path.sh
--- a/examples/ljspeech/voc5/local/preprocess.sh
+++ b/examples/ljspeech/voc5/local/preprocess.sh
@ -1,55 +0,0 @@
 #!/bin/bash
 stage=0
 stop_stage=100
 config_path=$1
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./ljspeech_alignment \
        --output=durations.txt \
        --config=${config_path}
 fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/../preprocess.py \
        --rootdir=~/datasets/LJSpeech-1.1/ \
        --dataset=ljspeech \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --cut-sil=True \
        --num-cpu=20
 fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats(mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="feats"
 fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize, dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy
 fi
--- a/examples/ljspeech/voc5/local/preprocess.sh
+++ b/examples/ljspeech/voc5/local/preprocess.sh
@ -0,0 +1 @@
 ../../voc1/local/preprocess.sh
--- a/examples/ljspeech/voc5/local/synthesize.sh
+++ b/examples/ljspeech/voc5/local/synthesize.sh
@ -1,14 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../synthesize.py \
    --config=${config_path} \
    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=${train_output_path}/test \
    --generator-type=hifigan
--- a/examples/ljspeech/voc5/local/synthesize.sh
+++ b/examples/ljspeech/voc5/local/synthesize.sh
@ -0,0 +1 @@
 ../../../csmsc/voc5/local/synthesize.sh
--- a/examples/ljspeech/voc5/local/train.sh
+++ b/examples/ljspeech/voc5/local/train.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 FLAGS_cudnn_exhaustive_search=true \
 FLAGS_conv_workspace_size_limit=4000 \
 python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1
--- a/examples/ljspeech/voc5/local/train.sh
+++ b/examples/ljspeech/voc5/local/train.sh
@ -0,0 +1 @@
 ../../../csmsc/voc1/local/train.sh
--- a/examples/ljspeech/voc5/path.sh
+++ b/examples/ljspeech/voc5/path.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 MODEL=hifigan
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
--- a/examples/ljspeech/voc5/path.sh
+++ b/examples/ljspeech/voc5/path.sh
@ -0,0 +1 @@
 ../../csmsc/voc5/path.sh
--- a/examples/vctk/ernie_sat/local/train.sh
+++ b/examples/vctk/ernie_sat/local/train.sh
@ -1,12 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=8 \
    --phones-dict=dump/phone_id_map.txt
--- a/examples/vctk/ernie_sat/local/train.sh
+++ b/examples/vctk/ernie_sat/local/train.sh
@ -0,0 +1 @@
 ../../../aishell3/ernie_sat/local/train.sh
--- a/examples/vctk/ernie_sat/path.sh
+++ b/examples/vctk/ernie_sat/path.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 MODEL=ernie_sat
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
--- a/examples/vctk/ernie_sat/path.sh
+++ b/examples/vctk/ernie_sat/path.sh
@ -0,0 +1 @@
 ../../aishell3/ernie_sat/path.sh
--- a/examples/vctk/tts3/local/train.sh
+++ b/examples/vctk/tts3/local/train.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1 \
    --phones-dict=dump/phone_id_map.txt \
    --speaker-dict=dump/speaker_id_map.txt
--- a/examples/vctk/tts3/local/train.sh
+++ b/examples/vctk/tts3/local/train.sh
@ -0,0 +1 @@
 ../../../aishell3/tts3/local/train.sh
--- a/examples/vctk/tts3/path.sh
+++ b/examples/vctk/tts3/path.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 MODEL=fastspeech2
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
--- a/examples/vctk/tts3/path.sh
+++ b/examples/vctk/tts3/path.sh
@ -0,0 +1 @@
 ../../csmsc/tts3/path.sh
--- a/examples/vctk/voc1/local/preprocess.sh
+++ b/examples/vctk/voc1/local/preprocess.sh
@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
        --skip-wav-copy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
        --skip-wav-copy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
        --skip-wav-copy
 fi
--- a/examples/vctk/voc1/local/synthesize.sh
+++ b/examples/vctk/voc1/local/synthesize.sh
@ -1,14 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../synthesize.py \
    --config=${config_path} \
    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=${train_output_path}/test \
    --generator-type=pwgan
--- a/examples/vctk/voc1/local/synthesize.sh
+++ b/examples/vctk/voc1/local/synthesize.sh
@ -0,0 +1 @@
 ../../../csmsc/voc1/local/synthesize.sh
--- a/examples/vctk/voc1/local/train.sh
+++ b/examples/vctk/voc1/local/train.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 FLAGS_cudnn_exhaustive_search=true \
 FLAGS_conv_workspace_size_limit=4000 \
 python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1
--- a/examples/vctk/voc1/local/train.sh
+++ b/examples/vctk/voc1/local/train.sh
@ -0,0 +1 @@
 ../../../csmsc/voc1/local/train.sh
--- a/examples/vctk/voc1/path.sh
+++ b/examples/vctk/voc1/path.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 MODEL=parallelwave_gan
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
--- a/examples/vctk/voc1/path.sh
+++ b/examples/vctk/voc1/path.sh
@ -0,0 +1 @@
 ../../csmsc/voc1/path.sh
--- a/examples/vctk/voc5/local/preprocess.sh
+++ b/examples/vctk/voc5/local/preprocess.sh
@ -1,55 +0,0 @@
 #!/bin/bash
 stage=0
 stop_stage=100
 config_path=$1
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./vctk_alignment \
        --output=durations.txt \
        --config=${config_path}
 fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/../preprocess.py \
        --rootdir=~/datasets/VCTK-Corpus-0.92/ \
        --dataset=vctk \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --cut-sil=True \
        --num-cpu=20
 fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats(mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="feats"
 fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize, dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy
 fi
--- a/examples/vctk/voc5/local/preprocess.sh
+++ b/examples/vctk/voc5/local/preprocess.sh
@ -0,0 +1 @@
 ../../voc1/local/preprocess.sh
--- a/examples/vctk/voc5/local/synthesize.sh
+++ b/examples/vctk/voc5/local/synthesize.sh
@ -1,14 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../synthesize.py \
    --config=${config_path} \
    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=${train_output_path}/test \
    --generator-type=hifigan
--- a/examples/vctk/voc5/local/synthesize.sh
+++ b/examples/vctk/voc5/local/synthesize.sh
@ -0,0 +1 @@
 ../../../csmsc/voc5/local/synthesize.sh
--- a/examples/vctk/voc5/local/train.sh
+++ b/examples/vctk/voc5/local/train.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 FLAGS_cudnn_exhaustive_search=true \
 FLAGS_conv_workspace_size_limit=4000 \
 python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1
--- a/examples/vctk/voc5/local/train.sh
+++ b/examples/vctk/voc5/local/train.sh
@ -0,0 +1 @@
 ../../../csmsc/voc1/local/train.sh
--- a/examples/vctk/voc5/path.sh
+++ b/examples/vctk/voc5/path.sh
@ -1,13 +0,0 @@
 #!/bin/bash
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 MODEL=hifigan
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
--- a/examples/vctk/voc5/path.sh
+++ b/examples/vctk/voc5/path.sh
@ -0,0 +1 @@
 ../../csmsc/voc5/path.sh
--- a/Show More
+++ b/Show More
		`@ -0,0 +1 @@`
							`../../../aishell3/ernie_sat/local/synthesize.sh`