Merge pull request #1511 from yt605155624/pre_fix_for_streaming

[TTS] add RTF for synthesize, add more vocoders for synthesize.sh
TianYuan 3 years ago committed by GitHub
commit 175c39b4a4

@@ -3,18 +3,98 @@
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
+stage=0
+stop_stage=0
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
-    --am=tacotron2_csmsc \
-    --am_config=${config_path} \
-    --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
-    --am_stat=dump/train/speech_stats.npy \
-    --voc=pwgan_csmsc \
-    --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
-    --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
-    --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
-    --test_metadata=dump/test/norm/metadata.jsonl \
-    --output_dir=${train_output_path}/test \
-    --phones_dict=dump/phone_id_map.txt
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=tacotron2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=pwgan_csmsc \
+        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=tacotron2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=mb_melgan_csmsc \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
+# style melgan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=tacotron2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=style_melgan_csmsc \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "in hifigan syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=tacotron2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_csmsc \
+        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "in wavernn syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=tacotron2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=wavernn_csmsc \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
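Each vocoder now runs in its own numbered stage, so one script can synthesize with any contiguous subset of vocoders by setting stage and stop_stage. The bash guard [ ${stage} -le s ] && [ ${stop_stage} -ge s ] runs stage s exactly when stage <= s <= stop_stage; a minimal Python sketch of that selection rule (the stage table and function name are illustrative, not from the repo):

# Illustrative sketch of the stage gating used by synthesize.sh:
# stage s runs iff stage <= s <= stop_stage.
VOCODER_STAGES = {
    0: "pwgan_csmsc",
    1: "mb_melgan_csmsc",
    2: "style_melgan_csmsc",
    3: "hifigan_csmsc",
    4: "wavernn_csmsc",
}

def selected_vocoders(stage, stop_stage):
    """Return the vocoders the script would synthesize with."""
    return [voc for s, voc in sorted(VOCODER_STAGES.items())
            if stage <= s <= stop_stage]

print(selected_vocoders(0, 0))  # ['pwgan_csmsc'] -- the default
print(selected_vocoders(1, 3))  # mb_melgan, style_melgan, hifigan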

@@ -8,6 +8,7 @@ stage=0
 stop_stage=0
 
 # TODO: the dynamic-to-static result of tacotron2 is not as loud as the static-graph one; some function in decode is probably misaligned between dynamic and static graphs
+# pwgan
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     FLAGS_allocator_strategy=naive_best_fit \
     FLAGS_fraction_of_gpu_memory_to_use=0.01 \
@@ -39,14 +40,14 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
         --am_stat=dump/train/speech_stats.npy \
         --voc=mb_melgan_csmsc \
-        --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \
-        --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz \
-        --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --inference_dir=${train_output_path}/inference \
-        --phones_dict=dump/phone_id_map.txt
+        --phones_dict=dump/phone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
 fi
 # the pretrained models haven't release now
@@ -88,8 +89,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --inference_dir=${train_output_path}/inference \
-        --phones_dict=dump/phone_id_map.txt
+        --phones_dict=dump/phone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
 fi
 # wavernn
@@ -111,4 +112,4 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --inference_dir=${train_output_path}/inference
 fi

@@ -1,20 +1,105 @@
 #!/bin/bash
 
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
+stage=0
+stop_stage=0
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
-    --am=speedyspeech_csmsc \
-    --am_config=${config_path} \
-    --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
-    --am_stat=dump/train/feats_stats.npy \
-    --voc=pwgan_csmsc \
-    --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
-    --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
-    --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
-    --test_metadata=dump/test/norm/metadata.jsonl \
-    --output_dir=${train_output_path}/test \
-    --phones_dict=dump/phone_id_map.txt \
-    --tones_dict=dump/tone_id_map.txt
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=pwgan_csmsc \
+        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt
+fi
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=mb_melgan_csmsc \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt
+fi
+# style melgan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=style_melgan_csmsc \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt
+fi
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "in hifigan syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_csmsc \
+        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt
+fi
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "in wavernn syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=wavernn_csmsc \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --tones_dict=dump/tone_id_map.txt \
+        --phones_dict=dump/phone_id_map.txt
+fi

@@ -7,6 +7,7 @@ ckpt_name=$3
 stage=0
 stop_stage=0
 
+# pwgan
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     FLAGS_allocator_strategy=naive_best_fit \
     FLAGS_fraction_of_gpu_memory_to_use=0.01 \
@@ -22,9 +23,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --inference_dir=${train_output_path}/inference \
         --phones_dict=dump/phone_id_map.txt \
-        --tones_dict=dump/tone_id_map.txt
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
 fi
 # for more GAN Vocoders
@@ -44,9 +45,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --inference_dir=${train_output_path}/inference \
         --phones_dict=dump/phone_id_map.txt \
-        --tones_dict=dump/tone_id_map.txt
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
 fi
 # the pretrained models haven't release now
@@ -88,12 +89,11 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --inference_dir=${train_output_path}/inference \
         --phones_dict=dump/phone_id_map.txt \
-        --tones_dict=dump/tone_id_map.txt
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
 fi
 # wavernn
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     echo "in wavernn syn_e2e"

@@ -3,18 +3,98 @@
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
+stage=0
+stop_stage=0
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
-    --am=fastspeech2_csmsc \
-    --am_config=${config_path} \
-    --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
-    --am_stat=dump/train/speech_stats.npy \
-    --voc=pwgan_csmsc \
-    --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
-    --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
-    --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
-    --test_metadata=dump/test/norm/metadata.jsonl \
-    --output_dir=${train_output_path}/test \
-    --phones_dict=dump/phone_id_map.txt
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=fastspeech2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=pwgan_csmsc \
+        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=fastspeech2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=mb_melgan_csmsc \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
+# style melgan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=fastspeech2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=style_melgan_csmsc \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "in hifigan syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=fastspeech2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_csmsc \
+        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "in wavernn syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=fastspeech2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=wavernn_csmsc \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi

@@ -7,6 +7,7 @@ ckpt_name=$3
 stage=0
 stop_stage=0
 
+# pwgan
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     FLAGS_allocator_strategy=naive_best_fit \
     FLAGS_fraction_of_gpu_memory_to_use=0.01 \
@@ -22,8 +23,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --inference_dir=${train_output_path}/inference \
-        --phones_dict=dump/phone_id_map.txt
+        --phones_dict=dump/phone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
 fi
 # for more GAN Vocoders
@@ -43,8 +44,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --inference_dir=${train_output_path}/inference \
-        --phones_dict=dump/phone_id_map.txt
+        --phones_dict=dump/phone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
 fi
 # the pretrained models haven't release now
@@ -86,8 +87,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --inference_dir=${train_output_path}/inference \
-        --phones_dict=dump/phone_id_map.txt
+        --phones_dict=dump/phone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
 fi

@@ -20,6 +20,7 @@ import numpy as np
 import paddle
 import soundfile as sf
 import yaml
+from timer import timer
 from yacs.config import CfgNode
 
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
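The timing code below only relies on the imported timer being a context manager whose elapse attribute holds the wall-clock duration of the with block. A minimal stand-in with the same interface, as an assumption about the package's behavior rather than its actual source:

import time

class Timer:
    # Minimal stand-in for the imported `timer`: records the
    # wall-clock seconds of the `with` block in `self.elapse`.
    def __enter__(self):
        self._start = time.time()
        self.elapse = 0.0
        return self

    def __exit__(self, *exc):
        self.elapse = time.time() - self._start
        return False  # do not swallow exceptions

# Usage mirrors the diff: `with Timer() as t: ...` then read `t.elapse`.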
@@ -50,6 +51,18 @@ model_alias = {
     "paddlespeech.t2s.models.melgan:MelGANGenerator",
     "mb_melgan_inference":
     "paddlespeech.t2s.models.melgan:MelGANInference",
+    "style_melgan":
+    "paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
+    "style_melgan_inference":
+    "paddlespeech.t2s.models.melgan:StyleMelGANInference",
+    "hifigan":
+    "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
+    "hifigan_inference":
+    "paddlespeech.t2s.models.hifigan:HiFiGANInference",
+    "wavernn":
+    "paddlespeech.t2s.models.wavernn:WaveRNN",
+    "wavernn_inference":
+    "paddlespeech.t2s.models.wavernn:WaveRNNInference",
 }
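Each alias maps a short vocoder name to a "module:object" string, so registering a new vocoder is one dictionary entry rather than a hard import at the top of the file. dynamic_import resolves that string at runtime; a hedged sketch of the resolution step using plain importlib (resolve_alias and demo_alias are illustrative names, not the repo's API):

import importlib

def resolve_alias(name, alias):
    # Split "pkg.module:ClassName" and fetch the class at runtime.
    module_path, _, obj_name = alias[name].partition(":")
    return getattr(importlib.import_module(module_path), obj_name)

# Toy table standing in for model_alias, using a stdlib class so the
# sketch runs without paddlespeech installed.
demo_alias = {"counter": "collections:Counter"}
assert resolve_alias("counter", demo_alias).__name__ == "Counter"

# In synthesize.py, "hifigan_csmsc" is stripped of its dataset suffix
# to get the key, and "<key>_inference" selects the inference wrapper:
voc = "hifigan_csmsc"
voc_name = voc[:voc.rindex("_")]  # -> "hifigan"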
@@ -146,10 +159,15 @@ def evaluate(args):
     voc_name = args.voc[:args.voc.rindex('_')]
     voc_class = dynamic_import(voc_name, model_alias)
     voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
-    voc = voc_class(**voc_config["generator_params"])
-    voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
-    voc.remove_weight_norm()
-    voc.eval()
+    if voc_name != 'wavernn':
+        voc = voc_class(**voc_config["generator_params"])
+        voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
+        voc.remove_weight_norm()
+        voc.eval()
+    else:
+        voc = voc_class(**voc_config["model"])
+        voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"])
+        voc.eval()
     voc_mu, voc_std = np.load(args.voc_stat)
     voc_mu = paddle.to_tensor(voc_mu)
     voc_std = paddle.to_tensor(voc_std)
@@ -162,38 +180,51 @@ def evaluate(args):
     output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
 
+    N = 0
+    T = 0
     for datum in test_dataset:
         utt_id = datum["utt_id"]
-        with paddle.no_grad():
-            # acoustic model
-            if am_name == 'fastspeech2':
-                phone_ids = paddle.to_tensor(datum["text"])
-                spk_emb = None
-                spk_id = None
-                # multi speaker
-                if args.voice_cloning and "spk_emb" in datum:
-                    spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
-                elif "spk_id" in datum:
-                    spk_id = paddle.to_tensor(datum["spk_id"])
-                mel = am_inference(phone_ids, spk_id=spk_id, spk_emb=spk_emb)
-            elif am_name == 'speedyspeech':
-                phone_ids = paddle.to_tensor(datum["phones"])
-                tone_ids = paddle.to_tensor(datum["tones"])
-                mel = am_inference(phone_ids, tone_ids)
-            elif am_name == 'tacotron2':
-                phone_ids = paddle.to_tensor(datum["text"])
-                spk_emb = None
-                # multi speaker
-                if args.voice_cloning and "spk_emb" in datum:
-                    spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
-                mel = am_inference(phone_ids, spk_emb=spk_emb)
-            # vocoder
-            wav = voc_inference(mel)
+        with timer() as t:
+            with paddle.no_grad():
+                # acoustic model
+                if am_name == 'fastspeech2':
+                    phone_ids = paddle.to_tensor(datum["text"])
+                    spk_emb = None
+                    spk_id = None
+                    # multi speaker
+                    if args.voice_cloning and "spk_emb" in datum:
+                        spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
+                    elif "spk_id" in datum:
+                        spk_id = paddle.to_tensor(datum["spk_id"])
+                    mel = am_inference(
+                        phone_ids, spk_id=spk_id, spk_emb=spk_emb)
+                elif am_name == 'speedyspeech':
+                    phone_ids = paddle.to_tensor(datum["phones"])
+                    tone_ids = paddle.to_tensor(datum["tones"])
+                    mel = am_inference(phone_ids, tone_ids)
+                elif am_name == 'tacotron2':
+                    phone_ids = paddle.to_tensor(datum["text"])
+                    spk_emb = None
+                    # multi speaker
+                    if args.voice_cloning and "spk_emb" in datum:
+                        spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
+                    mel = am_inference(phone_ids, spk_emb=spk_emb)
+                # vocoder
+                wav = voc_inference(mel)
+        wav = wav.numpy()
+        N += wav.size
+        T += t.elapse
+        speed = wav.size / t.elapse
+        rtf = am_config.fs / speed
+        print(
+            f"{utt_id}, mel: {mel.shape}, wave: {wav.size}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+        )
         sf.write(
-            str(output_dir / (utt_id + ".wav")),
-            wav.numpy(),
-            samplerate=am_config.fs)
+            str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
         print(f"{utt_id} done!")
+    print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
 
 
 def main():
@@ -246,7 +277,8 @@ def main():
         default='pwgan_csmsc',
         choices=[
             'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk',
-            'mb_melgan_csmsc'
+            'mb_melgan_csmsc', 'wavernn_csmsc', 'hifigan_csmsc',
+            'style_melgan_csmsc'
         ],
         help='Choose vocoder type of tts task.')
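The new RTF (real-time factor) report reads as follows: speed = wav.size / t.elapse is the generation rate in samples per second, and rtf = am_config.fs / speed is compute seconds per second of audio, so RTF < 1 means faster than real time; the running totals N and T give the corpus-level figure in the final print. A worked example with assumed numbers:

fs = 24000        # CSMSC models generate 24 kHz audio
samples = 48000   # 2 s of synthesized audio (assumed)
elapse = 0.5      # measured wall-clock synthesis time in s (assumed)

speed = samples / elapse  # 96000.0 samples/s generated
rtf = fs / speed          # 0.25: 1 s of audio costs 0.25 s of compute
assert rtf == elapse / (samples / fs)  # identical to time / duration
print(f"Hz: {speed}, RTF: {rtf}")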

@@ -21,6 +21,7 @@ import soundfile as sf
 import yaml
 from paddle import jit
 from paddle.static import InputSpec
+from timer import timer
 from yacs.config import CfgNode
 
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
@@ -233,59 +234,68 @@
     # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)
     if am_name == 'tacotron2':
         merge_sentences = True
+    N = 0
+    T = 0
     for utt_id, sentence in sentences:
-        get_tone_ids = False
-        if am_name == 'speedyspeech':
-            get_tone_ids = True
-        if args.lang == 'zh':
-            input_ids = frontend.get_input_ids(
-                sentence,
-                merge_sentences=merge_sentences,
-                get_tone_ids=get_tone_ids)
-            phone_ids = input_ids["phone_ids"]
-            if get_tone_ids:
-                tone_ids = input_ids["tone_ids"]
-        elif args.lang == 'en':
-            input_ids = frontend.get_input_ids(
-                sentence, merge_sentences=merge_sentences)
-            phone_ids = input_ids["phone_ids"]
-        else:
-            print("lang should in {'zh', 'en'}!")
-        with paddle.no_grad():
-            flags = 0
-            for i in range(len(phone_ids)):
-                part_phone_ids = phone_ids[i]
-                # acoustic model
-                if am_name == 'fastspeech2':
-                    # multi speaker
-                    if am_dataset in {"aishell3", "vctk"}:
-                        spk_id = paddle.to_tensor(args.spk_id)
-                        mel = am_inference(part_phone_ids, spk_id)
-                    else:
-                        mel = am_inference(part_phone_ids)
-                elif am_name == 'speedyspeech':
-                    part_tone_ids = tone_ids[i]
-                    if am_dataset in {"aishell3", "vctk"}:
-                        spk_id = paddle.to_tensor(args.spk_id)
-                        mel = am_inference(part_phone_ids, part_tone_ids,
-                                           spk_id)
-                    else:
-                        mel = am_inference(part_phone_ids, part_tone_ids)
-                elif am_name == 'tacotron2':
-                    mel = am_inference(part_phone_ids)
-                # vocoder
-                wav = voc_inference(mel)
-                if flags == 0:
-                    wav_all = wav
-                    flags = 1
-                else:
-                    wav_all = paddle.concat([wav_all, wav])
+        with timer() as t:
+            get_tone_ids = False
+            if am_name == 'speedyspeech':
+                get_tone_ids = True
+            if args.lang == 'zh':
+                input_ids = frontend.get_input_ids(
+                    sentence,
+                    merge_sentences=merge_sentences,
+                    get_tone_ids=get_tone_ids)
+                phone_ids = input_ids["phone_ids"]
+                if get_tone_ids:
+                    tone_ids = input_ids["tone_ids"]
+            elif args.lang == 'en':
+                input_ids = frontend.get_input_ids(
+                    sentence, merge_sentences=merge_sentences)
+                phone_ids = input_ids["phone_ids"]
+            else:
+                print("lang should in {'zh', 'en'}!")
+            with paddle.no_grad():
+                flags = 0
+                for i in range(len(phone_ids)):
+                    part_phone_ids = phone_ids[i]
+                    # acoustic model
+                    if am_name == 'fastspeech2':
+                        # multi speaker
+                        if am_dataset in {"aishell3", "vctk"}:
+                            spk_id = paddle.to_tensor(args.spk_id)
+                            mel = am_inference(part_phone_ids, spk_id)
+                        else:
+                            mel = am_inference(part_phone_ids)
+                    elif am_name == 'speedyspeech':
+                        part_tone_ids = tone_ids[i]
+                        if am_dataset in {"aishell3", "vctk"}:
+                            spk_id = paddle.to_tensor(args.spk_id)
+                            mel = am_inference(part_phone_ids, part_tone_ids,
+                                               spk_id)
+                        else:
+                            mel = am_inference(part_phone_ids, part_tone_ids)
+                    elif am_name == 'tacotron2':
+                        mel = am_inference(part_phone_ids)
+                    # vocoder
+                    wav = voc_inference(mel)
+                    if flags == 0:
+                        wav_all = wav
+                        flags = 1
+                    else:
+                        wav_all = paddle.concat([wav_all, wav])
+        wav = wav_all.numpy()
+        N += wav.size
+        T += t.elapse
+        speed = wav.size / t.elapse
+        rtf = am_config.fs / speed
+        print(
+            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+        )
         sf.write(
-            str(output_dir / (utt_id + ".wav")),
-            wav_all.numpy(),
-            samplerate=am_config.fs)
+            str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
         print(f"{utt_id} done!")
+    print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
 
 
 def main():

@@ -91,7 +91,7 @@ def main():
             target=config.inference.target,
             overlap=config.inference.overlap,
             mu_law=config.mu_law,
-            gen_display=True)
+            gen_display=False)
         wav = wav.numpy()
         N += wav.size
         T += t.elapse

@@ -66,7 +66,7 @@ class MelGANGenerator(nn.Layer):
         nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network,
             by default {}
         pad (str): Padding function module name before dilated convolution layer.
-        pad_params dict): Hyperparameters for padding function.
+        pad_params (dict): Hyperparameters for padding function.
         use_final_nonlinear_activation (nn.Layer): Activation function for the final layer.
         use_weight_norm (bool): Whether to use weight norm.
             If set to true, it will be applied to all of the conv layers.

@@ -509,16 +509,20 @@ class WaveRNN(nn.Layer):
         total_len = num_folds * (target + overlap) + overlap
 
         # Need some silence for the run warmup
-        slience_len = overlap // 2
+        slience_len = 0
+        linear_len = slience_len
         fade_len = overlap - slience_len
         slience = paddle.zeros([slience_len], dtype=paddle.float32)
-        linear = paddle.ones([fade_len], dtype=paddle.float32)
+        linear = paddle.ones([linear_len], dtype=paddle.float32)
 
         # Equal power crossfade
         # fade_in increase from 0 to 1, fade_out reduces from 1 to 0
-        t = paddle.linspace(-1, 1, fade_len, dtype=paddle.float32)
-        fade_in = paddle.sqrt(0.5 * (1 + t))
-        fade_out = paddle.sqrt(0.5 * (1 - t))
+        sigmoid_scale = 2.3
+        t = paddle.linspace(
+            -sigmoid_scale, sigmoid_scale, fade_len, dtype=paddle.float32)
+        # a sigmoid curve should work better
+        fade_in = paddle.nn.functional.sigmoid(t)
+        fade_out = 1 - paddle.nn.functional.sigmoid(t)
         # Concat the silence to the fades
         fade_out = paddle.concat([linear, fade_out])
         fade_in = paddle.concat([slience, fade_in])
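For context on this change: the old window was an equal-power crossfade, where fade_in^2 + fade_out^2 = 1 at every sample so the summed overlap keeps constant power; the new sigmoid window instead makes the amplitudes sum to 1, per the inline comment that a sigmoid curve should work better. The change also drops the warmup silence (slience_len = 0), so the fade spans the full overlap. A small numpy sketch contrasting the two windows (illustrative only; the model code uses paddle):

import numpy as np

fade_len = 8
t = np.linspace(-1, 1, fade_len, dtype=np.float32)

# Old: equal-power crossfade -- squared gains sum to 1.
fade_in_ep = np.sqrt(0.5 * (1 + t))
fade_out_ep = np.sqrt(0.5 * (1 - t))
assert np.allclose(fade_in_ep**2 + fade_out_ep**2, 1.0)

# New: sigmoid crossfade -- linear gains sum to 1.
sigmoid_scale = 2.3
ts = np.linspace(-sigmoid_scale, sigmoid_scale, fade_len, dtype=np.float32)
fade_in_sig = 1.0 / (1.0 + np.exp(-ts))
fade_out_sig = 1.0 - fade_in_sig
assert np.allclose(fade_in_sig + fade_out_sig, 1.0)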
