add rtf for synthesize, add more vocoder for synthesize_e2e.sh, test=tts

pull/1511/head
TianYuan 3 years ago
parent 83465d2a06
commit cb07bd2a94

@ -3,10 +3,13 @@
config_path=$1 config_path=$1
train_output_path=$2 train_output_path=$2
ckpt_name=$3 ckpt_name=$3
stage=0
stop_stage=0
FLAGS_allocator_strategy=naive_best_fit \ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_fraction_of_gpu_memory_to_use=0.01 \ FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=tacotron2_csmsc \ --am=tacotron2_csmsc \
--am_config=${config_path} \ --am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
@ -18,3 +21,78 @@ python3 ${BIN_DIR}/../synthesize.py \
--test_metadata=dump/test/norm/metadata.jsonl \ --test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \ --output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt --phones_dict=dump/phone_id_map.txt
fi
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
fi
# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in hifigan syn"
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
fi
# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "in wavernn syn"
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
fi

@ -39,14 +39,14 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \ --am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \ --voc=mb_melgan_csmsc \
--voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\ --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \ --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \ --lang=zh \
--text=${BIN_DIR}/../sentences.txt \ --text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \ --output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \ --phones_dict=dump/phone_id_map.txt \
--phones_dict=dump/phone_id_map.txt --inference_dir=${train_output_path}/inference
fi fi
# the pretrained models haven't release now # the pretrained models haven't release now
@ -88,8 +88,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--lang=zh \ --lang=zh \
--text=${BIN_DIR}/../sentences.txt \ --text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \ --output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \ --phones_dict=dump/phone_id_map.txt \
--phones_dict=dump/phone_id_map.txt --inference_dir=${train_output_path}/inference
fi fi
# wavernn # wavernn

@ -1,15 +1,19 @@
#!/bin/bash #!/bin/bash
config_path=$1 config_path=$1
train_output_path=$2 train_output_path=$2
ckpt_name=$3 ckpt_name=$3
stage=0
stop_stage=0
FLAGS_allocator_strategy=naive_best_fit \ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_fraction_of_gpu_memory_to_use=0.01 \ FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \ --am=speedyspeech_csmsc \
--am_config=${config_path} \ --am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \ --am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \ --voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
@ -18,3 +22,82 @@ python3 ${BIN_DIR}/../synthesize.py \
--output_dir=${train_output_path}/test \ --output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \ --phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt --tones_dict=dump/tone_id_map.txt
fi
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
fi
# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in hifigan syn"
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
fi
# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "in wavernn syn"
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--tones_dict=dump/tone_id_map.txt \
--phones_dict=dump/phone_id_map.txt
fi

@ -22,9 +22,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--lang=zh \ --lang=zh \
--text=${BIN_DIR}/../sentences.txt \ --text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \ --output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt \ --phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt --tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference
fi fi
# for more GAN Vocoders # for more GAN Vocoders
@ -44,9 +44,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--lang=zh \ --lang=zh \
--text=${BIN_DIR}/../sentences.txt \ --text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \ --output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt \ --phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt --tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference
fi fi
# the pretrained models haven't release now # the pretrained models haven't release now
@ -88,9 +88,9 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--lang=zh \ --lang=zh \
--text=${BIN_DIR}/../sentences.txt \ --text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \ --output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt \ --phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt --tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference
fi fi

@ -3,10 +3,13 @@
config_path=$1 config_path=$1
train_output_path=$2 train_output_path=$2
ckpt_name=$3 ckpt_name=$3
stage=0
stop_stage=0
FLAGS_allocator_strategy=naive_best_fit \ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_fraction_of_gpu_memory_to_use=0.01 \ FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=fastspeech2_csmsc \ --am=fastspeech2_csmsc \
--am_config=${config_path} \ --am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
@ -18,3 +21,78 @@ python3 ${BIN_DIR}/../synthesize.py \
--test_metadata=dump/test/norm/metadata.jsonl \ --test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \ --output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt --phones_dict=dump/phone_id_map.txt
fi
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
fi
# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in hifigan syn"
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
fi
# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "in wavernn syn"
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
fi

@ -22,8 +22,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--lang=zh \ --lang=zh \
--text=${BIN_DIR}/../sentences.txt \ --text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \ --output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \ --phones_dict=dump/phone_id_map.txt \
--phones_dict=dump/phone_id_map.txt --inference_dir=${train_output_path}/inference
fi fi
# for more GAN Vocoders # for more GAN Vocoders
@ -43,8 +43,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--lang=zh \ --lang=zh \
--text=${BIN_DIR}/../sentences.txt \ --text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \ --output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \ --phones_dict=dump/phone_id_map.txt \
--phones_dict=dump/phone_id_map.txt --inference_dir=${train_output_path}/inference
fi fi
# the pretrained models haven't release now # the pretrained models haven't release now
@ -86,8 +86,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--lang=zh \ --lang=zh \
--text=${BIN_DIR}/../sentences.txt \ --text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \ --output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \ --phones_dict=dump/phone_id_map.txt \
--phones_dict=dump/phone_id_map.txt --inference_dir=${train_output_path}/inference
fi fi

@ -20,6 +20,7 @@ import numpy as np
import paddle import paddle
import soundfile as sf import soundfile as sf
import yaml import yaml
from timer import timer
from yacs.config import CfgNode from yacs.config import CfgNode
from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.s2t.utils.dynamic_import import dynamic_import
@ -50,6 +51,18 @@ model_alias = {
"paddlespeech.t2s.models.melgan:MelGANGenerator", "paddlespeech.t2s.models.melgan:MelGANGenerator",
"mb_melgan_inference": "mb_melgan_inference":
"paddlespeech.t2s.models.melgan:MelGANInference", "paddlespeech.t2s.models.melgan:MelGANInference",
"style_melgan":
"paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
"style_melgan_inference":
"paddlespeech.t2s.models.melgan:StyleMelGANInference",
"hifigan":
"paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
"hifigan_inference":
"paddlespeech.t2s.models.hifigan:HiFiGANInference",
"wavernn":
"paddlespeech.t2s.models.wavernn:WaveRNN",
"wavernn_inference":
"paddlespeech.t2s.models.wavernn:WaveRNNInference",
} }
@ -146,10 +159,15 @@ def evaluate(args):
voc_name = args.voc[:args.voc.rindex('_')] voc_name = args.voc[:args.voc.rindex('_')]
voc_class = dynamic_import(voc_name, model_alias) voc_class = dynamic_import(voc_name, model_alias)
voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
if voc_name != 'wavernn':
voc = voc_class(**voc_config["generator_params"]) voc = voc_class(**voc_config["generator_params"])
voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
voc.remove_weight_norm() voc.remove_weight_norm()
voc.eval() voc.eval()
else:
voc = voc_class(**voc_config["model"])
voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"])
voc.eval()
voc_mu, voc_std = np.load(args.voc_stat) voc_mu, voc_std = np.load(args.voc_stat)
voc_mu = paddle.to_tensor(voc_mu) voc_mu = paddle.to_tensor(voc_mu)
voc_std = paddle.to_tensor(voc_std) voc_std = paddle.to_tensor(voc_std)
@ -162,8 +180,12 @@ def evaluate(args):
output_dir = Path(args.output_dir) output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
N = 0
T = 0
for datum in test_dataset: for datum in test_dataset:
utt_id = datum["utt_id"] utt_id = datum["utt_id"]
with timer() as t:
with paddle.no_grad(): with paddle.no_grad():
# acoustic model # acoustic model
if am_name == 'fastspeech2': if am_name == 'fastspeech2':
@ -175,7 +197,8 @@ def evaluate(args):
spk_emb = paddle.to_tensor(np.load(datum["spk_emb"])) spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
elif "spk_id" in datum: elif "spk_id" in datum:
spk_id = paddle.to_tensor(datum["spk_id"]) spk_id = paddle.to_tensor(datum["spk_id"])
mel = am_inference(phone_ids, spk_id=spk_id, spk_emb=spk_emb) mel = am_inference(
phone_ids, spk_id=spk_id, spk_emb=spk_emb)
elif am_name == 'speedyspeech': elif am_name == 'speedyspeech':
phone_ids = paddle.to_tensor(datum["phones"]) phone_ids = paddle.to_tensor(datum["phones"])
tone_ids = paddle.to_tensor(datum["tones"]) tone_ids = paddle.to_tensor(datum["tones"])
@ -189,11 +212,19 @@ def evaluate(args):
mel = am_inference(phone_ids, spk_emb=spk_emb) mel = am_inference(phone_ids, spk_emb=spk_emb)
# vocoder # vocoder
wav = voc_inference(mel) wav = voc_inference(mel)
wav = wav.numpy()
N += wav.size
T += t.elapse
speed = wav.size / t.elapse
rtf = am_config.fs / speed
print(
f"{utt_id}, mel: {mel.shape}, wave: {wav.size}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
)
sf.write( sf.write(
str(output_dir / (utt_id + ".wav")), str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
wav.numpy(),
samplerate=am_config.fs)
print(f"{utt_id} done!") print(f"{utt_id} done!")
print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
def main(): def main():
@ -246,7 +277,8 @@ def main():
default='pwgan_csmsc', default='pwgan_csmsc',
choices=[ choices=[
'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk',
'mb_melgan_csmsc' 'mb_melgan_csmsc', 'wavernn_csmsc', 'hifigan_csmsc',
'style_melgan_csmsc'
], ],
help='Choose vocoder type of tts task.') help='Choose vocoder type of tts task.')

@ -21,6 +21,7 @@ import soundfile as sf
import yaml import yaml
from paddle import jit from paddle import jit
from paddle.static import InputSpec from paddle.static import InputSpec
from timer import timer
from yacs.config import CfgNode from yacs.config import CfgNode
from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.s2t.utils.dynamic_import import dynamic_import
@ -233,8 +234,10 @@ def evaluate(args):
# but still not stopping in the end (NOTE by yuantian01 Feb 9 2022) # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)
if am_name == 'tacotron2': if am_name == 'tacotron2':
merge_sentences = True merge_sentences = True
N = 0
T = 0
for utt_id, sentence in sentences: for utt_id, sentence in sentences:
with timer() as t:
get_tone_ids = False get_tone_ids = False
if am_name == 'speedyspeech': if am_name == 'speedyspeech':
get_tone_ids = True get_tone_ids = True
@ -281,11 +284,18 @@ def evaluate(args):
flags = 1 flags = 1
else: else:
wav_all = paddle.concat([wav_all, wav]) wav_all = paddle.concat([wav_all, wav])
wav = wav_all.numpy()
N += wav.size
T += t.elapse
speed = wav.size / t.elapse
rtf = am_config.fs / speed
print(
f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
)
sf.write( sf.write(
str(output_dir / (utt_id + ".wav")), str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
wav_all.numpy(),
samplerate=am_config.fs)
print(f"{utt_id} done!") print(f"{utt_id} done!")
print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
def main(): def main():

@ -91,7 +91,7 @@ def main():
target=config.inference.target, target=config.inference.target,
overlap=config.inference.overlap, overlap=config.inference.overlap,
mu_law=config.mu_law, mu_law=config.mu_law,
gen_display=True) gen_display=False)
wav = wav.numpy() wav = wav.numpy()
N += wav.size N += wav.size
T += t.elapse T += t.elapse

@ -66,7 +66,7 @@ class MelGANGenerator(nn.Layer):
nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network, nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network,
by default {} by default {}
pad (str): Padding function module name before dilated convolution layer. pad (str): Padding function module name before dilated convolution layer.
pad_params dict): Hyperparameters for padding function. pad_params (dict): Hyperparameters for padding function.
use_final_nonlinear_activation (nn.Layer): Activation function for the final layer. use_final_nonlinear_activation (nn.Layer): Activation function for the final layer.
use_weight_norm (bool): Whether to use weight norm. use_weight_norm (bool): Whether to use weight norm.
If set to true, it will be applied to all of the conv layers. If set to true, it will be applied to all of the conv layers.

@ -509,16 +509,20 @@ class WaveRNN(nn.Layer):
total_len = num_folds * (target + overlap) + overlap total_len = num_folds * (target + overlap) + overlap
# Need some silence for the run warmup # Need some silence for the run warmup
slience_len = overlap // 2 slience_len = 0
linear_len = slience_len
fade_len = overlap - slience_len fade_len = overlap - slience_len
slience = paddle.zeros([slience_len], dtype=paddle.float32) slience = paddle.zeros([slience_len], dtype=paddle.float32)
linear = paddle.ones([fade_len], dtype=paddle.float32) linear = paddle.ones([linear_len], dtype=paddle.float32)
# Equal power crossfade # Equal power crossfade
# fade_in increase from 0 to 1, fade_out reduces from 1 to 0 # fade_in increase from 0 to 1, fade_out reduces from 1 to 0
t = paddle.linspace(-1, 1, fade_len, dtype=paddle.float32) sigmoid_scale = 2.3
fade_in = paddle.sqrt(0.5 * (1 + t)) t = paddle.linspace(
fade_out = paddle.sqrt(0.5 * (1 - t)) -sigmoid_scale, sigmoid_scale, fade_len, dtype=paddle.float32)
# sigmoid 曲线应该更好
fade_in = paddle.nn.functional.sigmoid(t)
fade_out = 1 - paddle.nn.functional.sigmoid(t)
# Concat the silence to the fades # Concat the silence to the fades
fade_out = paddle.concat([linear, fade_out]) fade_out = paddle.concat([linear, fade_out])
fade_in = paddle.concat([slience, fade_in]) fade_in = paddle.concat([slience, fade_in])

Loading…
Cancel
Save