From 2071774d813a5f12628b2e9eea3b242567208171 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Tue, 25 Jan 2022 09:25:54 +0000
Subject: [PATCH] add wavernn in synthesize_e2e, test=tts

---
Notes (this area is ignored when the patch is applied):
  * Stage 4 of synthesize_e2e.sh runs the fastspeech2_csmsc acoustic model
    with the new wavernn_csmsc vocoder. It reads the vocoder config,
    checkpoint and statistics from a wavernn_test/ directory, which is not
    among the files added by this patch.
  * Unlike stage 3, stage 4 does not pass --inference_dir.
  * evaluate() special-cases voc_name == 'wavernn': the model is built from
    voc_config["model"], the weights are taken from the checkpoint's
    "main_params" entry, and remove_weight_norm() is not called.
  * WaveRNNInference.forward() applies the supplied normalizer to its input
    and forwards the result, together with the batched/target/overlap/
    mu_law/gen_display options, to WaveRNN.generate().

 examples/csmsc/tts3/local/synthesize_e2e.sh | 21 ++++++++++++++++++
 paddlespeech/t2s/exps/synthesize_e2e.py     | 21 +++++++++++++-----
 paddlespeech/t2s/models/wavernn/wavernn.py  | 24 +++++++++++++++++++++
 3 files changed, 61 insertions(+), 5 deletions(-)

diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh
index d4744486..49101ea0 100755
--- a/examples/csmsc/tts3/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts3/local/synthesize_e2e.sh
@@ -89,3 +89,24 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --inference_dir=${train_output_path}/inference \
         --phones_dict=dump/phone_id_map.txt
 fi
+
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "in wavernn syn_e2e"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=fastspeech2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=wavernn_csmsc \
+        --voc_config=wavernn_test/default.yaml \
+        --voc_ckpt=wavernn_test/snapshot_iter_5000.pdz \
+        --voc_stat=wavernn_test/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt
+fi
diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index 9f58579f..1f3f6773 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -59,6 +59,10 @@ model_alias = {
     "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
     "hifigan_inference":
     "paddlespeech.t2s.models.hifigan:HiFiGANInference",
+    "wavernn":
+    "paddlespeech.t2s.models.wavernn:WaveRNN",
+    "wavernn_inference":
+    "paddlespeech.t2s.models.wavernn:WaveRNNInference",
 }
 
 
@@ -148,10 +152,16 @@ def evaluate(args):
     voc_name = args.voc[:args.voc.rindex('_')]
     voc_class = dynamic_import(voc_name, model_alias)
     voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
-    voc = voc_class(**voc_config["generator_params"])
-    voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
-    voc.remove_weight_norm()
-    voc.eval()
+    if voc_name != 'wavernn':
+        voc = voc_class(**voc_config["generator_params"])
+        voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
+        voc.remove_weight_norm()
+        voc.eval()
+    else:
+        voc = voc_class(**voc_config["model"])
+        voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"])
+        voc.eval()
+
     voc_mu, voc_std = np.load(args.voc_stat)
     voc_mu = paddle.to_tensor(voc_mu)
     voc_std = paddle.to_tensor(voc_std)
@@ -307,7 +317,8 @@ def main():
         default='pwgan_csmsc',
         choices=[
             'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk',
-            'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc'
+            'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc',
+            'wavernn_csmsc'
         ],
         help='Choose vocoder type of tts task.')
 
diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py
index 5d1cbd39..2c6941b0 100644
--- a/paddlespeech/t2s/models/wavernn/wavernn.py
+++ b/paddlespeech/t2s/models/wavernn/wavernn.py
@@ -590,3 +590,27 @@ class WaveRNN(nn.Layer):
         for i in range(size):
             bar += '█' if i <= done else '░'
         return bar
+
+
+class WaveRNNInference(nn.Layer):
+    def __init__(self, normalizer, wavernn):
+        super().__init__()
+        self.normalizer = normalizer
+        self.wavernn = wavernn
+
+    def forward(self,
+                logmel,
+                batched: bool=True,
+                target: int=12000,
+                overlap: int=600,
+                mu_law: bool=True,
+                gen_display: bool=False):
+        normalized_mel = self.normalizer(logmel)
+        wav = self.wavernn.generate(
+            normalized_mel,
+            batched=batched,
+            target=target,
+            overlap=overlap,
+            mu_law=mu_law,
+            gen_display=gen_display)
+        return wav