From fe8bf2a38c85f88d52dc316659e22f28447439f1 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 9 Mar 2022 11:21:27 +0000 Subject: [PATCH 1/2] format synthesize, test=tts --- paddlespeech/t2s/exps/inference.py | 232 ++++++++++------- paddlespeech/t2s/exps/syn_utils.py | 243 ++++++++++++++++++ paddlespeech/t2s/exps/synthesize.py | 142 +--------- paddlespeech/t2s/exps/synthesize_e2e.py | 202 ++------------- paddlespeech/t2s/exps/voice_cloning.py | 77 +----- .../t2s/modules/predictor/length_regulator.py | 10 + 6 files changed, 438 insertions(+), 468 deletions(-) create mode 100644 paddlespeech/t2s/exps/syn_utils.py diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index 26d7e2c0..cdbf10e6 100644 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -17,13 +17,92 @@ from pathlib import Path import numpy import soundfile as sf from paddle import inference - -from paddlespeech.t2s.frontend import English -from paddlespeech.t2s.frontend.zh_frontend import Frontend +from timer import timer + +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.exps.syn_utils import get_sentences +from paddlespeech.t2s.utils import str2bool + + +def get_predictor(args, filed='am'): + full_name = '' + if filed == 'am': + full_name = args.am + elif filed == 'voc': + full_name = args.voc + model_name = full_name[:full_name.rindex('_')] + config = inference.Config( + str(Path(args.inference_dir) / (full_name + ".pdmodel")), + str(Path(args.inference_dir) / (full_name + ".pdiparams"))) + if args.device == "gpu": + config.enable_use_gpu(100, 0) + elif args.device == "cpu": + config.disable_gpu() + # This line must be commented for fastspeech2, if not, it will OOM + if model_name != 'fastspeech2': + config.enable_memory_optim() + predictor = inference.create_predictor(config) + return predictor -# only inference for models trained with csmsc now -def main(): +def get_am_output(args, am_predictor, frontend, merge_sentences, input): + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + am_input_names = am_predictor.get_input_names() + get_tone_ids = False + get_spk_id = False + if am_name == 'speedyspeech': + get_tone_ids = True + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + get_spk_id = True + spk_id = numpy.array([args.spk_id]) + if args.lang == 'zh': + input_ids = frontend.get_input_ids( + input, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids) + phone_ids = input_ids["phone_ids"] + elif args.lang == 'en': + input_ids = frontend.get_input_ids( + input, merge_sentences=merge_sentences) + phone_ids = input_ids["phone_ids"] + else: + print("lang should in {'zh', 'en'}!") + + if get_tone_ids: + tone_ids = input_ids["tone_ids"] + tones = tone_ids[0].numpy() + tones_handle = am_predictor.get_input_handle(am_input_names[1]) + tones_handle.reshape(tones.shape) + tones_handle.copy_from_cpu(tones) + if get_spk_id: + spk_id_handle = am_predictor.get_input_handle(am_input_names[1]) + spk_id_handle.reshape(spk_id.shape) + spk_id_handle.copy_from_cpu(spk_id) + phones = phone_ids[0].numpy() + phones_handle = am_predictor.get_input_handle(am_input_names[0]) + phones_handle.reshape(phones.shape) + phones_handle.copy_from_cpu(phones) + + am_predictor.run() + am_output_names = am_predictor.get_output_names() + am_output_handle = am_predictor.get_output_handle(am_output_names[0]) + am_output_data = am_output_handle.copy_to_cpu() + return am_output_data + + +def 
get_voc_output(args, voc_predictor, input): + voc_input_names = voc_predictor.get_input_names() + mel_handle = voc_predictor.get_input_handle(voc_input_names[0]) + mel_handle.reshape(input.shape) + mel_handle.copy_from_cpu(input) + + voc_predictor.run() + voc_output_names = voc_predictor.get_output_names() + voc_output_handle = voc_predictor.get_output_handle(voc_output_names[0]) + wav = voc_output_handle.copy_to_cpu() + return wav + + +def parse_args(): parser = argparse.ArgumentParser( description="Paddle Infernce with speedyspeech & parallel wavegan.") # acoustic model @@ -70,113 +149,82 @@ def main(): parser.add_argument( "--inference_dir", type=str, help="dir to save inference models") parser.add_argument("--output_dir", type=str, help="output dir") + # inference + parser.add_argument( + "--use_trt", + type=str2bool, + default=False, + help="Whether to use inference engin TensorRT.", ) + parser.add_argument( + "--int8", + type=str2bool, + default=False, + help="Whether to use int8 inference.", ) + parser.add_argument( + "--fp16", + type=str2bool, + default=False, + help="Whether to use float16 inference.", ) + parser.add_argument( + "--device", + default="gpu", + choices=["gpu", "cpu"], + help="Device selected for inference.", ) args, _ = parser.parse_known_args() + return args + +# only inference for models trained with csmsc now +def main(): + args = parse_args() # frontend - if args.lang == 'zh': - frontend = Frontend( - phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) - elif args.lang == 'en': - frontend = English(phone_vocab_path=args.phones_dict) - print("frontend done!") + frontend = get_frontend(args) + # am_predictor + am_predictor = get_predictor(args, filed='am') # model: {model_name}_{dataset} - am_name = args.am[:args.am.rindex('_')] am_dataset = args.am[args.am.rindex('_') + 1:] - am_config = inference.Config( - str(Path(args.inference_dir) / (args.am + ".pdmodel")), - str(Path(args.inference_dir) / (args.am + ".pdiparams"))) - am_config.enable_use_gpu(100, 0) - # This line must be commented for fastspeech2, if not, it will OOM - if am_name != 'fastspeech2': - am_config.enable_memory_optim() - am_predictor = inference.create_predictor(am_config) - - voc_config = inference.Config( - str(Path(args.inference_dir) / (args.voc + ".pdmodel")), - str(Path(args.inference_dir) / (args.voc + ".pdiparams"))) - voc_config.enable_use_gpu(100, 0) - voc_config.enable_memory_optim() - voc_predictor = inference.create_predictor(voc_config) + # voc_predictor + voc_predictor = get_predictor(args, filed='voc') output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) - sentences = [] - - print("in new inference") - - # construct dataset for evaluation - sentences = [] - with open(args.text, 'rt') as f: - for line in f: - items = line.strip().split() - utt_id = items[0] - if args.lang == 'zh': - sentence = "".join(items[1:]) - elif args.lang == 'en': - sentence = " ".join(items[1:]) - sentences.append((utt_id, sentence)) - get_tone_ids = False - get_spk_id = False - if am_name == 'speedyspeech': - get_tone_ids = True - if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: - get_spk_id = True - spk_id = numpy.array([args.spk_id]) + sentences = get_sentences(args) - am_input_names = am_predictor.get_input_names() - print("am_input_names:", am_input_names) merge_sentences = True + N = 0 + T = 0 + fs = 24000 if am_dataset != 'ljspeech' else 22050 + i = 0 for utt_id, sentence in sentences: - if args.lang == 'zh': - input_ids = 
frontend.get_input_ids( - sentence, + # warmup + i += 1 + with timer() as t: + am_output_data = get_am_output( + args, + am_predictor=am_predictor, + frontend=frontend, merge_sentences=merge_sentences, - get_tone_ids=get_tone_ids) - phone_ids = input_ids["phone_ids"] - elif args.lang == 'en': - input_ids = frontend.get_input_ids( - sentence, merge_sentences=merge_sentences) - phone_ids = input_ids["phone_ids"] - else: - print("lang should in {'zh', 'en'}!") - - if get_tone_ids: - tone_ids = input_ids["tone_ids"] - tones = tone_ids[0].numpy() - tones_handle = am_predictor.get_input_handle(am_input_names[1]) - tones_handle.reshape(tones.shape) - tones_handle.copy_from_cpu(tones) - if get_spk_id: - spk_id_handle = am_predictor.get_input_handle(am_input_names[1]) - spk_id_handle.reshape(spk_id.shape) - spk_id_handle.copy_from_cpu(spk_id) - phones = phone_ids[0].numpy() - phones_handle = am_predictor.get_input_handle(am_input_names[0]) - phones_handle.reshape(phones.shape) - phones_handle.copy_from_cpu(phones) - - am_predictor.run() - am_output_names = am_predictor.get_output_names() - am_output_handle = am_predictor.get_output_handle(am_output_names[0]) - am_output_data = am_output_handle.copy_to_cpu() - - voc_input_names = voc_predictor.get_input_names() - mel_handle = voc_predictor.get_input_handle(voc_input_names[0]) - mel_handle.reshape(am_output_data.shape) - mel_handle.copy_from_cpu(am_output_data) - - voc_predictor.run() - voc_output_names = voc_predictor.get_output_names() - voc_output_handle = voc_predictor.get_output_handle(voc_output_names[0]) - wav = voc_output_handle.copy_to_cpu() + input=sentence) + wav = get_voc_output( + args, voc_predictor=voc_predictor, input=am_output_data) + + if i >= 3: + N += wav.size + T += t.elapse + speed = wav.size / t.elapse + rtf = fs / speed sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000) + print( + f"{utt_id}, mel: {am_output_data.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) print(f"{utt_id} done!") + print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }") if __name__ == "__main__": diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py new file mode 100644 index 00000000..c52cb372 --- /dev/null +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -0,0 +1,243 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
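Note on the timing added to inference.py above: synthesis speed is reported in generated samples per wall-clock second, and the real-time factor (RTF) is the output sample rate divided by that speed, so an RTF below 1.0 means audio is produced faster than it plays back. A minimal sketch of that bookkeeping, with made-up per-utterance numbers standing in for the predictor calls:

# Sketch of the speed / RTF accounting used in inference.py; the
# (wav_size, elapse) pairs below are illustrative values only.
fs = 24000                       # output sample rate; 22050 for *_ljspeech models
N, T = 0, 0.0                    # total samples generated, total elapsed seconds
for wav_size, elapse in [(48000, 0.50), (96000, 0.75)]:
    N += wav_size
    T += elapse
    speed = wav_size / elapse    # samples synthesized per second of compute
    rtf = fs / speed             # seconds of compute per second of audio
    print(f"Hz: {speed:.0f}, RTF: {rtf:.3f}")
print(f"generation speed: {N / T:.0f}Hz, RTF: {fs / (N / T):.3f}")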
+import os + +import numpy as np +import paddle +from paddle import jit +from paddle.static import InputSpec + +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.modules.normalizer import ZScore + +model_alias = { + # acoustic model + "speedyspeech": + "paddlespeech.t2s.models.speedyspeech:SpeedySpeech", + "speedyspeech_inference": + "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference", + "fastspeech2": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2", + "fastspeech2_inference": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + "paddlespeech.t2s.models.tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", + # voc + "pwgan": + "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", + "pwgan_inference": + "paddlespeech.t2s.models.parallel_wavegan:PWGInference", + "mb_melgan": + "paddlespeech.t2s.models.melgan:MelGANGenerator", + "mb_melgan_inference": + "paddlespeech.t2s.models.melgan:MelGANInference", + "style_melgan": + "paddlespeech.t2s.models.melgan:StyleMelGANGenerator", + "style_melgan_inference": + "paddlespeech.t2s.models.melgan:StyleMelGANInference", + "hifigan": + "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", + "hifigan_inference": + "paddlespeech.t2s.models.hifigan:HiFiGANInference", + "wavernn": + "paddlespeech.t2s.models.wavernn:WaveRNN", + "wavernn_inference": + "paddlespeech.t2s.models.wavernn:WaveRNNInference", +} + + +# input +def get_sentences(args): + # construct dataset for evaluation + sentences = [] + with open(args.text, 'rt') as f: + for line in f: + items = line.strip().split() + utt_id = items[0] + if 'lang' in args and args.lang == 'zh': + sentence = "".join(items[1:]) + elif 'lang' in args and args.lang == 'en': + sentence = " ".join(items[1:]) + sentences.append((utt_id, sentence)) + return sentences + + +def get_test_dataset(args, test_metadata, am_name, am_dataset): + if am_name == 'fastspeech2': + fields = ["utt_id", "text"] + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + print("multiple speaker fastspeech2!") + fields += ["spk_id"] + elif 'voice_cloning' in args and args.voice_cloning: + print("voice cloning!") + fields += ["spk_emb"] + else: + print("single speaker fastspeech2!") + elif am_name == 'speedyspeech': + fields = ["utt_id", "phones", "tones"] + elif am_name == 'tacotron2': + fields = ["utt_id", "text"] + if 'voice_cloning' in args and args.voice_cloning: + print("voice cloning!") + fields += ["spk_emb"] + + test_dataset = DataTable(data=test_metadata, fields=fields) + return test_dataset + + +# frontend +def get_frontend(args): + if 'lang' in args and args.lang == 'zh': + frontend = Frontend( + phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) + elif 'lang' in args and args.lang == 'en': + frontend = English(phone_vocab_path=args.phones_dict) + else: + print("wrong lang!") + print("frontend done!") + return frontend + + +# dygraph +def get_am_inference(args, am_config): + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + tone_size = None + if 'tones_dict' in args and args.tones_dict: + with open(args.tones_dict, "r") as f: + tone_id = [line.strip().split() for line in f.readlines()] + tone_size = 
len(tone_id) + print("tone_size:", tone_size) + + spk_num = None + if 'speaker_dict' in args and args.speaker_dict: + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id) + print("spk_num:", spk_num) + + odim = am_config.n_mels + # model: {model_name}_{dataset} + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + + am_class = dynamic_import(am_name, model_alias) + am_inference_class = dynamic_import(am_name + '_inference', model_alias) + + if am_name == 'fastspeech2': + am = am_class( + idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"]) + elif am_name == 'speedyspeech': + am = am_class( + vocab_size=vocab_size, + tone_size=tone_size, + spk_num=spk_num, + **am_config["model"]) + elif am_name == 'tacotron2': + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) + + am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) + am.eval() + am_mu, am_std = np.load(args.am_stat) + am_mu = paddle.to_tensor(am_mu) + am_std = paddle.to_tensor(am_std) + am_normalizer = ZScore(am_mu, am_std) + am_inference = am_inference_class(am_normalizer, am) + am_inference.eval() + print("acoustic model done!") + return am_inference, am_name, am_dataset + + +def get_voc_inference(args, voc_config): + # model: {model_name}_{dataset} + voc_name = args.voc[:args.voc.rindex('_')] + voc_class = dynamic_import(voc_name, model_alias) + voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) + if voc_name != 'wavernn': + voc = voc_class(**voc_config["generator_params"]) + voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) + voc.remove_weight_norm() + voc.eval() + else: + voc = voc_class(**voc_config["model"]) + voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"]) + voc.eval() + + voc_mu, voc_std = np.load(args.voc_stat) + voc_mu = paddle.to_tensor(voc_mu) + voc_std = paddle.to_tensor(voc_std) + voc_normalizer = ZScore(voc_mu, voc_std) + voc_inference = voc_inference_class(voc_normalizer, voc) + voc_inference.eval() + print("voc done!") + return voc_inference + + +# to static +def am_to_static(args, am_inference, am_name, am_dataset): + if am_name == 'fastspeech2': + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + am_inference = jit.to_static( + am_inference, + input_spec=[ + InputSpec([-1], dtype=paddle.int64), + InputSpec([1], dtype=paddle.int64), + ], ) + else: + am_inference = jit.to_static( + am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)]) + + elif am_name == 'speedyspeech': + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + am_inference = jit.to_static( + am_inference, + input_spec=[ + InputSpec([-1], dtype=paddle.int64), # text + InputSpec([-1], dtype=paddle.int64), # tone + InputSpec([1], dtype=paddle.int64), # spk_id + None # duration + ]) + else: + am_inference = jit.to_static( + am_inference, + input_spec=[ + InputSpec([-1], dtype=paddle.int64), + InputSpec([-1], dtype=paddle.int64) + ]) + + elif am_name == 'tacotron2': + am_inference = jit.to_static( + am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)]) + + paddle.jit.save(am_inference, os.path.join(args.inference_dir, args.am)) + am_inference = paddle.jit.load(os.path.join(args.inference_dir, args.am)) + return am_inference + + +def voc_to_static(args, voc_inference): + voc_inference = jit.to_static( + voc_inference, input_spec=[ + InputSpec([-1, 80], dtype=paddle.float32), + ]) + paddle.jit.save(voc_inference, 
os.path.join(args.inference_dir, args.voc)) + voc_inference = paddle.jit.load(os.path.join(args.inference_dir, args.voc)) + return voc_inference diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index 81da14f2..426b7617 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -23,48 +23,11 @@ import yaml from timer import timer from yacs.config import CfgNode -from paddlespeech.s2t.utils.dynamic_import import dynamic_import -from paddlespeech.t2s.datasets.data_table import DataTable -from paddlespeech.t2s.modules.normalizer import ZScore +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_test_dataset +from paddlespeech.t2s.exps.syn_utils import get_voc_inference from paddlespeech.t2s.utils import str2bool -model_alias = { - # acoustic model - "speedyspeech": - "paddlespeech.t2s.models.speedyspeech:SpeedySpeech", - "speedyspeech_inference": - "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference", - "fastspeech2": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2", - "fastspeech2_inference": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", - "tacotron2": - "paddlespeech.t2s.models.tacotron2:Tacotron2", - "tacotron2_inference": - "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", - # voc - "pwgan": - "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", - "pwgan_inference": - "paddlespeech.t2s.models.parallel_wavegan:PWGInference", - "mb_melgan": - "paddlespeech.t2s.models.melgan:MelGANGenerator", - "mb_melgan_inference": - "paddlespeech.t2s.models.melgan:MelGANInference", - "style_melgan": - "paddlespeech.t2s.models.melgan:StyleMelGANGenerator", - "style_melgan_inference": - "paddlespeech.t2s.models.melgan:StyleMelGANInference", - "hifigan": - "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", - "hifigan_inference": - "paddlespeech.t2s.models.hifigan:HiFiGANInference", - "wavernn": - "paddlespeech.t2s.models.wavernn:WaveRNN", - "wavernn_inference": - "paddlespeech.t2s.models.wavernn:WaveRNNInference", -} - def evaluate(args): # dataloader has been too verbose @@ -86,96 +49,12 @@ def evaluate(args): print(am_config) print(voc_config) - # construct dataset for evaluation - - # model: {model_name}_{dataset} - am_name = args.am[:args.am.rindex('_')] - am_dataset = args.am[args.am.rindex('_') + 1:] - - if am_name == 'fastspeech2': - fields = ["utt_id", "text"] - spk_num = None - if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: - print("multiple speaker fastspeech2!") - with open(args.speaker_dict, 'rt') as f: - spk_id = [line.strip().split() for line in f.readlines()] - spk_num = len(spk_id) - fields += ["spk_id"] - elif args.voice_cloning: - print("voice cloning!") - fields += ["spk_emb"] - else: - print("single speaker fastspeech2!") - print("spk_num:", spk_num) - elif am_name == 'speedyspeech': - fields = ["utt_id", "phones", "tones"] - elif am_name == 'tacotron2': - fields = ["utt_id", "text"] - if args.voice_cloning: - print("voice cloning!") - fields += ["spk_emb"] - - test_dataset = DataTable(data=test_metadata, fields=fields) - - with open(args.phones_dict, "r") as f: - phn_id = [line.strip().split() for line in f.readlines()] - vocab_size = len(phn_id) - print("vocab_size:", vocab_size) - - tone_size = None - if args.tones_dict: - with open(args.tones_dict, "r") as f: - tone_id = [line.strip().split() for line in f.readlines()] - tone_size = len(tone_id) - print("tone_size:", tone_size) - # acoustic model - odim = 
am_config.n_mels - am_class = dynamic_import(am_name, model_alias) - am_inference_class = dynamic_import(am_name + '_inference', model_alias) - - if am_name == 'fastspeech2': - am = am_class( - idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"]) - elif am_name == 'speedyspeech': - am = am_class( - vocab_size=vocab_size, tone_size=tone_size, **am_config["model"]) - elif am_name == 'tacotron2': - am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) - - am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) - am.eval() - am_mu, am_std = np.load(args.am_stat) - am_mu = paddle.to_tensor(am_mu) - am_std = paddle.to_tensor(am_std) - am_normalizer = ZScore(am_mu, am_std) - am_inference = am_inference_class(am_normalizer, am) - print("am_inference.training0:", am_inference.training) - am_inference.eval() - print("acoustic model done!") + am_inference, am_name, am_dataset = get_am_inference(args, am_config) + test_dataset = get_test_dataset(args, test_metadata, am_name, am_dataset) # vocoder - # model: {model_name}_{dataset} - voc_name = args.voc[:args.voc.rindex('_')] - voc_class = dynamic_import(voc_name, model_alias) - voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) - if voc_name != 'wavernn': - voc = voc_class(**voc_config["generator_params"]) - voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) - voc.remove_weight_norm() - voc.eval() - else: - voc = voc_class(**voc_config["model"]) - voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"]) - voc.eval() - voc_mu, voc_std = np.load(args.voc_stat) - voc_mu = paddle.to_tensor(voc_mu) - voc_std = paddle.to_tensor(voc_std) - voc_normalizer = ZScore(voc_mu, voc_std) - voc_inference = voc_inference_class(voc_normalizer, voc) - print("voc_inference.training0:", voc_inference.training) - voc_inference.eval() - print("voc done!") + voc_inference = get_voc_inference(args, voc_config) output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) @@ -227,7 +106,7 @@ def evaluate(args): print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") -def main(): +def parse_args(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser( description="Synthesize with acoustic model & vocoder") @@ -264,7 +143,6 @@ def main(): "--tones_dict", type=str, default=None, help="tone vocabulary file.") parser.add_argument( "--speaker_dict", type=str, default=None, help="speaker id map file.") - parser.add_argument( "--voice-cloning", type=str2bool, @@ -281,7 +159,6 @@ def main(): 'style_melgan_csmsc' ], help='Choose vocoder type of tts task.') - parser.add_argument( '--voc_config', type=str, @@ -302,7 +179,12 @@ def main(): parser.add_argument("--output_dir", type=str, help="output dir.") args = parser.parse_args() + return args + + +def main(): + args = parse_args() if args.ngpu == 0: paddle.set_device("cpu") elif args.ngpu > 0: diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 94180f85..49be2b40 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -12,59 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
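The helpers factored into syn_utils.py are composed the same way by synthesize.py, synthesize_e2e.py, and voice_cloning.py: load the YAML configs, build the text frontend and the normalized acoustic-model and vocoder inference wrappers, then optionally export static graphs. A rough sketch of that flow, assuming an args namespace carrying the same fields the argument parsers in these scripts define (all file paths below are placeholders):

import argparse

import yaml
from yacs.config import CfgNode

from paddlespeech.t2s.exps.syn_utils import am_to_static
from paddlespeech.t2s.exps.syn_utils import get_am_inference
from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_voc_inference
from paddlespeech.t2s.exps.syn_utils import voc_to_static

# Placeholder arguments; real runs pass these on the command line.
args = argparse.Namespace(
    am='fastspeech2_csmsc', am_config='default.yaml', am_ckpt='snapshot.pdz',
    am_stat='speech_stats.npy', phones_dict='phone_id_map.txt',
    tones_dict=None, speaker_dict=None, lang='zh',
    voc='pwgan_csmsc', voc_config='pwg_default.yaml', voc_ckpt='pwg.pdz',
    voc_stat='pwg_stats.npy', inference_dir='inference')

with open(args.am_config) as f:
    am_config = CfgNode(yaml.safe_load(f))
with open(args.voc_config) as f:
    voc_config = CfgNode(yaml.safe_load(f))

frontend = get_frontend(args)                    # zh or en text frontend
am_inference, am_name, am_dataset = get_am_inference(args, am_config)
voc_inference = get_voc_inference(args, voc_config)

if args.inference_dir:                           # dygraph to static export
    am_inference = am_to_static(args, am_inference, am_name, am_dataset)
    voc_inference = voc_to_static(args, voc_inference)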
import argparse -import os from pathlib import Path -import numpy as np import paddle import soundfile as sf import yaml -from paddle import jit -from paddle.static import InputSpec from timer import timer from yacs.config import CfgNode -from paddlespeech.s2t.utils.dynamic_import import dynamic_import -from paddlespeech.t2s.frontend import English -from paddlespeech.t2s.frontend.zh_frontend import Frontend -from paddlespeech.t2s.modules.normalizer import ZScore - -model_alias = { - # acoustic model - "speedyspeech": - "paddlespeech.t2s.models.speedyspeech:SpeedySpeech", - "speedyspeech_inference": - "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference", - "fastspeech2": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2", - "fastspeech2_inference": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", - "tacotron2": - "paddlespeech.t2s.models.tacotron2:Tacotron2", - "tacotron2_inference": - "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", - # voc - "pwgan": - "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", - "pwgan_inference": - "paddlespeech.t2s.models.parallel_wavegan:PWGInference", - "mb_melgan": - "paddlespeech.t2s.models.melgan:MelGANGenerator", - "mb_melgan_inference": - "paddlespeech.t2s.models.melgan:MelGANInference", - "style_melgan": - "paddlespeech.t2s.models.melgan:StyleMelGANGenerator", - "style_melgan_inference": - "paddlespeech.t2s.models.melgan:StyleMelGANInference", - "hifigan": - "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", - "hifigan_inference": - "paddlespeech.t2s.models.hifigan:HiFiGANInference", - "wavernn": - "paddlespeech.t2s.models.wavernn:WaveRNN", - "wavernn_inference": - "paddlespeech.t2s.models.wavernn:WaveRNNInference", -} +from paddlespeech.t2s.exps.syn_utils import am_to_static +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.exps.syn_utils import get_sentences +from paddlespeech.t2s.exps.syn_utils import get_voc_inference +from paddlespeech.t2s.exps.syn_utils import voc_to_static def evaluate(args): @@ -81,155 +42,28 @@ def evaluate(args): print(am_config) print(voc_config) - # construct dataset for evaluation - sentences = [] - with open(args.text, 'rt') as f: - for line in f: - items = line.strip().split() - utt_id = items[0] - if args.lang == 'zh': - sentence = "".join(items[1:]) - elif args.lang == 'en': - sentence = " ".join(items[1:]) - sentences.append((utt_id, sentence)) - - with open(args.phones_dict, "r") as f: - phn_id = [line.strip().split() for line in f.readlines()] - vocab_size = len(phn_id) - print("vocab_size:", vocab_size) - - tone_size = None - if args.tones_dict: - with open(args.tones_dict, "r") as f: - tone_id = [line.strip().split() for line in f.readlines()] - tone_size = len(tone_id) - print("tone_size:", tone_size) - - spk_num = None - if args.speaker_dict: - with open(args.speaker_dict, 'rt') as f: - spk_id = [line.strip().split() for line in f.readlines()] - spk_num = len(spk_id) - print("spk_num:", spk_num) + sentences = get_sentences(args) # frontend - if args.lang == 'zh': - frontend = Frontend( - phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) - elif args.lang == 'en': - frontend = English(phone_vocab_path=args.phones_dict) - print("frontend done!") + frontend = get_frontend(args) # acoustic model - odim = am_config.n_mels - # model: {model_name}_{dataset} - am_name = args.am[:args.am.rindex('_')] - am_dataset = args.am[args.am.rindex('_') + 1:] - - am_class = 
dynamic_import(am_name, model_alias) - am_inference_class = dynamic_import(am_name + '_inference', model_alias) - - if am_name == 'fastspeech2': - am = am_class( - idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"]) - elif am_name == 'speedyspeech': - am = am_class( - vocab_size=vocab_size, - tone_size=tone_size, - spk_num=spk_num, - **am_config["model"]) - elif am_name == 'tacotron2': - am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) - - am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) - am.eval() - am_mu, am_std = np.load(args.am_stat) - am_mu = paddle.to_tensor(am_mu) - am_std = paddle.to_tensor(am_std) - am_normalizer = ZScore(am_mu, am_std) - am_inference = am_inference_class(am_normalizer, am) - am_inference.eval() - print("acoustic model done!") + am_inference, am_name, am_dataset = get_am_inference(args, am_config) # vocoder - # model: {model_name}_{dataset} - voc_name = args.voc[:args.voc.rindex('_')] - voc_class = dynamic_import(voc_name, model_alias) - voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) - if voc_name != 'wavernn': - voc = voc_class(**voc_config["generator_params"]) - voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) - voc.remove_weight_norm() - voc.eval() - else: - voc = voc_class(**voc_config["model"]) - voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"]) - voc.eval() - - voc_mu, voc_std = np.load(args.voc_stat) - voc_mu = paddle.to_tensor(voc_mu) - voc_std = paddle.to_tensor(voc_std) - voc_normalizer = ZScore(voc_mu, voc_std) - voc_inference = voc_inference_class(voc_normalizer, voc) - voc_inference.eval() - print("voc done!") + voc_inference = get_voc_inference(args, voc_config) # whether dygraph to static if args.inference_dir: # acoustic model - if am_name == 'fastspeech2': - if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: - am_inference = jit.to_static( - am_inference, - input_spec=[ - InputSpec([-1], dtype=paddle.int64), - InputSpec([1], dtype=paddle.int64) - ]) - else: - am_inference = jit.to_static( - am_inference, - input_spec=[InputSpec([-1], dtype=paddle.int64)]) - - elif am_name == 'speedyspeech': - if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: - am_inference = jit.to_static( - am_inference, - input_spec=[ - InputSpec([-1], dtype=paddle.int64), # text - InputSpec([-1], dtype=paddle.int64), # tone - InputSpec([1], dtype=paddle.int64), # spk_id - None # duration - ]) - else: - am_inference = jit.to_static( - am_inference, - input_spec=[ - InputSpec([-1], dtype=paddle.int64), - InputSpec([-1], dtype=paddle.int64) - ]) - - elif am_name == 'tacotron2': - am_inference = jit.to_static( - am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)]) - - paddle.jit.save(am_inference, os.path.join(args.inference_dir, args.am)) - am_inference = paddle.jit.load( - os.path.join(args.inference_dir, args.am)) + am_inference = am_to_static(args, am_inference, am_name, am_dataset) # vocoder - voc_inference = jit.to_static( - voc_inference, - input_spec=[ - InputSpec([-1, 80], dtype=paddle.float32), - ]) - paddle.jit.save(voc_inference, - os.path.join(args.inference_dir, args.voc)) - voc_inference = paddle.jit.load( - os.path.join(args.inference_dir, args.voc)) + voc_inference = voc_to_static(args, voc_inference) output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) - merge_sentences = False + merge_sentences = True # Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph # but 
still not stopping in the end (NOTE by yuantian01 Feb 9 2022) if am_name == 'tacotron2': @@ -266,6 +100,8 @@ def evaluate(args): spk_id = paddle.to_tensor(args.spk_id) mel = am_inference(part_phone_ids, spk_id) else: + # import pdb + # pdb.set_trace() mel = am_inference(part_phone_ids) elif am_name == 'speedyspeech': part_tone_ids = tone_ids[i] @@ -298,7 +134,7 @@ def evaluate(args): print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") -def main(): +def parse_args(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser( description="Synthesize with acoustic model & vocoder") @@ -351,7 +187,6 @@ def main(): 'wavernn_csmsc' ], help='Choose vocoder type of tts task.') - parser.add_argument( '--voc_config', type=str, @@ -386,6 +221,11 @@ def main(): parser.add_argument("--output_dir", type=str, help="output dir.") args = parser.parse_args() + return args + + +def main(): + args = parse_args() if args.ngpu == 0: paddle.set_device("cpu") diff --git a/paddlespeech/t2s/exps/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning.py index 3de30774..1afd21df 100644 --- a/paddlespeech/t2s/exps/voice_cloning.py +++ b/paddlespeech/t2s/exps/voice_cloning.py @@ -21,29 +21,12 @@ import soundfile as sf import yaml from yacs.config import CfgNode -from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_voc_inference from paddlespeech.t2s.frontend.zh_frontend import Frontend -from paddlespeech.t2s.modules.normalizer import ZScore from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder -model_alias = { - # acoustic model - "fastspeech2": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2", - "fastspeech2_inference": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", - "tacotron2": - "paddlespeech.t2s.models.tacotron2:Tacotron2", - "tacotron2_inference": - "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", - # voc - "pwgan": - "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", - "pwgan_inference": - "paddlespeech.t2s.models.parallel_wavegan:PWGInference", -} - def voice_cloning(args): # Init body. 
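For reference, the per-utterance loop kept in synthesize_e2e.py above (and, minus the speaker-embedding handling, in the refactored voice_cloning.py below) is essentially text frontend, then acoustic model, then vocoder, then a wav file. A condensed sketch of the single-speaker fastspeech2 / Chinese case, assuming the args, frontend, am_inference, voc_inference, and am_config objects from the previous sketch, plus args.text pointing at a sentence list, with merge_sentences=True so each utterance yields one phone-id tensor:

from pathlib import Path

import paddle
import soundfile as sf

from paddlespeech.t2s.exps.syn_utils import get_sentences

output_dir = Path("output")                 # placeholder output directory
output_dir.mkdir(parents=True, exist_ok=True)
sentences = get_sentences(args)             # (utt_id, sentence) pairs read from args.text

with paddle.no_grad():
    for utt_id, sentence in sentences:
        input_ids = frontend.get_input_ids(sentence, merge_sentences=True)
        phone_ids = input_ids["phone_ids"][0]   # a single tensor when sentences are merged
        mel = am_inference(phone_ids)           # mel spectrogram, frames x n_mels
        wav = voc_inference(mel)                # waveform tensor
        sf.write(
            str(output_dir / (utt_id + ".wav")),
            wav.numpy(),
            samplerate=am_config.fs)
        print(f"{utt_id} done!")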
@@ -79,55 +62,14 @@ def voice_cloning(args): speaker_encoder.eval() print("GE2E Done!") - with open(args.phones_dict, "r") as f: - phn_id = [line.strip().split() for line in f.readlines()] - vocab_size = len(phn_id) - print("vocab_size:", vocab_size) + frontend = Frontend(phone_vocab_path=args.phones_dict) + print("frontend done!") # acoustic model - odim = am_config.n_mels - # model: {model_name}_{dataset} - am_name = args.am[:args.am.rindex('_')] - am_dataset = args.am[args.am.rindex('_') + 1:] - - am_class = dynamic_import(am_name, model_alias) - am_inference_class = dynamic_import(am_name + '_inference', model_alias) - - if am_name == 'fastspeech2': - am = am_class( - idim=vocab_size, odim=odim, spk_num=None, **am_config["model"]) - elif am_name == 'tacotron2': - am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) - - am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) - am.eval() - am_mu, am_std = np.load(args.am_stat) - am_mu = paddle.to_tensor(am_mu) - am_std = paddle.to_tensor(am_std) - am_normalizer = ZScore(am_mu, am_std) - am_inference = am_inference_class(am_normalizer, am) - am_inference.eval() - print("acoustic model done!") + am_inference, *_ = get_am_inference(args, am_config) # vocoder - # model: {model_name}_{dataset} - voc_name = args.voc[:args.voc.rindex('_')] - voc_class = dynamic_import(voc_name, model_alias) - voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) - voc = voc_class(**voc_config["generator_params"]) - voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) - voc.remove_weight_norm() - voc.eval() - voc_mu, voc_std = np.load(args.voc_stat) - voc_mu = paddle.to_tensor(voc_mu) - voc_std = paddle.to_tensor(voc_std) - voc_normalizer = ZScore(voc_mu, voc_std) - voc_inference = voc_inference_class(voc_normalizer, voc) - voc_inference.eval() - print("voc done!") - - frontend = Frontend(phone_vocab_path=args.phones_dict) - print("frontend done!") + voc_inference = get_voc_inference(args, voc_config) output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) @@ -170,7 +112,7 @@ def voice_cloning(args): print(f"{utt_id} done!") -def main(): +def parse_args(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="") parser.add_argument( @@ -240,6 +182,11 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") args = parser.parse_args() + return args + + +def main(): + args = parse_args() if args.ngpu == 0: paddle.set_device("cpu") diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py index 62d707d2..2472c413 100644 --- a/paddlespeech/t2s/modules/predictor/length_regulator.py +++ b/paddlespeech/t2s/modules/predictor/length_regulator.py @@ -101,6 +101,16 @@ class LengthRegulator(nn.Layer): assert alpha > 0 ds = paddle.round(ds.cast(dtype=paddle.float32) * alpha) ds = ds.cast(dtype=paddle.int64) + ''' + from distutils.version import LooseVersion + from paddlespeech.t2s.modules.nets_utils import pad_list + # 这里在 paddle 2.2.2 的动转静是不通的 + # if LooseVersion(paddle.__version__) >= "2.3.0" or hasattr(paddle, 'repeat_interleave'): + # if LooseVersion(paddle.__version__) >= "2.3.0": + if hasattr(paddle, 'repeat_interleave'): + repeat = [paddle.repeat_interleave(x, d, axis=0) for x, d in zip(xs, ds)] + return pad_list(repeat, self.pad_value) + ''' if is_inference: return self.expand(xs, ds) else: From 544c372b5053545a776d1a00319379bdc28535b6 Mon Sep 17 00:00:00 2001 
From: TianYuan Date: Wed, 9 Mar 2022 12:12:44 +0000 Subject: [PATCH 2/2] fix cr, test=tts --- paddlespeech/t2s/exps/csmsc_test.txt | 100 ++++++++++++++++++++++++ paddlespeech/t2s/exps/inference.py | 29 +++++-- paddlespeech/t2s/exps/synthesize_e2e.py | 2 - 3 files changed, 122 insertions(+), 9 deletions(-) create mode 100644 paddlespeech/t2s/exps/csmsc_test.txt diff --git a/paddlespeech/t2s/exps/csmsc_test.txt b/paddlespeech/t2s/exps/csmsc_test.txt new file mode 100644 index 00000000..d8cf367c --- /dev/null +++ b/paddlespeech/t2s/exps/csmsc_test.txt @@ -0,0 +1,100 @@ +009901 昨日,这名伤者与医生全部被警方依法刑事拘留。 +009902 钱伟长想到上海来办学校是经过深思熟虑的。 +009903 她见我一进门就骂,吃饭时也骂,骂得我抬不起头。 +009904 李述德在离开之前,只说了一句柱驼杀父亲了。 +009905 这种车票和保险单捆绑出售属于重复性购买。 +009906 戴佩妮的男友西米露接唱情歌,让她非常开心。 +009907 观大势,谋大局,出大策始终是该院的办院方针。 +009908 他们骑着摩托回家,正好为农忙时的父母帮忙。 +009909 但是因为还没到退休年龄,只能掰着指头捱日子。 +009910 这几天雨水不断,人们恨不得待在家里不出门。 +009911 没想到徐赟,张海翔两人就此玩起了人间蒸发。 +009912 藤村此番发言可能是为了凸显野田的领导能力。 +009913 程长庚,生在清王朝嘉庆年间,安徽的潜山小县。 +009914 南海海域综合补给基地码头项目正在论证中。 +009915 也就是说今晚成都市民极有可能再次看到飘雪。 +009916 随着天气转热,各地的游泳场所开始人头攒动。 +009917 更让徐先生纳闷的是,房客的手机也打不通了。 +009918 遇到颠簸时,应听从乘务员的安全指令,回座位坐好。 +009919 他在后面呆惯了,怕自己一插身后的人会不满,不敢排进去。 +009920 傍晚七个小人回来了,白雪公主说,你们就是我命中的七个小矮人吧。 +009921 他本想说,教育局管这个,他们是一路的,这样一管岂不是妓女起嫖客? +009922 一种表示商品所有权的财物证券,也称商品证券,如提货单,交货单。 +009923 会有很丰富的东西留下来,说都说不完。 +009924 这句话像从天而降,吓得四周一片寂静。 +009925 记者所在的是受害人家属所在的右区。 +009926 不管哈大爷去哪,它都一步不离地跟着。 +009927 大家抬头望去,一只老鼠正趴在吊顶上。 +009928 我决定过年就辞职,接手我爸的废品站! +009929 最终,中国男子乒乓球队获得此奖项。 +009930 防汛抗旱两手抓,抗旱相对抓的不够。 +009931 图们江下游地区开发开放的进展如何? +009932 这要求中国必须有一个坚强的政党领导。 +009933 再说,关于利益上的事俺俩都不好开口。 +009934 明代瓦剌,鞑靼入侵明境也是通过此地。 +009935 咪咪舔着孩子,把它身上的毛舔干净。 +009936 是否这次的国标修订被大企业绑架了? +009937 判决后,姚某妻子胡某不服,提起上诉。 +009938 由此可以看出邯钢的经济效益来自何处。 +009939 琳达说,是瑜伽改变了她和马儿的生活。 +009940 楼下的保安告诉记者,这里不租也不卖。 +009941 习近平说,中斯两国人民传统友谊深厚。 +009942 传闻越来越多,后来连老汉儿自己都怕了。 +009943 我怒吼一声冲上去,举起砖头砸了过去。 +009944 我现在还不会,这就回去问问发明我的人。 +009945 显然,洛阳性奴案不具备上述两个前提。 +009946 另外,杰克逊有文唇线,眼线,眉毛的动作。 +009947 昨晚,华西都市报记者电话采访了尹琪。 +009948 涅拉季科未透露这些航空公司的名称。 +009949 从运行轨迹上来说,它也不可能是星星。 +009950 目前看,如果继续加息也存在两难问题。 +009951 曾宝仪在节目录制现场大爆观众糗事。 +009952 但任凭周某怎么叫,男子仍酣睡不醒。 +009953 老大爷说,小子,你挡我财路了,知道不? +009954 没料到,闯下大头佛的阿伟还不知悔改。 +009955 卡扎菲部落式统治已遭遇部落内讧。 +009956 这个孩子的生命一半来源于另一位女士捐赠的冷冻卵子。 +009957 出现这种泥鳅内阁的局面既是野田有意为之,也实属无奈。 +009958 济青高速济南,华山,章丘,邹平,周村,淄博,临淄站。 +009959 赵凌飞的话,反映了沈阳赛区所有奥运志愿者的共同心声。 +009960 因为,我们所发出的力量必会因难度加大而减弱。 +009961 发生事故的楼梯拐角处仍可看到血迹。 +009962 想过进公安,可能身高不够,老汉儿也不让我进去。 +009963 路上关卡很多,为了方便撤离,只好轻装前进。 +009964 原来比尔盖茨就是美国微软公司联合创始人呀。 +009965 之后他们一家三口将与双方父母往峇里岛旅游。 +009966 谢谢总理,也感谢广大网友的参与,我们明年再见。 +009967 事实上是,从来没有一个欺善怕恶的人能作出过稍大一点的成就。 +009968 我会打开邮件,你可以从那里继续。 +009969 美方对近期东海局势表示关切。 +009970 据悉,奥巴马一家人对这座冬季白宫极为满意。 +009971 打扫完你会很有成就感的,试一试,你就信了。 +009972 诺曼站在滑板车上,各就各位,准备出发啦! +009973 塔河的寒夜,气温降到了零下三十多摄氏度。 +009974 其间,连破六点六,六点五,六点四,六点三五等多个重要关口。 +009975 算命其实只是人们的一种自我安慰和自我暗示而已,我们还是要相信科学才好。 +009976 这一切都令人欢欣鼓舞,阿讷西没理由不坚持到最后。 +009977 直至公元前一万一千年,它又再次出现。 +009978 尽量少玩电脑,少看电视,少打游戏。 +009979 从五到七,前后也就是六个月的时间。 +009980 一进咖啡店,他就遇见一张熟悉的脸。 +009981 好在众弟兄看到了把她追了回来。 +009982 有一个人说,哥们儿我们跑过它才能活。 +009983 捅了她以后,模糊记得她没咋动了。 +009984 从小到大,葛启义没有收到过压岁钱。 +009985 舞台下的你会对舞台上的你说什么? +009986 但考生普遍认为,试题的怪多过难。 +009987 我希望每个人都能够尊重我们的隐私。 +009988 漫天的红霞使劲给两人增添气氛。 +009989 晚上加完班开车回家,太累了,迷迷糊糊开着车,走一半的时候,铛一声! +009990 该车将三人撞倒后,在大雾中逃窜。 +009991 这人一哆嗦,方向盘也把不稳了,差点撞上了高速边道护栏。 +009992 那女孩儿委屈的说,我一回头见你已经进去了我不敢进去啊! +009993 小明摇摇头说,不是,我只是美女看多了,想换个口味而已。 +009994 接下来,红娘要求记者交费,记者表示不知表姐身份证号码。 +009995 李东蓊表示,自己当时在法庭上发表了一次独特的公诉意见。 +009996 另一男子扑了上来,手里拿着明晃晃的长刀,向他胸口直刺。 +009997 今天,快递员拿着一个快递在办公室喊,秦王是哪个,有他快递? +009998 这场抗议活动究竟是如何发展演变的,又究竟是谁伤害了谁? 
+009999 因华国锋肖鸡,墓地设计根据其属相设计。 +010000 在狱中,张明宝悔恨交加,写了一份忏悔书。 diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index cdbf10e6..1188ddfb 100644 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -195,13 +195,29 @@ def main(): sentences = get_sentences(args) merge_sentences = True + fs = 24000 if am_dataset != 'ljspeech' else 22050 + # warmup + for utt_id, sentence in sentences[:3]: + with timer() as t: + am_output_data = get_am_output( + args, + am_predictor=am_predictor, + frontend=frontend, + merge_sentences=merge_sentences, + input=sentence) + wav = get_voc_output( + args, voc_predictor=voc_predictor, input=am_output_data) + speed = wav.size / t.elapse + rtf = fs / speed + print( + f"{utt_id}, mel: {am_output_data.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) + + print("warm up done!") + N = 0 T = 0 - fs = 24000 if am_dataset != 'ljspeech' else 22050 - i = 0 for utt_id, sentence in sentences: - # warmup - i += 1 with timer() as t: am_output_data = get_am_output( args, @@ -212,9 +228,8 @@ def main(): wav = get_voc_output( args, voc_predictor=voc_predictor, input=am_output_data) - if i >= 3: - N += wav.size - T += t.elapse + N += wav.size + T += t.elapse speed = wav.size / t.elapse rtf = fs / speed diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 49be2b40..3d01bdb0 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -100,8 +100,6 @@ def evaluate(args): spk_id = paddle.to_tensor(args.spk_id) mel = am_inference(part_phone_ids, spk_id) else: - # import pdb - # pdb.set_trace() mel = am_inference(part_phone_ids) elif am_name == 'speedyspeech': part_tone_ids = tone_ids[i]