diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index 26d7e2c0..cdbf10e6 100644 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -17,13 +17,92 @@ from pathlib import Path import numpy import soundfile as sf from paddle import inference - -from paddlespeech.t2s.frontend import English -from paddlespeech.t2s.frontend.zh_frontend import Frontend +from timer import timer + +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.exps.syn_utils import get_sentences +from paddlespeech.t2s.utils import str2bool + + +def get_predictor(args, filed='am'): + full_name = '' + if filed == 'am': + full_name = args.am + elif filed == 'voc': + full_name = args.voc + model_name = full_name[:full_name.rindex('_')] + config = inference.Config( + str(Path(args.inference_dir) / (full_name + ".pdmodel")), + str(Path(args.inference_dir) / (full_name + ".pdiparams"))) + if args.device == "gpu": + config.enable_use_gpu(100, 0) + elif args.device == "cpu": + config.disable_gpu() + # This line must be commented for fastspeech2, if not, it will OOM + if model_name != 'fastspeech2': + config.enable_memory_optim() + predictor = inference.create_predictor(config) + return predictor -# only inference for models trained with csmsc now -def main(): +def get_am_output(args, am_predictor, frontend, merge_sentences, input): + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + am_input_names = am_predictor.get_input_names() + get_tone_ids = False + get_spk_id = False + if am_name == 'speedyspeech': + get_tone_ids = True + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + get_spk_id = True + spk_id = numpy.array([args.spk_id]) + if args.lang == 'zh': + input_ids = frontend.get_input_ids( + input, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids) + phone_ids = input_ids["phone_ids"] + elif args.lang == 'en': + input_ids = frontend.get_input_ids( + input, merge_sentences=merge_sentences) + phone_ids = input_ids["phone_ids"] + else: + print("lang should in {'zh', 'en'}!") + + if get_tone_ids: + tone_ids = input_ids["tone_ids"] + tones = tone_ids[0].numpy() + tones_handle = am_predictor.get_input_handle(am_input_names[1]) + tones_handle.reshape(tones.shape) + tones_handle.copy_from_cpu(tones) + if get_spk_id: + spk_id_handle = am_predictor.get_input_handle(am_input_names[1]) + spk_id_handle.reshape(spk_id.shape) + spk_id_handle.copy_from_cpu(spk_id) + phones = phone_ids[0].numpy() + phones_handle = am_predictor.get_input_handle(am_input_names[0]) + phones_handle.reshape(phones.shape) + phones_handle.copy_from_cpu(phones) + + am_predictor.run() + am_output_names = am_predictor.get_output_names() + am_output_handle = am_predictor.get_output_handle(am_output_names[0]) + am_output_data = am_output_handle.copy_to_cpu() + return am_output_data + + +def get_voc_output(args, voc_predictor, input): + voc_input_names = voc_predictor.get_input_names() + mel_handle = voc_predictor.get_input_handle(voc_input_names[0]) + mel_handle.reshape(input.shape) + mel_handle.copy_from_cpu(input) + + voc_predictor.run() + voc_output_names = voc_predictor.get_output_names() + voc_output_handle = voc_predictor.get_output_handle(voc_output_names[0]) + wav = voc_output_handle.copy_to_cpu() + return wav + + +def parse_args(): parser = argparse.ArgumentParser( description="Paddle Infernce with speedyspeech & parallel wavegan.") # acoustic model @@ -70,113 +149,82 @@ def main(): parser.add_argument( 
"--inference_dir", type=str, help="dir to save inference models") parser.add_argument("--output_dir", type=str, help="output dir") + # inference + parser.add_argument( + "--use_trt", + type=str2bool, + default=False, + help="Whether to use inference engin TensorRT.", ) + parser.add_argument( + "--int8", + type=str2bool, + default=False, + help="Whether to use int8 inference.", ) + parser.add_argument( + "--fp16", + type=str2bool, + default=False, + help="Whether to use float16 inference.", ) + parser.add_argument( + "--device", + default="gpu", + choices=["gpu", "cpu"], + help="Device selected for inference.", ) args, _ = parser.parse_known_args() + return args + +# only inference for models trained with csmsc now +def main(): + args = parse_args() # frontend - if args.lang == 'zh': - frontend = Frontend( - phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) - elif args.lang == 'en': - frontend = English(phone_vocab_path=args.phones_dict) - print("frontend done!") + frontend = get_frontend(args) + # am_predictor + am_predictor = get_predictor(args, filed='am') # model: {model_name}_{dataset} - am_name = args.am[:args.am.rindex('_')] am_dataset = args.am[args.am.rindex('_') + 1:] - am_config = inference.Config( - str(Path(args.inference_dir) / (args.am + ".pdmodel")), - str(Path(args.inference_dir) / (args.am + ".pdiparams"))) - am_config.enable_use_gpu(100, 0) - # This line must be commented for fastspeech2, if not, it will OOM - if am_name != 'fastspeech2': - am_config.enable_memory_optim() - am_predictor = inference.create_predictor(am_config) - - voc_config = inference.Config( - str(Path(args.inference_dir) / (args.voc + ".pdmodel")), - str(Path(args.inference_dir) / (args.voc + ".pdiparams"))) - voc_config.enable_use_gpu(100, 0) - voc_config.enable_memory_optim() - voc_predictor = inference.create_predictor(voc_config) + # voc_predictor + voc_predictor = get_predictor(args, filed='voc') output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) - sentences = [] - - print("in new inference") - - # construct dataset for evaluation - sentences = [] - with open(args.text, 'rt') as f: - for line in f: - items = line.strip().split() - utt_id = items[0] - if args.lang == 'zh': - sentence = "".join(items[1:]) - elif args.lang == 'en': - sentence = " ".join(items[1:]) - sentences.append((utt_id, sentence)) - get_tone_ids = False - get_spk_id = False - if am_name == 'speedyspeech': - get_tone_ids = True - if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: - get_spk_id = True - spk_id = numpy.array([args.spk_id]) + sentences = get_sentences(args) - am_input_names = am_predictor.get_input_names() - print("am_input_names:", am_input_names) merge_sentences = True + N = 0 + T = 0 + fs = 24000 if am_dataset != 'ljspeech' else 22050 + i = 0 for utt_id, sentence in sentences: - if args.lang == 'zh': - input_ids = frontend.get_input_ids( - sentence, + # warmup + i += 1 + with timer() as t: + am_output_data = get_am_output( + args, + am_predictor=am_predictor, + frontend=frontend, merge_sentences=merge_sentences, - get_tone_ids=get_tone_ids) - phone_ids = input_ids["phone_ids"] - elif args.lang == 'en': - input_ids = frontend.get_input_ids( - sentence, merge_sentences=merge_sentences) - phone_ids = input_ids["phone_ids"] - else: - print("lang should in {'zh', 'en'}!") - - if get_tone_ids: - tone_ids = input_ids["tone_ids"] - tones = tone_ids[0].numpy() - tones_handle = am_predictor.get_input_handle(am_input_names[1]) - tones_handle.reshape(tones.shape) - 
tones_handle.copy_from_cpu(tones) - if get_spk_id: - spk_id_handle = am_predictor.get_input_handle(am_input_names[1]) - spk_id_handle.reshape(spk_id.shape) - spk_id_handle.copy_from_cpu(spk_id) - phones = phone_ids[0].numpy() - phones_handle = am_predictor.get_input_handle(am_input_names[0]) - phones_handle.reshape(phones.shape) - phones_handle.copy_from_cpu(phones) - - am_predictor.run() - am_output_names = am_predictor.get_output_names() - am_output_handle = am_predictor.get_output_handle(am_output_names[0]) - am_output_data = am_output_handle.copy_to_cpu() - - voc_input_names = voc_predictor.get_input_names() - mel_handle = voc_predictor.get_input_handle(voc_input_names[0]) - mel_handle.reshape(am_output_data.shape) - mel_handle.copy_from_cpu(am_output_data) - - voc_predictor.run() - voc_output_names = voc_predictor.get_output_names() - voc_output_handle = voc_predictor.get_output_handle(voc_output_names[0]) - wav = voc_output_handle.copy_to_cpu() + input=sentence) + wav = get_voc_output( + args, voc_predictor=voc_predictor, input=am_output_data) + + if i >= 3: + N += wav.size + T += t.elapse + speed = wav.size / t.elapse + rtf = fs / speed sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000) + print( + f"{utt_id}, mel: {am_output_data.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) print(f"{utt_id} done!") + print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }") if __name__ == "__main__": diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py new file mode 100644 index 00000000..c52cb372 --- /dev/null +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -0,0 +1,243 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
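The timing loop added to inference.py's main() above skips the first two utterances as warmup, accumulates generated samples in N and wall-clock seconds in T, and reports a real-time factor RTF = fs / (samples produced per second of wall-clock time), so RTF < 1 means faster-than-real-time synthesis. A minimal, self-contained sketch of the same bookkeeping; synthesize() here is a hypothetical placeholder for the am_predictor and voc_predictor calls:

```python
# Sketch of the RTF bookkeeping used in the refactored main(); synthesize() is a placeholder.
import time
import numpy as np

fs = 24000        # output sample rate (22050 for *_ljspeech acoustic models)
N, T = 0, 0.0     # total samples generated / total wall-clock seconds

def synthesize(sentence):
    # stand-in: pretend each sentence yields one second of silence
    time.sleep(0.01)
    return np.zeros(fs, dtype=np.float32)

sentences = ["001 text one", "002 text two", "003 text three", "004 text four"]
for i, sentence in enumerate(sentences, start=1):
    start = time.perf_counter()
    wav = synthesize(sentence)
    elapsed = time.perf_counter() - start
    if i >= 3:                      # first two runs are treated as warmup
        N += wav.size
        T += elapsed
        speed = wav.size / elapsed  # samples produced per wall-clock second
        rtf = fs / speed            # seconds of compute per second of audio
        print(f"Hz: {speed:.0f}, RTF: {rtf:.3f}")

print(f"generation speed: {N / T:.0f}Hz, RTF: {fs / (N / T):.3f}")
```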
+import os + +import numpy as np +import paddle +from paddle import jit +from paddle.static import InputSpec + +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.modules.normalizer import ZScore + +model_alias = { + # acoustic model + "speedyspeech": + "paddlespeech.t2s.models.speedyspeech:SpeedySpeech", + "speedyspeech_inference": + "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference", + "fastspeech2": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2", + "fastspeech2_inference": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + "paddlespeech.t2s.models.tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", + # voc + "pwgan": + "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", + "pwgan_inference": + "paddlespeech.t2s.models.parallel_wavegan:PWGInference", + "mb_melgan": + "paddlespeech.t2s.models.melgan:MelGANGenerator", + "mb_melgan_inference": + "paddlespeech.t2s.models.melgan:MelGANInference", + "style_melgan": + "paddlespeech.t2s.models.melgan:StyleMelGANGenerator", + "style_melgan_inference": + "paddlespeech.t2s.models.melgan:StyleMelGANInference", + "hifigan": + "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", + "hifigan_inference": + "paddlespeech.t2s.models.hifigan:HiFiGANInference", + "wavernn": + "paddlespeech.t2s.models.wavernn:WaveRNN", + "wavernn_inference": + "paddlespeech.t2s.models.wavernn:WaveRNNInference", +} + + +# input +def get_sentences(args): + # construct dataset for evaluation + sentences = [] + with open(args.text, 'rt') as f: + for line in f: + items = line.strip().split() + utt_id = items[0] + if 'lang' in args and args.lang == 'zh': + sentence = "".join(items[1:]) + elif 'lang' in args and args.lang == 'en': + sentence = " ".join(items[1:]) + sentences.append((utt_id, sentence)) + return sentences + + +def get_test_dataset(args, test_metadata, am_name, am_dataset): + if am_name == 'fastspeech2': + fields = ["utt_id", "text"] + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + print("multiple speaker fastspeech2!") + fields += ["spk_id"] + elif 'voice_cloning' in args and args.voice_cloning: + print("voice cloning!") + fields += ["spk_emb"] + else: + print("single speaker fastspeech2!") + elif am_name == 'speedyspeech': + fields = ["utt_id", "phones", "tones"] + elif am_name == 'tacotron2': + fields = ["utt_id", "text"] + if 'voice_cloning' in args and args.voice_cloning: + print("voice cloning!") + fields += ["spk_emb"] + + test_dataset = DataTable(data=test_metadata, fields=fields) + return test_dataset + + +# frontend +def get_frontend(args): + if 'lang' in args and args.lang == 'zh': + frontend = Frontend( + phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) + elif 'lang' in args and args.lang == 'en': + frontend = English(phone_vocab_path=args.phones_dict) + else: + print("wrong lang!") + print("frontend done!") + return frontend + + +# dygraph +def get_am_inference(args, am_config): + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + tone_size = None + if 'tones_dict' in args and args.tones_dict: + with open(args.tones_dict, "r") as f: + tone_id = [line.strip().split() for line in f.readlines()] + tone_size = 
len(tone_id) + print("tone_size:", tone_size) + + spk_num = None + if 'speaker_dict' in args and args.speaker_dict: + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id) + print("spk_num:", spk_num) + + odim = am_config.n_mels + # model: {model_name}_{dataset} + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + + am_class = dynamic_import(am_name, model_alias) + am_inference_class = dynamic_import(am_name + '_inference', model_alias) + + if am_name == 'fastspeech2': + am = am_class( + idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"]) + elif am_name == 'speedyspeech': + am = am_class( + vocab_size=vocab_size, + tone_size=tone_size, + spk_num=spk_num, + **am_config["model"]) + elif am_name == 'tacotron2': + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) + + am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) + am.eval() + am_mu, am_std = np.load(args.am_stat) + am_mu = paddle.to_tensor(am_mu) + am_std = paddle.to_tensor(am_std) + am_normalizer = ZScore(am_mu, am_std) + am_inference = am_inference_class(am_normalizer, am) + am_inference.eval() + print("acoustic model done!") + return am_inference, am_name, am_dataset + + +def get_voc_inference(args, voc_config): + # model: {model_name}_{dataset} + voc_name = args.voc[:args.voc.rindex('_')] + voc_class = dynamic_import(voc_name, model_alias) + voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) + if voc_name != 'wavernn': + voc = voc_class(**voc_config["generator_params"]) + voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) + voc.remove_weight_norm() + voc.eval() + else: + voc = voc_class(**voc_config["model"]) + voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"]) + voc.eval() + + voc_mu, voc_std = np.load(args.voc_stat) + voc_mu = paddle.to_tensor(voc_mu) + voc_std = paddle.to_tensor(voc_std) + voc_normalizer = ZScore(voc_mu, voc_std) + voc_inference = voc_inference_class(voc_normalizer, voc) + voc_inference.eval() + print("voc done!") + return voc_inference + + +# to static +def am_to_static(args, am_inference, am_name, am_dataset): + if am_name == 'fastspeech2': + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + am_inference = jit.to_static( + am_inference, + input_spec=[ + InputSpec([-1], dtype=paddle.int64), + InputSpec([1], dtype=paddle.int64), + ], ) + else: + am_inference = jit.to_static( + am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)]) + + elif am_name == 'speedyspeech': + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + am_inference = jit.to_static( + am_inference, + input_spec=[ + InputSpec([-1], dtype=paddle.int64), # text + InputSpec([-1], dtype=paddle.int64), # tone + InputSpec([1], dtype=paddle.int64), # spk_id + None # duration + ]) + else: + am_inference = jit.to_static( + am_inference, + input_spec=[ + InputSpec([-1], dtype=paddle.int64), + InputSpec([-1], dtype=paddle.int64) + ]) + + elif am_name == 'tacotron2': + am_inference = jit.to_static( + am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)]) + + paddle.jit.save(am_inference, os.path.join(args.inference_dir, args.am)) + am_inference = paddle.jit.load(os.path.join(args.inference_dir, args.am)) + return am_inference + + +def voc_to_static(args, voc_inference): + voc_inference = jit.to_static( + voc_inference, input_spec=[ + InputSpec([-1, 80], dtype=paddle.float32), + ]) + paddle.jit.save(voc_inference, 
os.path.join(args.inference_dir, args.voc)) + voc_inference = paddle.jit.load(os.path.join(args.inference_dir, args.voc)) + return voc_inference diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index 81da14f2..426b7617 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -23,48 +23,11 @@ import yaml from timer import timer from yacs.config import CfgNode -from paddlespeech.s2t.utils.dynamic_import import dynamic_import -from paddlespeech.t2s.datasets.data_table import DataTable -from paddlespeech.t2s.modules.normalizer import ZScore +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_test_dataset +from paddlespeech.t2s.exps.syn_utils import get_voc_inference from paddlespeech.t2s.utils import str2bool -model_alias = { - # acoustic model - "speedyspeech": - "paddlespeech.t2s.models.speedyspeech:SpeedySpeech", - "speedyspeech_inference": - "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference", - "fastspeech2": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2", - "fastspeech2_inference": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", - "tacotron2": - "paddlespeech.t2s.models.tacotron2:Tacotron2", - "tacotron2_inference": - "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", - # voc - "pwgan": - "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", - "pwgan_inference": - "paddlespeech.t2s.models.parallel_wavegan:PWGInference", - "mb_melgan": - "paddlespeech.t2s.models.melgan:MelGANGenerator", - "mb_melgan_inference": - "paddlespeech.t2s.models.melgan:MelGANInference", - "style_melgan": - "paddlespeech.t2s.models.melgan:StyleMelGANGenerator", - "style_melgan_inference": - "paddlespeech.t2s.models.melgan:StyleMelGANInference", - "hifigan": - "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", - "hifigan_inference": - "paddlespeech.t2s.models.hifigan:HiFiGANInference", - "wavernn": - "paddlespeech.t2s.models.wavernn:WaveRNN", - "wavernn_inference": - "paddlespeech.t2s.models.wavernn:WaveRNNInference", -} - def evaluate(args): # dataloader has been too verbose @@ -86,96 +49,12 @@ def evaluate(args): print(am_config) print(voc_config) - # construct dataset for evaluation - - # model: {model_name}_{dataset} - am_name = args.am[:args.am.rindex('_')] - am_dataset = args.am[args.am.rindex('_') + 1:] - - if am_name == 'fastspeech2': - fields = ["utt_id", "text"] - spk_num = None - if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: - print("multiple speaker fastspeech2!") - with open(args.speaker_dict, 'rt') as f: - spk_id = [line.strip().split() for line in f.readlines()] - spk_num = len(spk_id) - fields += ["spk_id"] - elif args.voice_cloning: - print("voice cloning!") - fields += ["spk_emb"] - else: - print("single speaker fastspeech2!") - print("spk_num:", spk_num) - elif am_name == 'speedyspeech': - fields = ["utt_id", "phones", "tones"] - elif am_name == 'tacotron2': - fields = ["utt_id", "text"] - if args.voice_cloning: - print("voice cloning!") - fields += ["spk_emb"] - - test_dataset = DataTable(data=test_metadata, fields=fields) - - with open(args.phones_dict, "r") as f: - phn_id = [line.strip().split() for line in f.readlines()] - vocab_size = len(phn_id) - print("vocab_size:", vocab_size) - - tone_size = None - if args.tones_dict: - with open(args.tones_dict, "r") as f: - tone_id = [line.strip().split() for line in f.readlines()] - tone_size = len(tone_id) - print("tone_size:", tone_size) - # acoustic model - odim = 
am_config.n_mels - am_class = dynamic_import(am_name, model_alias) - am_inference_class = dynamic_import(am_name + '_inference', model_alias) - - if am_name == 'fastspeech2': - am = am_class( - idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"]) - elif am_name == 'speedyspeech': - am = am_class( - vocab_size=vocab_size, tone_size=tone_size, **am_config["model"]) - elif am_name == 'tacotron2': - am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) - - am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) - am.eval() - am_mu, am_std = np.load(args.am_stat) - am_mu = paddle.to_tensor(am_mu) - am_std = paddle.to_tensor(am_std) - am_normalizer = ZScore(am_mu, am_std) - am_inference = am_inference_class(am_normalizer, am) - print("am_inference.training0:", am_inference.training) - am_inference.eval() - print("acoustic model done!") + am_inference, am_name, am_dataset = get_am_inference(args, am_config) + test_dataset = get_test_dataset(args, test_metadata, am_name, am_dataset) # vocoder - # model: {model_name}_{dataset} - voc_name = args.voc[:args.voc.rindex('_')] - voc_class = dynamic_import(voc_name, model_alias) - voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) - if voc_name != 'wavernn': - voc = voc_class(**voc_config["generator_params"]) - voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) - voc.remove_weight_norm() - voc.eval() - else: - voc = voc_class(**voc_config["model"]) - voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"]) - voc.eval() - voc_mu, voc_std = np.load(args.voc_stat) - voc_mu = paddle.to_tensor(voc_mu) - voc_std = paddle.to_tensor(voc_std) - voc_normalizer = ZScore(voc_mu, voc_std) - voc_inference = voc_inference_class(voc_normalizer, voc) - print("voc_inference.training0:", voc_inference.training) - voc_inference.eval() - print("voc done!") + voc_inference = get_voc_inference(args, voc_config) output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) @@ -227,7 +106,7 @@ def evaluate(args): print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") -def main(): +def parse_args(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser( description="Synthesize with acoustic model & vocoder") @@ -264,7 +143,6 @@ def main(): "--tones_dict", type=str, default=None, help="tone vocabulary file.") parser.add_argument( "--speaker_dict", type=str, default=None, help="speaker id map file.") - parser.add_argument( "--voice-cloning", type=str2bool, @@ -281,7 +159,6 @@ def main(): 'style_melgan_csmsc' ], help='Choose vocoder type of tts task.') - parser.add_argument( '--voc_config', type=str, @@ -302,7 +179,12 @@ def main(): parser.add_argument("--output_dir", type=str, help="output dir.") args = parser.parse_args() + return args + + +def main(): + args = parse_args() if args.ngpu == 0: paddle.set_device("cpu") elif args.ngpu > 0: diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 94180f85..49be2b40 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -12,59 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
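Both synthesize.py and the new syn_utils.py resolve model and inference-wrapper classes through dynamic_import() and the shared model_alias table, whose values are "module:ClassName" strings. The helper itself lives in paddlespeech.s2t.utils.dynamic_import; the sketch below only illustrates that lookup pattern under this assumption and is not the actual implementation:

```python
# Illustrative alias-based dynamic import, assuming "module:ClassName" strings
# like those in model_alias; not the paddlespeech implementation itself.
import importlib

def import_alias(name: str, alias: dict):
    path = alias.get(name, name)  # e.g. "pwgan" -> "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator"
    module_name, class_name = path.split(":")
    module = importlib.import_module(module_name)
    return getattr(module, class_name)

# Mirrors the call sites in get_am_inference() / get_voc_inference():
# am_class = import_alias(am_name, model_alias)
# am_inference_class = import_alias(am_name + "_inference", model_alias)
```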
import argparse -import os from pathlib import Path -import numpy as np import paddle import soundfile as sf import yaml -from paddle import jit -from paddle.static import InputSpec from timer import timer from yacs.config import CfgNode -from paddlespeech.s2t.utils.dynamic_import import dynamic_import -from paddlespeech.t2s.frontend import English -from paddlespeech.t2s.frontend.zh_frontend import Frontend -from paddlespeech.t2s.modules.normalizer import ZScore - -model_alias = { - # acoustic model - "speedyspeech": - "paddlespeech.t2s.models.speedyspeech:SpeedySpeech", - "speedyspeech_inference": - "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference", - "fastspeech2": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2", - "fastspeech2_inference": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", - "tacotron2": - "paddlespeech.t2s.models.tacotron2:Tacotron2", - "tacotron2_inference": - "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", - # voc - "pwgan": - "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", - "pwgan_inference": - "paddlespeech.t2s.models.parallel_wavegan:PWGInference", - "mb_melgan": - "paddlespeech.t2s.models.melgan:MelGANGenerator", - "mb_melgan_inference": - "paddlespeech.t2s.models.melgan:MelGANInference", - "style_melgan": - "paddlespeech.t2s.models.melgan:StyleMelGANGenerator", - "style_melgan_inference": - "paddlespeech.t2s.models.melgan:StyleMelGANInference", - "hifigan": - "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", - "hifigan_inference": - "paddlespeech.t2s.models.hifigan:HiFiGANInference", - "wavernn": - "paddlespeech.t2s.models.wavernn:WaveRNN", - "wavernn_inference": - "paddlespeech.t2s.models.wavernn:WaveRNNInference", -} +from paddlespeech.t2s.exps.syn_utils import am_to_static +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.exps.syn_utils import get_sentences +from paddlespeech.t2s.exps.syn_utils import get_voc_inference +from paddlespeech.t2s.exps.syn_utils import voc_to_static def evaluate(args): @@ -81,155 +42,28 @@ def evaluate(args): print(am_config) print(voc_config) - # construct dataset for evaluation - sentences = [] - with open(args.text, 'rt') as f: - for line in f: - items = line.strip().split() - utt_id = items[0] - if args.lang == 'zh': - sentence = "".join(items[1:]) - elif args.lang == 'en': - sentence = " ".join(items[1:]) - sentences.append((utt_id, sentence)) - - with open(args.phones_dict, "r") as f: - phn_id = [line.strip().split() for line in f.readlines()] - vocab_size = len(phn_id) - print("vocab_size:", vocab_size) - - tone_size = None - if args.tones_dict: - with open(args.tones_dict, "r") as f: - tone_id = [line.strip().split() for line in f.readlines()] - tone_size = len(tone_id) - print("tone_size:", tone_size) - - spk_num = None - if args.speaker_dict: - with open(args.speaker_dict, 'rt') as f: - spk_id = [line.strip().split() for line in f.readlines()] - spk_num = len(spk_id) - print("spk_num:", spk_num) + sentences = get_sentences(args) # frontend - if args.lang == 'zh': - frontend = Frontend( - phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) - elif args.lang == 'en': - frontend = English(phone_vocab_path=args.phones_dict) - print("frontend done!") + frontend = get_frontend(args) # acoustic model - odim = am_config.n_mels - # model: {model_name}_{dataset} - am_name = args.am[:args.am.rindex('_')] - am_dataset = args.am[args.am.rindex('_') + 1:] - - am_class = 
dynamic_import(am_name, model_alias) - am_inference_class = dynamic_import(am_name + '_inference', model_alias) - - if am_name == 'fastspeech2': - am = am_class( - idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"]) - elif am_name == 'speedyspeech': - am = am_class( - vocab_size=vocab_size, - tone_size=tone_size, - spk_num=spk_num, - **am_config["model"]) - elif am_name == 'tacotron2': - am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) - - am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) - am.eval() - am_mu, am_std = np.load(args.am_stat) - am_mu = paddle.to_tensor(am_mu) - am_std = paddle.to_tensor(am_std) - am_normalizer = ZScore(am_mu, am_std) - am_inference = am_inference_class(am_normalizer, am) - am_inference.eval() - print("acoustic model done!") + am_inference, am_name, am_dataset = get_am_inference(args, am_config) # vocoder - # model: {model_name}_{dataset} - voc_name = args.voc[:args.voc.rindex('_')] - voc_class = dynamic_import(voc_name, model_alias) - voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) - if voc_name != 'wavernn': - voc = voc_class(**voc_config["generator_params"]) - voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) - voc.remove_weight_norm() - voc.eval() - else: - voc = voc_class(**voc_config["model"]) - voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"]) - voc.eval() - - voc_mu, voc_std = np.load(args.voc_stat) - voc_mu = paddle.to_tensor(voc_mu) - voc_std = paddle.to_tensor(voc_std) - voc_normalizer = ZScore(voc_mu, voc_std) - voc_inference = voc_inference_class(voc_normalizer, voc) - voc_inference.eval() - print("voc done!") + voc_inference = get_voc_inference(args, voc_config) # whether dygraph to static if args.inference_dir: # acoustic model - if am_name == 'fastspeech2': - if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: - am_inference = jit.to_static( - am_inference, - input_spec=[ - InputSpec([-1], dtype=paddle.int64), - InputSpec([1], dtype=paddle.int64) - ]) - else: - am_inference = jit.to_static( - am_inference, - input_spec=[InputSpec([-1], dtype=paddle.int64)]) - - elif am_name == 'speedyspeech': - if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: - am_inference = jit.to_static( - am_inference, - input_spec=[ - InputSpec([-1], dtype=paddle.int64), # text - InputSpec([-1], dtype=paddle.int64), # tone - InputSpec([1], dtype=paddle.int64), # spk_id - None # duration - ]) - else: - am_inference = jit.to_static( - am_inference, - input_spec=[ - InputSpec([-1], dtype=paddle.int64), - InputSpec([-1], dtype=paddle.int64) - ]) - - elif am_name == 'tacotron2': - am_inference = jit.to_static( - am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)]) - - paddle.jit.save(am_inference, os.path.join(args.inference_dir, args.am)) - am_inference = paddle.jit.load( - os.path.join(args.inference_dir, args.am)) + am_inference = am_to_static(args, am_inference, am_name, am_dataset) # vocoder - voc_inference = jit.to_static( - voc_inference, - input_spec=[ - InputSpec([-1, 80], dtype=paddle.float32), - ]) - paddle.jit.save(voc_inference, - os.path.join(args.inference_dir, args.voc)) - voc_inference = paddle.jit.load( - os.path.join(args.inference_dir, args.voc)) + voc_inference = voc_to_static(args, voc_inference) output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) - merge_sentences = False + merge_sentences = True # Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph # but 
still not stopping in the end (NOTE by yuantian01 Feb 9 2022) if am_name == 'tacotron2': @@ -266,6 +100,8 @@ def evaluate(args): spk_id = paddle.to_tensor(args.spk_id) mel = am_inference(part_phone_ids, spk_id) else: + # import pdb + # pdb.set_trace() mel = am_inference(part_phone_ids) elif am_name == 'speedyspeech': part_tone_ids = tone_ids[i] @@ -298,7 +134,7 @@ def evaluate(args): print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") -def main(): +def parse_args(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser( description="Synthesize with acoustic model & vocoder") @@ -351,7 +187,6 @@ def main(): 'wavernn_csmsc' ], help='Choose vocoder type of tts task.') - parser.add_argument( '--voc_config', type=str, @@ -386,6 +221,11 @@ def main(): parser.add_argument("--output_dir", type=str, help="output dir.") args = parser.parse_args() + return args + + +def main(): + args = parse_args() if args.ngpu == 0: paddle.set_device("cpu") diff --git a/paddlespeech/t2s/exps/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning.py index 3de30774..1afd21df 100644 --- a/paddlespeech/t2s/exps/voice_cloning.py +++ b/paddlespeech/t2s/exps/voice_cloning.py @@ -21,29 +21,12 @@ import soundfile as sf import yaml from yacs.config import CfgNode -from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_voc_inference from paddlespeech.t2s.frontend.zh_frontend import Frontend -from paddlespeech.t2s.modules.normalizer import ZScore from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder -model_alias = { - # acoustic model - "fastspeech2": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2", - "fastspeech2_inference": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", - "tacotron2": - "paddlespeech.t2s.models.tacotron2:Tacotron2", - "tacotron2_inference": - "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", - # voc - "pwgan": - "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", - "pwgan_inference": - "paddlespeech.t2s.models.parallel_wavegan:PWGInference", -} - def voice_cloning(args): # Init body. 
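am_to_static() and voc_to_static() in syn_utils.py wrap the dygraph inference modules with paddle.jit.to_static, declaring variable-length inputs through InputSpec: shape [-1] int64 for phone (and tone) ids, [1] int64 for spk_id, and [-1, 80] float32 for the vocoder's mel input, then round-trip through paddle.jit.save / paddle.jit.load just as synthesize_e2e.py did inline before this refactor. A toy example of the same export pattern; the ToyAM layer is hypothetical and only paddle is assumed:

```python
# Toy dygraph-to-static export following the am_to_static() pattern; ToyAM is hypothetical.
import os
import paddle
from paddle import jit, nn
from paddle.static import InputSpec

class ToyAM(nn.Layer):
    """Hypothetical stand-in for an acoustic-model inference wrapper."""
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(10, 80)

    def forward(self, phone_ids):
        # (T,) int64 phone ids -> (T, 80) "mel" frames
        return self.emb(phone_ids)

am = ToyAM()
am.eval()
# A dynamic-length 1-D int64 input, as declared for fastspeech2 / tacotron2 above.
am_static = jit.to_static(am, input_spec=[InputSpec([-1], dtype=paddle.int64)])
os.makedirs("exported", exist_ok=True)
paddle.jit.save(am_static, os.path.join("exported", "toy_am"))
reloaded = paddle.jit.load(os.path.join("exported", "toy_am"))
print(reloaded(paddle.to_tensor([1, 2, 3], dtype=paddle.int64)).shape)  # [3, 80]
```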
@@ -79,55 +62,14 @@ def voice_cloning(args): speaker_encoder.eval() print("GE2E Done!") - with open(args.phones_dict, "r") as f: - phn_id = [line.strip().split() for line in f.readlines()] - vocab_size = len(phn_id) - print("vocab_size:", vocab_size) + frontend = Frontend(phone_vocab_path=args.phones_dict) + print("frontend done!") # acoustic model - odim = am_config.n_mels - # model: {model_name}_{dataset} - am_name = args.am[:args.am.rindex('_')] - am_dataset = args.am[args.am.rindex('_') + 1:] - - am_class = dynamic_import(am_name, model_alias) - am_inference_class = dynamic_import(am_name + '_inference', model_alias) - - if am_name == 'fastspeech2': - am = am_class( - idim=vocab_size, odim=odim, spk_num=None, **am_config["model"]) - elif am_name == 'tacotron2': - am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) - - am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) - am.eval() - am_mu, am_std = np.load(args.am_stat) - am_mu = paddle.to_tensor(am_mu) - am_std = paddle.to_tensor(am_std) - am_normalizer = ZScore(am_mu, am_std) - am_inference = am_inference_class(am_normalizer, am) - am_inference.eval() - print("acoustic model done!") + am_inference, *_ = get_am_inference(args, am_config) # vocoder - # model: {model_name}_{dataset} - voc_name = args.voc[:args.voc.rindex('_')] - voc_class = dynamic_import(voc_name, model_alias) - voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) - voc = voc_class(**voc_config["generator_params"]) - voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) - voc.remove_weight_norm() - voc.eval() - voc_mu, voc_std = np.load(args.voc_stat) - voc_mu = paddle.to_tensor(voc_mu) - voc_std = paddle.to_tensor(voc_std) - voc_normalizer = ZScore(voc_mu, voc_std) - voc_inference = voc_inference_class(voc_normalizer, voc) - voc_inference.eval() - print("voc done!") - - frontend = Frontend(phone_vocab_path=args.phones_dict) - print("frontend done!") + voc_inference = get_voc_inference(args, voc_config) output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) @@ -170,7 +112,7 @@ def voice_cloning(args): print(f"{utt_id} done!") -def main(): +def parse_args(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="") parser.add_argument( @@ -240,6 +182,11 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") args = parser.parse_args() + return args + + +def main(): + args = parse_args() if args.ngpu == 0: paddle.set_device("cpu") diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py index 62d707d2..2472c413 100644 --- a/paddlespeech/t2s/modules/predictor/length_regulator.py +++ b/paddlespeech/t2s/modules/predictor/length_regulator.py @@ -101,6 +101,16 @@ class LengthRegulator(nn.Layer): assert alpha > 0 ds = paddle.round(ds.cast(dtype=paddle.float32) * alpha) ds = ds.cast(dtype=paddle.int64) + ''' + from distutils.version import LooseVersion + from paddlespeech.t2s.modules.nets_utils import pad_list + # 这里在 paddle 2.2.2 的动转静是不通的 + # if LooseVersion(paddle.__version__) >= "2.3.0" or hasattr(paddle, 'repeat_interleave'): + # if LooseVersion(paddle.__version__) >= "2.3.0": + if hasattr(paddle, 'repeat_interleave'): + repeat = [paddle.repeat_interleave(x, d, axis=0) for x, d in zip(xs, ds)] + return pad_list(repeat, self.pad_value) + ''' if is_inference: return self.expand(xs, ds) else:
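The block added to LengthRegulator (quoted out with a triple-quoted string) keeps an alternative expansion path based on paddle.repeat_interleave; the Chinese comment notes that dygraph-to-static conversion of this path does not work on paddle 2.2.2, which is why it stays disabled in favor of self.expand(). A standalone sketch of what that expansion computes, written with numpy so it does not depend on the paddle version:

```python
# Duration-based length regulation, equivalent in spirit to the disabled
# repeat_interleave path; numpy is used here purely for illustration.
import numpy as np

def expand_one(x: np.ndarray, d: np.ndarray) -> np.ndarray:
    """Repeat each phone-level row x[i] exactly d[i] times along the time axis."""
    return np.repeat(x, d, axis=0)

def pad_batch(seqs, pad_value=0.0):
    """Right-pad a list of (T_i, C) arrays to the longest T_i, like pad_list."""
    max_len = max(s.shape[0] for s in seqs)
    out = np.full((len(seqs), max_len, seqs[0].shape[1]), pad_value, dtype=seqs[0].dtype)
    for i, s in enumerate(seqs):
        out[i, :s.shape[0]] = s
    return out

xs = [np.arange(6, dtype=np.float32).reshape(3, 2)]  # one utterance: 3 phones, 2-dim features
ds = [np.array([1, 0, 2])]                           # predicted durations per phone
batch = pad_batch([expand_one(x, d) for x, d in zip(xs, ds)])
print(batch.shape)  # (1, 3, 2): phone 0 kept once, phone 1 dropped, phone 2 repeated twice
```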