format synthesize, test=tts

pull/1552/head
TianYuan 2 years ago
parent 10ab7aabfe
commit fe8bf2a38c

@ -17,13 +17,92 @@ from pathlib import Path
import numpy import numpy
import soundfile as sf import soundfile as sf
from paddle import inference from paddle import inference
from timer import timer
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.utils import str2bool
def get_predictor(args, filed='am'):
    """Create a Paddle Inference predictor for an exported static model.

    Args:
        args: Parsed command-line namespace. Reads ``args.am`` or
            ``args.voc`` (a ``{model_name}_{dataset}`` tag that is also the
            file stem of the exported model), ``args.inference_dir`` and
            ``args.device``.
        filed: Which model to load: ``'am'`` (acoustic model) or ``'voc'``
            (vocoder). The spelling of this keyword is kept as-is because
            callers pass it by name.

    Returns:
        The predictor returned by ``inference.create_predictor``.

    Raises:
        ValueError: If ``filed`` is neither ``'am'`` nor ``'voc'``.
    """
    if filed == 'am':
        full_name = args.am
    elif filed == 'voc':
        full_name = args.voc
    else:
        # Previously this fell through with an empty name and failed later
        # with an unrelated "substring not found" error from rindex().
        raise ValueError(
            f"filed should be in {{'am', 'voc'}}, got {filed!r}")

    # model: {model_name}_{dataset}
    model_name = full_name[:full_name.rindex('_')]

    inference_dir = Path(args.inference_dir)
    config = inference.Config(
        str(inference_dir / (full_name + ".pdmodel")),
        str(inference_dir / (full_name + ".pdiparams")))

    if args.device == "gpu":
        config.enable_use_gpu(100, 0)
    elif args.device == "cpu":
        config.disable_gpu()

    # Memory optimization must stay disabled for fastspeech2,
    # otherwise it will OOM.
    if model_name != 'fastspeech2':
        config.enable_memory_optim()

    return inference.create_predictor(config)
def get_am_output(args, am_predictor, frontend, merge_sentences, input):
    """Run the acoustic-model predictor on one sentence.

    Args:
        args: Parsed command-line namespace. Reads ``args.am``
            (``{model_name}_{dataset}``), ``args.lang``,
            ``args.speaker_dict`` and, for multi-speaker models,
            ``args.spk_id``.
        am_predictor: Predictor of the acoustic model.
        frontend: Text frontend exposing ``get_input_ids``.
        merge_sentences: Forwarded to ``frontend.get_input_ids``.
        input: The sentence to synthesize.

    Returns:
        The first output of the predictor, fetched with ``copy_to_cpu``.

    Raises:
        ValueError: If ``args.lang`` is neither ``'zh'`` nor ``'en'``.
    """

    def _feed(slot, array):
        # Bind one array to the predictor input at position `slot`.
        handle = am_predictor.get_input_handle(am_input_names[slot])
        handle.reshape(array.shape)
        handle.copy_from_cpu(array)

    # model: {model_name}_{dataset}
    am_name = args.am[:args.am.rindex('_')]
    am_dataset = args.am[args.am.rindex('_') + 1:]
    am_input_names = am_predictor.get_input_names()

    # speedyspeech consumes tone ids in addition to phone ids.
    get_tone_ids = am_name == 'speedyspeech'
    get_spk_id = bool(
        am_dataset in {"aishell3", "vctk"} and args.speaker_dict)

    if args.lang == 'zh':
        input_ids = frontend.get_input_ids(
            input, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids)
    elif args.lang == 'en':
        input_ids = frontend.get_input_ids(
            input, merge_sentences=merge_sentences)
    else:
        # Fail with a clear error instead of printing and then crashing on
        # the undefined input ids further down.
        raise ValueError(
            f"lang should be in {{'zh', 'en'}}, got {args.lang!r}")
    phone_ids = input_ids["phone_ids"]

    # Inputs are fed by position: phones at 0, then tones (when the model
    # takes them), then the speaker id. The speaker id therefore sits at
    # slot 2 for a multi-speaker speedyspeech model (text, tone, spk_id is
    # the input order used when that model is exported) and at slot 1
    # otherwise; feeding both to slot 1 would overwrite the tones.
    next_slot = 1
    if get_tone_ids:
        _feed(next_slot, input_ids["tone_ids"][0].numpy())
        next_slot += 1
    if get_spk_id:
        _feed(next_slot, numpy.array([args.spk_id]))
    _feed(0, phone_ids[0].numpy())

    am_predictor.run()

    am_output_names = am_predictor.get_output_names()
    am_output_handle = am_predictor.get_output_handle(am_output_names[0])
    return am_output_handle.copy_to_cpu()
def get_voc_output(args, voc_predictor, input):
    """Run the vocoder predictor on one acoustic-model output.

    Args:
        args: Parsed command-line namespace (not read by this function).
        voc_predictor: Predictor of the vocoder.
        input: Array fed to the first input of the predictor; its ``shape``
            is used to reshape the input handle.

    Returns:
        The first output of the predictor, fetched with ``copy_to_cpu``.
    """
    # Bind the features to the first (and only used) input of the vocoder.
    first_input = voc_predictor.get_input_names()[0]
    feed = voc_predictor.get_input_handle(first_input)
    feed.reshape(input.shape)
    feed.copy_from_cpu(input)

    voc_predictor.run()

    # The waveform is read back from the first output.
    first_output = voc_predictor.get_output_names()[0]
    return voc_predictor.get_output_handle(first_output).copy_to_cpu()
def parse_args():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Paddle Infernce with speedyspeech & parallel wavegan.") description="Paddle Infernce with speedyspeech & parallel wavegan.")
# acoustic model # acoustic model
@ -70,113 +149,82 @@ def main():
parser.add_argument( parser.add_argument(
"--inference_dir", type=str, help="dir to save inference models") "--inference_dir", type=str, help="dir to save inference models")
parser.add_argument("--output_dir", type=str, help="output dir") parser.add_argument("--output_dir", type=str, help="output dir")
# inference
parser.add_argument(
"--use_trt",
type=str2bool,
default=False,
help="Whether to use inference engin TensorRT.", )
parser.add_argument(
"--int8",
type=str2bool,
default=False,
help="Whether to use int8 inference.", )
parser.add_argument(
"--fp16",
type=str2bool,
default=False,
help="Whether to use float16 inference.", )
parser.add_argument(
"--device",
default="gpu",
choices=["gpu", "cpu"],
help="Device selected for inference.", )
args, _ = parser.parse_known_args() args, _ = parser.parse_known_args()
return args
# only inference for models trained with csmsc now
def main():
args = parse_args()
# frontend # frontend
if args.lang == 'zh': frontend = get_frontend(args)
frontend = Frontend(
phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict)
elif args.lang == 'en':
frontend = English(phone_vocab_path=args.phones_dict)
print("frontend done!")
# am_predictor
am_predictor = get_predictor(args, filed='am')
# model: {model_name}_{dataset} # model: {model_name}_{dataset}
am_name = args.am[:args.am.rindex('_')]
am_dataset = args.am[args.am.rindex('_') + 1:] am_dataset = args.am[args.am.rindex('_') + 1:]
am_config = inference.Config( # voc_predictor
str(Path(args.inference_dir) / (args.am + ".pdmodel")), voc_predictor = get_predictor(args, filed='voc')
str(Path(args.inference_dir) / (args.am + ".pdiparams")))
am_config.enable_use_gpu(100, 0)
# This line must be commented for fastspeech2, if not, it will OOM
if am_name != 'fastspeech2':
am_config.enable_memory_optim()
am_predictor = inference.create_predictor(am_config)
voc_config = inference.Config(
str(Path(args.inference_dir) / (args.voc + ".pdmodel")),
str(Path(args.inference_dir) / (args.voc + ".pdiparams")))
voc_config.enable_use_gpu(100, 0)
voc_config.enable_memory_optim()
voc_predictor = inference.create_predictor(voc_config)
output_dir = Path(args.output_dir) output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
sentences = []
print("in new inference")
# construct dataset for evaluation
sentences = []
with open(args.text, 'rt') as f:
for line in f:
items = line.strip().split()
utt_id = items[0]
if args.lang == 'zh':
sentence = "".join(items[1:])
elif args.lang == 'en':
sentence = " ".join(items[1:])
sentences.append((utt_id, sentence))
get_tone_ids = False sentences = get_sentences(args)
get_spk_id = False
if am_name == 'speedyspeech':
get_tone_ids = True
if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
get_spk_id = True
spk_id = numpy.array([args.spk_id])
am_input_names = am_predictor.get_input_names()
print("am_input_names:", am_input_names)
merge_sentences = True merge_sentences = True
N = 0
T = 0
fs = 24000 if am_dataset != 'ljspeech' else 22050
i = 0
for utt_id, sentence in sentences: for utt_id, sentence in sentences:
if args.lang == 'zh': # warmup
input_ids = frontend.get_input_ids( i += 1
sentence, with timer() as t:
am_output_data = get_am_output(
args,
am_predictor=am_predictor,
frontend=frontend,
merge_sentences=merge_sentences, merge_sentences=merge_sentences,
get_tone_ids=get_tone_ids) input=sentence)
phone_ids = input_ids["phone_ids"] wav = get_voc_output(
elif args.lang == 'en': args, voc_predictor=voc_predictor, input=am_output_data)
input_ids = frontend.get_input_ids(
sentence, merge_sentences=merge_sentences) if i >= 3:
phone_ids = input_ids["phone_ids"] N += wav.size
else: T += t.elapse
print("lang should in {'zh', 'en'}!") speed = wav.size / t.elapse
rtf = fs / speed
if get_tone_ids:
tone_ids = input_ids["tone_ids"]
tones = tone_ids[0].numpy()
tones_handle = am_predictor.get_input_handle(am_input_names[1])
tones_handle.reshape(tones.shape)
tones_handle.copy_from_cpu(tones)
if get_spk_id:
spk_id_handle = am_predictor.get_input_handle(am_input_names[1])
spk_id_handle.reshape(spk_id.shape)
spk_id_handle.copy_from_cpu(spk_id)
phones = phone_ids[0].numpy()
phones_handle = am_predictor.get_input_handle(am_input_names[0])
phones_handle.reshape(phones.shape)
phones_handle.copy_from_cpu(phones)
am_predictor.run()
am_output_names = am_predictor.get_output_names()
am_output_handle = am_predictor.get_output_handle(am_output_names[0])
am_output_data = am_output_handle.copy_to_cpu()
voc_input_names = voc_predictor.get_input_names()
mel_handle = voc_predictor.get_input_handle(voc_input_names[0])
mel_handle.reshape(am_output_data.shape)
mel_handle.copy_from_cpu(am_output_data)
voc_predictor.run()
voc_output_names = voc_predictor.get_output_names()
voc_output_handle = voc_predictor.get_output_handle(voc_output_names[0])
wav = voc_output_handle.copy_to_cpu()
sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000) sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000)
print(
f"{utt_id}, mel: {am_output_data.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
)
print(f"{utt_id} done!") print(f"{utt_id} done!")
print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")
if __name__ == "__main__": if __name__ == "__main__":

@ -0,0 +1,243 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import paddle
from paddle import jit
from paddle.static import InputSpec
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.modules.normalizer import ZScore
# Alias table handed to dynamic_import: each short model tag maps to a
# "module.path:ClassName" string. Every model appears twice, once under its
# bare tag (the network itself) and once under "<tag>_inference" (the class
# that is constructed from a normalizer plus the network).
model_alias = {
# acoustic models
"speedyspeech":
"paddlespeech.t2s.models.speedyspeech:SpeedySpeech",
"speedyspeech_inference":
"paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference",
"fastspeech2":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
"tacotron2":
"paddlespeech.t2s.models.tacotron2:Tacotron2",
"tacotron2_inference":
"paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
# vocoders
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
"pwgan_inference":
"paddlespeech.t2s.models.parallel_wavegan:PWGInference",
"mb_melgan":
"paddlespeech.t2s.models.melgan:MelGANGenerator",
"mb_melgan_inference":
"paddlespeech.t2s.models.melgan:MelGANInference",
"style_melgan":
"paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
"style_melgan_inference":
"paddlespeech.t2s.models.melgan:StyleMelGANInference",
"hifigan":
"paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
"hifigan_inference":
"paddlespeech.t2s.models.hifigan:HiFiGANInference",
"wavernn":
"paddlespeech.t2s.models.wavernn:WaveRNN",
"wavernn_inference":
"paddlespeech.t2s.models.wavernn:WaveRNNInference",
}
# input
def get_sentences(args):
    """Read ``(utt_id, sentence)`` pairs from the text file ``args.text``.

    Each non-blank line is split on whitespace: the first token is the
    utterance id and the remaining tokens form the sentence. The tokens are
    concatenated without a separator for ``'zh'`` and joined with single
    spaces for ``'en'``.

    Args:
        args: Parsed command-line namespace. Reads ``args.text`` and
            ``args.lang``.

    Returns:
        list[tuple[str, str]]: One ``(utt_id, sentence)`` pair per
        non-blank line, in file order.

    Raises:
        ValueError: If the file has content but ``args.lang`` is missing or
            is neither ``'zh'`` nor ``'en'``.
    """
    separators = {'zh': "", 'en': " "}
    lang = getattr(args, 'lang', None)

    # construct dataset for evaluation
    sentences = []
    with open(args.text, 'rt') as f:
        for line in f:
            items = line.strip().split()
            if not items:
                # Blank lines (e.g. a trailing newline) carry no utterance;
                # indexing items[0] on them used to raise IndexError.
                continue
            if lang not in separators:
                # Previously `sentence` was left unbound in this case.
                raise ValueError(
                    f"lang should be in {{'zh', 'en'}}, got {lang!r}")
            utt_id = items[0]
            sentence = separators[lang].join(items[1:])
            sentences.append((utt_id, sentence))
    return sentences
def get_test_dataset(args, test_metadata, am_name, am_dataset):
    """Wrap the test metadata in a ``DataTable`` with model-specific fields.

    Args:
        args: Parsed command-line namespace. Reads ``args.speaker_dict`` for
            fastspeech2 and, when present, ``args.voice_cloning``.
        test_metadata: Metadata passed to ``DataTable`` as ``data``.
        am_name: Acoustic model name: ``'fastspeech2'``, ``'speedyspeech'``
            or ``'tacotron2'``.
        am_dataset: Dataset tag of the acoustic model.

    Returns:
        The ``DataTable`` built from ``test_metadata`` and the selected
        fields.

    Raises:
        ValueError: If ``am_name`` is not one of the supported models.
    """
    if am_name == 'fastspeech2':
        fields = ["utt_id", "text"]
        if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
            print("multiple speaker fastspeech2!")
            fields += ["spk_id"]
        elif 'voice_cloning' in args and args.voice_cloning:
            print("voice cloning!")
            fields += ["spk_emb"]
        else:
            print("single speaker fastspeech2!")
    elif am_name == 'speedyspeech':
        fields = ["utt_id", "phones", "tones"]
    elif am_name == 'tacotron2':
        fields = ["utt_id", "text"]
        if 'voice_cloning' in args and args.voice_cloning:
            print("voice cloning!")
            fields += ["spk_emb"]
    else:
        # Without this branch `fields` stayed unbound and the DataTable
        # construction failed with an UnboundLocalError.
        raise ValueError(f"unsupported acoustic model: {am_name!r}")

    return DataTable(data=test_metadata, fields=fields)
# frontend
def get_frontend(args):
    """Build the text frontend that matches ``args.lang``.

    Args:
        args: Parsed command-line namespace. Reads ``args.lang``,
            ``args.phones_dict`` and, for ``'zh'``, ``args.tones_dict``.

    Returns:
        A ``Frontend`` for ``'zh'`` or an ``English`` frontend for ``'en'``.

    Raises:
        ValueError: If ``args.lang`` is missing or is neither ``'zh'`` nor
            ``'en'``.
    """
    lang = getattr(args, 'lang', None)
    if lang == 'zh':
        frontend = Frontend(
            phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict)
    elif lang == 'en':
        frontend = English(phone_vocab_path=args.phones_dict)
    else:
        # Printing "wrong lang!" and carrying on left `frontend` unbound, so
        # the function crashed at `return` right after reporting success.
        raise ValueError(f"lang should be in {{'zh', 'en'}}, got {lang!r}")
    print("frontend done!")
    return frontend
# dygraph
def get_am_inference(args, am_config):
    """Build the dygraph acoustic model wrapped for inference.

    The vocabulary, tone and speaker counts are the line counts of the
    corresponding dictionary files. The network is built from
    ``am_config["model"]``, loaded from ``args.am_ckpt`` and combined with a
    ``ZScore`` normalizer built from ``args.am_stat``.

    Args:
        args: Parsed command-line namespace. Reads ``args.phones_dict``,
            ``args.am``, ``args.am_ckpt``, ``args.am_stat`` and, when
            present, ``args.tones_dict`` and ``args.speaker_dict``.
        am_config: Acoustic model config providing ``n_mels`` and a
            ``"model"`` section.

    Returns:
        tuple: ``(am_inference, am_name, am_dataset)``.
    """

    def _line_count(path, mode):
        # Every dictionary file holds one entry per line.
        with open(path, mode) as fin:
            return len(fin.readlines())

    vocab_size = _line_count(args.phones_dict, "r")
    print("vocab_size:", vocab_size)

    tone_size = None
    if 'tones_dict' in args and args.tones_dict:
        tone_size = _line_count(args.tones_dict, "r")
        print("tone_size:", tone_size)

    spk_num = None
    if 'speaker_dict' in args and args.speaker_dict:
        spk_num = _line_count(args.speaker_dict, 'rt')
        print("spk_num:", spk_num)

    odim = am_config.n_mels

    # model: {model_name}_{dataset}
    split_at = args.am.rindex('_')
    am_name = args.am[:split_at]
    am_dataset = args.am[split_at + 1:]

    am_class = dynamic_import(am_name, model_alias)
    am_inference_class = dynamic_import(am_name + '_inference', model_alias)

    # Each architecture takes a different set of size arguments.
    if am_name == 'fastspeech2':
        am = am_class(
            idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"])
    elif am_name == 'speedyspeech':
        am = am_class(
            vocab_size=vocab_size,
            tone_size=tone_size,
            spk_num=spk_num,
            **am_config["model"])
    elif am_name == 'tacotron2':
        am = am_class(idim=vocab_size, odim=odim, **am_config["model"])

    checkpoint = paddle.load(args.am_ckpt)
    am.set_state_dict(checkpoint["main_params"])
    am.eval()

    # The stats file unpacks into the mean and std used by the normalizer.
    mean, std = np.load(args.am_stat)
    normalizer = ZScore(paddle.to_tensor(mean), paddle.to_tensor(std))

    am_inference = am_inference_class(normalizer, am)
    am_inference.eval()
    print("acoustic model done!")
    return am_inference, am_name, am_dataset
def get_voc_inference(args, voc_config):
    """Build the dygraph vocoder wrapped for inference.

    Args:
        args: Parsed command-line namespace. Reads ``args.voc``
            (``{model_name}_{dataset}``), ``args.voc_ckpt`` and
            ``args.voc_stat``.
        voc_config: Vocoder config providing ``"generator_params"`` (or
            ``"model"`` for wavernn).

    Returns:
        The vocoder inference object, set to eval mode.
    """
    # model: {model_name}_{dataset}
    voc_name = args.voc[:args.voc.rindex('_')]
    voc_class = dynamic_import(voc_name, model_alias)
    voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)

    # wavernn keeps its settings and weights under different keys than the
    # other vocoders, and has no weight norm to strip.
    is_wavernn = voc_name == 'wavernn'
    if is_wavernn:
        config_key, weights_key = "model", "main_params"
    else:
        config_key, weights_key = "generator_params", "generator_params"

    voc = voc_class(**voc_config[config_key])
    voc.set_state_dict(paddle.load(args.voc_ckpt)[weights_key])
    if not is_wavernn:
        voc.remove_weight_norm()
    voc.eval()

    # The stats file unpacks into the mean and std used by the normalizer.
    mean, std = np.load(args.voc_stat)
    normalizer = ZScore(paddle.to_tensor(mean), paddle.to_tensor(std))

    voc_inference = voc_inference_class(normalizer, voc)
    voc_inference.eval()
    print("voc done!")
    return voc_inference
# to static
def am_to_static(args, am_inference, am_name, am_dataset):
    """Convert the acoustic model to a static graph, save it and reload it.

    The input signature depends on the architecture and on whether the
    model is multi-speaker (dataset ``aishell3``/``vctk`` with a speaker
    dict). A model name outside the known three is saved without an
    explicit ``jit.to_static`` conversion.

    Args:
        args: Parsed command-line namespace. Reads ``args.inference_dir``,
            ``args.am`` and, for fastspeech2/speedyspeech,
            ``args.speaker_dict``.
        am_inference: Dygraph acoustic-model inference object.
        am_name: Acoustic model name.
        am_dataset: Dataset tag of the acoustic model.

    Returns:
        The model loaded back with ``paddle.jit.load``.
    """

    def _id_sequence():
        # int64 ids with shape [-1]
        return InputSpec([-1], dtype=paddle.int64)

    def _speaker_id():
        # int64 speaker id with shape [1]
        return InputSpec([1], dtype=paddle.int64)

    input_spec = None
    if am_name in ('fastspeech2', 'speedyspeech'):
        multi_speaker = am_dataset in {"aishell3", "vctk"} and args.speaker_dict
        if am_name == 'fastspeech2':
            input_spec = [_id_sequence()]
            if multi_speaker:
                input_spec.append(_speaker_id())
        else:
            # text, tone
            input_spec = [_id_sequence(), _id_sequence()]
            if multi_speaker:
                # spk_id, then a None placeholder for duration
                input_spec += [_speaker_id(), None]
    elif am_name == 'tacotron2':
        input_spec = [_id_sequence()]

    if input_spec is not None:
        am_inference = jit.to_static(am_inference, input_spec=input_spec)

    export_prefix = os.path.join(args.inference_dir, args.am)
    paddle.jit.save(am_inference, export_prefix)
    return paddle.jit.load(export_prefix)
def voc_to_static(args, voc_inference):
    """Convert the vocoder to a static graph, save it and reload it.

    Args:
        args: Parsed command-line namespace. Reads ``args.inference_dir``
            and ``args.voc``.
        voc_inference: Dygraph vocoder inference object.

    Returns:
        The model loaded back with ``paddle.jit.load``.
    """
    # Single float32 input of shape [-1, 80].
    feature_spec = InputSpec([-1, 80], dtype=paddle.float32)
    static_voc = jit.to_static(voc_inference, input_spec=[feature_spec])

    export_prefix = os.path.join(args.inference_dir, args.voc)
    paddle.jit.save(static_voc, export_prefix)
    return paddle.jit.load(export_prefix)

@ -23,48 +23,11 @@ import yaml
from timer import timer from timer import timer
from yacs.config import CfgNode from yacs.config import CfgNode
from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.t2s.exps.syn_utils import get_am_inference
from paddlespeech.t2s.datasets.data_table import DataTable from paddlespeech.t2s.exps.syn_utils import get_test_dataset
from paddlespeech.t2s.modules.normalizer import ZScore from paddlespeech.t2s.exps.syn_utils import get_voc_inference
from paddlespeech.t2s.utils import str2bool from paddlespeech.t2s.utils import str2bool
model_alias = {
# acoustic model
"speedyspeech":
"paddlespeech.t2s.models.speedyspeech:SpeedySpeech",
"speedyspeech_inference":
"paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference",
"fastspeech2":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
"tacotron2":
"paddlespeech.t2s.models.tacotron2:Tacotron2",
"tacotron2_inference":
"paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
"pwgan_inference":
"paddlespeech.t2s.models.parallel_wavegan:PWGInference",
"mb_melgan":
"paddlespeech.t2s.models.melgan:MelGANGenerator",
"mb_melgan_inference":
"paddlespeech.t2s.models.melgan:MelGANInference",
"style_melgan":
"paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
"style_melgan_inference":
"paddlespeech.t2s.models.melgan:StyleMelGANInference",
"hifigan":
"paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
"hifigan_inference":
"paddlespeech.t2s.models.hifigan:HiFiGANInference",
"wavernn":
"paddlespeech.t2s.models.wavernn:WaveRNN",
"wavernn_inference":
"paddlespeech.t2s.models.wavernn:WaveRNNInference",
}
def evaluate(args): def evaluate(args):
# dataloader has been too verbose # dataloader has been too verbose
@ -86,96 +49,12 @@ def evaluate(args):
print(am_config) print(am_config)
print(voc_config) print(voc_config)
# construct dataset for evaluation
# model: {model_name}_{dataset}
am_name = args.am[:args.am.rindex('_')]
am_dataset = args.am[args.am.rindex('_') + 1:]
if am_name == 'fastspeech2':
fields = ["utt_id", "text"]
spk_num = None
if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
print("multiple speaker fastspeech2!")
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
fields += ["spk_id"]
elif args.voice_cloning:
print("voice cloning!")
fields += ["spk_emb"]
else:
print("single speaker fastspeech2!")
print("spk_num:", spk_num)
elif am_name == 'speedyspeech':
fields = ["utt_id", "phones", "tones"]
elif am_name == 'tacotron2':
fields = ["utt_id", "text"]
if args.voice_cloning:
print("voice cloning!")
fields += ["spk_emb"]
test_dataset = DataTable(data=test_metadata, fields=fields)
with open(args.phones_dict, "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
tone_size = None
if args.tones_dict:
with open(args.tones_dict, "r") as f:
tone_id = [line.strip().split() for line in f.readlines()]
tone_size = len(tone_id)
print("tone_size:", tone_size)
# acoustic model # acoustic model
odim = am_config.n_mels am_inference, am_name, am_dataset = get_am_inference(args, am_config)
am_class = dynamic_import(am_name, model_alias) test_dataset = get_test_dataset(args, test_metadata, am_name, am_dataset)
am_inference_class = dynamic_import(am_name + '_inference', model_alias)
if am_name == 'fastspeech2':
am = am_class(
idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"])
elif am_name == 'speedyspeech':
am = am_class(
vocab_size=vocab_size, tone_size=tone_size, **am_config["model"])
elif am_name == 'tacotron2':
am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
am.set_state_dict(paddle.load(args.am_ckpt)["main_params"])
am.eval()
am_mu, am_std = np.load(args.am_stat)
am_mu = paddle.to_tensor(am_mu)
am_std = paddle.to_tensor(am_std)
am_normalizer = ZScore(am_mu, am_std)
am_inference = am_inference_class(am_normalizer, am)
print("am_inference.training0:", am_inference.training)
am_inference.eval()
print("acoustic model done!")
# vocoder # vocoder
# model: {model_name}_{dataset} voc_inference = get_voc_inference(args, voc_config)
voc_name = args.voc[:args.voc.rindex('_')]
voc_class = dynamic_import(voc_name, model_alias)
voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
if voc_name != 'wavernn':
voc = voc_class(**voc_config["generator_params"])
voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
voc.remove_weight_norm()
voc.eval()
else:
voc = voc_class(**voc_config["model"])
voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"])
voc.eval()
voc_mu, voc_std = np.load(args.voc_stat)
voc_mu = paddle.to_tensor(voc_mu)
voc_std = paddle.to_tensor(voc_std)
voc_normalizer = ZScore(voc_mu, voc_std)
voc_inference = voc_inference_class(voc_normalizer, voc)
print("voc_inference.training0:", voc_inference.training)
voc_inference.eval()
print("voc done!")
output_dir = Path(args.output_dir) output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
@ -227,7 +106,7 @@ def evaluate(args):
print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
def main(): def parse_args():
# parse args and config and redirect to train_sp # parse args and config and redirect to train_sp
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Synthesize with acoustic model & vocoder") description="Synthesize with acoustic model & vocoder")
@ -264,7 +143,6 @@ def main():
"--tones_dict", type=str, default=None, help="tone vocabulary file.") "--tones_dict", type=str, default=None, help="tone vocabulary file.")
parser.add_argument( parser.add_argument(
"--speaker_dict", type=str, default=None, help="speaker id map file.") "--speaker_dict", type=str, default=None, help="speaker id map file.")
parser.add_argument( parser.add_argument(
"--voice-cloning", "--voice-cloning",
type=str2bool, type=str2bool,
@ -281,7 +159,6 @@ def main():
'style_melgan_csmsc' 'style_melgan_csmsc'
], ],
help='Choose vocoder type of tts task.') help='Choose vocoder type of tts task.')
parser.add_argument( parser.add_argument(
'--voc_config', '--voc_config',
type=str, type=str,
@ -302,7 +179,12 @@ def main():
parser.add_argument("--output_dir", type=str, help="output dir.") parser.add_argument("--output_dir", type=str, help="output dir.")
args = parser.parse_args() args = parser.parse_args()
return args
def main():
args = parse_args()
if args.ngpu == 0: if args.ngpu == 0:
paddle.set_device("cpu") paddle.set_device("cpu")
elif args.ngpu > 0: elif args.ngpu > 0:

@ -12,59 +12,20 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import argparse import argparse
import os
from pathlib import Path from pathlib import Path
import numpy as np
import paddle import paddle
import soundfile as sf import soundfile as sf
import yaml import yaml
from paddle import jit
from paddle.static import InputSpec
from timer import timer from timer import timer
from yacs.config import CfgNode from yacs.config import CfgNode
from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.t2s.exps.syn_utils import am_to_static
from paddlespeech.t2s.frontend import English from paddlespeech.t2s.exps.syn_utils import get_am_inference
from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.modules.normalizer import ZScore from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.exps.syn_utils import get_voc_inference
model_alias = { from paddlespeech.t2s.exps.syn_utils import voc_to_static
# acoustic model
"speedyspeech":
"paddlespeech.t2s.models.speedyspeech:SpeedySpeech",
"speedyspeech_inference":
"paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference",
"fastspeech2":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
"tacotron2":
"paddlespeech.t2s.models.tacotron2:Tacotron2",
"tacotron2_inference":
"paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
"pwgan_inference":
"paddlespeech.t2s.models.parallel_wavegan:PWGInference",
"mb_melgan":
"paddlespeech.t2s.models.melgan:MelGANGenerator",
"mb_melgan_inference":
"paddlespeech.t2s.models.melgan:MelGANInference",
"style_melgan":
"paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
"style_melgan_inference":
"paddlespeech.t2s.models.melgan:StyleMelGANInference",
"hifigan":
"paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
"hifigan_inference":
"paddlespeech.t2s.models.hifigan:HiFiGANInference",
"wavernn":
"paddlespeech.t2s.models.wavernn:WaveRNN",
"wavernn_inference":
"paddlespeech.t2s.models.wavernn:WaveRNNInference",
}
def evaluate(args): def evaluate(args):
@ -81,155 +42,28 @@ def evaluate(args):
print(am_config) print(am_config)
print(voc_config) print(voc_config)
# construct dataset for evaluation sentences = get_sentences(args)
sentences = []
with open(args.text, 'rt') as f:
for line in f:
items = line.strip().split()
utt_id = items[0]
if args.lang == 'zh':
sentence = "".join(items[1:])
elif args.lang == 'en':
sentence = " ".join(items[1:])
sentences.append((utt_id, sentence))
with open(args.phones_dict, "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
tone_size = None
if args.tones_dict:
with open(args.tones_dict, "r") as f:
tone_id = [line.strip().split() for line in f.readlines()]
tone_size = len(tone_id)
print("tone_size:", tone_size)
spk_num = None
if args.speaker_dict:
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
print("spk_num:", spk_num)
# frontend # frontend
if args.lang == 'zh': frontend = get_frontend(args)
frontend = Frontend(
phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict)
elif args.lang == 'en':
frontend = English(phone_vocab_path=args.phones_dict)
print("frontend done!")
# acoustic model # acoustic model
odim = am_config.n_mels am_inference, am_name, am_dataset = get_am_inference(args, am_config)
# model: {model_name}_{dataset}
am_name = args.am[:args.am.rindex('_')]
am_dataset = args.am[args.am.rindex('_') + 1:]
am_class = dynamic_import(am_name, model_alias)
am_inference_class = dynamic_import(am_name + '_inference', model_alias)
if am_name == 'fastspeech2':
am = am_class(
idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"])
elif am_name == 'speedyspeech':
am = am_class(
vocab_size=vocab_size,
tone_size=tone_size,
spk_num=spk_num,
**am_config["model"])
elif am_name == 'tacotron2':
am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
am.set_state_dict(paddle.load(args.am_ckpt)["main_params"])
am.eval()
am_mu, am_std = np.load(args.am_stat)
am_mu = paddle.to_tensor(am_mu)
am_std = paddle.to_tensor(am_std)
am_normalizer = ZScore(am_mu, am_std)
am_inference = am_inference_class(am_normalizer, am)
am_inference.eval()
print("acoustic model done!")
# vocoder # vocoder
# model: {model_name}_{dataset} voc_inference = get_voc_inference(args, voc_config)
voc_name = args.voc[:args.voc.rindex('_')]
voc_class = dynamic_import(voc_name, model_alias)
voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
if voc_name != 'wavernn':
voc = voc_class(**voc_config["generator_params"])
voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
voc.remove_weight_norm()
voc.eval()
else:
voc = voc_class(**voc_config["model"])
voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"])
voc.eval()
voc_mu, voc_std = np.load(args.voc_stat)
voc_mu = paddle.to_tensor(voc_mu)
voc_std = paddle.to_tensor(voc_std)
voc_normalizer = ZScore(voc_mu, voc_std)
voc_inference = voc_inference_class(voc_normalizer, voc)
voc_inference.eval()
print("voc done!")
# whether dygraph to static # whether dygraph to static
if args.inference_dir: if args.inference_dir:
# acoustic model # acoustic model
if am_name == 'fastspeech2': am_inference = am_to_static(args, am_inference, am_name, am_dataset)
if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
am_inference = jit.to_static(
am_inference,
input_spec=[
InputSpec([-1], dtype=paddle.int64),
InputSpec([1], dtype=paddle.int64)
])
else:
am_inference = jit.to_static(
am_inference,
input_spec=[InputSpec([-1], dtype=paddle.int64)])
elif am_name == 'speedyspeech':
if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
am_inference = jit.to_static(
am_inference,
input_spec=[
InputSpec([-1], dtype=paddle.int64), # text
InputSpec([-1], dtype=paddle.int64), # tone
InputSpec([1], dtype=paddle.int64), # spk_id
None # duration
])
else:
am_inference = jit.to_static(
am_inference,
input_spec=[
InputSpec([-1], dtype=paddle.int64),
InputSpec([-1], dtype=paddle.int64)
])
elif am_name == 'tacotron2':
am_inference = jit.to_static(
am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)])
paddle.jit.save(am_inference, os.path.join(args.inference_dir, args.am))
am_inference = paddle.jit.load(
os.path.join(args.inference_dir, args.am))
# vocoder # vocoder
voc_inference = jit.to_static( voc_inference = voc_to_static(args, voc_inference)
voc_inference,
input_spec=[
InputSpec([-1, 80], dtype=paddle.float32),
])
paddle.jit.save(voc_inference,
os.path.join(args.inference_dir, args.voc))
voc_inference = paddle.jit.load(
os.path.join(args.inference_dir, args.voc))
output_dir = Path(args.output_dir) output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
merge_sentences = False merge_sentences = True
# Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph # Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph
# but still not stopping in the end (NOTE by yuantian01 Feb 9 2022) # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)
if am_name == 'tacotron2': if am_name == 'tacotron2':
@ -266,6 +100,8 @@ def evaluate(args):
spk_id = paddle.to_tensor(args.spk_id) spk_id = paddle.to_tensor(args.spk_id)
mel = am_inference(part_phone_ids, spk_id) mel = am_inference(part_phone_ids, spk_id)
else: else:
# import pdb
# pdb.set_trace()
mel = am_inference(part_phone_ids) mel = am_inference(part_phone_ids)
elif am_name == 'speedyspeech': elif am_name == 'speedyspeech':
part_tone_ids = tone_ids[i] part_tone_ids = tone_ids[i]
@ -298,7 +134,7 @@ def evaluate(args):
print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
def main(): def parse_args():
# parse args and config and redirect to train_sp # parse args and config and redirect to train_sp
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Synthesize with acoustic model & vocoder") description="Synthesize with acoustic model & vocoder")
@ -351,7 +187,6 @@ def main():
'wavernn_csmsc' 'wavernn_csmsc'
], ],
help='Choose vocoder type of tts task.') help='Choose vocoder type of tts task.')
parser.add_argument( parser.add_argument(
'--voc_config', '--voc_config',
type=str, type=str,
@ -386,6 +221,11 @@ def main():
parser.add_argument("--output_dir", type=str, help="output dir.") parser.add_argument("--output_dir", type=str, help="output dir.")
args = parser.parse_args() args = parser.parse_args()
return args
def main():
args = parse_args()
if args.ngpu == 0: if args.ngpu == 0:
paddle.set_device("cpu") paddle.set_device("cpu")

@ -21,29 +21,12 @@ import soundfile as sf
import yaml import yaml
from yacs.config import CfgNode from yacs.config import CfgNode
from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.t2s.exps.syn_utils import get_am_inference
from paddlespeech.t2s.exps.syn_utils import get_voc_inference
from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.modules.normalizer import ZScore
from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder
model_alias = {
# acoustic model
"fastspeech2":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
"tacotron2":
"paddlespeech.t2s.models.tacotron2:Tacotron2",
"tacotron2_inference":
"paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
"pwgan_inference":
"paddlespeech.t2s.models.parallel_wavegan:PWGInference",
}
def voice_cloning(args): def voice_cloning(args):
# Init body. # Init body.
@ -79,55 +62,14 @@ def voice_cloning(args):
speaker_encoder.eval() speaker_encoder.eval()
print("GE2E Done!") print("GE2E Done!")
with open(args.phones_dict, "r") as f: frontend = Frontend(phone_vocab_path=args.phones_dict)
phn_id = [line.strip().split() for line in f.readlines()] print("frontend done!")
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
# acoustic model # acoustic model
odim = am_config.n_mels am_inference, *_ = get_am_inference(args, am_config)
# model: {model_name}_{dataset}
am_name = args.am[:args.am.rindex('_')]
am_dataset = args.am[args.am.rindex('_') + 1:]
am_class = dynamic_import(am_name, model_alias)
am_inference_class = dynamic_import(am_name + '_inference', model_alias)
if am_name == 'fastspeech2':
am = am_class(
idim=vocab_size, odim=odim, spk_num=None, **am_config["model"])
elif am_name == 'tacotron2':
am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
am.set_state_dict(paddle.load(args.am_ckpt)["main_params"])
am.eval()
am_mu, am_std = np.load(args.am_stat)
am_mu = paddle.to_tensor(am_mu)
am_std = paddle.to_tensor(am_std)
am_normalizer = ZScore(am_mu, am_std)
am_inference = am_inference_class(am_normalizer, am)
am_inference.eval()
print("acoustic model done!")
# vocoder # vocoder
# model: {model_name}_{dataset} voc_inference = get_voc_inference(args, voc_config)
voc_name = args.voc[:args.voc.rindex('_')]
voc_class = dynamic_import(voc_name, model_alias)
voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
voc = voc_class(**voc_config["generator_params"])
voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
voc.remove_weight_norm()
voc.eval()
voc_mu, voc_std = np.load(args.voc_stat)
voc_mu = paddle.to_tensor(voc_mu)
voc_std = paddle.to_tensor(voc_std)
voc_normalizer = ZScore(voc_mu, voc_std)
voc_inference = voc_inference_class(voc_normalizer, voc)
voc_inference.eval()
print("voc done!")
frontend = Frontend(phone_vocab_path=args.phones_dict)
print("frontend done!")
output_dir = Path(args.output_dir) output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
@ -170,7 +112,7 @@ def voice_cloning(args):
print(f"{utt_id} done!") print(f"{utt_id} done!")
def main(): def parse_args():
# parse args and config and redirect to train_sp # parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="") parser = argparse.ArgumentParser(description="")
parser.add_argument( parser.add_argument(
@ -240,6 +182,11 @@ def main():
parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument("--output-dir", type=str, help="output dir.")
args = parser.parse_args() args = parser.parse_args()
return args
def main():
args = parse_args()
if args.ngpu == 0: if args.ngpu == 0:
paddle.set_device("cpu") paddle.set_device("cpu")

@ -101,6 +101,16 @@ class LengthRegulator(nn.Layer):
assert alpha > 0 assert alpha > 0
ds = paddle.round(ds.cast(dtype=paddle.float32) * alpha) ds = paddle.round(ds.cast(dtype=paddle.float32) * alpha)
ds = ds.cast(dtype=paddle.int64) ds = ds.cast(dtype=paddle.int64)
'''
from distutils.version import LooseVersion
from paddlespeech.t2s.modules.nets_utils import pad_list
# Dynamic-to-static conversion does not work here under paddle 2.2.2
# if LooseVersion(paddle.__version__) >= "2.3.0" or hasattr(paddle, 'repeat_interleave'):
# if LooseVersion(paddle.__version__) >= "2.3.0":
if hasattr(paddle, 'repeat_interleave'):
repeat = [paddle.repeat_interleave(x, d, axis=0) for x, d in zip(xs, ds)]
return pad_list(repeat, self.pad_value)
'''
if is_inference: if is_inference:
return self.expand(xs, ds) return self.expand(xs, ds)
else: else:

Loading…
Cancel
Save