|
|
@ -20,6 +20,7 @@ import numpy as np
|
|
|
|
import paddle
|
|
|
|
import paddle
|
|
|
|
import soundfile as sf
|
|
|
|
import soundfile as sf
|
|
|
|
import yaml
|
|
|
|
import yaml
|
|
|
|
|
|
|
|
from timer import timer
|
|
|
|
from yacs.config import CfgNode
|
|
|
|
from yacs.config import CfgNode
|
|
|
|
|
|
|
|
|
|
|
|
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
|
|
|
|
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
|
|
|
@ -50,6 +51,18 @@ model_alias = {
|
|
|
|
"paddlespeech.t2s.models.melgan:MelGANGenerator",
|
|
|
|
"paddlespeech.t2s.models.melgan:MelGANGenerator",
|
|
|
|
"mb_melgan_inference":
|
|
|
|
"mb_melgan_inference":
|
|
|
|
"paddlespeech.t2s.models.melgan:MelGANInference",
|
|
|
|
"paddlespeech.t2s.models.melgan:MelGANInference",
|
|
|
|
|
|
|
|
"style_melgan":
|
|
|
|
|
|
|
|
"paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
|
|
|
|
|
|
|
|
"style_melgan_inference":
|
|
|
|
|
|
|
|
"paddlespeech.t2s.models.melgan:StyleMelGANInference",
|
|
|
|
|
|
|
|
"hifigan":
|
|
|
|
|
|
|
|
"paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
|
|
|
|
|
|
|
|
"hifigan_inference":
|
|
|
|
|
|
|
|
"paddlespeech.t2s.models.hifigan:HiFiGANInference",
|
|
|
|
|
|
|
|
"wavernn":
|
|
|
|
|
|
|
|
"paddlespeech.t2s.models.wavernn:WaveRNN",
|
|
|
|
|
|
|
|
"wavernn_inference":
|
|
|
|
|
|
|
|
"paddlespeech.t2s.models.wavernn:WaveRNNInference",
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -146,10 +159,15 @@ def evaluate(args):
|
|
|
|
voc_name = args.voc[:args.voc.rindex('_')]
|
|
|
|
voc_name = args.voc[:args.voc.rindex('_')]
|
|
|
|
voc_class = dynamic_import(voc_name, model_alias)
|
|
|
|
voc_class = dynamic_import(voc_name, model_alias)
|
|
|
|
voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
|
|
|
|
voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
|
|
|
|
|
|
|
|
if voc_name != 'wavernn':
|
|
|
|
voc = voc_class(**voc_config["generator_params"])
|
|
|
|
voc = voc_class(**voc_config["generator_params"])
|
|
|
|
voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
|
|
|
|
voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
|
|
|
|
voc.remove_weight_norm()
|
|
|
|
voc.remove_weight_norm()
|
|
|
|
voc.eval()
|
|
|
|
voc.eval()
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
voc = voc_class(**voc_config["model"])
|
|
|
|
|
|
|
|
voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"])
|
|
|
|
|
|
|
|
voc.eval()
|
|
|
|
voc_mu, voc_std = np.load(args.voc_stat)
|
|
|
|
voc_mu, voc_std = np.load(args.voc_stat)
|
|
|
|
voc_mu = paddle.to_tensor(voc_mu)
|
|
|
|
voc_mu = paddle.to_tensor(voc_mu)
|
|
|
|
voc_std = paddle.to_tensor(voc_std)
|
|
|
|
voc_std = paddle.to_tensor(voc_std)
|
|
|
@ -162,8 +180,12 @@ def evaluate(args):
|
|
|
|
output_dir = Path(args.output_dir)
|
|
|
|
output_dir = Path(args.output_dir)
|
|
|
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
N = 0
|
|
|
|
|
|
|
|
T = 0
|
|
|
|
|
|
|
|
|
|
|
|
for datum in test_dataset:
|
|
|
|
for datum in test_dataset:
|
|
|
|
utt_id = datum["utt_id"]
|
|
|
|
utt_id = datum["utt_id"]
|
|
|
|
|
|
|
|
with timer() as t:
|
|
|
|
with paddle.no_grad():
|
|
|
|
with paddle.no_grad():
|
|
|
|
# acoustic model
|
|
|
|
# acoustic model
|
|
|
|
if am_name == 'fastspeech2':
|
|
|
|
if am_name == 'fastspeech2':
|
|
|
@ -175,7 +197,8 @@ def evaluate(args):
|
|
|
|
spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
|
|
|
|
spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
|
|
|
|
elif "spk_id" in datum:
|
|
|
|
elif "spk_id" in datum:
|
|
|
|
spk_id = paddle.to_tensor(datum["spk_id"])
|
|
|
|
spk_id = paddle.to_tensor(datum["spk_id"])
|
|
|
|
mel = am_inference(phone_ids, spk_id=spk_id, spk_emb=spk_emb)
|
|
|
|
mel = am_inference(
|
|
|
|
|
|
|
|
phone_ids, spk_id=spk_id, spk_emb=spk_emb)
|
|
|
|
elif am_name == 'speedyspeech':
|
|
|
|
elif am_name == 'speedyspeech':
|
|
|
|
phone_ids = paddle.to_tensor(datum["phones"])
|
|
|
|
phone_ids = paddle.to_tensor(datum["phones"])
|
|
|
|
tone_ids = paddle.to_tensor(datum["tones"])
|
|
|
|
tone_ids = paddle.to_tensor(datum["tones"])
|
|
|
@ -189,11 +212,19 @@ def evaluate(args):
|
|
|
|
mel = am_inference(phone_ids, spk_emb=spk_emb)
|
|
|
|
mel = am_inference(phone_ids, spk_emb=spk_emb)
|
|
|
|
# vocoder
|
|
|
|
# vocoder
|
|
|
|
wav = voc_inference(mel)
|
|
|
|
wav = voc_inference(mel)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wav = wav.numpy()
|
|
|
|
|
|
|
|
N += wav.size
|
|
|
|
|
|
|
|
T += t.elapse
|
|
|
|
|
|
|
|
speed = wav.size / t.elapse
|
|
|
|
|
|
|
|
rtf = am_config.fs / speed
|
|
|
|
|
|
|
|
print(
|
|
|
|
|
|
|
|
f"{utt_id}, mel: {mel.shape}, wave: {wav.size}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
|
|
|
|
|
|
|
|
)
|
|
|
|
sf.write(
|
|
|
|
sf.write(
|
|
|
|
str(output_dir / (utt_id + ".wav")),
|
|
|
|
str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
|
|
|
|
wav.numpy(),
|
|
|
|
|
|
|
|
samplerate=am_config.fs)
|
|
|
|
|
|
|
|
print(f"{utt_id} done!")
|
|
|
|
print(f"{utt_id} done!")
|
|
|
|
|
|
|
|
print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
|
|
|
|
def main():
|
|
|
@ -246,7 +277,8 @@ def main():
|
|
|
|
default='pwgan_csmsc',
|
|
|
|
default='pwgan_csmsc',
|
|
|
|
choices=[
|
|
|
|
choices=[
|
|
|
|
'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk',
|
|
|
|
'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk',
|
|
|
|
'mb_melgan_csmsc'
|
|
|
|
'mb_melgan_csmsc', 'wavernn_csmsc', 'hifigan_csmsc',
|
|
|
|
|
|
|
|
'style_melgan_csmsc'
|
|
|
|
],
|
|
|
|
],
|
|
|
|
help='Choose vocoder type of tts task.')
|
|
|
|
help='Choose vocoder type of tts task.')
|
|
|
|
|
|
|
|
|
|
|
|