From 04bcb6a12d44641f3c17c6aac1b2c6bf5eabb752 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 28 Oct 2021 07:46:11 +0000 Subject: [PATCH] fix rtf, fix inf input of speedyspeech, fix stft dir for 2.2.0 --- parakeet/exps/gan_vocoder/multi_band_melgan/synthesize.py | 3 ++- parakeet/exps/gan_vocoder/parallelwave_gan/synthesize.py | 3 ++- parakeet/exps/speedyspeech/inference.py | 4 ++-- parakeet/modules/stft_loss.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/parakeet/exps/gan_vocoder/multi_band_melgan/synthesize.py b/parakeet/exps/gan_vocoder/multi_band_melgan/synthesize.py index d48fbbd0..00b1b96c 100644 --- a/parakeet/exps/gan_vocoder/multi_band_melgan/synthesize.py +++ b/parakeet/exps/gan_vocoder/multi_band_melgan/synthesize.py @@ -86,8 +86,9 @@ def main(): N += wav.size T += t.elapse speed = wav.size / t.elapse + rtf = config.fs / speed print( - f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {config.fs / speed}." + f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." ) sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.fs) print(f"generation speed: {N / T}Hz, RTF: {config.fs / (N / T) }") diff --git a/parakeet/exps/gan_vocoder/parallelwave_gan/synthesize.py b/parakeet/exps/gan_vocoder/parallelwave_gan/synthesize.py index 9129caa5..2400e00b 100644 --- a/parakeet/exps/gan_vocoder/parallelwave_gan/synthesize.py +++ b/parakeet/exps/gan_vocoder/parallelwave_gan/synthesize.py @@ -86,8 +86,9 @@ def main(): N += wav.size T += t.elapse speed = wav.size / t.elapse + rtf = config.fs / speed print( - f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {config.fs / speed}." + f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." ) sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.fs) print(f"generation speed: {N / T}Hz, RTF: {config.fs / (N / T) }") diff --git a/parakeet/exps/speedyspeech/inference.py b/parakeet/exps/speedyspeech/inference.py index bf144d76..77a90915 100644 --- a/parakeet/exps/speedyspeech/inference.py +++ b/parakeet/exps/speedyspeech/inference.py @@ -96,8 +96,8 @@ def main(): input_ids = frontend.get_input_ids( sentence, merge_sentences=True, get_tone_ids=True) - phone_ids = input_ids["phone_ids"] - tone_ids = input_ids["tone_ids"] + phone_ids = input_ids["phone_ids"].numpy() + tone_ids = input_ids["tone_ids"].numpy() phones = phone_ids[0] tones = tone_ids[0] diff --git a/parakeet/modules/stft_loss.py b/parakeet/modules/stft_loss.py index 1f400b46..8af55ab1 100644 --- a/parakeet/modules/stft_loss.py +++ b/parakeet/modules/stft_loss.py @@ -51,7 +51,7 @@ def stft(x, # calculate window window = signal.get_window(window, win_length, fftbins=True) window = paddle.to_tensor(window) - x_stft = paddle.tensor.signal.stft( + x_stft = paddle.signal.stft( x, fft_size, hop_length,