From 67ae7c8dd2317882806c439c038c4cdff3aba896 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Mon, 2 Dec 2024 11:33:16 +0800 Subject: [PATCH] [Hackathon 7th] fix Voc5/Jets/TTS2 with CSMSC (#3906) * fix Voc5/Jets with CSMSC * fix Voc5/Jets with CSMSC * Update README.md * Update README.md * Update README.md * Update iSTFTNet.md * Apply suggestions from code review * Apply suggestions from code review * Apply suggestions from code review --- examples/csmsc/jets/README.md | 13 ++++++++++++- examples/csmsc/tts2/README.md | 11 +++++++++++ examples/csmsc/voc5/README.md | 11 +++++++++++ examples/csmsc/voc5/iSTFTNet.md | 11 +++++++++++ paddlespeech/t2s/exps/gan_vocoder/preprocess.py | 2 +- paddlespeech/t2s/models/jets/length_regulator.py | 4 +++- 6 files changed, 49 insertions(+), 3 deletions(-) diff --git a/examples/csmsc/jets/README.md b/examples/csmsc/jets/README.md index 07dade0e..20314cec 100644 --- a/examples/csmsc/jets/README.md +++ b/examples/csmsc/jets/README.md @@ -3,7 +3,18 @@ This example contains code used to train a [JETS](https://arxiv.org/abs/2203.168 ## Dataset ### Download and Extract -Download CSMSC from it's [Official Website](https://test.data-baker.com/data/index/source). +Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. + +The structure of the folder is listed below. + +```text +└─ Wave + └─ .wav files (audio speech) +└─ PhoneLabeling + └─ .interval files (alignment between phoneme and duration) +└─ ProsodyLabeling + └─ 000001-010000.txt (text with prosodic by pinyin) +``` ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes and durations for JETS. 
diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 96956776..7f7cdde0 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -5,6 +5,17 @@ This example contains code used to train a [SpeedySpeech](http://arxiv.org/abs/2 ### Download and Extract Download CSMSC from it's [Official Website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. +The structure of the folder is listed below. + +```text +└─ Wave + └─ .wav files (audio speech) +└─ PhoneLabeling + └─ .interval files (alignment between phoneme and duration) +└─ ProsodyLabeling + └─ 000001-010000.txt (text with prosodic by pinyin) +``` + ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md index 3347c647..e4d10061 100644 --- a/examples/csmsc/voc5/README.md +++ b/examples/csmsc/voc5/README.md @@ -4,6 +4,17 @@ This example contains code used to train a [HiFiGAN](https://arxiv.org/abs/2010. ### Download and Extract Download CSMSC from it's [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. +The structure of the folder is listed below. 
+ +```text +└─ Wave + └─ .wav files (audio speech) +└─ PhoneLabeling + └─ .interval files (alignment between phoneme and duration) +└─ ProsodyLabeling + └─ 000001-010000.txt (text with prosodic by pinyin) +``` + ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. diff --git a/examples/csmsc/voc5/iSTFTNet.md b/examples/csmsc/voc5/iSTFTNet.md index 8f121938..693950c5 100644 --- a/examples/csmsc/voc5/iSTFTNet.md +++ b/examples/csmsc/voc5/iSTFTNet.md @@ -6,6 +6,17 @@ This example contains code used to train a [iSTFTNet](https://arxiv.org/abs/2203 ### Download and Extract Download CSMSC from it's [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. +The structure of the folder is listed below. + +```text +└─ Wave + └─ .wav files (audio speech) +└─ PhoneLabeling + └─ .interval files (alignment between phoneme and duration) +└─ ProsodyLabeling + └─ 000001-010000.txt (text with prosodic by pinyin) +``` + ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. 
diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index a2629a90..c1513e0c 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -203,9 +203,9 @@ def main(): sentences, speaker_set = get_phn_dur(dur_file) merge_silence(sentences) - # split data into 3 sections if args.dataset == "baker": wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) + # split data into 3 sections num_train = 9800 num_dev = 100 train_wav_files = wav_files[:num_train] diff --git a/paddlespeech/t2s/models/jets/length_regulator.py b/paddlespeech/t2s/models/jets/length_regulator.py index f7a395a6..f8629382 100644 --- a/paddlespeech/t2s/models/jets/length_regulator.py +++ b/paddlespeech/t2s/models/jets/length_regulator.py @@ -55,7 +55,9 @@ class GaussianUpsampling(nn.Layer): if h_masks is not None: t = t * paddle.to_tensor(h_masks, dtype="float32") - c = ds.cumsum(axis=-1) - ds / 2 + ds_cumsum = ds.cumsum(axis=-1) + ds_half = ds / 2 + c = ds_cumsum.astype(ds_half.dtype) - ds_half energy = -1 * self.delta * (t.unsqueeze(-1) - c.unsqueeze(1))**2 if d_masks is not None: d_masks = ~(d_masks.unsqueeze(1))