From f75cc251371e085b69fd2a3c68550f442c098f19 Mon Sep 17 00:00:00 2001 From: liyulingyue <852433440@qq.com> Date: Sat, 23 Nov 2024 08:27:32 +0800 Subject: [PATCH] fix Voc5/Jets with CSMSC --- examples/csmsc/tts2/README.md | 8 ++++++++ paddlespeech/t2s/exps/ernie_sat/preprocess.py | 6 +++--- paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py | 6 +++--- paddlespeech/t2s/exps/fastspeech2/preprocess.py | 6 +++--- paddlespeech/t2s/exps/gan_vocoder/preprocess.py | 1 - paddlespeech/t2s/exps/jets/preprocess.py | 6 +++--- paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py | 6 +++--- paddlespeech/t2s/exps/speedyspeech/preprocess.py | 6 +++--- paddlespeech/t2s/exps/tacotron2/preprocess.py | 6 +++--- paddlespeech/t2s/exps/vits/preprocess.py | 6 +++--- 10 files changed, 32 insertions(+), 25 deletions(-) diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 969567762..100f03511 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -5,6 +5,14 @@ This example contains code used to train a [SpeedySpeech](http://arxiv.org/abs/2 ### Download and Extract Download CSMSC from it's [Official Website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. +The structure of the folder is listed below. + +```text +datasets/BZNSYP +└── Wave + └── .wav files +``` + ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. diff --git a/paddlespeech/t2s/exps/ernie_sat/preprocess.py b/paddlespeech/t2s/exps/ernie_sat/preprocess.py index 04bbc0743..ab6a36f8e 100644 --- a/paddlespeech/t2s/exps/ernie_sat/preprocess.py +++ b/paddlespeech/t2s/exps/ernie_sat/preprocess.py @@ -241,9 +241,9 @@ def main(): if args.dataset == "baker": wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) - # split data into 3 sections - num_train = 9800 - num_dev = 100 + # split data into 3 sections, the max number of dev/test is 10% or 100 + num_dev = min(int(len(wav_files) * 0.1), 100) + num_train = len(wav_files) - num_dev * 2 train_wav_files = wav_files[:num_train] dev_wav_files = wav_files[num_train:num_train + num_dev] test_wav_files = wav_files[num_train + num_dev:] diff --git a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py index 4c92ad1cc..e2348849b 100644 --- a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py +++ b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py @@ -81,9 +81,9 @@ def evaluate(args, fastspeech2_config): if args.dataset == "baker": wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) - # split data into 3 sections - num_train = 9800 - num_dev = 100 + # split data into 3 sections, the max number of dev/test is 10% or 100 + num_dev = min(int(len(wav_files) * 0.1), 100) + num_train = len(wav_files) - num_dev * 2 train_wav_files = wav_files[:num_train] dev_wav_files = wav_files[num_train:num_train + num_dev] test_wav_files = wav_files[num_train + num_dev:] diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py index a2353242b..dd3da6c2e 100644 --- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py +++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py @@ -271,9 +271,9 @@ def main(): if args.dataset == "baker": wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) - # split data into 3 sections - num_train = 9800 - num_dev = 100 + # split data into 3 sections, the max number of dev/test is 10% or 100 + num_dev = min(int(len(wav_files) * 0.1), 100) + num_train = len(wav_files) - num_dev * 2 train_wav_files = wav_files[:num_train] dev_wav_files = wav_files[num_train:num_train + num_dev] test_wav_files = wav_files[num_train + num_dev:] diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index 13cf29795..a30c14a2b 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -203,7 +203,6 @@ def main(): sentences, speaker_set = get_phn_dur(dur_file) merge_silence(sentences) - # split data into 3 sections if args.dataset == "baker": wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) # split data into 3 sections, the max number of dev/test is 10% or 100 diff --git a/paddlespeech/t2s/exps/jets/preprocess.py b/paddlespeech/t2s/exps/jets/preprocess.py index 468941ead..76854c2ee 100644 --- a/paddlespeech/t2s/exps/jets/preprocess.py +++ b/paddlespeech/t2s/exps/jets/preprocess.py @@ -314,9 +314,9 @@ def main(): if args.dataset == "baker": wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) - # split data into 3 sections - num_train = 9800 - num_dev = 100 + # split data into 3 sections, the max number of dev/test is 10% or 100 + num_dev = min(int(len(wav_files) * 0.1), 100) + num_train = len(wav_files) - num_dev * 2 train_wav_files = wav_files[:num_train] dev_wav_files = wav_files[num_train:num_train + num_dev] test_wav_files = wav_files[num_train + num_dev:] diff --git a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py index 31b7d2eac..deccfe1e6 100644 --- a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py +++ b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py @@ -90,9 +90,9 @@ def evaluate(args, speedyspeech_config): if args.dataset == "baker": wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) - # split data into 3 sections - num_train = 9800 - num_dev = 100 + # split data into 3 sections, the max number of dev/test is 10% or 100 + num_dev = min(int(len(wav_files) * 0.1), 100) + num_train = len(wav_files) - num_dev * 2 train_wav_files = wav_files[:num_train] dev_wav_files = wav_files[num_train:num_train + num_dev] test_wav_files = wav_files[num_train + num_dev:] diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py index 75a1b0791..af3070614 100644 --- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py +++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py @@ -237,9 +237,9 @@ def main(): if args.dataset == "baker": wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) - # split data into 3 sections - num_train = 9800 - num_dev = 100 + # split data into 3 sections, the max number of dev/test is 10% or 100 + num_dev = min(int(len(wav_files) * 0.1), 100) + num_train = len(wav_files) - num_dev * 2 train_wav_files = wav_files[:num_train] dev_wav_files = wav_files[num_train:num_train + num_dev] test_wav_files = wav_files[num_train + num_dev:] diff --git a/paddlespeech/t2s/exps/tacotron2/preprocess.py b/paddlespeech/t2s/exps/tacotron2/preprocess.py index 46b725916..96eb64616 100644 --- a/paddlespeech/t2s/exps/tacotron2/preprocess.py +++ b/paddlespeech/t2s/exps/tacotron2/preprocess.py @@ -228,9 +228,9 @@ def main(): if args.dataset == "baker": wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) - # split data into 3 sections - num_train = 9800 - num_dev = 100 + # split data into 3 sections, the max number of dev/test is 10% or 100 + num_dev = min(int(len(wav_files) * 0.1), 100) + num_train = len(wav_files) - num_dev * 2 train_wav_files = wav_files[:num_train] dev_wav_files = wav_files[num_train:num_train + num_dev] test_wav_files = wav_files[num_train + num_dev:] diff --git a/paddlespeech/t2s/exps/vits/preprocess.py b/paddlespeech/t2s/exps/vits/preprocess.py index 23c959d43..c8dca4563 100644 --- a/paddlespeech/t2s/exps/vits/preprocess.py +++ b/paddlespeech/t2s/exps/vits/preprocess.py @@ -241,9 +241,9 @@ def main(): if args.dataset == "baker": wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) - # split data into 3 sections - num_train = 9800 - num_dev = 100 + # split data into 3 sections, the max number of dev/test is 10% or 100 + num_dev = min(int(len(wav_files) * 0.1), 100) + num_train = len(wav_files) - num_dev * 2 train_wav_files = wav_files[:num_train] dev_wav_files = wav_files[num_train:num_train + num_dev] test_wav_files = wav_files[num_train + num_dev:]