From 67ae7c8dd2317882806c439c038c4cdff3aba896 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Mon, 2 Dec 2024 11:33:16 +0800 Subject: [PATCH] [Hackathon 7th] fix Voc5/Jets/TTS2 with CSMSC (#3906) * fix Voc5/Jets with CSMSC * fix Voc5/Jets with CSMSC * Update README.md * Update README.md * Update README.md * Update iSTFTNet.md * Apply suggestions from code review * Apply suggestions from code review * Apply suggestions from code review --- examples/csmsc/jets/README.md | 13 ++++++++++++- examples/csmsc/tts2/README.md | 11 +++++++++++ examples/csmsc/voc5/README.md | 11 +++++++++++ examples/csmsc/voc5/iSTFTNet.md | 11 +++++++++++ paddlespeech/t2s/exps/gan_vocoder/preprocess.py | 2 +- paddlespeech/t2s/models/jets/length_regulator.py | 4 +++- 6 files changed, 49 insertions(+), 3 deletions(-) diff --git a/examples/csmsc/jets/README.md b/examples/csmsc/jets/README.md index 07dade0e..20314cec 100644 --- a/examples/csmsc/jets/README.md +++ b/examples/csmsc/jets/README.md @@ -3,7 +3,18 @@ This example contains code used to train a [JETS](https://arxiv.org/abs/2203.168 ## Dataset ### Download and Extract -Download CSMSC from it's [Official Website](https://test.data-baker.com/data/index/source). +Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. + +The structure of the folder is listed below. + +```text +└─ Wave + └─ .wav files (audio speech) +└─ PhoneLabeling + └─ .interval files (alignment between phoneme and duration) +└─ ProsodyLabeling + └─ 000001-010000.txt (text with prosodic by pinyin) +``` ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes and durations for JETS. 
diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 96956776..7f7cdde0 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -5,6 +5,17 @@ This example contains code used to train a [SpeedySpeech](http://arxiv.org/abs/2 ### Download and Extract Download CSMSC from it's [Official Website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. +The structure of the folder is listed below. + +```text +└─ Wave + └─ .wav files (audio speech) +└─ PhoneLabeling + └─ .interval files (alignment between phoneme and duration) +└─ ProsodyLabeling + └─ 000001-010000.txt (text with prosodic by pinyin) +``` + ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md index 3347c647..e4d10061 100644 --- a/examples/csmsc/voc5/README.md +++ b/examples/csmsc/voc5/README.md @@ -4,6 +4,17 @@ This example contains code used to train a [HiFiGAN](https://arxiv.org/abs/2010. ### Download and Extract Download CSMSC from it's [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. +The structure of the folder is listed below. 
+ +```text +└─ Wave + └─ .wav files (audio speech) +└─ PhoneLabeling + └─ .interval files (alignment between phoneme and duration) +└─ ProsodyLabeling + └─ 000001-010000.txt (text with prosodic by pinyin) +``` + ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. diff --git a/examples/csmsc/voc5/iSTFTNet.md b/examples/csmsc/voc5/iSTFTNet.md index 8f121938..693950c5 100644 --- a/examples/csmsc/voc5/iSTFTNet.md +++ b/examples/csmsc/voc5/iSTFTNet.md @@ -6,6 +6,17 @@ This example contains code used to train a [iSTFTNet](https://arxiv.org/abs/2203 ### Download and Extract Download CSMSC from it's [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. +The structure of the folder is listed below. + +```text +└─ Wave + └─ .wav files (audio speech) +└─ PhoneLabeling + └─ .interval files (alignment between phoneme and duration) +└─ ProsodyLabeling + └─ 000001-010000.txt (text with prosodic by pinyin) +``` + ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. 
diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index a2629a90..c1513e0c 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -203,9 +203,9 @@ def main(): sentences, speaker_set = get_phn_dur(dur_file) merge_silence(sentences) - # split data into 3 sections if args.dataset == "baker": wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) + # split data into 3 sections num_train = 9800 num_dev = 100 train_wav_files = wav_files[:num_train] diff --git a/paddlespeech/t2s/models/jets/length_regulator.py b/paddlespeech/t2s/models/jets/length_regulator.py index f7a395a6..f8629382 100644 --- a/paddlespeech/t2s/models/jets/length_regulator.py +++ b/paddlespeech/t2s/models/jets/length_regulator.py @@ -55,7 +55,9 @@ class GaussianUpsampling(nn.Layer): if h_masks is not None: t = t * paddle.to_tensor(h_masks, dtype="float32") - c = ds.cumsum(axis=-1) - ds / 2 + ds_cumsum = ds.cumsum(axis=-1) + ds_half = ds / 2 + c = ds_cumsum.astype(ds_half.dtype) - ds_half energy = -1 * self.delta * (t.unsqueeze(-1) - c.unsqueeze(1))**2 if d_masks is not None: d_masks = ~(d_masks.unsqueeze(1))