fix Voc5/Jets with CSMSC

pull/3906/head
liyulingyue 10 months ago
parent afa9466c89
commit 231b78c828

@ -3,7 +3,15 @@ This example contains code used to train a [JETS](https://arxiv.org/abs/2203.168
## Dataset
### Download and Extract
Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
The structure of the folder is listed below.
```text
datasets/BZNSYP
└── Wave
└── .wav files
```
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes and durations for JETS.

@ -4,6 +4,14 @@ This example contains code used to train a [HiFiGAN](https://arxiv.org/abs/2010.
### Download and Extract
Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
The structure of the folder is listed below.
```text
datasets/BZNSYP
└── Wave
└── .wav files
```
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edges of audio.
You can download it here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.

@ -6,6 +6,14 @@ This example contains code used to train a [iSTFTNet](https://arxiv.org/abs/2203
### Download and Extract
Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
The structure of the folder is listed below.
```text
datasets/BZNSYP
└── Wave
└── .wav files
```
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edges of audio.
You can download it here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.

@ -206,8 +206,9 @@ def main():
# split data into 3 sections # split data into 3 sections
if args.dataset == "baker": if args.dataset == "baker":
wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) wav_files = sorted(list((rootdir / "Wave").rglob("*.wav")))
num_train = 9800 # split data into 3 sections, the max number of dev/test is 10% or 100
num_dev = 100 num_dev = min(int(len(wav_files) * 0.1), 100)
num_train = len(wav_files) - num_dev * 2
train_wav_files = wav_files[:num_train] train_wav_files = wav_files[:num_train]
dev_wav_files = wav_files[num_train:num_train + num_dev] dev_wav_files = wav_files[num_train:num_train + num_dev]
test_wav_files = wav_files[num_train + num_dev:] test_wav_files = wav_files[num_train + num_dev:]

@ -55,7 +55,9 @@ class GaussianUpsampling(nn.Layer):
if h_masks is not None: if h_masks is not None:
t = t * paddle.to_tensor(h_masks, dtype="float32") t = t * paddle.to_tensor(h_masks, dtype="float32")
c = ds.cumsum(axis=-1) - ds / 2 ds_cumsum = ds.cumsum(axis=-1)
ds_half = ds / 2
c = ds_cumsum.astype(ds_half.dtype) - ds_half
energy = -1 * self.delta * (t.unsqueeze(-1) - c.unsqueeze(1))**2 energy = -1 * self.delta * (t.unsqueeze(-1) - c.unsqueeze(1))**2
if d_masks is not None: if d_masks is not None:
d_masks = ~(d_masks.unsqueeze(1)) d_masks = ~(d_masks.unsqueeze(1))

@ -348,7 +348,9 @@ def get_random_segments(
""" """
b, c, t = paddle.shape(x) b, c, t = paddle.shape(x)
max_start_idx = x_lengths - segment_size max_start_idx = x_lengths - segment_size
start_idxs = paddle.cast(paddle.rand([b]) * max_start_idx, 'int64') rand_number = paddle.rand([b])
start_idxs = paddle.cast(rand_number *
max_start_idx.astype(rand_number.dtype), 'int64')
segments = get_segments(x, start_idxs, segment_size) segments = get_segments(x, start_idxs, segment_size)
return segments, start_idxs return segments, start_idxs

Loading…
Cancel
Save