diff --git a/paddlespeech/t2s/models/diffsinger/diffsinger.py b/paddlespeech/t2s/models/diffsinger/diffsinger.py
index df0549f84..4ca64bd94 100644
--- a/paddlespeech/t2s/models/diffsinger/diffsinger.py
+++ b/paddlespeech/t2s/models/diffsinger/diffsinger.py
@@ -270,10 +270,10 @@ class DiffSinger(nn.Layer):
         mel_fs2 = mel_fs2.unsqueeze(0).transpose((0, 2, 1))
         cond_fs2 = self.fs2.encoder_infer(text, note, note_dur, is_slur)
         cond_fs2 = cond_fs2.transpose((0, 2, 1))
-        # mel, _ = self.diffusion(mel_fs2, cond_fs2)
-        noise = paddle.randn(mel_fs2.shape)
-        mel = self.diffusion.inference(
-            noise=noise, cond=cond_fs2, ref_x=mel_fs2, num_inference_steps=100)
+        mel, _ = self.diffusion(mel_fs2, cond_fs2)
+        # noise = paddle.randn(mel_fs2.shape)
+        # mel = self.diffusion.inference(
+        #     noise=noise, cond=cond_fs2, ref_x=mel_fs2, num_inference_steps=100)
         mel = mel.transpose((0, 2, 1))
         return mel[0]
 
diff --git a/paddlespeech/t2s/modules/diffusion.py b/paddlespeech/t2s/modules/diffusion.py
index 66754e3b2..02a412ab4 100644
--- a/paddlespeech/t2s/modules/diffusion.py
+++ b/paddlespeech/t2s/modules/diffusion.py
@@ -17,6 +17,7 @@
 from typing import Callable
 from typing import Optional
 from typing import Tuple
+import numpy as np
 import paddle
 import ppdiffusers
 from paddle import nn
@@ -315,8 +316,46 @@ class GaussianDiffusion(nn.Layer):
             beta_end=beta_end,
             beta_schedule=beta_schedule)
         self.num_max_timesteps = num_max_timesteps
+        self.spec_min = paddle.to_tensor(
+            np.array([
+                -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0,
+                -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0,
+                -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0,
+                -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0,
+                -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0,
+                -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0,
+                -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0,
+                -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0
+            ]))
+        self.spec_max = paddle.to_tensor(
+            np.array([
+                -0.79453, -0.81116, -0.61631, -0.30679, -0.13863, -0.050652,
+                -0.11563, -0.10679, -0.091068, -0.062174, -0.075302, -0.072217,
+                -0.063815, -0.073299, 0.007361, -0.072508, -0.050234, -0.16534,
+                -0.26928, -0.20782, -0.20823, -0.11702, -0.070128, -0.065868,
+                -0.012675, 0.0015121, -0.089902, -0.21392, -0.23789, -0.28922,
+                -0.30405, -0.23029, -0.22088, -0.21542, -0.29367, -0.30137,
+                -0.38281, -0.4359, -0.28681, -0.46855, -0.57485, -0.47022,
+                -0.54266, -0.44848, -0.6412, -0.687, -0.6486, -0.76436,
+                -0.49971, -0.71068, -0.69724, -0.61487, -0.55843, -0.69773,
+                -0.57502, -0.70919, -0.82431, -0.84213, -0.90431, -0.8284,
+                -0.77945, -0.82758, -0.87699, -1.0532, -1.0766, -1.1198,
+                -1.0185, -0.98983, -1.0001, -1.0756, -1.0024, -1.0304, -1.0579,
+                -1.0188, -1.05, -1.0842, -1.0923, -1.1223, -1.2381, -1.6467
+            ]))
 
+    def norm_spec(self, x):
+        """
+        Linearly map x to [-1, 1]
+        Args:
+            x: [B, T, N]
+        """
+        return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
-    def forward(self, x: paddle.Tensor, cond: Optional[paddle.Tensor]=None
+    def denorm_spec(self, x):
+        return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min
+
+    def forward(self, x: paddle.Tensor, cond: Optional[paddle.Tensor]=None,
+                is_infer: bool=False,
                 ) -> Tuple[paddle.Tensor, paddle.Tensor]:
         """Generate random timesteps noised x.
 
@@ -333,6 +372,11 @@
             The noises which is added to the input.
 
         """
+        # print("xxxxxxxxxxxxxxxx1: ", x, x.shape)
+        x = x.transpose((0, 2, 1))
+        x = self.norm_spec(x)
+        x = x.transpose((0, 2, 1))
+        print("xxxxxxxxxxxxxxxx2: ", x, x.shape)
         noise_scheduler = self.noise_scheduler
 
         # Sample noise that we'll add to the mel-spectrograms
@@ -349,6 +393,13 @@
         noisy_images = noise_scheduler.add_noise(x, noise, timesteps)
 
         y = self.denoiser(noisy_images, timesteps, cond)
+
+        if is_infer:
+            y = y.transpose((0, 2, 1))
+            y = self.denorm_spec(y)
+            y = y.transpose((0, 2, 1))
+
+        # y = self.denorm_spec(y)
 
         # then compute loss use output y and noisy target for prediction_type == "epsilon"
         return y, target
@@ -360,7 +411,7 @@
             num_inference_steps: Optional[int]=1000,
             strength: Optional[float]=None,
             scheduler_type: Optional[str]="ddpm",
-            clip_noise: Optional[bool]=True,
+            clip_noise: Optional[bool]=False,
             clip_noise_range: Optional[Tuple[float, float]]=(-1, 1),
             callback: Optional[Callable[[int, int, int, paddle.Tensor],
                                         None]]=None,
@@ -426,10 +477,12 @@
         scheduler.set_timesteps(num_inference_steps)
 
         # prepare first noise variables
-        import pdb;pdb.set_trace()
         noisy_input = noise
         timesteps = scheduler.timesteps
-        if ref_x is not None:
+        if ref_x is not None:
+            ref_x = ref_x.transpose((0, 2, 1))
+            ref_x = self.norm_spec(ref_x)
+            ref_x = ref_x.transpose((0, 2, 1))
             init_timestep = None
             if strength is None or strength < 0. or strength > 1.:
                 strength = None
@@ -445,8 +498,6 @@
             noisy_input = scheduler.add_noise(
                 ref_x, noise, timesteps[:1].tile([noise.shape[0]]))
 
-
-
         # denoising loop
         denoised_output = noisy_input
         if clip_noise:
@@ -471,5 +522,11 @@
                     (i + 1) % scheduler.order == 0):
                 if callback is not None and i % callback_steps == 0:
                     callback(i, t, len(timesteps), denoised_output)
+
+        denoised_output = denoised_output.transpose((0, 2, 1))
+        denoised_output = self.denorm_spec(denoised_output)
+        denoised_output = denoised_output.transpose((0, 2, 1))
+
+
 
-        return denoised_output
\ No newline at end of file
+        return denoised_output