diff --git a/examples/opencpop/svs1/conf/default.yaml b/examples/opencpop/svs1/conf/default.yaml
index 7729889e5..13b803b5e 100644
--- a/examples/opencpop/svs1/conf/default.yaml
+++ b/examples/opencpop/svs1/conf/default.yaml
@@ -3,16 +3,16 @@
 ###########################################################
 fs: 24000          # sr
-n_fft: 512        # FFT size (samples).
+n_fft: 512         # FFT size (samples).
 n_shift: 128       # Hop size (samples). 12.5ms
-win_length: 512   # Window length (samples). 50ms
+win_length: 512    # Window length (samples). 50ms
                    # If set to null, it will be the same as fft_size.
 window: "hann"     # Window function.
 
 # Only used for feats_type != raw
 fmin: 30           # Minimum frequency of Mel basis.
-fmax: 12000       # Maximum frequency of Mel basis.
+fmax: 12000        # Maximum frequency of Mel basis.
 n_mels: 80         # The number of mel basis.
 
 # Only used for the model using pitch features (e.g. FastSpeech2)
@@ -23,8 +23,8 @@ f0max: 750    # Maximum f0 for pitch extraction.
 ###########################################################
 #                       DATA SETTING                      #
 ###########################################################
-batch_size: 48
-num_workers: 1
+batch_size: 24     # batch size
+num_workers: 4     # number of workers in DataLoader
 
 
 ###########################################################
@@ -32,80 +32,79 @@ num_workers: 1
 ###########################################################
 model:
     # music score related
-    note_num: 300
-    is_slur_num: 2
+    note_num: 300        # number of note types
+    is_slur_num: 2       # number of slur types
     # fastspeech2 module
     fastspeech2_params:
-        adim: 256                                  # attention dimension  # lym check
-        aheads: 2                                  # number of attention heads  # lym check
-        elayers: 4                                 # number of encoder layers  # lym check
-        eunits: 1024                               # number of encoder ff units  # lym check adim * 4
-        dlayers: 4                                 # number of decoder layers  # lym check
-        dunits: 1024                               # number of decoder ff units  # lym check
-        positionwise_layer_type: conv1d-linear     # type of position-wise layer  # lym check
-        positionwise_conv_kernel_size: 9           # kernel size of position wise conv layer  # lym check
-        transformer_enc_dropout_rate: 0.1          # dropout rate for transformer encoder layer  # lym check
-        transformer_enc_positional_dropout_rate: 0.1   # dropout rate for transformer encoder positional encoding  # lym check
-        transformer_enc_attn_dropout_rate: 0.0     # dropout rate for transformer encoder attention layer  # lym check
-        transformer_activation_type: "gelu"
-        encoder_normalize_before: True             # whether to perform layer normalization before the input
-        decoder_normalize_before: True             # whether to perform layer normalization before the input
-        reduction_factor: 1                        # reduction factor
-        init_type: xavier_uniform                  # initialization type
-        init_enc_alpha: 1.0                        # initial value of alpha of encoder scaled position encoding
-        init_dec_alpha: 1.0                        # initial value of alpha of decoder scaled position encoding
-        use_scaled_pos_enc: True                   # whether to use scaled positional encoding
-        transformer_dec_dropout_rate: 0.1          # dropout rate for transformer decoder layer
-        transformer_dec_positional_dropout_rate: 0.1   # dropout rate for transformer decoder positional encoding
-        transformer_dec_attn_dropout_rate: 0.0     # dropout rate for transformer decoder attention layer
-        duration_predictor_layers: 5               # number of layers of duration predictor
-        duration_predictor_chans: 256              # number of channels of duration predictor
-        duration_predictor_kernel_size: 3          # filter size of duration predictor
-        duration_predictor_dropout_rate: 0.5       # dropout rate in energy predictor
-        pitch_predictor_layers: 5                  # number of conv layers in pitch predictor
-        pitch_predictor_chans: 256                 # number of channels of conv layers in pitch predictor
-        pitch_predictor_kernel_size: 5             # kernel size of conv leyers in pitch predictor
-        pitch_predictor_dropout: 0.5               # dropout rate in pitch predictor
-        pitch_embed_kernel_size: 1                 # kernel size of conv embedding layer for pitch
-        pitch_embed_dropout: 0.0                   # dropout rate after conv embedding layer for pitch
-        stop_gradient_from_pitch_predictor: True   # whether to stop the gradient from pitch predictor to encoder
-
-
-        postnet_layers: 5                          # number of layers of postnset
-        postnet_filts: 5                           # filter size of conv layers in postnet
-        postnet_chans: 256                         # number of channels of conv layers in postnet
-        energy_predictor_layers: 2                 # number of conv layers in energy predictor
-        energy_predictor_chans: 256                # number of channels of conv layers in energy predictor
-        energy_predictor_kernel_size: 3            # kernel size of conv leyers in energy predictor
-        energy_predictor_dropout: 0.5              # dropout rate in energy predictor
-        energy_embed_kernel_size: 1                # kernel size of conv embedding layer for energy
-        energy_embed_dropout: 0.0                  # dropout rate after conv embedding layer for energy
-        stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
+        adim: 256                                  # attention dimension
+        aheads: 2                                  # number of attention heads
+        elayers: 4                                 # number of encoder layers
+        eunits: 1024                               # number of encoder ff units
+        dlayers: 4                                 # number of decoder layers
+        dunits: 1024                               # number of decoder ff units
+        positionwise_layer_type: conv1d-linear     # type of position-wise layer
+        positionwise_conv_kernel_size: 9           # kernel size of position wise conv layer
+        transformer_enc_dropout_rate: 0.1          # dropout rate for transformer encoder layer
+        transformer_enc_positional_dropout_rate: 0.1   # dropout rate for transformer encoder positional encoding
+        transformer_enc_attn_dropout_rate: 0.0     # dropout rate for transformer encoder attention layer
+        transformer_activation_type: "gelu"        # Activation function type in transformer.
+        encoder_normalize_before: True             # whether to perform layer normalization before the input
+        decoder_normalize_before: True             # whether to perform layer normalization before the input
+        reduction_factor: 1                        # reduction factor
+        init_type: xavier_uniform                  # initialization type
+        init_enc_alpha: 1.0                        # initial value of alpha of encoder scaled position encoding
+        init_dec_alpha: 1.0                        # initial value of alpha of decoder scaled position encoding
+        use_scaled_pos_enc: True                   # whether to use scaled positional encoding
+        transformer_dec_dropout_rate: 0.1          # dropout rate for transformer decoder layer
+        transformer_dec_positional_dropout_rate: 0.1   # dropout rate for transformer decoder positional encoding
+        transformer_dec_attn_dropout_rate: 0.0     # dropout rate for transformer decoder attention layer
+        duration_predictor_layers: 5               # number of layers of duration predictor
+        duration_predictor_chans: 256              # number of channels of duration predictor
+        duration_predictor_kernel_size: 3          # filter size of duration predictor
+        duration_predictor_dropout_rate: 0.5       # dropout rate in duration predictor
+        pitch_predictor_layers: 5                  # number of conv layers in pitch predictor
+        pitch_predictor_chans: 256                 # number of channels of conv layers in pitch predictor
+        pitch_predictor_kernel_size: 5             # kernel size of conv layers in pitch predictor
+        pitch_predictor_dropout: 0.5               # dropout rate in pitch predictor
+        pitch_embed_kernel_size: 1                 # kernel size of conv embedding layer for pitch
+        pitch_embed_dropout: 0.0                   # dropout rate after conv embedding layer for pitch
+        stop_gradient_from_pitch_predictor: True   # whether to stop the gradient from pitch predictor to encoder
+        energy_predictor_layers: 2                 # number of conv layers in energy predictor
+        energy_predictor_chans: 256                # number of channels of conv layers in energy predictor
+        energy_predictor_kernel_size: 3            # kernel size of conv layers in energy predictor
+        energy_predictor_dropout: 0.5              # dropout rate in energy predictor
+        energy_embed_kernel_size: 1                # kernel size of conv embedding layer for energy
+        energy_embed_dropout: 0.0                  # dropout rate after conv embedding layer for energy
+        stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
+        postnet_layers: 5                          # number of layers of postnet
+        postnet_filts: 5                           # filter size of conv layers in postnet
+        postnet_chans: 256                         # number of channels of conv layers in postnet
+        postnet_dropout_rate: 0.5                  # dropout rate for postnet
 
     # denoiser module
     denoiser_params:
-        in_channels: 80
-        out_channels: 80
-        kernel_size: 3
-        layers: 20
-        stacks: 5
-        residual_channels: 256
-        gate_channels: 512
-        skip_channels: 256
-        aux_channels: 256
-        dropout: 0.1
-        bias: True
-        use_weight_norm: False
-        init_type: "kaiming_normal"
+        in_channels: 80             # Number of channels of the input mel-spectrogram
+        out_channels: 80            # Number of channels of the output mel-spectrogram
+        kernel_size: 3              # Kernel size of the residual blocks inside
+        layers: 20                  # Number of residual blocks inside
+        stacks: 5                   # The number of groups to split the residual blocks into
+        residual_channels: 256      # Residual channel of the residual blocks
+        gate_channels: 512          # Gate channel of the residual blocks
+        skip_channels: 256          # Skip channel of the residual blocks
+        aux_channels: 256           # Auxiliary channel of the residual blocks
+        dropout: 0.1                # Dropout of the residual blocks
+        bias: True                  # Whether to use bias in residual blocks
+        use_weight_norm: False      # Whether to use weight norm in all convolutions
+        init_type: "kaiming_normal" # How to initialize the weights of the module
 
     # diffusion module
     diffusion_params:
-        num_train_timesteps: 100
-        beta_start: 0.0001
-        beta_end: 0.06
-        beta_schedule: "squaredcos_cap_v2"
-        num_max_timesteps: 60
+        num_train_timesteps: 100           # number of diffusion timesteps between real mel and pure noise during training
+        beta_start: 0.0001                 # beta start parameter for the scheduler
+        beta_end: 0.06                     # beta end parameter for the scheduler
+        beta_schedule: "linear"            # beta schedule parameter for the scheduler
+        num_max_timesteps: 60              # max timestep for the transition from real mel to noise
 
 
 ###########################################################
@@ -130,7 +129,7 @@ fs2_optimizer:
 ds_optimizer_params:
     beta1: 0.9
     beta2: 0.98
-    weight_decay: 0.0
+    weight_decay: 0.0
 
 ds_scheduler_params:
     learning_rate: 0.001
@@ -142,17 +141,12 @@ ds_grad_norm: 1
 ###########################################################
 #                     INTERVAL SETTING                    #
 ###########################################################
-ds_train_start_steps: 32500  # Number of steps to start to train diffusion module.
-train_max_steps: 65000       # Number of training steps.
-save_interval_steps: 500     # Interval steps to save checkpoint.
-eval_interval_steps: 500     # Interval steps to evaluate the network.
-num_snapshots: 20
-
-# ds_train_start_steps: 4    # Number of steps to start to train diffusion module.
-# train_max_steps: 8         # Number of training steps.
-# save_interval_steps: 1     # Interval steps to save checkpoint.
-# eval_interval_steps: 2     # Interval steps to evaluate the network.
-# num_snapshots: 5
+ds_train_start_steps: 100000 # Number of steps to start to train diffusion module.
+train_max_steps: 200000      # Number of training steps.
+save_interval_steps: 500     # Interval steps to save checkpoint.
+eval_interval_steps: 100     # Interval steps to evaluate the network.
+num_snapshots: 10            # Number of saved checkpoints
+
 
 ###########################################################
 #                     OTHER SETTING                       #
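The updated `diffusion_params` feed a DDPM-style noise scheduler; `beta_schedule` switches from `"squaredcos_cap_v2"` to a plain linear ramp between `beta_start` and `beta_end`. A minimal sketch of how these keys are consumed, assuming the `ppdiffusers` `DDPMScheduler` mirrors the `diffusers` signature (the repo already imports `ppdiffusers` in `paddlespeech/t2s/modules/diffusion.py`):

```python
# Sketch only: shows how the YAML keys above map onto the scheduler.
from ppdiffusers import DDPMScheduler

noise_scheduler = DDPMScheduler(
    num_train_timesteps=100,  # diffusion_params.num_train_timesteps
    beta_start=0.0001,        # diffusion_params.beta_start
    beta_end=0.06,            # diffusion_params.beta_end
    beta_schedule="linear")   # was "squaredcos_cap_v2" before this patch
```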
diff --git a/examples/opencpop/svs1/local/synthesize.sh b/examples/opencpop/svs1/local/synthesize.sh
index cc58b58ce..37f8893a9 100755
--- a/examples/opencpop/svs1/local/synthesize.sh
+++ b/examples/opencpop/svs1/local/synthesize.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --voc_config=pwgan_opencpop/default.yaml \
         --voc_ckpt=pwgan_opencpop/snapshot_iter_100000.pdz \
         --voc_stat=pwgan_opencpop/feats_stats.npy \
-        --test_metadata=dump/test/norm/metadata.jsonl \
+        --test_metadata=test1.jsonl \
         --output_dir=${train_output_path}/test_${iter} \
         --phones_dict=dump/phone_id_map.txt
 fi
diff --git a/examples/opencpop/svs1/run.sh b/examples/opencpop/svs1/run.sh
index 10a2b5290..7f25a15bd 100755
--- a/examples/opencpop/svs1/run.sh
+++ b/examples/opencpop/svs1/run.sh
@@ -4,7 +4,6 @@ set -e
 source path.sh
 
 gpus=4,5,6,7
-#gpus=0
 stage=1
 stop_stage=1
diff --git a/paddlespeech/t2s/datasets/get_feats.py b/paddlespeech/t2s/datasets/get_feats.py
index 871699c81..e34cc214d 100644
--- a/paddlespeech/t2s/datasets/get_feats.py
+++ b/paddlespeech/t2s/datasets/get_feats.py
@@ -126,7 +126,7 @@ class Pitch():
             input: np.ndarray,
             use_continuous_f0: bool=True,
             use_log_f0: bool=True) -> np.ndarray:
-        input = input.astype(np.float)
+        input = input.astype(float)
         frame_period = 1000 * self.hop_length / self.sr
         f0, timeaxis = pyworld.dio(
             input,
diff --git a/paddlespeech/t2s/exps/diffsinger/normalize.py b/paddlespeech/t2s/exps/diffsinger/normalize.py
index f0f4f0b0d..0a54cfbb6 100644
--- a/paddlespeech/t2s/exps/diffsinger/normalize.py
+++ b/paddlespeech/t2s/exps/diffsinger/normalize.py
@@ -80,27 +80,20 @@ def main():
     # restore scaler
     speech_scaler = StandardScaler()
-    # speech_scaler.mean_ = np.load(args.speech_stats)[0]
-    # speech_scaler.scale_ = np.load(args.speech_stats)[1]
-    speech_scaler.mean_ = np.zeros(np.load(args.speech_stats)[0].shape, dtype="float32")
-    speech_scaler.scale_ = np.ones(np.load(args.speech_stats)[1].shape, dtype="float32")
+    speech_scaler.mean_ = np.load(args.speech_stats)[0]
+    speech_scaler.scale_ = np.load(args.speech_stats)[1]
     speech_scaler.n_features_in_ = speech_scaler.mean_.shape[0]
 
     pitch_scaler = StandardScaler()
-    # pitch_scaler.mean_ = np.load(args.pitch_stats)[0]
-    # pitch_scaler.scale_ = np.load(args.pitch_stats)[1]
-    pitch_scaler.mean_ = np.zeros(np.load(args.pitch_stats)[0].shape, dtype="float32")
-    pitch_scaler.scale_ = np.ones(np.load(args.pitch_stats)[1].shape, dtype="float32")
+    pitch_scaler.mean_ = np.load(args.pitch_stats)[0]
+    pitch_scaler.scale_ = np.load(args.pitch_stats)[1]
     pitch_scaler.n_features_in_ = pitch_scaler.mean_.shape[0]
 
     energy_scaler = StandardScaler()
-    # energy_scaler.mean_ = np.load(args.energy_stats)[0]
-    # energy_scaler.scale_ = np.load(args.energy_stats)[1]
-    energy_scaler.mean_ = np.zeros(np.load(args.energy_stats)[0].shape, dtype="float32")
-    energy_scaler.scale_ = np.ones(np.load(args.energy_stats)[1].shape, dtype="float32")
+    energy_scaler.mean_ = np.load(args.energy_stats)[0]
+    energy_scaler.scale_ = np.load(args.energy_stats)[1]
     energy_scaler.n_features_in_ = energy_scaler.mean_.shape[0]
 
-
     vocab_phones = {}
     with open(args.phones_dict, 'rt') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
diff --git a/paddlespeech/t2s/exps/diffsinger/preprocess.py b/paddlespeech/t2s/exps/diffsinger/preprocess.py
index 7d47c61a5..d5209c2c5 100644
--- a/paddlespeech/t2s/exps/diffsinger/preprocess.py
+++ b/paddlespeech/t2s/exps/diffsinger/preprocess.py
@@ -88,10 +88,7 @@ def process_sentence(
         phones = sentences[utt_id][0]
         durations = sentences[utt_id][1]
         num_frames = logmel.shape[0]
-        word_boundary = [
-            1 if x in ALL_FINALS + ['AP', 'SP'] else 0 for x in phones
-        ]
-        # print(sum(durations), num_frames)
+        assert sum(durations) == num_frames, "the sum of durations does not equal the number of mel frames."
@@ -105,7 +102,6 @@ def process_sentence(
         pitch_dir = output_dir / "data_pitch"
         pitch_dir.mkdir(parents=True, exist_ok=True)
         pitch_path = pitch_dir / (utt_id + "_pitch.npy")
-        # print(pitch, pitch.shape)
         np.save(pitch_path, pitch)
         energy = energy_extractor.get_energy(wav)
         assert energy.shape[0] == num_frames
diff --git a/paddlespeech/t2s/exps/diffsinger/train.py b/paddlespeech/t2s/exps/diffsinger/train.py
index 41b9f2a84..2444b0610 100644
--- a/paddlespeech/t2s/exps/diffsinger/train.py
+++ b/paddlespeech/t2s/exps/diffsinger/train.py
@@ -138,20 +138,17 @@ def train_sp(args, config):
     model_ds = model._layers.diffusion
     print("models done!")
 
-    # criterion_fs2 = FastSpeech2Loss(**config["fs2_updater"])
     criterion_fs2 = FastSpeech2MIDILoss(**config["fs2_updater"])
     criterion_ds = DiffusionLoss(**config["ds_updater"])
     print("criterions done!")
 
     optimizer_fs2 = build_optimizers(model_fs2, **config["fs2_optimizer"])
-    lr_schedule_ds = StepDecay(**config["ds_scheduler_params"])
     gradient_clip_ds = nn.ClipGradByGlobalNorm(config["ds_grad_norm"])
     optimizer_ds = AdamW(
-        learning_rate=lr_schedule_ds,
+        learning_rate=config["ds_scheduler_params"]["learning_rate"],
         grad_clip=gradient_clip_ds,
        parameters=model_ds.parameters(),
         **config["ds_optimizer_params"])
-    # optimizer_ds = build_optimizers(ds, **config["ds_optimizer"])
     print("optimizer done!")
 
     output_dir = Path(args.output_dir)
@@ -182,7 +179,7 @@ def train_sp(args, config):
             "ds": criterion_ds,
         },
         dataloader=dev_dataloader,
-        output_dir=output_dir,)
+        output_dir=output_dir, )
 
     trainer = Trainer(
         updater,
"""Calculate forward propagation. @@ -199,7 +202,7 @@ class DiffSinger(nn.Layer): Batch of speaker embeddings (B, spk_embed_dim). spk_id(Tnesor[int64], optional(int64)): Batch of speaker ids (B,) - train_fs2(bool): + only_train_fs2(bool): Whether to train only the fastspeech2 module Returns: @@ -219,7 +222,7 @@ class DiffSinger(nn.Layer): energy=energy, spk_id=spk_id, spk_emb=spk_emb) - if train_fs2: + if only_train_fs2: return before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits # get the encoder output from fastspeech2 as the condition of denoiser module @@ -236,9 +239,9 @@ class DiffSinger(nn.Layer): cond_fs2 = cond_fs2.transpose((0, 2, 1)) # get the output(final mel) from diffusion module - mel, mel_ref = self.diffusion( - speech.transpose((0, 2, 1)), cond_fs2.detach()) - return mel, mel_ref, mel_masks + noise_pred, noise_target = self.diffusion( + speech.transpose((0, 2, 1)), cond_fs2) + return noise_pred, noise_target, mel_masks def inference( self, @@ -270,10 +273,13 @@ class DiffSinger(nn.Layer): mel_fs2 = mel_fs2.unsqueeze(0).transpose((0, 2, 1)) cond_fs2 = self.fs2.encoder_infer(text, note, note_dur, is_slur) cond_fs2 = cond_fs2.transpose((0, 2, 1)) - # mel, _ = self.diffusion(mel_fs2, cond_fs2) noise = paddle.randn(mel_fs2.shape) mel = self.diffusion.inference( - noise=noise, cond=cond_fs2, ref_x=mel_fs2, num_inference_steps=100) + noise=noise, + cond=cond_fs2, + ref_x=mel_fs2, + scheduler_type="ddpm", + num_inference_steps=25) mel = mel.transpose((0, 2, 1)) return mel[0] @@ -308,9 +314,7 @@ class DiffSingerInference(nn.Layer): note_dur=note_dur, is_slur=is_slur, get_mel_fs2=get_mel_fs2) - print(normalized_mel) - # logmel = self.normalizer.inverse(normalized_mel) - logmel = normalized_mel + logmel = self.normalizer.inverse(normalized_mel) return logmel @@ -339,16 +343,16 @@ class DiffusionLoss(nn.Layer): def forward( self, - ref_mels: paddle.Tensor, - out_mels: paddle.Tensor, + noise_pred: paddle.Tensor, + noise_target: paddle.Tensor, mel_masks: paddle.Tensor, ) -> paddle.Tensor: """Calculate forward propagation. Args: - ref_mels(Tensor): - Batch of real mel (B, Lmax, odim). - out_mels(Tensor): - Batch of outputs mel (B, Lmax, odim). + noise_pred(Tensor): + Batch of outputs predict noise (B, Lmax, odim). + noise_target(Tensor): + Batch of target noise (B, Lmax, odim). mel_masks(Tensor): Batch of mask of real mel (B, Lmax, 1). 
diff --git a/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py b/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py
index 9d2924ef3..1ec1be5be 100644
--- a/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py
+++ b/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py
@@ -34,17 +34,18 @@ logger.setLevel(logging.INFO)
 
 class DiffSingerUpdater(StandardUpdater):
-    def __init__(
-            self,
-            model: Layer,
-            optimizers: Dict[str, Optimizer],
-            criterions: Dict[str, Layer],
-            dataloader: DataLoader,
-            ds_train_start_steps: int=160000,
-            output_dir: Path=None, ):
+    def __init__(self,
+                 model: Layer,
+                 optimizers: Dict[str, Optimizer],
+                 criterions: Dict[str, Layer],
+                 dataloader: DataLoader,
+                 ds_train_start_steps: int=160000,
+                 output_dir: Path=None,
+                 only_train_diffusion: bool=True):
         super().__init__(model, optimizers, dataloader, init_state=None)
         self.model = model._layers if isinstance(model,
                                                  paddle.DataParallel) else model
+        self.only_train_diffusion = only_train_diffusion
 
         self.optimizers = optimizers
         self.optimizer_fs2: Optimizer = optimizers['fs2']
@@ -78,8 +79,7 @@ class DiffSingerUpdater(StandardUpdater):
             spk_id = None
 
         # only train fastspeech2 module firstly
-        if self.state.iteration <= self.ds_train_start_steps:
-            # print(batch)
+        if self.state.iteration < self.ds_train_start_steps:
             before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model(
                 text=batch["text"],
                 note=batch["note"],
@@ -93,7 +93,7 @@
                 energy=batch["energy"],
                 spk_id=spk_id,
                 spk_emb=spk_emb,
-                train_fs2=True, )
+                only_train_fs2=True, )
 
             l1_loss_fs2, ssim_loss_fs2, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion_fs2(
                 after_outs=after_outs,
@@ -110,7 +110,7 @@
                 spk_logits=spk_logits,
                 spk_ids=spk_id, )
 
-            loss_fs2 = l1_loss_fs2 + ssim_loss_fs2 + duration_loss + pitch_loss + energy_loss
+            loss_fs2 = l1_loss_fs2 + ssim_loss_fs2 + duration_loss + pitch_loss + energy_loss + speaker_loss
 
             self.optimizer_fs2.clear_grad()
             loss_fs2.backward()
@@ -128,7 +128,10 @@
             losses_dict["duration_loss"] = float(duration_loss)
             losses_dict["pitch_loss"] = float(pitch_loss)
             losses_dict["energy_loss"] = float(energy_loss)
-            losses_dict["energy_loss"] = float(energy_loss)
+
+            if speaker_loss != 0.:
+                report("train/speaker_loss", float(speaker_loss))
+                losses_dict["speaker_loss"] = float(speaker_loss)
             losses_dict["loss_fs2"] = float(loss_fs2)
 
             self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
@@ -136,10 +139,11 @@ class DiffSingerUpdater(StandardUpdater):
 
         # Then only train diffusion module, freeze fastspeech2 parameters.
         if self.state.iteration > self.ds_train_start_steps:
-            for param in self.model.fs2.parameters():
-                param.trainable = False
+            if self.only_train_diffusion:
+                for param in self.model.fs2.parameters():
+                    param.trainable = False
 
-            mel, mel_ref, mel_masks = self.model(
+            noise_pred, noise_target, mel_masks = self.model(
                 text=batch["text"],
                 note=batch["note"],
                 note_dur=batch["note_dur"],
@@ -152,14 +156,14 @@
                 energy=batch["energy"],
                 spk_id=spk_id,
                 spk_emb=spk_emb,
-                train_fs2=False, )
+                only_train_fs2=False, )
 
-            mel = mel.transpose((0, 2, 1))
-            mel_ref = mel_ref.transpose((0, 2, 1))
+            noise_pred = noise_pred.transpose((0, 2, 1))
+            noise_target = noise_target.transpose((0, 2, 1))
             mel_masks = mel_masks.transpose((0, 2, 1))
 
             l1_loss_ds = self.criterion_ds(
-                ref_mels=mel_ref,
-                out_mels=mel,
+                noise_pred=noise_pred,
+                noise_target=noise_target,
                 mel_masks=mel_masks, )
 
             loss_ds = l1_loss_ds
@@ -210,7 +214,7 @@ class DiffSingerEvaluator(StandardEvaluator):
             spk_id = None
 
         # Here show diffsinger eval
-        mel, mel_ref, mel_masks = self.model(
+        noise_pred, noise_target, mel_masks = self.model(
             text=batch["text"],
             note=batch["note"],
             note_dur=batch["note_dur"],
@@ -223,14 +227,14 @@
             energy=batch["energy"],
             spk_id=spk_id,
             spk_emb=spk_emb,
-            train_fs2=False, )
+            only_train_fs2=False, )
 
-        mel = mel.transpose((0, 2, 1))
-        mel_ref = mel_ref.transpose((0, 2, 1))
+        noise_pred = noise_pred.transpose((0, 2, 1))
+        noise_target = noise_target.transpose((0, 2, 1))
         mel_masks = mel_masks.transpose((0, 2, 1))
 
         l1_loss_ds = self.criterion_ds(
-            ref_mels=mel_ref,
-            out_mels=mel,
+            noise_pred=noise_pred,
+            noise_target=noise_target,
             mel_masks=mel_masks, )
         loss_ds = l1_loss_ds
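The updater keeps its two-stage schedule: FastSpeech2MIDI alone before `ds_train_start_steps`, the diffusion denoiser afterwards (with fs2 frozen when the new `only_train_diffusion` flag is set). Reduced to its gating logic:

```python
# Gating only; mirrors the branches in DiffSingerUpdater.update_core.
def stage(iteration: int, ds_train_start_steps: int=100000) -> str:
    if iteration < ds_train_start_steps:
        return "fs2"        # L1 + SSIM + duration + pitch + energy (+ speaker)
    if iteration > ds_train_start_steps:
        return "diffusion"  # masked L1 between predicted and target noise
    return "none"           # with `<` above, the boundary step updates neither
```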
diff --git a/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py b/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py
index 0e24e69fb..53a72ebe6 100644
--- a/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py
+++ b/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py
@@ -23,10 +23,10 @@ from typeguard import check_argument_types
 
 from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
 from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss
-from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
-from paddlespeech.t2s.modules.nets_utils import make_pad_mask
 from paddlespeech.t2s.modules.losses import ssim
 from paddlespeech.t2s.modules.masked_fill import masked_fill
+from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
+from paddlespeech.t2s.modules.nets_utils import make_pad_mask
 
 
 class FastSpeech2MIDI(FastSpeech2):
@@ -61,18 +61,18 @@ class FastSpeech2MIDI(FastSpeech2):
         self.note_embed_dim = self.is_slur_embed_dim = fastspeech2_params[
             "adim"]
 
-        if note_num is not None:
-            self.note_embedding_table = nn.Embedding(
-                num_embeddings=note_num,
-                embedding_dim=self.note_embed_dim,
-                padding_idx=self.padding_idx)
-            self.note_dur_layer = nn.Linear(1, self.note_embed_dim)
+        # note embed
+        self.note_embedding_table = nn.Embedding(
+            num_embeddings=note_num,
+            embedding_dim=self.note_embed_dim,
+            padding_idx=self.padding_idx)
+        self.note_dur_layer = nn.Linear(1, self.note_embed_dim)
 
-        if is_slur_num is not None:
-            self.is_slur_embedding_table = nn.Embedding(
-                num_embeddings=is_slur_num,
-                embedding_dim=self.is_slur_embed_dim,
-                padding_idx=self.padding_idx)
+        # slur embed
+        self.is_slur_embedding_table = nn.Embedding(
+            num_embeddings=is_slur_num,
+            embedding_dim=self.is_slur_embed_dim,
+            padding_idx=self.padding_idx)
 
     def forward(
             self,
@@ -203,7 +203,7 @@
             spk_emb = self.spk_embedding_table(spk_id)
             hs = self._integrate_with_spk_embed(hs, spk_emb)
 
-        # forward duration predictor and variance predictors
+        # forward duration predictor (phone-level) and variance predictors (frame-level)
         d_masks = make_pad_mask(ilens)
         if olens is not None:
             pitch_masks = make_pad_mask(olens).unsqueeze(-1)
@@ -214,13 +214,12 @@
         if is_train_diffusion:
             hs = self.length_regulator(hs, ds, is_inference=False)
             p_outs = self.pitch_predictor(hs.detach(), pitch_masks)
-            # e_outs = self.energy_predictor(hs.detach(), pitch_masks)
+            e_outs = self.energy_predictor(hs.detach(), pitch_masks)
             p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
                 (0, 2, 1))
-            # e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
-            #     (0, 2, 1))
-            # hs = hs + p_embs + e_embs
-            hs = hs + p_embs
+            e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
+                (0, 2, 1))
+            hs = hs + p_embs + e_embs
 
         elif is_inference:
             # (B, Tmax)
@@ -240,20 +239,19 @@
             else:
                 p_outs = self.pitch_predictor(hs, pitch_masks)
 
-            # if es is not None:
-            #     e_outs = es
-            # else:
-            #     if self.stop_gradient_from_energy_predictor:
-            #         e_outs = self.energy_predictor(hs.detach(), pitch_masks)
-            #     else:
-            #         e_outs = self.energy_predictor(hs, pitch_masks)
+            if es is not None:
+                e_outs = es
+            else:
+                if self.stop_gradient_from_energy_predictor:
+                    e_outs = self.energy_predictor(hs.detach(), pitch_masks)
+                else:
+                    e_outs = self.energy_predictor(hs, pitch_masks)
 
             p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
                 (0, 2, 1))
-            # e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
-            #     (0, 2, 1))
-            # hs = hs + p_embs + e_embs
-            hs = hs + p_embs
+            e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
+                (0, 2, 1))
+            hs = hs + p_embs + e_embs
 
         # training
         else:
@@ -264,16 +262,15 @@
                 p_outs = self.pitch_predictor(hs.detach(), pitch_masks)
             else:
                 p_outs = self.pitch_predictor(hs, pitch_masks)
-            # if self.stop_gradient_from_energy_predictor:
-            #     e_outs = self.energy_predictor(hs.detach(), pitch_masks)
-            # else:
-            #     e_outs = self.energy_predictor(hs, pitch_masks)
+            if self.stop_gradient_from_energy_predictor:
+                e_outs = self.energy_predictor(hs.detach(), pitch_masks)
+            else:
+                e_outs = self.energy_predictor(hs, pitch_masks)
             p_embs = self.pitch_embed(ps.transpose((0, 2, 1))).transpose(
                 (0, 2, 1))
-            # e_embs = self.energy_embed(es.transpose((0, 2, 1))).transpose(
-            #     (0, 2, 1))
-            # hs = hs + p_embs + e_embs
-            hs = hs + p_embs
+            e_embs = self.energy_embed(es.transpose((0, 2, 1))).transpose(
+                (0, 2, 1))
+            hs = hs + p_embs + e_embs
 
         # forward decoder
         if olens is not None and not is_inference:
@@ -302,11 +299,10 @@
                 (paddle.shape(zs)[0], -1, self.odim))
 
         # postnet -> (B, Lmax//r * r, odim)
-        # if self.postnet is None:
-        #     after_outs = before_outs
-        # else:
-        #     after_outs = before_outs + self.postnet(
-        #         before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
-        after_outs = before_outs
+        if self.postnet is None:
+            after_outs = before_outs
+        else:
+            after_outs = before_outs + self.postnet(
+                before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
 
         return before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits
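The previously commented-out energy path is restored in all three branches: the energy predictor output (or the ground-truth `es` during training) is embedded by a conv layer and added to the hidden states together with pitch. The shared pattern, isolated as a sketch (the real `pitch_embed`/`energy_embed` are the model's conv embedding layers operating on (B, C, T)):

```python
# Sketch of the restored variance pattern, not a new API.
def add_variance_embeddings(hs, p_outs, e_outs, pitch_embed, energy_embed):
    # hs: (B, T, adim); p_outs, e_outs: (B, T, 1)
    p_embs = pitch_embed(p_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
    e_embs = energy_embed(e_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
    return hs + p_embs + e_embs
```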
@@ -478,8 +475,7 @@ class FastSpeech2MIDI(FastSpeech2):
             spk_emb=spk_emb,
             spk_id=spk_id, )
 
-        # return outs[0], d_outs[0], p_outs[0], e_outs[0]
-        return outs[0], d_outs[0], p_outs[0], None
+        return outs[0], d_outs[0], p_outs[0], e_outs[0]
 
 
 class FastSpeech2MIDILoss(FastSpeech2Loss):
@@ -551,21 +547,21 @@ class FastSpeech2MIDILoss(FastSpeech2Loss):
         """
         l1_loss = duration_loss = pitch_loss = energy_loss = speaker_loss = ssim_loss = 0.0
 
-        out_pad_masks = make_pad_mask(olens).unsqueeze(-1)
-        before_outs_batch = masked_fill(before_outs, out_pad_masks, 0.0)
-        # print(before_outs.shape, ys.shape)
-        ssim_loss = 1.0 - ssim(before_outs_batch.unsqueeze(1), ys.unsqueeze(1))
-        ssim_loss = ssim_loss * 0.5
-
         # apply mask to remove padded part
         if self.use_masking:
+            # make feature for ssim loss
+            out_pad_masks = make_pad_mask(olens).unsqueeze(-1)
+            before_outs_ssim = masked_fill(before_outs, out_pad_masks, 0.0)
+            if after_outs is not None:
+                after_outs_ssim = masked_fill(after_outs, out_pad_masks, 0.0)
+            ys_ssim = masked_fill(ys, out_pad_masks, 0.0)
+
             out_masks = make_non_pad_mask(olens).unsqueeze(-1)
             before_outs = before_outs.masked_select(
                 out_masks.broadcast_to(before_outs.shape))
-
-            # if after_outs is not None:
-            #     after_outs = after_outs.masked_select(
-            #         out_masks.broadcast_to(after_outs.shape))
+            if after_outs is not None:
+                after_outs = after_outs.masked_select(
+                    out_masks.broadcast_to(after_outs.shape))
             ys = ys.masked_select(out_masks.broadcast_to(ys.shape))
             duration_masks = make_non_pad_mask(ilens)
             d_outs = d_outs.masked_select(
@@ -574,8 +570,8 @@
             pitch_masks = out_masks
             p_outs = p_outs.masked_select(
                 pitch_masks.broadcast_to(p_outs.shape))
-            # e_outs = e_outs.masked_select(
-            #     pitch_masks.broadcast_to(e_outs.shape))
+            e_outs = e_outs.masked_select(
+                pitch_masks.broadcast_to(e_outs.shape))
             ps = ps.masked_select(pitch_masks.broadcast_to(ps.shape))
             es = es.masked_select(pitch_masks.broadcast_to(es.shape))
 
@@ -591,17 +587,18 @@
         # calculate loss
         l1_loss = self.l1_criterion(before_outs, ys)
-        # if after_outs is not None:
-        #     l1_loss += self.l1_criterion(after_outs, ys)
-        #     ssim_loss += (1.0 - ssim(after_outs, ys))
+        ssim_loss = 1.0 - ssim(
+            before_outs_ssim.unsqueeze(1), ys_ssim.unsqueeze(1))
+        if after_outs is not None:
+            l1_loss += self.l1_criterion(after_outs, ys)
+            ssim_loss += (
+                1.0 - ssim(after_outs_ssim.unsqueeze(1), ys_ssim.unsqueeze(1)))
         l1_loss = l1_loss * 0.5
-
+        ssim_loss = ssim_loss * 0.5
+
         duration_loss = self.duration_criterion(d_outs, ds)
-        # print("ppppppppppoooooooooooo: ", p_outs, p_outs.shape)
-        # print("ppppppppppssssssssssss: ", ps, ps.shape)
-        # pitch_loss = self.mse_criterion(p_outs, ps)
-        # energy_loss = self.mse_criterion(e_outs, es)
         pitch_loss = self.l1_criterion(p_outs, ps)
+        energy_loss = self.l1_criterion(e_outs, es)
 
         if spk_logits is not None and spk_ids is not None:
             speaker_loss = self.ce_criterion(spk_logits, spk_ids) / batch_size
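The loss now uses two masking styles: `masked_fill` zeros the padded frames so `ssim` can run on image-shaped (B, 1, T, odim) tensors, while `masked_select` flattens only the valid elements for the L1 terms. The SSIM branch, isolated as a sketch using the repo modules this file already imports:

```python
# Sketch of the SSIM branch; out_pad_masks is True on padded frames.
from paddlespeech.t2s.modules.losses import ssim
from paddlespeech.t2s.modules.masked_fill import masked_fill

def masked_ssim_loss(outs, ys, out_pad_masks):
    # outs, ys: (B, T, odim)
    outs = masked_fill(outs, out_pad_masks, 0.0)
    ys = masked_fill(ys, out_pad_masks, 0.0)
    return 1.0 - ssim(outs.unsqueeze(1), ys.unsqueeze(1))
```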
@@ -623,6 +620,9 @@
             l1_loss = l1_loss.multiply(out_weights)
             l1_loss = l1_loss.masked_select(
                 out_masks.broadcast_to(l1_loss.shape)).sum()
+            ssim_loss = ssim_loss.multiply(out_weights)
+            ssim_loss = ssim_loss.masked_select(
+                out_masks.broadcast_to(ssim_loss.shape)).sum()
             duration_loss = (duration_loss.multiply(duration_weights)
                              .masked_select(duration_masks).sum())
             pitch_masks = out_masks
@@ -630,8 +630,8 @@
             pitch_loss = pitch_loss.multiply(pitch_weights)
             pitch_loss = pitch_loss.masked_select(
                 pitch_masks.broadcast_to(pitch_loss.shape)).sum()
-            # energy_loss = energy_loss.multiply(pitch_weights)
-            # energy_loss = energy_loss.masked_select(
-            #     pitch_masks.broadcast_to(energy_loss.shape)).sum()
+            energy_loss = energy_loss.multiply(pitch_weights)
+            energy_loss = energy_loss.masked_select(
+                pitch_masks.broadcast_to(energy_loss.shape)).sum()
 
         return l1_loss, ssim_loss, duration_loss, pitch_loss, energy_loss, speaker_loss
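The last file removes GaussianDiffusion's hard-coded Opencpop statistics; with the real scalers restored in normalize.py and `normalizer.inverse` re-enabled in diffsinger.py, normalization is handled by the data pipeline instead. For reference, the deleted helpers were a per-bin linear map to [-1, 1] and its inverse, restated here outside the class:

```python
# The removed mapping, restated; spec_min/spec_max were 80-dim tensors of
# per-bin extrema baked into the module.
def norm_spec(x, spec_min, spec_max):
    return (x - spec_min) / (spec_max - spec_min) * 2 - 1

def denorm_spec(x, spec_min, spec_max):
    return (x + 1) / 2 * (spec_max - spec_min) + spec_min
```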
""" - x = x.transpose((0, 2, 1)) - x = self.norm_spec(x) - x = x.transpose((0, 2, 1)) noise_scheduler = self.noise_scheduler # Sample noise that we'll add to the mel-spectrograms @@ -391,13 +349,6 @@ class GaussianDiffusion(nn.Layer): noisy_images = noise_scheduler.add_noise(x, noise, timesteps) y = self.denoiser(noisy_images, timesteps, cond) - - if is_infer: - y = y.transpose((0, 2, 1)) - y = self.denorm_spec(y) - y = y.transpose((0, 2, 1)) - - # y = self.denorm_spec(y) # then compute loss use output y and noisy target for prediction_type == "epsilon" return y, target @@ -477,10 +428,7 @@ class GaussianDiffusion(nn.Layer): # prepare first noise variables noisy_input = noise timesteps = scheduler.timesteps - if ref_x is not None: - ref_x = ref_x.transpose((0, 2, 1)) - ref_x = self.norm_spec(ref_x) - ref_x = ref_x.transpose((0, 2, 1)) + if ref_x is not None: init_timestep = None if strength is None or strength < 0. or strength > 1.: strength = None @@ -520,11 +468,5 @@ class GaussianDiffusion(nn.Layer): (i + 1) % scheduler.order == 0): if callback is not None and i % callback_steps == 0: callback(i, t, len(timesteps), denoised_output) - - denoised_output = denoised_output.transpose((0, 2, 1)) - denoised_output = self.denorm_spec(denoised_output) - denoised_output = denoised_output.transpose((0, 2, 1)) - - return denoised_output