base diffsinger, test=tts

pull/2834/head
liangym 3 years ago
parent 6fb281ca8a
commit ef7d15dc02

@ -0,0 +1,148 @@
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs: 24000 # Sampling rate (Hz).
n_fft: 512 # FFT size (samples).
n_shift: 128 # Hop size (samples). ~5.3ms at 24 kHz
win_length: 512 # Window length (samples). ~21.3ms at 24 kHz
# If set to null, it will be the same as n_fft.
window: "hann" # Window function.
# Only used for feats_type != raw
fmin: 30 # Minimum frequency of Mel basis.
fmax: 12000 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 750 # Maximum f0 for pitch extraction.
###########################################################
# DATA SETTING #
###########################################################
batch_size: 32
num_workers: 4
###########################################################
# MODEL SETTING #
###########################################################
# fastspeech2 module
fs2_model:
adim: 256 # attention dimension
aheads: 2 # number of attention heads
elayers: 4 # number of encoder layers
eunits: 1536 # number of encoder ff units
dlayers: 4 # number of decoder layers
dunits: 1536 # number of decoder ff units
positionwise_layer_type: conv1d # type of position-wise layer
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before encoder blocks
decoder_normalize_before: True # whether to perform layer normalization before decoder blocks
reduction_factor: 1 # reduction factor
init_type: xavier_uniform # initialization type
init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding
init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
note_num: 300
is_slur_num: 2
denoiser_model:
in_channels: 80
out_channels: 80
kernel_size: 3
layers: 20
stacks: 4
residual_channels: 256
gate_channels: 512
skip_channels: 256
aux_channels: 256
dropout: 0.1
bias: True
use_weight_norm: False
init_type: kaiming_uniform
diffusion:
num_train_timesteps: 100
beta_start: 0.0001
beta_end: 0.06
beta_schedule: "squaredcos_cap_v2"
num_max_timesteps: 60
###########################################################
# UPDATER SETTING #
###########################################################
fs2_updater:
use_masking: True # whether to apply masking for padded part in loss calculation
ds_updater:
use_masking: True # whether to apply masking for padded part in loss calculation
###########################################################
# OPTIMIZER SETTING #
###########################################################
# gpu_num=2 config
# fastspeech2 optimizer
fs2_optimizer:
optim: adam # optimizer type
learning_rate: 0.001 # learning rate
# diffusion optimizer
ds_optimizer_params:
beta1: 0.9
beta2: 0.98
weight_decay: 0.0
ds_scheduler_params:
learning_rate: 0.001
gamma: 0.5
step_size: 25000
ds_grad_norm: 1
###########################################################
# INTERVAL SETTING #
###########################################################
ds_train_start_steps: 80000 # Step at which training of the diffusion module starts.
train_max_steps: 160000 # Number of training steps.
save_interval_steps: 1000 # Interval steps to save checkpoint.
eval_interval_steps: 250 # Interval steps to evaluate the network.
num_snapshots: 5
###########################################################
# OTHER SETTING #
###########################################################
seed: 10086
find_unused_parameters: True
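
Note on the diffusion settings above: the schedule is "squaredcos_cap_v2" (a cosine schedule, for which beta_start/beta_end typically matter only in linear-style schedules), with 100 training timesteps but only num_max_timesteps=60 denoising steps at inference, which is the shallow-diffusion idea from the DiffSinger paper. A minimal numpy sketch of how such a cosine schedule is commonly derived (an illustration, not necessarily the exact GaussianDiffusion implementation in paddlespeech):

```python
import numpy as np

def cosine_betas(num_train_timesteps=100, max_beta=0.999):
    # "squaredcos_cap_v2"-style schedule: betas derived from a squared-cosine
    # alpha_bar curve and capped at max_beta.
    def alpha_bar(t):
        return np.cos((t + 0.008) / 1.008 * np.pi / 2) ** 2
    betas = []
    for i in range(num_train_timesteps):
        t1, t2 = i / num_train_timesteps, (i + 1) / num_train_timesteps
        betas.append(min(1.0 - alpha_bar(t2) / alpha_bar(t1), max_beta))
    return np.array(betas)

betas = cosine_betas(100)
alphas_cumprod = np.cumprod(1.0 - betas)
# With num_max_timesteps=60, inference only undoes the noise reached after
# 60 of the 100 forward steps:
print(alphas_cumprod[59])
```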

@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=diffsinger
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -23,8 +23,11 @@ import paddle
import yaml
from paddle import DataParallel
from paddle import distributed as dist
from paddle import nn
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle.optimizer import AdamW
from paddle.optimizer.lr import StepDecay
from yacs.config import CfgNode
from paddlespeech.t2s.datasets.am_batch_fn import diffsinger_multi_spk_batch_fn
@ -33,6 +36,8 @@ from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.models.diffsinger import DiffSinger
from paddlespeech.t2s.models.diffsinger import DiffSingerEvaluator
from paddlespeech.t2s.models.diffsinger import DiffSingerUpdater
from paddlespeech.t2s.models.diffsinger import DiffusionLoss
from paddlespeech.t2s.models.diffsinger import FastSpeech2MIDILoss
from paddlespeech.t2s.training.extensions.snapshot import Snapshot
from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.optimizer import build_optimizers
@ -40,7 +45,6 @@ from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer
from paddlespeech.t2s.utils import str2bool
def train_sp(args, config):
# decides device type and whether to run in parallel
# setup running environment correctly
@ -59,8 +63,9 @@ def train_sp(args, config):
f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
)
fields = [
"text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy",
"note", "note_dur", "is_slur"]
"text", "text_lengths", "speech", "speech_lengths", "durations",
"pitch", "energy", "note", "note_dur", "is_slur"
]
converters = {"speech": np.load, "pitch": np.load, "energy": np.load}
spk_num = None
if args.speaker_dict is not None:
@ -99,7 +104,6 @@ def train_sp(args, config):
converters=converters, )
# collate function and dataloader
train_sampler = DistributedBatchSampler(
train_dataset,
batch_size=config.batch_size,
@ -129,13 +133,32 @@ def train_sp(args, config):
print("vocab_size:", vocab_size)
odim = config.n_mels
config["fs2_model"]["idim"] = vocab_size
config["fs2_model"]["odim"] = odim
config["fs2_model"]["spk_num"] = spk_num
model = DiffSinger(
idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"])
fs2_config=config["fs2_model"],
denoiser_config=config["denoiser_model"],
diffusion_config=config["diffusion"])
if world_size > 1:
model = DataParallel(model)
print("model done!")
optimizer = build_optimizers(model, **config["optimizer"])
print("models done!")
criterion_fs2 = FastSpeech2MIDILoss(**config["fs2_updater"])
criterion_ds = DiffusionLoss(**config["ds_updater"])
print("criterions done!")
optimizer_fs2 = build_optimizers(model._layers.fs2,
**config["fs2_optimizer"])
lr_schedule_ds = StepDecay(**config["ds_scheduler_params"])
gradient_clip_ds = nn.ClipGradByGlobalNorm(config["ds_grad_norm"])
optimizer_ds = AdamW(
learning_rate=lr_schedule_ds,
grad_clip=gradient_clip_ds,
parameters=model._layers.diffusion.parameters(),
**config["ds_optimizer_params"])
# optimizer_ds = build_optimizers(ds, **config["ds_optimizer"])
print("optimizer done!")
output_dir = Path(args.output_dir)
@ -145,33 +168,42 @@ def train_sp(args, config):
# copy conf to output_dir
shutil.copyfile(args.config, output_dir / config_name)
if "enable_speaker_classifier" in config.model:
enable_spk_cls = config.model.enable_speaker_classifier
else:
enable_spk_cls = False
updater = DiffSingerUpdater(
model=model,
optimizer=optimizer,
optimizers={
"fs2": optimizer_fs2,
"ds": optimizer_ds,
},
criterions={
"fs2": criterion_fs2,
"ds": criterion_ds,
},
dataloader=train_dataloader,
output_dir=output_dir,
enable_spk_cls=enable_spk_cls,
**config["updater"], )
trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)
ds_train_start_steps=config.ds_train_start_steps,
output_dir=output_dir)
evaluator = DiffSingerEvaluator(
model,
dev_dataloader,
output_dir=output_dir,
enable_spk_cls=enable_spk_cls,
**config["updater"], )
model=model,
criterions={
"fs2": criterion_fs2,
"ds": criterion_ds,
},
dataloader=dev_dataloader,
output_dir=output_dir)
trainer = Trainer(
updater,
stop_trigger=(config.train_max_steps, "iteration"),
out=output_dir, )
if dist.get_rank() == 0:
trainer.extend(evaluator, trigger=(1, "epoch"))
trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
trainer.extend(
evaluator, trigger=(config.eval_interval_steps, 'iteration'))
trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration'))
trainer.extend(
Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
Snapshot(max_size=config.num_snapshots),
trigger=(config.save_interval_steps, 'iteration'))
print("Trainer Done!")
trainer.run()
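
For reference, the diffusion optimizer built above pairs AdamW with StepDecay(learning_rate=0.001, gamma=0.5, step_size=25000) and global-norm gradient clipping at ds_grad_norm=1. A quick sketch of the resulting learning-rate curve, using plain arithmetic and assuming the scheduler is stepped once per training iteration (an assumption about the loop, not shown in this diff):

```python
def ds_lr(iteration, base_lr=0.001, gamma=0.5, step_size=25000):
    # Effective diffusion learning rate after `iteration` scheduler steps.
    return base_lr * gamma ** (iteration // step_size)

for it in (0, 25000, 50000, 75000):
    print(it, ds_lr(it))  # 0.001, 0.0005, 0.00025, 0.000125
```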

@ -23,6 +23,7 @@ from typing import Optional
import numpy as np
import onnxruntime as ort
import paddle
import yaml
from paddle import inference
from paddle import jit
from paddle.io import DataLoader
@ -59,6 +60,7 @@ model_alias = {
"paddlespeech.t2s.models.diffsinger:DiffSinger",
"diffsinger_inference":
"paddlespeech.t2s.models.diffsinger:DiffSingerInference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
@ -147,6 +149,8 @@ def get_test_dataset(test_metadata: List[Dict[str, Any]],
print("single speaker fastspeech2!")
elif am_name == 'diffsinger':
fields = ["utt_id", "text", "note", "note_dur", "is_slur"]
elif am_name == 'fastspeech2midi':
fields = ["utt_id", "text", "note", "note_dur", "is_slur"]
elif am_name == 'speedyspeech':
fields = ["utt_id", "phones", "tones"]
elif am_name == 'tacotron2':
@ -353,9 +357,14 @@ def get_am_inference(am: str='fastspeech2_csmsc',
if am_name == 'fastspeech2':
am = am_class(
idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"])
if am_name == 'diffsinger':
elif am_name == 'diffsinger':
am_config["fs2_model"]["idim"] = vocab_size
am_config["fs2_model"]["odim"] = am_config.n_mels
am_config["fs2_model"]["spk_num"] = spk_num
am = am_class(
idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"])
fs2_config=am_config["fs2_model"],
denoiser_config=am_config["denoiser_model"],
diffusion_config=am_config["diffusion"])
elif am_name == 'speedyspeech':
am = am_class(
vocab_size=vocab_size,
@ -366,8 +375,6 @@ def get_am_inference(am: str='fastspeech2_csmsc',
am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
elif am_name == 'erniesat':
am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
else:
print("wrong am, please input right am!!!")
am.set_state_dict(paddle.load(am_ckpt)["main_params"])
am.eval()

@ -112,8 +112,14 @@ def evaluate(args):
note = paddle.to_tensor(datum["note"])
note_dur = paddle.to_tensor(datum["note_dur"])
is_slur = paddle.to_tensor(datum["is_slur"])
mel = am_inference(phone_ids, note=note, note_dur=note_dur, is_slur=is_slur)
# vocoder
get_mel_fs2 = False
# mel: [T, mel_bin]
mel = am_inference(
phone_ids,
note=note,
note_dur=note_dur,
is_slur=is_slur,
get_mel_fs2=get_mel_fs2)
wav = voc_inference(mel)
wav = wav.numpy()
@ -140,10 +146,16 @@ def parse_args():
type=str,
default='fastspeech2_csmsc',
choices=[
'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech',
'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc',
'tacotron2_ljspeech', 'tacotron2_aishell3', 'fastspeech2_mix',
"diffsinger_opencpop"
'speedyspeech_csmsc',
'fastspeech2_csmsc',
'fastspeech2_ljspeech',
'fastspeech2_aishell3',
'fastspeech2_vctk',
'tacotron2_csmsc',
'tacotron2_ljspeech',
'tacotron2_aishell3',
'fastspeech2_mix',
"diffsinger_opencpop",
],
help='Choose acoustic model type of tts task.')
parser.add_argument(
@ -176,10 +188,19 @@ def parse_args():
type=str,
default='pwgan_csmsc',
choices=[
'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk',
'mb_melgan_csmsc', 'wavernn_csmsc', 'hifigan_csmsc',
'hifigan_ljspeech', 'hifigan_aishell3', 'hifigan_vctk',
'style_melgan_csmsc', "pwgan_opencpop",
'pwgan_csmsc',
'pwgan_ljspeech',
'pwgan_aishell3',
'pwgan_vctk',
'mb_melgan_csmsc',
'wavernn_csmsc',
'hifigan_csmsc',
'hifigan_ljspeech',
'hifigan_aishell3',
'hifigan_vctk',
'style_melgan_csmsc',
"pwgan_opencpop",
"hifigan_opencpop",
],
help='Choose vocoder type of tts task.')
parser.add_argument(

@ -25,8 +25,11 @@ import paddle.nn.functional as F
from paddle import nn
from typeguard import check_argument_types
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.modules.adversarial_loss.gradient_reversal import GradientReversalLayer
from paddlespeech.t2s.modules.adversarial_loss.speaker_classifier import SpeakerClassifier
from paddlespeech.t2s.modules.diffusion import GaussianDiffusion
from paddlespeech.t2s.modules.diffusion import WaveNetDenoiser
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
from paddlespeech.t2s.modules.nets_utils import make_pad_mask
@ -41,22 +44,13 @@ from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder
from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder
class DiffSinger(nn.Layer):
"""DiffSinger module.
This is a module of DiffSinger described in `DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism`._
.. _`DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism`:
https://arxiv.org/pdf/2105.02446.pdf
Args:
Returns:
class FastSpeech2MIDI(FastSpeech2):
"""The Fastspeech2 module of DiffSinger.
"""
def __init__(
self,
# network structure related
# fastspeech2 network structure related
idim: int,
odim: int,
adim: int=384,
@ -133,12 +127,8 @@ class DiffSinger(nn.Layer):
tone_embed_integration_type: str="add",
# note emb
note_num: int=300,
# note_embed_dim: int=384,
note_embed_integration_type: str="add",
# is_slur emb
is_slur_num: int=2,
# is_slur_embed_dim: int=384,
is_slur_embed_integration_type: str="add",
# training related
init_type: str="xavier_uniform",
init_enc_alpha: float=1.0,
@ -146,7 +136,7 @@ class DiffSinger(nn.Layer):
# speaker classifier
enable_speaker_classifier: bool=False,
hidden_sc_dim: int=256, ):
"""Initialize DiffSinger module.
"""Initialize FastSpeech2 module for svs.
Args:
idim (int):
Dimension of the inputs.
@ -252,7 +242,7 @@ class DiffSinger(nn.Layer):
Kernel size of energy embedding.
energy_embed_dropout_rate (float):
Dropout rate for energy embedding.
stop_gradient_from_energy_predictorbool):
stop_gradient_from_energy_predictor (bool):
Whether to stop gradient from energy predictor to encoder.
spk_num (Optional[int]):
Number of speakers. If not None, assume that the spk_embed_dim is not None,
@ -271,7 +261,7 @@ class DiffSinger(nn.Layer):
How to integrate tone embedding.
init_type (str):
How to initialize transformer parameters.
init_enc_alpha float):
init_enc_alpha (float):
Initial value of alpha in scaled pos encoding of the encoder.
init_dec_alpha (float):
Initial value of alpha in scaled pos encoding of the decoder.
@ -279,10 +269,16 @@ class DiffSinger(nn.Layer):
Whether to use speaker classifier module
hidden_sc_dim (int):
The hidden layer dim of speaker classifier
note_num (Optional[int]):
Number of notes. If not None, assume that the
note_ids will be provided as the input and use note_embedding_table.
is_slur_num (Optional[int]):
Number of slur classes. If not None, assume that the
is_slur_ids will be provided as the input.
"""
assert check_argument_types()
super().__init__()
super().__init__(idim, odim)
# store hyperparameters
self.odim = odim
@ -306,12 +302,9 @@ class DiffSinger(nn.Layer):
self.note_embed_dim = adim
if self.note_embed_dim is not None:
self.note_embed_integration_type = note_embed_integration_type
self.note_dur_layer = nn.Linear(1, self.note_embed_dim)
self.is_slur_embed_dim = adim
if self.is_slur_embed_dim is not None:
self.is_slur_embed_integration_type = is_slur_embed_integration_type
# use idx 0 as padding idx
self.padding_idx = 0
@ -627,6 +620,7 @@ class DiffSinger(nn.Layer):
ps: paddle.Tensor=None,
es: paddle.Tensor=None,
is_inference: bool=False,
is_train_diffusion: bool=False,
return_after_enc=False,
alpha: float=1.0,
spk_emb=None,
@ -639,7 +633,12 @@ class DiffSinger(nn.Layer):
is_slur_emb = self.is_slur_embedding_table(is_slur)
# (B, Tmax, adim)
hs, _ = self.encoder(xs, x_masks, note_emb, note_dur_emb, is_slur_emb,)
hs, _ = self.encoder(
xs,
x_masks,
note_emb,
note_dur_emb,
is_slur_emb, )
if self.spk_num and self.enable_speaker_classifier and not is_inference:
hs_for_spk_cls = self.grad_reverse(hs)
@ -668,12 +667,24 @@ class DiffSinger(nn.Layer):
else:
pitch_masks = None
if is_inference:
# get decoder input (conditioning) for diffusion training
if is_train_diffusion:
hs = self.length_regulator(hs, ds, is_inference=False)
p_outs = self.pitch_predictor(hs.detach(), pitch_masks)
e_outs = self.energy_predictor(hs.detach(), pitch_masks)
p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1))
e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1))
hs = hs + e_embs + p_embs
elif is_inference:
# (B, Tmax)
if ds is not None:
d_outs = ds
else:
d_outs = self.duration_predictor.inference(hs, d_masks)
# (B, Lmax, adim)
hs = self.length_regulator(hs, d_outs, alpha, is_inference=True)
@ -699,6 +710,7 @@ class DiffSinger(nn.Layer):
(0, 2, 1))
hs = hs + e_embs + p_embs
# training
else:
d_outs = self.duration_predictor(hs, d_masks)
# (B, Lmax, adim)
@ -717,7 +729,6 @@ class DiffSinger(nn.Layer):
(0, 2, 1))
hs = hs + e_embs + p_embs
# forward decoder
if olens is not None and not is_inference:
if self.reduction_factor > 1:
@ -750,7 +761,7 @@ class DiffSinger(nn.Layer):
else:
after_outs = before_outs + self.postnet(
before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
return before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits
def encoder_infer(
@ -764,6 +775,7 @@ class DiffSinger(nn.Layer):
spk_id=None,
tone_id=None,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
# input of embedding must be int64
x = paddle.cast(text, 'int64')
note = paddle.cast(note, 'int64')
@ -785,7 +797,7 @@ class DiffSinger(nn.Layer):
# (1, L, odim)
# use *_ to avoid bug in dygraph to static graph
hs, *_ = self._forward(
hs, _ = self._forward(
xs,
note,
note_dur,
@ -799,6 +811,55 @@ class DiffSinger(nn.Layer):
tone_id=tone_id)
return hs
# for diffusion
def encoder_infer_batch(
self,
text: paddle.Tensor,
note: paddle.Tensor,
note_dur: paddle.Tensor,
is_slur: paddle.Tensor,
text_lengths: paddle.Tensor,
speech_lengths: paddle.Tensor,
ds: paddle.Tensor=None,
ps: paddle.Tensor=None,
es: paddle.Tensor=None,
alpha: float=1.0,
spk_emb=None,
spk_id=None,
tone_id=None, ) -> Tuple[paddle.Tensor, paddle.Tensor]:
# input of embedding must be int64
xs = paddle.cast(text, 'int64')
note = paddle.cast(note, 'int64')
note_dur = paddle.cast(note_dur, 'float32')
is_slur = paddle.cast(is_slur, 'int64')
if spk_emb is not None:
spk_emb = spk_emb.unsqueeze(0)
if tone_id is not None:
tone_id = tone_id.unsqueeze(0)
# (1, L, odim)
# use *_ to avoid bug in dygraph to static graph
hs, h_masks = self._forward(
xs,
note,
note_dur,
is_slur,
ilens=text_lengths,
olens=speech_lengths,
ds=ds,
ps=ps,
es=es,
return_after_enc=True,
is_train_diffusion=True,
alpha=alpha,
spk_emb=spk_emb,
spk_id=spk_id,
tone_id=tone_id)
return hs, h_masks
def inference(
self,
text: paddle.Tensor,
@ -896,112 +957,8 @@ class DiffSinger(nn.Layer):
return outs[0], d_outs[0], p_outs[0], e_outs[0]
def _integrate_with_spk_embed(self, hs, spk_emb):
"""Integrate speaker embedding with hidden states.
Args:
hs(Tensor):
Batch of hidden state sequences (B, Tmax, adim).
spk_emb(Tensor):
Batch of speaker embeddings (B, spk_embed_dim).
Returns:
"""
if self.spk_embed_integration_type == "add":
# apply projection and then add to hidden states
spk_emb = self.spk_projection(F.normalize(spk_emb))
hs = hs + spk_emb.unsqueeze(1)
elif self.spk_embed_integration_type == "concat":
# concat hidden states with spk embeds and then apply projection
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
shape=[-1, paddle.shape(hs)[1], -1])
hs = self.spk_projection(paddle.concat([hs, spk_emb], axis=-1))
else:
raise NotImplementedError("support only add or concat.")
return hs
def _integrate_with_tone_embed(self, hs, tone_embs):
"""Integrate speaker embedding with hidden states.
Args:
hs(Tensor):
Batch of hidden state sequences (B, Tmax, adim).
tone_embs(Tensor):
Batch of speaker embeddings (B, Tmax, tone_embed_dim).
Returns:
"""
if self.tone_embed_integration_type == "add":
# apply projection and then add to hidden states
tone_embs = self.tone_projection(F.normalize(tone_embs))
hs = hs + tone_embs
elif self.tone_embed_integration_type == "concat":
# concat hidden states with tone embeds and then apply projection
tone_embs = F.normalize(tone_embs).expand(
shape=[-1, hs.shape[1], -1])
hs = self.tone_projection(paddle.concat([hs, tone_embs], axis=-1))
else:
raise NotImplementedError("support only add or concat.")
return hs
def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
"""Make masks for self-attention.
Args:
ilens(Tensor):
Batch of lengths (B,).
Returns:
Tensor:
Mask tensor for self-attention. dtype=paddle.bool
Examples:
>>> ilens = [5, 3]
>>> self._source_mask(ilens)
tensor([[[1, 1, 1, 1, 1],
[1, 1, 1, 0, 0]]]) bool
"""
x_masks = make_non_pad_mask(ilens)
return x_masks.unsqueeze(-2)
def _reset_parameters(self, init_enc_alpha: float, init_dec_alpha: float):
# initialize alpha in scaled positional encoding
if self.encoder_type == "transformer" and self.use_scaled_pos_enc:
init_enc_alpha = paddle.to_tensor(init_enc_alpha)
self.encoder.embed[-1].alpha = paddle.create_parameter(
shape=init_enc_alpha.shape,
dtype=str(init_enc_alpha.numpy().dtype),
default_initializer=paddle.nn.initializer.Assign(
init_enc_alpha))
if self.decoder_type == "transformer" and self.use_scaled_pos_enc:
init_dec_alpha = paddle.to_tensor(init_dec_alpha)
self.decoder.embed[-1].alpha = paddle.create_parameter(
shape=init_dec_alpha.shape,
dtype=str(init_dec_alpha.numpy().dtype),
default_initializer=paddle.nn.initializer.Assign(
init_dec_alpha))
class DiffSingerInference(nn.Layer):
def __init__(self, normalizer, model):
super().__init__()
self.normalizer = normalizer
self.acoustic_model = model
def forward(self, text, note, note_dur, is_slur, spk_id=None, spk_emb=None):
normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
text, note=note, note_dur=note_dur, is_slur=is_slur, spk_id=spk_id, spk_emb=spk_emb)
logmel = self.normalizer.inverse(normalized_mel)
return logmel
class DiffSingerLoss(nn.Layer):
class FastSpeech2MIDILoss(nn.Layer):
"""Loss function module for DiffSinger."""
def __init__(self, use_masking: bool=True,
@ -1152,3 +1109,178 @@ class DiffSingerLoss(nn.Layer):
pitch_masks.broadcast_to(energy_loss.shape)).sum()
return l1_loss, duration_loss, pitch_loss, energy_loss, speaker_loss
class DiffusionLoss(nn.Layer):
"""Loss function module for DiffSinger."""
def __init__(self, use_masking: bool=True,
use_weighted_masking: bool=False):
"""Initialize feed-forward Transformer loss module.
Args:
use_masking (bool):
Whether to apply masking for padded part in loss calculation.
use_weighted_masking (bool):
Whether to weighted masking in loss calculation.
"""
assert check_argument_types()
super().__init__()
assert (use_masking != use_weighted_masking) or not use_masking
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
# define criterions
reduction = "none" if self.use_weighted_masking else "mean"
self.l1_criterion = nn.L1Loss(reduction=reduction)
def forward(
self,
ref_mels: paddle.Tensor,
out_mels: paddle.Tensor,
mel_masks: paddle.Tensor, ) -> paddle.Tensor:
"""Calculate forward propagation.
Args:
ref_mels(Tensor):
Batch of real mel (B, Lmax, odim).
out_mels(Tensor):
Batch of outputs mel (B, Lmax, odim).
mel_masks(Tensor):
Batch of mask of real mel (B, Lmax, 1).
Returns:
Tensor: L1 loss between out_mels and ref_mels.
"""
# apply mask to remove padded part
if self.use_masking:
out_mels = out_mels.masked_select(
mel_masks.broadcast_to(out_mels.shape))
ref_mels = ref_mels.masked_select(
mel_masks.broadcast_to(ref_mels.shape))
# calculate loss
l1_loss = self.l1_criterion(out_mels, ref_mels)
# make weighted mask and apply it
if self.use_weighted_masking:
mel_masks = mel_masks.unsqueeze(-1)
out_weights = mel_masks.cast(dtype=paddle.float32) / mel_masks.cast(
dtype=paddle.float32).sum(
axis=1, keepdim=True)
out_weights /= ref_mels.shape[0] * ref_mels.shape[2]
# apply weight
l1_loss = l1_loss.multiply(out_weights)
l1_loss = l1_loss.masked_select(
mel_masks.broadcast_to(l1_loss.shape)).sum()
return l1_loss
class DiffSinger(nn.Layer):
"""DiffSinger module.
This is a module of DiffSinger described in `DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism`._
.. _`DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism`:
https://arxiv.org/pdf/2105.02446.pdf
Args:
fs2_config (dict): Config of the FastSpeech2MIDI module.
denoiser_config (dict): Config of the WaveNetDenoiser module.
diffusion_config (dict): Config of the GaussianDiffusion module.
"""
def __init__(
self,
fs2_config,
denoiser_config,
diffusion_config, ):
assert check_argument_types()
super().__init__()
self.fs2 = FastSpeech2MIDI(**fs2_config)
denoiser = WaveNetDenoiser(**denoiser_config)
self.diffusion = GaussianDiffusion(denoiser, **diffusion_config)
def forward(
self,
text: paddle.Tensor,
note: paddle.Tensor,
note_dur: paddle.Tensor,
is_slur: paddle.Tensor,
text_lengths: paddle.Tensor,
speech: paddle.Tensor,
speech_lengths: paddle.Tensor,
durations: paddle.Tensor,
pitch: paddle.Tensor,
energy: paddle.Tensor,
tone_id: paddle.Tensor=None,
spk_emb: paddle.Tensor=None,
spk_id: paddle.Tensor=None,
train_fs2: bool=True,
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.fs2(
text=text,
note=note,
note_dur=note_dur,
is_slur=is_slur,
text_lengths=text_lengths,
speech=speech,
speech_lengths=speech_lengths,
durations=durations,
pitch=pitch,
energy=energy,
spk_id=spk_id,
spk_emb=spk_emb)
cond_fs2, mel_masks = self.fs2.encoder_infer_batch(
text=text,
note=note,
note_dur=note_dur,
is_slur=is_slur,
text_lengths=text_lengths,
speech_lengths=speech_lengths,
ds=durations,
ps=pitch,
es=energy)
cond_fs2 = cond_fs2.transpose((0, 2, 1))
mel = self.diffusion(speech.transpose((0, 2, 1)), cond_fs2.detach())
if train_fs2:
return before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits
else:
return mel[0], mel_masks
def inference(
self,
text: paddle.Tensor,
note: paddle.Tensor,
note_dur: paddle.Tensor,
is_slur: paddle.Tensor,
get_mel_fs2: bool=False, ):
mel_fs2, _, _, _ = self.fs2.inference(text, note, note_dur, is_slur)
if get_mel_fs2:
return mel_fs2
mel_fs2 = mel_fs2.unsqueeze(0).transpose((0, 2, 1))
cond_fs2 = self.fs2.encoder_infer(text, note, note_dur, is_slur)
cond_fs2 = cond_fs2.transpose((0, 2, 1))
mel, _ = self.diffusion(mel_fs2, cond_fs2)
mel = mel.transpose((0, 2, 1))
return mel[0]
class DiffSingerInference(nn.Layer):
def __init__(self, normalizer, model):
super().__init__()
self.normalizer = normalizer
self.acoustic_model = model
def forward(self, text, note, note_dur, is_slur, get_mel_fs2: bool=False):
normalized_mel = self.acoustic_model.inference(
text,
note=note,
note_dur=note_dur,
is_slur=is_slur,
get_mel_fs2=get_mel_fs2)
logmel = self.normalizer.inverse(normalized_mel)
return logmel
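
For orientation, DiffSinger.inference above does not start the reverse diffusion from pure noise: the coarse FastSpeech2 mel (mel_fs2) is noised to an intermediate step and only num_max_timesteps=60 of the 100 steps are denoised, conditioned on the encoder output. A toy, self-contained sketch of that shallow-diffusion refinement (shallow_refine and denoise_step are illustrative stand-ins, not the GaussianDiffusion API used in the code):

```python
import numpy as np

def shallow_refine(mel_fs2, cond, denoise_step, betas, K=60, rng=np.random):
    # Corrupt the coarse FastSpeech2 mel to forward step K, then run only K
    # reverse steps instead of the full T=100.
    alphas_cumprod = np.cumprod(1.0 - betas)
    a_K = alphas_cumprod[K - 1]
    x = np.sqrt(a_K) * mel_fs2 + np.sqrt(1.0 - a_K) * rng.standard_normal(mel_fs2.shape)
    for t in reversed(range(K)):
        # `denoise_step` stands in for one reverse-diffusion update of the
        # WaveNet denoiser; it is a placeholder, not the paddlespeech API.
        x = denoise_step(x, t, cond)
    return x
```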

@ -13,17 +13,19 @@
# limitations under the License.
import logging
from pathlib import Path
from typing import Dict
import paddle
from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddlespeech.t2s.models.diffsinger import DiffSingerLoss
from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
from paddlespeech.t2s.training.reporter import report
from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
from paddlespeech.t2s.training.updaters.standard_updater import UpdaterState
logging.basicConfig(
format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
@ -36,27 +38,36 @@ class DiffSingerUpdater(StandardUpdater):
def __init__(
self,
model: Layer,
optimizer: Optimizer,
optimizers: Dict[str, Optimizer],
criterions: Dict[str, Layer],
dataloader: DataLoader,
init_state=None,
use_masking: bool=False,
spk_loss_scale: float=0.02,
use_weighted_masking: bool=False,
output_dir: Path=None,
enable_spk_cls: bool=False, ):
super().__init__(model, optimizer, dataloader, init_state=None)
self.criterion = DiffSingerLoss(
use_masking=use_masking,
use_weighted_masking=use_weighted_masking, )
fs2_train_start_steps: int=0,
ds_train_start_steps: int=160000,
output_dir: Path=None, ):
super().__init__(model, optimizers, dataloader, init_state=None)
self.optimizers = optimizers
self.optimizer_fs2: Optimizer = optimizers['fs2']
self.optimizer_ds: Optimizer = optimizers['ds']
self.criterions = criterions
self.criterion_fs2 = criterions['fs2']
self.criterion_ds = criterions['ds']
self.dataloader = dataloader
self.fs2_train_start_steps = fs2_train_start_steps
self.ds_train_start_steps = ds_train_start_steps
self.state = UpdaterState(iteration=0, epoch=0)
self.train_iterator = iter(self.dataloader)
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
logger.addHandler(self.filehandler)
self.logger = logger
self.msg = ""
self.spk_loss_scale = spk_loss_scale
self.enable_spk_cls = enable_spk_cls
def update_core(self, batch):
self.msg = "Rank: {}, ".format(dist.get_rank())
@ -68,24 +79,8 @@ class DiffSingerUpdater(StandardUpdater):
if spk_emb is not None:
spk_id = None
if type(
self.model
) == DataParallel and self.model._layers.spk_num and self.model._layers.enable_speaker_classifier:
with self.model.no_sync():
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model(
text=batch["text"],
note=batch["note"],
note_dur=batch["note_dur"],
is_slur=batch["is_slur"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
speech_lengths=batch["speech_lengths"],
durations=batch["durations"],
pitch=batch["pitch"],
energy=batch["energy"],
spk_id=spk_id,
spk_emb=spk_emb)
else:
# fastspeech2
if self.state.iteration > self.fs2_train_start_steps and self.state.iteration < self.ds_train_start_steps:
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model(
text=batch["text"],
note=batch["note"],
@ -98,74 +93,109 @@ class DiffSingerUpdater(StandardUpdater):
pitch=batch["pitch"],
energy=batch["energy"],
spk_id=spk_id,
spk_emb=spk_emb)
l1_loss, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion(
after_outs=after_outs,
before_outs=before_outs,
d_outs=d_outs,
p_outs=p_outs,
e_outs=e_outs,
ys=ys,
ds=batch["durations"],
ps=batch["pitch"],
es=batch["energy"],
ilens=batch["text_lengths"],
olens=olens,
spk_logits=spk_logits,
spk_ids=spk_id, )
scaled_speaker_loss = self.spk_loss_scale * speaker_loss
loss = l1_loss + duration_loss + pitch_loss + energy_loss + scaled_speaker_loss
optimizer = self.optimizer
optimizer.clear_grad()
loss.backward()
optimizer.step()
report("train/loss", float(loss))
report("train/l1_loss", float(l1_loss))
report("train/duration_loss", float(duration_loss))
report("train/pitch_loss", float(pitch_loss))
report("train/energy_loss", float(energy_loss))
if self.enable_spk_cls:
report("train/speaker_loss", float(speaker_loss))
report("train/scaled_speaker_loss", float(scaled_speaker_loss))
losses_dict["l1_loss"] = float(l1_loss)
losses_dict["duration_loss"] = float(duration_loss)
losses_dict["pitch_loss"] = float(pitch_loss)
losses_dict["energy_loss"] = float(energy_loss)
losses_dict["energy_loss"] = float(energy_loss)
if self.enable_spk_cls:
losses_dict["speaker_loss"] = float(speaker_loss)
losses_dict["scaled_speaker_loss"] = float(scaled_speaker_loss)
losses_dict["loss"] = float(loss)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
spk_emb=spk_emb,
train_fs2=True, )
l1_loss_fs2, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion_fs2(
after_outs=after_outs,
before_outs=before_outs,
d_outs=d_outs,
p_outs=p_outs,
e_outs=e_outs,
ys=ys,
ds=batch["durations"],
ps=batch["pitch"],
es=batch["energy"],
ilens=batch["text_lengths"],
olens=olens,
spk_logits=spk_logits,
spk_ids=spk_id, )
loss_fs2 = l1_loss_fs2 + duration_loss + pitch_loss + energy_loss
self.optimizer_fs2.clear_grad()
loss_fs2.backward()
self.optimizer_fs2.step()
report("train/loss_fs2", float(loss_fs2))
report("train/l1_loss_fs2", float(l1_loss_fs2))
report("train/duration_loss", float(duration_loss))
report("train/pitch_loss", float(pitch_loss))
report("train/energy_loss", float(energy_loss))
losses_dict["l1_loss_fs2"] = float(l1_loss_fs2)
losses_dict["duration_loss"] = float(duration_loss)
losses_dict["pitch_loss"] = float(pitch_loss)
losses_dict["energy_loss"] = float(energy_loss)
losses_dict["energy_loss"] = float(energy_loss)
losses_dict["loss_fs2"] = float(loss_fs2)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
if self.state.iteration > self.ds_train_start_steps:
for param in self.model._layers.fs2.parameters():
param.trainable = False
mel, mel_masks = self.model(
text=batch["text"],
note=batch["note"],
note_dur=batch["note_dur"],
is_slur=batch["is_slur"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
speech_lengths=batch["speech_lengths"],
durations=batch["durations"],
pitch=batch["pitch"],
energy=batch["energy"],
spk_id=spk_id,
spk_emb=spk_emb,
train_fs2=False, )
mel = mel.transpose((0, 2, 1))
mel_masks = mel_masks.transpose((0, 2, 1))
l1_loss_ds = self.criterion_ds(
ref_mels=batch["speech"],
out_mels=mel,
mel_masks=mel_masks, )
loss_ds = l1_loss_ds
self.optimizer_ds.clear_grad()
loss_ds.backward()
self.optimizer_ds.step()
report("train/loss_ds", float(loss_ds))
report("train/l1_loss_ds", float(l1_loss_ds))
losses_dict["l1_loss_ds"] = float(l1_loss_ds)
losses_dict["loss_ds"] = float(loss_ds)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
self.logger.info(self.msg)
class DiffSingerEvaluator(StandardEvaluator):
def __init__(self,
model: Layer,
dataloader: DataLoader,
use_masking: bool=False,
use_weighted_masking: bool=False,
spk_loss_scale: float=0.02,
output_dir: Path=None,
enable_spk_cls: bool=False):
def __init__(
self,
model: Layer,
criterions: Dict[str, Layer],
dataloader: DataLoader,
output_dir: Path=None, ):
super().__init__(model, dataloader)
self.model = model
self.criterions = criterions
self.criterion_fs2 = criterions['fs2']
self.criterion_ds = criterions['ds']
self.dataloader = dataloader
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
logger.addHandler(self.filehandler)
self.logger = logger
self.msg = ""
self.spk_loss_scale = spk_loss_scale
self.enable_spk_cls = enable_spk_cls
self.criterion = DiffSingerLoss(
use_masking=use_masking, use_weighted_masking=use_weighted_masking)
def evaluate_core(self, batch):
self.msg = "Evaluate: "
@ -176,73 +206,35 @@ class DiffSingerEvaluator(StandardEvaluator):
if spk_emb is not None:
spk_id = None
if type(
self.model
) == DataParallel and self.model._layers.spk_num and self.model._layers.enable_speaker_classifier:
with self.model.no_sync():
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model(
text=batch["text"],
note=batch["note"],
note_dur=batch["note_dur"],
is_slur=batch["is_slur"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
speech_lengths=batch["speech_lengths"],
durations=batch["durations"],
pitch=batch["pitch"],
energy=batch["energy"],
spk_id=spk_id,
spk_emb=spk_emb)
else:
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model(
text=batch["text"],
note=batch["note"],
note_dur=batch["note_dur"],
is_slur=batch["is_slur"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
speech_lengths=batch["speech_lengths"],
durations=batch["durations"],
pitch=batch["pitch"],
energy=batch["energy"],
spk_id=spk_id,
spk_emb=spk_emb)
l1_loss, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion(
after_outs=after_outs,
before_outs=before_outs,
d_outs=d_outs,
p_outs=p_outs,
e_outs=e_outs,
ys=ys,
ds=batch["durations"],
ps=batch["pitch"],
es=batch["energy"],
ilens=batch["text_lengths"],
olens=olens,
spk_logits=spk_logits,
spk_ids=spk_id, )
scaled_speaker_loss = self.spk_loss_scale * speaker_loss
loss = l1_loss + duration_loss + pitch_loss + energy_loss + scaled_speaker_loss
report("eval/loss", float(loss))
report("eval/l1_loss", float(l1_loss))
report("eval/duration_loss", float(duration_loss))
report("eval/pitch_loss", float(pitch_loss))
report("eval/energy_loss", float(energy_loss))
if self.enable_spk_cls:
report("train/speaker_loss", float(speaker_loss))
report("train/scaled_speaker_loss", float(scaled_speaker_loss))
losses_dict["l1_loss"] = float(l1_loss)
losses_dict["duration_loss"] = float(duration_loss)
losses_dict["pitch_loss"] = float(pitch_loss)
losses_dict["energy_loss"] = float(energy_loss)
if self.enable_spk_cls:
losses_dict["speaker_loss"] = float(speaker_loss)
losses_dict["scaled_speaker_loss"] = float(scaled_speaker_loss)
losses_dict["loss"] = float(loss)
mel, mel_masks = self.model(
text=batch["text"],
note=batch["note"],
note_dur=batch["note_dur"],
is_slur=batch["is_slur"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
speech_lengths=batch["speech_lengths"],
durations=batch["durations"],
pitch=batch["pitch"],
energy=batch["energy"],
spk_id=spk_id,
spk_emb=spk_emb,
train_fs2=False, )
mel = mel.transpose((0, 2, 1))
mel_masks = mel_masks.transpose((0, 2, 1))
l1_loss_ds = self.criterion_ds(
ref_mels=batch["speech"],
out_mels=mel,
mel_masks=mel_masks, )
loss_ds = l1_loss_ds
report("train/loss_ds", float(loss_ds))
report("train/l1_loss_ds", float(l1_loss_ds))
losses_dict["l1_loss_ds"] = float(l1_loss_ds)
losses_dict["loss_ds"] = float(loss_ds)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
self.logger.info(self.msg)
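
Reading DiffSingerUpdater.update_core together with the interval settings: iterations below ds_train_start_steps=80000 update only the FastSpeech2MIDI branch, and iterations above it freeze fs2 and update only the diffusion denoiser, up to train_max_steps=160000. A tiny helper restating that branching (derived from the conditions above, not code from the repo):

```python
def training_stage(iteration, fs2_start=0, ds_start=80000):
    # Mirrors the two `if` branches in DiffSingerUpdater.update_core.
    if fs2_start < iteration < ds_start:
        return "fs2"   # only the FastSpeech2MIDI branch is updated
    if iteration > ds_start:
        return "ds"    # fs2 parameters are frozen; only the denoiser trains
    return "none"      # boundary iterations satisfy neither condition

assert training_stage(1_000) == "fs2"
assert training_stage(120_000) == "ds"
```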
