diffsinger_tmp

pull/3005/head
liangym 3 years ago
parent c91dc02931
commit 84a22ffb93

@ -23,7 +23,7 @@ f0max: 750 # Maximum f0 for pitch extraction.
###########################################################
# DATA SETTING #
###########################################################
batch_size: 32
batch_size: 48
num_workers: 1
@ -37,33 +37,32 @@ model:
# fastspeech2 module
fastspeech2_params:
adim: 256 # attention dimension
aheads: 2 # number of attention heads
elayers: 4 # number of encoder layers
eunits: 1536 # number of encoder ff units
dlayers: 4 # number of decoder layers
dunits: 1536 # number of decoder ff units
positionwise_layer_type: conv1d # type of position-wise layer
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_scaled_pos_enc: True # whether to use scaled positional encoding
adim: 256 # attention dimension # lym check
aheads: 2 # number of attention heads # lym check
elayers: 4 # number of encoder layers # lym check
eunits: 1024 # number of encoder ff units # lym check adim * 4
dlayers: 4 # number of decoder layers # lym check
dunits: 1024 # number of decoder ff units # lym check
positionwise_layer_type: conv1d-linear # type of position-wise layer # lym check
positionwise_conv_kernel_size: 9 # kernel size of position wise conv layer # lym check
transformer_enc_dropout_rate: 0.1 # dropout rate for transformer encoder layer # lym check
transformer_enc_positional_dropout_rate: 0.1 # dropout rate for transformer encoder positional encoding # lym check
transformer_enc_attn_dropout_rate: 0.0 # dropout rate for transformer encoder attention layer # lym check
transformer_activation_type: "gelu" # activation function type in transformer
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
reduction_factor: 1 # reduction factor
init_type: xavier_uniform # initialization type
init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding
init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
use_scaled_pos_enc: True # whether to use scaled positional encoding
transformer_dec_dropout_rate: 0.1 # dropout rate for transformer decoder layer
transformer_dec_positional_dropout_rate: 0.1 # dropout rate for transformer decoder positional encoding
transformer_dec_attn_dropout_rate: 0.0 # dropout rate for transformer decoder attention layer
duration_predictor_layers: 5 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
duration_predictor_dropout_rate: 0.5 # dropout rate in duration predictor
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
@ -71,6 +70,11 @@ model:
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
@ -131,19 +135,24 @@ ds_optimizer_params:
ds_scheduler_params:
learning_rate: 0.001
gamma: 0.5
step_size: 25000
step_size: 10000
ds_grad_norm: 1
###########################################################
# INTERVAL SETTING #
###########################################################
ds_train_start_steps: 160000 # Number of steps to start to train diffusion module.
train_max_steps: 320000 # Number of training steps.
save_interval_steps: 1000 # Interval steps to save checkpoint.
eval_interval_steps: 1000 # Interval steps to evaluate the network.
num_snapshots: 5
ds_train_start_steps: 32500 # Number of steps to start to train diffusion module.
train_max_steps: 65000 # Number of training steps.
save_interval_steps: 500 # Interval steps to save checkpoint.
eval_interval_steps: 500 # Interval steps to evaluate the network.
num_snapshots: 20
# ds_train_start_steps: 4 # Number of steps to start to train diffusion module.
# train_max_steps: 8 # Number of training steps.
# save_interval_steps: 1 # Interval steps to save checkpoint.
# eval_interval_steps: 2 # Interval steps to evaluate the network.
# num_snapshots: 5
###########################################################
# OTHER SETTING #

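For orientation, a minimal sketch (assuming the ds scheduler steps from iteration 0) of what the edited ds_scheduler_params imply: the diffusion optimizer's learning rate now halves every 10000 steps instead of every 25000.

def ds_lr(step: int, base_lr: float = 0.001, gamma: float = 0.5, step_size: int = 10000) -> float:
    # Plain step decay as configured by ds_scheduler_params above.
    return base_lr * gamma ** (step // step_size)

print(ds_lr(0))      # 0.001
print(ds_lr(32500))  # 0.000125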
@ -1,6 +1,6 @@
#!/bin/bash
stage=1
stage=0
stop_stage=100
config_path=$1

@ -2,7 +2,9 @@
config_path=$1
train_output_path=$2
ckpt_name=$3
#ckpt_name=$3
iter=$3
ckpt_name=snapshot_iter_${iter}.pdz
stage=0
stop_stage=0
@ -20,81 +22,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwgan_opencpop/snapshot_iter_100000.pdz \
--voc_stat=pwgan_opencpop/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--output_dir=${train_output_path}/test_${iter} \
--phones_dict=dump/phone_id_map.txt
fi
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
fi
# style melgan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
fi
# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in hifigan syn"
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
fi
# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "in wavernn syn"
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
fi

@ -8,5 +8,5 @@ python3 ${BIN_DIR}/train.py \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=1 \
--ngpu=4 \
--phones-dict=dump/phone_id_map.txt

@ -3,9 +3,10 @@
set -e
source path.sh
gpus=0
stage=0
stop_stage=100
gpus=4,5,6,7
#gpus=0
stage=1
stop_stage=1
conf_path=conf/default.yaml
train_output_path=exp/default

@ -105,7 +105,6 @@ class Pitch():
if (f0 == 0).all():
print("All frames seems to be unvoiced.")
return f0
# padding start and end of f0 sequence
start_f0 = f0[f0 != 0][0]
end_f0 = f0[f0 != 0][-1]

@ -101,7 +101,7 @@ def get_sentences_svs(
dataset (str): dataset name
Returns:
Dict: the information of each sentence, including [phone id (int)], [the frame of phone (int)], [note id (int)], [note duration (float)], [is slur (int)], text (str), speaker name (str)
tunple: speaker name
tuple: speaker name
'''
f = open(file_name, 'r')
sentence = {}
@ -115,7 +115,7 @@ def get_sentences_svs(
ph = line_list[2].split()
midi = note2midi(line_list[3].split())
midi_dur = line_list[4].split()
ph_dur = time2frame([float(t) for t in line_list[5].split()])
ph_dur = time2frame([float(t) for t in line_list[5].split()], sample_rate=sample_rate, n_shift=n_shift)
is_slur = line_list[6].split()
assert len(ph) == len(midi) == len(midi_dur) == len(is_slur)
sentence[utt] = (ph, [int(i) for i in ph_dur],

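The call above now threads sample_rate and n_shift through explicitly. As a hypothetical sketch only (the real time2frame lives elsewhere in paddlespeech.t2s and may differ), converting per-phone durations in seconds to frame counts would look roughly like:

def time2frame(times, sample_rate: int = 24000, n_shift: int = 128):
    # Hypothetical sketch: one frame per hop of n_shift samples at sample_rate Hz.
    # The sample_rate/n_shift defaults here are illustrative, not the project config.
    return [int(round(t * sample_rate / n_shift)) for t in times]

print(time2frame([0.32]))  # [60] at 24 kHz with a 128-sample hop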
@ -80,20 +80,27 @@ def main():
# restore scaler
speech_scaler = StandardScaler()
speech_scaler.mean_ = np.load(args.speech_stats)[0]
speech_scaler.scale_ = np.load(args.speech_stats)[1]
# speech_scaler.mean_ = np.load(args.speech_stats)[0]
# speech_scaler.scale_ = np.load(args.speech_stats)[1]
speech_scaler.mean_ = np.zeros(np.load(args.speech_stats)[0].shape, dtype="float32")
speech_scaler.scale_ = np.ones(np.load(args.speech_stats)[1].shape, dtype="float32")
speech_scaler.n_features_in_ = speech_scaler.mean_.shape[0]
pitch_scaler = StandardScaler()
pitch_scaler.mean_ = np.load(args.pitch_stats)[0]
pitch_scaler.scale_ = np.load(args.pitch_stats)[1]
# pitch_scaler.mean_ = np.load(args.pitch_stats)[0]
# pitch_scaler.scale_ = np.load(args.pitch_stats)[1]
pitch_scaler.mean_ = np.zeros(np.load(args.pitch_stats)[0].shape, dtype="float32")
pitch_scaler.scale_ = np.ones(np.load(args.pitch_stats)[1].shape, dtype="float32")
pitch_scaler.n_features_in_ = pitch_scaler.mean_.shape[0]
energy_scaler = StandardScaler()
energy_scaler.mean_ = np.load(args.energy_stats)[0]
energy_scaler.scale_ = np.load(args.energy_stats)[1]
# energy_scaler.mean_ = np.load(args.energy_stats)[0]
# energy_scaler.scale_ = np.load(args.energy_stats)[1]
energy_scaler.mean_ = np.zeros(np.load(args.energy_stats)[0].shape, dtype="float32")
energy_scaler.scale_ = np.ones(np.load(args.energy_stats)[1].shape, dtype="float32")
energy_scaler.n_features_in_ = energy_scaler.mean_.shape[0]
vocab_phones = {}
with open(args.phones_dict, 'rt') as f:
phn_id = [line.strip().split() for line in f.readlines()]
@ -111,6 +118,7 @@ def main():
for item in tqdm(dataset):
utt_id = item['utt_id']
print(utt_id)
speech = item['speech']
pitch = item['pitch']
energy = item['energy']

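The edit above keeps the scaler objects but swaps the loaded statistics for zero means and unit scales, which turns denormalization into a no-op. A minimal self-contained sketch of the same trick, assuming the scikit-learn StandardScaler these scripts use:

import numpy as np
from sklearn.preprocessing import StandardScaler

def identity_scaler(n_features: int) -> StandardScaler:
    # A "fitted" scaler whose transform and inverse_transform leave data unchanged.
    scaler = StandardScaler()
    scaler.mean_ = np.zeros(n_features, dtype="float32")
    scaler.scale_ = np.ones(n_features, dtype="float32")
    scaler.n_features_in_ = n_features
    return scaler

x = np.random.randn(4, 80).astype("float32")
assert np.allclose(identity_scaler(80).inverse_transform(x), x)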
@ -34,7 +34,6 @@ from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_
from paddlespeech.t2s.datasets.preprocess_utils import get_input_token
from paddlespeech.t2s.datasets.preprocess_utils import get_sentences_svs
from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
from paddlespeech.t2s.utils import str2bool
ALL_INITIALS = [
@ -106,6 +105,7 @@ def process_sentence(
pitch_dir = output_dir / "data_pitch"
pitch_dir.mkdir(parents=True, exist_ok=True)
pitch_path = pitch_dir / (utt_id + "_pitch.npy")
# print(pitch, pitch.shape)
np.save(pitch_path, pitch)
energy = energy_extractor.get_energy(wav)
assert energy.shape[0] == num_frames
@ -271,7 +271,6 @@ def main():
sample_rate=config.fs,
n_shift=config.n_shift, )
# merge_silence(sentences)
phone_id_map_path = dumpdir / "phone_id_map.txt"
speaker_id_map_path = dumpdir / "speaker_id_map.txt"
get_input_token(sentences, phone_id_map_path, args.dataset)

@ -43,9 +43,6 @@ from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.optimizer import build_optimizers
from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer
from paddlespeech.t2s.utils import str2bool
# from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss
def train_sp(args, config):

@ -120,8 +120,12 @@ def evaluate(args):
note_dur=note_dur,
is_slur=is_slur,
get_mel_fs2=get_mel_fs2)
# import numpy as np
# mel = np.load("/home/liangyunming/others_code/DiffSinger_lym/diffsinger_mel.npy")
# mel = paddle.to_tensor(mel)
wav = voc_inference(mel)
wav = wav.numpy()
N += wav.size
T += t.elapse
@ -131,8 +135,10 @@ def evaluate(args):
f"{utt_id}, mel: {mel.shape}, wave: {wav.size}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
)
sf.write(
# str(output_dir / ("xiaojiuwo_diffsinger" + ".wav")), wav, samplerate=am_config.fs)
str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
print(f"{utt_id} done!")
# break
print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")

@ -48,12 +48,12 @@ class DiffSinger(nn.Layer):
note_num: int=300,
is_slur_num: int=2,
fastspeech2_params: Dict[str, Any]={
"adim": 384,
"aheads": 4,
"elayers": 6,
"eunits": 1536,
"dlayers": 6,
"dunits": 1536,
"adim": 256,
"aheads": 2,
"elayers": 4,
"eunits": 1024,
"dlayers": 4,
"dunits": 1024,
"postnet_layers": 5,
"postnet_chans": 512,
"postnet_filts": 5,
@ -74,6 +74,7 @@ class DiffSinger(nn.Layer):
"transformer_dec_dropout_rate": 0.1,
"transformer_dec_positional_dropout_rate": 0.1,
"transformer_dec_attn_dropout_rate": 0.1,
"transformer_activation_type": "gelu",
# duration predictor
"duration_predictor_layers": 2,
"duration_predictor_chans": 384,
@ -149,7 +150,7 @@ class DiffSinger(nn.Layer):
self.fs2 = FastSpeech2MIDI(
idim=idim,
odim=odim,
fastspeech2_config=fastspeech2_params,
fastspeech2_params=fastspeech2_params,
note_num=note_num,
is_slur_num=is_slur_num)
denoiser = WaveNetDenoiser(**denoiser_params)
@ -260,7 +261,7 @@ class DiffSinger(nn.Layer):
Whether to get mel from fastspeech2 module.
Returns:
Tensor: the generated mel spectrogram.
"""
mel_fs2, _, _, _ = self.fs2.inference(text, note, note_dur, is_slur)
if get_mel_fs2:
@ -268,7 +269,9 @@ class DiffSinger(nn.Layer):
mel_fs2 = mel_fs2.unsqueeze(0).transpose((0, 2, 1))
cond_fs2 = self.fs2.encoder_infer(text, note, note_dur, is_slur)
cond_fs2 = cond_fs2.transpose((0, 2, 1))
mel, _ = self.diffusion(mel_fs2, cond_fs2)
# mel, _ = self.diffusion(mel_fs2, cond_fs2)
noise = paddle.randn(mel_fs2.shape)
mel = self.diffusion.inference(noise=noise, cond=cond_fs2, ref_x=mel_fs2, num_inference_steps=100)
mel = mel.transpose((0, 2, 1))
return mel[0]
@ -280,13 +283,32 @@ class DiffSingerInference(nn.Layer):
self.acoustic_model = model
def forward(self, text, note, note_dur, is_slur, get_mel_fs2: bool=False):
"""Calculate forward propagation.
Args:
text(Tensor(int64)):
Batch of padded token (phone) ids (B, Tmax).
note(Tensor(int64)):
Batch of padded note (element in music score) ids (B, Tmax).
note_dur(Tensor(float32)):
Batch of padded note durations in seconds (element in music score) (B, Tmax).
is_slur(Tensor(int64)):
Batch of padded slur (element in music score) ids (B, Tmax).
get_mel_fs2 (bool, optional):
Whether to get mel from fastspeech2 module. Defaults to False.
Returns:
logmel(Tensor(float32)): denorm logmel, [T, mel_bin]
"""
normalized_mel = self.acoustic_model.inference(
text,
note=note,
note_dur=note_dur,
is_slur=is_slur,
get_mel_fs2=get_mel_fs2)
logmel = self.normalizer.inverse(normalized_mel)
print(normalized_mel)
# logmel = self.normalizer.inverse(normalized_mel)
logmel = normalized_mel
return logmel

@ -41,7 +41,6 @@ class DiffSingerUpdater(StandardUpdater):
optimizers: Dict[str, Optimizer],
criterions: Dict[str, Layer],
dataloader: DataLoader,
fs2_train_start_steps: int=0,
ds_train_start_steps: int=160000,
output_dir: Path=None, ):
super().__init__(model, optimizers, dataloader, init_state=None)
@ -58,7 +57,6 @@ class DiffSingerUpdater(StandardUpdater):
self.dataloader = dataloader
self.fs2_train_start_steps = fs2_train_start_steps
self.ds_train_start_steps = ds_train_start_steps
self.state = UpdaterState(iteration=0, epoch=0)
@ -81,7 +79,8 @@ class DiffSingerUpdater(StandardUpdater):
spk_id = None
# only train fastspeech2 module firstly
if self.state.iteration > self.fs2_train_start_steps and self.state.iteration < self.ds_train_start_steps:
if self.state.iteration <= self.ds_train_start_steps:
# print(batch)
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model(
text=batch["text"],
note=batch["note"],
@ -97,7 +96,7 @@ class DiffSingerUpdater(StandardUpdater):
spk_emb=spk_emb,
train_fs2=True, )
l1_loss_fs2, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion_fs2(
l1_loss_fs2, ssim_loss_fs2, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion_fs2(
after_outs=after_outs,
before_outs=before_outs,
d_outs=d_outs,
@ -112,7 +111,7 @@ class DiffSingerUpdater(StandardUpdater):
spk_logits=spk_logits,
spk_ids=spk_id, )
loss_fs2 = l1_loss_fs2 + duration_loss + pitch_loss + energy_loss
loss_fs2 = l1_loss_fs2 + ssim_loss_fs2 + duration_loss + pitch_loss + energy_loss
self.optimizer_fs2.clear_grad()
loss_fs2.backward()
@ -120,11 +119,13 @@ class DiffSingerUpdater(StandardUpdater):
report("train/loss_fs2", float(loss_fs2))
report("train/l1_loss_fs2", float(l1_loss_fs2))
report("train/ssim_loss_fs2", float(ssim_loss_fs2))
report("train/duration_loss", float(duration_loss))
report("train/pitch_loss", float(pitch_loss))
report("train/energy_loss", float(energy_loss))
losses_dict["l1_loss_fs2"] = float(l1_loss_fs2)
losses_dict["ssim_loss_fs2"] = float(ssim_loss_fs2)
losses_dict["duration_loss"] = float(duration_loss)
losses_dict["pitch_loss"] = float(pitch_loss)
losses_dict["energy_loss"] = float(energy_loss)

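The updater now keys both stages off ds_train_start_steps alone (fs2_train_start_steps is gone). A minimal sketch of the resulting schedule, using the step counts from the edited default.yaml as an assumption:

DS_TRAIN_START_STEPS = 32500  # from the edited config; other recipes may differ
TRAIN_MAX_STEPS = 65000

def active_module(iteration: int) -> str:
    # Mirrors the updater branch above: FastSpeech2 (now including the SSIM term)
    # is trained first, then only the diffusion denoiser is updated.
    return "fs2" if iteration <= DS_TRAIN_START_STEPS else "diffusion"

assert active_module(32500) == "fs2"
assert active_module(32501) == "diffusion"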
@ -22,9 +22,11 @@ from paddle import nn
from typeguard import check_argument_types
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
from paddlespeech.t2s.modules.nets_utils import make_pad_mask
from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictorLoss
from paddlespeech.t2s.modules.losses import ssim
from paddlespeech.t2s.modules.masked_fill import masked_fill
class FastSpeech2MIDI(FastSpeech2):
@ -36,14 +38,14 @@ class FastSpeech2MIDI(FastSpeech2):
# fastspeech2 network structure related
idim: int,
odim: int,
fastspeech2_config: Dict[str, Any],
fastspeech2_params: Dict[str, Any],
# note emb
note_num: int=300,
# is_slur emb
is_slur_num: int=2, ):
"""Initialize FastSpeech2 module for svs.
Args:
fastspeech2_config (Dict):
fastspeech2_params (Dict):
The config of FastSpeech2 module on DiffSinger model
note_num (Optional[int]):
Number of note. If not None, assume that the
@ -54,9 +56,9 @@ class FastSpeech2MIDI(FastSpeech2):
"""
assert check_argument_types()
super().__init__(idim=idim, odim=odim, **fastspeech2_config)
super().__init__(idim=idim, odim=odim, **fastspeech2_params)
self.note_embed_dim = self.is_slur_embed_dim = fastspeech2_config[
self.note_embed_dim = self.is_slur_embed_dim = fastspeech2_params[
"adim"]
if note_num is not None:
@ -133,15 +135,15 @@ class FastSpeech2MIDI(FastSpeech2):
spk_id = paddle.cast(spk_id, 'int64')
# forward propagation
before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits = self._forward(
xs,
note,
note_dur,
is_slur,
ilens,
olens,
ds,
ps,
es,
xs=xs,
note=note,
note_dur=note_dur,
is_slur=is_slur,
ilens=ilens,
olens=olens,
ds=ds,
ps=ps,
es=es,
is_inference=False,
spk_emb=spk_emb,
spk_id=spk_id, )
@ -170,6 +172,8 @@ class FastSpeech2MIDI(FastSpeech2):
alpha: float=1.0,
spk_emb=None,
spk_id=None, ) -> Sequence[paddle.Tensor]:
before_outs = after_outs = d_outs = p_outs = e_outs = spk_logits = None
# forward encoder
x_masks = self._source_mask(ilens)
note_emb = self.note_embedding_table(note)
@ -206,16 +210,17 @@ class FastSpeech2MIDI(FastSpeech2):
else:
pitch_masks = None
# inference for decoder input for duffusion
# inference for decoder input for diffusion
if is_train_diffusion:
hs = self.length_regulator(hs, ds, is_inference=False)
p_outs = self.pitch_predictor(hs.detach(), pitch_masks)
e_outs = self.energy_predictor(hs.detach(), pitch_masks)
# e_outs = self.energy_predictor(hs.detach(), pitch_masks)
p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1))
e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1))
hs = hs + e_embs + p_embs
# e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
# (0, 2, 1))
# hs = hs + p_embs + e_embs
hs = hs + p_embs
elif is_inference:
# (B, Tmax)
@ -235,19 +240,20 @@ class FastSpeech2MIDI(FastSpeech2):
else:
p_outs = self.pitch_predictor(hs, pitch_masks)
if es is not None:
e_outs = es
else:
if self.stop_gradient_from_energy_predictor:
e_outs = self.energy_predictor(hs.detach(), pitch_masks)
else:
e_outs = self.energy_predictor(hs, pitch_masks)
# if es is not None:
# e_outs = es
# else:
# if self.stop_gradient_from_energy_predictor:
# e_outs = self.energy_predictor(hs.detach(), pitch_masks)
# else:
# e_outs = self.energy_predictor(hs, pitch_masks)
p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1))
e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1))
hs = hs + e_embs + p_embs
# e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
# (0, 2, 1))
# hs = hs + p_embs + e_embs
hs = hs + p_embs
# training
else:
@ -258,15 +264,16 @@ class FastSpeech2MIDI(FastSpeech2):
p_outs = self.pitch_predictor(hs.detach(), pitch_masks)
else:
p_outs = self.pitch_predictor(hs, pitch_masks)
if self.stop_gradient_from_energy_predictor:
e_outs = self.energy_predictor(hs.detach(), pitch_masks)
else:
e_outs = self.energy_predictor(hs, pitch_masks)
# if self.stop_gradient_from_energy_predictor:
# e_outs = self.energy_predictor(hs.detach(), pitch_masks)
# else:
# e_outs = self.energy_predictor(hs, pitch_masks)
p_embs = self.pitch_embed(ps.transpose((0, 2, 1))).transpose(
(0, 2, 1))
e_embs = self.energy_embed(es.transpose((0, 2, 1))).transpose(
(0, 2, 1))
hs = hs + e_embs + p_embs
# e_embs = self.energy_embed(es.transpose((0, 2, 1))).transpose(
# (0, 2, 1))
# hs = hs + p_embs + e_embs
hs = hs + p_embs
# forward decoder
if olens is not None and not is_inference:
@ -295,11 +302,12 @@ class FastSpeech2MIDI(FastSpeech2):
(paddle.shape(zs)[0], -1, self.odim))
# postnet -> (B, Lmax//r * r, odim)
if self.postnet is None:
# if self.postnet is None:
# after_outs = before_outs
# else:
# after_outs = before_outs + self.postnet(
# before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
after_outs = before_outs
else:
after_outs = before_outs + self.postnet(
before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
return before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits
@ -326,11 +334,11 @@ class FastSpeech2MIDI(FastSpeech2):
# (1, L, odim)
# use *_ to avoid bug in dygraph to static graph
hs, _ = self._forward(
xs,
note,
note_dur,
is_slur,
ilens,
xs=xs,
note=note,
note_dur=note_dur,
is_slur=is_slur,
ilens=ilens,
is_inference=True,
return_after_enc=True,
alpha=alpha,
@ -367,15 +375,15 @@ class FastSpeech2MIDI(FastSpeech2):
# (1, L, odim)
# use *_ to avoid bug in dygraph to static graph
hs, h_masks = self._forward(
xs,
note,
note_dur,
is_slur,
ilens,
olens,
ds,
ps,
es,
xs=xs,
note=note,
note_dur=note_dur,
is_slur=is_slur,
ilens=ilens,
olens=olens,
ds=ds,
ps=ps,
es=es,
return_after_enc=True,
is_train_diffusion=True,
alpha=alpha,
@ -446,11 +454,11 @@ class FastSpeech2MIDI(FastSpeech2):
# (1, L, odim)
_, outs, d_outs, p_outs, e_outs, _ = self._forward(
xs,
note,
note_dur,
is_slur,
ilens,
xs=xs,
note=note,
note_dur=note_dur,
is_slur=is_slur,
ilens=ilens,
ds=ds,
ps=ps,
es=es,
@ -460,20 +468,21 @@ class FastSpeech2MIDI(FastSpeech2):
else:
# (1, L, odim)
_, outs, d_outs, p_outs, e_outs, _ = self._forward(
xs,
note,
note_dur,
is_slur,
ilens,
xs=xs,
note=note,
note_dur=note_dur,
is_slur=is_slur,
ilens=ilens,
is_inference=True,
alpha=alpha,
spk_emb=spk_emb,
spk_id=spk_id, )
return outs[0], d_outs[0], p_outs[0], e_outs[0]
# return outs[0], d_outs[0], p_outs[0], e_outs[0]
return outs[0], d_outs[0], p_outs[0], None
class FastSpeech2MIDILoss(nn.Layer):
class FastSpeech2MIDILoss(FastSpeech2Loss):
"""Loss function module for DiffSinger."""
def __init__(self, use_masking: bool=True,
@ -486,18 +495,7 @@ class FastSpeech2MIDILoss(nn.Layer):
Whether to weighted masking in loss calculation.
"""
assert check_argument_types()
super().__init__()
assert (use_masking != use_weighted_masking) or not use_masking
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
# define criterions
reduction = "none" if self.use_weighted_masking else "mean"
self.l1_criterion = nn.L1Loss(reduction=reduction)
self.mse_criterion = nn.MSELoss(reduction=reduction)
self.duration_criterion = DurationPredictorLoss(reduction=reduction)
self.ce_criterion = nn.CrossEntropyLoss()
super().__init__(use_masking, use_weighted_masking)
def forward(
self,
@ -551,15 +549,23 @@ class FastSpeech2MIDILoss(nn.Layer):
"""
speaker_loss = 0.0
l1_loss = duration_loss = pitch_loss = energy_loss = speaker_loss = ssim_loss = 0.0
out_pad_masks = make_pad_mask(olens).unsqueeze(-1)
before_outs_batch = masked_fill(before_outs, out_pad_masks, 0.0)
# print(before_outs.shape, ys.shape)
ssim_loss = 1.0 - ssim(before_outs_batch.unsqueeze(1), ys.unsqueeze(1))
ssim_loss = ssim_loss * 0.5
# apply mask to remove padded part
if self.use_masking:
out_masks = make_non_pad_mask(olens).unsqueeze(-1)
before_outs = before_outs.masked_select(
out_masks.broadcast_to(before_outs.shape))
if after_outs is not None:
after_outs = after_outs.masked_select(
out_masks.broadcast_to(after_outs.shape))
# if after_outs is not None:
# after_outs = after_outs.masked_select(
# out_masks.broadcast_to(after_outs.shape))
ys = ys.masked_select(out_masks.broadcast_to(ys.shape))
duration_masks = make_non_pad_mask(ilens)
d_outs = d_outs.masked_select(
@ -568,8 +574,8 @@ class FastSpeech2MIDILoss(nn.Layer):
pitch_masks = out_masks
p_outs = p_outs.masked_select(
pitch_masks.broadcast_to(p_outs.shape))
e_outs = e_outs.masked_select(
pitch_masks.broadcast_to(e_outs.shape))
# e_outs = e_outs.masked_select(
# pitch_masks.broadcast_to(e_outs.shape))
ps = ps.masked_select(pitch_masks.broadcast_to(ps.shape))
es = es.masked_select(pitch_masks.broadcast_to(es.shape))
@ -585,11 +591,17 @@ class FastSpeech2MIDILoss(nn.Layer):
# calculate loss
l1_loss = self.l1_criterion(before_outs, ys)
if after_outs is not None:
l1_loss += self.l1_criterion(after_outs, ys)
# if after_outs is not None:
# l1_loss += self.l1_criterion(after_outs, ys)
# ssim_loss += (1.0 - ssim(after_outs, ys))
l1_loss = l1_loss * 0.5
duration_loss = self.duration_criterion(d_outs, ds)
pitch_loss = self.mse_criterion(p_outs, ps)
energy_loss = self.mse_criterion(e_outs, es)
# print("ppppppppppoooooooooooo: ", p_outs, p_outs.shape)
# print("ppppppppppssssssssssss: ", ps, ps.shape)
# pitch_loss = self.mse_criterion(p_outs, ps)
# energy_loss = self.mse_criterion(e_outs, es)
pitch_loss = self.l1_criterion(p_outs, ps)
if spk_logits is not None and spk_ids is not None:
speaker_loss = self.ce_criterion(spk_logits, spk_ids) / batch_size
@ -618,8 +630,8 @@ class FastSpeech2MIDILoss(nn.Layer):
pitch_loss = pitch_loss.multiply(pitch_weights)
pitch_loss = pitch_loss.masked_select(
pitch_masks.broadcast_to(pitch_loss.shape)).sum()
energy_loss = energy_loss.multiply(pitch_weights)
energy_loss = energy_loss.masked_select(
pitch_masks.broadcast_to(energy_loss.shape)).sum()
# energy_loss = energy_loss.multiply(pitch_weights)
# energy_loss = energy_loss.masked_select(
# pitch_masks.broadcast_to(energy_loss.shape)).sum()
return l1_loss, duration_loss, pitch_loss, energy_loss, speaker_loss
return l1_loss, ssim_loss, duration_loss, pitch_loss, energy_loss, speaker_loss

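A compact restatement of the SSIM term introduced above, following the same usage in FastSpeech2MIDILoss (mels treated as single-channel images, padded frames already filled with zeros); the helper name is illustrative:

import paddle
from paddlespeech.t2s.modules.losses import ssim  # imported by this patch

def mel_ssim_loss(before_outs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:
    # before_outs, ys: (B, T, odim) mel spectrograms with padding masked to 0.0.
    # ssim is applied on an image-like layout, hence the unsqueeze(1) as in the patch.
    loss = 1.0 - ssim(before_outs.unsqueeze(1), ys.unsqueeze(1))
    return loss * 0.5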
@ -93,6 +93,7 @@ class FastSpeech2(nn.Layer):
transformer_dec_dropout_rate: float=0.1,
transformer_dec_positional_dropout_rate: float=0.1,
transformer_dec_attn_dropout_rate: float=0.1,
transformer_activation_type: str="relu",
# for conformer
conformer_pos_enc_layer_type: str="rel_pos",
conformer_self_attn_layer_type: str="rel_selfattn",
@ -200,6 +201,8 @@ class FastSpeech2(nn.Layer):
Dropout rate after decoder positional encoding.
transformer_dec_attn_dropout_rate (float):
Dropout rate in decoder self-attention module.
transformer_activation_type (str):
Activation function type in transformer.
conformer_pos_enc_layer_type (str):
Pos encoding layer type in conformer.
conformer_self_attn_layer_type (str):
@ -250,7 +253,7 @@ class FastSpeech2(nn.Layer):
Kernel size of energy embedding.
energy_embed_dropout_rate (float):
Dropout rate for energy embedding.
stop_gradient_from_energy_predictor (bool):
stop_gradient_from_energy_predictor (bool):
Whether to stop gradient from energy predictor to encoder.
spk_num (Optional[int]):
Number of speakers. If not None, assume that the spk_embed_dim is not None,
@ -269,7 +272,7 @@ class FastSpeech2(nn.Layer):
How to integrate tone embedding.
init_type (str):
How to initialize transformer parameters.
init_enc_alpha float):
init_enc_alpha (float):
Initial value of alpha in scaled pos encoding of the encoder.
init_dec_alpha (float):
Initial value of alpha in scaled pos encoding of the decoder.
@ -344,7 +347,8 @@ class FastSpeech2(nn.Layer):
normalize_before=encoder_normalize_before,
concat_after=encoder_concat_after,
positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
positionwise_conv_kernel_size=positionwise_conv_kernel_size,
activation_type=transformer_activation_type)
elif encoder_type == "conformer":
self.encoder = ConformerEncoder(
idim=idim,
@ -453,7 +457,8 @@ class FastSpeech2(nn.Layer):
normalize_before=decoder_normalize_before,
concat_after=decoder_concat_after,
positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
positionwise_conv_kernel_size=positionwise_conv_kernel_size,
activation_type=conformer_activation_type, )
elif decoder_type == "conformer":
self.decoder = ConformerEncoder(
idim=0,

@ -37,7 +37,8 @@ def get_activation(act, **kwargs):
"selu": paddle.nn.SELU,
"leakyrelu": paddle.nn.LeakyReLU,
"swish": paddle.nn.Swish,
"glu": GLU
"glu": GLU,
"gelu": paddle.nn.GELU,
}
return activation_funcs[act](**kwargs)

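With the registry extended, a config value such as transformer_activation_type: "gelu" resolves to paddle.nn.GELU. A stripped-down sketch of the same lookup pattern:

import paddle

# Map the config string to a Paddle activation class and instantiate it;
# the "gelu" entry is the one added by this change.
activation_funcs = {
    "relu": paddle.nn.ReLU,
    "gelu": paddle.nn.GELU,
}

def get_activation(act, **kwargs):
    return activation_funcs[act](**kwargs)

print(get_activation("gelu")(paddle.to_tensor([-1.0, 0.0, 1.0])))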
@ -40,7 +40,7 @@ class WaveNetDenoiser(nn.Layer):
layers (int, optional):
Number of residual blocks inside, by default 20
stacks (int, optional):
The number of groups to split the residual blocks into, by default 4
The number of groups to split the residual blocks into, by default 5
Within each group, the dilation of the residual block grows exponentially.
residual_channels (int, optional):
Residual channel of the residual blocks, by default 256
@ -64,7 +64,7 @@ class WaveNetDenoiser(nn.Layer):
out_channels: int=80,
kernel_size: int=3,
layers: int=20,
stacks: int=4,
stacks: int=5,
residual_channels: int=256,
gate_channels: int=512,
skip_channels: int=256,
@ -72,7 +72,7 @@ class WaveNetDenoiser(nn.Layer):
dropout: float=0.,
bias: bool=True,
use_weight_norm: bool=False,
init_type: str="kaiming_uniform", ):
init_type: str="kaiming_normal", ):
super().__init__()
# initialize parameters
@ -118,18 +118,15 @@ class WaveNetDenoiser(nn.Layer):
bias=bias)
self.conv_layers.append(conv)
final_conv = nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True)
nn.initializer.Constant(0.0)(final_conv.weight)
self.last_conv_layers = nn.Sequential(nn.ReLU(),
nn.Conv1D(
skip_channels,
skip_channels,
1,
bias_attr=True),
nn.ReLU(),
nn.Conv1D(
skip_channels,
out_channels,
1,
bias_attr=True))
nn.ReLU(), final_conv)
if use_weight_norm:
self.apply_weight_norm()
@ -200,10 +197,6 @@ class GaussianDiffusion(nn.Layer):
Args:
denoiser (Layer, optional):
The model used for denoising noises.
In fact, the denoiser model performs the operation
of producing a output with more noises from the noisy input.
Then we use the diffusion algorithm to calculate
the input with the output to get the denoised result.
num_train_timesteps (int, optional):
The number of timesteps between the noise and the real during training, by default 1000.
beta_start (float, optional):
@ -233,7 +226,8 @@ class GaussianDiffusion(nn.Layer):
>>> def callback(index, timestep, num_timesteps, sample):
>>> nonlocal pbar
>>> if pbar is None:
>>> pbar = tqdm(total=num_timesteps-index)
>>> pbar = tqdm(total=num_timesteps)
>>> pbar.update(index)
>>> pbar.update()
>>>
>>> return callback
@ -247,7 +241,7 @@ class GaussianDiffusion(nn.Layer):
>>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step)
>>> with paddle.no_grad():
>>> sample = diffusion.inference(
>>> paddle.randn(x.shape), c, x,
>>> paddle.randn(x.shape), c, ref_x=x_in,
>>> num_inference_steps=infer_steps,
>>> scheduler_type=scheduler_type,
>>> callback=create_progress_callback())
@ -262,7 +256,7 @@ class GaussianDiffusion(nn.Layer):
>>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step)
>>> with paddle.no_grad():
>>> sample = diffusion.inference(
>>> paddle.randn(x.shape), c, x_in,
>>> paddle.randn(x.shape), c, ref_x=x_in,
>>> num_inference_steps=infer_steps,
>>> scheduler_type=scheduler_type,
>>> callback=create_progress_callback())
@ -277,11 +271,11 @@ class GaussianDiffusion(nn.Layer):
>>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step)
>>> with paddle.no_grad():
>>> sample = diffusion.inference(
>>> paddle.randn(x.shape), c, None,
>>> paddle.randn(x.shape), c, ref_x=x_in,
>>> num_inference_steps=infer_steps,
>>> scheduler_type=scheduler_type,
>>> callback=create_progress_callback())
100%|| 25/25 [00:01<00:00, 19.75it/s]
100%|| 34/34 [00:01<00:00, 19.75it/s]
>>>
>>> # ds=1000, K_step=100, scheduler=pndm, infer_step=50, from aux fs2 mel output
>>> ds = 1000
@ -292,11 +286,11 @@ class GaussianDiffusion(nn.Layer):
>>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step)
>>> with paddle.no_grad():
>>> sample = diffusion.inference(
>>> paddle.randn(x.shape), c, x,
>>> paddle.randn(x.shape), c, ref_x=x_in,
>>> num_inference_steps=infer_steps,
>>> scheduler_type=scheduler_type,
>>> callback=create_progress_callback())
100%|| 5/5 [00:00<00:00, 23.80it/s]
100%|| 14/14 [00:00<00:00, 23.80it/s]
"""
@ -366,6 +360,8 @@ class GaussianDiffusion(nn.Layer):
num_inference_steps: Optional[int]=1000,
strength: Optional[float]=None,
scheduler_type: Optional[str]="ddpm",
clip_noise: Optional[bool]=True,
clip_noise_range: Optional[Tuple[float, float]]=(-1, 1),
callback: Optional[Callable[[int, int, int, paddle.Tensor],
None]]=None,
callback_steps: Optional[int]=1):
@ -386,6 +382,10 @@ class GaussianDiffusion(nn.Layer):
scheduler_type (str, optional):
Noise scheduler for generate noises.
Choose a great scheduler can skip many denoising step, by default 'ddpm'.
clip_noise (bool, optional):
Whether to clip each denoised output, by default True.
clip_noise_range (tuple, optional):
Min and max value range to clip each denoised output to, by default (-1, 1).
callback (Callable[[int,int,int,Tensor], None], optional):
Callback function during denoising steps.
@ -426,6 +426,7 @@ class GaussianDiffusion(nn.Layer):
scheduler.set_timesteps(num_inference_steps)
# prepare first noise variables
import pdb;pdb.set_trace()
noisy_input = noise
timesteps = scheduler.timesteps
if ref_x is not None:
@ -444,8 +445,13 @@ class GaussianDiffusion(nn.Layer):
noisy_input = scheduler.add_noise(
ref_x, noise, timesteps[:1].tile([noise.shape[0]]))
# denoising loop
denoised_output = noisy_input
if clip_noise:
n_min, n_max = clip_noise_range
denoised_output = paddle.clip(denoised_output, n_min, n_max)
num_warmup_steps = len(
timesteps) - num_inference_steps * scheduler.order
for i, t in enumerate(timesteps):
@ -457,6 +463,8 @@ class GaussianDiffusion(nn.Layer):
# compute the previous noisy sample x_t -> x_t-1
denoised_output = scheduler.step(noise_pred, t,
denoised_output).prev_sample
if clip_noise:
denoised_output = paddle.clip(denoised_output, n_min, n_max)
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and

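The denoising loop now optionally clips every intermediate sample into the normalized mel range. A minimal sketch of the guard, using the defaults of the new clip_noise / clip_noise_range arguments:

import paddle

clip_noise = True
clip_noise_range = (-1.0, 1.0)  # defaults of the new inference arguments

denoised_output = paddle.randn([1, 80, 200]) * 3.0  # stand-in intermediate sample
if clip_noise:
    n_min, n_max = clip_noise_range
    denoised_output = paddle.clip(denoised_output, n_min, n_max)
assert float(denoised_output.max()) <= 1.0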
@ -38,11 +38,9 @@ def masked_fill(xs: paddle.Tensor,
value: Union[float, int]):
# comment following line for converting dygraph to static graph.
# assert is_broadcastable(xs.shape, mask.shape) is True
# bshape = paddle.broadcast_shape(xs.shape, mask.shape)
bshape = broadcast_shape(xs.shape, mask.shape)
mask.stop_gradient = True
mask = mask.broadcast_to(bshape)
trues = paddle.ones_like(xs) * value
mask = mask.cast(dtype=paddle.bool)
xs = paddle.where(mask, trues, xs)
