diffsinger_tmp

pull/3005/head
liangym 3 years ago
parent c91dc02931
commit 84a22ffb93

@ -23,7 +23,7 @@ f0max: 750 # Maximum f0 for pitch extraction.
########################################################### ###########################################################
# DATA SETTING # # DATA SETTING #
########################################################### ###########################################################
batch_size: 32 batch_size: 48
num_workers: 1 num_workers: 1
@ -37,33 +37,32 @@ model:
# fastspeech2 module # fastspeech2 module
fastspeech2_params: fastspeech2_params:
adim: 256 # attention dimension adim: 256 # attention dimension # lym check
aheads: 2 # number of attention heads aheads: 2 # number of attention heads # lym check
elayers: 4 # number of encoder layers elayers: 4 # number of encoder layers # lym check
eunits: 1536 # number of encoder ff units eunits: 1024 # number of encoder ff units # lym check adim * 4
dlayers: 4 # number of decoder layers dlayers: 4 # number of decoder layers # lym check
dunits: 1536 # number of decoder ff units dunits: 1024 # number of decoder ff units # lym check
positionwise_layer_type: conv1d # type of position-wise layer positionwise_layer_type: conv1d-linear # type of position-wise layer # lym check
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer positionwise_conv_kernel_size: 9 # kernel size of position wise conv layer # lym check
duration_predictor_layers: 2 # number of layers of duration predictor transformer_enc_dropout_rate: 0.1 # dropout rate for transformer encoder layer # lym check
duration_predictor_chans: 256 # number of channels of duration predictor transformer_enc_positional_dropout_rate: 0.1 # dropout rate for transformer encoder positional encoding # lym check
duration_predictor_kernel_size: 3 # filter size of duration predictor transformer_enc_attn_dropout_rate: 0.0 # dropout rate for transformer encoder attention layer # lym check
postnet_layers: 5 # number of layers of postnset transformer_activation_type: "gelu"
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input
reduction_factor: 1 # reduction factor reduction_factor: 1 # reduction factor
init_type: xavier_uniform # initialization type init_type: xavier_uniform # initialization type
init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding
init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer use_scaled_pos_enc: True # whether to use scaled positional encoding
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding transformer_dec_dropout_rate: 0.1 # dropout rate for transformer decoder layer
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer transformer_dec_positional_dropout_rate: 0.1 # dropout rate for transformer decoder positional encoding
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer transformer_dec_attn_dropout_rate: 0.0 # dropout rate for transformer decoder attention layer
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding duration_predictor_layers: 5 # number of layers of duration predictor
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
duration_predictor_dropout_rate: 0.5 # dropout rate in duration predictor
pitch_predictor_layers: 5 # number of conv layers in pitch predictor pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
@ -71,6 +70,11 @@ model:
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
@ -131,19 +135,24 @@ ds_optimizer_params:
ds_scheduler_params: ds_scheduler_params:
learning_rate: 0.001 learning_rate: 0.001
gamma: 0.5 gamma: 0.5
step_size: 25000 step_size: 10000
ds_grad_norm: 1 ds_grad_norm: 1
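A minimal sketch of what the ds_scheduler_params / ds_grad_norm above describe, assuming Paddle's built-in StepDecay schedule and a plain Adam optimizer (the recipe itself builds these through its own helpers, so the optimizer choice here is illustrative):

    import paddle

    # learning_rate=0.001, gamma=0.5, step_size=10000: halve the diffusion-module LR every 10k steps.
    ds_scheduler = paddle.optimizer.lr.StepDecay(
        learning_rate=0.001, gamma=0.5, step_size=10000)
    # ds_grad_norm: 1 -> clip gradients by global norm 1.0.
    ds_optimizer = paddle.optimizer.Adam(
        learning_rate=ds_scheduler,
        grad_clip=paddle.nn.ClipGradByGlobalNorm(1.0))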
########################################################### ###########################################################
# INTERVAL SETTING # # INTERVAL SETTING #
########################################################### ###########################################################
ds_train_start_steps: 160000 # Number of steps to start to train diffusion module. ds_train_start_steps: 32500 # Number of steps to start to train diffusion module.
train_max_steps: 320000 # Number of training steps. train_max_steps: 65000 # Number of training steps.
save_interval_steps: 1000 # Interval steps to save checkpoint. save_interval_steps: 500 # Interval steps to save checkpoint.
eval_interval_steps: 1000 # Interval steps to evaluate the network. eval_interval_steps: 500 # Interval steps to evaluate the network.
num_snapshots: 5 num_snapshots: 20
# ds_train_start_steps: 4 # Number of steps to start to train diffusion module.
# train_max_steps: 8 # Number of training steps.
# save_interval_steps: 1 # Interval steps to save checkpoint.
# eval_interval_steps: 2 # Interval steps to evaluate the network.
# num_snapshots: 5
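A minimal sketch, assuming the config is loaded as a yacs CfgNode the way other t2s recipes do, of how the two-stage step budget above is read: the first ds_train_start_steps iterations train the FastSpeech2 module, the remainder up to train_max_steps train the diffusion module.

    import yaml
    from yacs.config import CfgNode

    with open("conf/default.yaml") as f:
        config = CfgNode(yaml.safe_load(f))
    # FastSpeech2 stage: steps 1..32500; diffusion stage: steps 32501..65000.
    assert config.ds_train_start_steps < config.train_max_steps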
########################################################### ###########################################################
# OTHER SETTING # # OTHER SETTING #

@ -1,6 +1,6 @@
#!/bin/bash #!/bin/bash
stage=1 stage=0
stop_stage=100 stop_stage=100
config_path=$1 config_path=$1

@ -2,7 +2,9 @@
config_path=$1 config_path=$1
train_output_path=$2 train_output_path=$2
ckpt_name=$3 #ckpt_name=$3
iter=$3
ckpt_name=snapshot_iter_${iter}.pdz
stage=0 stage=0
stop_stage=0 stop_stage=0
@ -20,81 +22,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwgan_opencpop/snapshot_iter_100000.pdz \ --voc_ckpt=pwgan_opencpop/snapshot_iter_100000.pdz \
--voc_stat=pwgan_opencpop/feats_stats.npy \ --voc_stat=pwgan_opencpop/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \ --test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \ --output_dir=${train_output_path}/test_${iter} \
--phones_dict=dump/phone_id_map.txt --phones_dict=dump/phone_id_map.txt
fi fi
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
fi
# style melgan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
fi
# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in hifigan syn"
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
fi
# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "in wavernn syn"
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
fi

@ -8,5 +8,5 @@ python3 ${BIN_DIR}/train.py \
--dev-metadata=dump/dev/norm/metadata.jsonl \ --dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \ --config=${config_path} \
--output-dir=${train_output_path} \ --output-dir=${train_output_path} \
--ngpu=1 \ --ngpu=4 \
--phones-dict=dump/phone_id_map.txt --phones-dict=dump/phone_id_map.txt

@ -3,9 +3,10 @@
set -e set -e
source path.sh source path.sh
gpus=0 gpus=4,5,6,7
stage=0 #gpus=0
stop_stage=100 stage=1
stop_stage=1
conf_path=conf/default.yaml conf_path=conf/default.yaml
train_output_path=exp/default train_output_path=exp/default

@ -105,7 +105,6 @@ class Pitch():
if (f0 == 0).all(): if (f0 == 0).all():
print("All frames seems to be unvoiced.") print("All frames seems to be unvoiced.")
return f0 return f0
# padding start and end of f0 sequence # padding start and end of f0 sequence
start_f0 = f0[f0 != 0][0] start_f0 = f0[f0 != 0][0]
end_f0 = f0[f0 != 0][-1] end_f0 = f0[f0 != 0][-1]

@ -101,7 +101,7 @@ def get_sentences_svs(
dataset (str): dataset name dataset (str): dataset name
Returns: Returns:
Dict: the information of sentence, include [phone id (int)], [the frame of phone (int)], [note id (int)], [note duration (float)], [is slur (int)], text(str), speaker name (str) Dict: the information of sentence, include [phone id (int)], [the frame of phone (int)], [note id (int)], [note duration (float)], [is slur (int)], text(str), speaker name (str)
tunple: speaker name tuple: speaker name
''' '''
f = open(file_name, 'r') f = open(file_name, 'r')
sentence = {} sentence = {}
@ -115,7 +115,7 @@ def get_sentences_svs(
ph = line_list[2].split() ph = line_list[2].split()
midi = note2midi(line_list[3].split()) midi = note2midi(line_list[3].split())
midi_dur = line_list[4].split() midi_dur = line_list[4].split()
ph_dur = time2frame([float(t) for t in line_list[5].split()]) ph_dur = time2frame([float(t) for t in line_list[5].split()], sample_rate=sample_rate, n_shift=n_shift)
is_slur = line_list[6].split() is_slur = line_list[6].split()
assert len(ph) == len(midi) == len(midi_dur) == len(is_slur) assert len(ph) == len(midi) == len(midi_dur) == len(is_slur)
sentence[utt] = (ph, [int(i) for i in ph_dur], sentence[utt] = (ph, [int(i) for i in ph_dur],
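The time2frame call above now passes sample_rate and n_shift explicitly; a minimal sketch of the conversion it is assumed to perform (seconds to hop-sized mel-frame counts, default values only illustrative):

    def time2frame(times, sample_rate: int = 24000, n_shift: int = 128):
        # Each duration in seconds maps to a number of hop-length frames.
        return [int(t * sample_rate / n_shift + 0.5) for t in times]

    # e.g. a 0.5 s note at fs=24000, hop=128 covers about 94 frames
    print(time2frame([0.5]))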

@ -80,20 +80,27 @@ def main():
# restore scaler # restore scaler
speech_scaler = StandardScaler() speech_scaler = StandardScaler()
speech_scaler.mean_ = np.load(args.speech_stats)[0] # speech_scaler.mean_ = np.load(args.speech_stats)[0]
speech_scaler.scale_ = np.load(args.speech_stats)[1] # speech_scaler.scale_ = np.load(args.speech_stats)[1]
speech_scaler.mean_ = np.zeros(np.load(args.speech_stats)[0].shape, dtype="float32")
speech_scaler.scale_ = np.ones(np.load(args.speech_stats)[1].shape, dtype="float32")
speech_scaler.n_features_in_ = speech_scaler.mean_.shape[0] speech_scaler.n_features_in_ = speech_scaler.mean_.shape[0]
pitch_scaler = StandardScaler() pitch_scaler = StandardScaler()
pitch_scaler.mean_ = np.load(args.pitch_stats)[0] # pitch_scaler.mean_ = np.load(args.pitch_stats)[0]
pitch_scaler.scale_ = np.load(args.pitch_stats)[1] # pitch_scaler.scale_ = np.load(args.pitch_stats)[1]
pitch_scaler.mean_ = np.zeros(np.load(args.pitch_stats)[0].shape, dtype="float32")
pitch_scaler.scale_ = np.ones(np.load(args.pitch_stats)[1].shape, dtype="float32")
pitch_scaler.n_features_in_ = pitch_scaler.mean_.shape[0] pitch_scaler.n_features_in_ = pitch_scaler.mean_.shape[0]
energy_scaler = StandardScaler() energy_scaler = StandardScaler()
energy_scaler.mean_ = np.load(args.energy_stats)[0] # energy_scaler.mean_ = np.load(args.energy_stats)[0]
energy_scaler.scale_ = np.load(args.energy_stats)[1] # energy_scaler.scale_ = np.load(args.energy_stats)[1]
energy_scaler.mean_ = np.zeros(np.load(args.energy_stats)[0].shape, dtype="float32")
energy_scaler.scale_ = np.ones(np.load(args.energy_stats)[1].shape, dtype="float32")
energy_scaler.n_features_in_ = energy_scaler.mean_.shape[0] energy_scaler.n_features_in_ = energy_scaler.mean_.shape[0]
vocab_phones = {} vocab_phones = {}
with open(args.phones_dict, 'rt') as f: with open(args.phones_dict, 'rt') as f:
phn_id = [line.strip().split() for line in f.readlines()] phn_id = [line.strip().split() for line in f.readlines()]
@ -111,6 +118,7 @@ def main():
for item in tqdm(dataset): for item in tqdm(dataset):
utt_id = item['utt_id'] utt_id = item['utt_id']
print(utt_id)
speech = item['speech'] speech = item['speech']
pitch = item['pitch'] pitch = item['pitch']
energy = item['energy'] energy = item['energy']
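What the commented-out loads above amount to: normalization is effectively disabled by swapping the stored statistics for zero mean and unit scale. A minimal sketch of that identity scaler (the helper name is illustrative):

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    def identity_scaler(stats_path: str) -> StandardScaler:
        # Same shape as the stored stats, but mean 0 / scale 1, so transform() is a no-op.
        stats = np.load(stats_path)
        scaler = StandardScaler()
        scaler.mean_ = np.zeros(stats[0].shape, dtype="float32")
        scaler.scale_ = np.ones(stats[1].shape, dtype="float32")
        scaler.n_features_in_ = scaler.mean_.shape[0]
        return scaler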

@ -34,7 +34,6 @@ from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_
from paddlespeech.t2s.datasets.preprocess_utils import get_input_token from paddlespeech.t2s.datasets.preprocess_utils import get_input_token
from paddlespeech.t2s.datasets.preprocess_utils import get_sentences_svs from paddlespeech.t2s.datasets.preprocess_utils import get_sentences_svs
from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
from paddlespeech.t2s.utils import str2bool from paddlespeech.t2s.utils import str2bool
ALL_INITIALS = [ ALL_INITIALS = [
@ -106,6 +105,7 @@ def process_sentence(
pitch_dir = output_dir / "data_pitch" pitch_dir = output_dir / "data_pitch"
pitch_dir.mkdir(parents=True, exist_ok=True) pitch_dir.mkdir(parents=True, exist_ok=True)
pitch_path = pitch_dir / (utt_id + "_pitch.npy") pitch_path = pitch_dir / (utt_id + "_pitch.npy")
# print(pitch, pitch.shape)
np.save(pitch_path, pitch) np.save(pitch_path, pitch)
energy = energy_extractor.get_energy(wav) energy = energy_extractor.get_energy(wav)
assert energy.shape[0] == num_frames assert energy.shape[0] == num_frames
@ -271,7 +271,6 @@ def main():
sample_rate=config.fs, sample_rate=config.fs,
n_shift=config.n_shift, ) n_shift=config.n_shift, )
# merge_silence(sentences)
phone_id_map_path = dumpdir / "phone_id_map.txt" phone_id_map_path = dumpdir / "phone_id_map.txt"
speaker_id_map_path = dumpdir / "speaker_id_map.txt" speaker_id_map_path = dumpdir / "speaker_id_map.txt"
get_input_token(sentences, phone_id_map_path, args.dataset) get_input_token(sentences, phone_id_map_path, args.dataset)

@ -43,9 +43,6 @@ from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.optimizer import build_optimizers from paddlespeech.t2s.training.optimizer import build_optimizers
from paddlespeech.t2s.training.seeding import seed_everything from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer from paddlespeech.t2s.training.trainer import Trainer
from paddlespeech.t2s.utils import str2bool
# from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss
def train_sp(args, config): def train_sp(args, config):

@ -120,8 +120,12 @@ def evaluate(args):
note_dur=note_dur, note_dur=note_dur,
is_slur=is_slur, is_slur=is_slur,
get_mel_fs2=get_mel_fs2) get_mel_fs2=get_mel_fs2)
# import numpy as np
# mel = np.load("/home/liangyunming/others_code/DiffSinger_lym/diffsinger_mel.npy")
# mel = paddle.to_tensor(mel)
wav = voc_inference(mel) wav = voc_inference(mel)
wav = wav.numpy() wav = wav.numpy()
N += wav.size N += wav.size
T += t.elapse T += t.elapse
@ -131,8 +135,10 @@ def evaluate(args):
f"{utt_id}, mel: {mel.shape}, wave: {wav.size}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." f"{utt_id}, mel: {mel.shape}, wave: {wav.size}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
) )
sf.write( sf.write(
# str(output_dir / ("xiaojiuwo_diffsinger" + ".wav")), wav, samplerate=am_config.fs)
str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs) str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
print(f"{utt_id} done!") print(f"{utt_id} done!")
# break
print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")

@ -48,12 +48,12 @@ class DiffSinger(nn.Layer):
note_num: int=300, note_num: int=300,
is_slur_num: int=2, is_slur_num: int=2,
fastspeech2_params: Dict[str, Any]={ fastspeech2_params: Dict[str, Any]={
"adim": 384, "adim": 256,
"aheads": 4, "aheads": 2,
"elayers": 6, "elayers": 4,
"eunits": 1536, "eunits": 1024,
"dlayers": 6, "dlayers": 4,
"dunits": 1536, "dunits": 1024,
"postnet_layers": 5, "postnet_layers": 5,
"postnet_chans": 512, "postnet_chans": 512,
"postnet_filts": 5, "postnet_filts": 5,
@ -74,6 +74,7 @@ class DiffSinger(nn.Layer):
"transformer_dec_dropout_rate": 0.1, "transformer_dec_dropout_rate": 0.1,
"transformer_dec_positional_dropout_rate": 0.1, "transformer_dec_positional_dropout_rate": 0.1,
"transformer_dec_attn_dropout_rate": 0.1, "transformer_dec_attn_dropout_rate": 0.1,
"transformer_activation_type": "gelu",
# duration predictor # duration predictor
"duration_predictor_layers": 2, "duration_predictor_layers": 2,
"duration_predictor_chans": 384, "duration_predictor_chans": 384,
@ -149,7 +150,7 @@ class DiffSinger(nn.Layer):
self.fs2 = FastSpeech2MIDI( self.fs2 = FastSpeech2MIDI(
idim=idim, idim=idim,
odim=odim, odim=odim,
fastspeech2_config=fastspeech2_params, fastspeech2_params=fastspeech2_params,
note_num=note_num, note_num=note_num,
is_slur_num=is_slur_num) is_slur_num=is_slur_num)
denoiser = WaveNetDenoiser(**denoiser_params) denoiser = WaveNetDenoiser(**denoiser_params)
@ -260,7 +261,7 @@ class DiffSinger(nn.Layer):
Whether to get mel from fastspeech2 module. Whether to get mel from fastspeech2 module.
Returns: Returns:
_type_: _description_
""" """
mel_fs2, _, _, _ = self.fs2.inference(text, note, note_dur, is_slur) mel_fs2, _, _, _ = self.fs2.inference(text, note, note_dur, is_slur)
if get_mel_fs2: if get_mel_fs2:
@ -268,7 +269,9 @@ class DiffSinger(nn.Layer):
mel_fs2 = mel_fs2.unsqueeze(0).transpose((0, 2, 1)) mel_fs2 = mel_fs2.unsqueeze(0).transpose((0, 2, 1))
cond_fs2 = self.fs2.encoder_infer(text, note, note_dur, is_slur) cond_fs2 = self.fs2.encoder_infer(text, note, note_dur, is_slur)
cond_fs2 = cond_fs2.transpose((0, 2, 1)) cond_fs2 = cond_fs2.transpose((0, 2, 1))
mel, _ = self.diffusion(mel_fs2, cond_fs2) # mel, _ = self.diffusion(mel_fs2, cond_fs2)
noise = paddle.randn(mel_fs2.shape)
mel = self.diffusion.inference(noise=noise, cond=cond_fs2, ref_x=mel_fs2, num_inference_steps=100)
mel = mel.transpose((0, 2, 1)) mel = mel.transpose((0, 2, 1))
return mel[0] return mel[0]
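A minimal, self-contained sketch of the shallow-diffusion call used above: start from Gaussian noise, condition on the FastSpeech2 encoder output, and use the FastSpeech2 mel as ref_x for partial noising. The import path and channel sizes are assumptions; shapes follow the (B, mel_bin, T) layout used here.

    import paddle
    from paddlespeech.t2s.modules.diffusion import GaussianDiffusion, WaveNetDenoiser

    denoiser = WaveNetDenoiser()
    diffusion = GaussianDiffusion(denoiser, num_train_timesteps=100, num_max_timesteps=100)
    mel_fs2 = paddle.randn([1, 80, 200])    # coarse mel from the FastSpeech2 module
    cond_fs2 = paddle.randn([1, 256, 200])  # encoder output used as the condition
    with paddle.no_grad():
        mel = diffusion.inference(
            noise=paddle.randn(mel_fs2.shape),
            cond=cond_fs2,
            ref_x=mel_fs2,
            num_inference_steps=100)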
@ -280,13 +283,32 @@ class DiffSingerInference(nn.Layer):
self.acoustic_model = model self.acoustic_model = model
def forward(self, text, note, note_dur, is_slur, get_mel_fs2: bool=False): def forward(self, text, note, note_dur, is_slur, get_mel_fs2: bool=False):
"""Calculate forward propagation.
Args:
text(Tensor(int64)):
Batch of padded token (phone) ids (B, Tmax).
note(Tensor(int64)):
Batch of padded note (element in music score) ids (B, Tmax).
note_dur(Tensor(float32)):
Batch of padded note durations in seconds (element in music score) (B, Tmax).
is_slur(Tensor(int64)):
Batch of padded slur (element in music score) ids (B, Tmax).
get_mel_fs2 (bool, optional):
Whether to get mel from the fastspeech2 module. Defaults to False.
Returns:
logmel(Tensor(float32)): denorm logmel, [T, mel_bin]
"""
normalized_mel = self.acoustic_model.inference( normalized_mel = self.acoustic_model.inference(
text, text,
note=note, note=note,
note_dur=note_dur, note_dur=note_dur,
is_slur=is_slur, is_slur=is_slur,
get_mel_fs2=get_mel_fs2) get_mel_fs2=get_mel_fs2)
logmel = self.normalizer.inverse(normalized_mel) print(normalized_mel)
# logmel = self.normalizer.inverse(normalized_mel)
logmel = normalized_mel
return logmel return logmel

@ -41,7 +41,6 @@ class DiffSingerUpdater(StandardUpdater):
optimizers: Dict[str, Optimizer], optimizers: Dict[str, Optimizer],
criterions: Dict[str, Layer], criterions: Dict[str, Layer],
dataloader: DataLoader, dataloader: DataLoader,
fs2_train_start_steps: int=0,
ds_train_start_steps: int=160000, ds_train_start_steps: int=160000,
output_dir: Path=None, ): output_dir: Path=None, ):
super().__init__(model, optimizers, dataloader, init_state=None) super().__init__(model, optimizers, dataloader, init_state=None)
@ -58,7 +57,6 @@ class DiffSingerUpdater(StandardUpdater):
self.dataloader = dataloader self.dataloader = dataloader
self.fs2_train_start_steps = fs2_train_start_steps
self.ds_train_start_steps = ds_train_start_steps self.ds_train_start_steps = ds_train_start_steps
self.state = UpdaterState(iteration=0, epoch=0) self.state = UpdaterState(iteration=0, epoch=0)
@ -81,7 +79,8 @@ class DiffSingerUpdater(StandardUpdater):
spk_id = None spk_id = None
# only train fastspeech2 module firstly # only train fastspeech2 module firstly
if self.state.iteration > self.fs2_train_start_steps and self.state.iteration < self.ds_train_start_steps: if self.state.iteration <= self.ds_train_start_steps:
# print(batch)
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model( before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model(
text=batch["text"], text=batch["text"],
note=batch["note"], note=batch["note"],
@ -97,7 +96,7 @@ class DiffSingerUpdater(StandardUpdater):
spk_emb=spk_emb, spk_emb=spk_emb,
train_fs2=True, ) train_fs2=True, )
l1_loss_fs2, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion_fs2( l1_loss_fs2, ssim_loss_fs2, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion_fs2(
after_outs=after_outs, after_outs=after_outs,
before_outs=before_outs, before_outs=before_outs,
d_outs=d_outs, d_outs=d_outs,
@ -112,7 +111,7 @@ class DiffSingerUpdater(StandardUpdater):
spk_logits=spk_logits, spk_logits=spk_logits,
spk_ids=spk_id, ) spk_ids=spk_id, )
loss_fs2 = l1_loss_fs2 + duration_loss + pitch_loss + energy_loss loss_fs2 = l1_loss_fs2 + ssim_loss_fs2 + duration_loss + pitch_loss + energy_loss
self.optimizer_fs2.clear_grad() self.optimizer_fs2.clear_grad()
loss_fs2.backward() loss_fs2.backward()
@ -120,11 +119,13 @@ class DiffSingerUpdater(StandardUpdater):
report("train/loss_fs2", float(loss_fs2)) report("train/loss_fs2", float(loss_fs2))
report("train/l1_loss_fs2", float(l1_loss_fs2)) report("train/l1_loss_fs2", float(l1_loss_fs2))
report("train/ssim_loss_fs2", float(ssim_loss_fs2))
report("train/duration_loss", float(duration_loss)) report("train/duration_loss", float(duration_loss))
report("train/pitch_loss", float(pitch_loss)) report("train/pitch_loss", float(pitch_loss))
report("train/energy_loss", float(energy_loss)) report("train/energy_loss", float(energy_loss))
losses_dict["l1_loss_fs2"] = float(l1_loss_fs2) losses_dict["l1_loss_fs2"] = float(l1_loss_fs2)
losses_dict["ssim_loss_fs2"] = float(ssim_loss_fs2)
losses_dict["duration_loss"] = float(duration_loss) losses_dict["duration_loss"] = float(duration_loss)
losses_dict["pitch_loss"] = float(pitch_loss) losses_dict["pitch_loss"] = float(pitch_loss)
losses_dict["energy_loss"] = float(energy_loss) losses_dict["energy_loss"] = float(energy_loss)

@ -22,9 +22,11 @@ from paddle import nn
from typeguard import check_argument_types from typeguard import check_argument_types
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
from paddlespeech.t2s.modules.nets_utils import make_pad_mask from paddlespeech.t2s.modules.nets_utils import make_pad_mask
from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictorLoss from paddlespeech.t2s.modules.losses import ssim
from paddlespeech.t2s.modules.masked_fill import masked_fill
class FastSpeech2MIDI(FastSpeech2): class FastSpeech2MIDI(FastSpeech2):
@ -36,14 +38,14 @@ class FastSpeech2MIDI(FastSpeech2):
# fastspeech2 network structure related # fastspeech2 network structure related
idim: int, idim: int,
odim: int, odim: int,
fastspeech2_config: Dict[str, Any], fastspeech2_params: Dict[str, Any],
# note emb # note emb
note_num: int=300, note_num: int=300,
# is_slur emb # is_slur emb
is_slur_num: int=2, ): is_slur_num: int=2, ):
"""Initialize FastSpeech2 module for svs. """Initialize FastSpeech2 module for svs.
Args: Args:
fastspeech2_config (Dict): fastspeech2_params (Dict):
The config of FastSpeech2 module on DiffSinger model The config of FastSpeech2 module on DiffSinger model
note_num (Optional[int]): note_num (Optional[int]):
Number of note. If not None, assume that the Number of note. If not None, assume that the
@ -54,9 +56,9 @@ class FastSpeech2MIDI(FastSpeech2):
""" """
assert check_argument_types() assert check_argument_types()
super().__init__(idim=idim, odim=odim, **fastspeech2_config) super().__init__(idim=idim, odim=odim, **fastspeech2_params)
self.note_embed_dim = self.is_slur_embed_dim = fastspeech2_config[ self.note_embed_dim = self.is_slur_embed_dim = fastspeech2_params[
"adim"] "adim"]
if note_num is not None: if note_num is not None:
@ -133,15 +135,15 @@ class FastSpeech2MIDI(FastSpeech2):
spk_id = paddle.cast(spk_id, 'int64') spk_id = paddle.cast(spk_id, 'int64')
# forward propagation # forward propagation
before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits = self._forward( before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits = self._forward(
xs, xs=xs,
note, note=note,
note_dur, note_dur=note_dur,
is_slur, is_slur=is_slur,
ilens, ilens=ilens,
olens, olens=olens,
ds, ds=ds,
ps, ps=ps,
es, es=es,
is_inference=False, is_inference=False,
spk_emb=spk_emb, spk_emb=spk_emb,
spk_id=spk_id, ) spk_id=spk_id, )
@ -170,6 +172,8 @@ class FastSpeech2MIDI(FastSpeech2):
alpha: float=1.0, alpha: float=1.0,
spk_emb=None, spk_emb=None,
spk_id=None, ) -> Sequence[paddle.Tensor]: spk_id=None, ) -> Sequence[paddle.Tensor]:
before_outs = after_outs = d_outs = p_outs = e_outs = spk_logits = None
# forward encoder # forward encoder
x_masks = self._source_mask(ilens) x_masks = self._source_mask(ilens)
note_emb = self.note_embedding_table(note) note_emb = self.note_embedding_table(note)
@ -206,16 +210,17 @@ class FastSpeech2MIDI(FastSpeech2):
else: else:
pitch_masks = None pitch_masks = None
# inference for decoder input for duffusion # inference for decoder input for diffusion
if is_train_diffusion: if is_train_diffusion:
hs = self.length_regulator(hs, ds, is_inference=False) hs = self.length_regulator(hs, ds, is_inference=False)
p_outs = self.pitch_predictor(hs.detach(), pitch_masks) p_outs = self.pitch_predictor(hs.detach(), pitch_masks)
e_outs = self.energy_predictor(hs.detach(), pitch_masks) # e_outs = self.energy_predictor(hs.detach(), pitch_masks)
p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose( p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1)) (0, 2, 1))
e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose( # e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1)) # (0, 2, 1))
hs = hs + e_embs + p_embs # hs = hs + p_embs + e_embs
hs = hs + p_embs
elif is_inference: elif is_inference:
# (B, Tmax) # (B, Tmax)
@ -235,19 +240,20 @@ class FastSpeech2MIDI(FastSpeech2):
else: else:
p_outs = self.pitch_predictor(hs, pitch_masks) p_outs = self.pitch_predictor(hs, pitch_masks)
if es is not None: # if es is not None:
e_outs = es # e_outs = es
else: # else:
if self.stop_gradient_from_energy_predictor: # if self.stop_gradient_from_energy_predictor:
e_outs = self.energy_predictor(hs.detach(), pitch_masks) # e_outs = self.energy_predictor(hs.detach(), pitch_masks)
else: # else:
e_outs = self.energy_predictor(hs, pitch_masks) # e_outs = self.energy_predictor(hs, pitch_masks)
p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose( p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1)) (0, 2, 1))
e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose( # e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1)) # (0, 2, 1))
hs = hs + e_embs + p_embs # hs = hs + p_embs + e_embs
hs = hs + p_embs
# training # training
else: else:
@ -258,15 +264,16 @@ class FastSpeech2MIDI(FastSpeech2):
p_outs = self.pitch_predictor(hs.detach(), pitch_masks) p_outs = self.pitch_predictor(hs.detach(), pitch_masks)
else: else:
p_outs = self.pitch_predictor(hs, pitch_masks) p_outs = self.pitch_predictor(hs, pitch_masks)
if self.stop_gradient_from_energy_predictor: # if self.stop_gradient_from_energy_predictor:
e_outs = self.energy_predictor(hs.detach(), pitch_masks) # e_outs = self.energy_predictor(hs.detach(), pitch_masks)
else: # else:
e_outs = self.energy_predictor(hs, pitch_masks) # e_outs = self.energy_predictor(hs, pitch_masks)
p_embs = self.pitch_embed(ps.transpose((0, 2, 1))).transpose( p_embs = self.pitch_embed(ps.transpose((0, 2, 1))).transpose(
(0, 2, 1)) (0, 2, 1))
e_embs = self.energy_embed(es.transpose((0, 2, 1))).transpose( # e_embs = self.energy_embed(es.transpose((0, 2, 1))).transpose(
(0, 2, 1)) # (0, 2, 1))
hs = hs + e_embs + p_embs # hs = hs + p_embs + e_embs
hs = hs + p_embs
# forward decoder # forward decoder
if olens is not None and not is_inference: if olens is not None and not is_inference:
@ -295,11 +302,12 @@ class FastSpeech2MIDI(FastSpeech2):
(paddle.shape(zs)[0], -1, self.odim)) (paddle.shape(zs)[0], -1, self.odim))
# postnet -> (B, Lmax//r * r, odim) # postnet -> (B, Lmax//r * r, odim)
if self.postnet is None: # if self.postnet is None:
after_outs = before_outs # after_outs = before_outs
else: # else:
after_outs = before_outs + self.postnet( # after_outs = before_outs + self.postnet(
before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) # before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
after_outs = before_outs
return before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits return before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits
@ -326,11 +334,11 @@ class FastSpeech2MIDI(FastSpeech2):
# (1, L, odim) # (1, L, odim)
# use *_ to avoid bug in dygraph to static graph # use *_ to avoid bug in dygraph to static graph
hs, _ = self._forward( hs, _ = self._forward(
xs, xs=xs,
note, note=note,
note_dur, note_dur=note_dur,
is_slur, is_slur=is_slur,
ilens, ilens=ilens,
is_inference=True, is_inference=True,
return_after_enc=True, return_after_enc=True,
alpha=alpha, alpha=alpha,
@ -367,15 +375,15 @@ class FastSpeech2MIDI(FastSpeech2):
# (1, L, odim) # (1, L, odim)
# use *_ to avoid bug in dygraph to static graph # use *_ to avoid bug in dygraph to static graph
hs, h_masks = self._forward( hs, h_masks = self._forward(
xs, xs=xs,
note, note=note,
note_dur, note_dur=note_dur,
is_slur, is_slur=is_slur,
ilens, ilens=ilens,
olens, olens=olens,
ds, ds=ds,
ps, ps=ps,
es, es=es,
return_after_enc=True, return_after_enc=True,
is_train_diffusion=True, is_train_diffusion=True,
alpha=alpha, alpha=alpha,
@ -446,11 +454,11 @@ class FastSpeech2MIDI(FastSpeech2):
# (1, L, odim) # (1, L, odim)
_, outs, d_outs, p_outs, e_outs, _ = self._forward( _, outs, d_outs, p_outs, e_outs, _ = self._forward(
xs, xs=xs,
note, note=note,
note_dur, note_dur=note_dur,
is_slur, is_slur=is_slur,
ilens, ilens=ilens,
ds=ds, ds=ds,
ps=ps, ps=ps,
es=es, es=es,
@ -460,20 +468,21 @@ class FastSpeech2MIDI(FastSpeech2):
else: else:
# (1, L, odim) # (1, L, odim)
_, outs, d_outs, p_outs, e_outs, _ = self._forward( _, outs, d_outs, p_outs, e_outs, _ = self._forward(
xs, xs=xs,
note, note=note,
note_dur, note_dur=note_dur,
is_slur, is_slur=is_slur,
ilens, ilens=ilens,
is_inference=True, is_inference=True,
alpha=alpha, alpha=alpha,
spk_emb=spk_emb, spk_emb=spk_emb,
spk_id=spk_id, ) spk_id=spk_id, )
return outs[0], d_outs[0], p_outs[0], e_outs[0] # return outs[0], d_outs[0], p_outs[0], e_outs[0]
return outs[0], d_outs[0], p_outs[0], None
class FastSpeech2MIDILoss(nn.Layer): class FastSpeech2MIDILoss(FastSpeech2Loss):
"""Loss function module for DiffSinger.""" """Loss function module for DiffSinger."""
def __init__(self, use_masking: bool=True, def __init__(self, use_masking: bool=True,
@ -486,18 +495,7 @@ class FastSpeech2MIDILoss(nn.Layer):
Whether to weighted masking in loss calculation. Whether to weighted masking in loss calculation.
""" """
assert check_argument_types() assert check_argument_types()
super().__init__() super().__init__(use_masking, use_weighted_masking)
assert (use_masking != use_weighted_masking) or not use_masking
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
# define criterions
reduction = "none" if self.use_weighted_masking else "mean"
self.l1_criterion = nn.L1Loss(reduction=reduction)
self.mse_criterion = nn.MSELoss(reduction=reduction)
self.duration_criterion = DurationPredictorLoss(reduction=reduction)
self.ce_criterion = nn.CrossEntropyLoss()
def forward( def forward(
self, self,
@ -551,15 +549,23 @@ class FastSpeech2MIDILoss(nn.Layer):
""" """
speaker_loss = 0.0 l1_loss = duration_loss = pitch_loss = energy_loss = speaker_loss = ssim_loss = 0.0
out_pad_masks = make_pad_mask(olens).unsqueeze(-1)
before_outs_batch = masked_fill(before_outs, out_pad_masks, 0.0)
# print(before_outs.shape, ys.shape)
ssim_loss = 1.0 - ssim(before_outs_batch.unsqueeze(1), ys.unsqueeze(1))
ssim_loss = ssim_loss * 0.5
# apply mask to remove padded part # apply mask to remove padded part
if self.use_masking: if self.use_masking:
out_masks = make_non_pad_mask(olens).unsqueeze(-1) out_masks = make_non_pad_mask(olens).unsqueeze(-1)
before_outs = before_outs.masked_select( before_outs = before_outs.masked_select(
out_masks.broadcast_to(before_outs.shape)) out_masks.broadcast_to(before_outs.shape))
if after_outs is not None:
after_outs = after_outs.masked_select( # if after_outs is not None:
out_masks.broadcast_to(after_outs.shape)) # after_outs = after_outs.masked_select(
# out_masks.broadcast_to(after_outs.shape))
ys = ys.masked_select(out_masks.broadcast_to(ys.shape)) ys = ys.masked_select(out_masks.broadcast_to(ys.shape))
duration_masks = make_non_pad_mask(ilens) duration_masks = make_non_pad_mask(ilens)
d_outs = d_outs.masked_select( d_outs = d_outs.masked_select(
@ -568,8 +574,8 @@ class FastSpeech2MIDILoss(nn.Layer):
pitch_masks = out_masks pitch_masks = out_masks
p_outs = p_outs.masked_select( p_outs = p_outs.masked_select(
pitch_masks.broadcast_to(p_outs.shape)) pitch_masks.broadcast_to(p_outs.shape))
e_outs = e_outs.masked_select( # e_outs = e_outs.masked_select(
pitch_masks.broadcast_to(e_outs.shape)) # pitch_masks.broadcast_to(e_outs.shape))
ps = ps.masked_select(pitch_masks.broadcast_to(ps.shape)) ps = ps.masked_select(pitch_masks.broadcast_to(ps.shape))
es = es.masked_select(pitch_masks.broadcast_to(es.shape)) es = es.masked_select(pitch_masks.broadcast_to(es.shape))
@ -585,11 +591,17 @@ class FastSpeech2MIDILoss(nn.Layer):
# calculate loss # calculate loss
l1_loss = self.l1_criterion(before_outs, ys) l1_loss = self.l1_criterion(before_outs, ys)
if after_outs is not None: # if after_outs is not None:
l1_loss += self.l1_criterion(after_outs, ys) # l1_loss += self.l1_criterion(after_outs, ys)
# ssim_loss += (1.0 - ssim(after_outs, ys))
l1_loss = l1_loss * 0.5
duration_loss = self.duration_criterion(d_outs, ds) duration_loss = self.duration_criterion(d_outs, ds)
pitch_loss = self.mse_criterion(p_outs, ps) # print("ppppppppppoooooooooooo: ", p_outs, p_outs.shape)
energy_loss = self.mse_criterion(e_outs, es) # print("ppppppppppssssssssssss: ", ps, ps.shape)
# pitch_loss = self.mse_criterion(p_outs, ps)
# energy_loss = self.mse_criterion(e_outs, es)
pitch_loss = self.l1_criterion(p_outs, ps)
if spk_logits is not None and spk_ids is not None: if spk_logits is not None and spk_ids is not None:
speaker_loss = self.ce_criterion(spk_logits, spk_ids) / batch_size speaker_loss = self.ce_criterion(spk_logits, spk_ids) / batch_size
@ -618,8 +630,8 @@ class FastSpeech2MIDILoss(nn.Layer):
pitch_loss = pitch_loss.multiply(pitch_weights) pitch_loss = pitch_loss.multiply(pitch_weights)
pitch_loss = pitch_loss.masked_select( pitch_loss = pitch_loss.masked_select(
pitch_masks.broadcast_to(pitch_loss.shape)).sum() pitch_masks.broadcast_to(pitch_loss.shape)).sum()
energy_loss = energy_loss.multiply(pitch_weights) # energy_loss = energy_loss.multiply(pitch_weights)
energy_loss = energy_loss.masked_select( # energy_loss = energy_loss.masked_select(
pitch_masks.broadcast_to(energy_loss.shape)).sum() # pitch_masks.broadcast_to(energy_loss.shape)).sum()
return l1_loss, duration_loss, pitch_loss, energy_loss, speaker_loss return l1_loss, ssim_loss, duration_loss, pitch_loss, energy_loss, speaker_loss
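A minimal sketch of the SSIM term introduced above, reusing the ssim/masked_fill/make_pad_mask helpers imported earlier in this file; masking the target as well is an extra safeguard the hunk itself does not take.

    import paddle
    from paddlespeech.t2s.modules.losses import ssim
    from paddlespeech.t2s.modules.masked_fill import masked_fill
    from paddlespeech.t2s.modules.nets_utils import make_pad_mask

    def mel_ssim_loss(before_outs: paddle.Tensor,
                      ys: paddle.Tensor,
                      olens: paddle.Tensor,
                      weight: float = 0.5) -> paddle.Tensor:
        # Zero padded frames so they do not contribute to the similarity score.
        pad_masks = make_pad_mask(olens).unsqueeze(-1)
        outs = masked_fill(before_outs, pad_masks, 0.0)
        tgts = masked_fill(ys, pad_masks, 0.0)
        # ssim expects image-like tensors, so add a channel axis: (B, 1, T, mel_bin).
        return weight * (1.0 - ssim(outs.unsqueeze(1), tgts.unsqueeze(1)))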

@ -93,6 +93,7 @@ class FastSpeech2(nn.Layer):
transformer_dec_dropout_rate: float=0.1, transformer_dec_dropout_rate: float=0.1,
transformer_dec_positional_dropout_rate: float=0.1, transformer_dec_positional_dropout_rate: float=0.1,
transformer_dec_attn_dropout_rate: float=0.1, transformer_dec_attn_dropout_rate: float=0.1,
transformer_activation_type: str="relu",
# for conformer # for conformer
conformer_pos_enc_layer_type: str="rel_pos", conformer_pos_enc_layer_type: str="rel_pos",
conformer_self_attn_layer_type: str="rel_selfattn", conformer_self_attn_layer_type: str="rel_selfattn",
@ -200,6 +201,8 @@ class FastSpeech2(nn.Layer):
Dropout rate after decoder positional encoding. Dropout rate after decoder positional encoding.
transformer_dec_attn_dropout_rate (float): transformer_dec_attn_dropout_rate (float):
Dropout rate in decoder self-attention module. Dropout rate in decoder self-attention module.
transformer_activation_type (str):
Activation function type in transformer.
conformer_pos_enc_layer_type (str): conformer_pos_enc_layer_type (str):
Pos encoding layer type in conformer. Pos encoding layer type in conformer.
conformer_self_attn_layer_type (str): conformer_self_attn_layer_type (str):
@ -250,7 +253,7 @@ class FastSpeech2(nn.Layer):
Kernel size of energy embedding. Kernel size of energy embedding.
energy_embed_dropout_rate (float): energy_embed_dropout_rate (float):
Dropout rate for energy embedding. Dropout rate for energy embedding.
stop_gradient_from_energy_predictorbool): stop_gradient_from_energy_predictor (bool):
Whether to stop gradient from energy predictor to encoder. Whether to stop gradient from energy predictor to encoder.
spk_num (Optional[int]): spk_num (Optional[int]):
Number of speakers. If not None, assume that the spk_embed_dim is not None, Number of speakers. If not None, assume that the spk_embed_dim is not None,
@ -269,7 +272,7 @@ class FastSpeech2(nn.Layer):
How to integrate tone embedding. How to integrate tone embedding.
init_type (str): init_type (str):
How to initialize transformer parameters. How to initialize transformer parameters.
init_enc_alpha float): init_enc_alpha (float):
Initial value of alpha in scaled pos encoding of the encoder. Initial value of alpha in scaled pos encoding of the encoder.
init_dec_alpha (float): init_dec_alpha (float):
Initial value of alpha in scaled pos encoding of the decoder. Initial value of alpha in scaled pos encoding of the decoder.
@ -344,7 +347,8 @@ class FastSpeech2(nn.Layer):
normalize_before=encoder_normalize_before, normalize_before=encoder_normalize_before,
concat_after=encoder_concat_after, concat_after=encoder_concat_after,
positionwise_layer_type=positionwise_layer_type, positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) positionwise_conv_kernel_size=positionwise_conv_kernel_size,
activation_type=transformer_activation_type)
elif encoder_type == "conformer": elif encoder_type == "conformer":
self.encoder = ConformerEncoder( self.encoder = ConformerEncoder(
idim=idim, idim=idim,
@ -453,7 +457,8 @@ class FastSpeech2(nn.Layer):
normalize_before=decoder_normalize_before, normalize_before=decoder_normalize_before,
concat_after=decoder_concat_after, concat_after=decoder_concat_after,
positionwise_layer_type=positionwise_layer_type, positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) positionwise_conv_kernel_size=positionwise_conv_kernel_size,
activation_type=conformer_activation_type, )
elif decoder_type == "conformer": elif decoder_type == "conformer":
self.decoder = ConformerEncoder( self.decoder = ConformerEncoder(
idim=0, idim=0,

@ -37,7 +37,8 @@ def get_activation(act, **kwargs):
"selu": paddle.nn.SELU, "selu": paddle.nn.SELU,
"leakyrelu": paddle.nn.LeakyReLU, "leakyrelu": paddle.nn.LeakyReLU,
"swish": paddle.nn.Swish, "swish": paddle.nn.Swish,
"glu": GLU "glu": GLU,
"gelu": paddle.nn.GELU,
} }
return activation_funcs[act](**kwargs) return activation_funcs[act](**kwargs)
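A minimal usage sketch for the new "gelu" entry, assuming get_activation lives in paddlespeech.t2s.modules.activation:

    import paddle
    from paddlespeech.t2s.modules.activation import get_activation

    # transformer_activation_type: "gelu" in the config resolves to paddle.nn.GELU here.
    act = get_activation("gelu")
    print(act(paddle.to_tensor([-1.0, 0.0, 1.0])))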

@ -40,7 +40,7 @@ class WaveNetDenoiser(nn.Layer):
layers (int, optional): layers (int, optional):
Number of residual blocks inside, by default 20 Number of residual blocks inside, by default 20
stacks (int, optional): stacks (int, optional):
The number of groups to split the residual blocks into, by default 4 The number of groups to split the residual blocks into, by default 5
Within each group, the dilation of the residual block grows exponentially. Within each group, the dilation of the residual block grows exponentially.
residual_channels (int, optional): residual_channels (int, optional):
Residual channel of the residual blocks, by default 256 Residual channel of the residual blocks, by default 256
@ -64,7 +64,7 @@ class WaveNetDenoiser(nn.Layer):
out_channels: int=80, out_channels: int=80,
kernel_size: int=3, kernel_size: int=3,
layers: int=20, layers: int=20,
stacks: int=4, stacks: int=5,
residual_channels: int=256, residual_channels: int=256,
gate_channels: int=512, gate_channels: int=512,
skip_channels: int=256, skip_channels: int=256,
@ -72,7 +72,7 @@ class WaveNetDenoiser(nn.Layer):
dropout: float=0., dropout: float=0.,
bias: bool=True, bias: bool=True,
use_weight_norm: bool=False, use_weight_norm: bool=False,
init_type: str="kaiming_uniform", ): init_type: str="kaiming_normal", ):
super().__init__() super().__init__()
# initialize parameters # initialize parameters
@ -118,18 +118,15 @@ class WaveNetDenoiser(nn.Layer):
bias=bias) bias=bias)
self.conv_layers.append(conv) self.conv_layers.append(conv)
final_conv = nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True)
nn.initializer.Constant(0.0)(final_conv.weight)
self.last_conv_layers = nn.Sequential(nn.ReLU(), self.last_conv_layers = nn.Sequential(nn.ReLU(),
nn.Conv1D( nn.Conv1D(
skip_channels, skip_channels,
skip_channels, skip_channels,
1, 1,
bias_attr=True), bias_attr=True),
nn.ReLU(), nn.ReLU(), final_conv)
nn.Conv1D(
skip_channels,
out_channels,
1,
bias_attr=True))
if use_weight_norm: if use_weight_norm:
self.apply_weight_norm() self.apply_weight_norm()
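The refactor above pulls the output projection into a named final_conv so it can be zero-initialized, which makes the denoiser predict zero noise at the start of training. A minimal sketch of that construction (channel sizes follow the defaults above):

    import paddle.nn as nn

    skip_channels, out_channels = 256, 80
    final_conv = nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True)
    # Start the last projection at zero, as in the hunk above.
    nn.initializer.Constant(0.0)(final_conv.weight)
    last_conv_layers = nn.Sequential(
        nn.ReLU(),
        nn.Conv1D(skip_channels, skip_channels, 1, bias_attr=True),
        nn.ReLU(),
        final_conv)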
@ -200,10 +197,6 @@ class GaussianDiffusion(nn.Layer):
Args: Args:
denoiser (Layer, optional): denoiser (Layer, optional):
The model used for denoising noises. The model used for denoising noises.
In fact, the denoiser model performs the operation
of producing a output with more noises from the noisy input.
Then we use the diffusion algorithm to calculate
the input with the output to get the denoised result.
num_train_timesteps (int, optional): num_train_timesteps (int, optional):
The number of timesteps between the noise and the real during training, by default 1000. The number of timesteps between the noise and the real during training, by default 1000.
beta_start (float, optional): beta_start (float, optional):
@ -233,7 +226,8 @@ class GaussianDiffusion(nn.Layer):
>>> def callback(index, timestep, num_timesteps, sample): >>> def callback(index, timestep, num_timesteps, sample):
>>> nonlocal pbar >>> nonlocal pbar
>>> if pbar is None: >>> if pbar is None:
>>> pbar = tqdm(total=num_timesteps-index) >>> pbar = tqdm(total=num_timesteps)
>>> pbar.update(index)
>>> pbar.update() >>> pbar.update()
>>> >>>
>>> return callback >>> return callback
@ -247,7 +241,7 @@ class GaussianDiffusion(nn.Layer):
>>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step)
>>> with paddle.no_grad(): >>> with paddle.no_grad():
>>> sample = diffusion.inference( >>> sample = diffusion.inference(
>>> paddle.randn(x.shape), c, x, >>> paddle.randn(x.shape), c, ref_x=x_in,
>>> num_inference_steps=infer_steps, >>> num_inference_steps=infer_steps,
>>> scheduler_type=scheduler_type, >>> scheduler_type=scheduler_type,
>>> callback=create_progress_callback()) >>> callback=create_progress_callback())
@ -262,7 +256,7 @@ class GaussianDiffusion(nn.Layer):
>>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step)
>>> with paddle.no_grad(): >>> with paddle.no_grad():
>>> sample = diffusion.inference( >>> sample = diffusion.inference(
>>> paddle.randn(x.shape), c, x_in, >>> paddle.randn(x.shape), c, ref_x=x_in,
>>> num_inference_steps=infer_steps, >>> num_inference_steps=infer_steps,
>>> scheduler_type=scheduler_type, >>> scheduler_type=scheduler_type,
>>> callback=create_progress_callback()) >>> callback=create_progress_callback())
@ -277,11 +271,11 @@ class GaussianDiffusion(nn.Layer):
>>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step)
>>> with paddle.no_grad(): >>> with paddle.no_grad():
>>> sample = diffusion.inference( >>> sample = diffusion.inference(
>>> paddle.randn(x.shape), c, None, >>> paddle.randn(x.shape), c, ref_x=x_in,
>>> num_inference_steps=infer_steps, >>> num_inference_steps=infer_steps,
>>> scheduler_type=scheduler_type, >>> scheduler_type=scheduler_type,
>>> callback=create_progress_callback()) >>> callback=create_progress_callback())
100%|| 25/25 [00:01<00:00, 19.75it/s] 100%|| 34/34 [00:01<00:00, 19.75it/s]
>>> >>>
>>> # ds=1000, K_step=100, scheduler=pndm, infer_step=50, from aux fs2 mel output >>> # ds=1000, K_step=100, scheduler=pndm, infer_step=50, from aux fs2 mel output
>>> ds = 1000 >>> ds = 1000
@ -292,11 +286,11 @@ class GaussianDiffusion(nn.Layer):
>>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step)
>>> with paddle.no_grad(): >>> with paddle.no_grad():
>>> sample = diffusion.inference( >>> sample = diffusion.inference(
>>> paddle.randn(x.shape), c, x, >>> paddle.randn(x.shape), c, ref_x=x_in,
>>> num_inference_steps=infer_steps, >>> num_inference_steps=infer_steps,
>>> scheduler_type=scheduler_type, >>> scheduler_type=scheduler_type,
>>> callback=create_progress_callback()) >>> callback=create_progress_callback())
100%|| 5/5 [00:00<00:00, 23.80it/s] 100%|| 14/14 [00:00<00:00, 23.80it/s]
""" """
@ -366,6 +360,8 @@ class GaussianDiffusion(nn.Layer):
num_inference_steps: Optional[int]=1000, num_inference_steps: Optional[int]=1000,
strength: Optional[float]=None, strength: Optional[float]=None,
scheduler_type: Optional[str]="ddpm", scheduler_type: Optional[str]="ddpm",
clip_noise: Optional[bool]=True,
clip_noise_range: Optional[Tuple[float, float]]=(-1, 1),
callback: Optional[Callable[[int, int, int, paddle.Tensor], callback: Optional[Callable[[int, int, int, paddle.Tensor],
None]]=None, None]]=None,
callback_steps: Optional[int]=1): callback_steps: Optional[int]=1):
@ -386,6 +382,10 @@ class GaussianDiffusion(nn.Layer):
scheduler_type (str, optional): scheduler_type (str, optional):
Noise scheduler for generate noises. Noise scheduler for generate noises.
Choose a great scheduler can skip many denoising step, by default 'ddpm'. Choose a great scheduler can skip many denoising step, by default 'ddpm'.
clip_noise (bool, optional):
Whether to clip each denoised output, by default True.
clip_noise_range (tuple, optional):
Denoised output min and max value range after clipping, by default (-1, 1).
callback (Callable[[int,int,int,Tensor], None], optional): callback (Callable[[int,int,int,Tensor], None], optional):
Callback function during denoising steps. Callback function during denoising steps.
@ -426,6 +426,7 @@ class GaussianDiffusion(nn.Layer):
scheduler.set_timesteps(num_inference_steps) scheduler.set_timesteps(num_inference_steps)
# prepare first noise variables # prepare first noise variables
import pdb;pdb.set_trace()
noisy_input = noise noisy_input = noise
timesteps = scheduler.timesteps timesteps = scheduler.timesteps
if ref_x is not None: if ref_x is not None:
@ -444,8 +445,13 @@ class GaussianDiffusion(nn.Layer):
noisy_input = scheduler.add_noise( noisy_input = scheduler.add_noise(
ref_x, noise, timesteps[:1].tile([noise.shape[0]])) ref_x, noise, timesteps[:1].tile([noise.shape[0]]))
# denoising loop # denoising loop
denoised_output = noisy_input denoised_output = noisy_input
if clip_noise:
n_min, n_max = clip_noise_range
denoised_output = paddle.clip(denoised_output, n_min, n_max)
num_warmup_steps = len( num_warmup_steps = len(
timesteps) - num_inference_steps * scheduler.order timesteps) - num_inference_steps * scheduler.order
for i, t in enumerate(timesteps): for i, t in enumerate(timesteps):
@ -457,6 +463,8 @@ class GaussianDiffusion(nn.Layer):
# compute the previous noisy sample x_t -> x_t-1 # compute the previous noisy sample x_t -> x_t-1
denoised_output = scheduler.step(noise_pred, t, denoised_output = scheduler.step(noise_pred, t,
denoised_output).prev_sample denoised_output).prev_sample
if clip_noise:
denoised_output = paddle.clip(denoised_output, n_min, n_max)
# call the callback, if provided # call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and
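A minimal sketch of the per-step clipping added above: the initial noisy input and every intermediate denoised sample are kept inside clip_noise_range.

    import paddle

    def clip_denoised(denoised_output: paddle.Tensor,
                      clip_noise: bool = True,
                      clip_noise_range=(-1.0, 1.0)) -> paddle.Tensor:
        # Applied to the starting sample and after every scheduler.step() call.
        if clip_noise:
            n_min, n_max = clip_noise_range
            denoised_output = paddle.clip(denoised_output, n_min, n_max)
        return denoised_output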

@ -38,11 +38,9 @@ def masked_fill(xs: paddle.Tensor,
value: Union[float, int]): value: Union[float, int]):
# comment following line for converting dygraph to static graph. # comment following line for converting dygraph to static graph.
# assert is_broadcastable(xs.shape, mask.shape) is True # assert is_broadcastable(xs.shape, mask.shape) is True
# bshape = paddle.broadcast_shape(xs.shape, mask.shape)
bshape = broadcast_shape(xs.shape, mask.shape) bshape = broadcast_shape(xs.shape, mask.shape)
mask.stop_gradient = True mask.stop_gradient = True
mask = mask.broadcast_to(bshape) mask = mask.broadcast_to(bshape)
trues = paddle.ones_like(xs) * value trues = paddle.ones_like(xs) * value
mask = mask.cast(dtype=paddle.bool) mask = mask.cast(dtype=paddle.bool)
xs = paddle.where(mask, trues, xs) xs = paddle.where(mask, trues, xs)
