fix diffsinger, test=tts

Branch: pull/2834/head
Author: liangym (3 years ago)
Parent: ef7d15dc02
Commit: c91dc02931

@ -24,79 +24,84 @@ f0max: 750 # Maximum f0 for pitch extraction.
# DATA SETTING #
###########################################################
batch_size: 32
num_workers: 4
num_workers: 1
###########################################################
# MODEL SETTING #
###########################################################
# fastspeech2 module
fs2_model:
adim: 256 # attention dimension
aheads: 2 # number of attention heads
elayers: 4 # number of encoder layers
eunits: 1536 # number of encoder ff units
dlayers: 4 # number of decoder layers
dunits: 1536 # number of decoder ff units
positionwise_layer_type: conv1d # type of position-wise layer
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
reduction_factor: 1 # reduction factor
init_type: xavier_uniform # initialization type
init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding
init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
model:
# music score related
note_num: 300
is_slur_num: 2
denoiser_model:
in_channels: 80
out_channels: 80
kernel_size: 3
layers: 20
stacks: 4
residual_channels: 256
gate_channels: 512
skip_channels: 256
aux_channels: 256
dropout: 0.1
bias: True
use_weight_norm: False
init_type: kaiming_uniform
diffusion:
num_train_timesteps: 100
beta_start: 0.0001
beta_end: 0.06
beta_schedule: "squaredcos_cap_v2"
num_max_timesteps: 60
# fastspeech2 module
fastspeech2_params:
adim: 256 # attention dimension
aheads: 2 # number of attention heads
elayers: 4 # number of encoder layers
eunits: 1536 # number of encoder ff units
dlayers: 4 # number of decoder layers
dunits: 1536 # number of decoder ff units
positionwise_layer_type: conv1d # type of position-wise layer
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
reduction_factor: 1 # reduction factor
init_type: xavier_uniform # initialization type
init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding
init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
# denoiser module
denoiser_params:
in_channels: 80
out_channels: 80
kernel_size: 3
layers: 20
stacks: 5
residual_channels: 256
gate_channels: 512
skip_channels: 256
aux_channels: 256
dropout: 0.1
bias: True
use_weight_norm: False
init_type: "kaiming_normal"
# diffusion module
diffusion_params:
num_train_timesteps: 100
beta_start: 0.0001
beta_end: 0.06
beta_schedule: "squaredcos_cap_v2"
num_max_timesteps: 60
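For orientation (not part of this commit): `squaredcos_cap_v2` is the diffusers-style name for the capped squared-cosine noise schedule. Assuming that convention (the scheduler implementation is not shown in this diff, and `beta_start`/`beta_end` normally matter only for linear-type schedules), the betas would be computed roughly as follows:

```python
# Hedged sketch: diffusers-style "squaredcos_cap_v2" betas (assumption; the
# actual scheduler code is not part of this diff).
import math
import numpy as np

def squaredcos_cap_v2_betas(num_train_timesteps: int = 100,
                            max_beta: float = 0.999) -> np.ndarray:
    def alpha_bar(t: float) -> float:
        # cumulative signal level at normalized time t in [0, 1]
        return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

    betas = []
    for i in range(num_train_timesteps):
        t1 = i / num_train_timesteps
        t2 = (i + 1) / num_train_timesteps
        # per-step noise increment, capped so alpha never reaches zero
        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
    return np.array(betas, dtype=np.float32)

print(squaredcos_cap_v2_betas(100)[[0, -1]])  # tiny first beta, large last beta
```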
###########################################################
@ -112,7 +117,6 @@ ds_updater:
###########################################################
# OPTIMIZER SETTING #
###########################################################
# gpu_num=2 config
# fastspeech2 optimizer
fs2_optimizer:
optim: adam # optimizer type
@ -134,10 +138,10 @@ ds_grad_norm: 1
###########################################################
# INTERVAL SETTING #
###########################################################
ds_train_start_steps: 80000 # Number of steps to start to train diffusion module.
train_max_steps: 160000 # Number of training steps.
ds_train_start_steps: 160000 # Number of steps to start to train diffusion module.
train_max_steps: 320000 # Number of training steps.
save_interval_steps: 1000 # Interval steps to save checkpoint.
eval_interval_steps: 250 # Interval steps to evaluate the network.
eval_interval_steps: 1000 # Interval steps to evaluate the network.
num_snapshots: 5
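Taken together with the updater changes further down, these values mean the FastSpeech2 module is trained alone for the first 160k steps, and the diffusion module alone (with FastSpeech2 frozen) for the remaining 160k. A condensed illustration of that gating, mirroring `DiffSingerUpdater.update_core`:

```python
# Condensed illustration of the two-stage schedule implied by this config
# (see DiffSingerUpdater below); not the full updater logic.
DS_TRAIN_START_STEPS = 160000
TRAIN_MAX_STEPS = 320000

def training_stage(iteration: int) -> str:
    if iteration < DS_TRAIN_START_STEPS:
        return "fastspeech2"   # optimize model.fs2 with fs2_optimizer
    if iteration <= TRAIN_MAX_STEPS:
        return "diffusion"     # freeze model.fs2, optimize model.diffusion
    return "done"

print(training_stage(1000), training_stage(200000))  # fastspeech2 diffusion
```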

@ -8,5 +8,5 @@ python3 ${BIN_DIR}/train.py \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=2 \
--ngpu=1 \
--phones-dict=dump/phone_id_map.txt

@ -3,13 +3,13 @@
set -e
source path.sh
gpus=4,5
gpus=0
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_153.pdz
ckpt_name=snapshot_iter_320000.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
@ -30,8 +30,3 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import List
import librosa
import numpy as np
@ -42,7 +44,16 @@ def get_phn_dur(file_name):
f.close()
return sentence, speaker_set
def note2midi(notes):
def note2midi(notes: List[str]) -> List[int]:
"""Convert note strings to note ids, for example: ["C1"] -> [24]
Args:
notes (List[str]): the list of note strings
Returns:
List[int]: the list of note ids
"""
midis = []
for note in notes:
if note == 'rest':
@ -53,7 +64,21 @@ def note2midi(notes):
return midis
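The body of the loop is elided by the hunk above; a hedged reconstruction of the whole helper is shown below. The `rest -> 0` mapping and the split on '/' (scores sometimes write enharmonic pairs such as "A#4/Bb4") are assumptions about the score format, not taken from this diff:

```python
# Hedged reconstruction of note2midi; only the fragments above are from the
# diff, the else-branch is an assumption (librosa.note_to_midi for real notes).
from typing import List
import librosa

def note2midi_sketch(notes: List[str]) -> List[int]:
    midis = []
    for note in notes:
        if note == 'rest':
            midis.append(0)
        else:
            # "A#4/Bb4" style enharmonic pairs: keep the first spelling
            midis.append(int(librosa.note_to_midi(note.split('/')[0])))
    return midis

print(note2midi_sketch(["C1", "rest", "A#4/Bb4"]))  # [24, 0, 70]
```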
def time2frame(times, sample_rate: int=24000, n_shift: int=128,):
def time2frame(
times: List[float],
sample_rate: int=24000,
n_shift: int=128, ) -> List[int]:
"""Convert the phoneme duration of time(s) into frames
Args:
times (List[float]): phoneme durations in seconds
sample_rate (int, optional): sample rate. Defaults to 24000.
n_shift (int, optional): frame shift. Defaults to 128.
Returns:
List[int]: phoneme durations in frames
"""
end = 0.0
ends = []
for t in times:
@ -63,14 +88,20 @@ def time2frame(times, sample_rate: int=24000, n_shift: int=128,):
durations = np.diff(frame_pos, prepend=0)
return durations
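Only the first and last lines of this helper survive in the hunk above; a hedged reconstruction is given below. Mapping the accumulated end times to frame indices with `librosa.time_to_frames` is an assumption consistent with the `librosa` import in this file, and the exact rounding may differ from the original:

```python
# Hedged reconstruction of time2frame: accumulate end times, convert to frame
# indices, then diff to get per-phoneme frame counts (rounding is assumed).
from typing import List
import librosa
import numpy as np

def time2frame_sketch(times: List[float],
                      sample_rate: int = 24000,
                      n_shift: int = 128) -> np.ndarray:
    end = 0.0
    ends = []
    for t in times:
        end += t
        ends.append(end)
    frame_pos = librosa.time_to_frames(ends, sr=sample_rate, hop_length=n_shift)
    durations = np.diff(frame_pos, prepend=0)
    return durations

print(time2frame_sketch([0.1, 0.2, 0.1]))  # e.g. [18 38 19] at 24 kHz, hop 128
```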
def get_sentences_svs(file_name, dataset: str='opencpop', sample_rate: int=24000, n_shift: int=128,):
def get_sentences_svs(
file_name,
dataset: str='opencpop',
sample_rate: int=24000,
n_shift: int=128, ):
'''
read label file
Args:
file_name (str or Path): path of gen_duration_from_textgrid.py's result
dataset (str): dataset name
Returns:
Dict: sentence: {'utt': ([char], [int])}
Dict: the information of each sentence, including [phone ids (int)], [phone durations in frames (int)], [note ids (int)], [note durations (float)], [is_slur flags (int)], text (str), speaker name (str)
tuple: speaker names
'''
f = open(file_name, 'r')
sentence = {}
@ -87,7 +118,10 @@ def get_sentences_svs(file_name, dataset: str='opencpop', sample_rate: int=24000
ph_dur = time2frame([float(t) for t in line_list[5].split()])
is_slur = line_list[6].split()
assert len(ph) == len(midi) == len(midi_dur) == len(is_slur)
sentence[utt] = (ph, [int(i) for i in ph_dur], [int(i) for i in midi], [float(i) for i in midi_dur], [int(i) for i in is_slur], text, "opencpop")
sentence[utt] = (ph, [int(i) for i in ph_dur],
[int(i) for i in midi],
[float(i) for i in midi_dur],
[int(i) for i in is_slur], text, "opencpop")
else:
print("dataset should in {opencpop} now!")

@ -37,21 +37,28 @@ from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
from paddlespeech.t2s.utils import str2bool
ALL_SHENGMU = ['zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j',
'q', 'x', 'r', 'z', 'c', 's', 'y', 'w']
ALL_YUNMU = ['a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian',
'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iu', 'ng', 'o', 'ong', 'ou',
'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've', 'vn']
def process_sentence(config: Dict[str, Any],
fp: Path,
sentences: Dict,
output_dir: Path,
mel_extractor=None,
pitch_extractor=None,
energy_extractor=None,
cut_sil: bool=True,
spk_emb_dir: Path=None,):
ALL_INITIALS = [
'zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h',
'j', 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w'
]
ALL_FINALS = [
'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia',
'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iu', 'ng', 'o', 'ong',
'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've',
'vn'
]
def process_sentence(
config: Dict[str, Any],
fp: Path,
sentences: Dict,
output_dir: Path,
mel_extractor=None,
pitch_extractor=None,
energy_extractor=None,
cut_sil: bool=True,
spk_emb_dir: Path=None, ):
utt_id = fp.stem
record = None
if utt_id in sentences:
@ -71,7 +78,7 @@ def process_sentence(config: Dict[str, Any],
note_dur = sentences[utt_id][3]
is_slur = sentences[utt_id][4]
speaker = sentences[utt_id][-1]
# extract mel feats
logmel = mel_extractor.get_log_mel_fbank(wav)
# change duration according to mel_length
@ -82,9 +89,13 @@ def process_sentence(config: Dict[str, Any],
phones = sentences[utt_id][0]
durations = sentences[utt_id][1]
num_frames = logmel.shape[0]
word_boundary = [1 if x in ALL_YUNMU + ['AP', 'SP'] else 0 for x in phones]
word_boundary = [
1 if x in ALL_FINALS + ['AP', 'SP'] else 0 for x in phones
]
# print(sum(durations), num_frames)
assert sum(durations) == num_frames, "the sum of durations doesn't equal to the num of mel frames. "
assert sum(
durations
) == num_frames, "the sum of durations doesn't equal the number of mel frames."
speech_dir = output_dir / "data_speech"
speech_dir.mkdir(parents=True, exist_ok=True)
speech_path = speech_dir / (utt_id + "_speech.npy")
@ -128,17 +139,18 @@ def process_sentence(config: Dict[str, Any],
return record
def process_sentences(config,
fps: List[Path],
sentences: Dict,
output_dir: Path,
mel_extractor=None,
pitch_extractor=None,
energy_extractor=None,
nprocs: int=1,
cut_sil: bool=True,
spk_emb_dir: Path=None,
write_metadata_method: str='w',):
def process_sentences(
config,
fps: List[Path],
sentences: Dict,
output_dir: Path,
mel_extractor=None,
pitch_extractor=None,
energy_extractor=None,
nprocs: int=1,
cut_sil: bool=True,
spk_emb_dir: Path=None,
write_metadata_method: str='w', ):
if nprocs == 1:
results = []
for fp in tqdm.tqdm(fps, total=len(fps)):
@ -151,7 +163,7 @@ def process_sentences(config,
pitch_extractor=pitch_extractor,
energy_extractor=energy_extractor,
cut_sil=cut_sil,
spk_emb_dir=spk_emb_dir,)
spk_emb_dir=spk_emb_dir, )
if record:
results.append(record)
else:
@ -159,10 +171,17 @@ def process_sentences(config,
futures = []
with tqdm.tqdm(total=len(fps)) as progress:
for fp in fps:
future = pool.submit(process_sentence, config, fp,
sentences, output_dir, mel_extractor,
pitch_extractor, energy_extractor,
cut_sil, spk_emb_dir,)
future = pool.submit(
process_sentence,
config,
fp,
sentences,
output_dir,
mel_extractor,
pitch_extractor,
energy_extractor,
cut_sil,
spk_emb_dir, )
future.add_done_callback(lambda p: progress.update())
futures.append(future)
@ -202,7 +221,7 @@ def main():
parser.add_argument(
"--label-file", default=None, type=str, help="path to label file.")
parser.add_argument("--config", type=str, help="diffsinger config file.")
parser.add_argument(
@ -235,7 +254,6 @@ def main():
dumpdir.mkdir(parents=True, exist_ok=True)
label_file = Path(args.label_file).expanduser()
if args.spk_emb_dir:
spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve()
else:
@ -243,11 +261,15 @@ def main():
assert rootdir.is_dir()
assert label_file.is_file()
with open(args.config, 'rt') as f:
config = CfgNode(yaml.safe_load(f))
sentences, speaker_set = get_sentences_svs(label_file, dataset=args.dataset, sample_rate=config.fs, n_shift=config.n_shift,)
sentences, speaker_set = get_sentences_svs(
label_file,
dataset=args.dataset,
sample_rate=config.fs,
n_shift=config.n_shift, )
# merge_silence(sentences)
phone_id_map_path = dumpdir / "phone_id_map.txt"

@ -37,7 +37,7 @@ from paddlespeech.t2s.models.diffsinger import DiffSinger
from paddlespeech.t2s.models.diffsinger import DiffSingerEvaluator
from paddlespeech.t2s.models.diffsinger import DiffSingerUpdater
from paddlespeech.t2s.models.diffsinger import DiffusionLoss
from paddlespeech.t2s.models.diffsinger import FastSpeech2MIDILoss
from paddlespeech.t2s.models.diffsinger.fastspeech2midi import FastSpeech2MIDILoss
from paddlespeech.t2s.training.extensions.snapshot import Snapshot
from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.optimizer import build_optimizers
@ -45,6 +45,9 @@ from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer
from paddlespeech.t2s.utils import str2bool
# from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss
def train_sp(args, config):
# decides device type and whether to run in parallel
# setup running environment correctly
@ -75,11 +78,6 @@ def train_sp(args, config):
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
fields += ["spk_id"]
elif args.voice_cloning:
print("Training voice cloning!")
collate_fn = diffsinger_multi_spk_batch_fn
fields += ["spk_emb"]
converters["spk_emb"] = np.load
else:
collate_fn = diffsinger_single_spk_batch_fn
print("single speaker diffsinger!")
@ -133,30 +131,28 @@ def train_sp(args, config):
print("vocab_size:", vocab_size)
odim = config.n_mels
config["fs2_model"]["idim"] = vocab_size
config["fs2_model"]["odim"] = odim
config["fs2_model"]["spk_num"] = spk_num
model = DiffSinger(
fs2_config=config["fs2_model"],
denoiser_config=config["denoiser_model"],
diffusion_config=config["diffusion"])
config["model"]["fastspeech2_params"]["spk_num"] = spk_num
model = DiffSinger(idim=vocab_size, odim=odim, **config["model"])
model_fs2 = model.fs2
model_ds = model.diffusion
if world_size > 1:
model = DataParallel(model)
model_fs2 = model._layers.fs2
model_ds = model._layers.diffusion
print("models done!")
# criterion_fs2 = FastSpeech2Loss(**config["fs2_updater"])
criterion_fs2 = FastSpeech2MIDILoss(**config["fs2_updater"])
criterion_ds = DiffusionLoss(**config["ds_updater"])
print("criterions done!")
optimizer_fs2 = build_optimizers(model._layers.fs2,
**config["fs2_optimizer"])
optimizer_fs2 = build_optimizers(model_fs2, **config["fs2_optimizer"])
lr_schedule_ds = StepDecay(**config["ds_scheduler_params"])
gradient_clip_ds = nn.ClipGradByGlobalNorm(config["ds_grad_norm"])
optimizer_ds = AdamW(
learning_rate=lr_schedule_ds,
grad_clip=gradient_clip_ds,
parameters=model._layers.diffusion.parameters(),
parameters=model_ds.parameters(),
**config["ds_optimizer_params"])
# optimizer_ds = build_optimizers(ds, **config["ds_optimizer"])
print("optimizer done!")
@ -189,7 +185,8 @@ def train_sp(args, config):
"ds": criterion_ds,
},
dataloader=dev_dataloader,
output_dir=output_dir)
output_dir=output_dir,)
trainer = Trainer(
updater,
stop_trigger=(config.train_max_steps, "iteration"),
@ -224,12 +221,6 @@ def main():
default=None,
help="speaker id map file for multiple speaker model.")
parser.add_argument(
"--voice-cloning",
type=str2bool,
default=False,
help="whether training voice cloning model.")
args = parser.parse_args()
with open(args.config) as f:
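Both this training entry point and the synthesis helper in the next hunk now build the model the same way: fill `spk_num` into `config["model"]["fastspeech2_params"]`, then unpack `config["model"]` into `DiffSinger`. A minimal sketch of that shared pattern (the vocab size is illustrative, and the `yacs` import path is assumed rather than shown in this diff):

```python
# Minimal sketch of the shared config-driven construction (illustrative
# values; the CfgNode import path is an assumption, not shown in this diff).
import yaml
from yacs.config import CfgNode
from paddlespeech.t2s.models.diffsinger import DiffSinger

with open("conf/default.yaml", 'rt') as f:
    config = CfgNode(yaml.safe_load(f))

vocab_size = 92                                           # len(phone_id_map.txt), illustrative
config["model"]["fastspeech2_params"]["spk_num"] = None   # single-speaker setup
model = DiffSinger(idim=vocab_size, odim=config.n_mels, **config["model"])
```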

@ -23,7 +23,6 @@ from typing import Optional
import numpy as np
import onnxruntime as ort
import paddle
import yaml
from paddle import inference
from paddle import jit
from paddle.io import DataLoader
@ -358,13 +357,8 @@ def get_am_inference(am: str='fastspeech2_csmsc',
am = am_class(
idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"])
elif am_name == 'diffsinger':
am_config["fs2_model"]["idim"] = vocab_size
am_config["fs2_model"]["odim"] = am_config.n_mels
am_config["fs2_model"]["spk_num"] = spk_num
am = am_class(
fs2_config=am_config["fs2_model"],
denoiser_config=am_config["denoiser_model"],
diffusion_config=am_config["diffusion"])
am_config["model"]["fastspeech2_params"]["spk_num"] = spk_num
am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
elif am_name == 'speedyspeech':
am = am_class(
vocab_size=vocab_size,

File diff suppressed because it is too large.

@ -44,8 +44,9 @@ class DiffSingerUpdater(StandardUpdater):
fs2_train_start_steps: int=0,
ds_train_start_steps: int=160000,
output_dir: Path=None, ):
super().__init__(model, optimizers, dataloader, init_state=None)
self.model = model._layers if isinstance(model,
paddle.DataParallel) else model
self.optimizers = optimizers
self.optimizer_fs2: Optimizer = optimizers['fs2']
@ -79,7 +80,7 @@ class DiffSingerUpdater(StandardUpdater):
if spk_emb is not None:
spk_id = None
# fastspeech2
# train only the fastspeech2 module first
if self.state.iteration > self.fs2_train_start_steps and self.state.iteration < self.ds_train_start_steps:
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model(
text=batch["text"],
@ -133,8 +134,9 @@ class DiffSingerUpdater(StandardUpdater):
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
# Then only train diffusion module, freeze fastspeech2 parameters.
if self.state.iteration > self.ds_train_start_steps:
for param in self.model._layers.fs2.parameters():
for param in self.model.fs2.parameters():
param.trainable = False
mel, mel_masks = self.model(
@ -183,12 +185,12 @@ class DiffSingerEvaluator(StandardEvaluator):
dataloader: DataLoader,
output_dir: Path=None, ):
super().__init__(model, dataloader)
self.model = model
self.model = model._layers if isinstance(model,
paddle.DataParallel) else model
self.criterions = criterions
self.criterion_fs2 = criterions['fs2']
self.criterion_ds = criterions['ds']
self.dataloader = dataloader
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
@ -206,6 +208,7 @@ class DiffSingerEvaluator(StandardEvaluator):
if spk_emb is not None:
spk_id = None
# diffsinger evaluation forward
mel, mel_masks = self.model(
text=batch["text"],
note=batch["note"],
@ -227,14 +230,13 @@ class DiffSingerEvaluator(StandardEvaluator):
ref_mels=batch["speech"],
out_mels=mel,
mel_masks=mel_masks, )
loss_ds = l1_loss_ds
report("train/loss_ds", float(loss_ds))
report("train/l1_loss_ds", float(l1_loss_ds))
report("eval/loss_ds", float(loss_ds))
report("eval/l1_loss_ds", float(l1_loss_ds))
losses_dict["l1_loss_ds"] = float(l1_loss_ds)
losses_dict["loss_ds"] = float(loss_ds)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
for k, v in losses_dict.items())
self.logger.info(self.msg)

@ -0,0 +1,625 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
from typing import Any
from typing import Dict
from typing import Sequence
from typing import Tuple
import paddle
from paddle import nn
from typeguard import check_argument_types
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
from paddlespeech.t2s.modules.nets_utils import make_pad_mask
from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictorLoss
class FastSpeech2MIDI(FastSpeech2):
"""The Fastspeech2 module of DiffSinger.
"""
def __init__(
self,
# fastspeech2 network structure related
idim: int,
odim: int,
fastspeech2_config: Dict[str, Any],
# note emb
note_num: int=300,
# is_slur emb
is_slur_num: int=2, ):
"""Initialize FastSpeech2 module for svs.
Args:
fastspeech2_config (Dict):
The config of FastSpeech2 module on DiffSinger model
note_num (Optional[int]):
Number of notes. If not None, assume that the
note_ids will be provided as the input and use note_embedding_table.
is_slur_num (Optional[int]):
Number of slur states. If not None, assume that the
is_slur_ids will be provided as the input.
"""
assert check_argument_types()
super().__init__(idim=idim, odim=odim, **fastspeech2_config)
self.note_embed_dim = self.is_slur_embed_dim = fastspeech2_config[
"adim"]
if note_num is not None:
self.note_embedding_table = nn.Embedding(
num_embeddings=note_num,
embedding_dim=self.note_embed_dim,
padding_idx=self.padding_idx)
self.note_dur_layer = nn.Linear(1, self.note_embed_dim)
if is_slur_num is not None:
self.is_slur_embedding_table = nn.Embedding(
num_embeddings=is_slur_num,
embedding_dim=self.is_slur_embed_dim,
padding_idx=self.padding_idx)
def forward(
self,
text: paddle.Tensor,
note: paddle.Tensor,
note_dur: paddle.Tensor,
is_slur: paddle.Tensor,
text_lengths: paddle.Tensor,
speech: paddle.Tensor,
speech_lengths: paddle.Tensor,
durations: paddle.Tensor,
pitch: paddle.Tensor,
energy: paddle.Tensor,
spk_emb: paddle.Tensor=None,
spk_id: paddle.Tensor=None,
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
"""Calculate forward propagation.
Args:
text(Tensor(int64)):
Batch of padded token (phone) ids (B, Tmax).
note(Tensor(int64)):
Batch of padded note (element in music score) ids (B, Tmax).
note_dur(Tensor(float32)):
Batch of padded note durations in seconds (element in music score) (B, Tmax).
is_slur(Tensor(int64)):
Batch of padded slur (element in music score) ids (B, Tmax).
text_lengths(Tensor(int64)):
Batch of phone lengths of each input (B,).
speech(Tensor[float32]):
Batch of padded target features (e.g. mel) (B, Lmax, odim).
speech_lengths(Tensor(int64)):
Batch of the lengths of each target features (B,).
durations(Tensor(int64)):
Batch of padded token durations in frame (B, Tmax).
pitch(Tensor[float32]):
Batch of padded frame-averaged pitch (B, Lmax, 1).
energy(Tensor[float32]):
Batch of padded frame-averaged energy (B, Lmax, 1).
spk_emb(Tensor[float32], optional):
Batch of speaker embeddings (B, spk_embed_dim).
spk_id(Tensor(int64), optional):
Batch of speaker ids (B,)
Returns:
"""
xs = paddle.cast(text, 'int64')
note = paddle.cast(note, 'int64')
note_dur = paddle.cast(note_dur, 'float32')
is_slur = paddle.cast(is_slur, 'int64')
ilens = paddle.cast(text_lengths, 'int64')
olens = paddle.cast(speech_lengths, 'int64')
ds = paddle.cast(durations, 'int64')
ps = pitch
es = energy
ys = speech
olens = speech_lengths
if spk_id is not None:
spk_id = paddle.cast(spk_id, 'int64')
# forward propagation
before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits = self._forward(
xs,
note,
note_dur,
is_slur,
ilens,
olens,
ds,
ps,
es,
is_inference=False,
spk_emb=spk_emb,
spk_id=spk_id, )
# modify mod part of groundtruth
if self.reduction_factor > 1:
olens = olens - olens % self.reduction_factor
max_olen = max(olens)
ys = ys[:, :max_olen]
return before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits
def _forward(
self,
xs: paddle.Tensor,
note: paddle.Tensor,
note_dur: paddle.Tensor,
is_slur: paddle.Tensor,
ilens: paddle.Tensor,
olens: paddle.Tensor=None,
ds: paddle.Tensor=None,
ps: paddle.Tensor=None,
es: paddle.Tensor=None,
is_inference: bool=False,
is_train_diffusion: bool=False,
return_after_enc=False,
alpha: float=1.0,
spk_emb=None,
spk_id=None, ) -> Sequence[paddle.Tensor]:
# forward encoder
x_masks = self._source_mask(ilens)
note_emb = self.note_embedding_table(note)
note_dur_emb = self.note_dur_layer(paddle.unsqueeze(note_dur, axis=-1))
is_slur_emb = self.is_slur_embedding_table(is_slur)
# (B, Tmax, adim)
hs, _ = self.encoder(
xs,
x_masks,
note_emb,
note_dur_emb,
is_slur_emb, )
if self.spk_num and self.enable_speaker_classifier and not is_inference:
hs_for_spk_cls = self.grad_reverse(hs)
spk_logits = self.speaker_classifier(hs_for_spk_cls, ilens)
else:
spk_logits = None
# integrate speaker embedding
if self.spk_embed_dim is not None:
# spk_emb has a higher priority than spk_id
if spk_emb is not None:
hs = self._integrate_with_spk_embed(hs, spk_emb)
elif spk_id is not None:
spk_emb = self.spk_embedding_table(spk_id)
hs = self._integrate_with_spk_embed(hs, spk_emb)
# forward duration predictor and variance predictors
d_masks = make_pad_mask(ilens)
if olens is not None:
pitch_masks = make_pad_mask(olens).unsqueeze(-1)
else:
pitch_masks = None
# inference for decoder input for diffusion
if is_train_diffusion:
hs = self.length_regulator(hs, ds, is_inference=False)
p_outs = self.pitch_predictor(hs.detach(), pitch_masks)
e_outs = self.energy_predictor(hs.detach(), pitch_masks)
p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1))
e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1))
hs = hs + e_embs + p_embs
elif is_inference:
# (B, Tmax)
if ds is not None:
d_outs = ds
else:
d_outs = self.duration_predictor.inference(hs, d_masks)
# (B, Lmax, adim)
hs = self.length_regulator(hs, d_outs, alpha, is_inference=True)
if ps is not None:
p_outs = ps
else:
if self.stop_gradient_from_pitch_predictor:
p_outs = self.pitch_predictor(hs.detach(), pitch_masks)
else:
p_outs = self.pitch_predictor(hs, pitch_masks)
if es is not None:
e_outs = es
else:
if self.stop_gradient_from_energy_predictor:
e_outs = self.energy_predictor(hs.detach(), pitch_masks)
else:
e_outs = self.energy_predictor(hs, pitch_masks)
p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1))
e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1))
hs = hs + e_embs + p_embs
# training
else:
d_outs = self.duration_predictor(hs, d_masks)
# (B, Lmax, adim)
hs = self.length_regulator(hs, ds, is_inference=False)
if self.stop_gradient_from_pitch_predictor:
p_outs = self.pitch_predictor(hs.detach(), pitch_masks)
else:
p_outs = self.pitch_predictor(hs, pitch_masks)
if self.stop_gradient_from_energy_predictor:
e_outs = self.energy_predictor(hs.detach(), pitch_masks)
else:
e_outs = self.energy_predictor(hs, pitch_masks)
p_embs = self.pitch_embed(ps.transpose((0, 2, 1))).transpose(
(0, 2, 1))
e_embs = self.energy_embed(es.transpose((0, 2, 1))).transpose(
(0, 2, 1))
hs = hs + e_embs + p_embs
# forward decoder
if olens is not None and not is_inference:
if self.reduction_factor > 1:
olens_in = paddle.to_tensor(
[olen // self.reduction_factor for olen in olens.numpy()])
else:
olens_in = olens
# (B, 1, T)
h_masks = self._source_mask(olens_in)
else:
h_masks = None
if return_after_enc:
return hs, h_masks
if self.decoder_type == 'cnndecoder':
# remove output masks for dygraph to static graph
zs = self.decoder(hs, h_masks)
before_outs = zs
else:
# (B, Lmax, adim)
zs, _ = self.decoder(hs, h_masks)
# (B, Lmax, odim)
before_outs = self.feat_out(zs).reshape(
(paddle.shape(zs)[0], -1, self.odim))
# postnet -> (B, Lmax//r * r, odim)
if self.postnet is None:
after_outs = before_outs
else:
after_outs = before_outs + self.postnet(
before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
return before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits
def encoder_infer(
self,
text: paddle.Tensor,
note: paddle.Tensor,
note_dur: paddle.Tensor,
is_slur: paddle.Tensor,
alpha: float=1.0,
spk_emb=None,
spk_id=None,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
xs = paddle.cast(text, 'int64').unsqueeze(0)
note = paddle.cast(note, 'int64').unsqueeze(0)
note_dur = paddle.cast(note_dur, 'float32').unsqueeze(0)
is_slur = paddle.cast(is_slur, 'int64').unsqueeze(0)
# setup batch axis
ilens = paddle.shape(xs)[1]
if spk_emb is not None:
spk_emb = spk_emb.unsqueeze(0)
# (1, L, odim)
# use *_ to avoid bug in dygraph to static graph
hs, _ = self._forward(
xs,
note,
note_dur,
is_slur,
ilens,
is_inference=True,
return_after_enc=True,
alpha=alpha,
spk_emb=spk_emb,
spk_id=spk_id, )
return hs
# get encoder output for diffusion training
def encoder_infer_batch(
self,
text: paddle.Tensor,
note: paddle.Tensor,
note_dur: paddle.Tensor,
is_slur: paddle.Tensor,
text_lengths: paddle.Tensor,
speech_lengths: paddle.Tensor,
ds: paddle.Tensor=None,
ps: paddle.Tensor=None,
es: paddle.Tensor=None,
alpha: float=1.0,
spk_emb=None,
spk_id=None, ) -> Tuple[paddle.Tensor, paddle.Tensor]:
xs = paddle.cast(text, 'int64')
note = paddle.cast(note, 'int64')
note_dur = paddle.cast(note_dur, 'float32')
is_slur = paddle.cast(is_slur, 'int64')
ilens = paddle.cast(text_lengths, 'int64')
olens = paddle.cast(speech_lengths, 'int64')
if spk_emb is not None:
spk_emb = spk_emb.unsqueeze(0)
# (1, L, odim)
# use *_ to avoid bug in dygraph to static graph
hs, h_masks = self._forward(
xs,
note,
note_dur,
is_slur,
ilens,
olens,
ds,
ps,
es,
return_after_enc=True,
is_train_diffusion=True,
alpha=alpha,
spk_emb=spk_emb,
spk_id=spk_id, )
return hs, h_masks
def inference(
self,
text: paddle.Tensor,
note: paddle.Tensor,
note_dur: paddle.Tensor,
is_slur: paddle.Tensor,
durations: paddle.Tensor=None,
pitch: paddle.Tensor=None,
energy: paddle.Tensor=None,
alpha: float=1.0,
use_teacher_forcing: bool=False,
spk_emb=None,
spk_id=None,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Generate the sequence of features given the sequences of characters.
Args:
text(Tensor(int64)):
Input sequence of characters (T,).
note(Tensor(int64)):
Input note (element in music score) ids (T,).
note_dur(Tensor(float32)):
Input note durations in seconds (element in music score) (T,).
is_slur(Tensor(int64)):
Input slur (element in music score) ids (T,).
durations(Tensor, optional (int64)):
Groundtruth of duration (T,).
pitch(Tensor, optional):
Groundtruth of token-averaged pitch (T, 1).
energy(Tensor, optional):
Groundtruth of token-averaged energy (T, 1).
alpha(float, optional):
Alpha to control the speed.
use_teacher_forcing(bool, optional):
Whether to use teacher forcing.
If true, groundtruth of duration, pitch and energy will be used.
spk_emb(Tensor, optional, optional):
Speaker embedding vector (spk_embed_dim,). (Default value = None)
spk_id(Tensor(int64), optional):
spk ids (1,). (Default value = None)
Returns:
"""
xs = paddle.cast(text, 'int64').unsqueeze(0)
note = paddle.cast(note, 'int64').unsqueeze(0)
note_dur = paddle.cast(note_dur, 'float32').unsqueeze(0)
is_slur = paddle.cast(is_slur, 'int64').unsqueeze(0)
d, p, e = durations, pitch, energy
# setup batch axis
ilens = paddle.shape(xs)[1]
if spk_emb is not None:
spk_emb = spk_emb.unsqueeze(0)
if use_teacher_forcing:
# use groundtruth of duration, pitch, and energy
ds = d.unsqueeze(0) if d is not None else None
ps = p.unsqueeze(0) if p is not None else None
es = e.unsqueeze(0) if e is not None else None
# (1, L, odim)
_, outs, d_outs, p_outs, e_outs, _ = self._forward(
xs,
note,
note_dur,
is_slur,
ilens,
ds=ds,
ps=ps,
es=es,
spk_emb=spk_emb,
spk_id=spk_id,
is_inference=True)
else:
# (1, L, odim)
_, outs, d_outs, p_outs, e_outs, _ = self._forward(
xs,
note,
note_dur,
is_slur,
ilens,
is_inference=True,
alpha=alpha,
spk_emb=spk_emb,
spk_id=spk_id, )
return outs[0], d_outs[0], p_outs[0], e_outs[0]
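A hedged usage sketch of the inference path above, with dummy inputs; the model construction is only indicated in a comment, and the ids/durations are illustrative:

```python
# Hedged usage sketch of FastSpeech2MIDI.inference (dummy values throughout).
import paddle

# model = FastSpeech2MIDI(idim=vocab_size, odim=80, fastspeech2_config=...)  # built elsewhere
phone_ids = paddle.to_tensor([3, 15, 7, 2])              # (T,) phone ids
note_ids = paddle.to_tensor([60, 60, 62, 0])             # (T,) note ids, 0 = rest
note_durs = paddle.to_tensor([0.24, 0.24, 0.51, 0.10])   # (T,) note durations in seconds
slur_flags = paddle.to_tensor([0, 0, 1, 0])              # (T,) is_slur flags

mel, d_outs, p_outs, e_outs = model.inference(
    text=phone_ids, note=note_ids, note_dur=note_durs, is_slur=slur_flags)
print(mel.shape)  # (Lmax, odim), e.g. (num_frames, 80)
```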
class FastSpeech2MIDILoss(nn.Layer):
"""Loss function module for DiffSinger."""
def __init__(self, use_masking: bool=True,
use_weighted_masking: bool=False):
"""Initialize feed-forward Transformer loss module.
Args:
use_masking (bool):
Whether to apply masking for padded part in loss calculation.
use_weighted_masking (bool):
Whether to apply weighted masking in loss calculation.
"""
assert check_argument_types()
super().__init__()
assert (use_masking != use_weighted_masking) or not use_masking
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
# define criterions
reduction = "none" if self.use_weighted_masking else "mean"
self.l1_criterion = nn.L1Loss(reduction=reduction)
self.mse_criterion = nn.MSELoss(reduction=reduction)
self.duration_criterion = DurationPredictorLoss(reduction=reduction)
self.ce_criterion = nn.CrossEntropyLoss()
def forward(
self,
after_outs: paddle.Tensor,
before_outs: paddle.Tensor,
d_outs: paddle.Tensor,
p_outs: paddle.Tensor,
e_outs: paddle.Tensor,
ys: paddle.Tensor,
ds: paddle.Tensor,
ps: paddle.Tensor,
es: paddle.Tensor,
ilens: paddle.Tensor,
olens: paddle.Tensor,
spk_logits: paddle.Tensor=None,
spk_ids: paddle.Tensor=None,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor,
paddle.Tensor, ]:
"""Calculate forward propagation.
Args:
after_outs(Tensor):
Batch of outputs after postnets (B, Lmax, odim).
before_outs(Tensor):
Batch of outputs before postnets (B, Lmax, odim).
d_outs(Tensor):
Batch of outputs of duration predictor (B, Tmax).
p_outs(Tensor):
Batch of outputs of pitch predictor (B, Lmax, 1).
e_outs(Tensor):
Batch of outputs of energy predictor (B, Lmax, 1).
ys(Tensor):
Batch of target features (B, Lmax, odim).
ds(Tensor):
Batch of durations (B, Tmax).
ps(Tensor):
Batch of target frame-averaged pitch (B, Lmax, 1).
es(Tensor):
Batch of target frame-averaged energy (B, Lmax, 1).
ilens(Tensor):
Batch of the lengths of each input (B,).
olens(Tensor):
Batch of the lengths of each target (B,).
spk_logits(Optional[Tensor]):
Batch of outputs after speaker classifier (B, Lmax, num_spk)
spk_ids(Optional[Tensor]):
Batch of target spk_id (B,)
Returns:
"""
speaker_loss = 0.0
# apply mask to remove padded part
if self.use_masking:
out_masks = make_non_pad_mask(olens).unsqueeze(-1)
before_outs = before_outs.masked_select(
out_masks.broadcast_to(before_outs.shape))
if after_outs is not None:
after_outs = after_outs.masked_select(
out_masks.broadcast_to(after_outs.shape))
ys = ys.masked_select(out_masks.broadcast_to(ys.shape))
duration_masks = make_non_pad_mask(ilens)
d_outs = d_outs.masked_select(
duration_masks.broadcast_to(d_outs.shape))
ds = ds.masked_select(duration_masks.broadcast_to(ds.shape))
pitch_masks = out_masks
p_outs = p_outs.masked_select(
pitch_masks.broadcast_to(p_outs.shape))
e_outs = e_outs.masked_select(
pitch_masks.broadcast_to(e_outs.shape))
ps = ps.masked_select(pitch_masks.broadcast_to(ps.shape))
es = es.masked_select(pitch_masks.broadcast_to(es.shape))
if spk_logits is not None and spk_ids is not None:
batch_size = spk_ids.shape[0]
spk_ids = paddle.repeat_interleave(spk_ids, spk_logits.shape[1],
None)
spk_logits = paddle.reshape(spk_logits,
[-1, spk_logits.shape[-1]])
mask_index = spk_logits.abs().sum(axis=1) != 0
spk_ids = spk_ids[mask_index]
spk_logits = spk_logits[mask_index]
# calculate loss
l1_loss = self.l1_criterion(before_outs, ys)
if after_outs is not None:
l1_loss += self.l1_criterion(after_outs, ys)
duration_loss = self.duration_criterion(d_outs, ds)
pitch_loss = self.mse_criterion(p_outs, ps)
energy_loss = self.mse_criterion(e_outs, es)
if spk_logits is not None and spk_ids is not None:
speaker_loss = self.ce_criterion(spk_logits, spk_ids) / batch_size
# make weighted mask and apply it
if self.use_weighted_masking:
out_masks = make_non_pad_mask(olens).unsqueeze(-1)
out_weights = out_masks.cast(dtype=paddle.float32) / out_masks.cast(
dtype=paddle.float32).sum(
axis=1, keepdim=True)
out_weights /= ys.shape[0] * ys.shape[2]
duration_masks = make_non_pad_mask(ilens)
duration_weights = (duration_masks.cast(dtype=paddle.float32) /
duration_masks.cast(dtype=paddle.float32).sum(
axis=1, keepdim=True))
duration_weights /= ds.shape[0]
# apply weight
l1_loss = l1_loss.multiply(out_weights)
l1_loss = l1_loss.masked_select(
out_masks.broadcast_to(l1_loss.shape)).sum()
duration_loss = (duration_loss.multiply(duration_weights)
.masked_select(duration_masks).sum())
pitch_masks = out_masks
pitch_weights = out_weights
pitch_loss = pitch_loss.multiply(pitch_weights)
pitch_loss = pitch_loss.masked_select(
pitch_masks.broadcast_to(pitch_loss.shape)).sum()
energy_loss = energy_loss.multiply(pitch_weights)
energy_loss = energy_loss.masked_select(
pitch_masks.broadcast_to(energy_loss.shape)).sum()
return l1_loss, duration_loss, pitch_loss, energy_loss, speaker_loss
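As a self-contained illustration of the masking idea used above (independent of `make_non_pad_mask`): build a validity mask from the target lengths, select only valid frames, then apply the criterion:

```python
# Toy illustration of masked L1 over a padded batch (not the class above).
import paddle

ys = paddle.rand([2, 5, 3])          # (B, Lmax, odim) padded targets
outs = paddle.rand([2, 5, 3])        # (B, Lmax, odim) predictions
olens = paddle.to_tensor([5, 3])     # true lengths of each utterance

frame_idx = paddle.arange(ys.shape[1]).unsqueeze(0)          # (1, Lmax)
out_masks = (frame_idx < olens.unsqueeze(1)).unsqueeze(-1)   # (B, Lmax, 1) bool

l1 = paddle.nn.L1Loss(reduction="mean")
loss = l1(outs.masked_select(out_masks.broadcast_to(outs.shape)),
          ys.masked_select(out_masks.broadcast_to(ys.shape)))
print(float(loss))  # L1 computed only over non-padded frames
```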

@ -15,6 +15,7 @@
from typing import List
from typing import Union
import paddle
from paddle import nn
from paddlespeech.t2s.modules.activation import get_activation
@ -390,20 +391,26 @@ class TransformerEncoder(BaseEncoder):
padding_idx=padding_idx,
encoder_type="transformer")
def forward(self, xs, masks, note_emb=None, note_dur_emb=None, is_slur_emb=None, scale=16):
def forward(self,
xs: paddle.Tensor,
masks: paddle.Tensor,
note_emb: paddle.Tensor=None,
note_dur_emb: paddle.Tensor=None,
is_slur_emb: paddle.Tensor=None,
scale: int=16):
"""Encoder input sequence.
Args:
xs(Tensor):
Input tensor (#batch, time, idim).
masks(Tensor):
Mask tensor (#batch, 1, time).
note_emb(Tensor):
Input tensor (#batch, time, attention_dim).
note_dur_emb(Tensor):
Input tensor (#batch, time, attention_dim).
is_slur_emb(Tensor):
Input tensor (#batch, time, attention_dim).
masks(Tensor):
Mask tensor (#batch, 1, time).
Returns:
Tensor:
