@@ -735,6 +735,118 @@ class FastSpeech2(nn.Layer):
            tone_id=tone_id)
        return hs

    def _inference(self,
                   xs: paddle.Tensor,
                   ilens: paddle.Tensor,
                   olens: paddle.Tensor=None,
                   ds: paddle.Tensor=None,
                   ps: paddle.Tensor=None,
                   es: paddle.Tensor=None,
                   is_inference: bool=False,
                   return_after_enc=False,
                   alpha: float=1.0,
                   spk_emb=None,
                   spk_id=None,
                   tone_id=None) -> Sequence[paddle.Tensor]:
        # forward encoder
        x_masks = self._source_mask(ilens)
        # (B, Tmax, adim)
        hs, _ = self.encoder(xs, x_masks)

        # integrate speaker embedding
        if self.spk_embed_dim is not None:
            # spk_emb has a higher priority than spk_id
            if spk_emb is not None:
                hs = self._integrate_with_spk_embed(hs, spk_emb)
            elif spk_id is not None:
                spk_emb = self.spk_embedding_table(spk_id)
                hs = self._integrate_with_spk_embed(hs, spk_emb)

        # integrate tone embedding
        if self.tone_embed_dim is not None:
            if tone_id is not None:
                tone_embs = self.tone_embedding_table(tone_id)
                hs = self._integrate_with_tone_embed(hs, tone_embs)
        # forward duration predictor and variance predictors
        d_masks = make_pad_mask(ilens)

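        # detach() blocks the pitch/energy losses from back-propagating into
        # the encoder states when the corresponding stop-gradient flag is set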
        if self.stop_gradient_from_pitch_predictor:
            p_outs = self.pitch_predictor(hs.detach(), d_masks.unsqueeze(-1))
        else:
            p_outs = self.pitch_predictor(hs, d_masks.unsqueeze(-1))
        if self.stop_gradient_from_energy_predictor:
            e_outs = self.energy_predictor(hs.detach(), d_masks.unsqueeze(-1))
        else:
            e_outs = self.energy_predictor(hs, d_masks.unsqueeze(-1))

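        # at inference time, ground-truth ds/ps/es passed by the caller
        # override the corresponding predictor outputs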
        if is_inference:
            # (B, Tmax)
            if ds is not None:
                d_outs = ds
            else:
                d_outs = self.duration_predictor.inference(hs, d_masks)
            if ps is not None:
                p_outs = ps
            if es is not None:
                e_outs = es

            # use prediction in inference
            # (B, Tmax, 1)
            p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
                (0, 2, 1))
            e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
                (0, 2, 1))
            hs = hs + e_embs + p_embs

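            # alpha is the speed-control factor that scales the predicted
            # durations before length regulation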
            # (B, Lmax, adim)
            hs = self.length_regulator(hs, d_outs, alpha, is_inference=True)
        else:
            d_outs = self.duration_predictor(hs, d_masks)
            # use groundtruth in training
            p_embs = self.pitch_embed(ps.transpose((0, 2, 1))).transpose(
                (0, 2, 1))
            e_embs = self.energy_embed(es.transpose((0, 2, 1))).transpose(
                (0, 2, 1))
            hs = hs + e_embs + p_embs

            # (B, Lmax, adim)
            hs = self.length_regulator(hs, ds, is_inference=False)

        # forward decoder
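        # decoder masks are built from output lengths divided by the
        # reduction factor, since each decoder step covers reduction_factor
        # output frames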
        if olens is not None and not is_inference:
            if self.reduction_factor > 1:
                olens_in = paddle.to_tensor(
                    [olen // self.reduction_factor for olen in olens.numpy()])
            else:
                olens_in = olens
            # (B, 1, T)
            h_masks = self._source_mask(olens_in)
        else:
            h_masks = None
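        # skip the decoder and return the encoder-side states when the
        # caller only needs them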
        if return_after_enc:
            return hs, h_masks

        if self.decoder_type == 'cnndecoder':
            # remove output masks for dygraph to static graph
            zs = self.decoder(hs, h_masks)
            before_outs = zs
        else:
            # (B, Lmax, adim)
            zs, _ = self.decoder(hs, h_masks)
            # (B, Lmax, odim)
            before_outs = self.feat_out(zs).reshape(
                (paddle.shape(zs)[0], -1, self.odim))

        # postnet -> (B, Lmax//r * r, odim)
        if self.postnet is None:
            after_outs = before_outs
        else:
            after_outs = before_outs + self.postnet(
                before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))

        return before_outs, after_outs, d_outs, p_outs, e_outs

    def inference(
            self,
            text: paddle.Tensor,
@@ -794,7 +906,7 @@ class FastSpeech2(nn.Layer):
            es = e.unsqueeze(0) if e is not None else None

            # (1, L, odim)
            _, outs, d_outs, p_outs, e_outs = self._forward(
            _, outs, d_outs, p_outs, e_outs = self._inference(
                xs,
                ilens,
                ds=ds,
@@ -806,7 +918,7 @@
                is_inference=True)
        else:
            # (1, L, odim)
            _, outs, d_outs, p_outs, e_outs = self._forward(
            _, outs, d_outs, p_outs, e_outs = self._inference(
                xs,
                ilens,
                is_inference=True,
@@ -814,6 +926,8 @@
                spk_emb=spk_emb,
                spk_id=spk_id,
                tone_id=tone_id)

        return outs[0], d_outs[0], p_outs[0], e_outs[0]

    def _integrate_with_spk_embed(self, hs, spk_emb):
@@ -1135,9 +1249,7 @@ class FastSpeech2Loss(nn.Layer):
        """
        speaker_loss = 0.0

        import pdb;pdb.set_trace()

        # apply mask to remove padded part
        if self.use_masking:
            out_masks = make_non_pad_mask(olens).unsqueeze(-1)
@@ -1165,8 +1277,7 @@
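            # keep only samples whose speaker logits are not all zero before
            # computing the cross-entropy speaker loss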
            mask_index = spk_logits.abs().sum(axis=1)!=0
            spk_ids = spk_ids[mask_index]
            spk_logits = spk_logits[mask_index]
            speaker_loss = self.ce_criterion(spk_logits, spk_ids)/spk_logits.shape[0]
            print("sssssssssssssssssss")

        # calculate loss
        l1_loss = self.l1_criterion(before_outs, ys)
@@ -1175,6 +1286,9 @@
        duration_loss = self.duration_criterion(d_outs, ds)
        pitch_loss = self.mse_criterion(p_outs, ps)
        energy_loss = self.mse_criterion(e_outs, es)

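        # speaker_loss keeps its 0.0 initialization when speaker logits or
        # speaker ids are not provided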
        if spk_logits is not None and spk_ids is not None:
            speaker_loss = self.ce_criterion(spk_logits, spk_ids)/spk_logits.shape[0]

        # if spk_logits is None or spk_ids is None:
        #     speaker_loss = 0.0