pull/2588/head
liangym 3 years ago
parent 593f247460
commit 6d557dd388

@@ -735,6 +735,118 @@ class FastSpeech2(nn.Layer):
            tone_id=tone_id)
        return hs

    def _inference(self,
                   xs: paddle.Tensor,
                   ilens: paddle.Tensor,
                   olens: paddle.Tensor=None,
                   ds: paddle.Tensor=None,
                   ps: paddle.Tensor=None,
                   es: paddle.Tensor=None,
                   is_inference: bool=False,
                   return_after_enc=False,
                   alpha: float=1.0,
                   spk_emb=None,
                   spk_id=None,
                   tone_id=None) -> Sequence[paddle.Tensor]:
        # forward encoder
        x_masks = self._source_mask(ilens)
        # (B, Tmax, adim)
        hs, _ = self.encoder(xs, x_masks)

        # integrate speaker embedding
        if self.spk_embed_dim is not None:
            # spk_emb has a higher priority than spk_id
            if spk_emb is not None:
                hs = self._integrate_with_spk_embed(hs, spk_emb)
            elif spk_id is not None:
                spk_emb = self.spk_embedding_table(spk_id)
                hs = self._integrate_with_spk_embed(hs, spk_emb)

        # integrate tone embedding
        if self.tone_embed_dim is not None:
            if tone_id is not None:
                tone_embs = self.tone_embedding_table(tone_id)
                hs = self._integrate_with_tone_embed(hs, tone_embs)
        # forward duration predictor and variance predictors
        d_masks = make_pad_mask(ilens)

        if self.stop_gradient_from_pitch_predictor:
            p_outs = self.pitch_predictor(hs.detach(), d_masks.unsqueeze(-1))
        else:
            p_outs = self.pitch_predictor(hs, d_masks.unsqueeze(-1))
        if self.stop_gradient_from_energy_predictor:
            e_outs = self.energy_predictor(hs.detach(), d_masks.unsqueeze(-1))
        else:
            e_outs = self.energy_predictor(hs, d_masks.unsqueeze(-1))
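        # NOTE: hs.detach() above cuts the gradient path from the pitch/energy
        # losses back into the encoder, so the variance predictors train
        # without perturbing the shared text representation.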
        if is_inference:
            # (B, Tmax)
            if ds is not None:
                d_outs = ds
            else:
                d_outs = self.duration_predictor.inference(hs, d_masks)
            if ps is not None:
                p_outs = ps
            if es is not None:
                e_outs = es

            # use prediction in inference
            # (B, Tmax, 1)
            p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
                (0, 2, 1))
            e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
                (0, 2, 1))
            hs = hs + e_embs + p_embs
            # (B, Lmax, adim)
            hs = self.length_regulator(hs, d_outs, alpha, is_inference=True)
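            # e.g. durations [1, 3, 2] expand three encoder frames into six
            # decoder frames; alpha rescales the predicted durations, giving
            # inference-time control over speaking speed.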
        else:
            d_outs = self.duration_predictor(hs, d_masks)
            # use groundtruth in training
            p_embs = self.pitch_embed(ps.transpose((0, 2, 1))).transpose(
                (0, 2, 1))
            e_embs = self.energy_embed(es.transpose((0, 2, 1))).transpose(
                (0, 2, 1))
            hs = hs + e_embs + p_embs
            # (B, Lmax, adim)
            hs = self.length_regulator(hs, ds, is_inference=False)
        # forward decoder
        if olens is not None and not is_inference:
            if self.reduction_factor > 1:
                olens_in = paddle.to_tensor(
                    [olen // self.reduction_factor for olen in olens.numpy()])
            else:
                olens_in = olens
            # (B, 1, T)
            h_masks = self._source_mask(olens_in)
        else:
            h_masks = None

        if return_after_enc:
            return hs, h_masks

        if self.decoder_type == 'cnndecoder':
            # remove output masks for dygraph to static graph
            zs = self.decoder(hs, h_masks)
            before_outs = zs
        else:
            # (B, Lmax, adim)
            zs, _ = self.decoder(hs, h_masks)
            # (B, Lmax, odim)
            before_outs = self.feat_out(zs).reshape(
                (paddle.shape(zs)[0], -1, self.odim))
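            # feat_out predicts reduction_factor output frames per decoder
            # step; the reshape unfolds them along time into odim-dim frames.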
        # postnet -> (B, Lmax//r * r, odim)
        if self.postnet is None:
            after_outs = before_outs
        else:
            after_outs = before_outs + self.postnet(
                before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))

        return before_outs, after_outs, d_outs, p_outs, e_outs
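A minimal usage sketch of the new method (hypothetical inputs; `model` stands for an already constructed FastSpeech2 instance): `return_after_enc=True` stops after the length regulator and hands back the expanded hidden states, which is useful when the decoder is run separately, e.g. chunk by chunk with the CNN decoder.

import paddle

# hypothetical batch: one utterance of 32 phone ids
xs = paddle.randint(low=1, high=100, shape=[1, 32])
ilens = paddle.to_tensor([32])

# encoder side only: length-regulated hidden states plus decoder masks
# (h_masks is None here because no olens is given)
hs, h_masks = model._inference(
    xs, ilens, is_inference=True, return_after_enc=True)

# full stack: mel before/after postnet plus duration/pitch/energy outputs
before_outs, after_outs, d_outs, p_outs, e_outs = model._inference(
    xs, ilens, is_inference=True)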
    def inference(
            self,
            text: paddle.Tensor,
@@ -794,7 +906,7 @@ class FastSpeech2(nn.Layer):
            es = e.unsqueeze(0) if e is not None else None
            # (1, L, odim)
-            _, outs, d_outs, p_outs, e_outs = self._forward(
+            _, outs, d_outs, p_outs, e_outs = self._inference(
                xs,
                ilens,
                ds=ds,
@@ -806,7 +918,7 @@ class FastSpeech2(nn.Layer):
                is_inference=True)
        else:
            # (1, L, odim)
-            _, outs, d_outs, p_outs, e_outs = self._forward(
+            _, outs, d_outs, p_outs, e_outs = self._inference(
                xs,
                ilens,
                is_inference=True,
@@ -814,6 +926,8 @@ class FastSpeech2(nn.Layer):
                spk_emb=spk_emb,
                spk_id=spk_id,
                tone_id=tone_id)

        return outs[0], d_outs[0], p_outs[0], e_outs[0]
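Through the public wrapper the same path looks like this (hypothetical input; the wrapper adds the batch axis internally and strips it from the returned tensors):

# hypothetical phone-id sequence for a single utterance
text = paddle.to_tensor([5, 12, 7, 7, 3])
mel, d_out, p_out, e_out = model.inference(text)
# mel: (L, odim), ready to hand to a vocoder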
    def _integrate_with_spk_embed(self, hs, spk_emb):
@@ -1135,9 +1249,7 @@ class FastSpeech2Loss(nn.Layer):
"""
speaker_loss = 0.0
import pdb;pdb.set_trace()
# apply mask to remove padded part
if self.use_masking:
out_masks = make_non_pad_mask(olens).unsqueeze(-1)
@@ -1165,8 +1277,7 @@ class FastSpeech2Loss(nn.Layer):
            mask_index = spk_logits.abs().sum(axis=1) != 0
            spk_ids = spk_ids[mask_index]
            spk_logits = spk_logits[mask_index]
-            speaker_loss = self.ce_criterion(spk_logits, spk_ids)/spk_logits.shape[0]
-            print("sssssssssssssssssss")
        # calculate loss
        l1_loss = self.l1_criterion(before_outs, ys)
@@ -1175,6 +1286,9 @@ class FastSpeech2Loss(nn.Layer):
        duration_loss = self.duration_criterion(d_outs, ds)
        pitch_loss = self.mse_criterion(p_outs, ps)
        energy_loss = self.mse_criterion(e_outs, es)
+        if spk_logits is not None and spk_ids is not None:
+            speaker_loss = self.ce_criterion(spk_logits, spk_ids) / spk_logits.shape[0]
+        # if spk_logits is None or spk_ids is None:
+        #     speaker_loss = 0.0
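Read in isolation, the masked speaker-classification loss amounts to the following standalone sketch (hypothetical shapes; it assumes self.ce_criterion sums over the batch, which is what the division by spk_logits.shape[0] suggests):

import paddle
import paddle.nn as nn

# hypothetical logits for 4 utterances over 10 speakers; the third row is
# all-zero, standing in for a padded utterance
spk_logits = paddle.concat([paddle.randn([2, 10]),
                            paddle.zeros([1, 10]),
                            paddle.randn([1, 10])])
spk_ids = paddle.to_tensor([1, 3, 0, 2])

# drop the padded (all-zero) rows before computing the loss
mask_index = spk_logits.abs().sum(axis=1) != 0
spk_ids = spk_ids[mask_index]
spk_logits = spk_logits[mask_index]

ce_criterion = nn.CrossEntropyLoss(reduction='sum')
speaker_loss = ce_criterion(spk_logits, spk_ids) / spk_logits.shape[0]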

@@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.