fix yamls, change labels to stop_labels, test=tts

pull/1314/head
TianYuan 3 years ago
parent 1bf1a876ae
commit 96323816e9

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################

@ -21,10 +21,6 @@ fmin: 80 # Minimum frequency of Mel basis.
fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
###########################################################
# DATA SETTING #
###########################################################

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################

@ -27,9 +27,7 @@ import tqdm
import yaml
from yacs.config import CfgNode
from paddlespeech.t2s.data.get_feats import Energy
from paddlespeech.t2s.data.get_feats import LogMelFBank
from paddlespeech.t2s.data.get_feats import Pitch
from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length
from paddlespeech.t2s.datasets.preprocess_utils import get_input_token
from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
@ -42,8 +40,6 @@ def process_sentence(config: Dict[str, Any],
sentences: Dict,
output_dir: Path,
mel_extractor=None,
pitch_extractor=None,
energy_extractor=None,
cut_sil: bool=True,
spk_emb_dir: Path=None):
utt_id = fp.stem
@ -117,8 +113,6 @@ def process_sentences(config,
sentences: Dict,
output_dir: Path,
mel_extractor=None,
pitch_extractor=None,
energy_extractor=None,
nprocs: int=1,
cut_sil: bool=True,
spk_emb_dir: Path=None):
@ -126,8 +120,7 @@ def process_sentences(config,
results = []
for fp in fps:
record = process_sentence(config, fp, sentences, output_dir,
mel_extractor, pitch_extractor,
energy_extractor, cut_sil, spk_emb_dir)
mel_extractor, cut_sil, spk_emb_dir)
if record:
results.append(record)
else:
@ -137,7 +130,6 @@ def process_sentences(config,
for fp in fps:
future = pool.submit(process_sentence, config, fp,
sentences, output_dir, mel_extractor,
pitch_extractor, energy_extractor,
cut_sil, spk_emb_dir)
future.add_done_callback(lambda p: progress.update())
futures.append(future)
@ -299,17 +291,6 @@ def main():
n_mels=config.n_mels,
fmin=config.fmin,
fmax=config.fmax)
pitch_extractor = Pitch(
sr=config.fs,
hop_length=config.n_shift,
f0min=config.f0min,
f0max=config.f0max)
energy_extractor = Energy(
sr=config.fs,
n_fft=config.n_fft,
hop_length=config.n_shift,
win_length=config.win_length,
window=config.window)
# process for the 3 sections
if train_wav_files:
@ -319,8 +300,6 @@ def main():
sentences,
train_dump_dir,
mel_extractor,
pitch_extractor,
energy_extractor,
nprocs=args.num_cpu,
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
@ -331,8 +310,6 @@ def main():
sentences,
dev_dump_dir,
mel_extractor,
pitch_extractor,
energy_extractor,
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if test_wav_files:
@ -342,8 +319,6 @@ def main():
sentences,
test_dump_dir,
mel_extractor,
pitch_extractor,
energy_extractor,
nprocs=args.num_cpu,
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)

@ -300,10 +300,10 @@ class Tacotron2(nn.Layer):
olens = speech_lengths
# make labels for stop prediction
labels = make_pad_mask(olens - 1)
stop_labels = make_pad_mask(olens - 1)
# bool tensors cannot be sliced, so cast to float32 first
labels = paddle.cast(labels, dtype='float32')
labels = F.pad(labels, [0, 0, 0, 1], "constant", 1.0)
stop_labels = paddle.cast(stop_labels, dtype='float32')
stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0)
# calculate tacotron2 outputs
after_outs, before_outs, logits, att_ws = self._forward(
@ -322,12 +322,13 @@ class Tacotron2(nn.Layer):
olens = olens - olens % self.reduction_factor
max_out = max(olens)
ys = ys[:, :max_out]
labels = labels[:, :max_out]
labels = paddle.scatter(labels, 1, (olens - 1).unsqueeze(1), 1.0)
stop_labels = stop_labels[:, :max_out]
stop_labels = paddle.scatter(stop_labels, 1,
(olens - 1).unsqueeze(1), 1.0)
olens_in = olens // self.reduction_factor
else:
olens_in = olens
return after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in
return after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in
def _forward(
self,

@ -74,7 +74,7 @@ class Tacotron2Updater(StandardUpdater):
if spk_emb is not None:
spk_id = None
after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model(
after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model(
text=batch["text"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
@ -83,8 +83,13 @@ class Tacotron2Updater(StandardUpdater):
spk_emb=spk_emb)
# calculate taco2 loss
l1_loss, mse_loss, bce_loss = self.taco2_loss(after_outs, before_outs,
logits, ys, labels, olens)
l1_loss, mse_loss, bce_loss = self.taco2_loss(
after_outs=after_outs,
before_outs=before_outs,
logits=logits,
ys=ys,
stop_labels=stop_labels,
olens=olens)
if self.loss_type == "L1+L2":
loss = l1_loss + mse_loss + bce_loss
@ -164,7 +169,7 @@ class Tacotron2Evaluator(StandardEvaluator):
if spk_emb is not None:
spk_id = None
after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model(
after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model(
text=batch["text"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
@ -173,8 +178,13 @@ class Tacotron2Evaluator(StandardEvaluator):
spk_emb=spk_emb)
# calculate taco2 loss
l1_loss, mse_loss, bce_loss = self.taco2_loss(after_outs, before_outs,
logits, ys, labels, olens)
l1_loss, mse_loss, bce_loss = self.taco2_loss(
after_outs=after_outs,
before_outs=before_outs,
logits=logits,
ys=ys,
stop_labels=stop_labels,
olens=olens)
if self.loss_type == "L1+L2":
loss = l1_loss + mse_loss + bce_loss

@ -433,12 +433,10 @@ class TransformerTTS(nn.Layer):
olens = paddle.cast(speech_lengths, 'int64')
# make labels for stop prediction
labels = make_pad_mask(olens - 1)
labels = numpy.pad(
labels.numpy(), ((0, 0), (0, 1)), 'constant', constant_values=1.0)
labels = paddle.to_tensor(labels)
labels = paddle.cast(labels, dtype="float32")
# labels = F.pad(labels, [0, 1], "constant", 1.0)
stop_labels = make_pad_mask(olens - 1)
# bool tensors cannot be sliced, so cast to float32 first
stop_labels = paddle.cast(stop_labels, dtype='float32')
stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0)
# calculate transformer outputs
after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens,
@ -450,8 +448,8 @@ class TransformerTTS(nn.Layer):
olens = olens - olens % self.reduction_factor
max_olen = max(olens)
ys = ys[:, :max_olen]
labels = labels[:, :max_olen]
labels[:, -1] = 1.0 # make sure at least one frame has 1
stop_labels = stop_labels[:, :max_olen]
stop_labels[:, -1] = 1.0 # make sure at least one frame has 1
olens_in = olens // self.reduction_factor
else:
olens_in = olens
@ -465,7 +463,7 @@ class TransformerTTS(nn.Layer):
'num_layers_applied_guided_attn'] = self.num_layers_applied_guided_attn
need_dict['use_scaled_pos_enc'] = self.use_scaled_pos_enc
return after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict
return after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict
def _forward(
self,

@ -75,7 +75,7 @@ class TransformerTTSUpdater(StandardUpdater):
self.msg = "Rank: {}, ".format(dist.get_rank())
losses_dict = {}
after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict = self.model(
after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model(
text=batch["text"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
@ -86,7 +86,7 @@ class TransformerTTSUpdater(StandardUpdater):
before_outs=before_outs,
logits=logits,
ys=ys,
labels=labels,
stop_labels=stop_labels,
olens=olens)
report("train/bce_loss", float(bce_loss))
@ -226,7 +226,7 @@ class TransformerTTSEvaluator(StandardEvaluator):
def evaluate_core(self, batch):
self.msg = "Evaluate: "
losses_dict = {}
after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict = self.model(
after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model(
text=batch["text"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
@ -237,7 +237,7 @@ class TransformerTTSEvaluator(StandardEvaluator):
before_outs=before_outs,
logits=logits,
ys=ys,
labels=labels,
stop_labels=stop_labels,
olens=olens)
report("eval/bce_loss", float(bce_loss))

@ -263,7 +263,7 @@ class Tacotron2Loss(nn.Layer):
self.bce_criterion = nn.BCEWithLogitsLoss(
reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight))
def forward(self, after_outs, before_outs, logits, ys, labels, olens):
def forward(self, after_outs, before_outs, logits, ys, stop_labels, olens):
"""Calculate forward propagation.
Parameters
----------
@ -275,7 +275,7 @@ class Tacotron2Loss(nn.Layer):
Batch of stop logits (B, Lmax).
ys : Tensor
Batch of padded target features (B, Lmax, odim).
labels : Tensor(int64)
stop_labels : Tensor(float32)
Batch of the sequences of stop token labels (B, Lmax).
olens : Tensor(int64)
Batch of the lengths of each target (B,).
@ -296,8 +296,8 @@ class Tacotron2Loss(nn.Layer):
masks.broadcast_to(after_outs.shape))
before_outs = before_outs.masked_select(
masks.broadcast_to(before_outs.shape))
labels = labels.masked_select(
masks[:, :, 0].broadcast_to(labels.shape))
stop_labels = stop_labels.masked_select(
masks[:, :, 0].broadcast_to(stop_labels.shape))
logits = logits.masked_select(
masks[:, :, 0].broadcast_to(logits.shape))
@ -306,7 +306,7 @@ class Tacotron2Loss(nn.Layer):
before_outs, ys)
mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion(
before_outs, ys)
bce_loss = self.bce_criterion(logits, labels)
bce_loss = self.bce_criterion(logits, stop_labels)
# make weighted mask and apply it
if self.use_weighted_masking:

@ -207,7 +207,7 @@ class AttLoc(nn.Layer):
w = F.softmax(scaling * e, axis=1)
# weighted sum over flames
# weighted sum over frames
# utt x hdim
c = paddle.sum(
self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1)

Loading…
Cancel
Save