From 96323816e9da0aae7fb26c7ab4882ec008870ec1 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 18 Jan 2022 10:22:42 +0000 Subject: [PATCH] fix yamls, change labels to stop_labels, test=tts --- examples/aishell3/tts3/conf/default.yaml | 4 +-- examples/aishell3/vc1/conf/default.yaml | 4 +-- examples/csmsc/tts0/conf/default.yaml | 4 --- examples/csmsc/tts3/conf/conformer.yaml | 4 +-- examples/csmsc/tts3/conf/default.yaml | 4 +-- examples/ljspeech/tts3/conf/default.yaml | 4 +-- examples/vctk/tts3/conf/default.yaml | 4 +-- .../t2s/exps/new_tacotron2/preprocess.py | 27 +------------------ .../t2s/models/new_tacotron2/tacotron2.py | 13 ++++----- .../models/new_tacotron2/tacotron2_updater.py | 22 ++++++++++----- .../models/transformer_tts/transformer_tts.py | 16 +++++------ .../transformer_tts_updater.py | 8 +++--- paddlespeech/t2s/modules/losses.py | 10 +++---- .../t2s/modules/tacotron2/attentions.py | 2 +- 14 files changed, 53 insertions(+), 73 deletions(-) diff --git a/examples/aishell3/tts3/conf/default.yaml b/examples/aishell3/tts3/conf/default.yaml index 69307049..ac495674 100644 --- a/examples/aishell3/tts3/conf/default.yaml +++ b/examples/aishell3/tts3/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. ########################################################### diff --git a/examples/aishell3/vc1/conf/default.yaml b/examples/aishell3/vc1/conf/default.yaml index 69307049..ac495674 100644 --- a/examples/aishell3/vc1/conf/default.yaml +++ b/examples/aishell3/vc1/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. 
FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. ########################################################### diff --git a/examples/csmsc/tts0/conf/default.yaml b/examples/csmsc/tts0/conf/default.yaml index 171aee88..42635c50 100644 --- a/examples/csmsc/tts0/conf/default.yaml +++ b/examples/csmsc/tts0/conf/default.yaml @@ -21,10 +21,6 @@ fmin: 80 # Minimum frequency of Mel basis. fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. -# Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. - ########################################################### # DATA SETTING # ########################################################### diff --git a/examples/csmsc/tts3/conf/conformer.yaml b/examples/csmsc/tts3/conf/conformer.yaml index 03e4f2e3..fcad8615 100644 --- a/examples/csmsc/tts3/conf/conformer.yaml +++ b/examples/csmsc/tts3/conf/conformer.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. ########################################################### diff --git a/examples/csmsc/tts3/conf/default.yaml b/examples/csmsc/tts3/conf/default.yaml index ce2b24d9..2c2a1ea1 100644 --- a/examples/csmsc/tts3/conf/default.yaml +++ b/examples/csmsc/tts3/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. 
-f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. ########################################################### diff --git a/examples/ljspeech/tts3/conf/default.yaml b/examples/ljspeech/tts3/conf/default.yaml index 15cfda2c..5305c912 100644 --- a/examples/ljspeech/tts3/conf/default.yaml +++ b/examples/ljspeech/tts3/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. ########################################################### diff --git a/examples/vctk/tts3/conf/default.yaml b/examples/vctk/tts3/conf/default.yaml index 86d4a0d5..1bca9107 100644 --- a/examples/vctk/tts3/conf/default.yaml +++ b/examples/vctk/tts3/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. 
########################################################### diff --git a/paddlespeech/t2s/exps/new_tacotron2/preprocess.py b/paddlespeech/t2s/exps/new_tacotron2/preprocess.py index 0b61912c..5fc6b590 100644 --- a/paddlespeech/t2s/exps/new_tacotron2/preprocess.py +++ b/paddlespeech/t2s/exps/new_tacotron2/preprocess.py @@ -27,9 +27,7 @@ import tqdm import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import Energy from paddlespeech.t2s.data.get_feats import LogMelFBank -from paddlespeech.t2s.data.get_feats import Pitch from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length from paddlespeech.t2s.datasets.preprocess_utils import get_input_token from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur @@ -42,8 +40,6 @@ def process_sentence(config: Dict[str, Any], sentences: Dict, output_dir: Path, mel_extractor=None, - pitch_extractor=None, - energy_extractor=None, cut_sil: bool=True, spk_emb_dir: Path=None): utt_id = fp.stem @@ -117,8 +113,6 @@ def process_sentences(config, sentences: Dict, output_dir: Path, mel_extractor=None, - pitch_extractor=None, - energy_extractor=None, nprocs: int=1, cut_sil: bool=True, spk_emb_dir: Path=None): @@ -126,8 +120,7 @@ def process_sentences(config, results = [] for fp in fps: record = process_sentence(config, fp, sentences, output_dir, - mel_extractor, pitch_extractor, - energy_extractor, cut_sil, spk_emb_dir) + mel_extractor, cut_sil, spk_emb_dir) if record: results.append(record) else: @@ -137,7 +130,6 @@ def process_sentences(config, for fp in fps: future = pool.submit(process_sentence, config, fp, sentences, output_dir, mel_extractor, - pitch_extractor, energy_extractor, cut_sil, spk_emb_dir) future.add_done_callback(lambda p: progress.update()) futures.append(future) @@ -299,17 +291,6 @@ def main(): n_mels=config.n_mels, fmin=config.fmin, fmax=config.fmax) - pitch_extractor = Pitch( - sr=config.fs, - hop_length=config.n_shift, - f0min=config.f0min, - 
f0max=config.f0max) - energy_extractor = Energy( - sr=config.fs, - n_fft=config.n_fft, - hop_length=config.n_shift, - win_length=config.win_length, - window=config.window) # process for the 3 sections if train_wav_files: @@ -319,8 +300,6 @@ def main(): sentences, train_dump_dir, mel_extractor, - pitch_extractor, - energy_extractor, nprocs=args.num_cpu, cut_sil=args.cut_sil, spk_emb_dir=spk_emb_dir) @@ -331,8 +310,6 @@ def main(): sentences, dev_dump_dir, mel_extractor, - pitch_extractor, - energy_extractor, cut_sil=args.cut_sil, spk_emb_dir=spk_emb_dir) if test_wav_files: @@ -342,8 +319,6 @@ def main(): sentences, test_dump_dir, mel_extractor, - pitch_extractor, - energy_extractor, nprocs=args.num_cpu, cut_sil=args.cut_sil, spk_emb_dir=spk_emb_dir) diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py index 4804ffb4..6a6d1073 100644 --- a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py +++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py @@ -300,10 +300,10 @@ class Tacotron2(nn.Layer): olens = speech_lengths # make labels for stop prediction - labels = make_pad_mask(olens - 1) + stop_labels = make_pad_mask(olens - 1) # bool 类型无法切片 - labels = paddle.cast(labels, dtype='float32') - labels = F.pad(labels, [0, 0, 0, 1], "constant", 1.0) + stop_labels = paddle.cast(stop_labels, dtype='float32') + stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0) # calculate tacotron2 outputs after_outs, before_outs, logits, att_ws = self._forward( @@ -322,12 +322,13 @@ class Tacotron2(nn.Layer): olens = olens - olens % self.reduction_factor max_out = max(olens) ys = ys[:, :max_out] - labels = labels[:, :max_out] - labels = paddle.scatter(labels, 1, (olens - 1).unsqueeze(1), 1.0) + stop_labels = stop_labels[:, :max_out] + stop_labels = paddle.scatter(stop_labels, 1, + (olens - 1).unsqueeze(1), 1.0) olens_in = olens // self.reduction_factor else: olens_in = olens - return after_outs, before_outs, logits, 
ys, labels, olens, att_ws, olens_in + return after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in def _forward( self, diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py index 7572171b..09e6827d 100644 --- a/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py +++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py @@ -74,7 +74,7 @@ class Tacotron2Updater(StandardUpdater): if spk_emb is not None: spk_id = None - after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model( + after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model( text=batch["text"], text_lengths=batch["text_lengths"], speech=batch["speech"], @@ -83,8 +83,13 @@ class Tacotron2Updater(StandardUpdater): spk_emb=spk_emb) # calculate taco2 loss - l1_loss, mse_loss, bce_loss = self.taco2_loss(after_outs, before_outs, - logits, ys, labels, olens) + l1_loss, mse_loss, bce_loss = self.taco2_loss( + after_outs=after_outs, + before_outs=before_outs, + logits=logits, + ys=ys, + stop_labels=stop_labels, + olens=olens) if self.loss_type == "L1+L2": loss = l1_loss + mse_loss + bce_loss @@ -164,7 +169,7 @@ class Tacotron2Evaluator(StandardEvaluator): if spk_emb is not None: spk_id = None - after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model( + after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model( text=batch["text"], text_lengths=batch["text_lengths"], speech=batch["speech"], @@ -173,8 +178,13 @@ class Tacotron2Evaluator(StandardEvaluator): spk_emb=spk_emb) # calculate taco2 loss - l1_loss, mse_loss, bce_loss = self.taco2_loss(after_outs, before_outs, - logits, ys, labels, olens) + l1_loss, mse_loss, bce_loss = self.taco2_loss( + after_outs=after_outs, + before_outs=before_outs, + logits=logits, + ys=ys, + stop_labels=stop_labels, + olens=olens) if self.loss_type == "L1+L2": loss = 
l1_loss + mse_loss + bce_loss diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index ba1f33ea..4babe283 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -433,12 +433,10 @@ class TransformerTTS(nn.Layer): olens = paddle.cast(speech_lengths, 'int64') # make labels for stop prediction - labels = make_pad_mask(olens - 1) - labels = numpy.pad( - labels.numpy(), ((0, 0), (0, 1)), 'constant', constant_values=1.0) - labels = paddle.to_tensor(labels) - labels = paddle.cast(labels, dtype="float32") - # labels = F.pad(labels, [0, 1], "constant", 1.0) + stop_labels = make_pad_mask(olens - 1) + # bool 类型无法切片 + stop_labels = paddle.cast(stop_labels, dtype='float32') + stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0) # calculate transformer outputs after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens, @@ -450,8 +448,8 @@ class TransformerTTS(nn.Layer): olens = olens - olens % self.reduction_factor max_olen = max(olens) ys = ys[:, :max_olen] - labels = labels[:, :max_olen] - labels[:, -1] = 1.0 # make sure at least one frame has 1 + stop_labels = stop_labels[:, :max_olen] + stop_labels[:, -1] = 1.0 # make sure at least one frame has 1 olens_in = olens // self.reduction_factor else: olens_in = olens @@ -465,7 +463,7 @@ class TransformerTTS(nn.Layer): 'num_layers_applied_guided_attn'] = self.num_layers_applied_guided_attn need_dict['use_scaled_pos_enc'] = self.use_scaled_pos_enc - return after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict + return after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict def _forward( self, diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py index 1f25b019..dff908e0 100644 --- 
a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py @@ -75,7 +75,7 @@ class TransformerTTSUpdater(StandardUpdater): self.msg = "Rank: {}, ".format(dist.get_rank()) losses_dict = {} - after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict = self.model( + after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model( text=batch["text"], text_lengths=batch["text_lengths"], speech=batch["speech"], @@ -86,7 +86,7 @@ class TransformerTTSUpdater(StandardUpdater): before_outs=before_outs, logits=logits, ys=ys, - labels=labels, + stop_labels=stop_labels, olens=olens) report("train/bce_loss", float(bce_loss)) @@ -226,7 +226,7 @@ class TransformerTTSEvaluator(StandardEvaluator): def evaluate_core(self, batch): self.msg = "Evaluate: " losses_dict = {} - after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict = self.model( + after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model( text=batch["text"], text_lengths=batch["text_lengths"], speech=batch["speech"], @@ -237,7 +237,7 @@ class TransformerTTSEvaluator(StandardEvaluator): before_outs=before_outs, logits=logits, ys=ys, - labels=labels, + stop_labels=stop_labels, olens=olens) report("eval/bce_loss", float(bce_loss)) diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index 044a52e5..3cc7a93c 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -263,7 +263,7 @@ class Tacotron2Loss(nn.Layer): self.bce_criterion = nn.BCEWithLogitsLoss( reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight)) - def forward(self, after_outs, before_outs, logits, ys, labels, olens): + def forward(self, after_outs, before_outs, logits, ys, stop_labels, olens): """Calculate forward propagation. 
Parameters ---------- @@ -275,7 +275,7 @@ class Tacotron2Loss(nn.Layer): Batch of stop logits (B, Lmax). ys : Tensor Batch of padded target features (B, Lmax, odim). - labels : Tensor(int64) + stop_labels : Tensor(float32) Batch of the sequences of stop token labels (B, Lmax). olens : Tensor(int64) Batch of the lengths of each target (B,). @@ -296,8 +296,8 @@ class Tacotron2Loss(nn.Layer): masks.broadcast_to(after_outs.shape)) before_outs = before_outs.masked_select( masks.broadcast_to(before_outs.shape)) - labels = labels.masked_select( - masks[:, :, 0].broadcast_to(labels.shape)) + stop_labels = stop_labels.masked_select( + masks[:, :, 0].broadcast_to(stop_labels.shape)) logits = logits.masked_select( masks[:, :, 0].broadcast_to(logits.shape)) @@ -306,7 +306,7 @@ class Tacotron2Loss(nn.Layer): before_outs, ys) mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion( before_outs, ys) - bce_loss = self.bce_criterion(logits, labels) + bce_loss = self.bce_criterion(logits, stop_labels) # make weighted mask and apply it if self.use_weighted_masking: diff --git a/paddlespeech/t2s/modules/tacotron2/attentions.py b/paddlespeech/t2s/modules/tacotron2/attentions.py index 2b912db3..710e326d 100644 --- a/paddlespeech/t2s/modules/tacotron2/attentions.py +++ b/paddlespeech/t2s/modules/tacotron2/attentions.py @@ -207,7 +207,7 @@ class AttLoc(nn.Layer): w = F.softmax(scaling * e, axis=1) - # weighted sum over flames + # weighted sum over frames # utt x hdim c = paddle.sum( self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1)