diff --git a/examples/opencpop/svs1/conf/default.yaml b/examples/opencpop/svs1/conf/default.yaml index fff355afa..7e48190c7 100644 --- a/examples/opencpop/svs1/conf/default.yaml +++ b/examples/opencpop/svs1/conf/default.yaml @@ -24,79 +24,84 @@ f0max: 750 # Maximum f0 for pitch extraction. # DATA SETTING # ########################################################### batch_size: 32 -num_workers: 4 +num_workers: 1 ########################################################### # MODEL SETTING # ########################################################### -# fastspeech2 module -fs2_model: - adim: 256 # attention dimension - aheads: 2 # number of attention heads - elayers: 4 # number of encoder layers - eunits: 1536 # number of encoder ff units - dlayers: 4 # number of decoder layers - dunits: 1536 # number of decoder ff units - positionwise_layer_type: conv1d # type of position-wise layer - positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer - duration_predictor_layers: 2 # number of layers of duration predictor - duration_predictor_chans: 256 # number of channels of duration predictor - duration_predictor_kernel_size: 3 # filter size of duration predictor - postnet_layers: 5 # number of layers of postnset - postnet_filts: 5 # filter size of conv layers in postnet - postnet_chans: 256 # number of channels of conv layers in postnet - use_scaled_pos_enc: True # whether to use scaled positional encoding - encoder_normalize_before: True # whether to perform layer normalization before the input - decoder_normalize_before: True # whether to perform layer normalization before the input - reduction_factor: 1 # reduction factor - init_type: xavier_uniform # initialization type - init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding - init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding - transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer - transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding - transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer - transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer - transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding - transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer - pitch_predictor_layers: 5 # number of conv layers in pitch predictor - pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor - pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor - pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor - pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch - pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder - energy_predictor_layers: 2 # number of conv layers in energy predictor - energy_predictor_chans: 256 # number of channels of conv layers in energy predictor - energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor - energy_predictor_dropout: 0.5 # dropout rate in energy predictor - energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy - energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: False # whether to stop the gradient 
from energy predictor to encoder +model: + # music score related note_num: 300 is_slur_num: 2 -denoiser_model: - in_channels: 80 - out_channels: 80 - kernel_size: 3 - layers: 20 - stacks: 4 - residual_channels: 256 - gate_channels: 512 - skip_channels: 256 - aux_channels: 256 - dropout: 0.1 - bias: True - use_weight_norm: False - init_type: kaiming_uniform - -diffusion: - num_train_timesteps: 100 - beta_start: 0.0001 - beta_end: 0.06 - beta_schedule: "squaredcos_cap_v2" - num_max_timesteps: 60 + # fastspeech2 module + fastspeech2_params: + adim: 256 # attention dimension + aheads: 2 # number of attention heads + elayers: 4 # number of encoder layers + eunits: 1536 # number of encoder ff units + dlayers: 4 # number of decoder layers + dunits: 1536 # number of decoder ff units + positionwise_layer_type: conv1d # type of position-wise layer + positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer + duration_predictor_layers: 2 # number of layers of duration predictor + duration_predictor_chans: 256 # number of channels of duration predictor + duration_predictor_kernel_size: 3 # filter size of duration predictor + postnet_layers: 5 # number of layers of postnet + postnet_filts: 5 # filter size of conv layers in postnet + postnet_chans: 256 # number of channels of conv layers in postnet + use_scaled_pos_enc: True # whether to use scaled positional encoding + encoder_normalize_before: True # whether to perform layer normalization before the input + decoder_normalize_before: True # whether to perform layer normalization before the input + reduction_factor: 1 # reduction factor + init_type: xavier_uniform # initialization type + init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding + init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding + transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer + transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding + transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer + transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer + transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding + transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer + pitch_predictor_layers: 5 # number of conv layers in pitch predictor + pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor + pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor + pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor + pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch + pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder + energy_predictor_layers: 2 # number of conv layers in energy predictor + energy_predictor_chans: 256 # number of channels of conv layers in energy predictor + energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor + energy_predictor_dropout: 0.5 # dropout rate in energy predictor + energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy + energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder + + #
denoiser module + denoiser_params: + in_channels: 80 + out_channels: 80 + kernel_size: 3 + layers: 20 + stacks: 5 + residual_channels: 256 + gate_channels: 512 + skip_channels: 256 + aux_channels: 256 + dropout: 0.1 + bias: True + use_weight_norm: False + init_type: "kaiming_normal" + + # diffusion module + diffusion_params: + num_train_timesteps: 100 + beta_start: 0.0001 + beta_end: 0.06 + beta_schedule: "squaredcos_cap_v2" + num_max_timesteps: 60 ########################################################### @@ -112,7 +117,6 @@ ds_updater: ########################################################### # OPTIMIZER SETTING # ########################################################### -# gpu_num=2 config # fastspeech2 optimizer fs2_optimizer: optim: adam # optimizer type @@ -134,10 +138,10 @@ ds_grad_norm: 1 ########################################################### # INTERVAL SETTING # ########################################################### -ds_train_start_steps: 80000 # Number of steps to start to train diffusion module. -train_max_steps: 160000 # Number of training steps. +ds_train_start_steps: 160000 # Number of steps to start to train diffusion module. +train_max_steps: 320000 # Number of training steps. save_interval_steps: 1000 # Interval steps to save checkpoint. -eval_interval_steps: 250 # Interval steps to evaluate the network. +eval_interval_steps: 1000 # Interval steps to evaluate the network. num_snapshots: 5 diff --git a/examples/opencpop/svs1/local/train.sh b/examples/opencpop/svs1/local/train.sh index 5e255fb8d..d1302f99f 100755 --- a/examples/opencpop/svs1/local/train.sh +++ b/examples/opencpop/svs1/local/train.sh @@ -8,5 +8,5 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --ngpu=2 \ + --ngpu=1 \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/opencpop/svs1/run.sh b/examples/opencpop/svs1/run.sh index 44d8efc66..7bde38518 100755 --- a/examples/opencpop/svs1/run.sh +++ b/examples/opencpop/svs1/run.sh @@ -3,13 +3,13 @@ set -e source path.sh -gpus=4,5 +gpus=0 stage=0 stop_stage=100 conf_path=conf/default.yaml train_output_path=exp/default -ckpt_name=snapshot_iter_153.pdz +ckpt_name=snapshot_iter_320000.pdz # with the following command, you can choose the stage range you want to run # such as `./run.sh --stage 0 --stop-stage 0` @@ -30,8 +30,3 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # synthesize, vocoder is pwgan by default CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # synthesize_e2e, vocoder is pwgan by default - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 -fi diff --git a/paddlespeech/t2s/datasets/preprocess_utils.py b/paddlespeech/t2s/datasets/preprocess_utils.py index 075a80e13..e8e4083a4 100644 --- a/paddlespeech/t2s/datasets/preprocess_utils.py +++ b/paddlespeech/t2s/datasets/preprocess_utils.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import re +from typing import List + import librosa import numpy as np @@ -42,7 +44,16 @@ def get_phn_dur(file_name): f.close() return sentence, speaker_set -def note2midi(notes): + +def note2midi(notes: List[str]) -> List[str]: + """Convert note string to note id, for example: ["C1"] -> [24] + + Args: + notes (List[str]): the list of note strings + + Returns: + List[str]: the list of note ids + """ midis = [] for note in notes: if note == 'rest': @@ -53,7 +64,21 @@ def note2midi(notes): return midis -def time2frame(times, sample_rate: int=24000, n_shift: int=128,): + +def time2frame( + times: List[float], + sample_rate: int=24000, + n_shift: int=128, ) -> List[int]: + """Convert the phoneme duration of time(s) into frames + + Args: + times (List[float]): phoneme duration of time(s) + sample_rate (int, optional): sample rate. Defaults to 24000. + n_shift (int, optional): frame shift. Defaults to 128. + + Returns: + List[int]: phoneme duration of frame + """ end = 0.0 ends = [] for t in times: @@ -63,14 +88,20 @@ def time2frame(times, sample_rate: int=24000, n_shift: int=128,): durations = np.diff(frame_pos, prepend=0) return durations -def get_sentences_svs(file_name, dataset: str='opencpop', sample_rate: int=24000, n_shift: int=128,): + +def get_sentences_svs( + file_name, + dataset: str='opencpop', + sample_rate: int=24000, + n_shift: int=128, ): ''' read label file Args: file_name (str or Path): path of gen_duration_from_textgrid.py's result dataset (str): dataset name Returns: - Dict: sentence: {'utt': ([char], [int])} + Dict: the information of each sentence, including [phone id (int)], [the frame of phone (int)], [note id (int)], [note duration (float)], [is slur (int)], text(str), speaker name (str) + tuple: speaker name ''' f = open(file_name, 'r') sentence = {} @@ -87,7 +118,10 @@ def get_sentences_svs(file_name, dataset: str='opencpop', sample_rate: int=24000 ph_dur = time2frame([float(t) for t in line_list[5].split()]) is_slur = line_list[6].split() assert len(ph) == len(midi) == len(midi_dur) == len(is_slur) - sentence[utt] = (ph, [int(i) for i in ph_dur], [int(i) for i in midi], [float(i) for i in midi_dur], [int(i) for i in is_slur], text, "opencpop") + sentence[utt] = (ph, [int(i) for i in ph_dur], + [int(i) for i in midi], + [float(i) for i in midi_dur], + [int(i) for i in is_slur], text, "opencpop") else: print("dataset should in {opencpop} now!") diff --git a/paddlespeech/t2s/exps/diffsinger/preprocess.py b/paddlespeech/t2s/exps/diffsinger/preprocess.py index e89a2f31e..f9322dc98 100644 --- a/paddlespeech/t2s/exps/diffsinger/preprocess.py +++ b/paddlespeech/t2s/exps/diffsinger/preprocess.py @@ -37,21 +37,28 @@ from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map from paddlespeech.t2s.datasets.preprocess_utils import merge_silence from paddlespeech.t2s.utils import str2bool -ALL_SHENGMU = ['zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j', - 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w'] -ALL_YUNMU = ['a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian', - 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iu', 'ng', 'o', 'ong', 'ou', - 'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've', 'vn'] - -def process_sentence(config: Dict[str, Any], - fp: Path, - sentences: Dict, - output_dir: Path, - mel_extractor=None, - pitch_extractor=None, - energy_extractor=None, - cut_sil: bool=True, - spk_emb_dir: Path=None,): +ALL_INITIALS = [ + 'zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', + 'j', 'q',
'x', 'r', 'z', 'c', 's', 'y', 'w' +] +ALL_FINALS = [ + 'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', + 'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iu', 'ng', 'o', 'ong', + 'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've', + 'vn' +] + + +def process_sentence( + config: Dict[str, Any], + fp: Path, + sentences: Dict, + output_dir: Path, + mel_extractor=None, + pitch_extractor=None, + energy_extractor=None, + cut_sil: bool=True, + spk_emb_dir: Path=None, ): utt_id = fp.stem record = None if utt_id in sentences: @@ -71,7 +78,7 @@ def process_sentence(config: Dict[str, Any], note_dur = sentences[utt_id][3] is_slur = sentences[utt_id][4] speaker = sentences[utt_id][-1] - + # extract mel feats logmel = mel_extractor.get_log_mel_fbank(wav) # change duration according to mel_length @@ -82,9 +89,13 @@ def process_sentence(config: Dict[str, Any], phones = sentences[utt_id][0] durations = sentences[utt_id][1] num_frames = logmel.shape[0] - word_boundary = [1 if x in ALL_YUNMU + ['AP', 'SP'] else 0 for x in phones] + word_boundary = [ + 1 if x in ALL_FINALS + ['AP', 'SP'] else 0 for x in phones + ] # print(sum(durations), num_frames) - assert sum(durations) == num_frames, "the sum of durations doesn't equal to the num of mel frames. " + assert sum( + durations + ) == num_frames, "the sum of durations doesn't equal to the num of mel frames. " speech_dir = output_dir / "data_speech" speech_dir.mkdir(parents=True, exist_ok=True) speech_path = speech_dir / (utt_id + "_speech.npy") @@ -128,17 +139,18 @@ def process_sentence(config: Dict[str, Any], return record -def process_sentences(config, - fps: List[Path], - sentences: Dict, - output_dir: Path, - mel_extractor=None, - pitch_extractor=None, - energy_extractor=None, - nprocs: int=1, - cut_sil: bool=True, - spk_emb_dir: Path=None, - write_metadata_method: str='w',): +def process_sentences( + config, + fps: List[Path], + sentences: Dict, + output_dir: Path, + mel_extractor=None, + pitch_extractor=None, + energy_extractor=None, + nprocs: int=1, + cut_sil: bool=True, + spk_emb_dir: Path=None, + write_metadata_method: str='w', ): if nprocs == 1: results = [] for fp in tqdm.tqdm(fps, total=len(fps)): @@ -151,7 +163,7 @@ def process_sentences(config, pitch_extractor=pitch_extractor, energy_extractor=energy_extractor, cut_sil=cut_sil, - spk_emb_dir=spk_emb_dir,) + spk_emb_dir=spk_emb_dir, ) if record: results.append(record) else: @@ -159,10 +171,17 @@ def process_sentences(config, futures = [] with tqdm.tqdm(total=len(fps)) as progress: for fp in fps: - future = pool.submit(process_sentence, config, fp, - sentences, output_dir, mel_extractor, - pitch_extractor, energy_extractor, - cut_sil, spk_emb_dir,) + future = pool.submit( + process_sentence, + config, + fp, + sentences, + output_dir, + mel_extractor, + pitch_extractor, + energy_extractor, + cut_sil, + spk_emb_dir, ) future.add_done_callback(lambda p: progress.update()) futures.append(future) @@ -202,7 +221,7 @@ def main(): parser.add_argument( "--label-file", default=None, type=str, help="path to label file.") - + parser.add_argument("--config", type=str, help="diffsinger config file.") parser.add_argument( @@ -235,7 +254,6 @@ def main(): dumpdir.mkdir(parents=True, exist_ok=True) label_file = Path(args.label_file).expanduser() - if args.spk_emb_dir: spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve() else: @@ -243,11 +261,15 @@ def main(): assert rootdir.is_dir() assert label_file.is_file() - + with open(args.config, 'rt') as f: config = 
CfgNode(yaml.safe_load(f)) - sentences, speaker_set = get_sentences_svs(label_file, dataset=args.dataset, sample_rate=config.fs, n_shift=config.n_shift,) + sentences, speaker_set = get_sentences_svs( + label_file, + dataset=args.dataset, + sample_rate=config.fs, + n_shift=config.n_shift, ) # merge_silence(sentences) phone_id_map_path = dumpdir / "phone_id_map.txt" diff --git a/paddlespeech/t2s/exps/diffsinger/train.py b/paddlespeech/t2s/exps/diffsinger/train.py index f0f2c0af7..c7940ad40 100644 --- a/paddlespeech/t2s/exps/diffsinger/train.py +++ b/paddlespeech/t2s/exps/diffsinger/train.py @@ -37,7 +37,7 @@ from paddlespeech.t2s.models.diffsinger import DiffSinger from paddlespeech.t2s.models.diffsinger import DiffSingerEvaluator from paddlespeech.t2s.models.diffsinger import DiffSingerUpdater from paddlespeech.t2s.models.diffsinger import DiffusionLoss -from paddlespeech.t2s.models.diffsinger import FastSpeech2MIDILoss +from paddlespeech.t2s.models.diffsinger.fastspeech2midi import FastSpeech2MIDILoss from paddlespeech.t2s.training.extensions.snapshot import Snapshot from paddlespeech.t2s.training.extensions.visualizer import VisualDL from paddlespeech.t2s.training.optimizer import build_optimizers @@ -45,6 +45,9 @@ from paddlespeech.t2s.training.seeding import seed_everything from paddlespeech.t2s.training.trainer import Trainer from paddlespeech.t2s.utils import str2bool +# from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss + + def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly @@ -75,11 +78,6 @@ def train_sp(args, config): spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) fields += ["spk_id"] - elif args.voice_cloning: - print("Training voice cloning!") - collate_fn = diffsinger_multi_spk_batch_fn - fields += ["spk_emb"] - converters["spk_emb"] = np.load else: collate_fn = diffsinger_single_spk_batch_fn print("single speaker diffsinger!") @@ -133,30 +131,28 @@ def train_sp(args, config): print("vocab_size:", vocab_size) odim = config.n_mels - config["fs2_model"]["idim"] = vocab_size - config["fs2_model"]["odim"] = odim - config["fs2_model"]["spk_num"] = spk_num - - model = DiffSinger( - fs2_config=config["fs2_model"], - denoiser_config=config["denoiser_model"], - diffusion_config=config["diffusion"]) + config["model"]["fastspeech2_params"]["spk_num"] = spk_num + model = DiffSinger(idim=vocab_size, odim=odim, **config["model"]) + model_fs2 = model.fs2 + model_ds = model.diffusion if world_size > 1: model = DataParallel(model) + model_fs2 = model._layers.fs2 + model_ds = model._layers.diffusion print("models done!") + # criterion_fs2 = FastSpeech2Loss(**config["fs2_updater"]) criterion_fs2 = FastSpeech2MIDILoss(**config["fs2_updater"]) criterion_ds = DiffusionLoss(**config["ds_updater"]) print("criterions done!") - optimizer_fs2 = build_optimizers(model._layers.fs2, - **config["fs2_optimizer"]) + optimizer_fs2 = build_optimizers(model_fs2, **config["fs2_optimizer"]) lr_schedule_ds = StepDecay(**config["ds_scheduler_params"]) gradient_clip_ds = nn.ClipGradByGlobalNorm(config["ds_grad_norm"]) optimizer_ds = AdamW( learning_rate=lr_schedule_ds, grad_clip=gradient_clip_ds, - parameters=model._layers.diffusion.parameters(), + parameters=model_ds.parameters(), **config["ds_optimizer_params"]) # optimizer_ds = build_optimizers(ds, **config["ds_optimizer"]) print("optimizer done!") @@ -189,7 +185,8 @@ def train_sp(args, config): "ds": criterion_ds, }, dataloader=dev_dataloader, - 
output_dir=output_dir) + output_dir=output_dir,) + trainer = Trainer( updater, stop_trigger=(config.train_max_steps, "iteration"), @@ -224,12 +221,6 @@ def main(): default=None, help="speaker id map file for multiple speaker model.") - parser.add_argument( - "--voice-cloning", - type=str2bool, - default=False, - help="whether training voice cloning model.") - args = parser.parse_args() with open(args.config) as f: diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 410775138..9bf51710f 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -23,7 +23,6 @@ from typing import Optional import numpy as np import onnxruntime as ort import paddle -import yaml from paddle import inference from paddle import jit from paddle.io import DataLoader @@ -358,13 +357,8 @@ def get_am_inference(am: str='fastspeech2_csmsc', am = am_class( idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"]) elif am_name == 'diffsinger': - am_config["fs2_model"]["idim"] = vocab_size - am_config["fs2_model"]["odim"] = am_config.n_mels - am_config["fs2_model"]["spk_num"] = spk_num - am = am_class( - fs2_config=am_config["fs2_model"], - denoiser_config=am_config["denoiser_model"], - diffusion_config=am_config["diffusion"]) + am_config["model"]["fastspeech2_params"]["spk_num"] = spk_num + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) elif am_name == 'speedyspeech': am = am_class( vocab_size=vocab_size, diff --git a/paddlespeech/t2s/models/diffsinger/diffsinger.py b/paddlespeech/t2s/models/diffsinger/diffsinger.py index 2f973bf0a..de9b39602 100644 --- a/paddlespeech/t2s/models/diffsinger/diffsinger.py +++ b/paddlespeech/t2s/models/diffsinger/diffsinger.py @@ -13,519 +13,147 @@ # limitations under the License. 
# Modified from espnet(https://github.com/espnet/espnet) """DiffSinger related modules for paddle""" +from typing import Any from typing import Dict -from typing import List -from typing import Sequence from typing import Tuple -from typing import Union -import numpy as np import paddle -import paddle.nn.functional as F from paddle import nn from typeguard import check_argument_types -from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 -from paddlespeech.t2s.modules.adversarial_loss.gradient_reversal import GradientReversalLayer -from paddlespeech.t2s.modules.adversarial_loss.speaker_classifier import SpeakerClassifier +from paddlespeech.t2s.models.diffsinger.fastspeech2midi import FastSpeech2MIDI from paddlespeech.t2s.modules.diffusion import GaussianDiffusion from paddlespeech.t2s.modules.diffusion import WaveNetDenoiser -from paddlespeech.t2s.modules.nets_utils import initialize -from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask -from paddlespeech.t2s.modules.nets_utils import make_pad_mask -from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictor -from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictorLoss -from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator -from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor -from paddlespeech.t2s.modules.tacotron2.decoder import Postnet -from paddlespeech.t2s.modules.transformer.encoder import CNNDecoder -from paddlespeech.t2s.modules.transformer.encoder import CNNPostnet -from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder -from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder -class FastSpeech2MIDI(FastSpeech2): - """The Fastspeech2 module of DiffSinger. +class DiffSinger(nn.Layer): + """DiffSinger module. + + This is a module of DiffSinger described in `DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism`._ + .. 
_`DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism`: + https://arxiv.org/pdf/2105.02446.pdf + + Args: + + Returns: + """ def __init__( self, - # fastspeech2 network structure related + # fastspeech2midi config idim: int, odim: int, - adim: int=384, - aheads: int=4, - elayers: int=6, - eunits: int=1536, - dlayers: int=6, - dunits: int=1536, - postnet_layers: int=5, - postnet_chans: int=512, - postnet_filts: int=5, - postnet_dropout_rate: float=0.5, - positionwise_layer_type: str="conv1d", - positionwise_conv_kernel_size: int=1, - use_scaled_pos_enc: bool=True, - use_batch_norm: bool=True, - encoder_normalize_before: bool=True, - decoder_normalize_before: bool=True, - encoder_concat_after: bool=False, - decoder_concat_after: bool=False, - reduction_factor: int=1, - encoder_type: str="transformer", - decoder_type: str="transformer", - # for transformer - transformer_enc_dropout_rate: float=0.1, - transformer_enc_positional_dropout_rate: float=0.1, - transformer_enc_attn_dropout_rate: float=0.1, - transformer_dec_dropout_rate: float=0.1, - transformer_dec_positional_dropout_rate: float=0.1, - transformer_dec_attn_dropout_rate: float=0.1, - # for conformer - conformer_pos_enc_layer_type: str="rel_pos", - conformer_self_attn_layer_type: str="rel_selfattn", - conformer_activation_type: str="swish", - use_macaron_style_in_conformer: bool=True, - use_cnn_in_conformer: bool=True, - zero_triu: bool=False, - conformer_enc_kernel_size: int=7, - conformer_dec_kernel_size: int=31, - # for CNN Decoder - cnn_dec_dropout_rate: float=0.2, - cnn_postnet_dropout_rate: float=0.2, - cnn_postnet_resblock_kernel_sizes: List[int]=[256, 256], - cnn_postnet_kernel_size: int=5, - cnn_decoder_embedding_dim: int=256, - # duration predictor - duration_predictor_layers: int=2, - duration_predictor_chans: int=384, - duration_predictor_kernel_size: int=3, - duration_predictor_dropout_rate: float=0.1, - # energy predictor - energy_predictor_layers: int=2, - energy_predictor_chans: int=384, - energy_predictor_kernel_size: int=3, - energy_predictor_dropout: float=0.5, - energy_embed_kernel_size: int=9, - energy_embed_dropout: float=0.5, - stop_gradient_from_energy_predictor: bool=False, - # pitch predictor - pitch_predictor_layers: int=2, - pitch_predictor_chans: int=384, - pitch_predictor_kernel_size: int=3, - pitch_predictor_dropout: float=0.5, - pitch_embed_kernel_size: int=9, - pitch_embed_dropout: float=0.5, - stop_gradient_from_pitch_predictor: bool=False, - # spk emb - spk_num: int=None, - spk_embed_dim: int=None, - spk_embed_integration_type: str="add", - # tone emb - tone_num: int=None, - tone_embed_dim: int=None, - tone_embed_integration_type: str="add", - # note emb + # music score related note_num: int=300, - # is_slur emb is_slur_num: int=2, - # training related - init_type: str="xavier_uniform", - init_enc_alpha: float=1.0, - init_dec_alpha: float=1.0, - # speaker classifier - enable_speaker_classifier: bool=False, - hidden_sc_dim: int=256, ): - """Initialize FastSpeech2 module for svs. 
+ fastspeech2_params: Dict[str, Any]={ + "adim": 384, + "aheads": 4, + "elayers": 6, + "eunits": 1536, + "dlayers": 6, + "dunits": 1536, + "postnet_layers": 5, + "postnet_chans": 512, + "postnet_filts": 5, + "postnet_dropout_rate": 0.5, + "positionwise_layer_type": "conv1d", + "positionwise_conv_kernel_size": 1, + "use_scaled_pos_enc": True, + "use_batch_norm": True, + "encoder_normalize_before": True, + "decoder_normalize_before": True, + "encoder_concat_after": False, + "decoder_concat_after": False, + "reduction_factor": 1, + # for transformer + "transformer_enc_dropout_rate": 0.1, + "transformer_enc_positional_dropout_rate": 0.1, + "transformer_enc_attn_dropout_rate": 0.1, + "transformer_dec_dropout_rate": 0.1, + "transformer_dec_positional_dropout_rate": 0.1, + "transformer_dec_attn_dropout_rate": 0.1, + # duration predictor + "duration_predictor_layers": 2, + "duration_predictor_chans": 384, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_rate": 0.1, + # energy predictor + "energy_predictor_layers": 2, + "energy_predictor_chans": 384, + "energy_predictor_kernel_size": 3, + "energy_predictor_dropout": 0.5, + "energy_embed_kernel_size": 9, + "energy_embed_dropout": 0.5, + "stop_gradient_from_energy_predictor": False, + # pitch predictor + "pitch_predictor_layers": 2, + "pitch_predictor_chans": 384, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout": 0.5, + "pitch_embed_kernel_size": 9, + "pitch_embed_dropout": 0.5, + "stop_gradient_from_pitch_predictor": False, + # spk emb + "spk_num": None, + "spk_embed_dim": None, + "spk_embed_integration_type": "add", + # training related + "init_type": "xavier_uniform", + "init_enc_alpha": 1.0, + "init_dec_alpha": 1.0, + # speaker classifier + "enable_speaker_classifier": False, + "hidden_sc_dim": 256, + }, + # denoiser config + denoiser_params: Dict[str, Any]={ + "in_channels": 80, + "out_channels": 80, + "kernel_size": 3, + "layers": 20, + "stacks": 5, + "residual_channels": 256, + "gate_channels": 512, + "skip_channels": 256, + "aux_channels": 256, + "dropout": 0., + "bias": True, + "use_weight_norm": False, + "init_type": "kaiming_normal", + }, + # diffusion config + diffusion_params: Dict[str, Any]={ + "num_train_timesteps": 100, + "beta_start": 0.0001, + "beta_end": 0.06, + "beta_schedule": "squaredcos_cap_v2", + "num_max_timesteps": 60 + }, ): + """Initialize DiffSinger module. + Args: idim (int): - Dimension of the inputs. + Dimension of the inputs (Input vocabrary size.). odim (int): - Dimension of the outputs. - adim (int): - Attention dimension. - aheads (int): - Number of attention heads. - elayers (int): - Number of encoder layers. - eunits (int): - Number of encoder hidden units. - dlayers (int): - Number of decoder layers. - dunits (int): - Number of decoder hidden units. - postnet_layers (int): - Number of postnet layers. - postnet_chans (int): - Number of postnet channels. - postnet_filts (int): - Kernel size of postnet. - postnet_dropout_rate (float): - Dropout rate in postnet. - use_scaled_pos_enc (bool): - Whether to use trainable scaled pos encoding. - use_batch_norm (bool): - Whether to use batch normalization in encoder prenet. - encoder_normalize_before (bool): - Whether to apply layernorm layer before encoder block. - decoder_normalize_before (bool): - Whether to apply layernorm layer before decoder block. - encoder_concat_after (bool): - Whether to concatenate attention layer's input and output in encoder. 
- decoder_concat_after (bool): - Whether to concatenate attention layer's input and output in decoder. - reduction_factor (int): - Reduction factor. - encoder_type (str): - Encoder type ("transformer" or "conformer"). - decoder_type (str): - Decoder type ("transformer" or "conformer"). - transformer_enc_dropout_rate (float): - Dropout rate in encoder except attention and positional encoding. - transformer_enc_positional_dropout_rate (float): - Dropout rate after encoder positional encoding. - transformer_enc_attn_dropout_rate (float): - Dropout rate in encoder self-attention module. - transformer_dec_dropout_rate (float): - Dropout rate in decoder except attention & positional encoding. - transformer_dec_positional_dropout_rate (float): - Dropout rate after decoder positional encoding. - transformer_dec_attn_dropout_rate (float): - Dropout rate in decoder self-attention module. - conformer_pos_enc_layer_type (str): - Pos encoding layer type in conformer. - conformer_self_attn_layer_type (str): - Self-attention layer type in conformer - conformer_activation_type (str): - Activation function type in conformer. - use_macaron_style_in_conformer (bool): - Whether to use macaron style FFN. - use_cnn_in_conformer (bool): - Whether to use CNN in conformer. - zero_triu (bool): - Whether to use zero triu in relative self-attention module. - conformer_enc_kernel_size (int): - Kernel size of encoder conformer. - conformer_dec_kernel_size (int): - Kernel size of decoder conformer. - duration_predictor_layers (int): - Number of duration predictor layers. - duration_predictor_chans (int): - Number of duration predictor channels. - duration_predictor_kernel_size (int): - Kernel size of duration predictor. - duration_predictor_dropout_rate (float): - Dropout rate in duration predictor. - pitch_predictor_layers (int): - Number of pitch predictor layers. - pitch_predictor_chans (int): - Number of pitch predictor channels. - pitch_predictor_kernel_size (int): - Kernel size of pitch predictor. - pitch_predictor_dropout_rate (float): - Dropout rate in pitch predictor. - pitch_embed_kernel_size (float): - Kernel size of pitch embedding. - pitch_embed_dropout_rate (float): - Dropout rate for pitch embedding. - stop_gradient_from_pitch_predictor (bool): - Whether to stop gradient from pitch predictor to encoder. - energy_predictor_layers (int): - Number of energy predictor layers. - energy_predictor_chans (int): - Number of energy predictor channels. - energy_predictor_kernel_size (int): - Kernel size of energy predictor. - energy_predictor_dropout_rate (float): - Dropout rate in energy predictor. - energy_embed_kernel_size (float): - Kernel size of energy embedding. - energy_embed_dropout_rate (float): - Dropout rate for energy embedding. - stop_gradient_from_energy_predictor (bool): - Whether to stop gradient from energy predictor to encoder. - spk_num (Optional[int]): - Number of speakers. If not None, assume that the spk_embed_dim is not None, - spk_ids will be provided as the input and use spk_embedding_table. - spk_embed_dim (Optional[int]): - Speaker embedding dimension. If not None, - assume that spk_emb will be provided as the input or spk_num is not None. - spk_embed_integration_type (str): - How to integrate speaker embedding. - tone_num (Optional[int]): - Number of tones. If not None, assume that the - tone_ids will be provided as the input and use tone_embedding_table. - tone_embed_dim (Optional[int]): - Tone embedding dimension. If not None, assume that tone_num is not None. 
- tone_embed_integration_type (str): - How to integrate tone embedding. - init_type (str): - How to initialize transformer parameters. - init_enc_alpha (float): - Initial value of alpha in scaled pos encoding of the encoder. - init_dec_alpha (float): - Initial value of alpha in scaled pos encoding of the decoder. - enable_speaker_classifier (bool): - Whether to use speaker classifier module - hidden_sc_dim (int): - The hidden layer dim of speaker classifier - note_num (Optional[int]): - Number of note. If not None, assume that the - note_ids will be provided as the input and use note_embedding_table. - is_slur_num (Optional[int]): - Number of note. If not None, assume that the - is_slur_ids will be provided as the input - + Dimension of the outputs (Acoustic feature dimension.). + note_num (int, optional): The number of note. Defaults to 300. + is_slur_num (int, optional): The number of slur. Defaults to 2. + fastspeech2_params (Dict[str, Any]): Parameter dict for fastspeech2 module. + denoiser_params (Dict[str, Any]): Parameter dict for dinoiser module. + diffusion_params (Dict[str, Any]): Parameter dict for diffusion module. """ assert check_argument_types() - super().__init__(idim, odim) - - # store hyperparameters - self.odim = odim - self.reduction_factor = reduction_factor - self.encoder_type = encoder_type - self.decoder_type = decoder_type - self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor - self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor - self.use_scaled_pos_enc = use_scaled_pos_enc - self.hidden_sc_dim = hidden_sc_dim - self.spk_num = spk_num - self.enable_speaker_classifier = enable_speaker_classifier - - self.spk_embed_dim = spk_embed_dim - if self.spk_embed_dim is not None: - self.spk_embed_integration_type = spk_embed_integration_type - - self.tone_embed_dim = tone_embed_dim - if self.tone_embed_dim is not None: - self.tone_embed_integration_type = tone_embed_integration_type - - self.note_embed_dim = adim - if self.note_embed_dim is not None: - self.note_dur_layer = nn.Linear(1, self.note_embed_dim) - - self.is_slur_embed_dim = adim - - # use idx 0 as padding idx - self.padding_idx = 0 - - # initialize parameters - initialize(self, init_type) - - if spk_num and self.spk_embed_dim: - self.spk_embedding_table = nn.Embedding( - num_embeddings=spk_num, - embedding_dim=self.spk_embed_dim, - padding_idx=self.padding_idx) - - if self.tone_embed_dim is not None: - self.tone_embedding_table = nn.Embedding( - num_embeddings=tone_num, - embedding_dim=self.tone_embed_dim, - padding_idx=self.padding_idx) - - if note_num and self.note_embed_dim: - self.note_embedding_table = nn.Embedding( - num_embeddings=note_num, - embedding_dim=self.note_embed_dim, - padding_idx=self.padding_idx) - - if is_slur_num and self.is_slur_embed_dim: - self.is_slur_embedding_table = nn.Embedding( - num_embeddings=is_slur_num, - embedding_dim=self.is_slur_embed_dim, - padding_idx=self.padding_idx) - - # get positional encoding layer type - transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos" - - # define encoder - encoder_input_layer = nn.Embedding( - num_embeddings=idim, - embedding_dim=adim, - padding_idx=self.padding_idx) - - if encoder_type == "transformer": - self.encoder = TransformerEncoder( - idim=idim, - attention_dim=adim, - attention_heads=aheads, - linear_units=eunits, - num_blocks=elayers, - input_layer=encoder_input_layer, - dropout_rate=transformer_enc_dropout_rate, - 
positional_dropout_rate=transformer_enc_positional_dropout_rate, - attention_dropout_rate=transformer_enc_attn_dropout_rate, - pos_enc_layer_type=transformer_pos_enc_layer_type, - normalize_before=encoder_normalize_before, - concat_after=encoder_concat_after, - positionwise_layer_type=positionwise_layer_type, - positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) - elif encoder_type == "conformer": - self.encoder = ConformerEncoder( - idim=idim, - attention_dim=adim, - attention_heads=aheads, - linear_units=eunits, - num_blocks=elayers, - input_layer=encoder_input_layer, - dropout_rate=transformer_enc_dropout_rate, - positional_dropout_rate=transformer_enc_positional_dropout_rate, - attention_dropout_rate=transformer_enc_attn_dropout_rate, - normalize_before=encoder_normalize_before, - concat_after=encoder_concat_after, - positionwise_layer_type=positionwise_layer_type, - positionwise_conv_kernel_size=positionwise_conv_kernel_size, - macaron_style=use_macaron_style_in_conformer, - pos_enc_layer_type=conformer_pos_enc_layer_type, - selfattention_layer_type=conformer_self_attn_layer_type, - activation_type=conformer_activation_type, - use_cnn_module=use_cnn_in_conformer, - cnn_module_kernel=conformer_enc_kernel_size, - zero_triu=zero_triu, ) - else: - raise ValueError(f"{encoder_type} is not supported.") - - # define additional projection for speaker embedding - if self.spk_embed_dim is not None: - if self.spk_embed_integration_type == "add": - self.spk_projection = nn.Linear(self.spk_embed_dim, adim) - else: - self.spk_projection = nn.Linear(adim + self.spk_embed_dim, adim) - - # define additional projection for tone embedding - if self.tone_embed_dim is not None: - if self.tone_embed_integration_type == "add": - self.tone_projection = nn.Linear(self.tone_embed_dim, adim) - else: - self.tone_projection = nn.Linear(adim + self.tone_embed_dim, - adim) - - if self.spk_num and self.enable_speaker_classifier: - # set lambda = 1 - self.grad_reverse = GradientReversalLayer(1) - self.speaker_classifier = SpeakerClassifier( - idim=adim, hidden_sc_dim=self.hidden_sc_dim, spk_num=spk_num) - - # define duration predictor - self.duration_predictor = DurationPredictor( - idim=adim, - n_layers=duration_predictor_layers, - n_chans=duration_predictor_chans, - kernel_size=duration_predictor_kernel_size, - dropout_rate=duration_predictor_dropout_rate, ) - - # define pitch predictor - self.pitch_predictor = VariancePredictor( - idim=adim, - n_layers=pitch_predictor_layers, - n_chans=pitch_predictor_chans, - kernel_size=pitch_predictor_kernel_size, - dropout_rate=pitch_predictor_dropout, ) - # We use continuous pitch + FastPitch style avg - self.pitch_embed = nn.Sequential( - nn.Conv1D( - in_channels=1, - out_channels=adim, - kernel_size=pitch_embed_kernel_size, - padding=(pitch_embed_kernel_size - 1) // 2, ), - nn.Dropout(pitch_embed_dropout), ) - - # define energy predictor - self.energy_predictor = VariancePredictor( - idim=adim, - n_layers=energy_predictor_layers, - n_chans=energy_predictor_chans, - kernel_size=energy_predictor_kernel_size, - dropout_rate=energy_predictor_dropout, ) - # We use continuous enegy + FastPitch style avg - self.energy_embed = nn.Sequential( - nn.Conv1D( - in_channels=1, - out_channels=adim, - kernel_size=energy_embed_kernel_size, - padding=(energy_embed_kernel_size - 1) // 2, ), - nn.Dropout(energy_embed_dropout), ) - - # define length regulator - self.length_regulator = LengthRegulator() - - # define decoder - # NOTE: we use encoder as decoder - # because fastspeech's 
decoder is the same as encoder - if decoder_type == "transformer": - self.decoder = TransformerEncoder( - idim=0, - attention_dim=adim, - attention_heads=aheads, - linear_units=dunits, - num_blocks=dlayers, - # in decoder, don't need layer before pos_enc_class (we use embedding here in encoder) - input_layer=None, - dropout_rate=transformer_dec_dropout_rate, - positional_dropout_rate=transformer_dec_positional_dropout_rate, - attention_dropout_rate=transformer_dec_attn_dropout_rate, - pos_enc_layer_type=transformer_pos_enc_layer_type, - normalize_before=decoder_normalize_before, - concat_after=decoder_concat_after, - positionwise_layer_type=positionwise_layer_type, - positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) - elif decoder_type == "conformer": - self.decoder = ConformerEncoder( - idim=0, - attention_dim=adim, - attention_heads=aheads, - linear_units=dunits, - num_blocks=dlayers, - input_layer=None, - dropout_rate=transformer_dec_dropout_rate, - positional_dropout_rate=transformer_dec_positional_dropout_rate, - attention_dropout_rate=transformer_dec_attn_dropout_rate, - normalize_before=decoder_normalize_before, - concat_after=decoder_concat_after, - positionwise_layer_type=positionwise_layer_type, - positionwise_conv_kernel_size=positionwise_conv_kernel_size, - macaron_style=use_macaron_style_in_conformer, - pos_enc_layer_type=conformer_pos_enc_layer_type, - selfattention_layer_type=conformer_self_attn_layer_type, - activation_type=conformer_activation_type, - use_cnn_module=use_cnn_in_conformer, - cnn_module_kernel=conformer_dec_kernel_size, ) - elif decoder_type == 'cnndecoder': - self.decoder = CNNDecoder( - emb_dim=adim, - odim=odim, - kernel_size=cnn_postnet_kernel_size, - dropout_rate=cnn_dec_dropout_rate, - resblock_kernel_sizes=cnn_postnet_resblock_kernel_sizes) - else: - raise ValueError(f"{decoder_type} is not supported.") - - # define final projection - self.feat_out = nn.Linear(adim, odim * reduction_factor) - - # define postnet - if decoder_type == 'cnndecoder': - self.postnet = CNNPostnet( - odim=odim, - kernel_size=cnn_postnet_kernel_size, - dropout_rate=cnn_postnet_dropout_rate, - resblock_kernel_sizes=cnn_postnet_resblock_kernel_sizes) - else: - self.postnet = (None if postnet_layers == 0 else Postnet( - idim=idim, - odim=odim, - n_layers=postnet_layers, - n_chans=postnet_chans, - n_filts=postnet_filts, - use_batch_norm=use_batch_norm, - dropout_rate=postnet_dropout_rate, )) - - nn.initializer.set_global_initializer(None) - - self._reset_parameters( - init_enc_alpha=init_enc_alpha, - init_dec_alpha=init_dec_alpha, ) + super().__init__() + self.fs2 = FastSpeech2MIDI( + idim=idim, + odim=odim, + fastspeech2_config=fastspeech2_params, + note_num=note_num, + is_slur_num=is_slur_num) + denoiser = WaveNetDenoiser(**denoiser_params) + self.diffusion = GaussianDiffusion(denoiser, **diffusion_params) def forward( self, @@ -539,326 +167,76 @@ class FastSpeech2MIDI(FastSpeech2): durations: paddle.Tensor, pitch: paddle.Tensor, energy: paddle.Tensor, - tone_id: paddle.Tensor=None, spk_emb: paddle.Tensor=None, spk_id: paddle.Tensor=None, + train_fs2: bool=True, ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: """Calculate forward propagation. Args: text(Tensor(int64)): - Batch of padded token ids (B, Tmax). + Batch of padded token (phone) ids (B, Tmax). + note(Tensor(int64)): + Batch of padded note (element in music score) ids (B, Tmax). + note_dur(Tensor(float32)): + Batch of padded note durations in seconds (element in music score) (B, Tmax). 
+ is_slur(Tensor(int64)): + Batch of padded slur (element in music score) ids (B, Tmax). text_lengths(Tensor(int64)): - Batch of lengths of each input (B,). - speech(Tensor): - Batch of padded target features (B, Lmax, odim). + Batch of phone lengths of each input (B,). + speech(Tensor[float32]): + Batch of padded target features (e.g. mel) (B, Lmax, odim). speech_lengths(Tensor(int64)): - Batch of the lengths of each target (B,). + Batch of the lengths of each target features (B,). durations(Tensor(int64)): - Batch of padded durations (B, Tmax). - pitch(Tensor): - Batch of padded token-averaged pitch (B, Tmax, 1). - energy(Tensor): - Batch of padded token-averaged energy (B, Tmax, 1). - tone_id(Tensor, optional(int64)): - Batch of padded tone ids (B, Tmax). - spk_emb(Tensor, optional): + Batch of padded token durations in frame (B, Tmax). + pitch(Tensor[float32]): + Batch of padded frame-averaged pitch (B, Lmax, 1). + energy(Tensor[float32]): + Batch of padded frame-averaged energy (B, Lmax, 1). + spk_emb(Tensor[float32], optional): Batch of speaker embeddings (B, spk_embed_dim). - spk_id(Tnesor, optional(int64)): + spk_id(Tnesor[int64], optional(int64)): Batch of speaker ids (B,) + train_fs2(bool): + Whether to train only the fastspeech2 module Returns: """ - - # input of embedding must be int64 - xs = paddle.cast(text, 'int64') - note = paddle.cast(note, 'int64') - note_dur = paddle.cast(note_dur, 'float32') - is_slur = paddle.cast(is_slur, 'int64') - ilens = paddle.cast(text_lengths, 'int64') - ds = paddle.cast(durations, 'int64') - olens = paddle.cast(speech_lengths, 'int64') - ys = speech - ps = pitch - es = energy - if spk_id is not None: - spk_id = paddle.cast(spk_id, 'int64') - if tone_id is not None: - tone_id = paddle.cast(tone_id, 'int64') - # forward propagation - before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits = self._forward( - xs, - note, - note_dur, - is_slur, - ilens, - olens, - ds, - ps, - es, - is_inference=False, - spk_emb=spk_emb, - spk_id=spk_id, - tone_id=tone_id) - # modify mod part of groundtruth - if self.reduction_factor > 1: - olens = olens - olens % self.reduction_factor - max_olen = max(olens) - ys = ys[:, :max_olen] - - return before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits - - def _forward(self, - xs: paddle.Tensor, - note: paddle.Tensor, - note_dur: paddle.Tensor, - is_slur: paddle.Tensor, - ilens: paddle.Tensor, - olens: paddle.Tensor=None, - ds: paddle.Tensor=None, - ps: paddle.Tensor=None, - es: paddle.Tensor=None, - is_inference: bool=False, - is_train_diffusion: bool=False, - return_after_enc=False, - alpha: float=1.0, - spk_emb=None, - spk_id=None, - tone_id=None) -> Sequence[paddle.Tensor]: - # forward encoder - x_masks = self._source_mask(ilens) - note_emb = self.note_embedding_table(note) - note_dur_emb = self.note_dur_layer(paddle.unsqueeze(note_dur, axis=-1)) - is_slur_emb = self.is_slur_embedding_table(is_slur) - - # (B, Tmax, adim) - hs, _ = self.encoder( - xs, - x_masks, - note_emb, - note_dur_emb, - is_slur_emb, ) - - if self.spk_num and self.enable_speaker_classifier and not is_inference: - hs_for_spk_cls = self.grad_reverse(hs) - spk_logits = self.speaker_classifier(hs_for_spk_cls, ilens) - else: - spk_logits = None - - # integrate speaker embedding - if self.spk_embed_dim is not None: - # spk_emb has a higher priority than spk_id - if spk_emb is not None: - hs = self._integrate_with_spk_embed(hs, spk_emb) - elif spk_id is not None: - spk_emb = self.spk_embedding_table(spk_id) - hs = 
self._integrate_with_spk_embed(hs, spk_emb) - - # integrate tone embedding - if self.tone_embed_dim is not None: - if tone_id is not None: - tone_embs = self.tone_embedding_table(tone_id) - hs = self._integrate_with_tone_embed(hs, tone_embs) - # forward duration predictor and variance predictors - d_masks = make_pad_mask(ilens) - if olens is not None: - pitch_masks = make_pad_mask(olens).unsqueeze(-1) - else: - pitch_masks = None - - # inference for decoder input for duffusion - if is_train_diffusion: - hs = self.length_regulator(hs, ds, is_inference=False) - p_outs = self.pitch_predictor(hs.detach(), pitch_masks) - e_outs = self.energy_predictor(hs.detach(), pitch_masks) - p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose( - (0, 2, 1)) - e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose( - (0, 2, 1)) - hs = hs + e_embs + p_embs - - elif is_inference: - # (B, Tmax) - if ds is not None: - d_outs = ds - else: - d_outs = self.duration_predictor.inference(hs, d_masks) - - # (B, Lmax, adim) - hs = self.length_regulator(hs, d_outs, alpha, is_inference=True) - - if ps is not None: - p_outs = ps - else: - if self.stop_gradient_from_pitch_predictor: - p_outs = self.pitch_predictor(hs.detach(), pitch_masks) - else: - p_outs = self.pitch_predictor(hs, pitch_masks) - - if es is not None: - e_outs = es - else: - if self.stop_gradient_from_energy_predictor: - e_outs = self.energy_predictor(hs.detach(), pitch_masks) - else: - e_outs = self.energy_predictor(hs, pitch_masks) - - p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose( - (0, 2, 1)) - e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose( - (0, 2, 1)) - hs = hs + e_embs + p_embs - - # training - else: - d_outs = self.duration_predictor(hs, d_masks) - # (B, Lmax, adim) - hs = self.length_regulator(hs, ds, is_inference=False) - if self.stop_gradient_from_pitch_predictor: - p_outs = self.pitch_predictor(hs.detach(), pitch_masks) - else: - p_outs = self.pitch_predictor(hs, pitch_masks) - if self.stop_gradient_from_energy_predictor: - e_outs = self.energy_predictor(hs.detach(), pitch_masks) - else: - e_outs = self.energy_predictor(hs, pitch_masks) - p_embs = self.pitch_embed(ps.transpose((0, 2, 1))).transpose( - (0, 2, 1)) - e_embs = self.energy_embed(es.transpose((0, 2, 1))).transpose( - (0, 2, 1)) - hs = hs + e_embs + p_embs - - # forward decoder - if olens is not None and not is_inference: - if self.reduction_factor > 1: - olens_in = paddle.to_tensor( - [olen // self.reduction_factor for olen in olens.numpy()]) - else: - olens_in = olens - # (B, 1, T) - h_masks = self._source_mask(olens_in) - else: - h_masks = None - - if return_after_enc: - return hs, h_masks - - if self.decoder_type == 'cnndecoder': - # remove output masks for dygraph to static graph - zs = self.decoder(hs, h_masks) - before_outs = zs - else: - # (B, Lmax, adim) - zs, _ = self.decoder(hs, h_masks) - # (B, Lmax, odim) - before_outs = self.feat_out(zs).reshape( - (paddle.shape(zs)[0], -1, self.odim)) - - # postnet -> (B, Lmax//r * r, odim) - if self.postnet is None: - after_outs = before_outs - else: - after_outs = before_outs + self.postnet( - before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) - - return before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits - - def encoder_infer( - self, - text: paddle.Tensor, - note: paddle.Tensor, - note_dur: paddle.Tensor, - is_slur: paddle.Tensor, - alpha: float=1.0, - spk_emb=None, - spk_id=None, - tone_id=None, - ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: - - # 
input of embedding must be int64 - x = paddle.cast(text, 'int64') - note = paddle.cast(note, 'int64') - note_dur = paddle.cast(note_dur, 'float32') - is_slur = paddle.cast(is_slur, 'int64') - # setup batch axis - ilens = paddle.shape(x)[0] - - xs = x.unsqueeze(0) - note = note.unsqueeze(0) - note_dur = note_dur.unsqueeze(0) - is_slur = is_slur.unsqueeze(0) - - if spk_emb is not None: - spk_emb = spk_emb.unsqueeze(0) - - if tone_id is not None: - tone_id = tone_id.unsqueeze(0) - - # (1, L, odim) - # use *_ to avoid bug in dygraph to static graph - hs, _ = self._forward( - xs, - note, - note_dur, - is_slur, - ilens, - is_inference=True, - return_after_enc=True, - alpha=alpha, - spk_emb=spk_emb, + # only train fastspeech2 module firstly + before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.fs2( + text=text, + note=note, + note_dur=note_dur, + is_slur=is_slur, + text_lengths=text_lengths, + speech=speech, + speech_lengths=speech_lengths, + durations=durations, + pitch=pitch, + energy=energy, spk_id=spk_id, - tone_id=tone_id) - return hs - - # for diffusion - def encoder_infer_batch( - self, - text: paddle.Tensor, - note: paddle.Tensor, - note_dur: paddle.Tensor, - is_slur: paddle.Tensor, - text_lengths: paddle.Tensor, - speech_lengths: paddle.Tensor, - ds: paddle.Tensor=None, - ps: paddle.Tensor=None, - es: paddle.Tensor=None, - alpha: float=1.0, - spk_emb=None, - spk_id=None, - tone_id=None, ) -> Tuple[paddle.Tensor, paddle.Tensor]: - - # input of embedding must be int64 - xs = paddle.cast(text, 'int64') - note = paddle.cast(note, 'int64') - note_dur = paddle.cast(note_dur, 'float32') - is_slur = paddle.cast(is_slur, 'int64') - - if spk_emb is not None: - spk_emb = spk_emb.unsqueeze(0) + spk_emb=spk_emb) + if train_fs2: + return before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits - if tone_id is not None: - tone_id = tone_id.unsqueeze(0) + # get the encoder output from fastspeech2 as the condition of denoiser module + cond_fs2, mel_masks = self.fs2.encoder_infer_batch( + text=text, + note=note, + note_dur=note_dur, + is_slur=is_slur, + text_lengths=text_lengths, + speech_lengths=speech_lengths, + ds=durations, + ps=pitch, + es=energy) + cond_fs2 = cond_fs2.transpose((0, 2, 1)) - # (1, L, odim) - # use *_ to avoid bug in dygraph to static graph - hs, h_masks = self._forward( - xs, - note, - note_dur, - is_slur, - ilens=text_lengths, - olens=speech_lengths, - ds=ds, - ps=ps, - es=es, - return_after_enc=True, - is_train_diffusion=True, - alpha=alpha, - spk_emb=spk_emb, - spk_id=spk_id, - tone_id=tone_id) - return hs, h_masks + # get the output(final mel) from diffusion module + mel = self.diffusion(speech.transpose((0, 2, 1)), cond_fs2.detach()) + return mel[0], mel_masks def inference( self, @@ -866,253 +244,54 @@ class FastSpeech2MIDI(FastSpeech2): note: paddle.Tensor, note_dur: paddle.Tensor, is_slur: paddle.Tensor, - durations: paddle.Tensor=None, - pitch: paddle.Tensor=None, - energy: paddle.Tensor=None, - alpha: float=1.0, - use_teacher_forcing: bool=False, - spk_emb=None, - spk_id=None, - tone_id=None, - ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: - """Generate the sequence of features given the sequences of characters. + get_mel_fs2: bool=False, ): + """Run inference Args: text(Tensor(int64)): - Input sequence of characters (T,). - durations(Tensor, optional (int64)): - Groundtruth of duration (T,). - pitch(Tensor, optional): - Groundtruth of token-averaged pitch (T, 1). 
- energy(Tensor, optional): - Groundtruth of token-averaged energy (T, 1). - alpha(float, optional): - Alpha to control the speed. - use_teacher_forcing(bool, optional): - Whether to use teacher forcing. - If true, groundtruth of duration, pitch and energy will be used. - spk_emb(Tensor, optional, optional): - peaker embedding vector (spk_embed_dim,). (Default value = None) - spk_id(Tensor, optional(int64), optional): - spk ids (1,). (Default value = None) - tone_id(Tensor, optional(int64), optional): - tone ids (T,). (Default value = None) + Batch of padded token (phone) ids (B, Tmax). + note(Tensor(int64)): + Batch of padded note (element in music score) ids (B, Tmax). + note_dur(Tensor(float32)): + Batch of padded note durations in seconds (element in music score) (B, Tmax). + is_slur(Tensor(int64)): + Batch of padded slur (element in music score) ids (B, Tmax). + get_mel_fs2 (bool, optional): . Defaults to False. + Whether to get mel from fastspeech2 module. Returns: - + _type_: _description_ """ - # input of embedding must be int64 - x = paddle.cast(text, 'int64') - note = paddle.cast(note, 'int64') - note_dur = paddle.cast(note_dur, 'float32') - is_slur = paddle.cast(is_slur, 'int64') - d, p, e = durations, pitch, energy - # setup batch axis - ilens = paddle.shape(x)[0] - - xs = x.unsqueeze(0) - note = note.unsqueeze(0) - note_dur = note_dur.unsqueeze(0) - is_slur = is_slur.unsqueeze(0) - - if spk_emb is not None: - spk_emb = spk_emb.unsqueeze(0) - - if tone_id is not None: - tone_id = tone_id.unsqueeze(0) - - if use_teacher_forcing: - # use groundtruth of duration, pitch, and energy - ds = d.unsqueeze(0) if d is not None else None - ps = p.unsqueeze(0) if p is not None else None - es = e.unsqueeze(0) if e is not None else None - - # (1, L, odim) - _, outs, d_outs, p_outs, e_outs, _ = self._forward( - xs, - note, - note_dur, - is_slur, - ilens, - ds=ds, - ps=ps, - es=es, - spk_emb=spk_emb, - spk_id=spk_id, - tone_id=tone_id, - is_inference=True) - else: - # (1, L, odim) - _, outs, d_outs, p_outs, e_outs, _ = self._forward( - xs, - note, - note_dur, - is_slur, - ilens, - is_inference=True, - alpha=alpha, - spk_emb=spk_emb, - spk_id=spk_id, - tone_id=tone_id) - - return outs[0], d_outs[0], p_outs[0], e_outs[0] - + mel_fs2, _, _, _ = self.fs2.inference(text, note, note_dur, is_slur) + if get_mel_fs2: + return mel_fs2 + mel_fs2 = mel_fs2.unsqueeze(0).transpose((0, 2, 1)) + cond_fs2 = self.fs2.encoder_infer(text, note, note_dur, is_slur) + cond_fs2 = cond_fs2.transpose((0, 2, 1)) + mel, _ = self.diffusion(mel_fs2, cond_fs2) + mel = mel.transpose((0, 2, 1)) + return mel[0] -class FastSpeech2MIDILoss(nn.Layer): - """Loss function module for DiffSinger.""" - def __init__(self, use_masking: bool=True, - use_weighted_masking: bool=False): - """Initialize feed-forward Transformer loss module. - Args: - use_masking (bool): - Whether to apply masking for padded part in loss calculation. - use_weighted_masking (bool): - Whether to weighted masking in loss calculation. 
- """ - assert check_argument_types() +class DiffSingerInference(nn.Layer): + def __init__(self, normalizer, model): super().__init__() + self.normalizer = normalizer + self.acoustic_model = model - assert (use_masking != use_weighted_masking) or not use_masking - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking - - # define criterions - reduction = "none" if self.use_weighted_masking else "mean" - self.l1_criterion = nn.L1Loss(reduction=reduction) - self.mse_criterion = nn.MSELoss(reduction=reduction) - self.duration_criterion = DurationPredictorLoss(reduction=reduction) - self.ce_criterion = nn.CrossEntropyLoss() - - def forward( - self, - after_outs: paddle.Tensor, - before_outs: paddle.Tensor, - d_outs: paddle.Tensor, - p_outs: paddle.Tensor, - e_outs: paddle.Tensor, - ys: paddle.Tensor, - ds: paddle.Tensor, - ps: paddle.Tensor, - es: paddle.Tensor, - ilens: paddle.Tensor, - olens: paddle.Tensor, - spk_logits: paddle.Tensor=None, - spk_ids: paddle.Tensor=None, - ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor, - paddle.Tensor, ]: - """Calculate forward propagation. - - Args: - after_outs(Tensor): - Batch of outputs after postnets (B, Lmax, odim). - before_outs(Tensor): - Batch of outputs before postnets (B, Lmax, odim). - d_outs(Tensor): - Batch of outputs of duration predictor (B, Tmax). - p_outs(Tensor): - Batch of outputs of pitch predictor (B, Tmax, 1). - e_outs(Tensor): - Batch of outputs of energy predictor (B, Tmax, 1). - ys(Tensor): - Batch of target features (B, Lmax, odim). - ds(Tensor): - Batch of durations (B, Tmax). - ps(Tensor): - Batch of target token-averaged pitch (B, Tmax, 1). - es(Tensor): - Batch of target token-averaged energy (B, Tmax, 1). - ilens(Tensor): - Batch of the lengths of each input (B,). - olens(Tensor): - Batch of the lengths of each target (B,). 
- spk_logits(Option[Tensor]): - Batch of outputs after speaker classifier (B, Lmax, num_spk) - spk_ids(Option[Tensor]): - Batch of target spk_id (B,) - - - Returns: - - - """ - speaker_loss = 0.0 - - # apply mask to remove padded part - if self.use_masking: - out_masks = make_non_pad_mask(olens).unsqueeze(-1) - before_outs = before_outs.masked_select( - out_masks.broadcast_to(before_outs.shape)) - if after_outs is not None: - after_outs = after_outs.masked_select( - out_masks.broadcast_to(after_outs.shape)) - ys = ys.masked_select(out_masks.broadcast_to(ys.shape)) - duration_masks = make_non_pad_mask(ilens) - d_outs = d_outs.masked_select( - duration_masks.broadcast_to(d_outs.shape)) - ds = ds.masked_select(duration_masks.broadcast_to(ds.shape)) - pitch_masks = out_masks - p_outs = p_outs.masked_select( - pitch_masks.broadcast_to(p_outs.shape)) - e_outs = e_outs.masked_select( - pitch_masks.broadcast_to(e_outs.shape)) - ps = ps.masked_select(pitch_masks.broadcast_to(ps.shape)) - es = es.masked_select(pitch_masks.broadcast_to(es.shape)) - - if spk_logits is not None and spk_ids is not None: - batch_size = spk_ids.shape[0] - spk_ids = paddle.repeat_interleave(spk_ids, spk_logits.shape[1], - None) - spk_logits = paddle.reshape(spk_logits, - [-1, spk_logits.shape[-1]]) - mask_index = spk_logits.abs().sum(axis=1) != 0 - spk_ids = spk_ids[mask_index] - spk_logits = spk_logits[mask_index] - - # calculate loss - l1_loss = self.l1_criterion(before_outs, ys) - if after_outs is not None: - l1_loss += self.l1_criterion(after_outs, ys) - duration_loss = self.duration_criterion(d_outs, ds) - pitch_loss = self.mse_criterion(p_outs, ps) - energy_loss = self.mse_criterion(e_outs, es) - - if spk_logits is not None and spk_ids is not None: - speaker_loss = self.ce_criterion(spk_logits, spk_ids) / batch_size - - # make weighted mask and apply it - if self.use_weighted_masking: - out_masks = make_non_pad_mask(olens).unsqueeze(-1) - out_weights = out_masks.cast(dtype=paddle.float32) / out_masks.cast( - dtype=paddle.float32).sum( - axis=1, keepdim=True) - out_weights /= ys.shape[0] * ys.shape[2] - duration_masks = make_non_pad_mask(ilens) - duration_weights = (duration_masks.cast(dtype=paddle.float32) / - duration_masks.cast(dtype=paddle.float32).sum( - axis=1, keepdim=True)) - duration_weights /= ds.shape[0] - - # apply weight - l1_loss = l1_loss.multiply(out_weights) - l1_loss = l1_loss.masked_select( - out_masks.broadcast_to(l1_loss.shape)).sum() - duration_loss = (duration_loss.multiply(duration_weights) - .masked_select(duration_masks).sum()) - pitch_masks = out_masks - pitch_weights = out_weights - pitch_loss = pitch_loss.multiply(pitch_weights) - pitch_loss = pitch_loss.masked_select( - pitch_masks.broadcast_to(pitch_loss.shape)).sum() - energy_loss = energy_loss.multiply(pitch_weights) - energy_loss = energy_loss.masked_select( - pitch_masks.broadcast_to(energy_loss.shape)).sum() - - return l1_loss, duration_loss, pitch_loss, energy_loss, speaker_loss + def forward(self, text, note, note_dur, is_slur, get_mel_fs2: bool=False): + normalized_mel = self.acoustic_model.inference( + text, + note=note, + note_dur=note_dur, + is_slur=is_slur, + get_mel_fs2=get_mel_fs2) + logmel = self.normalizer.inverse(normalized_mel) + return logmel class DiffusionLoss(nn.Layer): - """Loss function module for DiffSinger.""" + """Loss function module for Diffusion module on DiffSinger.""" def __init__(self, use_masking: bool=True, use_weighted_masking: bool=False): @@ -1175,112 +354,3 @@ class DiffusionLoss(nn.Layer): 
mel_masks.broadcast_to(l1_loss.shape)).sum() return l1_loss - - -class DiffSinger(nn.Layer): - """DiffSinger module. - - This is a module of DiffSinger described in `DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism`._ - .. _`DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism`: - https://arxiv.org/pdf/2105.02446.pdf - - Args: - - Returns: - - """ - - def __init__( - self, - fs2_config, - denoiser_config, - diffusion_config, ): - - assert check_argument_types() - super().__init__() - self.fs2 = FastSpeech2MIDI(**fs2_config) - denoiser = WaveNetDenoiser(**denoiser_config) - self.diffusion = GaussianDiffusion(denoiser, **diffusion_config) - - def forward( - self, - text: paddle.Tensor, - note: paddle.Tensor, - note_dur: paddle.Tensor, - is_slur: paddle.Tensor, - text_lengths: paddle.Tensor, - speech: paddle.Tensor, - speech_lengths: paddle.Tensor, - durations: paddle.Tensor, - pitch: paddle.Tensor, - energy: paddle.Tensor, - tone_id: paddle.Tensor=None, - spk_emb: paddle.Tensor=None, - spk_id: paddle.Tensor=None, - train_fs2: bool=True, - ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: - - before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.fs2( - text=text, - note=note, - note_dur=note_dur, - is_slur=is_slur, - text_lengths=text_lengths, - speech=speech, - speech_lengths=speech_lengths, - durations=durations, - pitch=pitch, - energy=energy, - spk_id=spk_id, - spk_emb=spk_emb) - cond_fs2, mel_masks = self.fs2.encoder_infer_batch( - text=text, - note=note, - note_dur=note_dur, - is_slur=is_slur, - text_lengths=text_lengths, - speech_lengths=speech_lengths, - ds=durations, - ps=pitch, - es=energy) - cond_fs2 = cond_fs2.transpose((0, 2, 1)) - mel = self.diffusion(speech.transpose((0, 2, 1)), cond_fs2.detach()) - - if train_fs2: - return before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits - else: - return mel[0], mel_masks - - def inference( - self, - text: paddle.Tensor, - note: paddle.Tensor, - note_dur: paddle.Tensor, - is_slur: paddle.Tensor, - get_mel_fs2: bool=False, ): - mel_fs2, _, _, _ = self.fs2.inference(text, note, note_dur, is_slur) - if get_mel_fs2: - return mel_fs2 - mel_fs2 = mel_fs2.unsqueeze(0).transpose((0, 2, 1)) - cond_fs2 = self.fs2.encoder_infer(text, note, note_dur, is_slur) - cond_fs2 = cond_fs2.transpose((0, 2, 1)) - mel, _ = self.diffusion(mel_fs2, cond_fs2) - mel = mel.transpose((0, 2, 1)) - return mel[0] - - -class DiffSingerInference(nn.Layer): - def __init__(self, normalizer, model): - super().__init__() - self.normalizer = normalizer - self.acoustic_model = model - - def forward(self, text, note, note_dur, is_slur, get_mel_fs2: bool=False): - normalized_mel = self.acoustic_model.inference( - text, - note=note, - note_dur=note_dur, - is_slur=is_slur, - get_mel_fs2=get_mel_fs2) - logmel = self.normalizer.inverse(normalized_mel) - return logmel diff --git a/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py b/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py index 3a52c592d..7ffe3198f 100644 --- a/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py +++ b/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py @@ -44,8 +44,9 @@ class DiffSingerUpdater(StandardUpdater): fs2_train_start_steps: int=0, ds_train_start_steps: int=160000, output_dir: Path=None, ): - super().__init__(model, optimizers, dataloader, init_state=None) + self.model = model._layers if isinstance(model, + paddle.DataParallel) else model self.optimizers = optimizers 
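+        # training is split into two stages: only the FastSpeech2 module is
+        # updated between fs2_train_start_steps and ds_train_start_steps, and
+        # only the diffusion module afterwards, so the per-stage optimizers
+        # are kept here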
self.optimizer_fs2: Optimizer = optimizers['fs2'] @@ -79,7 +80,7 @@ class DiffSingerUpdater(StandardUpdater): if spk_emb is not None: spk_id = None - # fastspeech2 + # only train fastspeech2 module firstly if self.state.iteration > self.fs2_train_start_steps and self.state.iteration < self.ds_train_start_steps: before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model( text=batch["text"], @@ -133,8 +134,9 @@ class DiffSingerUpdater(StandardUpdater): self.msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_dict.items()) + # Then only train diffusion module, freeze fastspeech2 parameters. if self.state.iteration > self.ds_train_start_steps: - for param in self.model._layers.fs2.parameters(): + for param in self.model.fs2.parameters(): param.trainable = False mel, mel_masks = self.model( @@ -183,12 +185,12 @@ class DiffSingerEvaluator(StandardEvaluator): dataloader: DataLoader, output_dir: Path=None, ): super().__init__(model, dataloader) - self.model = model + self.model = model._layers if isinstance(model, + paddle.DataParallel) else model self.criterions = criterions self.criterion_fs2 = criterions['fs2'] self.criterion_ds = criterions['ds'] - self.dataloader = dataloader log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) @@ -206,6 +208,7 @@ class DiffSingerEvaluator(StandardEvaluator): if spk_emb is not None: spk_id = None + # Here show diffsinger eval mel, mel_masks = self.model( text=batch["text"], note=batch["note"], @@ -227,14 +230,13 @@ class DiffSingerEvaluator(StandardEvaluator): ref_mels=batch["speech"], out_mels=mel, mel_masks=mel_masks, ) - loss_ds = l1_loss_ds - report("train/loss_ds", float(loss_ds)) - report("train/l1_loss_ds", float(l1_loss_ds)) + report("eval/loss_ds", float(loss_ds)) + report("eval/l1_loss_ds", float(l1_loss_ds)) losses_dict["l1_loss_ds"] = float(l1_loss_ds) losses_dict["loss_ds"] = float(loss_ds) self.msg += ', '.join('{}: {:>.6f}'.format(k, v) - for k, v in losses_dict.items()) + for k, v in losses_dict.items()) self.logger.info(self.msg) diff --git a/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py b/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py new file mode 100644 index 000000000..85f9a9550 --- /dev/null +++ b/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py @@ -0,0 +1,625 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +from typing import Any +from typing import Dict +from typing import Sequence +from typing import Tuple + +import paddle +from paddle import nn +from typeguard import check_argument_types + +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 +from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask +from paddlespeech.t2s.modules.nets_utils import make_pad_mask +from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictorLoss + + +class FastSpeech2MIDI(FastSpeech2): + """The Fastspeech2 module of DiffSinger. 
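+
+    Compared with the vanilla FastSpeech2, the music score inputs (note ids,
+    note durations in seconds and slur flags) are embedded and fed to the
+    encoder together with the phone ids, and the encoder output can be reused
+    as the condition of the diffusion denoiser.
+
+    Example (illustrative only; variable names, shapes and config values are
+    assumptions, not part of the released recipe):
+
+        >>> model = FastSpeech2MIDI(
+        ...     idim=vocab_size, odim=80, fastspeech2_config=fs2_cfg)
+        >>> mel, durs, pitch, energy = model.inference(
+        ...     text, note, note_dur, is_slur)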
+    """
+
+    def __init__(
+            self,
+            # fastspeech2 network structure related
+            idim: int,
+            odim: int,
+            fastspeech2_config: Dict[str, Any],
+            # note emb
+            note_num: int=300,
+            # is_slur emb
+            is_slur_num: int=2, ):
+        """Initialize the FastSpeech2 module for SVS.
+        Args:
+            idim (int):
+                Dimension of the inputs.
+            odim (int):
+                Dimension of the outputs.
+            fastspeech2_config (Dict):
+                The config of the FastSpeech2 module in the DiffSinger model.
+            note_num (Optional[int]):
+                Number of notes. If not None, assume that the
+                note_ids will be provided as the input and use note_embedding_table.
+            is_slur_num (Optional[int]):
+                Number of slur classes. If not None, assume that the
+                is_slur_ids will be provided as the input.
+
+        """
+        assert check_argument_types()
+        super().__init__(idim=idim, odim=odim, **fastspeech2_config)
+
+        self.note_embed_dim = self.is_slur_embed_dim = fastspeech2_config[
+            "adim"]
+
+        if note_num is not None:
+            self.note_embedding_table = nn.Embedding(
+                num_embeddings=note_num,
+                embedding_dim=self.note_embed_dim,
+                padding_idx=self.padding_idx)
+            self.note_dur_layer = nn.Linear(1, self.note_embed_dim)
+
+        if is_slur_num is not None:
+            self.is_slur_embedding_table = nn.Embedding(
+                num_embeddings=is_slur_num,
+                embedding_dim=self.is_slur_embed_dim,
+                padding_idx=self.padding_idx)
+
+    def forward(
+            self,
+            text: paddle.Tensor,
+            note: paddle.Tensor,
+            note_dur: paddle.Tensor,
+            is_slur: paddle.Tensor,
+            text_lengths: paddle.Tensor,
+            speech: paddle.Tensor,
+            speech_lengths: paddle.Tensor,
+            durations: paddle.Tensor,
+            pitch: paddle.Tensor,
+            energy: paddle.Tensor,
+            spk_emb: paddle.Tensor=None,
+            spk_id: paddle.Tensor=None,
+    ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
+        """Calculate forward propagation.
+
+        Args:
+            text(Tensor(int64)):
+                Batch of padded token (phone) ids (B, Tmax).
+            note(Tensor(int64)):
+                Batch of padded note (element in music score) ids (B, Tmax).
+            note_dur(Tensor(float32)):
+                Batch of padded note durations in seconds (element in music score) (B, Tmax).
+            is_slur(Tensor(int64)):
+                Batch of padded slur (element in music score) ids (B, Tmax).
+            text_lengths(Tensor(int64)):
+                Batch of phone lengths of each input (B,).
+            speech(Tensor[float32]):
+                Batch of padded target features (e.g. mel) (B, Lmax, odim).
+            speech_lengths(Tensor(int64)):
+                Batch of the lengths of each target feature (B,).
+            durations(Tensor(int64)):
+                Batch of padded token durations in frames (B, Tmax).
+            pitch(Tensor[float32]):
+                Batch of padded frame-averaged pitch (B, Lmax, 1).
+            energy(Tensor[float32]):
+                Batch of padded frame-averaged energy (B, Lmax, 1).
+            spk_emb(Tensor[float32], optional):
+                Batch of speaker embeddings (B, spk_embed_dim).
+ spk_id(Tnesor[int64], optional(int64)): + Batch of speaker ids (B,) + + Returns: + + """ + xs = paddle.cast(text, 'int64') + note = paddle.cast(note, 'int64') + note_dur = paddle.cast(note_dur, 'float32') + is_slur = paddle.cast(is_slur, 'int64') + ilens = paddle.cast(text_lengths, 'int64') + olens = paddle.cast(speech_lengths, 'int64') + ds = paddle.cast(durations, 'int64') + ps = pitch + es = energy + ys = speech + olens = speech_lengths + if spk_id is not None: + spk_id = paddle.cast(spk_id, 'int64') + # forward propagation + before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits = self._forward( + xs, + note, + note_dur, + is_slur, + ilens, + olens, + ds, + ps, + es, + is_inference=False, + spk_emb=spk_emb, + spk_id=spk_id, ) + # modify mod part of groundtruth + if self.reduction_factor > 1: + olens = olens - olens % self.reduction_factor + max_olen = max(olens) + ys = ys[:, :max_olen] + + return before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits + + def _forward( + self, + xs: paddle.Tensor, + note: paddle.Tensor, + note_dur: paddle.Tensor, + is_slur: paddle.Tensor, + ilens: paddle.Tensor, + olens: paddle.Tensor=None, + ds: paddle.Tensor=None, + ps: paddle.Tensor=None, + es: paddle.Tensor=None, + is_inference: bool=False, + is_train_diffusion: bool=False, + return_after_enc=False, + alpha: float=1.0, + spk_emb=None, + spk_id=None, ) -> Sequence[paddle.Tensor]: + # forward encoder + x_masks = self._source_mask(ilens) + note_emb = self.note_embedding_table(note) + note_dur_emb = self.note_dur_layer(paddle.unsqueeze(note_dur, axis=-1)) + is_slur_emb = self.is_slur_embedding_table(is_slur) + + # (B, Tmax, adim) + hs, _ = self.encoder( + xs, + x_masks, + note_emb, + note_dur_emb, + is_slur_emb, ) + + if self.spk_num and self.enable_speaker_classifier and not is_inference: + hs_for_spk_cls = self.grad_reverse(hs) + spk_logits = self.speaker_classifier(hs_for_spk_cls, ilens) + else: + spk_logits = None + + # integrate speaker embedding + if self.spk_embed_dim is not None: + # spk_emb has a higher priority than spk_id + if spk_emb is not None: + hs = self._integrate_with_spk_embed(hs, spk_emb) + elif spk_id is not None: + spk_emb = self.spk_embedding_table(spk_id) + hs = self._integrate_with_spk_embed(hs, spk_emb) + + # forward duration predictor and variance predictors + d_masks = make_pad_mask(ilens) + if olens is not None: + pitch_masks = make_pad_mask(olens).unsqueeze(-1) + else: + pitch_masks = None + + # inference for decoder input for duffusion + if is_train_diffusion: + hs = self.length_regulator(hs, ds, is_inference=False) + p_outs = self.pitch_predictor(hs.detach(), pitch_masks) + e_outs = self.energy_predictor(hs.detach(), pitch_masks) + p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose( + (0, 2, 1)) + e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose( + (0, 2, 1)) + hs = hs + e_embs + p_embs + + elif is_inference: + # (B, Tmax) + if ds is not None: + d_outs = ds + else: + d_outs = self.duration_predictor.inference(hs, d_masks) + + # (B, Lmax, adim) + hs = self.length_regulator(hs, d_outs, alpha, is_inference=True) + + if ps is not None: + p_outs = ps + else: + if self.stop_gradient_from_pitch_predictor: + p_outs = self.pitch_predictor(hs.detach(), pitch_masks) + else: + p_outs = self.pitch_predictor(hs, pitch_masks) + + if es is not None: + e_outs = es + else: + if self.stop_gradient_from_energy_predictor: + e_outs = self.energy_predictor(hs.detach(), pitch_masks) + else: + e_outs = self.energy_predictor(hs, pitch_masks) + 
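+            # project the predicted (or provided) pitch/energy back to the
+            # hidden size (adim) and add them to the length-regulated encoder
+            # states before decoding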
+ p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose( + (0, 2, 1)) + e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose( + (0, 2, 1)) + hs = hs + e_embs + p_embs + + # training + else: + d_outs = self.duration_predictor(hs, d_masks) + # (B, Lmax, adim) + hs = self.length_regulator(hs, ds, is_inference=False) + if self.stop_gradient_from_pitch_predictor: + p_outs = self.pitch_predictor(hs.detach(), pitch_masks) + else: + p_outs = self.pitch_predictor(hs, pitch_masks) + if self.stop_gradient_from_energy_predictor: + e_outs = self.energy_predictor(hs.detach(), pitch_masks) + else: + e_outs = self.energy_predictor(hs, pitch_masks) + p_embs = self.pitch_embed(ps.transpose((0, 2, 1))).transpose( + (0, 2, 1)) + e_embs = self.energy_embed(es.transpose((0, 2, 1))).transpose( + (0, 2, 1)) + hs = hs + e_embs + p_embs + + # forward decoder + if olens is not None and not is_inference: + if self.reduction_factor > 1: + olens_in = paddle.to_tensor( + [olen // self.reduction_factor for olen in olens.numpy()]) + else: + olens_in = olens + # (B, 1, T) + h_masks = self._source_mask(olens_in) + else: + h_masks = None + + if return_after_enc: + return hs, h_masks + + if self.decoder_type == 'cnndecoder': + # remove output masks for dygraph to static graph + zs = self.decoder(hs, h_masks) + before_outs = zs + else: + # (B, Lmax, adim) + zs, _ = self.decoder(hs, h_masks) + # (B, Lmax, odim) + before_outs = self.feat_out(zs).reshape( + (paddle.shape(zs)[0], -1, self.odim)) + + # postnet -> (B, Lmax//r * r, odim) + if self.postnet is None: + after_outs = before_outs + else: + after_outs = before_outs + self.postnet( + before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) + + return before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits + + def encoder_infer( + self, + text: paddle.Tensor, + note: paddle.Tensor, + note_dur: paddle.Tensor, + is_slur: paddle.Tensor, + alpha: float=1.0, + spk_emb=None, + spk_id=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + xs = paddle.cast(text, 'int64').unsqueeze(0) + note = paddle.cast(note, 'int64').unsqueeze(0) + note_dur = paddle.cast(note_dur, 'float32').unsqueeze(0) + is_slur = paddle.cast(is_slur, 'int64').unsqueeze(0) + # setup batch axis + ilens = paddle.shape(xs)[1] + + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) + + # (1, L, odim) + # use *_ to avoid bug in dygraph to static graph + hs, _ = self._forward( + xs, + note, + note_dur, + is_slur, + ilens, + is_inference=True, + return_after_enc=True, + alpha=alpha, + spk_emb=spk_emb, + spk_id=spk_id, ) + return hs + + # get encoder output for diffusion training + def encoder_infer_batch( + self, + text: paddle.Tensor, + note: paddle.Tensor, + note_dur: paddle.Tensor, + is_slur: paddle.Tensor, + text_lengths: paddle.Tensor, + speech_lengths: paddle.Tensor, + ds: paddle.Tensor=None, + ps: paddle.Tensor=None, + es: paddle.Tensor=None, + alpha: float=1.0, + spk_emb=None, + spk_id=None, ) -> Tuple[paddle.Tensor, paddle.Tensor]: + + xs = paddle.cast(text, 'int64') + note = paddle.cast(note, 'int64') + note_dur = paddle.cast(note_dur, 'float32') + is_slur = paddle.cast(is_slur, 'int64') + ilens = paddle.cast(text_lengths, 'int64') + olens = paddle.cast(speech_lengths, 'int64') + + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) + + # (1, L, odim) + # use *_ to avoid bug in dygraph to static graph + hs, h_masks = self._forward( + xs, + note, + note_dur, + is_slur, + ilens, + olens, + ds, + ps, + es, + return_after_enc=True, + is_train_diffusion=True, + 
alpha=alpha, + spk_emb=spk_emb, + spk_id=spk_id, ) + return hs, h_masks + + def inference( + self, + text: paddle.Tensor, + note: paddle.Tensor, + note_dur: paddle.Tensor, + is_slur: paddle.Tensor, + durations: paddle.Tensor=None, + pitch: paddle.Tensor=None, + energy: paddle.Tensor=None, + alpha: float=1.0, + use_teacher_forcing: bool=False, + spk_emb=None, + spk_id=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Generate the sequence of features given the sequences of characters. + + Args: + text(Tensor(int64)): + Input sequence of characters (T,). + note(Tensor(int64)): + Input note (element in music score) ids (T,). + note_dur(Tensor(float32)): + Input note durations in seconds (element in music score) (T,). + is_slur(Tensor(int64)): + Input slur (element in music score) ids (T,). + durations(Tensor, optional (int64)): + Groundtruth of duration (T,). + pitch(Tensor, optional): + Groundtruth of token-averaged pitch (T, 1). + energy(Tensor, optional): + Groundtruth of token-averaged energy (T, 1). + alpha(float, optional): + Alpha to control the speed. + use_teacher_forcing(bool, optional): + Whether to use teacher forcing. + If true, groundtruth of duration, pitch and energy will be used. + spk_emb(Tensor, optional, optional): + peaker embedding vector (spk_embed_dim,). (Default value = None) + spk_id(Tensor, optional(int64), optional): + spk ids (1,). (Default value = None) + + Returns: + + """ + xs = paddle.cast(text, 'int64').unsqueeze(0) + note = paddle.cast(note, 'int64').unsqueeze(0) + note_dur = paddle.cast(note_dur, 'float32').unsqueeze(0) + is_slur = paddle.cast(is_slur, 'int64').unsqueeze(0) + d, p, e = durations, pitch, energy + # setup batch axis + ilens = paddle.shape(xs)[1] + + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) + + if use_teacher_forcing: + # use groundtruth of duration, pitch, and energy + ds = d.unsqueeze(0) if d is not None else None + ps = p.unsqueeze(0) if p is not None else None + es = e.unsqueeze(0) if e is not None else None + + # (1, L, odim) + _, outs, d_outs, p_outs, e_outs, _ = self._forward( + xs, + note, + note_dur, + is_slur, + ilens, + ds=ds, + ps=ps, + es=es, + spk_emb=spk_emb, + spk_id=spk_id, + is_inference=True) + else: + # (1, L, odim) + _, outs, d_outs, p_outs, e_outs, _ = self._forward( + xs, + note, + note_dur, + is_slur, + ilens, + is_inference=True, + alpha=alpha, + spk_emb=spk_emb, + spk_id=spk_id, ) + + return outs[0], d_outs[0], p_outs[0], e_outs[0] + + +class FastSpeech2MIDILoss(nn.Layer): + """Loss function module for DiffSinger.""" + + def __init__(self, use_masking: bool=True, + use_weighted_masking: bool=False): + """Initialize feed-forward Transformer loss module. + Args: + use_masking (bool): + Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool): + Whether to weighted masking in loss calculation. 
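+
+        Note:
+            use_masking and use_weighted_masking must not both be True;
+            passing both as True raises an assertion error.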
+ """ + assert check_argument_types() + super().__init__() + + assert (use_masking != use_weighted_masking) or not use_masking + self.use_masking = use_masking + self.use_weighted_masking = use_weighted_masking + + # define criterions + reduction = "none" if self.use_weighted_masking else "mean" + self.l1_criterion = nn.L1Loss(reduction=reduction) + self.mse_criterion = nn.MSELoss(reduction=reduction) + self.duration_criterion = DurationPredictorLoss(reduction=reduction) + self.ce_criterion = nn.CrossEntropyLoss() + + def forward( + self, + after_outs: paddle.Tensor, + before_outs: paddle.Tensor, + d_outs: paddle.Tensor, + p_outs: paddle.Tensor, + e_outs: paddle.Tensor, + ys: paddle.Tensor, + ds: paddle.Tensor, + ps: paddle.Tensor, + es: paddle.Tensor, + ilens: paddle.Tensor, + olens: paddle.Tensor, + spk_logits: paddle.Tensor=None, + spk_ids: paddle.Tensor=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor, + paddle.Tensor, ]: + """Calculate forward propagation. + + Args: + after_outs(Tensor): + Batch of outputs after postnets (B, Lmax, odim). + before_outs(Tensor): + Batch of outputs before postnets (B, Lmax, odim). + d_outs(Tensor): + Batch of outputs of duration predictor (B, Tmax). + p_outs(Tensor): + Batch of outputs of pitch predictor (B, Lmax, 1). + e_outs(Tensor): + Batch of outputs of energy predictor (B, Lmax, 1). + ys(Tensor): + Batch of target features (B, Lmax, odim). + ds(Tensor): + Batch of durations (B, Tmax). + ps(Tensor): + Batch of target frame-averaged pitch (B, Lmax, 1). + es(Tensor): + Batch of target frame-averaged energy (B, Lmax, 1). + ilens(Tensor): + Batch of the lengths of each input (B,). + olens(Tensor): + Batch of the lengths of each target (B,). + spk_logits(Option[Tensor]): + Batch of outputs after speaker classifier (B, Lmax, num_spk) + spk_ids(Option[Tensor]): + Batch of target spk_id (B,) + + + Returns: + + + """ + speaker_loss = 0.0 + # apply mask to remove padded part + if self.use_masking: + out_masks = make_non_pad_mask(olens).unsqueeze(-1) + before_outs = before_outs.masked_select( + out_masks.broadcast_to(before_outs.shape)) + if after_outs is not None: + after_outs = after_outs.masked_select( + out_masks.broadcast_to(after_outs.shape)) + ys = ys.masked_select(out_masks.broadcast_to(ys.shape)) + duration_masks = make_non_pad_mask(ilens) + d_outs = d_outs.masked_select( + duration_masks.broadcast_to(d_outs.shape)) + ds = ds.masked_select(duration_masks.broadcast_to(ds.shape)) + pitch_masks = out_masks + p_outs = p_outs.masked_select( + pitch_masks.broadcast_to(p_outs.shape)) + e_outs = e_outs.masked_select( + pitch_masks.broadcast_to(e_outs.shape)) + ps = ps.masked_select(pitch_masks.broadcast_to(ps.shape)) + es = es.masked_select(pitch_masks.broadcast_to(es.shape)) + + if spk_logits is not None and spk_ids is not None: + batch_size = spk_ids.shape[0] + spk_ids = paddle.repeat_interleave(spk_ids, spk_logits.shape[1], + None) + spk_logits = paddle.reshape(spk_logits, + [-1, spk_logits.shape[-1]]) + mask_index = spk_logits.abs().sum(axis=1) != 0 + spk_ids = spk_ids[mask_index] + spk_logits = spk_logits[mask_index] + + # calculate loss + l1_loss = self.l1_criterion(before_outs, ys) + if after_outs is not None: + l1_loss += self.l1_criterion(after_outs, ys) + duration_loss = self.duration_criterion(d_outs, ds) + pitch_loss = self.mse_criterion(p_outs, ps) + energy_loss = self.mse_criterion(e_outs, es) + + if spk_logits is not None and spk_ids is not None: + speaker_loss = self.ce_criterion(spk_logits, spk_ids) / batch_size + 
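+        # spk_logits comes from the speaker classifier applied after the
+        # gradient reversal layer in _forward; its cross-entropy is scaled
+        # down by the batch size before being returned as speaker_loss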
+ # make weighted mask and apply it + if self.use_weighted_masking: + out_masks = make_non_pad_mask(olens).unsqueeze(-1) + out_weights = out_masks.cast(dtype=paddle.float32) / out_masks.cast( + dtype=paddle.float32).sum( + axis=1, keepdim=True) + out_weights /= ys.shape[0] * ys.shape[2] + duration_masks = make_non_pad_mask(ilens) + duration_weights = (duration_masks.cast(dtype=paddle.float32) / + duration_masks.cast(dtype=paddle.float32).sum( + axis=1, keepdim=True)) + duration_weights /= ds.shape[0] + + # apply weight + l1_loss = l1_loss.multiply(out_weights) + l1_loss = l1_loss.masked_select( + out_masks.broadcast_to(l1_loss.shape)).sum() + duration_loss = (duration_loss.multiply(duration_weights) + .masked_select(duration_masks).sum()) + pitch_masks = out_masks + pitch_weights = out_weights + pitch_loss = pitch_loss.multiply(pitch_weights) + pitch_loss = pitch_loss.masked_select( + pitch_masks.broadcast_to(pitch_loss.shape)).sum() + energy_loss = energy_loss.multiply(pitch_weights) + energy_loss = energy_loss.masked_select( + pitch_masks.broadcast_to(energy_loss.shape)).sum() + + return l1_loss, duration_loss, pitch_loss, energy_loss, speaker_loss diff --git a/paddlespeech/t2s/modules/transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py index 91848de76..0fd94689d 100644 --- a/paddlespeech/t2s/modules/transformer/encoder.py +++ b/paddlespeech/t2s/modules/transformer/encoder.py @@ -15,6 +15,7 @@ from typing import List from typing import Union +import paddle from paddle import nn from paddlespeech.t2s.modules.activation import get_activation @@ -390,20 +391,26 @@ class TransformerEncoder(BaseEncoder): padding_idx=padding_idx, encoder_type="transformer") - def forward(self, xs, masks, note_emb=None, note_dur_emb=None, is_slur_emb=None, scale=16): + def forward(self, + xs: paddle.Tensor, + masks: paddle.Tensor, + note_emb: paddle.Tensor=None, + note_dur_emb: paddle.Tensor=None, + is_slur_emb: paddle.Tensor=None, + scale: int=16): """Encoder input sequence. Args: xs(Tensor): Input tensor (#batch, time, idim). + masks(Tensor): + Mask tensor (#batch, 1, time). note_emb(Tensor): Input tensor (#batch, time, attention_dim). note_dur_emb(Tensor): Input tensor (#batch, time, attention_dim). is_slur_emb(Tensor): Input tensor (#batch, time, attention_dim). - masks(Tensor): - Mask tensor (#batch, 1, time). Returns: Tensor: