add multi-band melgan finetune scripts

pull/995/head
TianYuan 3 years ago
parent dc7daa2a61
commit a6ac497f8e

@ -13,7 +13,6 @@
# limitations under the License. # limitations under the License.
import argparse import argparse
from pathlib import Path from pathlib import Path
from typing import Union
import numpy as np import numpy as np
import paddle import paddle
@ -23,129 +22,12 @@ from yacs.config import CfgNode
from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
from paddlespeech.t2s.models.parallel_wavegan import PWGInference from paddlespeech.t2s.models.parallel_wavegan import PWGInference
from paddlespeech.t2s.modules.normalizer import ZScore from paddlespeech.t2s.modules.normalizer import ZScore
class StyleFastSpeech2Inference(FastSpeech2Inference):
def __init__(self, normalizer, model, pitch_stats_path, energy_stats_path):
super().__init__(normalizer, model)
pitch_mean, pitch_std = np.load(pitch_stats_path)
self.pitch_mean = paddle.to_tensor(pitch_mean)
self.pitch_std = paddle.to_tensor(pitch_std)
energy_mean, energy_std = np.load(energy_stats_path)
self.energy_mean = paddle.to_tensor(energy_mean)
self.energy_std = paddle.to_tensor(energy_std)
def denorm(self, data, mean, std):
return data * std + mean
def norm(self, data, mean, std):
return (data - mean) / std
def forward(self,
text: paddle.Tensor,
durations: Union[paddle.Tensor, np.ndarray]=None,
durations_scale: Union[int, float]=None,
durations_bias: Union[int, float]=None,
pitch: Union[paddle.Tensor, np.ndarray]=None,
pitch_scale: Union[int, float]=None,
pitch_bias: Union[int, float]=None,
energy: Union[paddle.Tensor, np.ndarray]=None,
energy_scale: Union[int, float]=None,
energy_bias: Union[int, float]=None,
robot: bool=False):
"""
Parameters
----------
text : Tensor(int64)
Input sequence of characters (T,).
speech : Tensor, optional
Feature sequence to extract style (N, idim).
durations : paddle.Tensor/np.ndarray, optional (int64)
Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
durations_scale: int/float, optional
durations_bias: int/float, optional
pitch : paddle.Tensor/np.ndarray, optional
Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
pitch_scale: int/float, optional
In denormed HZ domain.
pitch_bias: int/float, optional
In denormed HZ domain.
energy : paddle.Tensor/np.ndarray, optional
Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
energy_scale: int/float, optional
In denormed domain.
energy_bias: int/float, optional
In denormed domain.
robot : bool, optional
Weather output robot style
Returns
----------
Tensor
Output sequence of features (L, odim).
"""
normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
text, durations=None, pitch=None, energy=None)
# priority: groundtruth > scale/bias > previous output
# set durations
if isinstance(durations, np.ndarray):
durations = paddle.to_tensor(durations)
elif isinstance(durations, paddle.Tensor):
durations = durations
elif durations_scale or durations_bias:
durations_scale = durations_scale if durations_scale is not None else 1
durations_bias = durations_bias if durations_bias is not None else 0
durations = durations_scale * d_outs + durations_bias
else:
durations = d_outs
if robot:
# set normed pitch to zeros have the same effect with set denormd ones to mean
pitch = paddle.zeros(p_outs.shape)
# set pitch, can overwrite robot set
if isinstance(pitch, np.ndarray):
pitch = paddle.to_tensor(pitch)
elif isinstance(pitch, paddle.Tensor):
pitch = pitch
elif pitch_scale or pitch_bias:
pitch_scale = pitch_scale if pitch_scale is not None else 1
pitch_bias = pitch_bias if pitch_bias is not None else 0
p_Hz = paddle.exp(
self.denorm(p_outs, self.pitch_mean, self.pitch_std))
p_HZ = pitch_scale * p_Hz + pitch_bias
pitch = self.norm(paddle.log(p_HZ), self.pitch_mean, self.pitch_std)
else:
pitch = p_outs
# set energy
if isinstance(energy, np.ndarray):
energy = paddle.to_tensor(energy)
elif isinstance(energy, paddle.Tensor):
energy = energy
elif energy_scale or energy_bias:
energy_scale = energy_scale if energy_scale is not None else 1
energy_bias = energy_bias if energy_bias is not None else 0
e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std)
e_dnorm = energy_scale * e_dnorm + energy_bias
energy = self.norm(e_dnorm, self.energy_mean, self.energy_std)
else:
energy = e_outs
normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
text,
durations=durations,
pitch=pitch,
energy=energy,
use_teacher_forcing=True)
logmel = self.normalizer.inverse(normalized_mel)
return logmel
def evaluate(args, fastspeech2_config, pwg_config): def evaluate(args, fastspeech2_config, pwg_config):
# construct dataset for evaluation # construct dataset for evaluation

@ -0,0 +1,139 @@
# This is the hyperparameter configuration file for MelGAN.
# Please make sure this is adjusted for the CSMSC dataset. If you want to
# apply to the other dataset, you might need to carefully change some parameters.
# This configuration requires ~ 8GB memory and will finish within 7 days on Titan V.
# This configuration is based on full-band MelGAN but the hop size and sampling
# rate is different from the paper (16kHz vs 24kHz). The number of iteraions
# is not shown in the paper so currently we train 1M iterations (not sure enough
# to converge). The optimizer setting is based on @dathudeptrai advice.
# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs: 24000 # Sampling rate.
n_fft: 2048 # FFT size. (in samples)
n_shift: 300 # Hop size. (in samples)
win_length: 1200 # Window length. (in samples)
# If set to null, it will be the same as fft_size.
window: "hann" # Window function.
n_mels: 80 # Number of mel basis.
fmin: 80 # Minimum freq in mel basis calculation. (Hz)
fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
generator_params:
in_channels: 80 # Number of input channels.
out_channels: 4 # Number of output channels.
kernel_size: 7 # Kernel size of initial and final conv layers.
channels: 384 # Initial number of channels for conv layers.
upsample_scales: [5, 5, 3] # List of Upsampling scales.
stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack.
stacks: 4 # Number of stacks in a single residual stack module.
use_weight_norm: True # Whether to use weight normalization.
use_causal_conv: False # Whether to use causal convolution.
use_final_nonlinear_activation: True
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
discriminator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
scales: 3 # Number of multi-scales.
downsample_pooling: "AvgPool1D" # Pooling type for the input downsampling.
downsample_pooling_params: # Parameters of the above pooling function.
kernel_size: 4
stride: 2
padding: 1
exclusive: True
kernel_sizes: [5, 3] # List of kernel size.
channels: 16 # Number of channels of the initial conv layer.
max_downsample_channels: 512 # Maximum number of channels of downsampling layers.
downsample_scales: [4, 4, 4] # List of downsampling scales.
nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
nonlinear_activation_params: # Parameters of nonlinear activation function.
negative_slope: 0.2
use_weight_norm: True # Whether to use weight norm.
###########################################################
# STFT LOSS SETTING #
###########################################################
use_stft_loss: true
stft_loss_params:
fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
window: "hann" # Window function for STFT-based loss
use_subband_stft_loss: true
subband_stft_loss_params:
fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss
win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
window: "hann" # Window function for STFT-based loss
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
use_feat_match_loss: false # Whether to use feature matching loss.
lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 64 # Batch size.
batch_max_steps: 16200 # Length of each audio in batch. Make sure dividable by hop_size.
num_workers: 2 # Number of workers in DataLoader.
###########################################################
# OPTIMIZER & SCHEDULER SETTING #
###########################################################
generator_optimizer_params:
epsilon: 1.0e-7 # Generator's epsilon.
weight_decay: 0.0 # Generator's weight decay coefficient.
generator_grad_norm: -1 # Generator's gradient norm.
generator_scheduler_params:
learning_rate: 1.0e-3 # Generator's learning rate.
gamma: 0.5 # Generator's scheduler gamma.
milestones: # At each milestone, lr will be multiplied by gamma.
- 100000
- 200000
- 300000
- 400000
- 500000
- 600000
discriminator_optimizer_params:
epsilon: 1.0e-7 # Discriminator's epsilon.
weight_decay: 0.0 # Discriminator's weight decay coefficient.
discriminator_grad_norm: -1 # Discriminator's gradient norm.
discriminator_scheduler_params:
learning_rate: 1.0e-3 # Discriminator's learning rate.
gamma: 0.5 # Discriminator's scheduler gamma.
milestones: # At each milestone, lr will be multiplied by gamma.
- 100000
- 200000
- 300000
- 400000
- 500000
- 600000
###########################################################
# INTERVAL SETTING #
###########################################################
discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator.
train_max_steps: 1200000 # Number of training steps.
save_interval_steps: 1000 # Interval steps to save checkpoint.
eval_interval_steps: 1000 # Interval steps to evaluate the network.
###########################################################
# OTHER SETTING #
###########################################################
num_snapshots: 10 # max number of snapshots to keep while training
seed: 42 # random seed for paddle, random, and np.random

@ -0,0 +1,65 @@
#!/bin/bash
source path.sh
gpus=0
stage=0
stop_stage=100
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gt_duration_mel.py \
--fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
--fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
--fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
--dur-file=durations.txt \
--output-dir=dump_finetune \
--phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 local/link_wav.py \
--old-dump-dir=dump \
--dump-dir=dump_finetune
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# get features' stats(mean and std)
echo "Get features' stats ..."
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
--metadata=dump_finetune/train/raw/metadata.jsonl \
--field-name="feats"
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize, dev and test should use train's stats
echo "Normalize ..."
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump_finetune/train/raw/metadata.jsonl \
--dumpdir=dump_finetune/train/norm \
--stats=dump_finetune/train/feats_stats.npy
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump_finetune/dev/raw/metadata.jsonl \
--dumpdir=dump_finetune/dev/norm \
--stats=dump_finetune/train/feats_stats.npy
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump_finetune/test/raw/metadata.jsonl \
--dumpdir=dump_finetune/test/norm \
--stats=dump_finetune/train/feats_stats.npy
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} \
FLAGS_cudnn_exhaustive_search=true \
FLAGS_conv_workspace_size_limit=4000 \
python ${BIN_DIR}/train.py \
--train-metadata=dump_finetune/train/norm/metadata.jsonl \
--dev-metadata=dump_finetune/dev/norm/metadata.jsonl \
--config=conf/finetune.yaml \
--output-dir=exp/finetune \
--ngpu=1
fi

@ -0,0 +1,85 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from operator import itemgetter
from pathlib import Path
import jsonlines
import numpy as np
def main():
# parse config and args
parser = argparse.ArgumentParser(
description="Preprocess audio and then extract features .")
parser.add_argument(
"--old-dump-dir",
default=None,
type=str,
help="directory to dump feature files.")
parser.add_argument(
"--dump-dir",
type=str,
required=True,
help="directory to finetune dump feature files.")
args = parser.parse_args()
old_dump_dir = Path(args.old_dump_dir).expanduser()
old_dump_dir = old_dump_dir.resolve()
dump_dir = Path(args.dump_dir).expanduser()
# use absolute path
dump_dir = dump_dir.resolve()
dump_dir.mkdir(parents=True, exist_ok=True)
assert old_dump_dir.is_dir()
assert dump_dir.is_dir()
for sub in ["train", "dev", "test"]:
# 把 old_dump_dir 里面的 *-wave.npy 软连接到 dump_dir 的对应位置
output_dir = dump_dir / sub
output_dir.mkdir(parents=True, exist_ok=True)
results = []
for name in os.listdir(output_dir / "raw"):
# 003918_feats.npy
utt_id = name.split("_")[0]
mel_path = output_dir / ("raw/" + name)
gen_mel = np.load(mel_path)
wave_name = utt_id + "_wave.npy"
wav = np.load(old_dump_dir / sub / ("raw/" + wave_name))
os.symlink(old_dump_dir / sub / ("raw/" + wave_name),
output_dir / ("raw/" + wave_name))
num_sample = wav.shape[0]
num_frames = gen_mel.shape[0]
wav_path = output_dir / ("raw/" + wave_name)
record = {
"utt_id": utt_id,
"num_samples": num_sample,
"num_frames": num_frames,
"feats": str(mel_path),
"wave": str(wav_path),
}
results.append(record)
results.sort(key=itemgetter("utt_id"))
with jsonlines.open(output_dir / "raw/metadata.jsonl", 'w') as writer:
for item in results:
writer.write(item)
if __name__ == "__main__":
main()

@ -110,10 +110,10 @@ class Clip(object):
if len(x) < c.shape[0] * self.hop_size: if len(x) < c.shape[0] * self.hop_size:
x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge") x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge")
elif len(x) > c.shape[0] * self.hop_size: elif len(x) > c.shape[0] * self.hop_size:
print( # print(
f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })" # f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })"
) # )
x = x[:c.shape[1] * self.hop_size] x = x[:c.shape[0] * self.hop_size]
# check the legnth is valid # check the legnth is valid
assert len(x) == c.shape[ assert len(x) == c.shape[

@ -0,0 +1,167 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# generate mels using durations.txt
# for mb melgan finetune
# 长度和原本的 mel 不一致怎么办?
import argparse
from pathlib import Path
import numpy as np
import paddle
import yaml
from yacs.config import CfgNode
from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
from paddlespeech.t2s.modules.normalizer import ZScore
def evaluate(args, fastspeech2_config):
# construct dataset for evaluation
with open(args.phones_dict, "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
phone_dict = {}
for phn, id in phn_id:
phone_dict[phn] = int(id)
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size, odim=odim, **fastspeech2_config["model"])
model.set_state_dict(
paddle.load(args.fastspeech2_checkpoint)["main_params"])
model.eval()
stat = np.load(args.fastspeech2_stat)
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
fastspeech2_normalizer = ZScore(mu, std)
fastspeech2_inference = StyleFastSpeech2Inference(fastspeech2_normalizer,
model)
fastspeech2_inference.eval()
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
sentences, speaker_set = get_phn_dur(args.dur_file)
merge_silence(sentences)
for i, utt_id in enumerate(sentences):
phones = sentences[utt_id][0]
durations = sentences[utt_id][1]
speaker = sentences[utt_id][2]
# 裁剪掉开头和结尾的 sil
if args.cut_sil:
if phones[0] == "sil" and len(durations) > 1:
durations = durations[1:]
phones = phones[1:]
if phones[-1] == 'sil' and len(durations) > 1:
durations = durations[:-1]
phones = phones[:-1]
# sentences[utt_id][0] = phones
# sentences[utt_id][1] = durations
phone_ids = [phone_dict[phn] for phn in phones]
phone_ids = paddle.to_tensor(np.array(phone_ids))
durations = paddle.to_tensor(np.array(durations))
# 生成的和真实的可能有 1, 2 帧的差距,但是 batch_fn 会修复
# split data into 3 sections
if args.dataset == "baker":
num_train = 9800
num_dev = 100
if i in range(0, num_train):
sub_output_dir = output_dir / ("train/raw")
elif i in range(num_train, num_train + num_dev):
sub_output_dir = output_dir / ("dev/raw")
else:
sub_output_dir = output_dir / ("test/raw")
sub_output_dir.mkdir(parents=True, exist_ok=True)
with paddle.no_grad():
mel = fastspeech2_inference(phone_ids, durations=durations)
np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)
def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(
description="Synthesize with fastspeech2 & parallel wavegan.")
parser.add_argument(
"--dataset",
default="baker",
type=str,
help="name of dataset, should in {baker, ljspeech, vctk} now")
parser.add_argument(
"--fastspeech2-config", type=str, help="fastspeech2 config file.")
parser.add_argument(
"--fastspeech2-checkpoint",
type=str,
help="fastspeech2 checkpoint to load.")
parser.add_argument(
"--fastspeech2-stat",
type=str,
help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
)
parser.add_argument(
"--phones-dict",
type=str,
default="phone_id_map.txt",
help="phone vocabulary file.")
parser.add_argument(
"--dur-file", default=None, type=str, help="path to durations.txt.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
parser.add_argument("--verbose", type=int, default=1, help="verbose.")
def str2bool(str):
return True if str.lower() == 'true' else False
parser.add_argument(
"--cut-sil",
type=str2bool,
default=True,
help="whether cut sil in the edge of audio")
args = parser.parse_args()
if args.ngpu == 0:
paddle.set_device("cpu")
elif args.ngpu > 0:
paddle.set_device("gpu")
else:
print("ngpu should >= 0 !")
with open(args.fastspeech2_config) as f:
fastspeech2_config = CfgNode(yaml.safe_load(f))
print("========Args========")
print(yaml.safe_dump(vars(args)))
print("========Config========")
print(fastspeech2_config)
evaluate(args, fastspeech2_config)
if __name__ == "__main__":
main()

@ -16,7 +16,9 @@
from typing import Dict from typing import Dict
from typing import Sequence from typing import Sequence
from typing import Tuple from typing import Tuple
from typing import Union
import numpy as np
import paddle import paddle
import paddle.nn.functional as F import paddle.nn.functional as F
from paddle import nn from paddle import nn
@ -687,6 +689,129 @@ class FastSpeech2Inference(nn.Layer):
return logmel return logmel
class StyleFastSpeech2Inference(FastSpeech2Inference):
def __init__(self,
normalizer,
model,
pitch_stats_path=None,
energy_stats_path=None):
super().__init__(normalizer, model)
if pitch_stats_path:
pitch_mean, pitch_std = np.load(pitch_stats_path)
self.pitch_mean = paddle.to_tensor(pitch_mean)
self.pitch_std = paddle.to_tensor(pitch_std)
if energy_stats_path:
energy_mean, energy_std = np.load(energy_stats_path)
self.energy_mean = paddle.to_tensor(energy_mean)
self.energy_std = paddle.to_tensor(energy_std)
def denorm(self, data, mean, std):
return data * std + mean
def norm(self, data, mean, std):
return (data - mean) / std
def forward(self,
text: paddle.Tensor,
durations: Union[paddle.Tensor, np.ndarray]=None,
durations_scale: Union[int, float]=None,
durations_bias: Union[int, float]=None,
pitch: Union[paddle.Tensor, np.ndarray]=None,
pitch_scale: Union[int, float]=None,
pitch_bias: Union[int, float]=None,
energy: Union[paddle.Tensor, np.ndarray]=None,
energy_scale: Union[int, float]=None,
energy_bias: Union[int, float]=None,
robot: bool=False):
"""
Parameters
----------
text : Tensor(int64)
Input sequence of characters (T,).
speech : Tensor, optional
Feature sequence to extract style (N, idim).
durations : paddle.Tensor/np.ndarray, optional (int64)
Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
durations_scale: int/float, optional
durations_bias: int/float, optional
pitch : paddle.Tensor/np.ndarray, optional
Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
pitch_scale: int/float, optional
In denormed HZ domain.
pitch_bias: int/float, optional
In denormed HZ domain.
energy : paddle.Tensor/np.ndarray, optional
Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
energy_scale: int/float, optional
In denormed domain.
energy_bias: int/float, optional
In denormed domain.
robot : bool, optional
Weather output robot style
Returns
----------
Tensor
Output sequence of features (L, odim).
"""
normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
text, durations=None, pitch=None, energy=None)
# priority: groundtruth > scale/bias > previous output
# set durations
if isinstance(durations, np.ndarray):
durations = paddle.to_tensor(durations)
elif isinstance(durations, paddle.Tensor):
durations = durations
elif durations_scale or durations_bias:
durations_scale = durations_scale if durations_scale is not None else 1
durations_bias = durations_bias if durations_bias is not None else 0
durations = durations_scale * d_outs + durations_bias
else:
durations = d_outs
if robot:
# set normed pitch to zeros have the same effect with set denormd ones to mean
pitch = paddle.zeros(p_outs.shape)
# set pitch, can overwrite robot set
if isinstance(pitch, np.ndarray):
pitch = paddle.to_tensor(pitch)
elif isinstance(pitch, paddle.Tensor):
pitch = pitch
elif pitch_scale or pitch_bias:
pitch_scale = pitch_scale if pitch_scale is not None else 1
pitch_bias = pitch_bias if pitch_bias is not None else 0
p_Hz = paddle.exp(
self.denorm(p_outs, self.pitch_mean, self.pitch_std))
p_HZ = pitch_scale * p_Hz + pitch_bias
pitch = self.norm(paddle.log(p_HZ), self.pitch_mean, self.pitch_std)
else:
pitch = p_outs
# set energy
if isinstance(energy, np.ndarray):
energy = paddle.to_tensor(energy)
elif isinstance(energy, paddle.Tensor):
energy = energy
elif energy_scale or energy_bias:
energy_scale = energy_scale if energy_scale is not None else 1
energy_bias = energy_bias if energy_bias is not None else 0
e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std)
e_dnorm = energy_scale * e_dnorm + energy_bias
energy = self.norm(e_dnorm, self.energy_mean, self.energy_std)
else:
energy = e_outs
normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
text,
durations=durations,
pitch=pitch,
energy=energy,
use_teacher_forcing=True)
logmel = self.normalizer.inverse(normalized_mel)
return logmel
class FastSpeech2Loss(nn.Layer): class FastSpeech2Loss(nn.Layer):
"""Loss function module for FastSpeech2.""" """Loss function module for FastSpeech2."""

Loading…
Cancel
Save