PaddleSpeech/examples/opencpop/svs1/conf/default.yaml

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 24000          # sampling rate (Hz)
n_fft: 512         # FFT size (samples).
n_shift: 128       # Hop size (samples). ~5.33 ms at 24 kHz.
win_length: 512    # Window length (samples). ~21.33 ms at 24 kHz.
                   # If set to null, it will be the same as fft_size.
window: "hann"     # Window function.

# Only used for feats_type != raw
fmin: 30           # Minimum frequency of Mel basis.
fmax: 12000        # Maximum frequency of Mel basis.
n_mels: 80         # The number of mel basis.

# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80          # Minimum f0 for pitch extraction.
f0max: 750         # Maximum f0 for pitch extraction.
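# For reference only (derived from the values above, not read by the trainer):
#   frame rate  = fs / n_shift    = 24000 / 128 = 187.5 frames/s
#   hop time    = n_shift / fs    = 128 / 24000 ≈ 5.33 ms
#   window time = win_length / fs = 512 / 24000 ≈ 21.33 ms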
###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 48     # batch size
num_workers: 1     # number of data-loader workers
###########################################################
#                       MODEL SETTING                     #
###########################################################
model:
    # music score related
    note_num: 300          # number of distinct note ids
    is_slur_num: 2         # number of slur flags (slur / no slur)
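    # Note: these two values are presumably embedding-table sizes: each score
    # note id indexes a table of note_num entries, and the slur flag (slurred /
    # not slurred) a table of is_slur_num entries.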
    # fastspeech2 module options
    use_energy_pred: False # whether to use energy predictor
    use_postnet: False     # whether to use postnet
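    # With both switches off, the energy-predictor and postnet hyperparameters
    # below are still parsed but should have no effect on training.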
    # fastspeech2 module
    fastspeech2_params:
        adim: 256                                    # attention dimension
        aheads: 2                                    # number of attention heads
        elayers: 4                                   # number of encoder layers
        eunits: 1024                                 # number of encoder ff units
        dlayers: 4                                   # number of decoder layers
        dunits: 1024                                 # number of decoder ff units
        positionwise_layer_type: conv1d-linear       # type of position-wise layer
        positionwise_conv_kernel_size: 9             # kernel size of position-wise conv layer
        transformer_enc_dropout_rate: 0.1            # dropout rate for transformer encoder layer
        transformer_enc_positional_dropout_rate: 0.1 # dropout rate for transformer encoder positional encoding
        transformer_enc_attn_dropout_rate: 0.0       # dropout rate for transformer encoder attention layer
        transformer_activation_type: "gelu"          # activation function type in transformer
        encoder_normalize_before: True               # whether to apply layer normalization before each encoder block (pre-norm)
        decoder_normalize_before: True               # whether to apply layer normalization before each decoder block (pre-norm)
        reduction_factor: 1                          # reduction factor
        init_type: xavier_uniform                    # initialization type
        init_enc_alpha: 1.0                          # initial value of alpha of encoder scaled position encoding
        init_dec_alpha: 1.0                          # initial value of alpha of decoder scaled position encoding
        use_scaled_pos_enc: True                     # whether to use scaled positional encoding
        transformer_dec_dropout_rate: 0.1            # dropout rate for transformer decoder layer
        transformer_dec_positional_dropout_rate: 0.1 # dropout rate for transformer decoder positional encoding
        transformer_dec_attn_dropout_rate: 0.0       # dropout rate for transformer decoder attention layer
        duration_predictor_layers: 5                 # number of layers of duration predictor
        duration_predictor_chans: 256                # number of channels of duration predictor
        duration_predictor_kernel_size: 3            # filter size of duration predictor
        duration_predictor_dropout_rate: 0.5         # dropout rate in duration predictor
        pitch_predictor_layers: 5                    # number of conv layers in pitch predictor
        pitch_predictor_chans: 256                   # number of channels of conv layers in pitch predictor
        pitch_predictor_kernel_size: 5               # kernel size of conv layers in pitch predictor
        pitch_predictor_dropout: 0.5                 # dropout rate in pitch predictor
        pitch_embed_kernel_size: 1                   # kernel size of conv embedding layer for pitch
        pitch_embed_dropout: 0.0                     # dropout rate after conv embedding layer for pitch
        stop_gradient_from_pitch_predictor: True     # whether to stop the gradient from pitch predictor to encoder
        energy_predictor_layers: 2                   # number of conv layers in energy predictor
        energy_predictor_chans: 256                  # number of channels of conv layers in energy predictor
        energy_predictor_kernel_size: 3              # kernel size of conv layers in energy predictor
        energy_predictor_dropout: 0.5                # dropout rate in energy predictor
        energy_embed_kernel_size: 1                  # kernel size of conv embedding layer for energy
        energy_embed_dropout: 0.0                    # dropout rate after conv embedding layer for energy
        stop_gradient_from_energy_predictor: False   # whether to stop the gradient from energy predictor to encoder
        postnet_layers: 5                            # number of layers of postnet
        postnet_filts: 5                             # filter size of conv layers in postnet
        postnet_chans: 256                           # number of channels of conv layers in postnet
        postnet_dropout_rate: 0.5                    # dropout rate for postnet
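        # Sanity check (standard multi-head attention constraint): adim must be
        # divisible by aheads; here 256 / 2 = 128 dimensions per head.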
    # denoiser module
    denoiser_params:
        in_channels: 80              # number of channels of the input mel-spectrogram
        out_channels: 80             # number of channels of the output mel-spectrogram
        kernel_size: 3               # kernel size of the residual blocks inside
        layers: 20                   # number of residual blocks inside
        stacks: 5                    # number of groups to split the residual blocks into
        residual_channels: 256       # residual channels of the residual blocks
        gate_channels: 512           # gate channels of the residual blocks
        skip_channels: 256           # skip channels of the residual blocks
        aux_channels: 256            # auxiliary channels of the residual blocks
        dropout: 0.1                 # dropout rate of the residual blocks
        bias: True                   # whether to use bias in residual blocks
        use_weight_norm: False       # whether to use weight norm in all convolutions
        init_type: "kaiming_normal"  # type of weight initialization
    # diffusion module
    diffusion_params:
        num_train_timesteps: 100     # number of diffusion timesteps between clean data and noise during training
        beta_start: 0.0001           # beta start parameter for the scheduler
        beta_end: 0.06               # beta end parameter for the scheduler
        beta_schedule: "linear"      # beta schedule parameter for the scheduler
        num_max_timesteps: 100       # maximum timestep when diffusing data toward noise
        stretch: True                # whether to stretch the mel-spectrogram before diffusion
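    # Assuming the usual linear beta schedule, the noise level at timestep t is
    # interpolated between the endpoints above:
    #   beta_t = beta_start + t / (num_train_timesteps - 1) * (beta_end - beta_start)
    # i.e. beta ramps from 0.0001 up to 0.06 across the 100 training timesteps.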
###########################################################
#                      UPDATER SETTING                    #
###########################################################
fs2_updater:
    use_masking: True     # whether to apply masking for padded part in loss calculation
ds_updater:
    use_masking: True     # whether to apply masking for padded part in loss calculation
###########################################################
#                     OPTIMIZER SETTING                   #
###########################################################
# fastspeech2 optimizer
fs2_optimizer:
    optim: adam              # optimizer type
    learning_rate: 0.001     # learning rate
# diffusion optimizer
ds_optimizer_params:
    beta1: 0.9               # optimizer beta1
    beta2: 0.98              # optimizer beta2
    weight_decay: 0.0        # weight decay coefficient
# diffusion learning-rate scheduler
ds_scheduler_params:
    learning_rate: 0.001     # initial learning rate
    gamma: 0.5               # multiplicative decay factor
    step_size: 50000         # decay the learning rate every step_size steps
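    # Step decay (assumed from gamma/step_size): the diffusion learning rate is
    # multiplied by gamma every step_size steps, e.g. 0.001 -> 0.0005 at step 50000.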
ds_grad_norm: 1              # gradient clipping norm for diffusion training
###########################################################
#                     INTERVAL SETTING                    #
###########################################################
only_train_diffusion: True # Whether to freeze fastspeech2 parameters when training diffusion
ds_train_start_steps: 160000 # Step at which the diffusion module starts training.
train_max_steps: 320000 # Number of training steps.
save_interval_steps: 2000 # Interval steps to save checkpoint.
eval_interval_steps: 2000 # Interval steps to evaluate the network.
num_snapshots: 5             # Number of recent checkpoints to keep.
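# Resulting schedule: steps 0-160000 train only the FastSpeech2 module; from step
# 160000 to train_max_steps the diffusion module trains (with FastSpeech2 frozen,
# per only_train_diffusion), and a checkpoint is saved and the network evaluated
# every 2000 steps, keeping the 5 most recent snapshots.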
###########################################################
#                       OTHER SETTING                     #
###########################################################
seed: 10086        # random seed