###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
# Audio analysis parameters. Sizes are given in samples; the millisecond
# figures in the comments correspond to the sampling rate `fs`.
fs: 24000  # Sampling rate (Hz).
n_fft: 2048  # FFT size (samples).
n_shift: 300  # Hop size (samples). 12.5ms
# If win_length is set to null, it will be the same as the FFT size (n_fft).
win_length: 1200  # Window length (samples). 50ms
window: "hann"  # Window function.

# Only used for feats_type != raw
fmin: 80  # Minimum frequency of Mel basis.
fmax: 7600  # Maximum frequency of Mel basis.
n_mels: 80  # The number of mel basis.

# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80  # Minimum f0 for pitch extraction.
f0max: 400  # Maximum f0 for pitch extraction.

###########################################################
# DATA SETTING #
###########################################################
# NOTE(review): presumably the number of examples per batch — confirm
# against the data loader that reads this file.
batch_size: 64
# NOTE(review): presumably the number of data-loading worker processes —
# confirm against the data loader that reads this file.
num_workers: 2

###########################################################
# MODEL SETTING #
###########################################################
# Model hyperparameters. Every key below is a child of `model`, so it must
# stay indented; at column 0 `model` would load as null.
model:
  # Encoder / decoder stacks
  adim: 384  # attention dimension
  aheads: 2  # number of attention heads
  elayers: 4  # number of encoder layers
  eunits: 1536  # number of encoder ff units
  dlayers: 4  # number of decoder layers
  dunits: 1536  # number of decoder ff units
  positionwise_layer_type: conv1d  # type of position-wise layer
  positionwise_conv_kernel_size: 3  # kernel size of position-wise conv layer

  # Duration predictor
  duration_predictor_layers: 2  # number of layers of duration predictor
  duration_predictor_chans: 256  # number of channels of duration predictor
  duration_predictor_kernel_size: 3  # filter size of duration predictor

  # Postnet
  postnet_layers: 5  # number of layers of postnet
  postnet_filts: 5  # filter size of conv layers in postnet
  postnet_chans: 256  # number of channels of conv layers in postnet

  # Positional encoding, normalization and initialization
  use_scaled_pos_enc: true  # whether to use scaled positional encoding
  encoder_normalize_before: true  # whether to perform layer normalization before the input
  decoder_normalize_before: true  # whether to perform layer normalization before the input
  reduction_factor: 1  # reduction factor
  init_type: xavier_uniform  # initialization type
  init_enc_alpha: 1.0  # initial value of alpha of encoder scaled position encoding
  init_dec_alpha: 1.0  # initial value of alpha of decoder scaled position encoding

  # Transformer dropout
  transformer_enc_dropout_rate: 0.2  # dropout rate for transformer encoder layer
  transformer_enc_positional_dropout_rate: 0.2  # dropout rate for transformer encoder positional encoding
  transformer_enc_attn_dropout_rate: 0.2  # dropout rate for transformer encoder attention layer
  transformer_dec_dropout_rate: 0.2  # dropout rate for transformer decoder layer
  transformer_dec_positional_dropout_rate: 0.2  # dropout rate for transformer decoder positional encoding
  transformer_dec_attn_dropout_rate: 0.2  # dropout rate for transformer decoder attention layer

  # Pitch predictor
  pitch_predictor_layers: 5  # number of conv layers in pitch predictor
  pitch_predictor_chans: 256  # number of channels of conv layers in pitch predictor
  pitch_predictor_kernel_size: 5  # kernel size of conv layers in pitch predictor
  pitch_predictor_dropout: 0.5  # dropout rate in pitch predictor
  pitch_embed_kernel_size: 1  # kernel size of conv embedding layer for pitch
  pitch_embed_dropout: 0.0  # dropout rate after conv embedding layer for pitch
  stop_gradient_from_pitch_predictor: true  # whether to stop the gradient from pitch predictor to encoder

  # Energy predictor
  energy_predictor_layers: 2  # number of conv layers in energy predictor
  energy_predictor_chans: 256  # number of channels of conv layers in energy predictor
  energy_predictor_kernel_size: 3  # kernel size of conv layers in energy predictor
  energy_predictor_dropout: 0.5  # dropout rate in energy predictor
  energy_embed_kernel_size: 1  # kernel size of conv embedding layer for energy
  energy_embed_dropout: 0.0  # dropout rate after conv embedding layer for energy
  stop_gradient_from_energy_predictor: false  # whether to stop the gradient from energy predictor to encoder

  # Speaker embedding and speaker classifier
  spk_embed_dim: 256  # speaker embedding dimension
  spk_embed_integration_type: concat  # speaker embedding integration type
  enable_speaker_classifier: false  # whether to use speaker classifier module
  hidden_sc_dim: 256  # the hidden layer dim of speaker classifier

###########################################################
# UPDATER SETTING #
###########################################################
# Loss-related options. Both keys are children of `updater` and must stay
# indented beneath it.
updater:
  use_masking: true  # whether to apply masking for padded part in loss calculation
  spk_loss_scale: 0.02  # scale of the speaker classifier loss

###########################################################
# OPTIMIZER SETTING #
###########################################################
# Optimizer options. Both keys are children of `optimizer` and must stay
# indented beneath it.
optimizer:
  optim: adam  # optimizer type
  learning_rate: 0.001  # learning rate

###########################################################
# TRAINING SETTING #
###########################################################
# NOTE(review): presumably the number of epochs after which training stops —
# confirm against the training script.
max_epoch: 200
# NOTE(review): presumably how many checkpoints/snapshots are retained —
# confirm against the training script.
num_snapshots: 5

###########################################################
# OTHER SETTING #
###########################################################
# NOTE(review): presumably the random seed used for reproducibility — confirm
# where the training script consumes it.
seed: 10086