You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
108 lines
5.9 KiB
108 lines
5.9 KiB
2 years ago
|
###########################################################
#               FEATURE EXTRACTION SETTING                #
###########################################################

fs: 24000  # Sampling rate (Hz).
n_fft: 2048  # FFT size (samples).
n_shift: 300  # Hop size (samples). 12.5ms
win_length: 1200  # Window length (samples). 50ms
# If win_length is set to null, it will be the same as the FFT size (n_fft).
window: "hann"  # Window function.

# Only used for feats_type != raw

fmin: 80  # Minimum frequency of Mel basis.
fmax: 7600  # Maximum frequency of Mel basis.
n_mels: 80  # The number of mel basis.

# Only used for the model using pitch features (e.g. FastSpeech2)
# The canton datasets we use are different from others like Databaker or LJSpeech,
# so we set f0min to 110 to avoid the problem of too many zero-pitch values.
# Reference: https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder/issues/38
f0min: 110  # Minimum f0 for pitch extraction.
f0max: 400  # Maximum f0 for pitch extraction.
|
||
|
|
||
|
|
||
|
###########################################################
#                      DATA SETTING                       #
###########################################################
# Top-level data-loading settings for this recipe.
batch_size: 32
num_workers: 2
|
||
|
|
||
|
|
||
|
###########################################################
#                      MODEL SETTING                      #
###########################################################
# All keys below are nested under `model` so they load as one mapping
# rather than as unrelated top-level keys.
model:
  adim: 384  # attention dimension
  aheads: 2  # number of attention heads
  elayers: 4  # number of encoder layers
  eunits: 1536  # number of encoder ff units
  dlayers: 4  # number of decoder layers
  dunits: 1536  # number of decoder ff units
  positionwise_layer_type: conv1d  # type of position-wise layer
  positionwise_conv_kernel_size: 3  # kernel size of position-wise conv layer
  duration_predictor_layers: 2  # number of layers of duration predictor
  duration_predictor_chans: 256  # number of channels of duration predictor
  duration_predictor_kernel_size: 3  # filter size of duration predictor
  postnet_layers: 5  # number of layers of postnet
  postnet_filts: 5  # filter size of conv layers in postnet
  postnet_chans: 256  # number of channels of conv layers in postnet
  use_scaled_pos_enc: true  # whether to use scaled positional encoding
  encoder_normalize_before: true  # whether to perform layer normalization before the input
  decoder_normalize_before: true  # whether to perform layer normalization before the input
  reduction_factor: 1  # reduction factor
  init_type: xavier_uniform  # initialization type
  init_enc_alpha: 1.0  # initial value of alpha of encoder scaled position encoding
  init_dec_alpha: 1.0  # initial value of alpha of decoder scaled position encoding
  transformer_enc_dropout_rate: 0.2  # dropout rate for transformer encoder layer
  transformer_enc_positional_dropout_rate: 0.2  # dropout rate for transformer encoder positional encoding
  transformer_enc_attn_dropout_rate: 0.2  # dropout rate for transformer encoder attention layer
  transformer_dec_dropout_rate: 0.2  # dropout rate for transformer decoder layer
  transformer_dec_positional_dropout_rate: 0.2  # dropout rate for transformer decoder positional encoding
  transformer_dec_attn_dropout_rate: 0.2  # dropout rate for transformer decoder attention layer
  pitch_predictor_layers: 5  # number of conv layers in pitch predictor
  pitch_predictor_chans: 256  # number of channels of conv layers in pitch predictor
  pitch_predictor_kernel_size: 5  # kernel size of conv layers in pitch predictor
  pitch_predictor_dropout: 0.5  # dropout rate in pitch predictor
  pitch_embed_kernel_size: 1  # kernel size of conv embedding layer for pitch
  pitch_embed_dropout: 0.0  # dropout rate after conv embedding layer for pitch
  stop_gradient_from_pitch_predictor: true  # whether to stop the gradient from pitch predictor to encoder
  energy_predictor_layers: 2  # number of conv layers in energy predictor
  energy_predictor_chans: 256  # number of channels of conv layers in energy predictor
  energy_predictor_kernel_size: 3  # kernel size of conv layers in energy predictor
  energy_predictor_dropout: 0.5  # dropout rate in energy predictor
  energy_embed_kernel_size: 1  # kernel size of conv embedding layer for energy
  energy_embed_dropout: 0.0  # dropout rate after conv embedding layer for energy
  stop_gradient_from_energy_predictor: false  # whether to stop the gradient from energy predictor to encoder
  spk_embed_dim: 256  # speaker embedding dimension
  spk_embed_integration_type: concat  # speaker embedding integration type
|
||
|
|
||
|
|
||
|
|
||
|
###########################################################
#                     UPDATER SETTING                     #
###########################################################
# `use_masking` is nested under `updater` so it loads as part of that mapping.
updater:
  use_masking: true  # whether to apply masking for padded part in loss calculation
|
||
|
|
||
|
|
||
|
###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################
# Optimizer options are nested under `optimizer` so they load as one mapping.
optimizer:
  optim: adam  # optimizer type
  learning_rate: 0.001  # learning rate
|
||
|
|
||
|
###########################################################
#                    TRAINING SETTING                     #
###########################################################
# Top-level training-loop settings for this recipe.
max_epoch: 1000
num_snapshots: 5
|
||
|
|
||
|
|
||
|
###########################################################
#                      OTHER SETTING                      #
###########################################################
# Integer seed value for this recipe.
seed: 10086
|