###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 24000          # Sampling rate (Hz).
n_fft: 512         # FFT size (samples).
n_shift: 128       # Hop size (samples); 128 / 24000 ≈ 5.3 ms.
win_length: 512    # Window length (samples); 512 / 24000 ≈ 21.3 ms.
                   # If set to null, it will be the same as n_fft.
window: "hann"     # Window function.

# Only used for feats_type != raw
fmin: 30           # Minimum frequency of Mel basis (Hz).
fmax: 12000        # Maximum frequency of Mel basis (Hz).
n_mels: 80         # Number of mel basis.

# Only used for models that use pitch features (e.g. FastSpeech2)
f0min: 80          # Minimum f0 for pitch extraction (Hz).
f0max: 750         # Maximum f0 for pitch extraction (Hz).
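# A minimal sanity-check sketch of the frame timing implied by the values
# above (kept as comments so the YAML stays valid; the variable names are
# illustrative, not part of this config's consumer code):
#
#     fs, n_shift, win_length = 24000, 128, 512
#     print(f"hop: {1000 * n_shift / fs:.2f} ms")        # -> hop: 5.33 ms
#     print(f"window: {1000 * win_length / fs:.2f} ms")  # -> window: 21.33 ms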
###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 48     # Batch size.
num_workers: 1     # Number of workers for data loading.

###########################################################
#                       MODEL SETTING                     #
###########################################################
model:
    # music score related
    note_num: 300           # Number of note tokens.
    is_slur_num: 2          # Number of slur classes.
    # fastspeech2 module options
    use_energy_pred: False  # Whether to use energy predictor.
    use_postnet: False      # Whether to use postnet.

    # fastspeech2 module
    fastspeech2_params:
        adim: 256                                    # Attention dimension.
        aheads: 2                                    # Number of attention heads.
        elayers: 4                                   # Number of encoder layers.
        eunits: 1024                                 # Number of encoder feed-forward units.
        dlayers: 4                                   # Number of decoder layers.
        dunits: 1024                                 # Number of decoder feed-forward units.
        positionwise_layer_type: conv1d-linear       # Type of position-wise layer.
        positionwise_conv_kernel_size: 9             # Kernel size of position-wise conv layer.
        transformer_enc_dropout_rate: 0.1            # Dropout rate for transformer encoder layer.
        transformer_enc_positional_dropout_rate: 0.1 # Dropout rate for transformer encoder positional encoding.
        transformer_enc_attn_dropout_rate: 0.0       # Dropout rate for transformer encoder attention layer.
        transformer_activation_type: "gelu"          # Activation function type in transformer.
        encoder_normalize_before: True               # Whether to apply layer normalization before encoder blocks.
        decoder_normalize_before: True               # Whether to apply layer normalization before decoder blocks.
        reduction_factor: 1                          # Reduction factor.
        init_type: xavier_uniform                    # Initialization type.
        init_enc_alpha: 1.0                          # Initial value of alpha of encoder scaled position encoding.
        init_dec_alpha: 1.0                          # Initial value of alpha of decoder scaled position encoding.
        use_scaled_pos_enc: True                     # Whether to use scaled positional encoding.
        transformer_dec_dropout_rate: 0.1            # Dropout rate for transformer decoder layer.
        transformer_dec_positional_dropout_rate: 0.1 # Dropout rate for transformer decoder positional encoding.
        transformer_dec_attn_dropout_rate: 0.0       # Dropout rate for transformer decoder attention layer.
        duration_predictor_layers: 5                 # Number of layers of duration predictor.
        duration_predictor_chans: 256                # Number of channels of duration predictor.
        duration_predictor_kernel_size: 3            # Filter size of duration predictor.
        duration_predictor_dropout_rate: 0.5         # Dropout rate in duration predictor.
        pitch_predictor_layers: 5                    # Number of conv layers in pitch predictor.
        pitch_predictor_chans: 256                   # Number of channels of conv layers in pitch predictor.
        pitch_predictor_kernel_size: 5               # Kernel size of conv layers in pitch predictor.
        pitch_predictor_dropout: 0.5                 # Dropout rate in pitch predictor.
        pitch_embed_kernel_size: 1                   # Kernel size of conv embedding layer for pitch.
        pitch_embed_dropout: 0.0                     # Dropout rate after conv embedding layer for pitch.
        stop_gradient_from_pitch_predictor: True     # Whether to stop the gradient from pitch predictor to encoder.
        energy_predictor_layers: 2                   # Number of conv layers in energy predictor.
        energy_predictor_chans: 256                  # Number of channels of conv layers in energy predictor.
        energy_predictor_kernel_size: 3              # Kernel size of conv layers in energy predictor.
        energy_predictor_dropout: 0.5                # Dropout rate in energy predictor.
        energy_embed_kernel_size: 1                  # Kernel size of conv embedding layer for energy.
        energy_embed_dropout: 0.0                    # Dropout rate after conv embedding layer for energy.
        stop_gradient_from_energy_predictor: False   # Whether to stop the gradient from energy predictor to encoder.
        postnet_layers: 5                            # Number of layers of postnet.
        postnet_filts: 5                             # Filter size of conv layers in postnet.
        postnet_chans: 256                           # Number of channels of conv layers in postnet.
        postnet_dropout_rate: 0.5                    # Dropout rate for postnet.

    # denoiser module
    denoiser_params:
        in_channels: 80              # Number of channels of the input mel-spectrogram.
        out_channels: 80             # Number of channels of the output mel-spectrogram.
        kernel_size: 3               # Kernel size of the residual blocks inside.
        layers: 20                   # Number of residual blocks inside.
        stacks: 5                    # Number of groups to split the residual blocks into.
        residual_channels: 256       # Residual channels of the residual blocks.
        gate_channels: 512           # Gate channels of the residual blocks.
        skip_channels: 256           # Skip channels of the residual blocks.
        aux_channels: 256            # Auxiliary channels of the residual blocks.
        dropout: 0.1                 # Dropout rate of the residual blocks.
        bias: True                   # Whether to use bias in residual blocks.
        use_weight_norm: False       # Whether to use weight norm in all convolutions.
        init_type: "kaiming_normal"  # Type of weight initialization.

    diffusion_params:
        num_train_timesteps: 100     # Number of timesteps between data and noise during training.
        beta_start: 0.0001           # Beta start parameter for the scheduler.
        beta_end: 0.06               # Beta end parameter for the scheduler.
        beta_schedule: "linear"      # Beta schedule parameter for the scheduler.
        num_max_timesteps: 100       # Max timestep for the transition from data to noise.
        stretch: True                # Whether to stretch the mel-spectrogram before diffusion.
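# A minimal sketch of how a "linear" beta schedule with the values above is
# commonly constructed (kept as comments so the YAML stays valid; this mirrors
# the standard DDPM formulation, not necessarily this repo's exact scheduler):
#
#     import numpy as np
#     num_train_timesteps, beta_start, beta_end = 100, 0.0001, 0.06
#     betas = np.linspace(beta_start, beta_end, num_train_timesteps)
#     alphas_cumprod = np.cumprod(1.0 - betas)
#     # alphas_cumprod[-1] ≈ 0.05, i.e. the last timestep is heavily noised.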
###########################################################
#                     UPDATER SETTING                     #
###########################################################
fs2_updater:
    use_masking: True        # Whether to apply masking for padded parts in loss calculation.
ds_updater:
    use_masking: True        # Whether to apply masking for padded parts in loss calculation.

###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################
# fastspeech2 optimizer
fs2_optimizer:
    optim: adam              # Optimizer type.
    learning_rate: 0.001     # Learning rate.

# diffusion optimizer
ds_optimizer_params:
    beta1: 0.9
    beta2: 0.98
    weight_decay: 0.0
ds_scheduler_params:
    learning_rate: 0.001     # Initial learning rate.
    gamma: 0.5               # Learning rate decay factor.
    step_size: 50000         # Interval (steps) between learning rate decays.
ds_grad_norm: 1              # Gradient clipping norm for the diffusion optimizer.

###########################################################
#                     INTERVAL SETTING                    #
###########################################################
only_train_diffusion: True   # Whether to freeze fastspeech2 parameters when training diffusion.
ds_train_start_steps: 160000 # Step at which to start training the diffusion module.
train_max_steps: 320000      # Number of training steps.
save_interval_steps: 2000    # Interval steps to save checkpoint.
eval_interval_steps: 2000    # Interval steps to evaluate the network.
num_snapshots: 5             # Number of checkpoints to keep.

###########################################################
#                       OTHER SETTING                     #
###########################################################
seed: 10086                  # Random seed.
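# A minimal usage sketch (comments only; the file name and the PyYAML
# dependency are assumptions, not requirements of this config):
#
#     import yaml
#     with open("default.yaml") as f:
#         config = yaml.safe_load(f)
#     print(config["fs"], config["model"]["fastspeech2_params"]["adim"])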