# Training configuration: feature extraction, data loading, model,
# updater, optimizer and training-loop settings.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################

fs: 24000         # Sampling rate (Hz).
n_fft: 2048       # FFT size (samples).
n_shift: 300      # Hop size (samples). 12.5 ms at fs = 24000.
# If win_length is set to null, it will be the same as the FFT size.
win_length: 1200  # Window length (samples). 50 ms at fs = 24000.
window: "hann"    # Window function.

# Only used for feats_type != raw
fmin: 80          # Minimum frequency of Mel basis.
fmax: 7600        # Maximum frequency of Mel basis.
n_mels: 80        # The number of mel basis.

# Only used for models using pitch features (e.g. FastSpeech2).
# The canton datasets we use differ from others such as Databaker or
# LJSpeech, so f0min is set to 110 to avoid having too many zero-pitch values.
# Reference: https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder/issues/38
f0min: 110        # Minimum f0 for pitch extraction.
f0max: 400        # Maximum f0 for pitch extraction.

###########################################################
#                       DATA SETTING                      #
###########################################################

batch_size: 32
num_workers: 2

###########################################################
#                      MODEL SETTING                      #
###########################################################

model:
  # Encoder / decoder
  adim: 384                           # attention dimension
  aheads: 2                           # number of attention heads
  elayers: 4                          # number of encoder layers
  eunits: 1536                        # number of encoder ff units
  dlayers: 4                          # number of decoder layers
  dunits: 1536                        # number of decoder ff units
  positionwise_layer_type: conv1d     # type of position-wise layer
  positionwise_conv_kernel_size: 3    # kernel size of position-wise conv layer

  # Duration predictor
  duration_predictor_layers: 2        # number of layers of duration predictor
  duration_predictor_chans: 256       # number of channels of duration predictor
  duration_predictor_kernel_size: 3   # filter size of duration predictor

  # Postnet
  postnet_layers: 5                   # number of layers of postnet
  postnet_filts: 5                    # filter size of conv layers in postnet
  postnet_chans: 256                  # number of channels of conv layers in postnet

  # Positional encoding, normalization and initialization
  use_scaled_pos_enc: true            # whether to use scaled positional encoding
  encoder_normalize_before: true      # whether to perform layer normalization before the input
  decoder_normalize_before: true      # whether to perform layer normalization before the input
  reduction_factor: 1                 # reduction factor
  init_type: xavier_uniform           # initialization type
  init_enc_alpha: 1.0                 # initial value of alpha of encoder scaled position encoding
  init_dec_alpha: 1.0                 # initial value of alpha of decoder scaled position encoding

  # Transformer dropout
  transformer_enc_dropout_rate: 0.2             # dropout rate for transformer encoder layer
  transformer_enc_positional_dropout_rate: 0.2  # dropout rate for transformer encoder positional encoding
  transformer_enc_attn_dropout_rate: 0.2        # dropout rate for transformer encoder attention layer
  transformer_dec_dropout_rate: 0.2             # dropout rate for transformer decoder layer
  transformer_dec_positional_dropout_rate: 0.2  # dropout rate for transformer decoder positional encoding
  transformer_dec_attn_dropout_rate: 0.2        # dropout rate for transformer decoder attention layer

  # Pitch predictor
  pitch_predictor_layers: 5                 # number of conv layers in pitch predictor
  pitch_predictor_chans: 256                # number of channels of conv layers in pitch predictor
  pitch_predictor_kernel_size: 5            # kernel size of conv layers in pitch predictor
  pitch_predictor_dropout: 0.5              # dropout rate in pitch predictor
  pitch_embed_kernel_size: 1                # kernel size of conv embedding layer for pitch
  pitch_embed_dropout: 0.0                  # dropout rate after conv embedding layer for pitch
  stop_gradient_from_pitch_predictor: true  # whether to stop the gradient from pitch predictor to encoder

  # Energy predictor
  energy_predictor_layers: 2                  # number of conv layers in energy predictor
  energy_predictor_chans: 256                 # number of channels of conv layers in energy predictor
  energy_predictor_kernel_size: 3             # kernel size of conv layers in energy predictor
  energy_predictor_dropout: 0.5               # dropout rate in energy predictor
  energy_embed_kernel_size: 1                 # kernel size of conv embedding layer for energy
  energy_embed_dropout: 0.0                   # dropout rate after conv embedding layer for energy
  stop_gradient_from_energy_predictor: false  # whether to stop the gradient from energy predictor to encoder

  # Speaker embedding
  spk_embed_dim: 256                  # speaker embedding dimension
  spk_embed_integration_type: concat  # speaker embedding integration type

###########################################################
#                     UPDATER SETTING                     #
###########################################################

updater:
  use_masking: true  # whether to apply masking for padded part in loss calculation

###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################

optimizer:
  optim: adam           # optimizer type
  learning_rate: 0.001  # learning rate

###########################################################
#                     TRAINING SETTING                    #
###########################################################

max_epoch: 1000
num_snapshots: 5

###########################################################
#                      OTHER SETTING                      #
###########################################################

seed: 10086