You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
86 lines
4.2 KiB
86 lines
4.2 KiB
###########################################################
#               FEATURE EXTRACTION SETTING                #
###########################################################

fs: 24000         # Sampling rate (Hz).
n_fft: 2048       # FFT size (samples).
n_shift: 300      # Hop size (samples). 12.5 ms at fs = 24000.
# win_length: if set to null, it will be the same as the FFT size (n_fft).
win_length: 1200  # Window length (samples). 50 ms at fs = 24000.
window: "hann"    # Window function.

# Only used for feats_type != raw

fmin: 80          # Minimum frequency of Mel basis.
fmax: 7600        # Maximum frequency of Mel basis.
n_mels: 80        # The number of mel basis.

###########################################################
#                      DATA SETTING                       #
###########################################################
batch_size: 64  # NOTE(review): presumably items per mini-batch; confirm against the data loader
num_workers: 2  # NOTE(review): presumably data-loading worker count; confirm against the data loader

###########################################################
#                      MODEL SETTING                      #
###########################################################
# All keys below are nested under `model` so that they are passed together
# as the keyword arguments of the selected model.
model:  # keyword arguments for the selected model
  embed_dim: 512                      # char or phn embedding dimension
  elayers: 1                          # number of blstm layers in encoder
  eunits: 512                         # number of blstm units
  econv_layers: 3                     # number of convolutional layers in encoder
  econv_chans: 512                    # number of channels in convolutional layer
  econv_filts: 5                      # filter size of convolutional layer
  atype: location                     # attention function type
  adim: 512                           # attention dimension
  aconv_chans: 32                     # number of channels in convolutional layer of attention
  aconv_filts: 15                     # filter size of convolutional layer of attention
  cumulate_att_w: true                # whether to cumulate attention weight
  dlayers: 2                          # number of lstm layers in decoder
  dunits: 1024                        # number of lstm units in decoder
  prenet_layers: 2                    # number of layers in prenet
  prenet_units: 256                   # number of units in prenet
  postnet_layers: 5                   # number of layers in postnet
  postnet_chans: 512                  # number of channels in postnet
  postnet_filts: 5                    # filter size of postnet layer
  output_activation: null             # activation function for the final output
  use_batch_norm: true                # whether to use batch normalization in encoder
  use_concate: true                   # whether to concatenate encoder embedding with decoder outputs
  use_residual: false                 # whether to use residual connection in encoder
  dropout_rate: 0.5                   # dropout rate
  zoneout_rate: 0.1                   # zoneout rate
  reduction_factor: 1                 # reduction factor
  spk_embed_dim: 256                  # speaker embedding dimension
  spk_embed_integration_type: concat  # how to integrate speaker embedding

###########################################################
#                     UPDATER SETTING                     #
###########################################################
# Loss-related options, nested under `updater`.
updater:
  use_masking: true             # whether to apply masking for padded part in loss calculation
  bce_pos_weight: 5.0           # weight of positive sample in binary cross entropy calculation
  use_guided_attn_loss: true    # whether to use guided attention loss
  guided_attn_loss_sigma: 0.4   # sigma of guided attention loss
  guided_attn_loss_lambda: 1.0  # strength of guided attention loss

###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################
# Optimizer options, nested under `optimizer`.
optimizer:
  optim: adam             # optimizer type
  learning_rate: 1.0e-03  # learning rate
  epsilon: 1.0e-06        # epsilon
  weight_decay: 0.0       # weight decay coefficient

###########################################################
#                    TRAINING SETTING                     #
###########################################################
max_epoch: 100    # NOTE(review): presumably the number of epochs to train for; confirm against the trainer
num_snapshots: 5  # NOTE(review): presumably how many checkpoints are kept; confirm against the trainer

###########################################################
#                      OTHER SETTING                      #
###########################################################
seed: 42  # NOTE(review): presumably the random seed; confirm against the training script