# This configuration was tested on 8 A100 GPUs with 80 GB of GPU memory each.
# Training takes around 2 days to finish. You can adjust
# batch_size and num_workers here, and ngpu in local/train.sh, for your machine.
###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################

fs: 24000          # Sampling rate (Hz).
n_fft: 2048        # FFT size (samples).
n_shift: 300       # Hop size (samples). 12.5 ms
win_length: 1200   # Window length (samples). 50 ms
                   # If set to null, it will be the same as n_fft.
window: "hann"     # Window function.

# Only used for feats_type != raw

fmin: 80           # Minimum frequency of Mel basis.
fmax: 7600         # Maximum frequency of Mel basis.
n_mels: 80         # Number of Mel basis bands.
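# Note: with fs=24000 and n_shift=300, the frame shift is 300 / 24000 = 12.5 ms
# and the 1200-sample window is 50 ms; fmin/fmax restrict the Mel filterbank to
# the 80-7600 Hz band commonly used for speech.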

mean_phn_span: 8
mlm_prob: 0.8
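# Note: these appear to control the masked-LM objective: mlm_prob is roughly the
# fraction of the utterance eligible for masking and mean_phn_span the average
# length (in phones) of each masked span; check the collate/dataset code for the
# exact semantics.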

###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 40
num_workers: 8
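# Assumption: batch_size is per data-loader process, so with ngpu=8 in
# local/train.sh the effective global batch size would be 8 x 40 = 320;
# verify against the sampler implementation before tuning.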

###########################################################
#                       MODEL SETTING                     #
###########################################################
model:
    text_masking: false
    postnet_layers: 5
    postnet_filts: 5
    postnet_chans: 256
    encoder_type: conformer
    decoder_type: conformer
    enc_input_layer: sega_mlm
    enc_pre_speech_layer: 0
    enc_cnn_module_kernel: 7
    enc_attention_dim: 384
    enc_attention_heads: 2
    enc_linear_units: 1536
    enc_num_blocks: 4
    enc_dropout_rate: 0.2
    enc_positional_dropout_rate: 0.2
    enc_attention_dropout_rate: 0.2
    enc_normalize_before: true
    enc_macaron_style: true
    enc_use_cnn_module: true
    enc_selfattention_layer_type: legacy_rel_selfattn
    enc_activation_type: swish
    enc_pos_enc_layer_type: legacy_rel_pos
    enc_positionwise_layer_type: conv1d
    enc_positionwise_conv_kernel_size: 3
    dec_cnn_module_kernel: 31
    dec_attention_dim: 384
    dec_attention_heads: 2
    dec_linear_units: 1536
    dec_num_blocks: 4
    dec_dropout_rate: 0.2
    dec_positional_dropout_rate: 0.2
    dec_attention_dropout_rate: 0.2
    dec_macaron_style: true
    dec_use_cnn_module: true
    dec_selfattention_layer_type: legacy_rel_selfattn
    dec_activation_type: swish
    dec_pos_enc_layer_type: legacy_rel_pos
    dec_positionwise_layer_type: conv1d
    dec_positionwise_conv_kernel_size: 3
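# Note: both encoder and decoder are Conformer stacks (macaron feed-forward,
# convolution module, Swish activation, legacy relative positional
# self-attention). enc_input_layer: sega_mlm is understood to be the
# segment-aware masked-LM input embedding used by this recipe, and
# text_masking: false should mean the MLM loss is applied to speech frames
# only (see the model code for details).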

###########################################################
#                     OPTIMIZER SETTING                   #
###########################################################
scheduler_params:
    d_model: 384
    warmup_steps: 4000
grad_clip: 1.0
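# Assumption: scheduler_params drives a Noam-style (transformer) schedule,
# i.e. lr = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5),
# so the learning rate peaks after warmup_steps updates and decays thereafter.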

###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 1500
num_snapshots: 50
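# Assumption: num_snapshots is the number of recent checkpoints kept on disk
# (older snapshots are rotated out), not a saving interval.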

###########################################################
#                       OTHER SETTING                     #
###########################################################
seed: 0
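
# token_list enumerates the symbol inventory: ARPAbet-style phones with stress
# digits (e.g. from a CMUdict-based lexicon or MFA alignments) plus the special
# tokens <blank>, <unk>, sp (short pause) and <sos/eos>. The order defines the
# token ids, so it must match the one used to prepare the data.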
token_list:
- <blank>
- <unk>
- AH0
- T
- N
- sp
- D
- S
- R
- L
- IH1
- DH
- AE1
- M
- EH1
- K
- Z
- W
- HH
- ER0
- AH1
- IY1
- P
- V
- F
- B
- AY1
- IY0
- EY1
- AA1
- AO1
- UW1
- IH0
- OW1
- NG
- G
- SH
- ER1
- Y
- TH
- AW1
- CH
- UH1
- IH2
- JH
- OW0
- EH2
- OY1
- AY2
- EH0
- EY2
- UW0
- AE2
- AA2
- OW2
- AH2
- ZH
- AO2
- IY2
- AE0
- UW2
- AY0
- AA0
- AO0
- AW2
- EY0
- UH2
- ER2
- OY2
- UH0
- AW0
- OY0
- <sos/eos>