# PaddleSpeech/examples/csmsc/voc6/conf/default.yaml


###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
# Feature extraction parameters. The durations quoted below follow from fs:
# 300 / 24000 = 12.5 ms and 1200 / 24000 = 50 ms.
fs: 24000                # Sampling rate (Hz).
n_fft: 2048              # FFT size (samples).
n_shift: 300             # Hop size (samples). 12.5 ms
win_length: 1200         # Window length (samples). 50 ms
                         # If set to null, it will be the same as the FFT size (n_fft).
window: "hann"           # Window function.
n_mels: 80               # Number of mel basis.
fmin: 80                 # Minimum frequency in mel basis calculation. (Hz)
fmax: 7600               # Maximum frequency in mel basis calculation. (Hz)
mu_law: True             # Recommended to suppress noise if using raw bits.


###########################################################
#                       MODEL SETTING                     #
###########################################################
# Hyper-parameters grouped under the `model` key.
model:
    rnn_dims: 512                     # Hidden dims of RNN layers.
    fc_dims: 512                      # NOTE(review): presumably the dims of the fully connected layers -- confirm.
    bits: 9                           # Bit depth of signal.
    aux_context_window: 2             # Context window size for auxiliary feature.
                                      # If set to 2, previous 2 and future 2 frames will be considered.
    aux_channels: 80                  # Number of channels for auxiliary feature conv.
                                      # Must be the same as n_mels (80 in this file).
    upsample_scales: [4, 5, 3, 5]     # Upsampling scales. Product of these must be the same as hop size (4 * 5 * 3 * 5 = 300 = n_shift), same with pwgan here.
    compute_dims: 128                 # Dims of Conv1D in MelResNet.
    res_out_dims: 128                 # Dims of output in MelResNet.
    res_blocks: 10                    # Number of residual blocks.
    mode: RAW                         # Either 'raw' (softmax on raw bits) or 'mold' (sample from mixture of logistics).
                                      # NOTE(review): the value here is upper-case RAW; confirm the exact spellings the model accepts.
# Generation settings grouped under the `inference` key.
inference:
    gen_batched: True                 # Whether to generate samples in batch mode.
    target: 12000                     # Target number of samples to be generated in each batch entry.
    overlap: 600                      # Number of samples for crossfading between batches.


###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 64              # Batch size.
batch_max_steps: 4500       # Length of each audio in batch. Must be divisible by the hop size (n_shift): 4500 = 15 * 300.
num_workers: 2              # Number of workers in DataLoader.

###########################################################
#                     OPTIMIZER SETTING                   #
###########################################################
# NOTE(review): presumably the gradient clipping threshold -- confirm whether it clips by norm or by value.
grad_clip: 4.0
# NOTE(review): presumably the optimizer's learning rate -- confirm whether any schedule is applied on top.
learning_rate: 1.0e-4                


###########################################################
#                    INTERVAL SETTING                     #
###########################################################

train_max_steps: 400000               # Number of training steps.
save_interval_steps: 5000             # Interval steps to save checkpoint.
eval_interval_steps: 1000             # Interval steps to evaluate the network.
gen_eval_samples_interval_steps: 5000 # Interval steps between generating validation samples.
generate_num: 5                       # Number of samples to generate at each checkpoint.

###########################################################
#                     OTHER SETTING                       #
###########################################################
num_snapshots: 10                 # Max number of snapshots to keep while training.
seed: 42                          # Random seed for paddle, random, and np.random.
add wavernn, test=tts 3 years ago
			`###########################################################`
			`# FEATURE EXTRACTION SETTING #`
			`###########################################################`
			`fs: 24000 # Sampling rate.`
			`n_fft: 2048 # FFT size (samples).`
			`n_shift: 300 # Hop size (samples). 12.5ms`
			`win_length: 1200 # Window length (samples). 50ms`
			`# If set to null, it will be the same as fft_size.`
			`window: "hann" # Window function.`
			`n_mels: 80 # Number of mel basis.`
			`fmin: 80 # Minimum freq in mel basis calculation. (Hz)`
			`fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)`
			`mu_law: True # Recommended to suppress noise if using raw bits.`


			`###########################################################`
			`# MODEL SETTING #`
			`###########################################################`
			`model:`
			`rnn_dims: 512 # Hidden dims of RNN Layers.`
			`fc_dims: 512`
			`bits: 9 # Bit depth of signal`
update wavernn, test=tts 3 years ago			`aux_context_window: 2 # Context window size for auxiliary feature.`
			`# If set to 2, previous 2 and future 2 frames will be considered.`
add wavernn, test=tts 3 years ago			`aux_channels: 80 # Number of channels for auxiliary feature conv.`
			`# Must be the same as num_mels.`
			`upsample_scales: [4, 5, 3, 5] # Upsampling scales. Product of these must be the same as hop size, same with pwgan here`
update wavernn, test=tts 3 years ago			`compute_dims: 128 # Dims of Conv1D in MelResNet.`
			`res_out_dims: 128 # Dims of output in MelResNet.`
			`res_blocks: 10 # Number of residual blocks.`
add wavernn, test=tts 3 years ago			`mode: RAW # either 'raw'(softmax on raw bits) or 'mold' (sample from mixture of logistics)`
			`inference:`
			`gen_batched: True # whether to generate sample in batch mode`
			`target: 12000 # target number of samples to be generated in each batch entry`
			`overlap: 600 # number of samples for crossfading between batches`


			`###########################################################`
			`# DATA LOADER SETTING #`
			`###########################################################`
			`batch_size: 64 # Batch size.`
			`batch_max_steps: 4500 # Length of each audio in batch. Make sure divisible by hop_size.`
			`num_workers: 2 # Number of workers in DataLoader.`

			`###########################################################`
			`# OPTIMIZER SETTING #`
			`###########################################################`
			`grad_clip: 4.0`
			`learning_rate: 1.0e-4`


			`###########################################################`
			`# INTERVAL SETTING #`
			`###########################################################`

			`train_max_steps: 400000 # Number of training steps.`
			`save_interval_steps: 5000 # Interval steps to save checkpoint.`
			`eval_interval_steps: 1000 # Interval steps to evaluate the network.`
			`gen_eval_samples_interval_steps: 5000 # the iteration interval of generating valid samples`
			`generate_num: 5 # number of samples to generate at each checkpoint`

			`###########################################################`
			`# OTHER SETTING #`
			`###########################################################`
			`num_snapshots: 10 # max number of snapshots to keep while training`
			`seed: 42 # random seed for paddle, random, and np.random`