# PaddleSpeech/demos/streaming_tts_server/conf/tts_online_application.yaml

# This is the parameter configuration file for streaming tts server.

#################################################################################
#                             SERVER SETTING                                    #
#################################################################################
# Network address and port of the streaming tts server.
host: 0.0.0.0
port: 8092

# Each entry in engine_list has the form <speech task>_<engine type> and names
# one of the engine sections defined further down in this file.
# engine_list choices = ['tts_online', 'tts_online-onnx'], the inference speed of tts_online-onnx is faster than tts_online.
# protocol choices = ['websocket', 'http']
protocol: 'http'
engine_list: ['tts_online-onnx']


#################################################################################
#                                ENGINE CONFIG                                  #
#################################################################################

################################### TTS #########################################
################### speech task: tts; engine_type: online #######################
# Settings for the 'tts_online' engine (one of the engine_list choices).
tts_online: 
    # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc']
    # fastspeech2_cnndecoder_csmsc supports streaming am inference.
    am: 'fastspeech2_csmsc'   
    # NOTE(review): the entries below are left empty (null in YAML); presumably
    # the engine then falls back to its default pretrained resources -- confirm.
    am_config: 
    am_ckpt: 
    am_stat: 
    phones_dict: 
    tones_dict: 
    speaker_dict: 
    spk_id: 0

    # voc (vocoder) choices=['mb_melgan_csmsc', 'hifigan_csmsc']
    # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference.
    voc: 'mb_melgan_csmsc'
    # NOTE(review): left empty (null in YAML), same as the am entries above -- confirm the fallback.
    voc_config: 
    voc_ckpt: 
    voc_stat: 

    # others
    lang: 'zh'
    device: 'cpu' # set 'gpu:id' or 'cpu'
    # am_block and am_pad are only used for streaming am inference, i.e. with the
    # fastspeech2_cnndecoder_csmsc model.
    # When am_pad is set to 12, streaming synthetic audio is the same as
    # non-streaming synthetic audio.
    am_block: 72
    am_pad: 12
    # voc_block and voc_pad are used by the voc model for streaming voc inference.
    # When the voc model is mb_melgan_csmsc: with voc_pad set to 14, streaming
    # synthetic audio is the same as non-streaming synthetic audio; the minimum
    # value of voc_pad is 7, at which streaming synthetic audio still sounds normal.
    # When the voc model is hifigan_csmsc: with voc_pad set to 19, streaming
    # synthetic audio is the same as non-streaming synthetic audio; with voc_pad
    # set to 14, streaming synthetic audio sounds normal.
    voc_block: 36
    voc_pad: 14
    

#################################################################################
#                                ENGINE CONFIG                                  #
#################################################################################

################################### TTS #########################################
################### speech task: tts; engine_type: online-onnx #######################
# Settings for the 'tts_online-onnx' engine (one of the engine_list choices).
tts_online-onnx: 
    # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx']
    # fastspeech2_cnndecoder_csmsc_onnx supports streaming am inference.
    am: 'fastspeech2_cnndecoder_csmsc_onnx' 
    # am_ckpt is a list:
    #   if am is fastspeech2_cnndecoder_csmsc_onnx, am_ckpt = [encoder model, decoder model, postnet model];
    #   if am is fastspeech2_csmsc_onnx, am_ckpt = [ckpt model].
    # NOTE(review): am_ckpt and the entries after it are left empty (null in YAML);
    # presumably the engine then falls back to its default pretrained resources -- confirm.
    am_ckpt:   # list
    am_stat: 
    phones_dict: 
    tones_dict: 
    speaker_dict: 
    spk_id: 0
    am_sample_rate: 24000
    # Session options for the am model.
    # NOTE(review): presumably passed to the ONNX inference session -- confirm.
    am_sess_conf:
        device: "cpu" # set 'gpu:id' or 'cpu'
        use_trt: False
        cpu_threads: 4

    # voc (vocoder) choices=['mb_melgan_csmsc_onnx', 'hifigan_csmsc_onnx']
    # Both mb_melgan_csmsc_onnx and hifigan_csmsc_onnx support streaming voc inference.
    voc: 'hifigan_csmsc_onnx'
    voc_ckpt: 
    voc_sample_rate: 24000
    # Session options for the voc model (same keys as am_sess_conf).
    voc_sess_conf:
        device: "cpu" # set 'gpu:id' or 'cpu'
        use_trt: False
        cpu_threads: 4

    # others
    lang: 'zh'
    # am_block and am_pad are only used for streaming am inference, i.e. with the
    # fastspeech2_cnndecoder_csmsc_onnx model.
    # When am_pad is set to 12, streaming synthetic audio is the same as
    # non-streaming synthetic audio.
    am_block: 72
    am_pad: 12
    # voc_block and voc_pad are used by the voc model for streaming voc inference.
    # When the voc model is mb_melgan_csmsc_onnx: with voc_pad set to 14, streaming
    # synthetic audio is the same as non-streaming synthetic audio; the minimum
    # value of voc_pad is 7, at which streaming synthetic audio still sounds normal.
    # When the voc model is hifigan_csmsc_onnx: with voc_pad set to 19, streaming
    # synthetic audio is the same as non-streaming synthetic audio; with voc_pad
    # set to 14, streaming synthetic audio sounds normal.
    voc_block: 36
    voc_pad: 14
    # voc_upsample should be the same as n_shift in the voc config.
    voc_upsample: 300
add info, test=doc 3 years ago			`# This is the parameter configuration file for streaming tts server.`
add streaming tts demos, test=doc 3 years ago
			`#################################################################################`
			`# SERVER SETTING #`
			`#################################################################################`
fix cors, test=doc 3 years ago			`host: 0.0.0.0`
add streaming tts demos, test=doc 3 years ago			`port: 8092`

			`# The task format in the engin_list is: <speech task>_<engine type>`
add info, test=doc 3 years ago			`# engine_list choices = ['tts_online', 'tts_online-onnx'], the inference speed of tts_online-onnx is faster than tts_online.`
			`# protocol choices = ['websocket', 'http']`
add streaming tts demos, test=doc 3 years ago			`protocol: 'http'`
			`engine_list: ['tts_online-onnx']`


			`#################################################################################`
			`# ENGINE CONFIG #`
			`#################################################################################`

			`################################### TTS #########################################`
			`################### speech task: tts; engine_type: online #######################`
			`tts_online:`
add info, test=doc 3 years ago			`# am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc']`
			`# fastspeech2_cnndecoder_csmsc support streaming am infer.`
add streaming tts demos, test=doc 3 years ago			`am: 'fastspeech2_csmsc'`
			`am_config:`
			`am_ckpt:`
			`am_stat:`
			`phones_dict:`
			`tones_dict:`
			`speaker_dict:`
			`spk_id: 0`

			`# voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc']`
add info, test=doc 3 years ago			`# Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference`
add streaming tts demos, test=doc 3 years ago			`voc: 'mb_melgan_csmsc'`
			`voc_config:`
			`voc_ckpt:`
			`voc_stat:`

			`# others`
			`lang: 'zh'`
			`device: 'cpu' # set 'gpu:id' or 'cpu'`
add info, test=doc 3 years ago			`# am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer,`
			`# when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio`
improve server code, test=doc 3 years ago			`am_block: 72`
add streaming tts demos, test=doc 3 years ago			`am_pad: 12`
add info, test=doc 3 years ago			`# voc_pad and voc_block voc model to streaming voc infer,`
			`# when voc model is mb_melgan_csmsc, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal`
fix hifigan pad value 2 years ago			`# when voc model is hifigan_csmsc, voc_pad set 19, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal`
improve server code, test=doc 3 years ago			`voc_block: 36`
add streaming tts demos, test=doc 3 years ago			`voc_pad: 14`



			`#################################################################################`
			`# ENGINE CONFIG #`
			`#################################################################################`

			`################################### TTS #########################################`
			`################### speech task: tts; engine_type: online-onnx #######################`
			`tts_online-onnx:`
add info, test=doc 3 years ago			`# am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx']`
			`# fastspeech2_cnndecoder_csmsc_onnx support streaming am infer.`
add streaming tts demos, test=doc 3 years ago			`am: 'fastspeech2_cnndecoder_csmsc_onnx'`
			`# am_ckpt is a list, if am is fastspeech2_cnndecoder_csmsc_onnx, am_ckpt = [encoder model, decoder model, postnet model];`
			`# if am is fastspeech2_csmsc_onnx, am_ckpt = [ckpt model];`
			`am_ckpt: # list`
			`am_stat:`
			`phones_dict:`
			`tones_dict:`
			`speaker_dict:`
			`spk_id: 0`
			`am_sample_rate: 24000`
			`am_sess_conf:`
			`device: "cpu" # set 'gpu:id' or 'cpu'`
			`use_trt: False`
			`cpu_threads: 4`

			`# voc (vocoder) choices=['mb_melgan_csmsc_onnx, hifigan_csmsc_onnx']`
add info, test=doc 3 years ago			`# Both mb_melgan_csmsc_onnx and hifigan_csmsc_onnx support streaming voc inference`
add streaming tts demos, test=doc 3 years ago			`voc: 'hifigan_csmsc_onnx'`
			`voc_ckpt:`
			`voc_sample_rate: 24000`
			`voc_sess_conf:`
			`device: "cpu" # set 'gpu:id' or 'cpu'`
			`use_trt: False`
			`cpu_threads: 4`

			`# others`
			`lang: 'zh'`
add info, test=doc 3 years ago			`# am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer,`
			`# when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio`
improve server code, test=doc 3 years ago			`am_block: 72`
add streaming tts demos, test=doc 3 years ago			`am_pad: 12`
add info, test=doc 3 years ago			`# voc_pad and voc_block voc model to streaming voc infer,`
			`# when voc model is mb_melgan_csmsc_onnx, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal`
fix hifigan pad value 2 years ago			`# when voc model is hifigan_csmsc_onnx, voc_pad set 19, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal`
improve server code, test=doc 3 years ago			`voc_block: 36`
add streaming tts demos, test=doc 3 years ago			`voc_pad: 14`
add info, test=doc 3 years ago			`# voc_upsample should be same as n_shift on voc config.`
add streaming tts demos, test=doc 3 years ago			`voc_upsample: 300`