---
# Parameter configuration file for the streaming TTS server.

################################################################################
#                                SERVER SETTING                                #
################################################################################
host: '0.0.0.0'
port: 8192

# Each entry in engine_list has the form: <speech task>_<engine type>
# engine_list choices = ['tts_online', 'tts_online-onnx'];
# the inference speed of tts_online-onnx is faster than tts_online.
# protocol choices = ['websocket', 'http']
protocol: 'websocket'
engine_list: ['tts_online-onnx']

################################################################################
#                                 ENGINE CONFIG                                #
################################################################################

################################### TTS ########################################
################### speech task: tts; engine_type: online ######################
tts_online:
  # am (acoustic model)
  # choices = ['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc']
  # fastspeech2_cnndecoder_csmsc supports streaming am inference.
  am: 'fastspeech2_csmsc'
  # The am resources below are left unset (null) in this file.
  am_config: null
  am_ckpt: null
  am_stat: null
  phones_dict: null
  tones_dict: null
  speaker_dict: null

  # voc (vocoder) choices = ['mb_melgan_csmsc', 'hifigan_csmsc']
  # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference.
  voc: 'mb_melgan_csmsc'
  # The voc resources below are left unset (null) in this file.
  voc_config: null
  voc_ckpt: null
  voc_stat: null

  # others
  lang: 'zh'
  device: 'cpu'  # set 'gpu:id' or 'cpu'

  # am_block and am_pad are only used by the fastspeech2_cnndecoder_csmsc
  # model for streaming am inference.
  # When am_pad is set to 12, streaming synthetic audio is the same as
  # non-streaming synthetic audio.
  am_block: 72
  am_pad: 12

  # voc_block and voc_pad are used by the voc model for streaming voc
  # inference.
  # - mb_melgan_csmsc: with voc_pad set to 14, streaming synthetic audio is
  #   the same as non-streaming synthetic audio; the minimum pad value is 7,
  #   at which streaming synthetic audio still sounds normal.
  # - hifigan_csmsc: with voc_pad set to 19, streaming synthetic audio is the
  #   same as non-streaming synthetic audio; with voc_pad set to 14,
  #   streaming synthetic audio sounds normal.
  voc_block: 36
  voc_pad: 14

################################################################################
#                                 ENGINE CONFIG                                #
################################################################################

################################### TTS ########################################
################ speech task: tts; engine_type: online-onnx ####################
tts_online-onnx:
  # am (acoustic model)
  # choices = ['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx']
  # fastspeech2_cnndecoder_csmsc_onnx supports streaming am inference.
  am: 'fastspeech2_cnndecoder_csmsc_onnx'
  # am_ckpt is a list:
  # - if am is fastspeech2_cnndecoder_csmsc_onnx,
  #   am_ckpt = [encoder model, decoder model, postnet model];
  # - if am is fastspeech2_csmsc_onnx, am_ckpt = [ckpt model].
  am_ckpt: null  # list
  am_stat: null
  phones_dict: null
  tones_dict: null
  speaker_dict: null
  am_sample_rate: 24000
  am_sess_conf:
    device: 'cpu'  # set 'gpu:id' or 'cpu'
    use_trt: false
    cpu_threads: 4

  # voc (vocoder) choices = ['mb_melgan_csmsc_onnx', 'hifigan_csmsc_onnx']
  # Both mb_melgan_csmsc_onnx and hifigan_csmsc_onnx support streaming voc
  # inference.
  voc: 'mb_melgan_csmsc_onnx'
  voc_ckpt: null
  voc_sample_rate: 24000
  voc_sess_conf:
    device: 'cpu'  # set 'gpu:id' or 'cpu'
    use_trt: false
    cpu_threads: 4

  # others
  lang: 'zh'

  # am_block and am_pad are only used by the fastspeech2_cnndecoder_csmsc_onnx
  # model for streaming am inference.
  # When am_pad is set to 12, streaming synthetic audio is the same as
  # non-streaming synthetic audio.
  am_block: 72
  am_pad: 12

  # voc_block and voc_pad are used by the voc model for streaming voc
  # inference.
  # - mb_melgan_csmsc_onnx: with voc_pad set to 14, streaming synthetic audio
  #   is the same as non-streaming synthetic audio; the minimum pad value is
  #   7, at which streaming synthetic audio still sounds normal.
  # - hifigan_csmsc_onnx: with voc_pad set to 19, streaming synthetic audio is
  #   the same as non-streaming synthetic audio; with voc_pad set to 14,
  #   streaming synthetic audio sounds normal.
  voc_block: 36
  voc_pad: 14

  # voc_upsample should be the same as n_shift in the voc config.
  voc_upsample: 300