Merge branch 'tts-server3' of https://github.com/lym0302/PaddleSpeech into tts-server3

pull/1425/head
lym0302 3 years ago
commit e354848cb3

@@ -14,7 +14,7 @@ port: 8692
am: 'fastspeech2_csmsc' am: 'fastspeech2_csmsc'
am_model: # the pdmodel file of am static model am_model: # the pdmodel file of am static model
am_params: # the pdiparams file of am static model am_params: # the pdiparams file of am static model
sample_rate: 24000 am_sample_rate: 24000
phones_dict: phones_dict:
tones_dict: tones_dict:
speaker_dict: speaker_dict:
@@ -33,6 +33,7 @@ am_predictor_conf:
voc: 'pwgan_csmsc' voc: 'pwgan_csmsc'
voc_model: # the pdmodel file of vocoder static model voc_model: # the pdmodel file of vocoder static model
voc_params: # the pdiparams file of vocoder static model voc_params: # the pdiparams file of vocoder static model
voc_sample_rate: 24000
voc_predictor_conf: voc_predictor_conf:
use_gpu: True use_gpu: True

@@ -83,6 +83,8 @@ pretrained_models = {
'pwgan_csmsc.pdmodel', 'pwgan_csmsc.pdmodel',
'params': 'params':
'pwgan_csmsc.pdiparams', 'pwgan_csmsc.pdiparams',
'sample_rate':
24000,
}, },
# mb_melgan # mb_melgan
"mb_melgan_csmsc-zh": { "mb_melgan_csmsc-zh": {
@@ -94,6 +96,8 @@ pretrained_models = {
'mb_melgan_csmsc.pdmodel', 'mb_melgan_csmsc.pdmodel',
'params': 'params':
'mb_melgan_csmsc.pdiparams', 'mb_melgan_csmsc.pdiparams',
'sample_rate':
24000,
}, },
# hifigan # hifigan
"hifigan_csmsc-zh": { "hifigan_csmsc-zh": {
@@ -105,6 +109,8 @@ pretrained_models = {
'hifigan_csmsc.pdmodel', 'hifigan_csmsc.pdmodel',
'params': 'params':
'hifigan_csmsc.pdiparams', 'hifigan_csmsc.pdiparams',
'sample_rate':
24000,
}, },
} }
@@ -141,13 +147,14 @@ class TTSServerExecutor(TTSExecutor):
am: str='fastspeech2_csmsc', am: str='fastspeech2_csmsc',
am_model: Optional[os.PathLike]=None, am_model: Optional[os.PathLike]=None,
am_params: Optional[os.PathLike]=None, am_params: Optional[os.PathLike]=None,
sample_rate: int=24000, am_sample_rate: int=24000,
phones_dict: Optional[os.PathLike]=None, phones_dict: Optional[os.PathLike]=None,
tones_dict: Optional[os.PathLike]=None, tones_dict: Optional[os.PathLike]=None,
speaker_dict: Optional[os.PathLike]=None, speaker_dict: Optional[os.PathLike]=None,
voc: str='pwgan_csmsc', voc: str='pwgan_csmsc',
voc_model: Optional[os.PathLike]=None, voc_model: Optional[os.PathLike]=None,
voc_params: Optional[os.PathLike]=None, voc_params: Optional[os.PathLike]=None,
voc_sample_rate: int=24000,
lang: str='zh', lang: str='zh',
am_predictor_conf: dict=None, am_predictor_conf: dict=None,
voc_predictor_conf: dict=None, ): voc_predictor_conf: dict=None, ):
@@ -169,7 +176,7 @@ class TTSServerExecutor(TTSExecutor):
# must have phones_dict in acoustic # must have phones_dict in acoustic
self.phones_dict = os.path.join( self.phones_dict = os.path.join(
am_res_path, pretrained_models[am_tag]['phones_dict']) am_res_path, pretrained_models[am_tag]['phones_dict'])
self.sample_rate = pretrained_models[am_tag]['sample_rate'] self.am_sample_rate = pretrained_models[am_tag]['sample_rate']
logger.info(am_res_path) logger.info(am_res_path)
logger.info(self.am_model) logger.info(self.am_model)
@@ -178,7 +185,7 @@ class TTSServerExecutor(TTSExecutor):
self.am_model = os.path.abspath(am_model) self.am_model = os.path.abspath(am_model)
self.am_params = os.path.abspath(am_params) self.am_params = os.path.abspath(am_params)
self.phones_dict = os.path.abspath(phones_dict) self.phones_dict = os.path.abspath(phones_dict)
self.sample_rate = sample_rate self.am_sample_rate = am_sample_rate
self.am_res_path = os.path.dirname(os.path.abspath(self.am_model)) self.am_res_path = os.path.dirname(os.path.abspath(self.am_model))
print("self.phones_dict:", self.phones_dict) print("self.phones_dict:", self.phones_dict)
@@ -207,14 +214,17 @@ class TTSServerExecutor(TTSExecutor):
pretrained_models[voc_tag]['model']) pretrained_models[voc_tag]['model'])
self.voc_params = os.path.join(voc_res_path, self.voc_params = os.path.join(voc_res_path,
pretrained_models[voc_tag]['params']) pretrained_models[voc_tag]['params'])
self.voc_sample_rate = pretrained_models[voc_tag]['sample_rate']
logger.info(voc_res_path) logger.info(voc_res_path)
logger.info(self.voc_model) logger.info(self.voc_model)
logger.info(self.voc_params) logger.info(self.voc_params)
else: else:
self.voc_model = os.path.abspath(voc_model) self.voc_model = os.path.abspath(voc_model)
self.voc_params = os.path.abspath(voc_params) self.voc_params = os.path.abspath(voc_params)
self.voc_sample_rate = voc_sample_rate
self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_model)) self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_model))
assert (self.voc_sample_rate == self.am_sample_rate)
# Init body. # Init body.
with open(self.phones_dict, "r") as f: with open(self.phones_dict, "r") as f:
phn_id = [line.strip().split() for line in f.readlines()] phn_id = [line.strip().split() for line in f.readlines()]
@@ -343,13 +353,14 @@ class TTSEngine(BaseEngine):
am=self.conf_dict["am"], am=self.conf_dict["am"],
am_model=self.conf_dict["am_model"], am_model=self.conf_dict["am_model"],
am_params=self.conf_dict["am_params"], am_params=self.conf_dict["am_params"],
sample_rate=self.conf_dict["sample_rate"], am_sample_rate=self.conf_dict["am_sample_rate"],
phones_dict=self.conf_dict["phones_dict"], phones_dict=self.conf_dict["phones_dict"],
tones_dict=self.conf_dict["tones_dict"], tones_dict=self.conf_dict["tones_dict"],
speaker_dict=self.conf_dict["speaker_dict"], speaker_dict=self.conf_dict["speaker_dict"],
voc=self.conf_dict["voc"], voc=self.conf_dict["voc"],
voc_model=self.conf_dict["voc_model"], voc_model=self.conf_dict["voc_model"],
voc_params=self.conf_dict["voc_params"], voc_params=self.conf_dict["voc_params"],
voc_sample_rate=self.conf_dict["voc_sample_rate"],
lang=self.conf_dict["lang"], lang=self.conf_dict["lang"],
am_predictor_conf=self.conf_dict["am_predictor_conf"], am_predictor_conf=self.conf_dict["am_predictor_conf"],
voc_predictor_conf=self.conf_dict["voc_predictor_conf"], ) voc_predictor_conf=self.conf_dict["voc_predictor_conf"], )
@@ -451,7 +462,7 @@ class TTSEngine(BaseEngine):
try: try:
target_sample_rate, wav_base64 = self.postprocess( target_sample_rate, wav_base64 = self.postprocess(
wav=self.executor._outputs['wav'].numpy(), wav=self.executor._outputs['wav'].numpy(),
original_fs=self.executor.sample_rate, original_fs=self.executor.am_sample_rate,
target_fs=sample_rate, target_fs=sample_rate,
volume=volume, volume=volume,
speed=speed, speed=speed,

Loading…
Cancel
Save