diff --git a/paddlespeech/server/engine/tts/online/tts_engine.py b/paddlespeech/server/engine/tts/online/tts_engine.py
index 25a8bc76..c9135b88 100644
--- a/paddlespeech/server/engine/tts/online/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/tts_engine.py
@@ -12,24 +12,329 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import base64
+import math
+import os
 import time
+from typing import Optional
 
 import numpy as np
 import paddle
+import yaml
+from yacs.config import CfgNode
 
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.tts.infer import TTSExecutor
+from paddlespeech.cli.utils import download_and_decompress
+from paddlespeech.cli.utils import MODEL_HOME
+from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.server.engine.base_engine import BaseEngine
 from paddlespeech.server.utils.audio_process import float2pcm
+from paddlespeech.server.utils.util import denorm
 from paddlespeech.server.utils.util import get_chunks
+from paddlespeech.t2s.frontend import English
+from paddlespeech.t2s.frontend.zh_frontend import Frontend
+from paddlespeech.t2s.modules.normalizer import ZScore
+
+__all__ = ['TTSEngine']
+
+# support online model
+pretrained_models = {
+    # fastspeech2
+    "fastspeech2_csmsc-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
+        'md5':
+        '637d28a5e53aa60275612ba4393d5f22',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_76000.pdz',
+        'speech_stats':
+        'speech_stats.npy',
+        'phones_dict':
+        'phone_id_map.txt',
+    },
+    "fastspeech2_cnndecoder_csmsc-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip',
+        'md5':
+        '6eb28e22ace73e0ebe7845f86478f89f',
+        'config':
+        'cnndecoder.yaml',
+        'ckpt':
+        'snapshot_iter_153000.pdz',
+        'speech_stats':
+        'speech_stats.npy',
+        'phones_dict':
+        'phone_id_map.txt',
+    },
+
+    # mb_melgan
+    "mb_melgan_csmsc-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
+        'md5':
+        'ee5f0604e20091f0d495b6ec4618b90d',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_1000000.pdz',
+        'speech_stats':
+        'feats_stats.npy',
+    },
+
+    # hifigan
+    "hifigan_csmsc-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
+        'md5':
+        'dd40a3d88dfcf64513fba2f0f961ada6',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_2500000.pdz',
+        'speech_stats':
+        'feats_stats.npy',
+    },
+}
+
+model_alias = {
+    # acoustic model
+    "fastspeech2":
+    "paddlespeech.t2s.models.fastspeech2:FastSpeech2",
+    "fastspeech2_inference":
+    "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
+
+    # voc
+    "mb_melgan":
+    "paddlespeech.t2s.models.melgan:MelGANGenerator",
+    "mb_melgan_inference":
+    "paddlespeech.t2s.models.melgan:MelGANInference",
+    "hifigan":
+    "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
+    "hifigan_inference":
+    "paddlespeech.t2s.models.hifigan:HiFiGANInference",
+}
 
 __all__ = ['TTSEngine']
 
 
 class TTSServerExecutor(TTSExecutor):
-    def __init__(self):
+    def __init__(self, am_block, am_pad, voc_block, voc_pad):
         super().__init__()
-        pass
+        self.am_block = am_block
+        self.am_pad = am_pad
+        self.voc_block = voc_block
+        self.voc_pad = voc_pad
+
+    def get_model_info(self,
+                       field: str,
+                       model_name: str,
+                       ckpt: Optional[os.PathLike],
+                       stat: Optional[os.PathLike]):
"""get model information + + Args: + field (str): am or voc + model_name (str): model type, support fastspeech2, higigan, mb_melgan + ckpt (Optional[os.PathLike]): ckpt file + stat (Optional[os.PathLike]): stat file, including mean and standard deviation + + Returns: + [module]: model module + [Tensor]: mean + [Tensor]: standard deviation + """ + + model_class = dynamic_import(model_name, model_alias) + + if field == "am": + odim = self.am_config.n_mels + model = model_class( + idim=self.vocab_size, odim=odim, **self.am_config["model"]) + model.set_state_dict(paddle.load(ckpt)["main_params"]) + + elif field == "voc": + model = model_class(**self.voc_config["generator_params"]) + model.set_state_dict(paddle.load(ckpt)["generator_params"]) + model.remove_weight_norm() + + else: + logger.error("Please set correct field, am or voc") + + model.eval() + model_mu, model_std = np.load(stat) + model_mu = paddle.to_tensor(model_mu) + model_std = paddle.to_tensor(model_std) + + return model, model_mu, model_std + + def _get_pretrained_path(self, tag: str) -> os.PathLike: + """ + Download and returns pretrained resources path of current task. + """ + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(pretrained_models[tag], + res_path) + decompressed_path = os.path.abspath(decompressed_path) + logger.info( + 'Use pretrained model stored in: {}'.format(decompressed_path)) + return decompressed_path + + def _init_from_path( + self, + am: str='fastspeech2_csmsc', + am_config: Optional[os.PathLike]=None, + am_ckpt: Optional[os.PathLike]=None, + am_stat: Optional[os.PathLike]=None, + phones_dict: Optional[os.PathLike]=None, + tones_dict: Optional[os.PathLike]=None, + speaker_dict: Optional[os.PathLike]=None, + voc: str='mb_melgan_csmsc', + voc_config: Optional[os.PathLike]=None, + voc_ckpt: Optional[os.PathLike]=None, + voc_stat: Optional[os.PathLike]=None, + lang: str='zh', ): + """ + Init model and other resources from a specific path. 
+ """ + if hasattr(self, 'am_inference') and hasattr(self, 'voc_inference'): + logger.info('Models had been initialized.') + return + # am model info + am_tag = am + '-' + lang + if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: + am_res_path = self._get_pretrained_path(am_tag) + self.am_res_path = am_res_path + self.am_config = os.path.join(am_res_path, + pretrained_models[am_tag]['config']) + self.am_ckpt = os.path.join(am_res_path, + pretrained_models[am_tag]['ckpt']) + self.am_stat = os.path.join( + am_res_path, pretrained_models[am_tag]['speech_stats']) + # must have phones_dict in acoustic + self.phones_dict = os.path.join( + am_res_path, pretrained_models[am_tag]['phones_dict']) + print("self.phones_dict:", self.phones_dict) + logger.info(am_res_path) + logger.info(self.am_config) + logger.info(self.am_ckpt) + else: + self.am_config = os.path.abspath(am_config) + self.am_ckpt = os.path.abspath(am_ckpt) + self.am_stat = os.path.abspath(am_stat) + self.phones_dict = os.path.abspath(phones_dict) + self.am_res_path = os.path.dirname(os.path.abspath(self.am_config)) + print("self.phones_dict:", self.phones_dict) + + self.tones_dict = None + self.speaker_dict = None + + # voc model info + voc_tag = voc + '-' + lang + if voc_ckpt is None or voc_config is None or voc_stat is None: + voc_res_path = self._get_pretrained_path(voc_tag) + self.voc_res_path = voc_res_path + self.voc_config = os.path.join(voc_res_path, + pretrained_models[voc_tag]['config']) + self.voc_ckpt = os.path.join(voc_res_path, + pretrained_models[voc_tag]['ckpt']) + self.voc_stat = os.path.join( + voc_res_path, pretrained_models[voc_tag]['speech_stats']) + logger.info(voc_res_path) + logger.info(self.voc_config) + logger.info(self.voc_ckpt) + else: + self.voc_config = os.path.abspath(voc_config) + self.voc_ckpt = os.path.abspath(voc_ckpt) + self.voc_stat = os.path.abspath(voc_stat) + self.voc_res_path = os.path.dirname( + os.path.abspath(self.voc_config)) + + # Init body. 
+        with open(self.am_config) as f:
+            self.am_config = CfgNode(yaml.safe_load(f))
+        with open(self.voc_config) as f:
+            self.voc_config = CfgNode(yaml.safe_load(f))
+
+        with open(self.phones_dict, "r") as f:
+            phn_id = [line.strip().split() for line in f.readlines()]
+        self.vocab_size = len(phn_id)
+        print("vocab_size:", self.vocab_size)
+
+        # frontend
+        if lang == 'zh':
+            self.frontend = Frontend(
+                phone_vocab_path=self.phones_dict,
+                tone_vocab_path=self.tones_dict)
+
+        elif lang == 'en':
+            self.frontend = English(phone_vocab_path=self.phones_dict)
+        print("frontend done!")
+
+        # am infer info
+        self.am_name = am[:am.rindex('_')]
+        if self.am_name == "fastspeech2_cnndecoder":
+            self.am_inference, self.am_mu, self.am_std = self.get_model_info(
+                "am", "fastspeech2", self.am_ckpt, self.am_stat)
+        else:
+            am, am_mu, am_std = self.get_model_info("am", self.am_name,
+                                                    self.am_ckpt, self.am_stat)
+            am_normalizer = ZScore(am_mu, am_std)
+            am_inference_class = dynamic_import(self.am_name + '_inference',
+                                                model_alias)
+            self.am_inference = am_inference_class(am_normalizer, am)
+            self.am_inference.eval()
+        print("acoustic model done!")
+
+        # voc infer info
+        self.voc_name = voc[:voc.rindex('_')]
+        voc, voc_mu, voc_std = self.get_model_info("voc", self.voc_name,
+                                                   self.voc_ckpt, self.voc_stat)
+        voc_normalizer = ZScore(voc_mu, voc_std)
+        voc_inference_class = dynamic_import(self.voc_name + '_inference',
+                                             model_alias)
+        self.voc_inference = voc_inference_class(voc_normalizer, voc)
+        self.voc_inference.eval()
+        print("voc done!")
+
+    def get_phone(self, sentence, lang, merge_sentences, get_tone_ids):
+        tone_ids = None
+        if lang == 'zh':
+            input_ids = self.frontend.get_input_ids(
+                sentence,
+                merge_sentences=merge_sentences,
+                get_tone_ids=get_tone_ids)
+            phone_ids = input_ids["phone_ids"]
+            if get_tone_ids:
+                tone_ids = input_ids["tone_ids"]
+        elif lang == 'en':
+            input_ids = self.frontend.get_input_ids(
+                sentence, merge_sentences=merge_sentences)
+            phone_ids = input_ids["phone_ids"]
+        else:
+            print("lang should be in {'zh', 'en'}!")
+
+    def depadding(self, data, chunk_num, chunk_id, block, pad, upsample):
+        """
+        Remove the padded context frames from a streaming chunk so that the concatenated chunks are seamless.
+        """
+        front_pad = min(chunk_id * block, pad)
+        # first chunk
+        if chunk_id == 0:
+            data = data[:block * upsample]
+        # last chunk
+        elif chunk_id == chunk_num - 1:
+            data = data[front_pad * upsample:]
+        # middle chunk
+        else:
+            data = data[front_pad * upsample:(front_pad + block) * upsample]
+
+        return data
 
     @paddle.no_grad()
     def infer(
@@ -37,16 +342,20 @@ class TTSServerExecutor(TTSExecutor):
             text: str,
             lang: str='zh',
             am: str='fastspeech2_csmsc',
-            spk_id: int=0,
-            am_block: int=42,
-            am_pad: int=12,
-            voc_block: int=14,
-            voc_pad: int=14, ):
+            spk_id: int=0, ):
         """
         Model inference and result stored in self.output.
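+
+        For streaming TTS this is a generator: it yields synthesized waveform
+        chunks one at a time instead of storing a full utterance, and records
+        first/final response time statistics on the executor.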
""" - am_name = am[:am.rindex('_')] - am_dataset = am[am.rindex('_') + 1:] + + am_block = self.am_block + am_pad = self.am_pad + am_upsample = 1 + voc_block = self.voc_block + voc_pad = self.voc_pad + voc_upsample = self.voc_config.n_shift + # first_flag 用于标记首包 + first_flag = 1 + get_tone_ids = False merge_sentences = False frontend_st = time.time() @@ -64,43 +373,100 @@ class TTSServerExecutor(TTSExecutor): phone_ids = input_ids["phone_ids"] else: print("lang should in {'zh', 'en'}!") - self.frontend_time = time.time() - frontend_st + frontend_et = time.time() + self.frontend_time = frontend_et - frontend_st for i in range(len(phone_ids)): - am_st = time.time() part_phone_ids = phone_ids[i] - # am - if am_name == 'speedyspeech': - part_tone_ids = tone_ids[i] - mel = self.am_inference(part_phone_ids, part_tone_ids) - # fastspeech2 + voc_chunk_id = 0 + + # fastspeech2_csmsc + if am == "fastspeech2_csmsc": + # am + mel = self.am_inference(part_phone_ids) + if first_flag == 1: + first_am_et = time.time() + self.first_am_infer = first_am_et - frontend_et + + # voc streaming + mel_chunks = get_chunks(mel, voc_block, voc_pad, "voc") + voc_chunk_num = len(mel_chunks) + voc_st = time.time() + for i, mel_chunk in enumerate(mel_chunks): + sub_wav = self.voc_inference(mel_chunk) + sub_wav = self.depadding(sub_wav, voc_chunk_num, i, + voc_block, voc_pad, voc_upsample) + if first_flag == 1: + first_voc_et = time.time() + self.first_voc_infer = first_voc_et - first_am_et + self.first_response_time = first_voc_et - frontend_st + first_flag = 0 + + yield sub_wav + + # fastspeech2_cnndecoder_csmsc + elif am == "fastspeech2_cnndecoder_csmsc": + # am + orig_hs, h_masks = self.am_inference.encoder_infer( + part_phone_ids) + + # streaming voc chunk info + mel_len = orig_hs.shape[1] + voc_chunk_num = math.ceil(mel_len / self.voc_block) + start = 0 + end = min(self.voc_block + self.voc_pad, mel_len) + + # streaming am + hss = get_chunks(orig_hs, self.am_block, self.am_pad, "am") + am_chunk_num = len(hss) + for i, hs in enumerate(hss): + before_outs, _ = self.am_inference.decoder(hs) + after_outs = before_outs + self.am_inference.postnet( + before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) + normalized_mel = after_outs[0] + sub_mel = denorm(normalized_mel, self.am_mu, self.am_std) + sub_mel = self.depadding(sub_mel, am_chunk_num, i, am_block, + am_pad, am_upsample) + + if i == 0: + mel_streaming = sub_mel + else: + mel_streaming = np.concatenate( + (mel_streaming, sub_mel), axis=0) + + # streaming voc + # 当流式AM推理的mel帧数大于流式voc推理的chunk size,开始进行流式voc 推理 + while (mel_streaming.shape[0] >= end and + voc_chunk_id < voc_chunk_num): + if first_flag == 1: + first_am_et = time.time() + self.first_am_infer = first_am_et - frontend_et + voc_chunk = mel_streaming[start:end, :] + voc_chunk = paddle.to_tensor(voc_chunk) + sub_wav = self.voc_inference(voc_chunk) + + sub_wav = self.depadding(sub_wav, voc_chunk_num, + voc_chunk_id, voc_block, + voc_pad, voc_upsample) + if first_flag == 1: + first_voc_et = time.time() + self.first_voc_infer = first_voc_et - first_am_et + self.first_response_time = first_voc_et - frontend_st + first_flag = 0 + + yield sub_wav + + voc_chunk_id += 1 + start = max(0, voc_chunk_id * voc_block - voc_pad) + end = min((voc_chunk_id + 1) * voc_block + voc_pad, + mel_len) + else: - # multi speaker - if am_dataset in {"aishell3", "vctk"}: - mel = self.am_inference( - part_phone_ids, spk_id=paddle.to_tensor(spk_id)) - else: - mel = self.am_inference(part_phone_ids) - am_et = time.time() - - # voc 
-            voc_upsample = self.voc_config.n_shift
-            mel_chunks = get_chunks(mel, voc_block, voc_pad, "voc")
-            chunk_num = len(mel_chunks)
-            voc_st = time.time()
-            for i, mel_chunk in enumerate(mel_chunks):
-                sub_wav = self.voc_inference(mel_chunk)
-                front_pad = min(i * voc_block, voc_pad)
-
-                if i == 0:
-                    sub_wav = sub_wav[:voc_block * voc_upsample]
-                elif i == chunk_num - 1:
-                    sub_wav = sub_wav[front_pad * voc_upsample:]
-                else:
-                    sub_wav = sub_wav[front_pad * voc_upsample:(
-                        front_pad + voc_block) * voc_upsample]
-
-                yield sub_wav
+                logger.error(
+                    "Streaming TTS only supports fastspeech2_csmsc or fastspeech2_cnndecoder_csmsc."
+                )
+
+        self.final_response_time = time.time() - frontend_st
 
 
 class TTSEngine(BaseEngine):
@@ -113,14 +479,21 @@ class TTSEngine(BaseEngine):
     def __init__(self, name=None):
         """Initialize TTS server engine
         """
-        super(TTSEngine, self).__init__()
+        super().__init__()
 
     def init(self, config: dict) -> bool:
-        self.executor = TTSServerExecutor()
         self.config = config
-        assert "fastspeech2_csmsc" in config.am and (
-            config.voc == "hifigan_csmsc-zh" or config.voc == "mb_melgan_csmsc"
-        ), 'Please check config, am support: fastspeech2, voc support: hifigan_csmsc-zh or mb_melgan_csmsc.'
+        assert (
+            config.am == "fastspeech2_csmsc" or
+            config.am == "fastspeech2_cnndecoder_csmsc"
+        ) and (
+            config.voc == "hifigan_csmsc" or config.voc == "mb_melgan_csmsc"
+        ), 'Please check config, am supports fastspeech2_csmsc and fastspeech2_cnndecoder_csmsc; voc supports hifigan_csmsc and mb_melgan_csmsc.'
+
+        assert (
+            config.voc_block > 0 and config.voc_pad > 0
+        ), "Please set correct voc_block and voc_pad, they should be more than 0."
+
         try:
             if self.config.device:
                 self.device = self.config.device
@@ -135,6 +508,9 @@ class TTSEngine(BaseEngine):
                 (self.device))
             return False
 
+        self.executor = TTSServerExecutor(config.am_block, config.am_pad,
+                                          config.voc_block, config.voc_pad)
+
         try:
             self.executor._init_from_path(
                 am=self.config.am,
@@ -155,15 +531,42 @@ class TTSEngine(BaseEngine):
                 (self.device))
             return False
 
-        self.am_block = self.config.am_block
-        self.am_pad = self.config.am_pad
-        self.voc_block = self.config.voc_block
-        self.voc_pad = self.config.voc_pad
-
         logger.info("Initialize TTS server engine successfully on device: %s." %
                     (self.device))
+
+        # warm up
+        try:
+            self.warm_up()
+        except Exception as e:
+            logger.error("Failed to warm up on tts engine.")
+            return False
+
         return True
 
+    def warm_up(self):
+        """warm up
+        """
+        if self.config.lang == 'zh':
+            sentence = "您好,欢迎使用语音合成服务。"
+        elif self.config.lang == 'en':
+            sentence = "Hello and welcome to the speech synthesis service."
+        logger.info(
+            "*******************************warm up ********************************"
+        )
+        for i in range(3):
+            for wav in self.executor.infer(
+                    text=sentence,
+                    lang=self.config.lang,
+                    am=self.config.am,
+                    spk_id=0, ):
+                logger.info(
+                    f"The first response time of warm-up iteration {i}: {self.executor.first_response_time} s"
+                )
+                break
+        logger.info(
+            "**********************************************************************"
+        )
+
     def preprocess(self, text_bese64: str=None, text_bytes: bytes=None):
         # Convert byte to text
         if text_bese64:
@@ -195,18 +598,14 @@ class TTSEngine(BaseEngine):
             wav_base64: The base64 format of the synthesized audio.
""" - lang = self.config.lang wav_list = [] for wav in self.executor.infer( text=sentence, - lang=lang, + lang=self.config.lang, am=self.config.am, - spk_id=spk_id, - am_block=self.am_block, - am_pad=self.am_pad, - voc_block=self.voc_block, - voc_pad=self.voc_pad): + spk_id=spk_id, ): + # wav type: float32, convert to pcm (base64) wav = float2pcm(wav) # float32 to int16 wav_bytes = wav.tobytes() # to bytes @@ -216,5 +615,14 @@ class TTSEngine(BaseEngine): yield wav_base64 wav_all = np.concatenate(wav_list, axis=0) - logger.info("The durations of audio is: {} s".format( - len(wav_all) / self.executor.am_config.fs)) + duration = len(wav_all) / self.executor.am_config.fs + logger.info(f"sentence: {sentence}") + logger.info(f"The durations of audio is: {duration} s") + logger.info( + f"first response time: {self.executor.first_response_time} s") + logger.info( + f"final response time: {self.executor.final_response_time} s") + logger.info(f"RTF: {self.executor.final_response_time / duration}") + logger.info( + f"Other info: front time: {self.executor.frontend_time} s, first am infer time: {self.executor.first_am_infer} s, first voc infer time: {self.executor.first_voc_infer} s," + ) diff --git a/paddlespeech/server/utils/util.py b/paddlespeech/server/utils/util.py index 0fe70849..72ee0060 100644 --- a/paddlespeech/server/utils/util.py +++ b/paddlespeech/server/utils/util.py @@ -52,6 +52,10 @@ def get_chunks(data, block_size, pad_size, step): Returns: list: chunks list """ + + if block_size == -1: + return [data] + if step == "am": data_len = data.shape[1] elif step == "voc":