|
|
|
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
|
|
|
#
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
#
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
#
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
import base64
|
|
|
|
|
import math
|
|
|
|
|
import os
|
|
|
|
|
import time
|
|
|
|
|
from typing import Optional
|
|
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
import paddle
|
|
|
|
|
|
|
|
|
|
from paddlespeech.cli.log import logger
|
|
|
|
|
from paddlespeech.cli.tts.infer import TTSExecutor
|
|
|
|
|
from paddlespeech.cli.utils import download_and_decompress
|
|
|
|
|
from paddlespeech.cli.utils import MODEL_HOME
|
|
|
|
|
from paddlespeech.server.engine.base_engine import BaseEngine
|
|
|
|
|
from paddlespeech.server.utils.audio_process import float2pcm
|
|
|
|
|
from paddlespeech.server.utils.onnx_infer import get_sess
|
|
|
|
|
from paddlespeech.server.utils.util import denorm
|
|
|
|
|
from paddlespeech.server.utils.util import get_chunks
|
|
|
|
|
from paddlespeech.t2s.frontend import English
|
|
|
|
|
from paddlespeech.t2s.frontend.zh_frontend import Frontend
|
|
|
|
|
|
|
|
|
|
__all__ = ['TTSEngine']
|
|
|
|
|
|
|
|
|
|
# support online model
|
|
|
|
|
pretrained_models = {
|
|
|
|
|
# fastspeech2
|
|
|
|
|
"fastspeech2_csmsc_onnx-zh": {
|
|
|
|
|
'url':
|
|
|
|
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip',
|
|
|
|
|
'md5':
|
|
|
|
|
'fd3ad38d83273ad51f0ea4f4abf3ab4e',
|
|
|
|
|
'ckpt': ['fastspeech2_csmsc.onnx'],
|
|
|
|
|
'phones_dict':
|
|
|
|
|
'phone_id_map.txt',
|
|
|
|
|
'sample_rate':
|
|
|
|
|
24000,
|
|
|
|
|
},
|
|
|
|
|
"fastspeech2_cnndecoder_csmsc_onnx-zh": {
|
|
|
|
|
'url':
|
|
|
|
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip',
|
|
|
|
|
'md5':
|
|
|
|
|
'5f70e1a6bcd29d72d54e7931aa86f266',
|
|
|
|
|
'ckpt': [
|
|
|
|
|
'fastspeech2_csmsc_am_encoder_infer.onnx',
|
|
|
|
|
'fastspeech2_csmsc_am_decoder.onnx',
|
|
|
|
|
'fastspeech2_csmsc_am_postnet.onnx',
|
|
|
|
|
],
|
|
|
|
|
'speech_stats':
|
|
|
|
|
'speech_stats.npy',
|
|
|
|
|
'phones_dict':
|
|
|
|
|
'phone_id_map.txt',
|
|
|
|
|
'sample_rate':
|
|
|
|
|
24000,
|
|
|
|
|
},
|
|
|
|
|
|
|
|
|
|
# mb_melgan
|
|
|
|
|
"mb_melgan_csmsc_onnx-zh": {
|
|
|
|
|
'url':
|
|
|
|
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip',
|
|
|
|
|
'md5':
|
|
|
|
|
'5b83ec746e8414bc29032d954ffd07ec',
|
|
|
|
|
'ckpt':
|
|
|
|
|
'mb_melgan_csmsc.onnx',
|
|
|
|
|
'sample_rate':
|
|
|
|
|
24000,
|
|
|
|
|
},
|
|
|
|
|
|
|
|
|
|
# hifigan
|
|
|
|
|
"hifigan_csmsc_onnx-zh": {
|
|
|
|
|
'url':
|
|
|
|
|
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip',
|
|
|
|
|
'md5':
|
|
|
|
|
'1a7dc0385875889e46952e50c0994a6b',
|
|
|
|
|
'ckpt':
|
|
|
|
|
'hifigan_csmsc.onnx',
|
|
|
|
|
'sample_rate':
|
|
|
|
|
24000,
|
|
|
|
|
},
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
model_alias = {
|
|
|
|
|
# acoustic model
|
|
|
|
|
"fastspeech2":
|
|
|
|
|
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
|
|
|
|
|
"fastspeech2_inference":
|
|
|
|
|
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
|
|
|
|
|
|
|
|
|
|
# voc
|
|
|
|
|
"mb_melgan":
|
|
|
|
|
"paddlespeech.t2s.models.melgan:MelGANGenerator",
|
|
|
|
|
"mb_melgan_inference":
|
|
|
|
|
"paddlespeech.t2s.models.melgan:MelGANInference",
|
|
|
|
|
"hifigan":
|
|
|
|
|
"paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
|
|
|
|
|
"hifigan_inference":
|
|
|
|
|
"paddlespeech.t2s.models.hifigan:HiFiGANInference",
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__all__ = ['TTSEngine']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TTSServerExecutor(TTSExecutor):
|
|
|
|
|
def __init__(self, am_block, am_pad, voc_block, voc_pad, voc_upsample):
|
|
|
|
|
super().__init__()
|
|
|
|
|
self.am_block = am_block
|
|
|
|
|
self.am_pad = am_pad
|
|
|
|
|
self.voc_block = voc_block
|
|
|
|
|
self.voc_pad = voc_pad
|
|
|
|
|
self.voc_upsample = voc_upsample
|
|
|
|
|
|
|
|
|
|
self.pretrained_models = pretrained_models
|
|
|
|
|
self.model_alias = model_alias
|
|
|
|
|
|
|
|
|
|
def _get_pretrained_path(self, tag: str) -> os.PathLike:
|
|
|
|
|
"""
|
|
|
|
|
#Download and returns pretrained resources path of current task.
|
|
|
|
|
"""
|
|
|
|
|
support_models = list(pretrained_models.keys())
|
|
|
|
|
assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
|
|
|
|
|
tag, '\n\t\t'.join(support_models))
|
|
|
|
|
|
|
|
|
|
res_path = os.path.join(MODEL_HOME, tag)
|
|
|
|
|
decompressed_path = download_and_decompress(pretrained_models[tag],
|
|
|
|
|
res_path)
|
|
|
|
|
decompressed_path = os.path.abspath(decompressed_path)
|
|
|
|
|
logger.info(
|
|
|
|
|
'Use pretrained model stored in: {}'.format(decompressed_path))
|
|
|
|
|
return decompressed_path
|
|
|
|
|
|
|
|
|
|
def _init_from_path(
|
|
|
|
|
self,
|
|
|
|
|
am: str='fastspeech2_csmsc_onnx',
|
|
|
|
|
am_ckpt: Optional[list]=None,
|
|
|
|
|
am_stat: Optional[os.PathLike]=None,
|
|
|
|
|
phones_dict: Optional[os.PathLike]=None,
|
|
|
|
|
tones_dict: Optional[os.PathLike]=None,
|
|
|
|
|
speaker_dict: Optional[os.PathLike]=None,
|
|
|
|
|
am_sample_rate: int=24000,
|
|
|
|
|
am_sess_conf: dict=None,
|
|
|
|
|
voc: str='mb_melgan_csmsc_onnx',
|
|
|
|
|
voc_ckpt: Optional[os.PathLike]=None,
|
|
|
|
|
voc_sample_rate: int=24000,
|
|
|
|
|
voc_sess_conf: dict=None,
|
|
|
|
|
lang: str='zh', ):
|
|
|
|
|
"""
|
|
|
|
|
Init model and other resources from a specific path.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
if (hasattr(self, 'am_sess') or
|
|
|
|
|
(hasattr(self, 'am_encoder_infer_sess') and
|
|
|
|
|
hasattr(self, 'am_decoder_sess') and hasattr(
|
|
|
|
|
self, 'am_postnet_sess'))) and hasattr(self, 'voc_inference'):
|
|
|
|
|
logger.info('Models had been initialized.')
|
|
|
|
|
return
|
|
|
|
|
# am
|
|
|
|
|
am_tag = am + '-' + lang
|
|
|
|
|
if am == "fastspeech2_csmsc_onnx":
|
|
|
|
|
# get model info
|
|
|
|
|
if am_ckpt is None or phones_dict is None:
|
|
|
|
|
am_res_path = self._get_pretrained_path(am_tag)
|
|
|
|
|
self.am_res_path = am_res_path
|
|
|
|
|
self.am_ckpt = os.path.join(
|
|
|
|
|
am_res_path, pretrained_models[am_tag]['ckpt'][0])
|
|
|
|
|
# must have phones_dict in acoustic
|
|
|
|
|
self.phones_dict = os.path.join(
|
|
|
|
|
am_res_path, pretrained_models[am_tag]['phones_dict'])
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
self.am_ckpt = os.path.abspath(am_ckpt[0])
|
|
|
|
|
self.phones_dict = os.path.abspath(phones_dict)
|
|
|
|
|
self.am_res_path = os.path.dirname(
|
|
|
|
|
os.path.abspath(self.am_ckpt))
|
|
|
|
|
|
|
|
|
|
# create am sess
|
|
|
|
|
self.am_sess = get_sess(self.am_ckpt, am_sess_conf)
|
|
|
|
|
|
|
|
|
|
elif am == "fastspeech2_cnndecoder_csmsc_onnx":
|
|
|
|
|
if am_ckpt is None or am_stat is None or phones_dict is None:
|
|
|
|
|
am_res_path = self._get_pretrained_path(am_tag)
|
|
|
|
|
self.am_res_path = am_res_path
|
|
|
|
|
self.am_encoder_infer = os.path.join(
|
|
|
|
|
am_res_path, pretrained_models[am_tag]['ckpt'][0])
|
|
|
|
|
self.am_decoder = os.path.join(
|
|
|
|
|
am_res_path, pretrained_models[am_tag]['ckpt'][1])
|
|
|
|
|
self.am_postnet = os.path.join(
|
|
|
|
|
am_res_path, pretrained_models[am_tag]['ckpt'][2])
|
|
|
|
|
# must have phones_dict in acoustic
|
|
|
|
|
self.phones_dict = os.path.join(
|
|
|
|
|
am_res_path, pretrained_models[am_tag]['phones_dict'])
|
|
|
|
|
self.am_stat = os.path.join(
|
|
|
|
|
am_res_path, pretrained_models[am_tag]['speech_stats'])
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
self.am_encoder_infer = os.path.abspath(am_ckpt[0])
|
|
|
|
|
self.am_decoder = os.path.abspath(am_ckpt[1])
|
|
|
|
|
self.am_postnet = os.path.abspath(am_ckpt[2])
|
|
|
|
|
self.phones_dict = os.path.abspath(phones_dict)
|
|
|
|
|
self.am_stat = os.path.abspath(am_stat)
|
|
|
|
|
self.am_res_path = os.path.dirname(
|
|
|
|
|
os.path.abspath(self.am_ckpt))
|
|
|
|
|
|
|
|
|
|
# create am sess
|
|
|
|
|
self.am_encoder_infer_sess = get_sess(self.am_encoder_infer,
|
|
|
|
|
am_sess_conf)
|
|
|
|
|
self.am_decoder_sess = get_sess(self.am_decoder, am_sess_conf)
|
|
|
|
|
self.am_postnet_sess = get_sess(self.am_postnet, am_sess_conf)
|
|
|
|
|
|
|
|
|
|
self.am_mu, self.am_std = np.load(self.am_stat)
|
|
|
|
|
|
|
|
|
|
logger.info(f"self.phones_dict: {self.phones_dict}")
|
|
|
|
|
logger.info(f"am model dir: {self.am_res_path}")
|
|
|
|
|
logger.info("Create am sess successfully.")
|
|
|
|
|
|
|
|
|
|
# voc model info
|
|
|
|
|
voc_tag = voc + '-' + lang
|
|
|
|
|
if voc_ckpt is None:
|
|
|
|
|
voc_res_path = self._get_pretrained_path(voc_tag)
|
|
|
|
|
self.voc_res_path = voc_res_path
|
|
|
|
|
self.voc_ckpt = os.path.join(voc_res_path,
|
|
|
|
|
pretrained_models[voc_tag]['ckpt'])
|
|
|
|
|
else:
|
|
|
|
|
self.voc_ckpt = os.path.abspath(voc_ckpt)
|
|
|
|
|
self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_ckpt))
|
|
|
|
|
logger.info(self.voc_res_path)
|
|
|
|
|
|
|
|
|
|
# create voc sess
|
|
|
|
|
self.voc_sess = get_sess(self.voc_ckpt, voc_sess_conf)
|
|
|
|
|
logger.info("Create voc sess successfully.")
|
|
|
|
|
|
|
|
|
|
with open(self.phones_dict, "r") as f:
|
|
|
|
|
phn_id = [line.strip().split() for line in f.readlines()]
|
|
|
|
|
self.vocab_size = len(phn_id)
|
|
|
|
|
logger.info(f"vocab_size: {self.vocab_size}")
|
|
|
|
|
|
|
|
|
|
# frontend
|
|
|
|
|
self.tones_dict = None
|
|
|
|
|
if lang == 'zh':
|
|
|
|
|
self.frontend = Frontend(
|
|
|
|
|
phone_vocab_path=self.phones_dict,
|
|
|
|
|
tone_vocab_path=self.tones_dict)
|
|
|
|
|
|
|
|
|
|
elif lang == 'en':
|
|
|
|
|
self.frontend = English(phone_vocab_path=self.phones_dict)
|
|
|
|
|
logger.info("frontend done!")
|
|
|
|
|
|
|
|
|
|
def depadding(self, data, chunk_num, chunk_id, block, pad, upsample):
|
|
|
|
|
"""
|
|
|
|
|
Streaming inference removes the result of pad inference
|
|
|
|
|
"""
|
|
|
|
|
front_pad = min(chunk_id * block, pad)
|
|
|
|
|
# first chunk
|
|
|
|
|
if chunk_id == 0:
|
|
|
|
|
data = data[:block * upsample]
|
|
|
|
|
# last chunk
|
|
|
|
|
elif chunk_id == chunk_num - 1:
|
|
|
|
|
data = data[front_pad * upsample:]
|
|
|
|
|
# middle chunk
|
|
|
|
|
else:
|
|
|
|
|
data = data[front_pad * upsample:(front_pad + block) * upsample]
|
|
|
|
|
|
|
|
|
|
return data
|
|
|
|
|
|
|
|
|
|
@paddle.no_grad()
|
|
|
|
|
def infer(
|
|
|
|
|
self,
|
|
|
|
|
text: str,
|
|
|
|
|
lang: str='zh',
|
|
|
|
|
am: str='fastspeech2_csmsc_onnx',
|
|
|
|
|
spk_id: int=0, ):
|
|
|
|
|
"""
|
|
|
|
|
Model inference and result stored in self.output.
|
|
|
|
|
"""
|
|
|
|
|
#import pdb;pdb.set_trace()
|
|
|
|
|
|
|
|
|
|
am_block = self.am_block
|
|
|
|
|
am_pad = self.am_pad
|
|
|
|
|
am_upsample = 1
|
|
|
|
|
voc_block = self.voc_block
|
|
|
|
|
voc_pad = self.voc_pad
|
|
|
|
|
voc_upsample = self.voc_upsample
|
|
|
|
|
# first_flag 用于标记首包
|
|
|
|
|
first_flag = 1
|
|
|
|
|
get_tone_ids = False
|
|
|
|
|
merge_sentences = False
|
|
|
|
|
|
|
|
|
|
# front
|
|
|
|
|
frontend_st = time.time()
|
|
|
|
|
if lang == 'zh':
|
|
|
|
|
input_ids = self.frontend.get_input_ids(
|
|
|
|
|
text,
|
|
|
|
|
merge_sentences=merge_sentences,
|
|
|
|
|
get_tone_ids=get_tone_ids)
|
|
|
|
|
phone_ids = input_ids["phone_ids"]
|
|
|
|
|
if get_tone_ids:
|
|
|
|
|
tone_ids = input_ids["tone_ids"]
|
|
|
|
|
elif lang == 'en':
|
|
|
|
|
input_ids = self.frontend.get_input_ids(
|
|
|
|
|
text, merge_sentences=merge_sentences)
|
|
|
|
|
phone_ids = input_ids["phone_ids"]
|
|
|
|
|
else:
|
|
|
|
|
logger.error("lang should in {'zh', 'en'}!")
|
|
|
|
|
frontend_et = time.time()
|
|
|
|
|
self.frontend_time = frontend_et - frontend_st
|
|
|
|
|
|
|
|
|
|
for i in range(len(phone_ids)):
|
|
|
|
|
part_phone_ids = phone_ids[i].numpy()
|
|
|
|
|
voc_chunk_id = 0
|
|
|
|
|
|
|
|
|
|
# fastspeech2_csmsc
|
|
|
|
|
if am == "fastspeech2_csmsc_onnx":
|
|
|
|
|
# am
|
|
|
|
|
mel = self.am_sess.run(
|
|
|
|
|
output_names=None, input_feed={'text': part_phone_ids})
|
|
|
|
|
mel = mel[0]
|
|
|
|
|
if first_flag == 1:
|
|
|
|
|
first_am_et = time.time()
|
|
|
|
|
self.first_am_infer = first_am_et - frontend_et
|
|
|
|
|
|
|
|
|
|
# voc streaming
|
|
|
|
|
mel_chunks = get_chunks(mel, voc_block, voc_pad, "voc")
|
|
|
|
|
voc_chunk_num = len(mel_chunks)
|
|
|
|
|
voc_st = time.time()
|
|
|
|
|
for i, mel_chunk in enumerate(mel_chunks):
|
|
|
|
|
sub_wav = self.voc_sess.run(
|
|
|
|
|
output_names=None, input_feed={'logmel': mel_chunk})
|
|
|
|
|
sub_wav = self.depadding(sub_wav[0], voc_chunk_num, i,
|
|
|
|
|
voc_block, voc_pad, voc_upsample)
|
|
|
|
|
if first_flag == 1:
|
|
|
|
|
first_voc_et = time.time()
|
|
|
|
|
self.first_voc_infer = first_voc_et - first_am_et
|
|
|
|
|
self.first_response_time = first_voc_et - frontend_st
|
|
|
|
|
first_flag = 0
|
|
|
|
|
|
|
|
|
|
yield sub_wav
|
|
|
|
|
|
|
|
|
|
# fastspeech2_cnndecoder_csmsc
|
|
|
|
|
elif am == "fastspeech2_cnndecoder_csmsc_onnx":
|
|
|
|
|
# am
|
|
|
|
|
orig_hs = self.am_encoder_infer_sess.run(
|
|
|
|
|
None, input_feed={'text': part_phone_ids})
|
|
|
|
|
orig_hs = orig_hs[0]
|
|
|
|
|
|
|
|
|
|
# streaming voc chunk info
|
|
|
|
|
mel_len = orig_hs.shape[1]
|
|
|
|
|
voc_chunk_num = math.ceil(mel_len / self.voc_block)
|
|
|
|
|
start = 0
|
|
|
|
|
end = min(self.voc_block + self.voc_pad, mel_len)
|
|
|
|
|
|
|
|
|
|
# streaming am
|
|
|
|
|
hss = get_chunks(orig_hs, self.am_block, self.am_pad, "am")
|
|
|
|
|
am_chunk_num = len(hss)
|
|
|
|
|
for i, hs in enumerate(hss):
|
|
|
|
|
am_decoder_output = self.am_decoder_sess.run(
|
|
|
|
|
None, input_feed={'xs': hs})
|
|
|
|
|
am_postnet_output = self.am_postnet_sess.run(
|
|
|
|
|
None,
|
|
|
|
|
input_feed={
|
|
|
|
|
'xs': np.transpose(am_decoder_output[0], (0, 2, 1))
|
|
|
|
|
})
|
|
|
|
|
am_output_data = am_decoder_output + np.transpose(
|
|
|
|
|
am_postnet_output[0], (0, 2, 1))
|
|
|
|
|
normalized_mel = am_output_data[0][0]
|
|
|
|
|
|
|
|
|
|
sub_mel = denorm(normalized_mel, self.am_mu, self.am_std)
|
|
|
|
|
sub_mel = self.depadding(sub_mel, am_chunk_num, i, am_block,
|
|
|
|
|
am_pad, am_upsample)
|
|
|
|
|
|
|
|
|
|
if i == 0:
|
|
|
|
|
mel_streaming = sub_mel
|
|
|
|
|
else:
|
|
|
|
|
mel_streaming = np.concatenate(
|
|
|
|
|
(mel_streaming, sub_mel), axis=0)
|
|
|
|
|
|
|
|
|
|
# streaming voc
|
|
|
|
|
# 当流式AM推理的mel帧数大于流式voc推理的chunk size,开始进行流式voc 推理
|
|
|
|
|
while (mel_streaming.shape[0] >= end and
|
|
|
|
|
voc_chunk_id < voc_chunk_num):
|
|
|
|
|
if first_flag == 1:
|
|
|
|
|
first_am_et = time.time()
|
|
|
|
|
self.first_am_infer = first_am_et - frontend_et
|
|
|
|
|
voc_chunk = mel_streaming[start:end, :]
|
|
|
|
|
|
|
|
|
|
sub_wav = self.voc_sess.run(
|
|
|
|
|
output_names=None, input_feed={'logmel': voc_chunk})
|
|
|
|
|
sub_wav = self.depadding(sub_wav[0], voc_chunk_num,
|
|
|
|
|
voc_chunk_id, voc_block,
|
|
|
|
|
voc_pad, voc_upsample)
|
|
|
|
|
if first_flag == 1:
|
|
|
|
|
first_voc_et = time.time()
|
|
|
|
|
self.first_voc_infer = first_voc_et - first_am_et
|
|
|
|
|
self.first_response_time = first_voc_et - frontend_st
|
|
|
|
|
first_flag = 0
|
|
|
|
|
|
|
|
|
|
yield sub_wav
|
|
|
|
|
|
|
|
|
|
voc_chunk_id += 1
|
|
|
|
|
start = max(0, voc_chunk_id * voc_block - voc_pad)
|
|
|
|
|
end = min((voc_chunk_id + 1) * voc_block + voc_pad,
|
|
|
|
|
mel_len)
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
logger.error(
|
|
|
|
|
"Only support fastspeech2_csmsc or fastspeech2_cnndecoder_csmsc on streaming tts."
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
self.final_response_time = time.time() - frontend_st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TTSEngine(BaseEngine):
|
|
|
|
|
"""TTS server engine
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
metaclass: Defaults to Singleton.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(self, name=None):
|
|
|
|
|
"""Initialize TTS server engine
|
|
|
|
|
"""
|
|
|
|
|
super().__init__()
|
|
|
|
|
|
|
|
|
|
def init(self, config: dict) -> bool:
|
|
|
|
|
self.config = config
|
|
|
|
|
assert (
|
|
|
|
|
self.config.am == "fastspeech2_csmsc_onnx" or
|
|
|
|
|
self.config.am == "fastspeech2_cnndecoder_csmsc_onnx"
|
|
|
|
|
) and (
|
|
|
|
|
self.config.voc == "hifigan_csmsc_onnx" or
|
|
|
|
|
self.config.voc == "mb_melgan_csmsc_onnx"
|
|
|
|
|
), 'Please check config, am support: fastspeech2, voc support: hifigan_csmsc-zh or mb_melgan_csmsc.'
|
|
|
|
|
|
|
|
|
|
assert (
|
|
|
|
|
self.config.voc_block > 0 and self.config.voc_pad > 0
|
|
|
|
|
), "Please set correct voc_block and voc_pad, they should be more than 0."
|
|
|
|
|
|
|
|
|
|
assert (
|
|
|
|
|
self.config.voc_sample_rate == self.config.am_sample_rate
|
|
|
|
|
), "The sample rate of AM and Vocoder model are different, please check model."
|
|
|
|
|
|
|
|
|
|
self.executor = TTSServerExecutor(
|
|
|
|
|
self.config.am_block, self.config.am_pad, self.config.voc_block,
|
|
|
|
|
self.config.voc_pad, self.config.voc_upsample)
|
|
|
|
|
|
|
|
|
|
if "cpu" in self.config.am_sess_conf.device or "cpu" in self.config.voc_sess_conf.device:
|
|
|
|
|
paddle.set_device("cpu")
|
|
|
|
|
else:
|
|
|
|
|
paddle.set_device(self.config.am_sess_conf.device)
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
self.executor._init_from_path(
|
|
|
|
|
am=self.config.am,
|
|
|
|
|
am_ckpt=self.config.am_ckpt,
|
|
|
|
|
am_stat=self.config.am_stat,
|
|
|
|
|
phones_dict=self.config.phones_dict,
|
|
|
|
|
tones_dict=self.config.tones_dict,
|
|
|
|
|
speaker_dict=self.config.speaker_dict,
|
|
|
|
|
am_sample_rate=self.config.am_sample_rate,
|
|
|
|
|
am_sess_conf=self.config.am_sess_conf,
|
|
|
|
|
voc=self.config.voc,
|
|
|
|
|
voc_ckpt=self.config.voc_ckpt,
|
|
|
|
|
voc_sample_rate=self.config.voc_sample_rate,
|
|
|
|
|
voc_sess_conf=self.config.voc_sess_conf,
|
|
|
|
|
lang=self.config.lang)
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
logger.error("Failed to get model related files.")
|
|
|
|
|
logger.error("Initialize TTS server engine Failed on device: %s." %
|
|
|
|
|
(self.config.voc_sess_conf.device))
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
logger.info("Initialize TTS server engine successfully on device: %s." %
|
|
|
|
|
(self.config.voc_sess_conf.device))
|
|
|
|
|
|
|
|
|
|
# warm up
|
|
|
|
|
try:
|
|
|
|
|
self.warm_up()
|
|
|
|
|
except Exception as e:
|
|
|
|
|
logger.error("Failed to warm up on tts engine.")
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
def warm_up(self):
|
|
|
|
|
"""warm up
|
|
|
|
|
"""
|
|
|
|
|
if self.config.lang == 'zh':
|
|
|
|
|
sentence = "您好,欢迎使用语音合成服务。"
|
|
|
|
|
if self.config.lang == 'en':
|
|
|
|
|
sentence = "Hello and welcome to the speech synthesis service."
|
|
|
|
|
logger.info(
|
|
|
|
|
"*******************************warm up ********************************"
|
|
|
|
|
)
|
|
|
|
|
for i in range(3):
|
|
|
|
|
for wav in self.executor.infer(
|
|
|
|
|
text=sentence,
|
|
|
|
|
lang=self.config.lang,
|
|
|
|
|
am=self.config.am,
|
|
|
|
|
spk_id=0, ):
|
|
|
|
|
logger.info(
|
|
|
|
|
f"The first response time of the {i} warm up: {self.executor.first_response_time} s"
|
|
|
|
|
)
|
|
|
|
|
break
|
|
|
|
|
logger.info(
|
|
|
|
|
"**********************************************************************"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def preprocess(self, text_bese64: str=None, text_bytes: bytes=None):
|
|
|
|
|
# Convert byte to text
|
|
|
|
|
if text_bese64:
|
|
|
|
|
text_bytes = base64.b64decode(text_bese64) # base64 to bytes
|
|
|
|
|
text = text_bytes.decode('utf-8') # bytes to text
|
|
|
|
|
|
|
|
|
|
return text
|
|
|
|
|
|
|
|
|
|
def run(self,
|
|
|
|
|
sentence: str,
|
|
|
|
|
spk_id: int=0,
|
|
|
|
|
speed: float=1.0,
|
|
|
|
|
volume: float=1.0,
|
|
|
|
|
sample_rate: int=0,
|
|
|
|
|
save_path: str=None):
|
|
|
|
|
""" run include inference and postprocess.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
sentence (str): text to be synthesized
|
|
|
|
|
spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0.
|
|
|
|
|
speed (float, optional): speed. Defaults to 1.0.
|
|
|
|
|
volume (float, optional): volume. Defaults to 1.0.
|
|
|
|
|
sample_rate (int, optional): target sample rate for synthesized audio,
|
|
|
|
|
0 means the same as the model sampling rate. Defaults to 0.
|
|
|
|
|
save_path (str, optional): The save path of the synthesized audio.
|
|
|
|
|
None means do not save audio. Defaults to None.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
wav_base64: The base64 format of the synthesized audio.
|
|
|
|
|
"""
|
|
|
|
|
wav_list = []
|
|
|
|
|
|
|
|
|
|
for wav in self.executor.infer(
|
|
|
|
|
text=sentence,
|
|
|
|
|
lang=self.config.lang,
|
|
|
|
|
am=self.config.am,
|
|
|
|
|
spk_id=spk_id, ):
|
|
|
|
|
|
|
|
|
|
# wav type: <class 'numpy.ndarray'> float32, convert to pcm (base64)
|
|
|
|
|
wav = float2pcm(wav) # float32 to int16
|
|
|
|
|
wav_bytes = wav.tobytes() # to bytes
|
|
|
|
|
wav_base64 = base64.b64encode(wav_bytes).decode('utf8') # to base64
|
|
|
|
|
wav_list.append(wav)
|
|
|
|
|
|
|
|
|
|
yield wav_base64
|
|
|
|
|
|
|
|
|
|
wav_all = np.concatenate(wav_list, axis=0)
|
|
|
|
|
duration = len(wav_all) / self.config.voc_sample_rate
|
|
|
|
|
logger.info(f"sentence: {sentence}")
|
|
|
|
|
logger.info(f"The durations of audio is: {duration} s")
|
|
|
|
|
logger.info(
|
|
|
|
|
f"first response time: {self.executor.first_response_time} s")
|
|
|
|
|
logger.info(
|
|
|
|
|
f"final response time: {self.executor.final_response_time} s")
|
|
|
|
|
logger.info(f"RTF: {self.executor.final_response_time / duration}")
|
|
|
|
|
logger.info(
|
|
|
|
|
f"Other info: front time: {self.executor.frontend_time} s, first am infer time: {self.executor.first_am_infer} s, first voc infer time: {self.executor.first_voc_infer} s,"
|
|
|
|
|
)
|