Merge pull request #1446 from lym0302/tts-server3

[server] add params type
Hui Zhang authored 3 years ago; committed by GitHub
commit 79c064fe21

@@ -1,11 +1,5 @@
# This is the parameter configuration file for TTS server.
-##################################################################
-# TTS SERVER SETTING #
-##################################################################
-host: '0.0.0.0'
-port: 8692
##################################################################
# ACOUSTIC MODEL SETTING #
# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',

@@ -1,12 +1,6 @@
# This is the parameter configuration file for TTS server.
# These are the static models that support paddle inference.
-##################################################################
-# TTS SERVER SETTING #
-##################################################################
-host: '0.0.0.0'
-port: 8692
##################################################################
# ACOUSTIC MODEL SETTING #
# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc']

@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import argparse
import base64
import io
import os
@@ -21,7 +20,6 @@ import librosa
import numpy as np
import paddle
import soundfile as sf
-import yaml
from engine.base_engine import BaseEngine
from scipy.io import wavfile
@@ -32,6 +30,7 @@ from paddlespeech.cli.utils import MODEL_HOME
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from utils.audio_process import change_speed
+from utils.config import get_config
from utils.errors import ErrorCode
from utils.exception import ServerBaseException
from utils.paddle_predictor import init_predictor
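Both engine files now pull their settings through get_config from utils/config.py instead of parsing a --conf flag with argparse and loading it with yaml.safe_load; later hunks read values attribute-style (self.config.am) rather than as dict keys (self.conf_dict["am"]). The helper itself is not part of this diff; a minimal sketch of what it might look like, assuming it only wraps yaml.safe_load in an attribute-accessible mapping:

# Hypothetical sketch -- the real get_config lives in utils/config.py and is not shown
# in this diff. Assumed behaviour: load the YAML file and expose its keys as attributes.
import yaml


class _AttrDict(dict):
    """A dict whose keys can also be read as attributes, e.g. config.am."""

    def __getattr__(self, key):
        try:
            value = self[key]
        except KeyError as e:
            raise AttributeError(key) from e
        return _AttrDict(value) if isinstance(value, dict) else value


def get_config(config_file: str) -> _AttrDict:
    with open(config_file, "rt") as f:
        return _AttrDict(yaml.safe_load(f))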
@@ -118,14 +117,7 @@ pretrained_models = {
class TTSServerExecutor(TTSExecutor):
    def __init__(self):
        super().__init__()
-        self.parser = argparse.ArgumentParser(
-            prog='paddlespeech.tts', add_help=True)
-        self.parser.add_argument(
-            '--conf',
-            type=str,
-            default='./conf/tts/tts_pd.yaml',
-            help='Configuration parameters.')
+        pass

    def _get_pretrained_path(self, tag: str) -> os.PathLike:
        """
@@ -224,7 +216,10 @@ class TTSServerExecutor(TTSExecutor):
        self.voc_sample_rate = voc_sample_rate
        self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_model))
-        assert (self.voc_sample_rate == self.am_sample_rate)
+        assert (
+            self.voc_sample_rate == self.am_sample_rate
+        ), "The sample rate of AM and Vocoder model are different, please check model."
        # Init body.
        with open(self.phones_dict, "r") as f:
            phn_id = [line.strip().split() for line in f.readlines()]
@@ -339,31 +334,31 @@ class TTSEngine(BaseEngine):
        metaclass: Defaults to Singleton.
    """

-    def __init__(self, name=None):
+    def __init__(self):
        """Initialize TTS server engine
        """
        super(TTSEngine, self).__init__()
-        self.executor = TTSServerExecutor()
-        config_path = self.executor.parser.parse_args().conf
-        with open(config_path, 'rt') as f:
-            self.conf_dict = yaml.safe_load(f)

+    def init(self, config_file: str):
+        self.executor = TTSServerExecutor()
+        self.config_file = config_file
+        self.config = get_config(config_file)
        self.executor._init_from_path(
-            am=self.conf_dict["am"],
-            am_model=self.conf_dict["am_model"],
-            am_params=self.conf_dict["am_params"],
-            am_sample_rate=self.conf_dict["am_sample_rate"],
-            phones_dict=self.conf_dict["phones_dict"],
-            tones_dict=self.conf_dict["tones_dict"],
-            speaker_dict=self.conf_dict["speaker_dict"],
-            voc=self.conf_dict["voc"],
-            voc_model=self.conf_dict["voc_model"],
-            voc_params=self.conf_dict["voc_params"],
-            voc_sample_rate=self.conf_dict["voc_sample_rate"],
-            lang=self.conf_dict["lang"],
-            am_predictor_conf=self.conf_dict["am_predictor_conf"],
-            voc_predictor_conf=self.conf_dict["voc_predictor_conf"], )
+            am=self.config.am,
+            am_model=self.config.am_model,
+            am_params=self.config.am_params,
+            am_sample_rate=self.config.am_sample_rate,
+            phones_dict=self.config.phones_dict,
+            tones_dict=self.config.tones_dict,
+            speaker_dict=self.config.speaker_dict,
+            voc=self.config.voc,
+            voc_model=self.config.voc_model,
+            voc_params=self.config.voc_params,
+            voc_sample_rate=self.config.voc_sample_rate,
+            lang=self.config.lang,
+            am_predictor_conf=self.config.am_predictor_conf,
+            voc_predictor_conf=self.config.voc_predictor_conf, )

        logger.info("Initialize TTS server engine successfully.")
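With the --conf argument gone, the caller now constructs the engine and hands it a config path explicitly. A short usage sketch based only on the signatures above; './conf/tts/tts_pd.yaml' is the old argparse default and may differ in an actual deployment:

# Sketch of the reworked engine lifecycle (config path assumed, not mandated by the diff).
tts_engine = TTSEngine()
tts_engine.init(config_file="./conf/tts/tts_pd.yaml")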
@@ -382,6 +377,13 @@ class TTSEngine(BaseEngine):
            target_fs (int): target audio sample rate
            volume (float): target volume
            speed (float): target speed
+
+        Raises:
+            ServerBaseException: Throws an exception if changing the speed fails.
+
+        Returns:
+            target_fs: target sample rate for synthesized audio.
+            wav_base64: The base64 format of the synthesized audio.
        """

        # transform sample_rate
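The postprocess docstring now documents a wav_base64 return value, but the body lies outside this hunk. A sketch of how a synthesized waveform could be encoded into that string, using only the soundfile and base64 modules already imported above; the helper name is illustrative:

# Illustrative helper, not part of the diff: turn a float32 waveform into the
# documented wav_base64 string.
import base64
import io

import soundfile as sf


def to_wav_base64(samples, sample_rate: int) -> str:
    buf = io.BytesIO()
    sf.write(buf, samples, sample_rate, format="WAV")  # serialize a complete WAV into memory
    return base64.b64encode(buf.getvalue()).decode("utf-8")  # then base64-encode the bytes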
@@ -440,21 +442,20 @@ class TTSEngine(BaseEngine):
            save_path (str, optional): The save path of the synthesized audio. Defaults to None.

        Raises:
-            ServerBaseException: Exception
-            ServerBaseException: Exception
+            ServerBaseException: Throws an exception if TTS inference fails.
+            ServerBaseException: Throws an exception if postprocessing fails.

        Returns:
-            lang, target_sample_rate, wav_base64
+            lang: model language
+            target_sample_rate: target sample rate for synthesized audio.
+            wav_base64: The base64 format of the synthesized audio.
        """

-        lang = self.conf_dict["lang"]
+        lang = self.config.lang

        try:
            self.executor.infer(
-                text=sentence,
-                lang=lang,
-                am=self.conf_dict["am"],
-                spk_id=spk_id)
+                text=sentence, lang=lang, am=self.config.am, spk_id=spk_id)
        except:
            raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR,
                                      "tts infer failed.")

@@ -11,20 +11,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import argparse
import base64
import io

import librosa
import numpy as np
import soundfile as sf
-import yaml
from engine.base_engine import BaseEngine
from scipy.io import wavfile

from paddlespeech.cli.log import logger
from paddlespeech.cli.tts.infer import TTSExecutor
from utils.audio_process import change_speed
+from utils.config import get_config
from utils.errors import ErrorCode
from utils.exception import ServerBaseException
@@ -34,14 +33,7 @@ __all__ = ['TTSEngine']
class TTSServerExecutor(TTSExecutor):
    def __init__(self):
        super().__init__()
-        self.parser = argparse.ArgumentParser(
-            prog='paddlespeech.tts', add_help=True)
-        self.parser.add_argument(
-            '--conf',
-            type=str,
-            default='./conf/tts/tts.yaml',
-            help='Configuration parameters.')
+        pass


class TTSEngine(BaseEngine):
@@ -55,25 +47,25 @@ class TTSEngine(BaseEngine):
        """Initialize TTS server engine
        """
        super(TTSEngine, self).__init__()
-        self.executor = TTSServerExecutor()
-        config_path = self.executor.parser.parse_args().conf
-        with open(config_path, 'rt') as f:
-            self.conf_dict = yaml.safe_load(f)

+    def init(self, config_file: str):
+        self.executor = TTSServerExecutor()
+        self.config_file = config_file
+        self.config = get_config(config_file)
        self.executor._init_from_path(
-            am=self.conf_dict["am"],
-            am_config=self.conf_dict["am_config"],
-            am_ckpt=self.conf_dict["am_ckpt"],
-            am_stat=self.conf_dict["am_stat"],
-            phones_dict=self.conf_dict["phones_dict"],
-            tones_dict=self.conf_dict["tones_dict"],
-            speaker_dict=self.conf_dict["speaker_dict"],
-            voc=self.conf_dict["voc"],
-            voc_config=self.conf_dict["voc_config"],
-            voc_ckpt=self.conf_dict["voc_ckpt"],
-            voc_stat=self.conf_dict["voc_stat"],
-            lang=self.conf_dict["lang"])
+            am=self.config.am,
+            am_config=self.config.am_config,
+            am_ckpt=self.config.am_ckpt,
+            am_stat=self.config.am_stat,
+            phones_dict=self.config.phones_dict,
+            tones_dict=self.config.tones_dict,
+            speaker_dict=self.config.speaker_dict,
+            voc=self.config.voc,
+            voc_config=self.config.voc_config,
+            voc_ckpt=self.config.voc_ckpt,
+            voc_stat=self.config.voc_stat,
+            lang=self.config.lang)

        logger.info("Initialize TTS server engine successfully.")
@@ -92,6 +84,13 @@ class TTSEngine(BaseEngine):
            target_fs (int): target audio sample rate
            volume (float): target volume
            speed (float): target speed
+
+        Raises:
+            ServerBaseException: Throws an exception if changing the speed fails.
+
+        Returns:
+            target_fs: target sample rate for synthesized audio.
+            wav_base64: The base64 format of the synthesized audio.
        """

        # transform sample_rate
@@ -137,15 +136,33 @@
            volume: float=1.0,
            sample_rate: int=0,
            save_path: str=None):
+        """Run inference and postprocessing.
+        Args:
+            sentence (str): text to be synthesized
+            spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0.
+            speed (float, optional): speed. Defaults to 1.0.
+            volume (float, optional): volume. Defaults to 1.0.
+            sample_rate (int, optional): target sample rate for synthesized audio,
+                0 means the same as the model sampling rate. Defaults to 0.
+            save_path (str, optional): The save path of the synthesized audio.
+                None means do not save audio. Defaults to None.
+        Raises:
+            ServerBaseException: Throws an exception if TTS inference fails.
+            ServerBaseException: Throws an exception if postprocessing fails.
+        Returns:
+            lang: model language
+            target_sample_rate: target sample rate for synthesized audio.
+            wav_base64: The base64 format of the synthesized audio.
+        """

-        lang = self.conf_dict["lang"]
+        lang = self.config.lang

        try:
            self.executor.infer(
-                text=sentence,
-                lang=lang,
-                am=self.conf_dict["am"],
-                spk_id=spk_id)
+                text=sentence, lang=lang, am=self.config.am, spk_id=spk_id)
        except:
            raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR,
                                      "tts infer failed.")

@@ -15,8 +15,17 @@ import wave
import numpy as np

+from paddlespeech.cli.log import logger
+

def wav2pcm(wavfile, pcmfile, data_type=np.int16):
+    """ Save the wav file as a pcm file
+    Args:
+        wavfile (str): wav file path
+        pcmfile (str): pcm file save path
+        data_type (type, optional): pcm sample type. Defaults to np.int16.
+    """
    with open(wavfile, "rb") as f:
        f.seek(0)
        f.read(44)
@@ -25,12 +34,21 @@ def wav2pcm(wavfile, pcmfile, data_type=np.int16):

def pcm2wav(pcm_file, wav_file, channels=1, bits=16, sample_rate=16000):
+    """Save the pcm file as a wav file
+    Args:
+        pcm_file (str): pcm file path
+        wav_file (str): wav file save path
+        channels (int, optional): audio channel. Defaults to 1.
+        bits (int, optional): bit depth. Defaults to 16.
+        sample_rate (int, optional): sample rate. Defaults to 16000.
+    """
    pcmf = open(pcm_file, 'rb')
    pcmdata = pcmf.read()
    pcmf.close()

    if bits % 8 != 0:
-        raise ValueError("bits % 8 must == 0. now bits:" + str(bits))
+        logger.error("bits % 8 must == 0. now bits:" + str(bits))

    wavfile = wave.open(wav_file, 'wb')
    wavfile.setnchannels(channels)
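The two helpers are inverses of each other: wav2pcm strips the 44-byte WAV header and dumps the raw samples, while pcm2wav wraps raw PCM back into a WAV container. Note that after this change an invalid bit depth is only logged via logger.error rather than raised as ValueError, so pcm2wav continues running. A quick round trip with placeholder file names:

# Illustrative round trip with the helpers above; file names are placeholders.
wav2pcm("speech.wav", "speech.pcm")  # skip the 44-byte header, keep int16 samples
pcm2wav("speech.pcm", "speech_restored.wav",
        channels=1, bits=16, sample_rate=16000)  # re-wrap the raw PCM in a WAV header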

@@ -0,0 +1,110 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import base64
import io
import json
import os
import random
import time
import numpy as np
import requests
import soundfile
def wav2pcm(wavfile: str, pcmfile: str, data_type=np.int16):
with open(wavfile, "rb") as f:
f.seek(0)
f.read(44)
data = np.fromfile(f, dtype=data_type)
data.tofile(pcmfile)
# Request and response
def tts_client(args):
""" Request and response
Args:
text: A sentence to be synthesized
outfile: Synthetic audio file
"""
url = 'http://127.0.0.1:8090/paddlespeech/tts'
request = {
"text": args.text,
"spk_id": args.spk_id,
"speed": args.speed,
"volume": args.volume,
"sample_rate": args.sample_rate,
"save_path": args.output
}
response = requests.post(url, json.dumps(request))
response_dict = response.json()
wav_base64 = response_dict["result"]["audio"]
audio_data_byte = base64.b64decode(wav_base64)
# from byte
samples, sample_rate = soundfile.read(
io.BytesIO(audio_data_byte), dtype='float32')
# transform audio
outfile = args.output
if outfile.endswith(".wav"):
soundfile.write(outfile, samples, sample_rate)
elif outfile.endswith(".pcm"):
temp_wav = str(random.getrandbits(128)) + ".wav"
soundfile.write(temp_wav, samples, sample_rate)
wav2pcm(temp_wav, outfile, data_type=np.int16)
os.system("rm %s" % (temp_wav))
else:
print("The format for saving audio only supports wav or pcm")
return len(samples), sample_rate
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
'--text',
type=str,
default="你好,欢迎使用语音合成服务",
help='A sentence to be synthesized')
parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')
parser.add_argument(
'--volume', type=float, default=1.0, help='Audio volume')
parser.add_argument(
'--sample_rate',
type=int,
default=0,
help='Sampling rate, the default is the same as the model')
parser.add_argument(
'--output',
type=str,
default="./out.wav",
help='Synthesized audio file')
args = parser.parse_args()
st = time.time()
try:
samples_length, sample_rate = tts_client(args)
time_consume = time.time() - st
duration = samples_length / sample_rate
rtf = time_consume / duration
print("Synthesized audio successfully.")
print("Inference time: %f" % (time_consume))
print("The duration of synthesized audio: %f" % (duration))
print("The RTF is: %f" % (rtf))
except:
print("Failed to synthesized audio.")