add tts server, test=tts

pull/1386/head
lym0302 3 years ago
parent 2a530d49ff
commit 777a026277

@ -0,0 +1,38 @@
# This is the parameter configuration file for TTS server.
##################################################################
# TTS SERVER SETTING #
##################################################################
host: '0.0.0.0'
port: 8692
##################################################################
# ACOUSTIC MODEL SETTING #
# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',
# 'fastspeech2_ljspeech', 'fastspeech2_aishell3',
# 'fastspeech2_vctk']
##################################################################
am: 'fastspeech2_csmsc'
am_config:
am_ckpt:
am_stat:
phones_dict:
tones_dict:
speaker_dict:
spk_id: 0
##################################################################
# VOCODER SETTING #
# voc choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
# 'pwgan_vctk', 'mb_melgan_csmsc']
##################################################################
voc: 'pwgan_csmsc'
voc_config:
voc_ckpt:
voc_stat:
##################################################################
# OTHERS #
##################################################################
lang: 'zh'
# NOTE(review): YAML does not evaluate expressions, so this value is loaded
# as the literal string "paddle.get_device()", not as the result of calling
# that function. No code visible in this change appears to read `device` --
# confirm how it is meant to be consumed.
device: paddle.get_device()

@ -0,0 +1,143 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import base64
import librosa
import numpy as np
import soundfile as sf
import yaml
from engine.base_engine import BaseEngine
from paddlespeech.cli.log import logger
from paddlespeech.cli.tts.infer import TTSExecutor
__all__ = ['TTSEngine']
class TTSServerExecutor(TTSExecutor):
    """TTS executor variant used by the server.

    After the base executor is initialized, ``self.parser`` is replaced with
    a parser that accepts a single ``--conf`` option pointing at the TTS
    YAML configuration file.
    """

    def __init__(self):
        super().__init__()

        # Build the parser locally, then publish it on the instance.
        conf_parser = argparse.ArgumentParser(
            prog='paddlespeech.tts', add_help=True)
        conf_parser.add_argument(
            '--conf',
            type=str,
            default='./conf/tts/tts.yaml',
            help='Configuration parameters.')
        self.parser = conf_parser
class TTSEngine(BaseEngine):
    """TTS server engine.

    Reads the TTS YAML configuration, initializes the acoustic model and
    vocoder through ``TTSServerExecutor`` and exposes ``run`` to synthesize
    a sentence into base64-encoded audio samples.

    Args:
        metaclass: Defaults to Singleton.
            NOTE(review): this comes from ``BaseEngine``, which is not
            visible here -- confirm.
    """

    def __init__(self, name=None):
        """Initialize TTS server engine.

        Args:
            name: Not used by this initializer.
        """
        super(TTSEngine, self).__init__()
        self.executor = TTSServerExecutor()

        # Use parse_known_args() instead of parse_args(): the engine is
        # constructed inside the server process, whose command line also
        # carries the launcher's own options (e.g. --config_file,
        # --log_file). A strict parse would exit on those unknown flags.
        known_args, _ = self.executor.parser.parse_known_args()
        config_path = known_args.conf

        with open(config_path, 'rt') as f:
            self.conf_dict = yaml.safe_load(f)

        self.executor._init_from_path(
            am=self.conf_dict["am"],
            am_config=self.conf_dict["am_config"],
            am_ckpt=self.conf_dict["am_ckpt"],
            am_stat=self.conf_dict["am_stat"],
            phones_dict=self.conf_dict["phones_dict"],
            tones_dict=self.conf_dict["tones_dict"],
            speaker_dict=self.conf_dict["speaker_dict"],
            voc=self.conf_dict["voc"],
            voc_config=self.conf_dict["voc_config"],
            voc_ckpt=self.conf_dict["voc_ckpt"],
            voc_stat=self.conf_dict["voc_stat"],
            lang=self.conf_dict["lang"])

        logger.info("Initialize TTS server engine successfully.")

    def postprocess(self,
                    wav,
                    original_fs: int,
                    target_fs: int=16000,
                    volume: float=1.0,
                    speed: float=1.0,
                    audio_path: str=None,
                    audio_format: str="wav"):
        """Post-processing operations, including speed, volume, sample rate, save audio file

        Args:
            wav (numpy(float)): Synthesized audio sample points
            original_fs (int): original audio sample rate
            target_fs (int): target audio sample rate. ``0`` or any value
                above ``original_fs`` keeps the original rate.
            volume (float): target volume, applied as a multiplier on the
                samples
            speed (float): target speed (not implemented yet, ignored)
            audio_path (str): if not None, the processed audio is also
                written to this path
            audio_format (str): currently not used by this method

        Returns:
            tuple: ``(target_fs, wav_base64)`` where ``target_fs`` is the
            sample rate actually used and ``wav_base64`` is the base64 text
            of the raw sample buffer (the array bytes, not a WAV container).
        """
        # transform sample_rate
        if target_fs == 0 or target_fs > original_fs:
            # No upsampling: fall back to the rate the model produced.
            target_fs = original_fs
            wav_tar_fs = wav
        else:
            # Sample rates are passed by keyword: recent librosa releases
            # make them keyword-only, and older releases accept the same
            # keyword names.
            wav_tar_fs = librosa.resample(
                np.squeeze(wav), orig_sr=original_fs, target_sr=target_fs)

        # transform volume
        wav_vol = wav_tar_fs * volume

        # transform speed
        # TODO

        # Column layout: one sample per row, single column.
        target_wav = wav_vol.reshape(-1, 1)

        # save audio
        if audio_path is not None:
            sf.write(audio_path, target_wav, target_fs)
            logger.info('Wave file has been generated: {}'.format(audio_path))

        # wav to base64
        base64_bytes = base64.b64encode(target_wav)
        wav_base64 = base64_bytes.decode('utf-8')

        return target_fs, wav_base64

    def run(self,
            sentence: str,
            spk_id: int=0,
            speed: float=1.0,
            volume: float=1.0,
            sample_rate: int=0,
            save_path: str=None,
            audio_format: str="wav"):
        """Synthesize ``sentence`` and post-process the resulting audio.

        Args:
            sentence (str): text to synthesize
            spk_id (int): speaker id passed to the executor
            speed (float): target speed, forwarded to ``postprocess``
            volume (float): target volume, forwarded to ``postprocess``
            sample_rate (int): target sample rate, forwarded to
                ``postprocess`` as ``target_fs``
            save_path (str): optional output path, forwarded to
                ``postprocess`` as ``audio_path``
            audio_format (str): forwarded to ``postprocess``

        Returns:
            tuple: ``(lang, target_sample_rate, wav_base64)``
        """
        lang = self.conf_dict["lang"]

        self.executor.infer(
            text=sentence, lang=lang, am=self.conf_dict["am"], spk_id=spk_id)

        target_sample_rate, wav_base64 = self.postprocess(
            wav=self.executor._outputs['wav'].numpy(),
            original_fs=self.executor.am_config.fs,
            target_fs=sample_rate,
            volume=volume,
            speed=speed,
            audio_path=save_path,
            audio_format=audio_format)

        return lang, target_sample_rate, wav_base64

@ -13,31 +13,55 @@
# limitations under the License.
import argparse
import asr_api as api_run
import tts_api as api_run
import uvicorn
import yaml
from engine.tts.python.tts_engine import TTSEngine
from fastapi import FastAPI
from restful.api import router as api_router
from paddlespeech.cli.log import logger
app = FastAPI(
title="PaddleSpeech Serving API", description="Api", version="0.0.1")
def init(args):
    """Initialize the system.

    Registers the REST API router on the module-level ``app`` and
    constructs the TTS engine.

    Args:
        args: parsed command-line arguments (not read here).

    Returns:
        bool: always ``True``.
    """
    app.include_router(api_router)

    # engine single
    tts_engine = TTSEngine()

    # todo others
    return True
def main(args):
"""Main program entry point."""
# NOTE(review): this span comes from a diff view that lost its +/- markers
# and its indentation. The first `if init(args):` / `api_run.run()` /
# `app.run(...)` lines are presumably the removed version (`conf` is not
# defined anywhere visible), and the lines from `#TODO configuration`
# onward are presumably the added version -- confirm against the repository
# before relying on the nesting.
if init(args):
api_run.run()
app.run(host='0.0.0.0', port=conf.port)
#TODO configuration
from yacs.config import CfgNode
# Load the server YAML (host/port are read from it below).
with open(args.config_file, 'rt') as f:
config = CfgNode(yaml.safe_load(f))
if init(args):
uvicorn.run(app, host=config.host, port=config.port, debug=True)
if __name__ == "__main__":
# NOTE(review): `--config_file` and `--log_file` each appear twice below.
# This is presumably the removed (single-line, default
# "./conf/application.yaml") and added (multi-line, default "./server.yaml")
# versions of the same diff shown without +/- markers -- confirm. Taken
# literally, registering the same option string twice on one parser is
# rejected by argparse.
parser = argparse.ArgumentParser()
parser.add_argument("--config_file", action="store",
help="yaml file of the app", default="./conf/application.yaml")
parser.add_argument("--log_file", action="store",
help="log file", default="./log/paddlespeech.log")
parser.add_argument(
"--config_file",
action="store",
help="yaml file of the app",
default="./server.yaml")
parser.add_argument(
"--log_file",
action="store",
help="log file",
default="./log/paddlespeech.log")
args = parser.parse_args()
main(args)

@ -13,19 +13,9 @@
# limitations under the License.
from fastapi import APIRouter
# NOTE(review): this span comes from a diff view without +/- markers or
# indentation. The first `router = APIRouter()`, the six include_router
# calls on auth/user/profile/comment/article/tag routers (none of which has
# a visible import) and `init_app` are presumably the removed version; the
# TTS router lines are presumably the added version -- confirm.
router = APIRouter()
router.include_router(auth_router)
router.include_router(user_router)
router.include_router(profile_router)
router.include_router(comment_router)
router.include_router(article_router)
router.include_router(tag_router)
from .tts_api import router as tts_router
#from .asr_api import router as asr_router
def init_app(app):
# Attach this module's aggregated router to the given application.
app.include_router(router)
# Aggregated router: only the TTS routes are registered; the ASR lines are
# commented out.
router = APIRouter()
#router.include_router(asr_router)
router.include_router(tts_router)

@ -11,13 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
from typing import List
from typing import Optional
from pydantic import BaseModel
# Each exported name must be its own list element. A single string such as
# 'ASRRequest, TTSRequest' names no attribute of this module and makes
# `from <module> import *` fail.
__all__ = ['ASRRequest', 'TTSRequest']
#****************************************************************************************/
#************************************ ASR request ***************************************/
@ -44,13 +44,25 @@ class ASRRequest(BaseModel):
#************************************ TTS request ***************************************/
#****************************************************************************************/
class TTSRequest(BaseModel):
    """TTS request

    request body example
    {
        "text": "你好,欢迎使用百度飞桨语音合成服务。",
        "spk_id": 0,
        "speed": 1.0,
        "volume": 1.0,
        "sample_rate": 0,
        "save_path": "./tts.wav",
        "audio_format": "wav"
    }

    The sample ``text`` is Chinese for "Hello, welcome to the Baidu
    PaddlePaddle speech synthesis service."
    """

    # Text to synthesize (the only required field).
    text: str
    # Speaker id.
    spk_id: int = 0
    # Requested speed.
    speed: float = 1.0
    # Requested volume.
    volume: float = 1.0
    # Requested output sample rate.
    # NOTE(review): 0 presumably means "keep the model's native rate" --
    # confirm against the engine.
    sample_rate: int = 0
    # Optional output path. Declared Optional so that an explicit null/None
    # is an accepted value rather than relying on implicit-Optional
    # inference from the None default.
    save_path: Optional[str] = None
    # Requested audio format.
    audio_format: str = "wav"

@ -11,23 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
from typing import List
from typing import Optional
from pydantic import BaseModel
__all__ = ['ASRResponse']
__all__ = ['ASRResponse', 'TTSResponse']
class Message(BaseModel):
    """Status message carried in the ``message`` field of a response."""

    # Free-text description, e.g. "success" in the response examples.
    description: str
#****************************************************************************************/
#************************************ ASR response **************************************/
#****************************************************************************************/
class AsrResult(BaseModel):
    """Result payload of an ASR response."""

    # Recognized text.
    transcription: str
class ASRResponse(BaseModel):
"""
response example
@ -36,7 +38,7 @@ class ASRResponse(BaseModel):
"code": 0,
"message": {
"description": "success"
}
},
"result": {
"transcription": "你好,飞桨"
}
@ -47,6 +49,40 @@ class ASRResponse(BaseModel):
message: Message
result: AsrResult
#****************************************************************************************/
#************************************ TTS response **************************************/
#****************************************************************************************/
class TTSResult(BaseModel):
    """Result payload of a TTS response."""

    # Language of the synthesized audio.
    lang: str = "zh"
    # Sample rate of the returned audio (required).
    sample_rate: int
    # Speaker id.
    spk_id: int = 0
    # Speed.
    speed: float = 1.0
    # Volume.
    volume: float = 1.0
    # Path the audio was saved to, if any. Declared Optional so that an
    # explicit None (no file requested) validates, instead of relying on
    # implicit-Optional inference from the None default.
    save_path: Optional[str] = None
    # Base64-encoded audio data (required).
    audio: str
class TTSResponse(BaseModel):
    """Top-level body of a TTS response.

    Example::

        {
            "success": true,
            "code": 0,
            "message": {
                "description": "success"
            },
            "result": {
                "lang": "zh",
                "sample_rate": 24000,
                "speed": 1.0,
                "volume": 1.0,
                "audio": "LTI1OTIuNjI1OTUwMzQsOTk2OS41NDk4...",
                "save_path": "./tts.wav"
            }
        }
    """

    # Whether the request succeeded.
    success: bool
    # Numeric status code (0 in the example above).
    code: int
    # Status message.
    message: Message
    # Synthesis result.
    result: TTSResult

Loading…
Cancel
Save