From 777a0262775b2a6d7c1914746838b294c43ab38d Mon Sep 17 00:00:00 2001
From: lym0302 <lym0302@foxmail.com>
Date: Tue, 25 Jan 2022 20:32:36 +0800
Subject: [PATCH] add tts server, test=tts

---
 speechserving/speechserving/conf/tts/tts.yaml |  38 +++++
 .../engine/tts/python/tts_engine.py           | 143 ++++++++++++++++++
 speechserving/speechserving/main.py           |  42 +++--
 speechserving/speechserving/restful/api.py    |  20 +--
 .../speechserving/restful/request.py          |  30 ++--
 .../speechserving/restful/response.py         |  42 ++++-
 6 files changed, 279 insertions(+), 36 deletions(-)
 create mode 100644 speechserving/speechserving/conf/tts/tts.yaml
 create mode 100644 speechserving/speechserving/engine/tts/python/tts_engine.py

diff --git a/speechserving/speechserving/conf/tts/tts.yaml b/speechserving/speechserving/conf/tts/tts.yaml
new file mode 100644
index 00000000..8e08d51c
--- /dev/null
+++ b/speechserving/speechserving/conf/tts/tts.yaml
@@ -0,0 +1,38 @@
+# This is the parameter configuration file for TTS server.
+
+##################################################################
+#                     TTS SERVER SETTING                         #
+##################################################################
+host: '0.0.0.0'
+port: 8692
+
+##################################################################
+#                  ACOUSTIC MODEL SETTING                        #
+# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',
+#             'fastspeech2_ljspeech', 'fastspeech2_aishell3',
+#             'fastspeech2_vctk']
+##################################################################
+am: 'fastspeech2_csmsc'   
+am_config: 
+am_ckpt: 
+am_stat: 
+phones_dict: 
+tones_dict: 
+speaker_dict: 
+spk_id: 0
+
+##################################################################
+#                     VOCODER SETTING                            #
+# voc choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
+#              'pwgan_vctk', 'mb_melgan_csmsc']
+##################################################################
+voc: 'pwgan_csmsc'
+voc_config: 
+voc_ckpt: 
+voc_stat: 
+
+##################################################################
+#                            OTHERS                              #
+##################################################################
+lang: 'zh'
+device: paddle.get_device()
\ No newline at end of file
diff --git a/speechserving/speechserving/engine/tts/python/tts_engine.py b/speechserving/speechserving/engine/tts/python/tts_engine.py
new file mode 100644
index 00000000..d790aa31
--- /dev/null
+++ b/speechserving/speechserving/engine/tts/python/tts_engine.py
@@ -0,0 +1,143 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import base64
+
+import librosa
+import numpy as np
+import soundfile as sf
+import yaml
+from engine.base_engine import BaseEngine
+
+from paddlespeech.cli.log import logger
+from paddlespeech.cli.tts.infer import TTSExecutor
+
+__all__ = ['TTSEngine']
+
+
+class TTSServerExecutor(TTSExecutor):
+    def __init__(self):
+        super().__init__()
+
+        self.parser = argparse.ArgumentParser(
+            prog='paddlespeech.tts', add_help=True)
+        self.parser.add_argument(
+            '--conf',
+            type=str,
+            default='./conf/tts/tts.yaml',
+            help='Configuration parameters.')
+
+
+class TTSEngine(BaseEngine):
+    """TTS server engine
+
+    Args:
+        metaclass: Defaults to Singleton.
+    """
+
+    def __init__(self, name=None):
+        """Initialize TTS server engine
+        """
+        super(TTSEngine, self).__init__()
+        self.executor = TTSServerExecutor()
+
+        config_path = self.executor.parser.parse_args().conf
+        with open(config_path, 'rt') as f:
+            self.conf_dict = yaml.safe_load(f)
+
+        self.executor._init_from_path(
+            am=self.conf_dict["am"],
+            am_config=self.conf_dict["am_config"],
+            am_ckpt=self.conf_dict["am_ckpt"],
+            am_stat=self.conf_dict["am_stat"],
+            phones_dict=self.conf_dict["phones_dict"],
+            tones_dict=self.conf_dict["tones_dict"],
+            speaker_dict=self.conf_dict["speaker_dict"],
+            voc=self.conf_dict["voc"],
+            voc_config=self.conf_dict["voc_config"],
+            voc_ckpt=self.conf_dict["voc_ckpt"],
+            voc_stat=self.conf_dict["voc_stat"],
+            lang=self.conf_dict["lang"])
+
+        logger.info("Initialize TTS server engine successfully.")
+
+    def postprocess(self,
+                    wav,
+                    original_fs: int,
+                    target_fs: int=16000,
+                    volume: float=1.0,
+                    speed: float=1.0,
+                    audio_path: str=None,
+                    audio_format: str="wav"):
+        """Post-process synthesized audio: resample, scale volume, optionally save to file, and base64-encode
+
+        Args:
+            wav (numpy(float)): Synthesized audio sample points
+            original_fs (int): original audio sample rate
+            target_fs (int): target audio sample rate
+            volume (float): target volume
+            speed (float): target speed (not applied yet; the speed transform is still a TODO)
+        """
+
+        # transform sample rate: keep the original rate when target_fs is 0 or above it (no upsampling)
+        if target_fs == 0 or target_fs > original_fs:
+            target_fs = original_fs
+            wav_tar_fs = wav
+        else:
+            wav_tar_fs = librosa.resample(
+                np.squeeze(wav), original_fs, target_fs)
+
+        # transform volume
+        wav_vol = wav_tar_fs * volume
+
+        # transform speed
+        # TODO
+        target_wav = wav_vol.reshape(-1, 1)
+
+        # save audio
+        if audio_path is not None:
+            sf.write(audio_path, target_wav, target_fs)
+            logger.info('Wave file has been generated: {}'.format(audio_path))
+
+        # base64-encode the raw sample buffer of the array (no WAV header is added)
+        base64_bytes = base64.b64encode(target_wav)
+        base64_string = base64_bytes.decode('utf-8')
+        wav_base64 = base64_string
+
+        return target_fs, wav_base64
+
+    def run(self,
+            sentence: str,
+            spk_id: int=0,
+            speed: float=1.0,
+            volume: float=1.0,
+            sample_rate: int=0,
+            save_path: str=None,
+            audio_format: str="wav"):
+
+        lang = self.conf_dict["lang"]
+
+        self.executor.infer(
+            text=sentence, lang=lang, am=self.conf_dict["am"], spk_id=spk_id)
+
+        target_sample_rate, wav_base64 = self.postprocess(
+            wav=self.executor._outputs['wav'].numpy(),
+            original_fs=self.executor.am_config.fs,
+            target_fs=sample_rate,
+            volume=volume,
+            speed=speed,
+            audio_path=save_path,
+            audio_format=audio_format)
+
+        return lang, target_sample_rate, wav_base64
diff --git a/speechserving/speechserving/main.py b/speechserving/speechserving/main.py
index 91046984..864c543d 100644
--- a/speechserving/speechserving/main.py
+++ b/speechserving/speechserving/main.py
@@ -13,31 +13,55 @@
 # limitations under the License.
 import argparse
 
-import asr_api  as api_run
-import tts_api  as api_run
+import uvicorn
+import yaml
+from engine.tts.python.tts_engine import TTSEngine
+from fastapi import FastAPI
+from restful.api import router as api_router
 
+from paddlespeech.cli.log import logger
+
+app = FastAPI(
+    title="PaddleSpeech Serving API", description="Api", version="0.0.1")
 
 
 def init(args):
     """ 系统初始化
     """
+    app.include_router(api_router)
+
+    # Create the TTS engine instance once during initialization.
+    TTS_ENGINE = TTSEngine()
+
+    # todo others 
+
+    return True
 
 
 def main(args):
     """主程序入口"""
 
-    if init(args):
-        api_run.run()
-        app.run(host='0.0.0.0', port=conf.port)
+    #TODO configuration 
+    from yacs.config import CfgNode
+    with open(args.config_file, 'rt') as f:
+        config = CfgNode(yaml.safe_load(f))
 
+    if init(args):
+        uvicorn.run(app, host=config.host, port=config.port, debug=True)
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--config_file", action="store",
-                        help="yaml file of the app", default="./conf/application.yaml")
-    parser.add_argument("--log_file", action="store",
-                        help="log file", default="./log/paddlespeech.log")
+    parser.add_argument(
+        "--config_file",
+        action="store",
+        help="yaml file of the app",
+        default="./server.yaml")
+    parser.add_argument(
+        "--log_file",
+        action="store",
+        help="log file",
+        default="./log/paddlespeech.log")
     args = parser.parse_args()
 
     main(args)
diff --git a/speechserving/speechserving/restful/api.py b/speechserving/speechserving/restful/api.py
index 6324fac2..c6250ce7 100644
--- a/speechserving/speechserving/restful/api.py
+++ b/speechserving/speechserving/restful/api.py
@@ -13,19 +13,9 @@
 # limitations under the License.
 from fastapi import APIRouter
 
-router = APIRouter()
-
-
-router.include_router(auth_router)
-router.include_router(user_router)
-router.include_router(profile_router)
-router.include_router(comment_router)
-router.include_router(article_router)
-router.include_router(tag_router)
-
+from .tts_api import router as tts_router
+#from .asr_api import router as asr_router
 
-
-
-def init_app(app):
-    
-    app.include_router(router)
+router = APIRouter()
+#router.include_router(asr_router)
+router.include_router(tts_router)
diff --git a/speechserving/speechserving/restful/request.py b/speechserving/speechserving/restful/request.py
index 4721decd..f1fa4bcb 100644
--- a/speechserving/speechserving/restful/request.py
+++ b/speechserving/speechserving/restful/request.py
@@ -11,13 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Optional
 from typing import List
+from typing import Optional
 
 from pydantic import BaseModel
 
+__all__ = ['ASRRequest', 'TTSRequest']
 
-__all__ = ['ASRRequest, TTSRequest']
 
 #****************************************************************************************/
 #************************************ ASR request ***************************************/
@@ -44,13 +44,25 @@ class ASRRequest(BaseModel):
 #************************************ TTS request ***************************************/
 #****************************************************************************************/
 class TTSRequest(BaseModel):
-    """
+    """TTS request
+
     request body example
     {
-        "audio": "exSI6ICJlbiIsCgkgICAgInBvc2l0aW9uIjogImZhbHNlIgoJf...",
-        "audio_format": "wav",
-        "sample_rate": 16000,
-        "lang ": "zh_cn",
-        "ptt ":false
+        "text": "你好，欢迎使用百度飞桨语音合成服务。",
+        "spk_id": 0,
+        "speed": 1.0,
+        "volume": 1.0,
+        "sample_rate": 0,
+        "save_path": "./tts.wav",
+        "audio_format": "wav"
     }
-    """
\ No newline at end of file
+    
+    """
+
+    text: str
+    spk_id: int = 0
+    speed: float = 1.0
+    volume: float = 1.0
+    sample_rate: int = 0
+    save_path: str = None
+    audio_format: str = "wav"
diff --git a/speechserving/speechserving/restful/response.py b/speechserving/speechserving/restful/response.py
index fdb07a84..684a37f9 100644
--- a/speechserving/speechserving/restful/response.py
+++ b/speechserving/speechserving/restful/response.py
@@ -11,23 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Optional
 from typing import List
+from typing import Optional
 
 from pydantic import BaseModel
 
-__all__ = ['ASRResponse']
+__all__ = ['ASRResponse', 'TTSResponse']
 
 
 class Message(BaseModel):
     description: str
 
+
 #****************************************************************************************/
 #************************************ ASR response **************************************/
 #****************************************************************************************/
 class AsrResult(BaseModel):
     transcription: str
 
+
 class ASRResponse(BaseModel):
     """
     response example
@@ -36,7 +38,7 @@ class ASRResponse(BaseModel):
         "code": 0,
         "message": {
             "description": "success" 
-        }
+        },
         "result": {
             "transcription": "你好，飞桨"
         }
@@ -47,6 +49,40 @@ class ASRResponse(BaseModel):
     message: Message
     result: AsrResult
 
+
 #****************************************************************************************/
 #************************************ TTS response **************************************/
 #****************************************************************************************/
+class TTSResult(BaseModel):
+    lang: str = "zh"
+    sample_rate: int
+    spk_id: int = 0
+    speed: float = 1.0
+    volume: float = 1.0
+    save_path: str = None
+    audio: str
+
+
+class TTSResponse(BaseModel):
+    """
+    response example
+    {
+        "success": true,
+        "code": 0,
+        "message": {
+            "description": "success" 
+        },
+        "result": {
+            "lang": "zh",
+            "sample_rate": 24000,
+            "speed": 1.0,
+            "volume": 1.0,
+            "audio": "LTI1OTIuNjI1OTUwMzQsOTk2OS41NDk4...",
+            "save_path": "./tts.wav"
+        }
+    }
+    """
+    success: bool
+    code: int
+    message: Message
+    result: TTSResult