Merge pull request #1446 from lym0302/tts-server3

[server] add params type
Hui Zhang authored 3 years ago; committed by GitHub
commit 79c064fe21

@@ -1,11 +1,5 @@
# This is the parameter configuration file for TTS server.
-##################################################################
-# TTS SERVER SETTING #
-##################################################################
-host: '0.0.0.0'
-port: 8692
##################################################################
# ACOUSTIC MODEL SETTING #
# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',

@@ -1,12 +1,6 @@
# This is the parameter configuration file for TTS server.
# These are the static models that support paddle inference.
-##################################################################
-# TTS SERVER SETTING #
-##################################################################
-host: '0.0.0.0'
-port: 8692
##################################################################
# ACOUSTIC MODEL SETTING #
# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc']

@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import argparse
import base64
import io
import os
@@ -21,7 +20,6 @@ import librosa
import numpy as np
import paddle
import soundfile as sf
-import yaml
from engine.base_engine import BaseEngine
from scipy.io import wavfile
@@ -32,6 +30,7 @@ from paddlespeech.cli.utils import MODEL_HOME
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from utils.audio_process import change_speed
+from utils.config import get_config
from utils.errors import ErrorCode
from utils.exception import ServerBaseException
from utils.paddle_predictor import init_predictor
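Both engine files now pull their settings through get_config from utils/config.py instead of parsing a --conf flag with argparse and loading it with yaml.safe_load; later hunks read values attribute-style (self.config.am) rather than as dict keys (self.conf_dict["am"]). The helper itself is not part of this diff; a minimal sketch of what it might look like, assuming it only wraps yaml.safe_load in an attribute-accessible mapping:

# Hypothetical sketch -- the real get_config lives in utils/config.py and is not shown
# in this diff. Assumed behaviour: load the YAML file and expose its keys as attributes.
import yaml


class _AttrDict(dict):
    """A dict whose keys can also be read as attributes, e.g. config.am."""

    def __getattr__(self, key):
        try:
            value = self[key]
        except KeyError as e:
            raise AttributeError(key) from e
        return _AttrDict(value) if isinstance(value, dict) else value


def get_config(config_file: str) -> _AttrDict:
    with open(config_file, "rt") as f:
        return _AttrDict(yaml.safe_load(f))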
@@ -118,14 +117,7 @@ pretrained_models = {
class TTSServerExecutor(TTSExecutor):
    def __init__(self):
        super().__init__()
-        self.parser = argparse.ArgumentParser(
-            prog='paddlespeech.tts', add_help=True)
-        self.parser.add_argument(
-            '--conf',
-            type=str,
-            default='./conf/tts/tts_pd.yaml',
-            help='Configuration parameters.')
+        pass

    def _get_pretrained_path(self, tag: str) -> os.PathLike:
        """
@@ -224,7 +216,10 @@ class TTSServerExecutor(TTSExecutor):
        self.voc_sample_rate = voc_sample_rate
        self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_model))
-        assert (self.voc_sample_rate == self.am_sample_rate)
+        assert (
+            self.voc_sample_rate == self.am_sample_rate
+        ), "The sample rate of AM and Vocoder model are different, please check model."
        # Init body.
        with open(self.phones_dict, "r") as f:
            phn_id = [line.strip().split() for line in f.readlines()]
@@ -339,31 +334,31 @@ class TTSEngine(BaseEngine):
        metaclass: Defaults to Singleton.
    """

-    def __init__(self, name=None):
+    def __init__(self):
        """Initialize TTS server engine
        """
        super(TTSEngine, self).__init__()
-        self.executor = TTSServerExecutor()
-        config_path = self.executor.parser.parse_args().conf
-        with open(config_path, 'rt') as f:
-            self.conf_dict = yaml.safe_load(f)

+    def init(self, config_file: str):
+        self.executor = TTSServerExecutor()
+        self.config_file = config_file
+        self.config = get_config(config_file)
        self.executor._init_from_path(
-            am=self.conf_dict["am"],
-            am_model=self.conf_dict["am_model"],
-            am_params=self.conf_dict["am_params"],
-            am_sample_rate=self.conf_dict["am_sample_rate"],
-            phones_dict=self.conf_dict["phones_dict"],
-            tones_dict=self.conf_dict["tones_dict"],
-            speaker_dict=self.conf_dict["speaker_dict"],
-            voc=self.conf_dict["voc"],
-            voc_model=self.conf_dict["voc_model"],
-            voc_params=self.conf_dict["voc_params"],
-            voc_sample_rate=self.conf_dict["voc_sample_rate"],
-            lang=self.conf_dict["lang"],
-            am_predictor_conf=self.conf_dict["am_predictor_conf"],
-            voc_predictor_conf=self.conf_dict["voc_predictor_conf"], )
+            am=self.config.am,
+            am_model=self.config.am_model,
+            am_params=self.config.am_params,
+            am_sample_rate=self.config.am_sample_rate,
+            phones_dict=self.config.phones_dict,
+            tones_dict=self.config.tones_dict,
+            speaker_dict=self.config.speaker_dict,
+            voc=self.config.voc,
+            voc_model=self.config.voc_model,
+            voc_params=self.config.voc_params,
+            voc_sample_rate=self.config.voc_sample_rate,
+            lang=self.config.lang,
+            am_predictor_conf=self.config.am_predictor_conf,
+            voc_predictor_conf=self.config.voc_predictor_conf, )

        logger.info("Initialize TTS server engine successfully.")
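With the --conf argument gone, the caller now constructs the engine and hands it a config path explicitly. A short usage sketch based only on the signatures above; './conf/tts/tts_pd.yaml' is the old argparse default and may differ in an actual deployment:

# Sketch of the reworked engine lifecycle (config path assumed, not mandated by the diff).
tts_engine = TTSEngine()
tts_engine.init(config_file="./conf/tts/tts_pd.yaml")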
@@ -382,6 +377,13 @@ class TTSEngine(BaseEngine):
            target_fs (int): target audio sample rate
            volume (float): target volume
            speed (float): target speed
+
+        Raises:
+            ServerBaseException: Throws an exception if changing the speed fails.
+
+        Returns:
+            target_fs: target sample rate for synthesized audio.
+            wav_base64: The base64 format of the synthesized audio.
        """

        # transform sample_rate
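The postprocess docstring now documents a wav_base64 return value, but the body lies outside this hunk. A sketch of how a synthesized waveform could be encoded into that string, using only the soundfile and base64 modules already imported above; the helper name is illustrative:

# Illustrative helper, not part of the diff: turn a float32 waveform into the
# documented wav_base64 string.
import base64
import io

import soundfile as sf


def to_wav_base64(samples, sample_rate: int) -> str:
    buf = io.BytesIO()
    sf.write(buf, samples, sample_rate, format="WAV")  # serialize a complete WAV into memory
    return base64.b64encode(buf.getvalue()).decode("utf-8")  # then base64-encode the bytes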
@@ -440,21 +442,20 @@ class TTSEngine(BaseEngine):
            save_path (str, optional): The save path of the synthesized audio. Defaults to None.

        Raises:
-            ServerBaseException: Exception
-            ServerBaseException: Exception
+            ServerBaseException: Throws an exception if TTS inference fails.
+            ServerBaseException: Throws an exception if postprocessing fails.

        Returns:
-            lang, target_sample_rate, wav_base64
+            lang: model language
+            target_sample_rate: target sample rate for synthesized audio.
+            wav_base64: The base64 format of the synthesized audio.
        """

-        lang = self.conf_dict["lang"]
+        lang = self.config.lang

        try:
            self.executor.infer(
-                text=sentence,
-                lang=lang,
-                am=self.conf_dict["am"],
-                spk_id=spk_id)
+                text=sentence, lang=lang, am=self.config.am, spk_id=spk_id)
        except:
            raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR,
                                      "tts infer failed.")

@@ -11,20 +11,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import argparse
import base64
import io

import librosa
import numpy as np
import soundfile as sf
-import yaml
from engine.base_engine import BaseEngine
from scipy.io import wavfile

from paddlespeech.cli.log import logger
from paddlespeech.cli.tts.infer import TTSExecutor
from utils.audio_process import change_speed
+from utils.config import get_config
from utils.errors import ErrorCode
from utils.exception import ServerBaseException
@@ -34,14 +33,7 @@ __all__ = ['TTSEngine']
class TTSServerExecutor(TTSExecutor):
    def __init__(self):
        super().__init__()
-        self.parser = argparse.ArgumentParser(
-            prog='paddlespeech.tts', add_help=True)
-        self.parser.add_argument(
-            '--conf',
-            type=str,
-            default='./conf/tts/tts.yaml',
-            help='Configuration parameters.')
+        pass


class TTSEngine(BaseEngine):
@@ -55,25 +47,25 @@ class TTSEngine(BaseEngine):
        """Initialize TTS server engine
        """
        super(TTSEngine, self).__init__()
-        self.executor = TTSServerExecutor()
-        config_path = self.executor.parser.parse_args().conf
-        with open(config_path, 'rt') as f:
-            self.conf_dict = yaml.safe_load(f)

+    def init(self, config_file: str):
+        self.executor = TTSServerExecutor()
+        self.config_file = config_file
+        self.config = get_config(config_file)
        self.executor._init_from_path(
-            am=self.conf_dict["am"],
-            am_config=self.conf_dict["am_config"],
-            am_ckpt=self.conf_dict["am_ckpt"],
-            am_stat=self.conf_dict["am_stat"],
-            phones_dict=self.conf_dict["phones_dict"],
-            tones_dict=self.conf_dict["tones_dict"],
-            speaker_dict=self.conf_dict["speaker_dict"],
-            voc=self.conf_dict["voc"],
-            voc_config=self.conf_dict["voc_config"],
-            voc_ckpt=self.conf_dict["voc_ckpt"],
-            voc_stat=self.conf_dict["voc_stat"],
-            lang=self.conf_dict["lang"])
+            am=self.config.am,
+            am_config=self.config.am_config,
+            am_ckpt=self.config.am_ckpt,
+            am_stat=self.config.am_stat,
+            phones_dict=self.config.phones_dict,
+            tones_dict=self.config.tones_dict,
+            speaker_dict=self.config.speaker_dict,
+            voc=self.config.voc,
+            voc_config=self.config.voc_config,
+            voc_ckpt=self.config.voc_ckpt,
+            voc_stat=self.config.voc_stat,
+            lang=self.config.lang)

        logger.info("Initialize TTS server engine successfully.")
@@ -92,6 +84,13 @@ class TTSEngine(BaseEngine):
            target_fs (int): target audio sample rate
            volume (float): target volume
            speed (float): target speed
+
+        Raises:
+            ServerBaseException: Throws an exception if changing the speed fails.
+
+        Returns:
+            target_fs: target sample rate for synthesized audio.
+            wav_base64: The base64 format of the synthesized audio.
        """

        # transform sample_rate
@@ -137,15 +136,33 @@
            volume: float=1.0,
            sample_rate: int=0,
            save_path: str=None):
+        """Run inference and postprocessing.
+        Args:
+            sentence (str): text to be synthesized
+            spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0.
+            speed (float, optional): speed. Defaults to 1.0.
+            volume (float, optional): volume. Defaults to 1.0.
+            sample_rate (int, optional): target sample rate for synthesized audio,
+                0 means the same as the model sampling rate. Defaults to 0.
+            save_path (str, optional): The save path of the synthesized audio.
+                None means do not save audio. Defaults to None.
+        Raises:
+            ServerBaseException: Throws an exception if TTS inference fails.
+            ServerBaseException: Throws an exception if postprocessing fails.
+        Returns:
+            lang: model language
+            target_sample_rate: target sample rate for synthesized audio.
+            wav_base64: The base64 format of the synthesized audio.
+        """

-        lang = self.conf_dict["lang"]
+        lang = self.config.lang

        try:
            self.executor.infer(
-                text=sentence,
-                lang=lang,
-                am=self.conf_dict["am"],
-                spk_id=spk_id)
+                text=sentence, lang=lang, am=self.config.am, spk_id=spk_id)
        except:
            raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR,
                                      "tts infer failed.")

@@ -15,8 +15,17 @@ import wave
import numpy as np

+from paddlespeech.cli.log import logger
+

def wav2pcm(wavfile, pcmfile, data_type=np.int16):
+    """ Save the wav file as a pcm file
+    Args:
+        wavfile (str): wav file path
+        pcmfile (str): pcm file save path
+        data_type (type, optional): pcm sample type. Defaults to np.int16.
+    """
    with open(wavfile, "rb") as f:
        f.seek(0)
        f.read(44)
@@ -25,12 +34,21 @@ def wav2pcm(wavfile, pcmfile, data_type=np.int16):

def pcm2wav(pcm_file, wav_file, channels=1, bits=16, sample_rate=16000):
+    """Save the pcm file as a wav file
+    Args:
+        pcm_file (str): pcm file path
+        wav_file (str): wav file save path
+        channels (int, optional): audio channel. Defaults to 1.
+        bits (int, optional): bit depth. Defaults to 16.
+        sample_rate (int, optional): sample rate. Defaults to 16000.
+    """
    pcmf = open(pcm_file, 'rb')
    pcmdata = pcmf.read()
    pcmf.close()

    if bits % 8 != 0:
-        raise ValueError("bits % 8 must == 0. now bits:" + str(bits))
+        logger.error("bits % 8 must == 0. now bits:" + str(bits))

    wavfile = wave.open(wav_file, 'wb')
    wavfile.setnchannels(channels)
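The two helpers are inverses of each other: wav2pcm strips the 44-byte WAV header and dumps the raw samples, while pcm2wav wraps raw PCM back into a WAV container. Note that after this change an invalid bit depth is only logged via logger.error rather than raised as ValueError, so pcm2wav continues running. A quick round trip with placeholder file names:

# Illustrative round trip with the helpers above; file names are placeholders.
wav2pcm("speech.wav", "speech.pcm")  # skip the 44-byte header, keep int16 samples
pcm2wav("speech.pcm", "speech_restored.wav",
        channels=1, bits=16, sample_rate=16000)  # re-wrap the raw PCM in a WAV header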

@@ -0,0 +1,110 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import base64
import io
import json
import os
import random
import time
import numpy as np
import requests
import soundfile
def wav2pcm(wavfile: str, pcmfile: str, data_type=np.int16):
with open(wavfile, "rb") as f:
f.seek(0)
f.read(44)
data = np.fromfile(f, dtype=data_type)
data.tofile(pcmfile)
# Request and response
def tts_client(args):
""" Request and response
Args:
text: A sentence to be synthesized
outfile: Synthetic audio file
"""
url = 'http://127.0.0.1:8090/paddlespeech/tts'
request = {
"text": args.text,
"spk_id": args.spk_id,
"speed": args.speed,
"volume": args.volume,
"sample_rate": args.sample_rate,
"save_path": args.output
}
response = requests.post(url, json.dumps(request))
response_dict = response.json()
wav_base64 = response_dict["result"]["audio"]
audio_data_byte = base64.b64decode(wav_base64)
# from byte
samples, sample_rate = soundfile.read(
io.BytesIO(audio_data_byte), dtype='float32')
# transform audio
outfile = args.output
if outfile.endswith(".wav"):
soundfile.write(outfile, samples, sample_rate)
elif outfile.endswith(".pcm"):
temp_wav = str(random.getrandbits(128)) + ".wav"
soundfile.write(temp_wav, samples, sample_rate)
wav2pcm(temp_wav, outfile, data_type=np.int16)
os.system("rm %s" % (temp_wav))
else:
print("The format for saving audio only supports wav or pcm")
return len(samples), sample_rate
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
'--text',
type=str,
default="你好,欢迎使用语音合成服务",
help='A sentence to be synthesized')
parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')
parser.add_argument(
'--volume', type=float, default=1.0, help='Audio volume')
parser.add_argument(
'--sample_rate',
type=int,
default=0,
help='Sampling rate, the default is the same as the model')
parser.add_argument(
'--output',
type=str,
default="./out.wav",
help='Synthesized audio file')
args = parser.parse_args()
st = time.time()
try:
samples_length, sample_rate = tts_client(args)
time_consume = time.time() - st
duration = samples_length / sample_rate
rtf = time_consume / duration
print("Synthesized audio successfully.")
print("Inference time: %f" % (time_consume))
print("The duration of synthesized audio: %f" % (duration))
print("The RTF is: %f" % (rtf))
except:
print("Failed to synthesized audio.")