change sr, test=doc

pull/2121/head
lym0302 3 years ago
parent a0d1888c0d
commit 5b06b76ebc

@ -192,23 +192,7 @@ class TTSOnlineClientExecutor(BaseExecutor):
self.parser.add_argument( self.parser.add_argument(
'--spk_id', type=int, default=0, help='Speaker id') '--spk_id', type=int, default=0, help='Speaker id')
self.parser.add_argument( self.parser.add_argument(
'--speed', '--output', type=str, default=None, help='Client saves synthesized audio')
type=float,
default=1.0,
help='Audio speed, the value should be set between 0 and 3')
self.parser.add_argument(
'--volume',
type=float,
default=1.0,
help='Audio volume, the value should be set between 0 and 3')
self.parser.add_argument(
'--sample_rate',
type=int,
default=0,
choices=[0, 8000, 16000],
help='Sampling rate, the default is the same as the model')
self.parser.add_argument(
'--output', type=str, default=None, help='Synthesized audio file')
self.parser.add_argument( self.parser.add_argument(
"--play", type=bool, help="whether to play audio", default=False) "--play", type=bool, help="whether to play audio", default=False)
@ -219,9 +203,6 @@ class TTSOnlineClientExecutor(BaseExecutor):
port = args.port port = args.port
protocol = args.protocol protocol = args.protocol
spk_id = args.spk_id spk_id = args.spk_id
speed = args.speed
volume = args.volume
sample_rate = args.sample_rate
output = args.output output = args.output
play = args.play play = args.play
@ -232,9 +213,6 @@ class TTSOnlineClientExecutor(BaseExecutor):
port=port, port=port,
protocol=protocol, protocol=protocol,
spk_id=spk_id, spk_id=spk_id,
speed=speed,
volume=volume,
sample_rate=sample_rate,
output=output, output=output,
play=play) play=play)
return True return True
@ -250,9 +228,6 @@ class TTSOnlineClientExecutor(BaseExecutor):
port: int=8092, port: int=8092,
protocol: str="http", protocol: str="http",
spk_id: int=0, spk_id: int=0,
speed: float=1.0,
volume: float=1.0,
sample_rate: int=0,
output: str=None, output: str=None,
play: bool=False): play: bool=False):
""" """
@ -264,7 +239,7 @@ class TTSOnlineClientExecutor(BaseExecutor):
from paddlespeech.server.utils.audio_handler import TTSHttpHandler from paddlespeech.server.utils.audio_handler import TTSHttpHandler
handler = TTSHttpHandler(server_ip, port, play) handler = TTSHttpHandler(server_ip, port, play)
first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run( first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
input, spk_id, speed, volume, sample_rate, output) input, spk_id, output)
delay_time_list = compute_delay(receive_time_list, delay_time_list = compute_delay(receive_time_list,
chunk_duration_list) chunk_duration_list)
@ -274,7 +249,7 @@ class TTSOnlineClientExecutor(BaseExecutor):
handler = TTSWsHandler(server_ip, port, play) handler = TTSWsHandler(server_ip, port, play)
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete( first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
handler.run(input, output)) handler.run(input, spk_id, output))
delay_time_list = compute_delay(receive_time_list, delay_time_list = compute_delay(receive_time_list,
chunk_duration_list) chunk_duration_list)

@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
from typing import Text from typing import Text
from ..utils.log import logger from paddlespeech.cli.log import logger
__all__ = ['EngineFactory'] __all__ = ['EngineFactory']

@ -19,6 +19,8 @@ from typing import Optional
import numpy as np import numpy as np
import paddle import paddle
import librosa
from scipy import signal
from paddlespeech.cli.log import logger from paddlespeech.cli.log import logger
from paddlespeech.cli.tts.infer import TTSExecutor from paddlespeech.cli.tts.infer import TTSExecutor
@ -30,6 +32,8 @@ from paddlespeech.server.utils.util import denorm
from paddlespeech.server.utils.util import get_chunks from paddlespeech.server.utils.util import get_chunks
from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.server.utils.audio_process import change_speed
from paddlespeech.server.utils.exception import ServerBaseException
__all__ = ['TTSEngine', 'PaddleTTSConnectionHandler'] __all__ = ['TTSEngine', 'PaddleTTSConnectionHandler']
@ -64,6 +68,8 @@ class TTSServerExecutor(TTSExecutor):
self, 'am_postnet_sess'))) and hasattr(self, 'voc_inference'): self, 'am_postnet_sess'))) and hasattr(self, 'voc_inference'):
logger.info('Models had been initialized.') logger.info('Models had been initialized.')
return return
# am # am
am_tag = am + '-' + lang am_tag = am + '-' + lang
if am == "fastspeech2_csmsc_onnx": if am == "fastspeech2_csmsc_onnx":
@ -213,6 +219,8 @@ class TTSEngine(BaseEngine):
self.config.voc_sample_rate == self.config.am_sample_rate self.config.voc_sample_rate == self.config.am_sample_rate
), "The sample rate of AM and Vocoder model are different, please check model." ), "The sample rate of AM and Vocoder model are different, please check model."
self.sample_rate = self.config.voc_sample_rate
try: try:
if self.config.am_sess_conf.device is not None: if self.config.am_sess_conf.device is not None:
self.device = self.config.am_sess_conf.device self.device = self.config.am_sess_conf.device
@ -441,32 +449,15 @@ class PaddleTTSConnectionHandler:
self.final_response_time = time.time() - frontend_st self.final_response_time = time.time() - frontend_st
def preprocess(self, text_bese64: str=None, text_bytes: bytes=None):
# Convert byte to text
if text_bese64:
text_bytes = base64.b64decode(text_bese64) # base64 to bytes
text = text_bytes.decode('utf-8') # bytes to text
return text
def run(self, def run(self,
sentence: str, sentence: str,
spk_id: int=0, spk_id: int=0):
speed: float=1.0,
volume: float=1.0,
sample_rate: int=0,
save_path: str=None):
""" run include inference and postprocess. """ run include inference and postprocess.
Args: Args:
sentence (str): text to be synthesized sentence (str): text to be synthesized
spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0. spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0.
speed (float, optional): speed. Defaults to 1.0.
volume (float, optional): volume. Defaults to 1.0.
sample_rate (int, optional): target sample rate for synthesized audio,
0 means the same as the model sampling rate. Defaults to 0.
save_path (str, optional): The save path of the synthesized audio.
None means do not save audio. Defaults to None.
Returns: Returns:
wav_base64: The base64 format of the synthesized audio. wav_base64: The base64 format of the synthesized audio.
@ -488,7 +479,7 @@ class PaddleTTSConnectionHandler:
yield wav_base64 yield wav_base64
wav_all = np.concatenate(wav_list, axis=0) wav_all = np.concatenate(wav_list, axis=0)
duration = len(wav_all) / self.config.voc_sample_rate duration = len(wav_all) / self.tts_engine.sample_rate
logger.info(f"sentence: {sentence}") logger.info(f"sentence: {sentence}")
logger.info(f"The durations of audio is: {duration} s") logger.info(f"The durations of audio is: {duration} s")
logger.info(f"first response time: {self.first_response_time} s") logger.info(f"first response time: {self.first_response_time} s")

@ -276,6 +276,13 @@ class TTSEngine(BaseEngine):
logger.error(e) logger.error(e)
return False return False
assert (
self.executor.am_config.fs == self.executor.voc_config.fs
), "The sample rate of AM and Vocoder model are different, please check model."
self.sample_rate = self.executor.am_config.fs
self.am_block = self.config.am_block self.am_block = self.config.am_block
self.am_pad = self.config.am_pad self.am_pad = self.config.am_pad
self.voc_block = self.config.voc_block self.voc_block = self.config.voc_block
@ -459,32 +466,15 @@ class PaddleTTSConnectionHandler:
self.final_response_time = time.time() - frontend_st self.final_response_time = time.time() - frontend_st
def preprocess(self, text_bese64: str=None, text_bytes: bytes=None):
# Convert byte to text
if text_bese64:
text_bytes = base64.b64decode(text_bese64) # base64 to bytes
text = text_bytes.decode('utf-8') # bytes to text
return text
def run(self, def run(self,
sentence: str, sentence: str,
spk_id: int=0, spk_id: int=0,):
speed: float=1.0,
volume: float=1.0,
sample_rate: int=0,
save_path: str=None):
""" run include inference and postprocess. """ run include inference and postprocess.
Args: Args:
sentence (str): text to be synthesized sentence (str): text to be synthesized
spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0. spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0.
speed (float, optional): speed. Defaults to 1.0.
volume (float, optional): volume. Defaults to 1.0.
sample_rate (int, optional): target sample rate for synthesized audio,
0 means the same as the model sampling rate. Defaults to 0.
save_path (str, optional): The save path of the synthesized audio.
None means do not save audio. Defaults to None.
Returns: Returns:
wav_base64: The base64 format of the synthesized audio. wav_base64: The base64 format of the synthesized audio.
@ -507,7 +497,7 @@ class PaddleTTSConnectionHandler:
yield wav_base64 yield wav_base64
wav_all = np.concatenate(wav_list, axis=0) wav_all = np.concatenate(wav_list, axis=0)
duration = len(wav_all) / self.executor.am_config.fs duration = len(wav_all) / self.tts_engine.sample_rate
logger.info(f"sentence: {sentence}") logger.info(f"sentence: {sentence}")
logger.info(f"The durations of audio is: {duration} s") logger.info(f"The durations of audio is: {duration} s")

@ -266,6 +266,12 @@ class TTSWsHandler:
self.url = "ws://" + self.server + ":" + str( self.url = "ws://" + self.server + ":" + str(
self.port) + "/paddlespeech/tts/streaming" self.port) + "/paddlespeech/tts/streaming"
self.play = play self.play = play
# get model sample rate
self.url_get_sr = "http://" + str(self.server) + ":" + str(
self.port) + "/paddlespeech/tts/streaming/samplerate"
self.sample_rate = requests.get(self.url_get_sr).json()["sample_rate"]
if self.play: if self.play:
import pyaudio import pyaudio
self.buffer = b'' self.buffer = b''
@ -273,7 +279,7 @@ class TTSWsHandler:
self.stream = self.p.open( self.stream = self.p.open(
format=self.p.get_format_from_width(2), format=self.p.get_format_from_width(2),
channels=1, channels=1,
rate=24000, rate=self.sample_rate,
output=True) output=True)
self.mutex = threading.Lock() self.mutex = threading.Lock()
self.start_play = True self.start_play = True
@ -293,12 +299,16 @@ class TTSWsHandler:
self.buffer = b'' self.buffer = b''
self.mutex.release() self.mutex.release()
async def run(self, text: str, output: str=None): async def run(self,
text: str,
spk_id=0,
output: str=None):
"""Send a text to online server """Send a text to online server
Args: Args:
text (str): sentence to be synthesized text (str): sentence to be synthesized
output (str): save audio path spk_id (int, optional): speaker id. Defaults to 0.
output (str, optional): client save audio path. Defaults to None.
""" """
all_bytes = b'' all_bytes = b''
receive_time_list = [] receive_time_list = []
@ -315,8 +325,13 @@ class TTSWsHandler:
session = msg["session"] session = msg["session"]
# 3. send speech synthesis request # 3. send speech synthesis request
text_base64 = str(base64.b64encode((text).encode('utf-8')), "UTF8") #text_base64 = str(base64.b64encode((text).encode('utf-8')), "UTF8")
request = json.dumps({"text": text_base64}) params = {
"text": text,
"spk_id": spk_id,
}
request = json.dumps(params)
st = time.time() st = time.time()
await ws.send(request) await ws.send(request)
logging.info("send a message to the server") logging.info("send a message to the server")
@ -341,10 +356,10 @@ class TTSWsHandler:
# Rerutn last packet normally, no audio information # Rerutn last packet normally, no audio information
elif status == 2: elif status == 2:
final_response = time.time() - st final_response = time.time() - st
duration = len(all_bytes) / 2.0 / 24000 duration = len(all_bytes) / 2.0 / self.sample_rate
if output is not None: if output is not None:
save_audio_success = save_audio(all_bytes, output) save_audio_success = save_audio(all_bytes, output, self.sample_rate)
else: else:
save_audio_success = False save_audio_success = False
@ -362,7 +377,7 @@ class TTSWsHandler:
receive_time_list.append(time.time()) receive_time_list.append(time.time())
audio = message["audio"] audio = message["audio"]
audio = base64.b64decode(audio) # bytes audio = base64.b64decode(audio) # bytes
chunk_duration_list.append(len(audio) / 2.0 / 24000) chunk_duration_list.append(len(audio) / 2.0 / self.sample_rate)
all_bytes += audio all_bytes += audio
if self.play: if self.play:
self.mutex.acquire() self.mutex.acquire()
@ -403,19 +418,26 @@ class TTSHttpHandler:
self.port) + "/paddlespeech/tts/streaming" self.port) + "/paddlespeech/tts/streaming"
self.play = play self.play = play
# get model sample rate
self.url_get_sr = "http://" + str(self.server) + ":" + str(
self.port) + "/paddlespeech/tts/streaming/samplerate"
self.sample_rate = requests.get(self.url_get_sr).json()["sample_rate"]
if self.play: if self.play:
import pyaudio import pyaudio
self.buffer = b'' self.buffer = b''
self.p = pyaudio.PyAudio() self.p = pyaudio.PyAudio()
self.start_play = True
self.max_fail = 50
self.stream = self.p.open( self.stream = self.p.open(
format=self.p.get_format_from_width(2), format=self.p.get_format_from_width(2),
channels=1, channels=1,
rate=24000, rate=self.sample_rate,
output=True) output=True)
self.mutex = threading.Lock() self.mutex = threading.Lock()
self.start_play = True
self.t = threading.Thread(target=self.play_audio) self.t = threading.Thread(target=self.play_audio)
self.max_fail = 50
logger.info(f"endpoint: {self.url}") logger.info(f"endpoint: {self.url}")
def play_audio(self): def play_audio(self):
@ -433,28 +455,19 @@ class TTSHttpHandler:
def run(self, def run(self,
text: str, text: str,
spk_id=0, spk_id=0,
speed=1.0,
volume=1.0,
sample_rate=0,
output: str=None): output: str=None):
"""Send a text to tts online server """Send a text to tts online server
Args: Args:
text (str): sentence to be synthesized. text (str): sentence to be synthesized.
spk_id (int, optional): speaker id. Defaults to 0. spk_id (int, optional): speaker id. Defaults to 0.
speed (float, optional): audio speed. Defaults to 1.0. output (str, optional): client save audio path. Defaults to None.
volume (float, optional): audio volume. Defaults to 1.0.
sample_rate (int, optional): audio sample rate, 0 means the same as model. Defaults to 0.
output (str, optional): save audio path. Defaults to None.
""" """
# 1. Create request # 1. Create request
params = { params = {
"text": text, "text": text,
"spk_id": spk_id, "spk_id": spk_id,
"speed": speed,
"volume": volume,
"sample_rate": sample_rate,
"save_path": output
} }
all_bytes = b'' all_bytes = b''
@ -482,14 +495,14 @@ class TTSHttpHandler:
self.t.start() self.t.start()
self.start_play = False self.start_play = False
all_bytes += audio all_bytes += audio
chunk_duration_list.append(len(audio) / 2.0 / 24000) chunk_duration_list.append(len(audio) / 2.0 / self.sample_rate)
final_response = time.time() - st final_response = time.time() - st
duration = len(all_bytes) / 2.0 / 24000 duration = len(all_bytes) / 2.0 / self.sample_rate
html.close() # when stream=True html.close() # when stream=True
if output is not None: if output is not None:
save_audio_success = save_audio(all_bytes, output) save_audio_success = save_audio(all_bytes, output, self.sample_rate)
else: else:
save_audio_success = False save_audio_success = False

@ -16,7 +16,7 @@ from typing import Optional
import onnxruntime as ort import onnxruntime as ort
from .log import logger from paddlespeech.cli.log import logger
def get_sess(model_path: Optional[os.PathLike]=None, sess_conf: dict=None): def get_sess(model_path: Optional[os.PathLike]=None, sess_conf: dict=None):

Loading…
Cancel
Save