diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md
index 860d9a97..cbea6bf7 100644
--- a/demos/streaming_tts_server/README.md
+++ b/demos/streaming_tts_server/README.md
@@ -119,12 +119,9 @@ The configuration file can be found in `conf/tts_online_application.yaml`.
   - `protocol`: Service protocol, choices: [http, websocket], default: http.
   - `input`: (required): Input text to generate.
   - `spk_id`: Speaker id for multi-speaker text to speech. Default: 0
-  - `speed`: Audio speed, the value should be set between 0 and 3. Default: 1.0
-  - `volume`: Audio volume, the value should be set between 0 and 3. Default: 1.0
-  - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0
-  - `output`: Output wave filepath. Default: None, which means not to save the audio to the local.
+  - `output`: Output wave filepath on the client. Default: None, which means the audio is not saved locally.
   - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**.
-  - `spk_id, speed, volume, sample_rate` do not take effect in streaming speech synthesis service temporarily.
+  - Currently, only single-speaker models are supported, so `spk_id` has no effect. The streaming TTS service does not support changing the sample rate, speed, or volume.

  Output:
  ```bash
@@ -150,9 +147,6 @@ The configuration file can be found in `conf/tts_online_application.yaml`.
         port=8092,
         protocol="http",
         spk_id=0,
-        speed=1.0,
-        volume=1.0,
-        sample_rate=0,
         output="./output.wav",
         play=False)

@@ -256,12 +250,10 @@ The configuration file can be found in `conf/tts_online_application.yaml`.
   - `protocol`: Service protocol, choices: [http, websocket], default: http.
   - `input`: (required): Input text to generate.
   - `spk_id`: Speaker id for multi-speaker text to speech. Default: 0
-  - `speed`: Audio speed, the value should be set between 0 and 3. Default: 1.0
-  - `volume`: Audio volume, the value should be set between 0 and 3. Default: 1.0
-  - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0
-  - `output`: Output wave filepath. Default: None, which means not to save the audio to the local.
+  - `output`: Output wave filepath on the client. Default: None, which means the audio is not saved locally.
   - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**.
-  - `spk_id, speed, volume, sample_rate` do not take effect in streaming speech synthesis service temporarily.
+  - Currently, only single-speaker models are supported, so `spk_id` has no effect. The streaming TTS service does not support changing the sample rate, speed, or volume.
+

  Output:

@@ -288,9 +280,6 @@ The configuration file can be found in `conf/tts_online_application.yaml`.
         port=8092,
         protocol="websocket",
         spk_id=0,
-        speed=1.0,
-        volume=1.0,
-        sample_rate=0,
         output="./output.wav",
         play=False)

diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md
index 254ec26a..3cd28170 100644
--- a/demos/streaming_tts_server/README_cn.md
+++ b/demos/streaming_tts_server/README_cn.md
@@ -118,12 +118,9 @@
   - `protocol`: 服务协议,可选 [http, websocket], 默认: http。
   - `input`: (必须输入): 待合成的文本。
   - `spk_id`: 说话人 id,用于多说话人语音合成,默认值: 0。
-  - `speed`: 音频速度,该值应设置在 0 到 3 之间。 默认值:1.0
-  - `volume`: 音频音量,该值应设置在 0 到 3 之间。 默认值: 1.0
-  - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认值:0,表示与模型采样率相同
-  - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。
+  - `output`: 客户端输出音频的路径, 默认值:None,表示不保存音频。
   - `play`: 是否播放音频,边合成边播放, 默认值:False,表示不播放。**播放音频需要依赖pyaudio库**。
-  - `spk_id, speed, volume, sample_rate` 在流式语音合成服务中暂时不生效。
+  - 目前代码中只支持单说话人的模型,因此 spk_id 的选择并不生效。流式 TTS 不支持更换采样率,变速和变音量等功能。

  输出:

@@ -150,9 +147,6 @@
         port=8092,
         protocol="http",
         spk_id=0,
-        speed=1.0,
-        volume=1.0,
-        sample_rate=0,
         output="./output.wav",
         play=False)

@@ -256,12 +250,10 @@
   - `protocol`: 服务协议,可选 [http, websocket], 默认: http。
   - `input`: (必须输入): 待合成的文本。
   - `spk_id`: 说话人 id,用于多说话人语音合成,默认值: 0。
-  - `speed`: 音频速度,该值应设置在 0 到 3 之间。 默认值:1.0
-  - `volume`: 音频音量,该值应设置在 0 到 3 之间。 默认值: 1.0
-  - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认值:0,表示与模型采样率相同
-  - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。
+  - `output`: 客户端输出音频的路径, 默认值:None,表示不保存音频。
   - `play`: 是否播放音频,边合成边播放, 默认值:False,表示不播放。**播放音频需要依赖pyaudio库**。
-  - `spk_id, speed, volume, sample_rate` 在流式语音合成服务中暂时不生效。
+  - 目前代码中只支持单说话人的模型,因此 spk_id 的选择并不生效。流式 TTS 不支持更换采样率,变速和变音量等功能。
+

  输出:

@@ -288,9 +280,6 @@
         port=8092,
         protocol="websocket",
         spk_id=0,
-        speed=1.0,
-        volume=1.0,
-        sample_rate=0,
         output="./output.wav",
         play=False)

diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py
index bd1186df..e8e57fff 100644
--- a/paddlespeech/server/bin/paddlespeech_client.py
+++ b/paddlespeech/server/bin/paddlespeech_client.py
@@ -191,23 +191,10 @@ class TTSOnlineClientExecutor(BaseExecutor):
         self.parser.add_argument(
             '--spk_id', type=int, default=0, help='Speaker id')
         self.parser.add_argument(
-            '--speed',
-            type=float,
-            default=1.0,
-            help='Audio speed, the value should be set between 0 and 3')
-        self.parser.add_argument(
-            '--volume',
-            type=float,
-            default=1.0,
-            help='Audio volume, the value should be set between 0 and 3')
-        self.parser.add_argument(
-            '--sample_rate',
-            type=int,
-            default=0,
-            choices=[0, 8000, 16000],
-            help='Sampling rate, the default is the same as the model')
-        self.parser.add_argument(
-            '--output', type=str, default=None, help='Synthesized audio file')
+            '--output',
+            type=str,
+            default=None,
+            help='Path to save the synthesized audio on the client')
         self.parser.add_argument(
             "--play", type=bool, help="whether to play audio", default=False)

@@ -218,9 +205,6 @@ class TTSOnlineClientExecutor(BaseExecutor):
         port = args.port
         protocol = args.protocol
         spk_id = args.spk_id
-        speed = args.speed
-        volume = args.volume
-        sample_rate = args.sample_rate
         output = args.output
         play = args.play

@@ -231,9 +215,6 @@ class TTSOnlineClientExecutor(BaseExecutor):
                 port=port,
                 protocol=protocol,
                 spk_id=spk_id,
-                speed=speed,
-                volume=volume,
-                sample_rate=sample_rate,
                 output=output,
                 play=play)
             return True
@@ -249,9 +230,6 @@ class TTSOnlineClientExecutor(BaseExecutor):
                  port: int=8092,
                  protocol: str="http",
                  spk_id: int=0,
-                 speed: float=1.0,
-                 volume: float=1.0,
-                 sample_rate: int=0,
                  output: str=None,
                  play: bool=False):
         """
@@ -263,7 +241,7 @@ class TTSOnlineClientExecutor(BaseExecutor):
             from paddlespeech.server.utils.audio_handler import TTSHttpHandler
             handler = TTSHttpHandler(server_ip, port, play)
             first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
-                input, spk_id, speed, volume, sample_rate, output)
+                input, spk_id, output)

             delay_time_list = compute_delay(receive_time_list,
                                             chunk_duration_list)
@@ -273,7 +251,7 @@ class TTSOnlineClientExecutor(BaseExecutor):
             handler = TTSWsHandler(server_ip, port, play)
             loop = asyncio.get_event_loop()
             first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
-                handler.run(input, output))
+                handler.run(input, spk_id, output))

             delay_time_list = compute_delay(receive_time_list,
                                             chunk_duration_list)
diff --git a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
index 7b8e04e8..0995a55d 100644
--- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
@@ -64,6 +64,7 @@ class TTSServerExecutor(TTSExecutor):
                 self, 'am_postnet_sess'))) and hasattr(self, 'voc_inference'):
             logger.debug('Models had been initialized.')
             return
+
         # am
         am_tag = am + '-' + lang
         if am == "fastspeech2_csmsc_onnx":
@@ -211,6 +212,8 @@ class TTSEngine(BaseEngine):
             self.config.voc_sample_rate == self.config.am_sample_rate
         ), "The sample rate of AM and Vocoder model are different, please check model."

+        self.sample_rate = self.config.voc_sample_rate
+
         try:
             if self.config.am_sess_conf.device is not None:
                 self.device = self.config.am_sess_conf.device
@@ -439,33 +442,13 @@ class PaddleTTSConnectionHandler:

         self.final_response_time = time.time() - frontend_st

-    def preprocess(self, text_bese64: str=None, text_bytes: bytes=None):
-        # Convert byte to text
-        if text_bese64:
-            text_bytes = base64.b64decode(text_bese64)  # base64 to bytes
-        text = text_bytes.decode('utf-8')  # bytes to text
-
-        return text
-
-    def run(self,
-            sentence: str,
-            spk_id: int=0,
-            speed: float=1.0,
-            volume: float=1.0,
-            sample_rate: int=0,
-            save_path: str=None):
+    def run(self, sentence: str, spk_id: int=0):
         """ run include inference and postprocess.

         Args:
             sentence (str): text to be synthesized
             spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0.
-            speed (float, optional): speed. Defaults to 1.0.
-            volume (float, optional): volume. Defaults to 1.0.
-            sample_rate (int, optional): target sample rate for synthesized audio,
-                0 means the same as the model sampling rate. Defaults to 0.
-            save_path (str, optional): The save path of the synthesized audio.
-                None means do not save audio. Defaults to None.
-
+
         Returns:
             wav_base64: The base64 format of the synthesized audio.
""" @@ -486,7 +469,7 @@ class PaddleTTSConnectionHandler: yield wav_base64 wav_all = np.concatenate(wav_list, axis=0) - duration = len(wav_all) / self.config.voc_sample_rate + duration = len(wav_all) / self.tts_engine.sample_rate logger.info(f"sentence: {sentence}") logger.info(f"The durations of audio is: {duration} s") logger.info(f"first response time: {self.first_response_time} s") diff --git a/paddlespeech/server/engine/tts/online/python/tts_engine.py b/paddlespeech/server/engine/tts/online/python/tts_engine.py index 9bd95849..a46b84bd 100644 --- a/paddlespeech/server/engine/tts/online/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/python/tts_engine.py @@ -282,6 +282,12 @@ class TTSEngine(BaseEngine): logger.error(e) return False + assert ( + self.executor.am_config.fs == self.executor.voc_config.fs + ), "The sample rate of AM and Vocoder model are different, please check model." + + self.sample_rate = self.executor.am_config.fs + self.am_block = self.config.am_block self.am_pad = self.config.am_pad self.voc_block = self.config.voc_block @@ -465,32 +471,15 @@ class PaddleTTSConnectionHandler: self.final_response_time = time.time() - frontend_st - def preprocess(self, text_bese64: str=None, text_bytes: bytes=None): - # Convert byte to text - if text_bese64: - text_bytes = base64.b64decode(text_bese64) # base64 to bytes - text = text_bytes.decode('utf-8') # bytes to text - - return text - - def run(self, + def run( + self, sentence: str, - spk_id: int=0, - speed: float=1.0, - volume: float=1.0, - sample_rate: int=0, - save_path: str=None): + spk_id: int=0, ): """ run include inference and postprocess. Args: sentence (str): text to be synthesized spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0. - speed (float, optional): speed. Defaults to 1.0. - volume (float, optional): volume. Defaults to 1.0. - sample_rate (int, optional): target sample rate for synthesized audio, - 0 means the same as the model sampling rate. Defaults to 0. - save_path (str, optional): The save path of the synthesized audio. - None means do not save audio. Defaults to None. Returns: wav_base64: The base64 format of the synthesized audio. 
@@ -513,7 +502,7 @@ class PaddleTTSConnectionHandler:
                 yield wav_base64

         wav_all = np.concatenate(wav_list, axis=0)
-        duration = len(wav_all) / self.executor.am_config.fs
+        duration = len(wav_all) / self.tts_engine.sample_rate

         logger.info(f"sentence: {sentence}")
         logger.info(f"The durations of audio is: {duration} s")
diff --git a/paddlespeech/server/restful/tts_api.py b/paddlespeech/server/restful/tts_api.py
index 53fe159f..61e4c49f 100644
--- a/paddlespeech/server/restful/tts_api.py
+++ b/paddlespeech/server/restful/tts_api.py
@@ -140,7 +140,9 @@ def tts(request_body: TTSRequest):

 @router.post("/paddlespeech/tts/streaming")
 async def stream_tts(request_body: TTSRequest):
+    # get params
     text = request_body.text
+    spk_id = request_body.spk_id

     engine_pool = get_engine_pool()
     tts_engine = engine_pool['tts']
@@ -156,4 +158,24 @@

     connection_handler = PaddleTTSConnectionHandler(tts_engine)

-    return StreamingResponse(connection_handler.run(sentence=text))
+    return StreamingResponse(
+        connection_handler.run(sentence=text, spk_id=spk_id))
+
+
+@router.get("/paddlespeech/tts/streaming/samplerate")
+def get_samplerate():
+    try:
+        engine_pool = get_engine_pool()
+        tts_engine = engine_pool['tts']
+        logger.info("Get tts engine successfully.")
+        sample_rate = tts_engine.sample_rate
+
+        response = {"sample_rate": sample_rate}
+
+    except ServerBaseException as e:
+        response = failed_response(e.error_code, e.msg)
+    except BaseException:
+        response = failed_response(ErrorCode.SERVER_UNKOWN_ERR)
+        traceback.print_exc()
+
+    return response
diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py
index d4540781..b5629037 100644
--- a/paddlespeech/server/utils/audio_handler.py
+++ b/paddlespeech/server/utils/audio_handler.py
@@ -266,6 +266,12 @@ class TTSWsHandler:
         self.url = "ws://" + self.server + ":" + str(
             self.port) + "/paddlespeech/tts/streaming"
         self.play = play
+
+        # get model sample rate
+        self.url_get_sr = "http://" + str(self.server) + ":" + str(
+            self.port) + "/paddlespeech/tts/streaming/samplerate"
+        self.sample_rate = requests.get(self.url_get_sr).json()["sample_rate"]
+
         if self.play:
             import pyaudio
             self.buffer = b''
@@ -273,7 +279,7 @@ class TTSWsHandler:
             self.stream = self.p.open(
                 format=self.p.get_format_from_width(2),
                 channels=1,
-                rate=24000,
+                rate=self.sample_rate,
                 output=True)
             self.mutex = threading.Lock()
             self.start_play = True
@@ -293,12 +299,13 @@ class TTSWsHandler:
             self.buffer = b''
             self.mutex.release()

-    async def run(self, text: str, output: str=None):
+    async def run(self, text: str, spk_id=0, output: str=None):
         """Send a text to online server

         Args:
             text (str): sentence to be synthesized
-            output (str): save audio path
+            spk_id (int, optional): speaker id. Defaults to 0.
+            output (str, optional): client save audio path. Defaults to None.
         """
         all_bytes = b''
         receive_time_list = []
@@ -315,8 +322,13 @@ class TTSWsHandler:
                 session = msg["session"]

                 # 3. send speech synthesis request
-                text_base64 = str(base64.b64encode((text).encode('utf-8')), "UTF8")
-                request = json.dumps({"text": text_base64})
+                #text_base64 = str(base64.b64encode((text).encode('utf-8')), "UTF8")
+                params = {
+                    "text": text,
+                    "spk_id": spk_id,
+                }
+
+                request = json.dumps(params)
                 st = time.time()
                 await ws.send(request)
                 logging.debug("send a message to the server")
@@ -341,10 +353,11 @@ class TTSWsHandler:
                 # Rerutn last packet normally, no audio information
                 elif status == 2:
                     final_response = time.time() - st
-                    duration = len(all_bytes) / 2.0 / 24000
+                    duration = len(all_bytes) / 2.0 / self.sample_rate

                     if output is not None:
-                        save_audio_success = save_audio(all_bytes, output)
+                        save_audio_success = save_audio(all_bytes, output,
+                                                        self.sample_rate)
                     else:
                         save_audio_success = False

@@ -362,7 +375,8 @@ class TTSWsHandler:
                     receive_time_list.append(time.time())
                     audio = message["audio"]
                     audio = base64.b64decode(audio)  # bytes
-                    chunk_duration_list.append(len(audio) / 2.0 / 24000)
+                    chunk_duration_list.append(
+                        len(audio) / 2.0 / self.sample_rate)
                     all_bytes += audio
                     if self.play:
                         self.mutex.acquire()
@@ -403,19 +417,26 @@ class TTSHttpHandler:
             self.port) + "/paddlespeech/tts/streaming"
         self.play = play

+        # get model sample rate
+        self.url_get_sr = "http://" + str(self.server) + ":" + str(
+            self.port) + "/paddlespeech/tts/streaming/samplerate"
+        self.sample_rate = requests.get(self.url_get_sr).json()["sample_rate"]
+
         if self.play:
             import pyaudio
             self.buffer = b''
             self.p = pyaudio.PyAudio()
+            self.start_play = True
+            self.max_fail = 50
+
             self.stream = self.p.open(
                 format=self.p.get_format_from_width(2),
                 channels=1,
-                rate=24000,
+                rate=self.sample_rate,
                 output=True)
             self.mutex = threading.Lock()
-            self.start_play = True
             self.t = threading.Thread(target=self.play_audio)
-            self.max_fail = 50
+
         logger.info(f"endpoint: {self.url}")

@@ -430,31 +451,19 @@ class TTSHttpHandler:
             self.buffer = b''
             self.mutex.release()

-    def run(self,
-            text: str,
-            spk_id=0,
-            speed=1.0,
-            volume=1.0,
-            sample_rate=0,
-            output: str=None):
+    def run(self, text: str, spk_id=0, output: str=None):
         """Send a text to tts online server

         Args:
             text (str): sentence to be synthesized.
             spk_id (int, optional): speaker id. Defaults to 0.
-            speed (float, optional): audio speed. Defaults to 1.0.
-            volume (float, optional): audio volume. Defaults to 1.0.
-            sample_rate (int, optional): audio sample rate, 0 means the same as model. Defaults to 0.
-            output (str, optional): save audio path. Defaults to None.
+            output (str, optional): client save audio path. Defaults to None.
         """
+        # 1. Create request
         params = {
             "text": text,
             "spk_id": spk_id,
-            "speed": speed,
-            "volume": volume,
-            "sample_rate": sample_rate,
-            "save_path": output
         }

         all_bytes = b''
@@ -482,14 +491,14 @@ class TTSHttpHandler:
                         self.t.start()
                         self.start_play = False
                 all_bytes += audio
-                chunk_duration_list.append(len(audio) / 2.0 / 24000)
+                chunk_duration_list.append(len(audio) / 2.0 / self.sample_rate)

         final_response = time.time() - st
-        duration = len(all_bytes) / 2.0 / 24000
+        duration = len(all_bytes) / 2.0 / self.sample_rate
         html.close()  # when stream=True

         if output is not None:
-            save_audio_success = save_audio(all_bytes, output)
+            save_audio_success = save_audio(all_bytes, output, self.sample_rate)
         else:
             save_audio_success = False

diff --git a/paddlespeech/server/ws/tts_api.py b/paddlespeech/server/ws/tts_api.py
index 3d8b222e..275711f5 100644
--- a/paddlespeech/server/ws/tts_api.py
+++ b/paddlespeech/server/ws/tts_api.py
@@ -87,12 +87,12 @@ async def websocket_endpoint(websocket: WebSocket):

             # speech synthesis request
             elif 'text' in message:
-                text_bese64 = message["text"]
-                sentence = connection_handler.preprocess(
-                    text_bese64=text_bese64)
+                text = message["text"]
+                spk_id = message["spk_id"]

                 # run
-                wav_generator = connection_handler.run(sentence)
+                wav_generator = connection_handler.run(
+                    sentence=text, spk_id=spk_id)

                 while True:
                     try:
@@ -116,3 +116,22 @@

     except Exception as e:
         logger.error(e)
+
+
+@router.get("/paddlespeech/tts/streaming/samplerate")
+def get_samplerate():
+    try:
+        engine_pool = get_engine_pool()
+        tts_engine = engine_pool['tts']
+        logger.info("Get tts engine successfully.")
+        sample_rate = tts_engine.sample_rate
+
+        response = {"sample_rate": sample_rate}
+
+    except ServerBaseException as e:
+        response = failed_response(e.error_code, e.msg)
+    except BaseException:
+        response = failed_response(ErrorCode.SERVER_UNKOWN_ERR)
+        traceback.print_exc()
+
+    return response
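
As a usage note, here is a minimal sketch of how a client can exercise the new `/paddlespeech/tts/streaming/samplerate` endpoint added in this patch. It assumes the streaming TTS server is already running locally on port 8092, as in the demo README; only the endpoint path and the `sample_rate` response key come from the patch itself.

```python
import requests

# The patch adds GET /paddlespeech/tts/streaming/samplerate, which reports the
# sample rate of the loaded AM/vocoder pair as {"sample_rate": <int>}.
resp = requests.get("http://127.0.0.1:8092/paddlespeech/tts/streaming/samplerate")
sample_rate = resp.json()["sample_rate"]
print(f"streaming TTS model sample rate: {sample_rate} Hz")
```

This is the same query that `TTSHttpHandler` and `TTSWsHandler` now perform in their constructors, so the client no longer hard-codes 24000 when computing durations or opening the pyaudio stream.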