add stream_play_tts.py, test=doc

3 years ago · 6bbe6de1ec
parent 4b1f82d312
commit 6bbe6de1ec
1 changed files with 181 additions and 0 deletions
--- a/paddlespeech/t2s/exps/stream_play_tts.py
+++ b/paddlespeech/t2s/exps/stream_play_tts.py
@ -0,0 +1,181 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# stream play TTS
+# Before first execution, download and decompress the models in the execution directory
+# wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip
+# wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip
+# unzip fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip
+# unzip mb_melgan_csmsc_onnx_0.2.0.zip
+import math
+import time
+
+import numpy as np
+import onnxruntime as ort
+import pyaudio
+import soundfile as sf
+
+from paddlespeech.server.utils.audio_process import float2pcm
+from paddlespeech.server.utils.util import denorm
+from paddlespeech.server.utils.util import get_chunks
+from paddlespeech.t2s.frontend.zh_frontend import Frontend
+
+# Streaming chunk sizes; the block/pad values are frame counts.
+voc_block = 36  # frames kept from each vocoder chunk
+voc_pad = 14  # context frames added on each side of a vocoder chunk
+am_block = 72  # block size passed to get_chunks() for the acoustic-model decoder
+am_pad = 12  # pad size passed to get_chunks() for the acoustic-model decoder
+voc_upsample = 300  # factor depadding() applies to frame counts on vocoder output
+
+# Phone vocabulary for the text frontend; no tone vocabulary is passed.
+phones_dict = "fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0/phone_id_map.txt"
+frontend = Frontend(phone_vocab_path=phones_dict, tone_vocab_path=None)
+
+# Statistics handed to denorm() for every decoded mel chunk.
+am_stat_path = "fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0/speech_stats.npy"
+am_mu, am_std = np.load(am_stat_path)
+
+# Model paths
+onnx_am_encoder = "fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0/fastspeech2_csmsc_am_encoder_infer.onnx"
+onnx_am_decoder = "fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0/fastspeech2_csmsc_am_decoder.onnx"
+onnx_am_postnet = "fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0/fastspeech2_csmsc_am_postnet.onnx"
+onnx_voc_melgan = "mb_melgan_csmsc_onnx_0.2.0/mb_melgan_csmsc.onnx"
+
+# Run inference on CPU
+providers = ['CPUExecutionProvider']
+
+# Configure the ORT session (default options, shared by all four sessions)
+sess_options = ort.SessionOptions()
+
+# Create the sessions; the models are loaded at import time
+am_encoder_infer_sess = ort.InferenceSession(
+    onnx_am_encoder, providers=providers, sess_options=sess_options)
+am_decoder_sess = ort.InferenceSession(
+    onnx_am_decoder, providers=providers, sess_options=sess_options)
+am_postnet_sess = ort.InferenceSession(
+    onnx_am_postnet, providers=providers, sess_options=sess_options)
+voc_melgan_sess = ort.InferenceSession(
+    onnx_voc_melgan, providers=providers, sess_options=sess_options)
+
+
+def depadding(data, chunk_num, chunk_id, block, pad, upsample):
+    """
+    Trim the padded region from the output of one streaming chunk.
+
+    Args:
+        data: Sliceable output of the chunk (sliced along its first axis).
+        chunk_num: Total number of chunks in the stream.
+        chunk_id: Zero-based index of this chunk.
+        block: Chunk size without padding, in input units.
+        pad: Padding size, in input units.
+        upsample: Number of output items per input unit.
+
+    Returns:
+        The slice of ``data`` that belongs to this chunk's own block.
+    """
+    keep = block * upsample
+
+    # The first chunk has nothing in front of it, so only the tail is cut.
+    if chunk_id == 0:
+        return data[:keep]
+
+    # Leading padding can never exceed what precedes this chunk.
+    lead = min(chunk_id * block, pad) * upsample
+
+    # The last chunk keeps everything after its leading padding.
+    if chunk_id == chunk_num - 1:
+        return data[lead:]
+
+    # Middle chunks drop the leading padding and keep exactly one block.
+    return data[lead:lead + keep]
+
+
+def inference_stream(text):
+    """
+    Synthesize ``text`` and yield waveform chunks as they become available.
+
+    For every sentence returned by the frontend, the encoder runs once, the
+    decoder and postnet run chunk by chunk, and the vocoder runs as soon as
+    enough mel frames have accumulated for its next chunk.
+
+    Args:
+        text: Input text handed to ``frontend.get_input_ids``.
+
+    Yields:
+        The vocoder output for one chunk after ``depadding`` has removed the
+        padded region.
+    """
+    input_ids = frontend.get_input_ids(
+        text, merge_sentences=False, get_tone_ids=False)
+
+    for sentence_phone_ids in input_ids["phone_ids"]:
+        part_phone_ids = sentence_phone_ids.numpy()
+
+        orig_hs = am_encoder_infer_sess.run(
+            None, input_feed={'text': part_phone_ids})
+        orig_hs = orig_hs[0]
+
+        # streaming voc chunk info
+        # NOTE(review): axis 1 of the encoder output is used as the total
+        # number of mel frames -- confirm against the exported model.
+        mel_len = orig_hs.shape[1]
+        voc_chunk_num = math.ceil(mel_len / voc_block)
+        voc_chunk_id = 0
+        start = 0
+        end = min(voc_block + voc_pad, mel_len)
+
+        # streaming am
+        hss = get_chunks(orig_hs, am_block, am_pad, "am")
+        am_chunk_num = len(hss)
+        mel_streaming = None
+        # A dedicated index name keeps this loop from shadowing any outer
+        # loop variable.
+        for am_chunk_id, hs in enumerate(hss):
+            # run() returns a list of outputs; take the first one explicitly
+            # instead of adding the list itself to an ndarray.
+            decoder_out = am_decoder_sess.run(
+                None, input_feed={'xs': hs})[0]
+            postnet_out = am_postnet_sess.run(
+                None,
+                input_feed={'xs': np.transpose(decoder_out, (0, 2, 1))})[0]
+            # The postnet output is added back onto the decoder output.
+            am_output_data = decoder_out + np.transpose(postnet_out,
+                                                        (0, 2, 1))
+            normalized_mel = am_output_data[0]
+
+            sub_mel = denorm(normalized_mel, am_mu, am_std)
+            sub_mel = depadding(sub_mel, am_chunk_num, am_chunk_id, am_block,
+                                am_pad, 1)
+
+            if mel_streaming is None:
+                mel_streaming = sub_mel
+            else:
+                mel_streaming = np.concatenate((mel_streaming, sub_mel), axis=0)
+
+            # streaming voc
+            # Once the streamed AM output holds at least as many mel frames
+            # as the next vocoder chunk needs, run the vocoder on it.
+            while (mel_streaming.shape[0] >= end and
+                   voc_chunk_id < voc_chunk_num):
+                voc_chunk = mel_streaming[start:end, :]
+
+                sub_wav = voc_melgan_sess.run(
+                    output_names=None, input_feed={'logmel': voc_chunk})
+                sub_wav = depadding(sub_wav[0], voc_chunk_num, voc_chunk_id,
+                                    voc_block, voc_pad, voc_upsample)
+
+                yield sub_wav
+
+                # Advance the window: one block forward, padded on both
+                # sides and clamped to the available mel frames.
+                voc_chunk_id += 1
+                start = max(0, voc_chunk_id * voc_block - voc_pad)
+                end = min((voc_chunk_id + 1) * voc_block + voc_pad, mel_len)
+
+
+if __name__ == '__main__':
+
+    text = "欢迎使用飞桨语音合成系统，测试一下合成效果。"
+    # Warm up: the first onnxruntime run takes longer, so do one throwaway
+    # synthesis before timing anything.
+    for _ in inference_stream(text="哈哈哈哈"):
+        pass
+
+    # Play back through pyaudio. Both the PyAudio instance and the stream
+    # are released in ``finally`` so a failure during synthesis or playback
+    # does not leave the audio device open.
+    wavs = []
+    p = pyaudio.PyAudio()
+    try:
+        stream = p.open(
+            format=p.get_format_from_width(2),  # int16
+            channels=1,
+            rate=24000,
+            output=True)
+        try:
+            # Timing: each printed value is the time since the previous
+            # chunk was received (or since the start, for the first chunk).
+            t1 = time.time()
+            for sub_wav in inference_stream(text):
+                print("响应时间：", time.time() - t1)
+                t1 = time.time()
+                wavs.append(sub_wav.flatten())
+                # float32 to int16
+                wav = float2pcm(sub_wav)
+                # to bytes
+                wav_bytes = wav.tobytes()
+                stream.write(wav_bytes)
+        finally:
+            # Close the pyaudio player
+            stream.stop_stream()
+            stream.close()
+    finally:
+        p.terminate()
+
+    # Export the result of the streaming synthesis
+    wav = np.concatenate(wavs)
+    print(wav.shape)
+    sf.write("demo_stream.wav", data=wav, samplerate=24000)