add fastspeech2 cnndecoder onnx model, test=tts

3 years ago · dafe7c3657
parent 149b1fb1fa
commit dafe7c3657
17 changed files with 908 additions and 201 deletions
--- a/examples/csmsc/tts2/local/inference.sh
+++ b/examples/csmsc/tts2/local/inference.sh
@ -30,21 +30,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --tones_dict=dump/tone_id_map.txt
 fi
 # style melgan
 # style melgan's Dygraph to Static Graph is not ready now
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=speedyspeech_csmsc \
        --voc=style_melgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt
 fi
 # hifigan
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=speedyspeech_csmsc \
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@ -231,14 +231,19 @@ Pretrained FastSpeech2 model with no silence in the edge of audios:
 The static model can be downloaded here:
 - [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)
 - [fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip)
 - [fastspeech2_cnndecoder_csmsc_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_static_1.0.0.zip)
 - [fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip)
 The ONNX model can be downloaded here:
 - [fastspeech2_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip)
 - [fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip)
 - [fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip)
 Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss 
 :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
 default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287|
 conformer| 2(gpu) x 76000|1.0675|0.56103|0.035869|0.31553|0.15509|
 cnndecoder| 1(gpu) x 153000|1.1153|0.61475|0.03380|0.30414|0.14707|
 FastSpeech2 checkpoint contains files listed below.
 ```text
--- a/examples/csmsc/tts3/local/inference.sh
+++ b/examples/csmsc/tts3/local/inference.sh
@ -5,6 +5,7 @@ train_output_path=$1
 stage=0
 stop_stage=0
 # pwgan
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
@ -27,20 +28,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --phones_dict=dump/phone_id_map.txt
 fi
 # style melgan
 # style melgan's Dygraph to Static Graph is not ready now
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
        --voc=style_melgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt
 fi
 # hifigan
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
@ -51,7 +41,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
 fi
 # wavernn
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
--- a/examples/csmsc/tts3/local/inference_streaming.sh
+++ b/examples/csmsc/tts3/local/inference_streaming.sh
@ -0,0 +1,47 @@
 #!/bin/bash
 # Run static-graph inference for the streaming FastSpeech2 acoustic model
 # with one of several vocoders.
 # Usage: ./local/inference_streaming.sh <train_output_path>
 # NOTE(review): BIN_DIR is not set here; presumably exported by the caller's
 # environment -- confirm.
 train_output_path=$1
 # only the hifigan stage runs with these defaults
 stage=2
 stop_stage=2
 # Path expansions are quoted so that directories containing whitespace are
 # passed to python as a single argument.
 # pwgan
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 "${BIN_DIR}/../inference_streaming.py" \
        --inference_dir="${train_output_path}/inference_streaming" \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_csmsc \
        --text="${BIN_DIR}/../sentences.txt" \
        --output_dir="${train_output_path}/pd_infer_out_streaming" \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
 fi
 # for more GAN Vocoders
 # multi band melgan
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 "${BIN_DIR}/../inference_streaming.py" \
        --inference_dir="${train_output_path}/inference_streaming" \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=mb_melgan_csmsc \
        --text="${BIN_DIR}/../sentences.txt" \
        --output_dir="${train_output_path}/pd_infer_out_streaming" \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
 fi
 # hifigan
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 "${BIN_DIR}/../inference_streaming.py" \
        --inference_dir="${train_output_path}/inference_streaming" \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_csmsc \
        --text="${BIN_DIR}/../sentences.txt" \
        --output_dir="${train_output_path}/pd_infer_out_streaming" \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
 fi
--- a/examples/csmsc/tts3/local/ort_predict_streaming.sh
+++ b/examples/csmsc/tts3/local/ort_predict_streaming.sh
@ -0,0 +1,19 @@
 #!/bin/bash
 # Synthesize with the streaming FastSpeech2 + vocoder ONNX models through
 # onnxruntime.
 # Usage: ./local/ort_predict_streaming.sh <train_output_path>
 # NOTE(review): BIN_DIR is not set here; presumably exported by the caller's
 # environment -- confirm.
 train_output_path=$1
 stage=0
 stop_stage=0
 # e2e, synthesize from text
 # Path expansions are quoted so that directories containing whitespace are
 # passed to python as a single argument.
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 "${BIN_DIR}/../ort_predict_streaming.py" \
        --inference_dir="${train_output_path}/inference_onnx_streaming" \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_csmsc \
        --output_dir="${train_output_path}/onnx_infer_out_streaming" \
        --text="${BIN_DIR}/../csmsc_test.txt" \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=2 \
        --am_streaming=True
 fi
--- a/examples/csmsc/tts3/local/synthesize_streaming.sh
+++ b/examples/csmsc/tts3/local/synthesize_streaming.sh
@ -88,5 +88,6 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/test_e2e_streaming \
        --phones_dict=dump/phone_id_map.txt \
-        --am_streaming=True
+        --am_streaming=True \
        --inference_dir=${train_output_path}/inference_streaming
 fi
--- a/examples/csmsc/tts3/run_cnndecoder.sh
+++ b/examples/csmsc/tts3/run_cnndecoder.sh
@ -9,7 +9,7 @@ stop_stage=100
 conf_path=conf/cnndecoder.yaml
 train_output_path=exp/cnndecoder
-ckpt_name=snapshot_iter_153.pdz
+ckpt_name=snapshot_iter_153000.pdz
 # with the following command, you can choose the stage range you want to run
 # such as `./run.sh --stage 0 --stop-stage 0`
@ -31,18 +31,75 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
 fi
 # synthesize_e2e non-streaming
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # synthesize_e2e, vocoder is pwgan
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
 fi
 # inference non-streaming
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # inference with static model
    CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
 fi
 # synthesize_e2e streaming
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # synthesize_e2e, vocoder is pwgan
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_streaming.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
 fi
 # inference streaming
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
    # inference with static model
    CUDA_VISIBLE_DEVICES=${gpus} ./local/inference_streaming.sh ${train_output_path} || exit -1
 fi
 # paddle2onnx non streaming
 if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
    # install paddle2onnx
    version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
    if [[ -z "$version" || ${version} != '0.9.4' ]]; then
        pip install paddle2onnx==0.9.4
    fi
    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_csmsc
    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
 fi
 # onnxruntime non streaming
 # inference with onnxruntime, use fastspeech2 + hifigan by default
 if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
    # install onnxruntime
    version=$(echo `pip list |grep "onnxruntime"` |awk -F" " '{print $2}')
    if [[ -z "$version" || ${version} != '1.10.0' ]]; then
        pip install onnxruntime==1.10.0
    fi
    ./local/ort_predict.sh ${train_output_path}
 fi
 # paddle2onnx streaming
 if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
    # install paddle2onnx
    version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
    if [[ -z "$version" || ${version} != '0.9.4' ]]; then
        pip install paddle2onnx==0.9.4
    fi
    # streaming acoustic model
    ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_encoder_infer
    ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_decoder
    ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_postnet
    # vocoder
    ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming hifigan_csmsc
 fi
 # onnxruntime streaming
 if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ]; then
    # install onnxruntime
    version=$(echo `pip list |grep "onnxruntime"` |awk -F" " '{print $2}')
    if [[ -z "$version" || ${version} != '1.10.0' ]]; then
        pip install onnxruntime==1.10.0
    fi
    ./local/ort_predict_streaming.sh ${train_output_path}
 fi
--- a/paddlespeech/t2s/exps/inference.py
+++ b/paddlespeech/t2s/exps/inference.py
@ -14,92 +14,17 @@
 import argparse
 from pathlib import Path
 import numpy
 import soundfile as sf
 from paddle import inference
 from timer import timer
 from paddlespeech.t2s.exps.syn_utils import get_am_output
 from paddlespeech.t2s.exps.syn_utils import get_frontend
 from paddlespeech.t2s.exps.syn_utils import get_predictor
 from paddlespeech.t2s.exps.syn_utils import get_sentences
 from paddlespeech.t2s.exps.syn_utils import get_voc_output
 from paddlespeech.t2s.utils import str2bool
 def get_predictor(args, filed='am'):
    """Create a Paddle Inference predictor for the acoustic model or vocoder.

    Args:
        args: parsed CLI namespace; reads ``am`` or ``voc`` (the model file
            stem), ``inference_dir`` and ``device``.
        filed: which model to load, ``'am'`` or ``'voc'``.

    Returns:
        The predictor built by ``inference.create_predictor``.

    Raises:
        ValueError: if ``filed`` is neither ``'am'`` nor ``'voc'``.
    """
    if filed == 'am':
        full_name = args.am
    elif filed == 'voc':
        full_name = args.voc
    else:
        raise ValueError(f"filed should be 'am' or 'voc', got {filed!r}")
    # The model files are <inference_dir>/<full_name>.pdmodel / .pdiparams.
    model_dir = Path(args.inference_dir)
    config = inference.Config(
        str(model_dir / (full_name + ".pdmodel")),
        str(model_dir / (full_name + ".pdiparams")))
    if args.device == "gpu":
        config.enable_use_gpu(100, 0)
    elif args.device == "cpu":
        config.disable_gpu()
    config.enable_memory_optim()
    predictor = inference.create_predictor(config)
    return predictor
 def get_am_output(args, am_predictor, frontend, merge_sentences, input):
    """Run the static acoustic model on one sentence and return its output.

    Args:
        args: parsed CLI namespace; reads ``am``, ``lang``, ``speaker_dict``
            and ``spk_id``.
        am_predictor: predictor of the acoustic model.
        frontend: text frontend providing ``get_input_ids``.
        merge_sentences: forwarded to ``frontend.get_input_ids``.
        input: the sentence to synthesize.

    Returns:
        The first output of the predictor, copied to CPU.

    Raises:
        ValueError: if ``args.lang`` is neither ``'zh'`` nor ``'en'``.
    """
    # Fail early with a clear error: an unsupported language would otherwise
    # leave the phone ids undefined further down.
    if args.lang not in ('zh', 'en'):
        raise ValueError(f"lang should in {{'zh', 'en'}}, got {args.lang!r}")
    # args.am has the form {model_name}_{dataset}
    split_at = args.am.rindex('_')
    am_name = args.am[:split_at]
    am_dataset = args.am[split_at + 1:]
    am_input_names = am_predictor.get_input_names()
    get_tone_ids = am_name == 'speedyspeech'
    get_spk_id = False
    if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
        get_spk_id = True
        spk_id = numpy.array([args.spk_id])
    if args.lang == 'zh':
        input_ids = frontend.get_input_ids(
            input, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids)
    else:
        input_ids = frontend.get_input_ids(
            input, merge_sentences=merge_sentences)
    phone_ids = input_ids["phone_ids"]
    # NOTE(review): tone ids and speaker id are both fed to the second model
    # input; presumably they are never needed together -- confirm.
    if get_tone_ids:
        tone_ids = input_ids["tone_ids"]
        tones = tone_ids[0].numpy()
        tones_handle = am_predictor.get_input_handle(am_input_names[1])
        tones_handle.reshape(tones.shape)
        tones_handle.copy_from_cpu(tones)
    if get_spk_id:
        spk_id_handle = am_predictor.get_input_handle(am_input_names[1])
        spk_id_handle.reshape(spk_id.shape)
        spk_id_handle.copy_from_cpu(spk_id)
    # only the first item of phone_ids is fed to the model
    phones = phone_ids[0].numpy()
    phones_handle = am_predictor.get_input_handle(am_input_names[0])
    phones_handle.reshape(phones.shape)
    phones_handle.copy_from_cpu(phones)
    am_predictor.run()
    am_output_names = am_predictor.get_output_names()
    am_output_handle = am_predictor.get_output_handle(am_output_names[0])
    am_output_data = am_output_handle.copy_to_cpu()
    return am_output_data
 def get_voc_output(args, voc_predictor, input):
    """Feed ``input`` to the vocoder predictor and return its first output.

    Args:
        args: accepted but not used by this function.
        voc_predictor: predictor of the vocoder.
        input: array copied into the predictor's first input handle.

    Returns:
        The predictor's first output, copied to CPU.
    """
    # only the first declared input and the first declared output are used
    first_input = voc_predictor.get_input_names()[0]
    in_handle = voc_predictor.get_input_handle(first_input)
    in_handle.reshape(input.shape)
    in_handle.copy_from_cpu(input)
    voc_predictor.run()
    first_output = voc_predictor.get_output_names()[0]
    out_handle = voc_predictor.get_output_handle(first_output)
    return out_handle.copy_to_cpu()
 def parse_args():
    parser = argparse.ArgumentParser(
        description="Paddle Infernce with acoustic model & vocoder.")
@ -204,7 +129,7 @@ def main():
                merge_sentences=merge_sentences,
                input=sentence)
            wav = get_voc_output(
-                args, voc_predictor=voc_predictor, input=am_output_data)
+                voc_predictor=voc_predictor, input=am_output_data)
        speed = wav.size / t.elapse
        rtf = fs / speed
        print(
@ -224,7 +149,7 @@ def main():
                merge_sentences=merge_sentences,
                input=sentence)
            wav = get_voc_output(
-                args, voc_predictor=voc_predictor, input=am_output_data)
+                voc_predictor=voc_predictor, input=am_output_data)
        N += wav.size
        T += t.elapse
--- a/paddlespeech/t2s/exps/inference_streaming.py
+++ b/paddlespeech/t2s/exps/inference_streaming.py
@ -0,0 +1,224 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 from pathlib import Path
 import numpy as np
 import soundfile as sf
 from timer import timer
 from paddlespeech.t2s.exps.syn_utils import denorm
 from paddlespeech.t2s.exps.syn_utils import get_am_sublayer_output
 from paddlespeech.t2s.exps.syn_utils import get_chunks
 from paddlespeech.t2s.exps.syn_utils import get_frontend
 from paddlespeech.t2s.exps.syn_utils import get_predictor
 from paddlespeech.t2s.exps.syn_utils import get_sentences
 from paddlespeech.t2s.exps.syn_utils import get_streaming_am_output
 from paddlespeech.t2s.exps.syn_utils import get_streaming_am_predictor
 from paddlespeech.t2s.exps.syn_utils import get_voc_output
 from paddlespeech.t2s.utils import str2bool
 def parse_args():
    """Parse the command line options of the streaming inference script.

    Unknown options are ignored (``parse_known_args``).

    Returns:
        The namespace holding the recognised options.
    """
    parser = argparse.ArgumentParser(
        description="Paddle Infernce with acoustic model & vocoder.")
    add_arg = parser.add_argument

    # ---- acoustic model ----
    add_arg(
        '--am',
        type=str,
        default='fastspeech2_csmsc',
        choices=['fastspeech2_csmsc'],
        help='Choose acoustic model type of tts task.')
    add_arg(
        "--am_stat",
        type=str,
        default=None,
        help="mean and standard deviation used to normalize spectrogram when training acoustic model."
    )
    add_arg("--phones_dict", type=str, default=None, help="phone vocabulary file.")
    add_arg("--tones_dict", type=str, default=None, help="tone vocabulary file.")
    add_arg("--speaker_dict", type=str, default=None, help="speaker id map file.")
    add_arg(
        '--spk_id',
        type=int,
        default=0,
        help='spk id for multi speaker acoustic model')

    # ---- vocoder ----
    add_arg(
        '--voc',
        type=str,
        default='pwgan_csmsc',
        choices=['pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc'],
        help='Choose vocoder type of tts task.')

    # ---- other ----
    add_arg(
        '--lang',
        type=str,
        default='zh',
        help='Choose model language. zh or en')
    add_arg(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    add_arg("--inference_dir", type=str, help="dir to save inference models")
    add_arg("--output_dir", type=str, help="output dir")

    # ---- inference ----
    add_arg(
        "--device",
        default="gpu",
        choices=["gpu", "cpu"],
        help="Device selected for inference.")

    # ---- streaming related ----
    add_arg(
        "--am_streaming",
        type=str2bool,
        default=False,
        help="whether use streaming acoustic model")
    add_arg("--chunk_size", type=int, default=42, help="chunk size of am streaming")
    add_arg("--pad_size", type=int, default=12, help="pad size of am streaming")

    known_args, _unknown = parser.parse_known_args()
    return known_args
 # only inference for models trained with csmsc now
 def main():
    """Synthesize every input sentence with the split static acoustic model.

    Loads the encoder / decoder / postnet predictors and the vocoder
    predictor, warms up on the first three sentences, then writes one
    ``<utt_id>.wav`` per sentence into ``args.output_dir`` and prints speed
    and RTF figures.

    Raises:
        ValueError: if ``args.lang`` is not ``'zh'``.
    """
    args = parse_args()
    # frontend
    frontend = get_frontend(args)
    # am_predictor
    am_encoder_infer_predictor, am_decoder_predictor, am_postnet_predictor = get_streaming_am_predictor(
        args)
    am_mu, am_std = np.load(args.am_stat)
    # model: {model_name}_{dataset}
    am_dataset = args.am[args.am.rindex('_') + 1:]
    # voc_predictor
    voc_predictor = get_predictor(args, filed='voc')
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    sentences = get_sentences(args)
    merge_sentences = True
    fs = 24000 if am_dataset != 'ljspeech' else 22050

    def decode(hs):
        # Decoder output plus the postnet residual. The postnet is fed the
        # decoder output with its last two axes swapped, and its output is
        # swapped back before the addition.
        decoder_out = get_am_sublayer_output(am_decoder_predictor, input=hs)
        postnet_out = get_am_sublayer_output(
            am_postnet_predictor, input=np.transpose(decoder_out, (0, 2, 1)))
        return decoder_out + np.transpose(postnet_out, (0, 2, 1))

    # warmup
    for utt_id, sentence in sentences[:3]:
        with timer() as t:
            normalized_mel = get_streaming_am_output(
                args,
                am_encoder_infer_predictor=am_encoder_infer_predictor,
                am_decoder_predictor=am_decoder_predictor,
                am_postnet_predictor=am_postnet_predictor,
                frontend=frontend,
                merge_sentences=merge_sentences,
                input=sentence)
            mel = denorm(normalized_mel, am_mu, am_std)
            wav = get_voc_output(voc_predictor=voc_predictor, input=mel)
        speed = wav.size / t.elapse
        rtf = fs / speed
        print(
            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )
    print("warm up done!")
    N = 0
    T = 0
    chunk_size = args.chunk_size
    pad_size = args.pad_size
    get_tone_ids = False
    for utt_id, sentence in sentences:
        with timer() as t:
            # frontend
            if args.lang != 'zh':
                # without phone ids nothing below can run, so fail clearly
                raise ValueError("lang should be 'zh' here!")
            input_ids = frontend.get_input_ids(
                sentence,
                merge_sentences=merge_sentences,
                get_tone_ids=get_tone_ids)
            phone_ids = input_ids["phone_ids"]
            phones = phone_ids[0].numpy()
            # acoustic model
            orig_hs = get_am_sublayer_output(
                am_encoder_infer_predictor, input=phones)
            if args.am_streaming:
                hss = get_chunks(orig_hs, chunk_size, pad_size)
                chunk_num = len(hss)
                mel_list = []
                for i, hs in enumerate(hss):
                    normalized_mel = decode(hs)[0]
                    sub_mel = denorm(normalized_mel, am_mu, am_std)
                    # Clip the part of the output that belongs to the pad.
                    # get_chunks puts at most pad_size frames of context
                    # before the chunk (none before the first one) ...
                    front_pad = min(i * chunk_size, pad_size)
                    if i == chunk_num - 1:
                        # ... and the last chunk has no padding on its right,
                        # so everything after the front pad is kept. This
                        # also covers an utterance made of a single chunk.
                        sub_mel = sub_mel[front_pad:]
                    else:
                        # The right pad of the other chunks may be shorter
                        # than pad_size, so keep exactly chunk_size frames
                        # instead of cutting pad_size frames from the end.
                        sub_mel = sub_mel[front_pad:front_pad + chunk_size]
                    mel_list.append(sub_mel)
                mel = np.concatenate(mel_list, axis=0)
            else:
                normalized_mel = decode(orig_hs)[0]
                mel = denorm(normalized_mel, am_mu, am_std)
            # vocoder
            wav = get_voc_output(voc_predictor=voc_predictor, input=mel)
        N += wav.size
        T += t.elapse
        speed = wav.size / t.elapse
        rtf = fs / speed
        # write with the same sample rate that is used for the RTF figures
        sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=fs)
        print(
            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )
        print(f"{utt_id} done!")
    # no summary when nothing was synthesized (avoids a division by zero)
    if N > 0 and T > 0:
        print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")
 if __name__ == "__main__":
    main()
--- a/paddlespeech/t2s/exps/ort_predict.py
+++ b/paddlespeech/t2s/exps/ort_predict.py
@ -16,39 +16,14 @@ from pathlib import Path
 import jsonlines
 import numpy as np
 import onnxruntime as ort
 import soundfile as sf
 from timer import timer
 from paddlespeech.t2s.exps.syn_utils import get_sess
 from paddlespeech.t2s.exps.syn_utils import get_test_dataset
 from paddlespeech.t2s.utils import str2bool
 def get_sess(args, filed='am'):
    """Create an onnxruntime session for the acoustic model or vocoder.

    Args:
        args: parsed CLI namespace; reads ``am`` or ``voc`` (the model file
            stem), ``inference_dir``, ``device``, ``use_trt`` and
            ``cpu_threads``.
        filed: which model to load, ``'am'`` or ``'voc'``.

    Returns:
        The ``ort.InferenceSession`` for ``<inference_dir>/<name>.onnx``.

    Raises:
        ValueError: if ``filed`` is not ``'am'``/``'voc'`` or ``args.device``
            is not ``'gpu'``/``'cpu'``.
    """
    if filed == 'am':
        full_name = args.am
    elif filed == 'voc':
        full_name = args.voc
    else:
        raise ValueError(f"filed should be 'am' or 'voc', got {filed!r}")
    # Pick the execution provider first so that an unsupported device is
    # reported clearly instead of leaving `providers` unbound.
    if args.device == "gpu":
        # fastspeech2/mb_melgan can't use trt now!
        if args.use_trt:
            providers = ['TensorrtExecutionProvider']
        else:
            providers = ['CUDAExecutionProvider']
    elif args.device == "cpu":
        providers = ['CPUExecutionProvider']
    else:
        raise ValueError(
            f"device should be 'gpu' or 'cpu', got {args.device!r}")
    model_dir = str(Path(args.inference_dir) / (full_name + ".onnx"))
    sess_options = ort.SessionOptions()
    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    sess_options.intra_op_num_threads = args.cpu_threads
    sess = ort.InferenceSession(
        model_dir, providers=providers, sess_options=sess_options)
    return sess
 def ort_predict(args):
    # construct dataset for evaluation
    with jsonlines.open(args.test_metadata, 'r') as reader:
@ -131,7 +106,7 @@ def parse_args():
        '--voc',
        type=str,
        default='hifigan_csmsc',
-        choices=['hifigan_csmsc', 'mb_melgan_csmsc'],
+        choices=['hifigan_csmsc', 'mb_melgan_csmsc', 'pwgan_csmsc'],
        help='Choose vocoder type of tts task.')
    # other
    parser.add_argument(
--- a/paddlespeech/t2s/exps/ort_predict_e2e.py
+++ b/paddlespeech/t2s/exps/ort_predict_e2e.py
@ -15,40 +15,15 @@ import argparse
 from pathlib import Path
 import numpy as np
 import onnxruntime as ort
 import soundfile as sf
 from timer import timer
 from paddlespeech.t2s.exps.syn_utils import get_frontend
 from paddlespeech.t2s.exps.syn_utils import get_sentences
 from paddlespeech.t2s.exps.syn_utils import get_sess
 from paddlespeech.t2s.utils import str2bool
 def get_sess(args, filed='am'):
    """Create an onnxruntime session for the acoustic model or vocoder.

    Args:
        args: parsed CLI namespace; reads ``am`` or ``voc`` (the model file
            stem), ``inference_dir``, ``device``, ``use_trt`` and
            ``cpu_threads``.
        filed: which model to load, ``'am'`` or ``'voc'``.

    Returns:
        The ``ort.InferenceSession`` for ``<inference_dir>/<name>.onnx``.

    Raises:
        ValueError: if ``filed`` is not ``'am'``/``'voc'`` or ``args.device``
            is not ``'gpu'``/``'cpu'``.
    """
    if filed == 'am':
        full_name = args.am
    elif filed == 'voc':
        full_name = args.voc
    else:
        raise ValueError(f"filed should be 'am' or 'voc', got {filed!r}")
    # Resolve the execution provider up front: an unsupported device would
    # otherwise leave `providers` unbound when the session is created.
    if args.device == "gpu":
        # fastspeech2/mb_melgan can't use trt now!
        if args.use_trt:
            providers = ['TensorrtExecutionProvider']
        else:
            providers = ['CUDAExecutionProvider']
    elif args.device == "cpu":
        providers = ['CPUExecutionProvider']
    else:
        raise ValueError(
            f"device should be 'gpu' or 'cpu', got {args.device!r}")
    model_dir = str(Path(args.inference_dir) / (full_name + ".onnx"))
    sess_options = ort.SessionOptions()
    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    sess_options.intra_op_num_threads = args.cpu_threads
    sess = ort.InferenceSession(
        model_dir, providers=providers, sess_options=sess_options)
    return sess
 def ort_predict(args):
    # frontend
@ -156,7 +131,7 @@ def parse_args():
        '--voc',
        type=str,
        default='hifigan_csmsc',
-        choices=['hifigan_csmsc', 'mb_melgan_csmsc'],
+        choices=['hifigan_csmsc', 'mb_melgan_csmsc', 'pwgan_csmsc'],
        help='Choose vocoder type of tts task.')
    # other
    parser.add_argument(
--- a/paddlespeech/t2s/exps/ort_predict_streaming.py
+++ b/paddlespeech/t2s/exps/ort_predict_streaming.py
@ -0,0 +1,233 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 from pathlib import Path
 import numpy as np
 import soundfile as sf
 from timer import timer
 from paddlespeech.t2s.exps.syn_utils import denorm
 from paddlespeech.t2s.exps.syn_utils import get_chunks
 from paddlespeech.t2s.exps.syn_utils import get_frontend
 from paddlespeech.t2s.exps.syn_utils import get_sentences
 from paddlespeech.t2s.exps.syn_utils import get_sess
 from paddlespeech.t2s.exps.syn_utils import get_streaming_am_sess
 from paddlespeech.t2s.utils import str2bool
 def ort_predict(args):
    """Synthesize every input sentence with the streaming ONNX models.

    Loads the encoder / decoder / postnet sessions of the acoustic model and
    the vocoder session, warms them up on random inputs, then writes one
    ``<utt_id>.wav`` per sentence into ``args.output_dir`` and prints speed
    and RTF figures.

    Args:
        args: parsed CLI namespace (see ``parse_args``).

    Raises:
        ValueError: if ``args.lang`` is not ``'zh'`` and there is at least
            one sentence to synthesize.
    """
    # frontend
    frontend = get_frontend(args)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    sentences = get_sentences(args)
    # model: {model_name}_{dataset}
    am_dataset = args.am[args.am.rindex('_') + 1:]
    fs = 24000 if am_dataset != 'ljspeech' else 22050
    # am
    am_encoder_infer_sess, am_decoder_sess, am_postnet_sess = get_streaming_am_sess(
        args)
    am_mu, am_std = np.load(args.am_stat)
    # vocoder
    voc_sess = get_sess(args, filed='voc')

    def decode(hs):
        # Decoder output plus the postnet residual. sess.run returns a list
        # of outputs and only the first one is used; the postnet is fed the
        # decoder output with its last two axes swapped, and its output is
        # swapped back before the addition.
        decoder_out = am_decoder_sess.run(None, input_feed={'xs': hs})[0]
        postnet_out = am_postnet_sess.run(
            None, input_feed={'xs': np.transpose(decoder_out, (0, 2, 1))})[0]
        return decoder_out + np.transpose(postnet_out, (0, 2, 1))

    # frontend warmup
    # Loading model cost 0.5+ seconds
    if args.lang == 'zh':
        frontend.get_input_ids("你好，欢迎使用飞桨框架进行深度学习研究！", merge_sentences=True)
    else:
        print("lang should in be 'zh' here!")
    # am warmup
    for warmup_len in [27, 38, 54]:
        phone_ids = np.random.randint(1, 266, size=(warmup_len, ))
        am_encoder_infer_sess.run(None, input_feed={'text': phone_ids})
        am_decoder_input = np.random.rand(1, warmup_len * 15,
                                          384).astype('float32')
        am_decoder_sess.run(None, input_feed={'xs': am_decoder_input})
        am_postnet_input = np.random.rand(1, 80,
                                          warmup_len * 15).astype('float32')
        am_postnet_sess.run(None, input_feed={'xs': am_postnet_input})
    # voc warmup
    for warmup_len in [227, 308, 544]:
        data = np.random.rand(warmup_len, 80).astype("float32")
        voc_sess.run(None, input_feed={"logmel": data})
    print("warm up done!")
    # N: number of generated samples, T: accumulated time
    N = 0
    T = 0
    merge_sentences = True
    get_tone_ids = False
    chunk_size = args.chunk_size
    pad_size = args.pad_size
    for utt_id, sentence in sentences:
        with timer() as t:
            if args.lang != 'zh':
                # without phone ids nothing below can run, so fail clearly
                raise ValueError("lang should be 'zh' here!")
            input_ids = frontend.get_input_ids(
                sentence,
                merge_sentences=merge_sentences,
                get_tone_ids=get_tone_ids)
            # merge_sentences=True here, so we only use the first item of phone_ids
            phone_ids = input_ids["phone_ids"][0].numpy()
            orig_hs = am_encoder_infer_sess.run(
                None, input_feed={'text': phone_ids})[0]
            if args.am_streaming:
                hss = get_chunks(orig_hs, chunk_size, pad_size)
                chunk_num = len(hss)
                mel_list = []
                for i, hs in enumerate(hss):
                    normalized_mel = decode(hs)[0]
                    sub_mel = denorm(normalized_mel, am_mu, am_std)
                    # Clip the part of the output that belongs to the pad.
                    # get_chunks puts at most pad_size frames of context
                    # before the chunk (none before the first one) ...
                    front_pad = min(i * chunk_size, pad_size)
                    if i == chunk_num - 1:
                        # ... and the last chunk has no padding on its right,
                        # so everything after the front pad is kept. This
                        # also covers an utterance made of a single chunk.
                        sub_mel = sub_mel[front_pad:]
                    else:
                        # The right pad of the other chunks may be shorter
                        # than pad_size, so keep exactly chunk_size frames
                        # instead of cutting pad_size frames from the end.
                        sub_mel = sub_mel[front_pad:front_pad + chunk_size]
                    mel_list.append(sub_mel)
                mel = np.concatenate(mel_list, axis=0)
            else:
                normalized_mel = decode(orig_hs)[0]
                mel = denorm(normalized_mel, am_mu, am_std)
            # vocoder
            wav = voc_sess.run(output_names=None, input_feed={'logmel': mel})
            N += len(wav[0])
            T += t.elapse
            speed = len(wav[0]) / t.elapse
            rtf = fs / speed
        sf.write(
            str(output_dir / (utt_id + ".wav")),
            np.array(wav)[0],
            samplerate=fs)
        print(
            f"{utt_id}, mel: {mel.shape}, wave: {len(wav[0])}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )
    # no summary when nothing was synthesized (avoids a division by zero)
    if N > 0 and T > 0:
        print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")
 def parse_args():
    """Parse the command line options of the streaming onnxruntime script.

    Unknown options are ignored (``parse_known_args``).

    Returns:
        The namespace holding the recognised options.
    """
    parser = argparse.ArgumentParser(description="Infernce with onnxruntime.")
    add_arg = parser.add_argument

    # ---- acoustic model ----
    add_arg(
        '--am',
        type=str,
        default='fastspeech2_csmsc',
        choices=['fastspeech2_csmsc'],
        help='Choose acoustic model type of tts task.')
    add_arg(
        "--am_stat",
        type=str,
        default=None,
        help="mean and standard deviation used to normalize spectrogram when training acoustic model."
    )
    add_arg("--phones_dict", type=str, default=None, help="phone vocabulary file.")
    add_arg("--tones_dict", type=str, default=None, help="tone vocabulary file.")

    # ---- vocoder ----
    add_arg(
        '--voc',
        type=str,
        default='hifigan_csmsc',
        choices=['hifigan_csmsc', 'mb_melgan_csmsc', 'pwgan_csmsc'],
        help='Choose vocoder type of tts task.')

    # ---- other ----
    add_arg("--inference_dir", type=str, help="dir to save inference models")
    add_arg(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    add_arg("--output_dir", type=str, help="output dir")
    add_arg(
        '--lang',
        type=str,
        default='zh',
        help='Choose model language. zh or en')

    # ---- inference ----
    add_arg(
        "--use_trt",
        type=str2bool,
        default=False,
        help="Whether to use inference engin TensorRT.")
    add_arg(
        "--device",
        default="gpu",
        choices=["gpu", "cpu"],
        help="Device selected for inference.")
    add_arg('--cpu_threads', type=int, default=1)

    # ---- streaming related ----
    add_arg(
        "--am_streaming",
        type=str2bool,
        default=False,
        help="whether use streaming acoustic model")
    add_arg("--chunk_size", type=int, default=42, help="chunk size of am streaming")
    add_arg("--pad_size", type=int, default=12, help="pad size of am streaming")

    known_args, _unknown = parser.parse_known_args()
    return known_args
 def main():
    """Script entry point: parse the command line and run ``ort_predict``."""
    ort_predict(parse_args())
 if __name__ == "__main__":
    main()
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@ -11,10 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
 import os
 from pathlib import Path
 import numpy as np
 import onnxruntime as ort
 import paddle
 from paddle import inference
 from paddle import jit
 from paddle.static import InputSpec
@ -62,6 +66,21 @@ model_alias = {
 }
 def denorm(data, mean, std):
    """Undo a mean/std normalization: scale ``data`` by ``std``, then add ``mean``."""
    rescaled = data * std
    return rescaled + mean
 def get_chunks(data, chunk_size, pad_size):
    """Split ``data`` along axis 1 into overlapping chunks.

    Chunk ``i`` nominally covers ``[i * chunk_size, (i + 1) * chunk_size)`` on
    axis 1 and is widened by ``pad_size`` positions on each side, clipped to
    the bounds of that axis.

    Args:
        data: 3-D array-like exposing ``shape`` and slice indexing.
        chunk_size: number of axis-1 positions per chunk before padding;
            must be positive.
        pad_size: number of extra axis-1 positions taken on each side.

    Returns:
        list: the slices ``data[:, start:end, :]`` in order; empty when
        axis 1 has length 0.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # 0 used to surface as a bare ZeroDivisionError and a negative size
        # silently produced no chunks at all.
        raise ValueError(
            "chunk_size must be a positive integer, got {!r}".format(
                chunk_size))
    data_len = data.shape[1]
    n_chunks = math.ceil(data_len / chunk_size)
    return [
        data[:,
             max(0, i * chunk_size - pad_size):
             min((i + 1) * chunk_size + pad_size, data_len), :]
        for i in range(n_chunks)
    ]
 # input
 def get_sentences(args):
    # construct dataset for evaluation
@ -241,3 +260,221 @@ def voc_to_static(args, voc_inference):
    paddle.jit.save(voc_inference, os.path.join(args.inference_dir, args.voc))
    voc_inference = paddle.jit.load(os.path.join(args.inference_dir, args.voc))
    return voc_inference
 # inference
 def get_predictor(args, filed='am'):
    """Create a Paddle Inference predictor for the acoustic model or vocoder.

    ``filed`` selects the file stem: ``'am'`` uses ``args.am``, ``'voc'`` uses
    ``args.voc``; any other value leaves the stem empty. The
    ``<stem>.pdmodel`` / ``<stem>.pdiparams`` pair is looked up under
    ``args.inference_dir``.
    """
    if filed == 'am':
        stem = args.am
    elif filed == 'voc':
        stem = args.voc
    else:
        stem = ''
    model_root = Path(args.inference_dir)
    model_file = model_root / (stem + ".pdmodel")
    params_file = model_root / (stem + ".pdiparams")
    config = inference.Config(str(model_file), str(params_file))
    device = args.device
    if device == "gpu":
        # Arguments: initial GPU memory pool size in MB, GPU device id.
        config.enable_use_gpu(100, 0)
    elif device == "cpu":
        config.disable_gpu()
    config.enable_memory_optim()
    return inference.create_predictor(config)
 def get_am_output(args, am_predictor, frontend, merge_sentences, input):
    """Run the static acoustic model on one text input.

    Args:
        args: namespace read for ``am`` (``"<model>_<dataset>"``), ``lang``
            and, for the "aishell3"/"vctk" datasets, ``speaker_dict`` and
            ``spk_id``.
        am_predictor: predictor exposing ``get_input_names``,
            ``get_input_handle``, ``run``, ``get_output_names`` and
            ``get_output_handle``.
        frontend: text frontend whose ``get_input_ids`` returns a mapping
            with "phone_ids" (and "tone_ids" when requested).
        merge_sentences: forwarded to ``frontend.get_input_ids``.
        input: the text to synthesize.

    Returns:
        The predictor's first output, copied to host memory.

    Raises:
        ValueError: if ``args.lang`` is neither 'zh' nor 'en'.
    """
    sep_idx = args.am.rindex('_')
    am_name = args.am[:sep_idx]
    am_dataset = args.am[sep_idx + 1:]
    am_input_names = am_predictor.get_input_names()
    # Only speedyspeech consumes tone ids.
    get_tone_ids = am_name == 'speedyspeech'
    get_spk_id = False
    if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
        get_spk_id = True
        spk_id = np.array([args.spk_id])
    if args.lang == 'zh':
        input_ids = frontend.get_input_ids(
            input, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids)
    elif args.lang == 'en':
        input_ids = frontend.get_input_ids(
            input, merge_sentences=merge_sentences)
    else:
        # Previously this only printed a message and then failed later on an
        # unbound ``input_ids``/``phone_ids``; fail fast with a clear error.
        raise ValueError(
            "lang should be in {{'zh', 'en'}}, got {!r}".format(args.lang))
    phone_ids = input_ids["phone_ids"]

    # NOTE(review): tone ids and the speaker id are both written to the second
    # model input; presumably the two are never enabled together -- confirm.
    if get_tone_ids:
        tone_ids = input_ids["tone_ids"]
        tones = tone_ids[0].numpy()
        tones_handle = am_predictor.get_input_handle(am_input_names[1])
        tones_handle.reshape(tones.shape)
        tones_handle.copy_from_cpu(tones)
    if get_spk_id:
        spk_id_handle = am_predictor.get_input_handle(am_input_names[1])
        spk_id_handle.reshape(spk_id.shape)
        spk_id_handle.copy_from_cpu(spk_id)
    # Only the first item of phone_ids is fed to the model.
    phones = phone_ids[0].numpy()
    phones_handle = am_predictor.get_input_handle(am_input_names[0])
    phones_handle.reshape(phones.shape)
    phones_handle.copy_from_cpu(phones)

    am_predictor.run()
    am_output_names = am_predictor.get_output_names()
    am_output_handle = am_predictor.get_output_handle(am_output_names[0])
    return am_output_handle.copy_to_cpu()
 def get_voc_output(voc_predictor, input):
    """Run the vocoder predictor on ``input`` and return its first output.

    ``input`` is copied into the predictor's first input (after reshaping
    the handle to ``input.shape``); the first output is copied back to host.
    """
    first_input = voc_predictor.get_input_names()[0]
    feed = voc_predictor.get_input_handle(first_input)
    feed.reshape(input.shape)
    feed.copy_from_cpu(input)

    voc_predictor.run()

    first_output = voc_predictor.get_output_names()[0]
    return voc_predictor.get_output_handle(first_output).copy_to_cpu()
 # streaming am
 def get_streaming_am_predictor(args):
    """Create Paddle Inference predictors for the streaming acoustic model.

    Three sub-models are loaded from ``args.inference_dir``, named
    ``<args.am>_am_encoder_infer``, ``<args.am>_am_decoder`` and
    ``<args.am>_am_postnet`` (each a ``.pdmodel`` / ``.pdiparams`` pair).

    Returns:
        tuple: (encoder_infer predictor, decoder predictor, postnet predictor).
    """
    model_root = Path(args.inference_dir)
    suffixes = ("_am_encoder_infer", "_am_decoder", "_am_postnet")
    configs = [
        inference.Config(
            str(model_root / (args.am + suffix + ".pdmodel")),
            str(model_root / (args.am + suffix + ".pdiparams")))
        for suffix in suffixes
    ]
    device = args.device
    if device == "gpu":
        for config in configs:
            # Arguments: initial GPU memory pool size in MB, GPU device id.
            config.enable_use_gpu(100, 0)
    elif device == "cpu":
        for config in configs:
            config.disable_gpu()
    for config in configs:
        config.enable_memory_optim()
    encoder_infer, decoder, postnet = (
        inference.create_predictor(config) for config in configs)
    return encoder_infer, decoder, postnet
 def get_am_sublayer_output(am_sublayer_predictor, input):
    """Run one acoustic-model sub-layer predictor and return its first output.

    ``input`` is copied into the predictor's first input (after reshaping
    the handle to ``input.shape``); the first output is copied back to host.
    """
    in_name = am_sublayer_predictor.get_input_names()[0]
    feed = am_sublayer_predictor.get_input_handle(in_name)
    feed.reshape(input.shape)
    feed.copy_from_cpu(input)

    am_sublayer_predictor.run()

    out_name = am_sublayer_predictor.get_output_names()[0]
    fetch = am_sublayer_predictor.get_output_handle(out_name)
    return fetch.copy_to_cpu()
 def get_streaming_am_output(args, am_encoder_infer_predictor,
                            am_decoder_predictor, am_postnet_predictor,
                            frontend, merge_sentences, input):
    """Run the three split acoustic-model predictors on one text input.

    The phone ids from ``frontend`` go through the encoder predictor, then
    the decoder predictor; the postnet predictor is applied to the
    (0, 2, 1)-transposed decoder output and its result, transposed back, is
    added to the decoder output.

    Args:
        args: namespace read for ``lang``; only 'zh' is supported here.
        am_encoder_infer_predictor: predictor for the encoder sub-model.
        am_decoder_predictor: predictor for the decoder sub-model.
        am_postnet_predictor: predictor for the postnet sub-model.
        frontend: text frontend whose ``get_input_ids`` returns a mapping
            with "phone_ids".
        merge_sentences: forwarded to ``frontend.get_input_ids``.
        input: the text to synthesize.

    Returns:
        The first batch item of ``decoder output + postnet output`` (the
        normalized mel).

    Raises:
        ValueError: if ``args.lang`` is not 'zh'.
    """
    if args.lang != 'zh':
        # Previously this only printed a message and then crashed on the
        # unbound ``phone_ids``; fail fast with a clear error instead.
        raise ValueError(
            "lang should be 'zh' here, got {!r}".format(args.lang))
    input_ids = frontend.get_input_ids(
        input, merge_sentences=merge_sentences, get_tone_ids=False)
    # Only the first item of phone_ids is fed to the model.
    phones = input_ids["phone_ids"][0].numpy()

    am_encoder_infer_output = get_am_sublayer_output(
        am_encoder_infer_predictor, input=phones)
    am_decoder_output = get_am_sublayer_output(
        am_decoder_predictor, input=am_encoder_infer_output)
    # The postnet consumes the decoder output with its last two axes swapped.
    am_postnet_output = get_am_sublayer_output(
        am_postnet_predictor, input=np.transpose(am_decoder_output, (0, 2, 1)))
    am_output_data = am_decoder_output + np.transpose(am_postnet_output,
                                                      (0, 2, 1))
    normalized_mel = am_output_data[0]
    return normalized_mel
 def get_sess(args, filed='am'):
    """Create an ONNX Runtime session for the acoustic model or vocoder.

    ``filed`` selects the file stem: ``'am'`` uses ``args.am``, ``'voc'`` uses
    ``args.voc``; any other value leaves the stem empty. The model is read
    from ``<args.inference_dir>/<stem>.onnx``.

    Args:
        args: namespace read for ``am``/``voc``, ``inference_dir``,
            ``device`` ("gpu" or "cpu"), ``use_trt`` and ``cpu_threads``.
        filed: which model to load, ``'am'`` or ``'voc'``.

    Returns:
        The created ``ort.InferenceSession``.

    Raises:
        ValueError: if ``args.device`` is neither "gpu" nor "cpu".
    """
    full_name = ''
    if filed == 'am':
        full_name = args.am
    elif filed == 'voc':
        full_name = args.voc
    model_dir = str(Path(args.inference_dir) / (full_name + ".onnx"))

    # Resolve the execution provider first: an unsupported device used to
    # leave ``providers`` unbound and crash with UnboundLocalError below.
    if args.device == "gpu":
        # fastspeech2/mb_melgan can't use trt now!
        if args.use_trt:
            providers = ['TensorrtExecutionProvider']
        else:
            providers = ['CUDAExecutionProvider']
    elif args.device == "cpu":
        providers = ['CPUExecutionProvider']
    else:
        raise ValueError(
            "device should be 'gpu' or 'cpu', got {!r}".format(args.device))

    sess_options = ort.SessionOptions()
    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    sess_options.intra_op_num_threads = args.cpu_threads
    sess = ort.InferenceSession(
        model_dir, providers=providers, sess_options=sess_options)
    return sess
 # streaming am
 def get_streaming_am_sess(args):
    """Create ONNX Runtime sessions for the streaming acoustic model.

    Three models are loaded from ``args.inference_dir``:
    ``<args.am>_am_encoder_infer.onnx``, ``<args.am>_am_decoder.onnx`` and
    ``<args.am>_am_postnet.onnx``. All three sessions share the same session
    options and execution providers.

    Args:
        args: namespace read for ``am``, ``inference_dir``, ``device``
            ("gpu" or "cpu"), ``use_trt`` and ``cpu_threads``.

    Returns:
        tuple: (encoder_infer session, decoder session, postnet session).

    Raises:
        ValueError: if ``args.device`` is neither "gpu" nor "cpu".
    """
    full_name = args.am
    model_root = Path(args.inference_dir)
    am_encoder_infer_model_dir = str(
        model_root / (full_name + "_am_encoder_infer" + ".onnx"))
    am_decoder_model_dir = str(
        model_root / (full_name + "_am_decoder" + ".onnx"))
    am_postnet_model_dir = str(
        model_root / (full_name + "_am_postnet" + ".onnx"))

    # Resolve the execution provider first: an unsupported device used to
    # leave ``providers`` unbound and crash with UnboundLocalError below.
    if args.device == "gpu":
        # fastspeech2/mb_melgan can't use trt now!
        if args.use_trt:
            providers = ['TensorrtExecutionProvider']
        else:
            providers = ['CUDAExecutionProvider']
    elif args.device == "cpu":
        providers = ['CPUExecutionProvider']
    else:
        raise ValueError(
            "device should be 'gpu' or 'cpu', got {!r}".format(args.device))

    sess_options = ort.SessionOptions()
    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    sess_options.intra_op_num_threads = args.cpu_threads

    am_encoder_infer_sess = ort.InferenceSession(
        am_encoder_infer_model_dir,
        providers=providers,
        sess_options=sess_options)
    am_decoder_sess = ort.InferenceSession(
        am_decoder_model_dir, providers=providers, sess_options=sess_options)
    am_postnet_sess = ort.InferenceSession(
        am_postnet_model_dir, providers=providers, sess_options=sess_options)
    return am_encoder_infer_sess, am_decoder_sess, am_postnet_sess
--- a/paddlespeech/t2s/exps/synthesize_streaming.py
+++ b/paddlespeech/t2s/exps/synthesize_streaming.py
@ -12,39 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
-import math
+import os
 from pathlib import Path
 import numpy as np
 import paddle
 import soundfile as sf
 import yaml
 from paddle import jit
 from paddle.static import InputSpec
 from timer import timer
 from yacs.config import CfgNode
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.t2s.exps.syn_utils import denorm
 from paddlespeech.t2s.exps.syn_utils import get_chunks
 from paddlespeech.t2s.exps.syn_utils import get_frontend
 from paddlespeech.t2s.exps.syn_utils import get_sentences
 from paddlespeech.t2s.exps.syn_utils import get_voc_inference
 from paddlespeech.t2s.exps.syn_utils import model_alias
 from paddlespeech.t2s.exps.syn_utils import voc_to_static
 from paddlespeech.t2s.utils import str2bool
 def denorm(data, mean, std):
    """Map normalized ``data`` back to its original scale (``data * std + mean``)."""
    scaled = data * std
    restored = scaled + mean
    return restored
 def get_chunks(data, chunk_size, pad_size):
    """Cut ``data`` into overlapping windows along axis 1.

    Window ``i`` starts from the core range
    ``[i * chunk_size, (i + 1) * chunk_size)`` and is extended by ``pad_size``
    positions on both sides, clipped to ``[0, data.shape[1]]``.

    Args:
        data: 3-D array-like exposing ``shape`` and slice indexing.
        chunk_size: core window length on axis 1; must be positive.
        pad_size: extra positions taken on each side of the core window.

    Returns:
        list: the windows ``data[:, start:end, :]`` in order; empty when
        axis 1 has length 0.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A zero size used to raise a bare ZeroDivisionError and a negative
        # size silently yielded an empty list.
        raise ValueError(
            "chunk_size must be a positive integer, got {!r}".format(
                chunk_size))
    data_len = data.shape[1]
    chunks = []
    n = math.ceil(data_len / chunk_size)
    for i in range(n):
        core_start = i * chunk_size
        start = max(0, core_start - pad_size)
        end = min(core_start + chunk_size + pad_size, data_len)
        chunks.append(data[:, start:end, :])
    return chunks
 def evaluate(args):
    # Init body.
@ -84,9 +74,49 @@ def evaluate(args):
    am_mu = paddle.to_tensor(am_mu)
    am_std = paddle.to_tensor(am_std)
    # am sub layers
    am_encoder_infer = am.encoder_infer
    am_decoder = am.decoder
    am_postnet = am.postnet
    # vocoder
    voc_inference = get_voc_inference(args, voc_config)
    # whether dygraph to static
    if args.inference_dir:
        # fastspeech2 cnndecoder to static
        # am.encoder_infer
        am_encoder_infer = jit.to_static(
            am_encoder_infer, input_spec=[InputSpec([-1], dtype=paddle.int64)])
        paddle.jit.save(am_encoder_infer,
                        os.path.join(args.inference_dir,
                                     args.am + "_am_encoder_infer"))
        am_encoder_infer = paddle.jit.load(
            os.path.join(args.inference_dir, args.am + "_am_encoder_infer"))
        # am.decoder
        am_decoder = jit.to_static(
            am_decoder,
            input_spec=[InputSpec([1, -1, 384], dtype=paddle.float32)])
        paddle.jit.save(am_decoder,
                        os.path.join(args.inference_dir,
                                     args.am + "_am_decoder"))
        am_decoder = paddle.jit.load(
            os.path.join(args.inference_dir, args.am + "_am_decoder"))
        # am.postnet
        am_postnet = jit.to_static(
            am_postnet,
            input_spec=[InputSpec([1, 80, -1], dtype=paddle.float32)])
        paddle.jit.save(am_postnet,
                        os.path.join(args.inference_dir,
                                     args.am + "_am_postnet"))
        am_postnet = paddle.jit.load(
            os.path.join(args.inference_dir, args.am + "_am_postnet"))
        # vocoder
        voc_inference = voc_to_static(args, voc_inference)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    merge_sentences = True
@ -107,20 +137,19 @@ def evaluate(args):
                phone_ids = input_ids["phone_ids"]
            else:
-                print("lang should in be 'zh' here!")
+                print("lang should be 'zh' here!")
            # merge_sentences=True here, so we only use the first item of phone_ids
            phone_ids = phone_ids[0]
            with paddle.no_grad():
                # acoustic model
-                orig_hs, h_masks = am.encoder_infer(phone_ids)
+                orig_hs = am_encoder_infer(phone_ids)
                if args.am_streaming:
                    hss = get_chunks(orig_hs, chunk_size, pad_size)
                    chunk_num = len(hss)
                    mel_list = []
                    for i, hs in enumerate(hss):
-                        before_outs, _ = am.decoder(hs)
+                        before_outs = am_decoder(hs)
-                        after_outs = before_outs + am.postnet(
+                        after_outs = before_outs + am_postnet(
                            before_outs.transpose((0, 2, 1))).transpose(
                                (0, 2, 1))
                        normalized_mel = after_outs[0]
@ -139,8 +168,8 @@ def evaluate(args):
                    mel = paddle.concat(mel_list, axis=0)
                else:
-                    before_outs, _ = am.decoder(orig_hs)
+                    before_outs = am_decoder(orig_hs)
-                    after_outs = before_outs + am.postnet(
+                    after_outs = before_outs + am_postnet(
                        before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
                    normalized_mel = after_outs[0]
                    mel = denorm(normalized_mel, am_mu, am_std)
@ -201,16 +230,9 @@ def parse_args():
        default='pwgan_csmsc',
        choices=[
            'pwgan_csmsc',
            'pwgan_ljspeech',
            'pwgan_aishell3',
            'pwgan_vctk',
            'mb_melgan_csmsc',
            'style_melgan_csmsc',
            'hifigan_csmsc',
            'hifigan_ljspeech',
            'hifigan_aishell3',
            'hifigan_vctk',
            'wavernn_csmsc',
        ],
        help='Choose vocoder type of tts task.')
    parser.add_argument(
@ -233,13 +255,19 @@ def parse_args():
        default='zh',
        help='Choose model language. zh or en')
    parser.add_argument(
        "--inference_dir",
        type=str,
        default=None,
        help="dir to save inference models")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line.")
-
+    # streaming related
    parser.add_argument(
        "--am_streaming",
        type=str2bool,
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@ -590,15 +590,17 @@ class FastSpeech2(nn.Layer):
            h_masks = self._source_mask(olens_in)
        else:
            h_masks = None
        if return_after_enc:
            return hs, h_masks
-        # (B, Lmax, adim)
+
        zs, _ = self.decoder(hs, h_masks)
        # (B, Lmax, odim)
        if self.decoder_type == 'cnndecoder':
            # remove output masks for dygraph to static graph
            zs = self.decoder(hs, h_masks)
            before_outs = zs
        else:
            # (B, Lmax, adim)
            zs, _ = self.decoder(hs, h_masks)
            # (B, Lmax, odim)
            before_outs = self.feat_out(zs).reshape(
                (paddle.shape(zs)[0], -1, self.odim))
@ -633,7 +635,8 @@ class FastSpeech2(nn.Layer):
            tone_id = tone_id.unsqueeze(0)
        # (1, L, odim)
-        hs, h_masks = self._forward(
+        # use *_ to avoid bug in dygraph to static graph    
        hs, *_ = self._forward(
            xs,
            ilens,
            is_inference=True,
@ -642,7 +645,7 @@ class FastSpeech2(nn.Layer):
            spk_emb=spk_emb,
            spk_id=spk_id,
            tone_id=tone_id)
-        return hs, h_masks
+        return hs
    def inference(
            self,
--- a/paddlespeech/t2s/modules/transformer/encoder.py
+++ b/paddlespeech/t2s/modules/transformer/encoder.py
@ -602,7 +602,7 @@ class CNNDecoder(nn.Layer):
        if masks is not None:
            outputs = outputs * masks
        outputs = outputs.transpose([0, 2, 1])
-        return outputs, masks
+        return outputs
 class CNNPostnet(nn.Layer):
--- a/setup.py
+++ b/setup.py
@ -42,6 +42,7 @@ base = [
    "loguru",
    "matplotlib",
    "nara_wpe",
    "onnxruntime",
    "pandas",
    "paddleaudio",
    "paddlenlp",