From 82992b3ed6eaffd78fa27fae57235488f2ded168 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 11 Apr 2022 11:00:04 +0800 Subject: [PATCH 1/7] add test code, test=doc --- .../server/tests/tts/infer/csmsc_test.txt | 100 +++ paddlespeech/server/tests/tts/infer/run.sh | 64 ++ .../server/tests/tts/infer/test_online_tts.py | 650 ++++++++++++++++++ 3 files changed, 814 insertions(+) create mode 100644 paddlespeech/server/tests/tts/infer/csmsc_test.txt create mode 100644 paddlespeech/server/tests/tts/infer/run.sh create mode 100644 paddlespeech/server/tests/tts/infer/test_online_tts.py diff --git a/paddlespeech/server/tests/tts/infer/csmsc_test.txt b/paddlespeech/server/tests/tts/infer/csmsc_test.txt new file mode 100644 index 00000000..d8cf367c --- /dev/null +++ b/paddlespeech/server/tests/tts/infer/csmsc_test.txt @@ -0,0 +1,100 @@ +009901 昨日,这名伤者与医生全部被警方依法刑事拘留。 +009902 钱伟长想到上海来办学校是经过深思熟虑的。 +009903 她见我一进门就骂,吃饭时也骂,骂得我抬不起头。 +009904 李述德在离开之前,只说了一句柱驼杀父亲了。 +009905 这种车票和保险单捆绑出售属于重复性购买。 +009906 戴佩妮的男友西米露接唱情歌,让她非常开心。 +009907 观大势,谋大局,出大策始终是该院的办院方针。 +009908 他们骑着摩托回家,正好为农忙时的父母帮忙。 +009909 但是因为还没到退休年龄,只能掰着指头捱日子。 +009910 这几天雨水不断,人们恨不得待在家里不出门。 +009911 没想到徐赟,张海翔两人就此玩起了人间蒸发。 +009912 藤村此番发言可能是为了凸显野田的领导能力。 +009913 程长庚,生在清王朝嘉庆年间,安徽的潜山小县。 +009914 南海海域综合补给基地码头项目正在论证中。 +009915 也就是说今晚成都市民极有可能再次看到飘雪。 +009916 随着天气转热,各地的游泳场所开始人头攒动。 +009917 更让徐先生纳闷的是,房客的手机也打不通了。 +009918 遇到颠簸时,应听从乘务员的安全指令,回座位坐好。 +009919 他在后面呆惯了,怕自己一插身后的人会不满,不敢排进去。 +009920 傍晚七个小人回来了,白雪公主说,你们就是我命中的七个小矮人吧。 +009921 他本想说,教育局管这个,他们是一路的,这样一管岂不是妓女起嫖客? +009922 一种表示商品所有权的财物证券,也称商品证券,如提货单,交货单。 +009923 会有很丰富的东西留下来,说都说不完。 +009924 这句话像从天而降,吓得四周一片寂静。 +009925 记者所在的是受害人家属所在的右区。 +009926 不管哈大爷去哪,它都一步不离地跟着。 +009927 大家抬头望去,一只老鼠正趴在吊顶上。 +009928 我决定过年就辞职,接手我爸的废品站! +009929 最终,中国男子乒乓球队获得此奖项。 +009930 防汛抗旱两手抓,抗旱相对抓的不够。 +009931 图们江下游地区开发开放的进展如何? +009932 这要求中国必须有一个坚强的政党领导。 +009933 再说,关于利益上的事俺俩都不好开口。 +009934 明代瓦剌,鞑靼入侵明境也是通过此地。 +009935 咪咪舔着孩子,把它身上的毛舔干净。 +009936 是否这次的国标修订被大企业绑架了? +009937 判决后,姚某妻子胡某不服,提起上诉。 +009938 由此可以看出邯钢的经济效益来自何处。 +009939 琳达说,是瑜伽改变了她和马儿的生活。 +009940 楼下的保安告诉记者,这里不租也不卖。 +009941 习近平说,中斯两国人民传统友谊深厚。 +009942 传闻越来越多,后来连老汉儿自己都怕了。 +009943 我怒吼一声冲上去,举起砖头砸了过去。 +009944 我现在还不会,这就回去问问发明我的人。 +009945 显然,洛阳性奴案不具备上述两个前提。 +009946 另外,杰克逊有文唇线,眼线,眉毛的动作。 +009947 昨晚,华西都市报记者电话采访了尹琪。 +009948 涅拉季科未透露这些航空公司的名称。 +009949 从运行轨迹上来说,它也不可能是星星。 +009950 目前看,如果继续加息也存在两难问题。 +009951 曾宝仪在节目录制现场大爆观众糗事。 +009952 但任凭周某怎么叫,男子仍酣睡不醒。 +009953 老大爷说,小子,你挡我财路了,知道不? +009954 没料到,闯下大头佛的阿伟还不知悔改。 +009955 卡扎菲部落式统治已遭遇部落内讧。 +009956 这个孩子的生命一半来源于另一位女士捐赠的冷冻卵子。 +009957 出现这种泥鳅内阁的局面既是野田有意为之,也实属无奈。 +009958 济青高速济南,华山,章丘,邹平,周村,淄博,临淄站。 +009959 赵凌飞的话,反映了沈阳赛区所有奥运志愿者的共同心声。 +009960 因为,我们所发出的力量必会因难度加大而减弱。 +009961 发生事故的楼梯拐角处仍可看到血迹。 +009962 想过进公安,可能身高不够,老汉儿也不让我进去。 +009963 路上关卡很多,为了方便撤离,只好轻装前进。 +009964 原来比尔盖茨就是美国微软公司联合创始人呀。 +009965 之后他们一家三口将与双方父母往峇里岛旅游。 +009966 谢谢总理,也感谢广大网友的参与,我们明年再见。 +009967 事实上是,从来没有一个欺善怕恶的人能作出过稍大一点的成就。 +009968 我会打开邮件,你可以从那里继续。 +009969 美方对近期东海局势表示关切。 +009970 据悉,奥巴马一家人对这座冬季白宫极为满意。 +009971 打扫完你会很有成就感的,试一试,你就信了。 +009972 诺曼站在滑板车上,各就各位,准备出发啦! +009973 塔河的寒夜,气温降到了零下三十多摄氏度。 +009974 其间,连破六点六,六点五,六点四,六点三五等多个重要关口。 +009975 算命其实只是人们的一种自我安慰和自我暗示而已,我们还是要相信科学才好。 +009976 这一切都令人欢欣鼓舞,阿讷西没理由不坚持到最后。 +009977 直至公元前一万一千年,它又再次出现。 +009978 尽量少玩电脑,少看电视,少打游戏。 +009979 从五到七,前后也就是六个月的时间。 +009980 一进咖啡店,他就遇见一张熟悉的脸。 +009981 好在众弟兄看到了把她追了回来。 +009982 有一个人说,哥们儿我们跑过它才能活。 +009983 捅了她以后,模糊记得她没咋动了。 +009984 从小到大,葛启义没有收到过压岁钱。 +009985 舞台下的你会对舞台上的你说什么? +009986 但考生普遍认为,试题的怪多过难。 +009987 我希望每个人都能够尊重我们的隐私。 +009988 漫天的红霞使劲给两人增添气氛。 +009989 晚上加完班开车回家,太累了,迷迷糊糊开着车,走一半的时候,铛一声! +009990 该车将三人撞倒后,在大雾中逃窜。 +009991 这人一哆嗦,方向盘也把不稳了,差点撞上了高速边道护栏。 +009992 那女孩儿委屈的说,我一回头见你已经进去了我不敢进去啊! 
+009993 小明摇摇头说,不是,我只是美女看多了,想换个口味而已。 +009994 接下来,红娘要求记者交费,记者表示不知表姐身份证号码。 +009995 李东蓊表示,自己当时在法庭上发表了一次独特的公诉意见。 +009996 另一男子扑了上来,手里拿着明晃晃的长刀,向他胸口直刺。 +009997 今天,快递员拿着一个快递在办公室喊,秦王是哪个,有他快递? +009998 这场抗议活动究竟是如何发展演变的,又究竟是谁伤害了谁? +009999 因华国锋肖鸡,墓地设计根据其属相设计。 +010000 在狱中,张明宝悔恨交加,写了一份忏悔书。 diff --git a/paddlespeech/server/tests/tts/infer/run.sh b/paddlespeech/server/tests/tts/infer/run.sh new file mode 100644 index 00000000..fdceec41 --- /dev/null +++ b/paddlespeech/server/tests/tts/infer/run.sh @@ -0,0 +1,64 @@ +model_path=/home/users/liangyunming/.paddlespeech/models/ +#am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_nosil_baker_ckpt_0.4/ ## fastspeech2 +am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ ## fastspeech2_cnn +voc_model_dir=$model_path/hifigan_csmsc-zh/hifigan_csmsc_ckpt_0.1.1/ ## hifigan +#voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ ## mb_melgan + +if [[ $am_model_dir == *"fastspeech2_cnndecoder"* ]]; then + am_support_stream=True +else + am_support_stream=False +fi + +# get am file +for file in $(ls $am_model_dir) +do + if [[ $file == *"yaml"* ]]; then + am_config_file=$file + elif [[ $file == *"pdz"* ]]; then + am_ckpt_file=$file + elif [[ $file == *"stat"* ]]; then + am_stat_file=$file + elif [[ $file == *"phone"* ]]; then + phones_dict_file=$file + fi + +done + +# get voc file +for file in $(ls $voc_model_dir) +do + if [[ $file == *"yaml"* ]]; then + voc_config_file=$file + elif [[ $file == *"pdz"* ]]; then + voc_ckpt_file=$file + elif [[ $file == *"stat"* ]]; then + voc_stat_file=$file + fi + +done + + +#run +python test_online_tts.py --am fastspeech2_csmsc \ + --am_support_stream $am_support_stream \ + --am_config $am_model_dir/$am_config_file \ + --am_ckpt $am_model_dir/$am_ckpt_file \ + --am_stat $am_model_dir/$am_stat_file \ + --phones_dict $am_model_dir/$phones_dict_file \ + --voc hifigan_csmsc \ + --voc_config $voc_model_dir/$voc_config_file \ + --voc_ckpt $voc_model_dir/$voc_ckpt_file \ + --voc_stat $voc_model_dir/$voc_stat_file \ + --lang zh \ + --device cpu \ + --text ./csmsc_test.txt \ + --output_dir ./output \ + --log_file ./result.log \ + --am_streaming False \ + --am_pad 12 \ + --am_block 42 \ + --voc_streaming True \ + --voc_pad 14 \ + --voc_block 14 \ + diff --git a/paddlespeech/server/tests/tts/infer/test_online_tts.py b/paddlespeech/server/tests/tts/infer/test_online_tts.py new file mode 100644 index 00000000..17ac0ea7 --- /dev/null +++ b/paddlespeech/server/tests/tts/infer/test_online_tts.py @@ -0,0 +1,650 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
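+# Benchmark for online (streaming) TTS inference: runs the acoustic model and
+# vocoder in streaming / non-streaming combinations and reports first-response
+# time, final-response time and RTF statistics for each test sentence.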
+import argparse +import logging +import math +import threading +import time +from pathlib import Path + +import numpy as np +import paddle +import soundfile as sf +import yaml +from yacs.config import CfgNode + +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.exps.syn_utils import get_sentences +from paddlespeech.t2s.exps.syn_utils import get_voc_inference +from paddlespeech.t2s.exps.syn_utils import model_alias +from paddlespeech.t2s.utils import str2bool + +mel_streaming = None +wav_streaming = None +stream_first_time = 0.0 +voc_stream_st = 0.0 +sample_rate = 0 + + +def denorm(data, mean, std): + return data * std + mean + + +def get_chunks(data, block_size, pad_size, step): + if step == "am": + data_len = data.shape[1] + elif step == "voc": + data_len = data.shape[0] + else: + print("Please set correct type to get chunks, am or voc") + + chunks = [] + n = math.ceil(data_len / block_size) + for i in range(n): + start = max(0, i * block_size - pad_size) + end = min((i + 1) * block_size + pad_size, data_len) + if step == "am": + chunks.append(data[:, start:end, :]) + elif step == "voc": + chunks.append(data[start:end, :]) + else: + print("Please set correct type to get chunks, am or voc") + return chunks + + +def get_stream_am_inference(args, am_config): + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + odim = am_config.n_mels + + am_class = dynamic_import(am_name, model_alias) + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) + am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) + am.eval() + am_mu, am_std = np.load(args.am_stat) + am_mu = paddle.to_tensor(am_mu) + am_std = paddle.to_tensor(am_std) + + return am, am_mu, am_std + + +def init(args): + global sample_rate + # get config + with open(args.am_config) as f: + am_config = CfgNode(yaml.safe_load(f)) + with open(args.voc_config) as f: + voc_config = CfgNode(yaml.safe_load(f)) + + sample_rate = am_config.fs + + # frontend + frontend = get_frontend(args) + + # acoustic model + if args.am_support_stream: + am, am_mu, am_std = get_stream_am_inference(args, am_config) + am_infer_info = [am, am_mu, am_std, am_config] + else: + am_inference, am_name, am_dataset = get_am_inference(args, am_config) + am_infer_info = [am_inference, am_name, am_dataset, am_config] + + # vocoder + voc_inference = get_voc_inference(args, voc_config) + voc_infer_info = [voc_inference, voc_config] + + return frontend, am_infer_info, voc_infer_info + + +def get_phone(args, frontend, sentence, merge_sentences, get_tone_ids): + am_name = args.am[:args.am.rindex('_')] + tone_ids = None + if am_name == 'speedyspeech': + get_tone_ids = True + + if args.lang == 'zh': + input_ids = frontend.get_input_ids( + sentence, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids) + phone_ids = input_ids["phone_ids"] + if get_tone_ids: + tone_ids = input_ids["tone_ids"] + elif args.lang == 'en': + input_ids = frontend.get_input_ids( + sentence, merge_sentences=merge_sentences) + phone_ids = input_ids["phone_ids"] + else: + print("lang should in {'zh', 'en'}!") + + return phone_ids, tone_ids + + +@paddle.no_grad() +# 生成完整的mel +def gen_mel(args, am_infer_info, part_phone_ids, 
part_tone_ids): + # 如果是支持流式的AM模型 + if args.am_support_stream: + am, am_mu, am_std, am_config = am_infer_info + orig_hs, h_masks = am.encoder_infer(part_phone_ids) + if args.am_streaming: + am_pad = args.am_pad + am_block = args.am_block + hss = get_chunks(orig_hs, am_block, am_pad, "am") + chunk_num = len(hss) + mel_list = [] + for i, hs in enumerate(hss): + before_outs, _ = am.decoder(hs) + after_outs = before_outs + am.postnet( + before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) + normalized_mel = after_outs[0] + sub_mel = denorm(normalized_mel, am_mu, am_std) + # clip output part of pad + if i == 0: + sub_mel = sub_mel[:-am_pad] + elif i == chunk_num - 1: + # 最后一块的右侧一定没有 pad 够 + sub_mel = sub_mel[am_pad:] + else: + # 倒数几块的右侧也可能没有 pad 够 + sub_mel = sub_mel[am_pad:(am_block + am_pad) - + sub_mel.shape[0]] + mel_list.append(sub_mel) + mel = paddle.concat(mel_list, axis=0) + + else: + orig_hs, h_masks = am.encoder_infer(part_phone_ids) + before_outs, _ = am.decoder(orig_hs) + after_outs = before_outs + am.postnet( + before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) + normalized_mel = after_outs[0] + mel = denorm(normalized_mel, am_mu, am_std) + + else: + am_inference, am_name, am_dataset, am_config = am_infer_info + # acoustic model + if am_name == 'fastspeech2': + # multi speaker + if am_dataset in {"aishell3", "vctk"}: + spk_id = paddle.to_tensor(args.spk_id) + mel = am_inference(part_phone_ids, spk_id) + else: + mel = am_inference(part_phone_ids) + elif am_name == 'speedyspeech': + part_tone_ids = tone_ids[i] + if am_dataset in {"aishell3", "vctk"}: + spk_id = paddle.to_tensor(args.spk_id) + mel = am_inference(part_phone_ids, part_tone_ids, spk_id) + else: + mel = am_inference(part_phone_ids, part_tone_ids) + elif am_name == 'tacotron2': + mel = am_inference(part_phone_ids) + + return mel + + +@paddle.no_grad() +def stream_voc_infer(args, voc_infer_info, mel_len): + global mel_streaming + global stream_first_time + global wav_streaming + voc_inference, voc_config = voc_infer_info + block = args.voc_block + pad = args.voc_pad + upsample = voc_config.n_shift + wav_list = [] + flag = 1 + + valid_start = 0 + valid_end = min(valid_start + block, mel_len) + actual_start = 0 + actual_end = min(valid_end + pad, mel_len) + mel_chunk = mel_streaming[actual_start:actual_end, :] + + while valid_end <= mel_len: + sub_wav = voc_inference(mel_chunk) + if flag == 1: + stream_first_time = time.time() + flag = 0 + + # get valid wav + start = valid_start - actual_start + if valid_end == mel_len: + sub_wav = sub_wav[start * upsample:] + wav_list.append(sub_wav) + break + else: + end = start + block + sub_wav = sub_wav[start * upsample:end * upsample] + wav_list.append(sub_wav) + + # generate new mel chunk + valid_start = valid_end + valid_end = min(valid_start + block, mel_len) + if valid_start - pad < 0: + actual_start = 0 + else: + actual_start = valid_start - pad + actual_end = min(valid_end + pad, mel_len) + mel_chunk = mel_streaming[actual_start:actual_end, :] + + wav = paddle.concat(wav_list, axis=0) + wav_streaming = wav + + +@paddle.no_grad() +# 非流式AM / 流式AM + 非流式Voc +def am_nostream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, + part_tone_ids): + mel = gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids) + am_infer_time = time.time() + voc_inference, voc_config = voc_infer_info + wav = voc_inference(mel) + first_response_time = time.time() + final_response_time = first_response_time + voc_infer_time = first_response_time + + return am_infer_time, voc_infer_time, 
first_response_time, final_response_time, wav + + +@paddle.no_grad() +# 非流式AM + 流式Voc +def nostream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, + part_tone_ids): + global mel_streaming + global stream_first_time + global wav_streaming + + mel = gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids) + am_infer_time = time.time() + + # voc streaming + mel_streaming = mel + mel_len = mel.shape[0] + stream_voc_infer(args, voc_infer_info, mel_len) + first_response_time = stream_first_time + wav = wav_streaming + final_response_time = time.time() + voc_infer_time = final_response_time + + return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav + + +@paddle.no_grad() +# 流式AM + 流式 Voc +def stream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, + part_tone_ids): + global mel_streaming + global stream_first_time + global wav_streaming + global voc_stream_st + mel_streaming = None + flag = 1 #用来表示开启流式voc的线程 + + am, am_mu, am_std, am_config = am_infer_info + orig_hs, h_masks = am.encoder_infer(part_phone_ids) + mel_len = orig_hs.shape[1] + am_block = args.am_block + am_pad = args.am_pad + hss = get_chunks(orig_hs, am_block, am_pad, "am") + chunk_num = len(hss) + + for i, hs in enumerate(hss): + before_outs, _ = am.decoder(hs) + after_outs = before_outs + am.postnet( + before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) + normalized_mel = after_outs[0] + sub_mel = denorm(normalized_mel, am_mu, am_std) + # clip output part of pad + if i == 0: + sub_mel = sub_mel[:-am_pad] + mel_streaming = sub_mel + elif i == chunk_num - 1: + # 最后一块的右侧一定没有 pad 够 + sub_mel = sub_mel[am_pad:] + mel_streaming = paddle.concat([mel_streaming, sub_mel]) + am_infer_time = time.time() + else: + # 倒数几块的右侧也可能没有 pad 够 + sub_mel = sub_mel[am_pad:(am_block + am_pad) - sub_mel.shape[0]] + mel_streaming = paddle.concat([mel_streaming, sub_mel]) + + if flag and mel_streaming.shape[0] > args.voc_block + args.voc_pad: + t = threading.Thread( + target=stream_voc_infer, args=(args, voc_infer_info, mel_len, )) + t.start() + voc_stream_st = time.time() + flag = 0 + + t.join() + final_response_time = time.time() + voc_infer_time = final_response_time + first_response_time = stream_first_time + wav = wav_streaming + + return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav + + +def try_infer(args, logger, frontend, am_infer_info, voc_infer_info): + global sample_rate + logger.info( + "Before the formal test, we test a few texts to make the inference speed more stable." + ) + if args.lang == 'zh': + sentence = "您好,欢迎使用语音合成服务。" + if args.lang == 'en': + sentence = "Hello and welcome to the speech synthesis service." 
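+ # pick the inference function according to the am/voc streaming flags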
+ + if args.voc_streaming: + if args.am_streaming: + infer_func = stream_am_stream_voc + else: + infer_func = nostream_am_stream_voc + else: + infer_func = am_nostream_voc + + merge_sentences = True + get_tone_ids = False + for i in range(3): # 推理3次 + st = time.time() + phone_ids, tone_ids = get_phone(args, frontend, sentence, + merge_sentences, get_tone_ids) + part_phone_ids = phone_ids[0] + if tone_ids: + part_tone_ids = tone_ids[0] + else: + part_tone_ids = None + + am_infer_time, voc_infer_time, first_response_time, final_response_time, wav = infer_func( + args, am_infer_info, voc_infer_info, part_phone_ids, part_tone_ids) + wav = wav.numpy() + duration = wav.size / sample_rate + logger.info( + f"sentence: {sentence}; duration: {duration} s; first response time: {first_response_time - st} s; final response time: {final_response_time - st} s" + ) + + +def evaluate(args, logger, frontend, am_infer_info, voc_infer_info): + global sample_rate + sentences = get_sentences(args) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + get_tone_ids = False + merge_sentences = True + + # choose infer function + if args.voc_streaming: + if args.am_streaming: + infer_func = stream_am_stream_voc + else: + infer_func = nostream_am_stream_voc + else: + infer_func = am_nostream_voc + + final_up_duration = 0.0 + sentence_count = 0 + front_time_list = [] + am_time_list = [] + voc_time_list = [] + first_response_list = [] + final_response_list = [] + sentence_length_list = [] + duration_list = [] + + for utt_id, sentence in sentences: + # front + front_st = time.time() + phone_ids, tone_ids = get_phone(args, frontend, sentence, + merge_sentences, get_tone_ids) + part_phone_ids = phone_ids[0] + if tone_ids: + part_tone_ids = tone_ids[0] + else: + part_tone_ids = None + front_et = time.time() + front_time = front_et - front_st + + am_st = time.time() + am_infer_time, voc_infer_time, first_response_time, final_response_time, wav = infer_func( + args, am_infer_info, voc_infer_info, part_phone_ids, part_tone_ids) + am_time = am_infer_time - am_st + if args.voc_streaming and args.am_streaming: + voc_time = voc_infer_time - voc_stream_st + else: + voc_time = voc_infer_time - am_infer_time + + first_response = first_response_time - front_st + final_response = final_response_time - front_st + + wav = wav.numpy() + duration = wav.size / sample_rate + sf.write( + str(output_dir / (utt_id + ".wav")), wav, samplerate=sample_rate) + print(f"{utt_id} done!") + + sentence_count += 1 + front_time_list.append(front_time) + am_time_list.append(am_time) + voc_time_list.append(voc_time) + first_response_list.append(first_response) + final_response_list.append(final_response) + sentence_length_list.append(len(sentence)) + duration_list.append(duration) + + logger.info( + f"uttid: {utt_id}; sentence: '{sentence}'; front time: {front_time} s; am time: {am_time} s; voc time: {voc_time} s; \ + first response time: {first_response} s; final response time: {final_response} s; audio duration: {duration} s;" + ) + + if final_response > duration: + final_up_duration += 1 + + all_time_sum = sum(final_response_list) + front_rate = sum(front_time_list) / all_time_sum + am_rate = sum(am_time_list) / all_time_sum + voc_rate = sum(voc_time_list) / all_time_sum + rtf = all_time_sum / sum(duration_list) + + logger.info( + f"The length of test text information, test num: {sentence_count}; text num: {sum(sentence_length_list)}; min: {min(sentence_length_list)}; max: {max(sentence_length_list)}; avg: 
{sum(sentence_length_list)/len(sentence_length_list)}" + ) + logger.info( + f"duration information, min: {min(duration_list)}; max: {max(duration_list)}; avg: {sum(duration_list) / len(duration_list)}; sum: {sum(duration_list)}" + ) + logger.info( + f"Front time information: min: {min(front_time_list)} s; max: {max(front_time_list)} s; avg: {sum(front_time_list)/len(front_time_list)} s; ratio: {front_rate * 100}%" + ) + logger.info( + f"AM time information: min: {min(am_time_list)} s; max: {max(am_time_list)} s; avg: {sum(am_time_list)/len(am_time_list)} s; ratio: {am_rate * 100}%" + ) + logger.info( + f"Vocoder time information: min: {min(voc_time_list)} s, max: {max(voc_time_list)} s; avg: {sum(voc_time_list)/len(voc_time_list)} s; ratio: {voc_rate * 100}%" + ) + logger.info( + f"first response time information: min: {min(first_response_list)} s; max: {max(first_response_list)} s; avg: {sum(first_response_list)/len(first_response_list)} s" + ) + logger.info( + f"final response time information: min: {min(final_response_list)} s; max: {max(final_response_list)} s; avg: {sum(final_response_list)/len(final_response_list)} s" + ) + logger.info(f"RTF is: {rtf}") + logger.info( + f"The number of final_response is greater than duration is {final_up_duration}, ratio: {final_up_duration / sentence_count}%" + ) + + +def parse_args(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with acoustic model & vocoder") + # acoustic model + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=[ + 'speedyspeech_csmsc', 'speedyspeech_aishell3', 'fastspeech2_csmsc', + 'fastspeech2_ljspeech', 'fastspeech2_aishell3', 'fastspeech2_vctk', + 'tacotron2_csmsc', 'tacotron2_ljspeech' + ], + help='Choose acoustic model type of tts task.') + parser.add_argument( + '--am_support_stream', + type=str2bool, + default=False, + help='if am model is fastspeech2_csmsc, specify whether it supports streaming' + ) + parser.add_argument( + '--am_config', + type=str, + default=None, + help='Config of acoustic model. Use deault config when it is None.') + parser.add_argument( + '--am_ckpt', + type=str, + default=None, + help='Checkpoint file of acoustic model.') + parser.add_argument( + "--am_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training acoustic model." + ) + parser.add_argument( + "--phones_dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--tones_dict", type=str, default=None, help="tone vocabulary file.") + parser.add_argument( + "--speaker_dict", type=str, default=None, help="speaker id map file.") + parser.add_argument( + '--spk_id', + type=int, + default=0, + help='spk id for multi speaker acoustic model') + # vocoder + parser.add_argument( + '--voc', + type=str, + default='mb_melgan_csmsc', + choices=[ + 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', + 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc', + 'wavernn_csmsc' + ], + help='Choose vocoder type of tts task.') + parser.add_argument( + '--voc_config', + type=str, + default=None, + help='Config of voc. Use deault config when it is None.') + parser.add_argument( + '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.') + parser.add_argument( + "--voc_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training voc." 
+ ) + # other + parser.add_argument( + '--lang', + type=str, + default='zh', + choices=['zh', 'en'], + help='Choose model language. zh or en') + + parser.add_argument( + "--device", type=str, default='cpu', help="set cpu or gpu:id") + + parser.add_argument( + "--text", + type=str, + default="./csmsc_test.txt", + help="text to synthesize, a 'utt_id sentence' pair per line.") + parser.add_argument("--output_dir", type=str, help="output dir.") + parser.add_argument( + "--log_file", type=str, default="result.log", help="log file.") + + parser.add_argument( + "--am_streaming", + type=str2bool, + default=False, + help="whether use streaming acoustic model") + + parser.add_argument("--am_pad", type=int, default=12, help="am pad size.") + + parser.add_argument( + "--am_block", type=int, default=42, help="am block size.") + + parser.add_argument( + "--voc_streaming", + type=str2bool, + default=False, + help="whether use streaming vocoder model") + + parser.add_argument("--voc_pad", type=int, default=14, help="voc pad size.") + + parser.add_argument( + "--voc_block", type=int, default=14, help="voc block size.") + + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + paddle.set_device(args.device) + if args.am_support_stream: + assert (args.am == 'fastspeech2_csmsc') + if args.am_streaming: + assert (args.am_support_stream and args.am == 'fastspeech2_csmsc') + if args.voc_streaming: + assert (args.voc == 'mb_melgan_csmsc' or args.voc == 'hifigan_csmsc') + + logger = logging.getLogger() + fhandler = logging.FileHandler(filename=args.log_file, mode='w') + formatter = logging.Formatter( + '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s' + ) + fhandler.setFormatter(formatter) + logger.addHandler(fhandler) + logger.setLevel(logging.DEBUG) + + # set basic information + logger.info( + f"AM: {args.am}; Vocoder: {args.voc}; device: {args.device}; am streaming: {args.am_streaming}; voc streaming: {args.voc_streaming}" + ) + logger.info( + f"am pad size: {args.am_pad}; am block size: {args.am_block}; voc pad size: {args.voc_pad}; voc block size: {args.voc_block};" + ) + + # get information about model + frontend, am_infer_info, voc_infer_info = init(args) + logger.info( + "************************ try infer *********************************") + try_infer(args, logger, frontend, am_infer_info, voc_infer_info) + logger.info( + "************************ normal test *******************************") + evaluate(args, logger, frontend, am_infer_info, voc_infer_info) + + +if __name__ == "__main__": + main() From 4b111146dc959daac319879ba8d89fb9a3f24b75 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 11 Apr 2022 15:31:03 +0800 Subject: [PATCH 2/7] code format, test=doc --- .../server/tests/tts/infer/csmsc_test.txt | 100 ------------------ paddlespeech/server/tests/tts/infer/run.sh | 28 ++--- .../server/tests/tts/infer/test_online_tts.py | 71 +++---------- 3 files changed, 26 insertions(+), 173 deletions(-) delete mode 100644 paddlespeech/server/tests/tts/infer/csmsc_test.txt diff --git a/paddlespeech/server/tests/tts/infer/csmsc_test.txt b/paddlespeech/server/tests/tts/infer/csmsc_test.txt deleted file mode 100644 index d8cf367c..00000000 --- a/paddlespeech/server/tests/tts/infer/csmsc_test.txt +++ /dev/null @@ -1,100 +0,0 @@ -009901 昨日,这名伤者与医生全部被警方依法刑事拘留。 -009902 钱伟长想到上海来办学校是经过深思熟虑的。 -009903 她见我一进门就骂,吃饭时也骂,骂得我抬不起头。 -009904 李述德在离开之前,只说了一句柱驼杀父亲了。 -009905 这种车票和保险单捆绑出售属于重复性购买。 -009906 戴佩妮的男友西米露接唱情歌,让她非常开心。 -009907 观大势,谋大局,出大策始终是该院的办院方针。 -009908 
他们骑着摩托回家,正好为农忙时的父母帮忙。 -009909 但是因为还没到退休年龄,只能掰着指头捱日子。 -009910 这几天雨水不断,人们恨不得待在家里不出门。 -009911 没想到徐赟,张海翔两人就此玩起了人间蒸发。 -009912 藤村此番发言可能是为了凸显野田的领导能力。 -009913 程长庚,生在清王朝嘉庆年间,安徽的潜山小县。 -009914 南海海域综合补给基地码头项目正在论证中。 -009915 也就是说今晚成都市民极有可能再次看到飘雪。 -009916 随着天气转热,各地的游泳场所开始人头攒动。 -009917 更让徐先生纳闷的是,房客的手机也打不通了。 -009918 遇到颠簸时,应听从乘务员的安全指令,回座位坐好。 -009919 他在后面呆惯了,怕自己一插身后的人会不满,不敢排进去。 -009920 傍晚七个小人回来了,白雪公主说,你们就是我命中的七个小矮人吧。 -009921 他本想说,教育局管这个,他们是一路的,这样一管岂不是妓女起嫖客? -009922 一种表示商品所有权的财物证券,也称商品证券,如提货单,交货单。 -009923 会有很丰富的东西留下来,说都说不完。 -009924 这句话像从天而降,吓得四周一片寂静。 -009925 记者所在的是受害人家属所在的右区。 -009926 不管哈大爷去哪,它都一步不离地跟着。 -009927 大家抬头望去,一只老鼠正趴在吊顶上。 -009928 我决定过年就辞职,接手我爸的废品站! -009929 最终,中国男子乒乓球队获得此奖项。 -009930 防汛抗旱两手抓,抗旱相对抓的不够。 -009931 图们江下游地区开发开放的进展如何? -009932 这要求中国必须有一个坚强的政党领导。 -009933 再说,关于利益上的事俺俩都不好开口。 -009934 明代瓦剌,鞑靼入侵明境也是通过此地。 -009935 咪咪舔着孩子,把它身上的毛舔干净。 -009936 是否这次的国标修订被大企业绑架了? -009937 判决后,姚某妻子胡某不服,提起上诉。 -009938 由此可以看出邯钢的经济效益来自何处。 -009939 琳达说,是瑜伽改变了她和马儿的生活。 -009940 楼下的保安告诉记者,这里不租也不卖。 -009941 习近平说,中斯两国人民传统友谊深厚。 -009942 传闻越来越多,后来连老汉儿自己都怕了。 -009943 我怒吼一声冲上去,举起砖头砸了过去。 -009944 我现在还不会,这就回去问问发明我的人。 -009945 显然,洛阳性奴案不具备上述两个前提。 -009946 另外,杰克逊有文唇线,眼线,眉毛的动作。 -009947 昨晚,华西都市报记者电话采访了尹琪。 -009948 涅拉季科未透露这些航空公司的名称。 -009949 从运行轨迹上来说,它也不可能是星星。 -009950 目前看,如果继续加息也存在两难问题。 -009951 曾宝仪在节目录制现场大爆观众糗事。 -009952 但任凭周某怎么叫,男子仍酣睡不醒。 -009953 老大爷说,小子,你挡我财路了,知道不? -009954 没料到,闯下大头佛的阿伟还不知悔改。 -009955 卡扎菲部落式统治已遭遇部落内讧。 -009956 这个孩子的生命一半来源于另一位女士捐赠的冷冻卵子。 -009957 出现这种泥鳅内阁的局面既是野田有意为之,也实属无奈。 -009958 济青高速济南,华山,章丘,邹平,周村,淄博,临淄站。 -009959 赵凌飞的话,反映了沈阳赛区所有奥运志愿者的共同心声。 -009960 因为,我们所发出的力量必会因难度加大而减弱。 -009961 发生事故的楼梯拐角处仍可看到血迹。 -009962 想过进公安,可能身高不够,老汉儿也不让我进去。 -009963 路上关卡很多,为了方便撤离,只好轻装前进。 -009964 原来比尔盖茨就是美国微软公司联合创始人呀。 -009965 之后他们一家三口将与双方父母往峇里岛旅游。 -009966 谢谢总理,也感谢广大网友的参与,我们明年再见。 -009967 事实上是,从来没有一个欺善怕恶的人能作出过稍大一点的成就。 -009968 我会打开邮件,你可以从那里继续。 -009969 美方对近期东海局势表示关切。 -009970 据悉,奥巴马一家人对这座冬季白宫极为满意。 -009971 打扫完你会很有成就感的,试一试,你就信了。 -009972 诺曼站在滑板车上,各就各位,准备出发啦! -009973 塔河的寒夜,气温降到了零下三十多摄氏度。 -009974 其间,连破六点六,六点五,六点四,六点三五等多个重要关口。 -009975 算命其实只是人们的一种自我安慰和自我暗示而已,我们还是要相信科学才好。 -009976 这一切都令人欢欣鼓舞,阿讷西没理由不坚持到最后。 -009977 直至公元前一万一千年,它又再次出现。 -009978 尽量少玩电脑,少看电视,少打游戏。 -009979 从五到七,前后也就是六个月的时间。 -009980 一进咖啡店,他就遇见一张熟悉的脸。 -009981 好在众弟兄看到了把她追了回来。 -009982 有一个人说,哥们儿我们跑过它才能活。 -009983 捅了她以后,模糊记得她没咋动了。 -009984 从小到大,葛启义没有收到过压岁钱。 -009985 舞台下的你会对舞台上的你说什么? -009986 但考生普遍认为,试题的怪多过难。 -009987 我希望每个人都能够尊重我们的隐私。 -009988 漫天的红霞使劲给两人增添气氛。 -009989 晚上加完班开车回家,太累了,迷迷糊糊开着车,走一半的时候,铛一声! -009990 该车将三人撞倒后,在大雾中逃窜。 -009991 这人一哆嗦,方向盘也把不稳了,差点撞上了高速边道护栏。 -009992 那女孩儿委屈的说,我一回头见你已经进去了我不敢进去啊! -009993 小明摇摇头说,不是,我只是美女看多了,想换个口味而已。 -009994 接下来,红娘要求记者交费,记者表示不知表姐身份证号码。 -009995 李东蓊表示,自己当时在法庭上发表了一次独特的公诉意见。 -009996 另一男子扑了上来,手里拿着明晃晃的长刀,向他胸口直刺。 -009997 今天,快递员拿着一个快递在办公室喊,秦王是哪个,有他快递? -009998 这场抗议活动究竟是如何发展演变的,又究竟是谁伤害了谁? 
-009999 因华国锋肖鸡,墓地设计根据其属相设计。 -010000 在狱中,张明宝悔恨交加,写了一份忏悔书。 diff --git a/paddlespeech/server/tests/tts/infer/run.sh b/paddlespeech/server/tests/tts/infer/run.sh index fdceec41..631daddd 100644 --- a/paddlespeech/server/tests/tts/infer/run.sh +++ b/paddlespeech/server/tests/tts/infer/run.sh @@ -1,14 +1,7 @@ -model_path=/home/users/liangyunming/.paddlespeech/models/ -#am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_nosil_baker_ckpt_0.4/ ## fastspeech2 -am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ ## fastspeech2_cnn -voc_model_dir=$model_path/hifigan_csmsc-zh/hifigan_csmsc_ckpt_0.1.1/ ## hifigan -#voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ ## mb_melgan - -if [[ $am_model_dir == *"fastspeech2_cnndecoder"* ]]; then - am_support_stream=True -else - am_support_stream=False -fi +model_path=~/.paddlespeech/models/ +am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ ## fastspeech2_c +voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ ## mb_melgan +testdata=../../../../t2s/exps/csmsc_test.txt # get am file for file in $(ls $am_model_dir) @@ -39,23 +32,24 @@ do done -#run -python test_online_tts.py --am fastspeech2_csmsc \ - --am_support_stream $am_support_stream \ +# run test +# am can choose fastspeech2_csmsc or fastspeech2-C_csmsc, where fastspeech2-C_csmsc supports streaming inference. +# voc can choose hifigan_csmsc and mb_melgan_csmsc, They can both support streaming inference. +python test_online_tts.py --am fastspeech2-C_csmsc \ --am_config $am_model_dir/$am_config_file \ --am_ckpt $am_model_dir/$am_ckpt_file \ --am_stat $am_model_dir/$am_stat_file \ --phones_dict $am_model_dir/$phones_dict_file \ - --voc hifigan_csmsc \ + --voc mb_melgan_csmsc \ --voc_config $voc_model_dir/$voc_config_file \ --voc_ckpt $voc_model_dir/$voc_ckpt_file \ --voc_stat $voc_model_dir/$voc_stat_file \ --lang zh \ --device cpu \ - --text ./csmsc_test.txt \ + --text $testdata \ --output_dir ./output \ --log_file ./result.log \ - --am_streaming False \ + --am_streaming True \ --am_pad 12 \ --am_block 42 \ --voc_streaming True \ diff --git a/paddlespeech/server/tests/tts/infer/test_online_tts.py b/paddlespeech/server/tests/tts/infer/test_online_tts.py index 17ac0ea7..8ccf724b 100644 --- a/paddlespeech/server/tests/tts/infer/test_online_tts.py +++ b/paddlespeech/server/tests/tts/infer/test_online_tts.py @@ -71,8 +71,7 @@ def get_stream_am_inference(args, am_config): vocab_size = len(phn_id) print("vocab_size:", vocab_size) - am_name = args.am[:args.am.rindex('_')] - am_dataset = args.am[args.am.rindex('_') + 1:] + am_name = "fastspeech2" odim = am_config.n_mels am_class = dynamic_import(am_name, model_alias) @@ -100,7 +99,7 @@ def init(args): frontend = get_frontend(args) # acoustic model - if args.am_support_stream: + if args.am == 'fastspeech2-C_csmsc': am, am_mu, am_std = get_stream_am_inference(args, am_config) am_infer_info = [am, am_mu, am_std, am_config] else: @@ -117,8 +116,6 @@ def init(args): def get_phone(args, frontend, sentence, merge_sentences, get_tone_ids): am_name = args.am[:args.am.rindex('_')] tone_ids = None - if am_name == 'speedyspeech': - get_tone_ids = True if args.lang == 'zh': input_ids = frontend.get_input_ids( @@ -142,7 +139,7 @@ def get_phone(args, frontend, sentence, merge_sentences, get_tone_ids): # 生成完整的mel def gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids): # 如果是支持流式的AM模型 - if args.am_support_stream: + if args.am == 'fastspeech2-C_csmsc': am, 
am_mu, am_std, am_config = am_infer_info orig_hs, h_masks = am.encoder_infer(part_phone_ids) if args.am_streaming: @@ -180,23 +177,7 @@ def gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids): else: am_inference, am_name, am_dataset, am_config = am_infer_info - # acoustic model - if am_name == 'fastspeech2': - # multi speaker - if am_dataset in {"aishell3", "vctk"}: - spk_id = paddle.to_tensor(args.spk_id) - mel = am_inference(part_phone_ids, spk_id) - else: - mel = am_inference(part_phone_ids) - elif am_name == 'speedyspeech': - part_tone_ids = tone_ids[i] - if am_dataset in {"aishell3", "vctk"}: - spk_id = paddle.to_tensor(args.spk_id) - mel = am_inference(part_phone_ids, part_tone_ids, spk_id) - else: - mel = am_inference(part_phone_ids, part_tone_ids) - elif am_name == 'tacotron2': - mel = am_inference(part_phone_ids) + mel = am_inference(part_phone_ids) return mel @@ -297,7 +278,8 @@ def stream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, global wav_streaming global voc_stream_st mel_streaming = None - flag = 1 #用来表示开启流式voc的线程 + #用来表示开启流式voc的线程 + flag = 1 am, am_mu, am_std, am_config = am_infer_info orig_hs, h_masks = am.encoder_infer(part_phone_ids) @@ -343,7 +325,7 @@ def stream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav -def try_infer(args, logger, frontend, am_infer_info, voc_infer_info): +def warm_up(args, logger, frontend, am_infer_info, voc_infer_info): global sample_rate logger.info( "Before the formal test, we test a few texts to make the inference speed more stable." @@ -363,7 +345,7 @@ def try_infer(args, logger, frontend, am_infer_info, voc_infer_info): merge_sentences = True get_tone_ids = False - for i in range(3): # 推理3次 + for i in range(5): # 推理5次 st = time.time() phone_ids, tone_ids = get_phone(args, frontend, sentence, merge_sentences, get_tone_ids) @@ -500,18 +482,10 @@ def parse_args(): '--am', type=str, default='fastspeech2_csmsc', - choices=[ - 'speedyspeech_csmsc', 'speedyspeech_aishell3', 'fastspeech2_csmsc', - 'fastspeech2_ljspeech', 'fastspeech2_aishell3', 'fastspeech2_vctk', - 'tacotron2_csmsc', 'tacotron2_ljspeech' - ], - help='Choose acoustic model type of tts task.') - parser.add_argument( - '--am_support_stream', - type=str2bool, - default=False, - help='if am model is fastspeech2_csmsc, specify whether it supports streaming' + choices=['fastspeech2_csmsc', 'fastspeech2-C_csmsc'], + help='Choose acoustic model type of tts task. 
where fastspeech2-C_csmsc supports streaming inference' ) + parser.add_argument( '--am_config', type=str, @@ -532,23 +506,12 @@ def parse_args(): "--phones_dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument( "--tones_dict", type=str, default=None, help="tone vocabulary file.") - parser.add_argument( - "--speaker_dict", type=str, default=None, help="speaker id map file.") - parser.add_argument( - '--spk_id', - type=int, - default=0, - help='spk id for multi speaker acoustic model') # vocoder parser.add_argument( '--voc', type=str, default='mb_melgan_csmsc', - choices=[ - 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', - 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc', - 'wavernn_csmsc' - ], + choices=['mb_melgan_csmsc', 'hifigan_csmsc'], help='Choose vocoder type of tts task.') parser.add_argument( '--voc_config', @@ -612,12 +575,8 @@ def parse_args(): def main(): args = parse_args() paddle.set_device(args.device) - if args.am_support_stream: - assert (args.am == 'fastspeech2_csmsc') if args.am_streaming: - assert (args.am_support_stream and args.am == 'fastspeech2_csmsc') - if args.voc_streaming: - assert (args.voc == 'mb_melgan_csmsc' or args.voc == 'hifigan_csmsc') + assert (args.am == 'fastspeech2-C_csmsc') logger = logging.getLogger() fhandler = logging.FileHandler(filename=args.log_file, mode='w') @@ -639,8 +598,8 @@ def main(): # get information about model frontend, am_infer_info, voc_infer_info = init(args) logger.info( - "************************ try infer *********************************") - try_infer(args, logger, frontend, am_infer_info, voc_infer_info) + "************************ warm up *********************************") + warm_up(args, logger, frontend, am_infer_info, voc_infer_info) logger.info( "************************ normal test *******************************") evaluate(args, logger, frontend, am_infer_info, voc_infer_info) From 9d0224460bec81139fd7d69732dce0f7c7ec36fa Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 11 Apr 2022 15:54:44 +0800 Subject: [PATCH 3/7] code format, test=doc --- paddlespeech/server/tests/tts/infer/run.sh | 12 ++-- .../server/tests/tts/infer/test_online_tts.py | 67 ++++++++++--------- 2 files changed, 42 insertions(+), 37 deletions(-) diff --git a/paddlespeech/server/tests/tts/infer/run.sh b/paddlespeech/server/tests/tts/infer/run.sh index 631daddd..3733c3fb 100644 --- a/paddlespeech/server/tests/tts/infer/run.sh +++ b/paddlespeech/server/tests/tts/infer/run.sh @@ -1,6 +1,6 @@ model_path=~/.paddlespeech/models/ -am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ ## fastspeech2_c -voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ ## mb_melgan +am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ +voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ testdata=../../../../t2s/exps/csmsc_test.txt # get am file @@ -33,9 +33,13 @@ done # run test -# am can choose fastspeech2_csmsc or fastspeech2-C_csmsc, where fastspeech2-C_csmsc supports streaming inference. +# am can choose fastspeech2_csmsc or fastspeech2_cnndecoder_csmsc, where fastspeech2_cnndecoder_csmsc supports streaming inference. # voc can choose hifigan_csmsc and mb_melgan_csmsc, They can both support streaming inference. -python test_online_tts.py --am fastspeech2-C_csmsc \ +# When am is fastspeech2_cnndecoder_csmsc and am_pad is set to 12, there is no diff between streaming and non-streaming inference results. 
+# When voc is mb_melgan_csmsc and voc_pad is set to 14, there is no diff between streaming and non-streaming inference results. +# When voc is hifigan_csmsc and voc_pad is set to 20, there is no diff between streaming and non-streaming inference results. + +python test_online_tts.py --am fastspeech2_cnndecoder_csmsc \ --am_config $am_model_dir/$am_config_file \ --am_ckpt $am_model_dir/$am_ckpt_file \ --am_stat $am_model_dir/$am_stat_file \ diff --git a/paddlespeech/server/tests/tts/infer/test_online_tts.py b/paddlespeech/server/tests/tts/infer/test_online_tts.py index 8ccf724b..eb5fc80b 100644 --- a/paddlespeech/server/tests/tts/infer/test_online_tts.py +++ b/paddlespeech/server/tests/tts/infer/test_online_tts.py @@ -34,8 +34,8 @@ from paddlespeech.t2s.utils import str2bool mel_streaming = None wav_streaming = None -stream_first_time = 0.0 -voc_stream_st = 0.0 +streaming_first_time = 0.0 +streaming_voc_st = 0.0 sample_rate = 0 @@ -65,7 +65,7 @@ def get_chunks(data, block_size, pad_size, step): return chunks -def get_stream_am_inference(args, am_config): +def get_streaming_am_inference(args, am_config): with open(args.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) @@ -99,8 +99,8 @@ def init(args): frontend = get_frontend(args) # acoustic model - if args.am == 'fastspeech2-C_csmsc': - am, am_mu, am_std = get_stream_am_inference(args, am_config) + if args.am == 'fastspeech2_cnndecoder_csmsc': + am, am_mu, am_std = get_streaming_am_inference(args, am_config) am_infer_info = [am, am_mu, am_std, am_config] else: am_inference, am_name, am_dataset = get_am_inference(args, am_config) @@ -139,7 +139,7 @@ def get_phone(args, frontend, sentence, merge_sentences, get_tone_ids): # 生成完整的mel def gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids): # 如果是支持流式的AM模型 - if args.am == 'fastspeech2-C_csmsc': + if args.am == 'fastspeech2_cnndecoder_csmsc': am, am_mu, am_std, am_config = am_infer_info orig_hs, h_masks = am.encoder_infer(part_phone_ids) if args.am_streaming: @@ -183,9 +183,9 @@ def gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids): @paddle.no_grad() -def stream_voc_infer(args, voc_infer_info, mel_len): +def streaming_voc_infer(args, voc_infer_info, mel_len): global mel_streaming - global stream_first_time + global streaming_first_time global wav_streaming voc_inference, voc_config = voc_infer_info block = args.voc_block @@ -203,7 +203,7 @@ def stream_voc_infer(args, voc_infer_info, mel_len): while valid_end <= mel_len: sub_wav = voc_inference(mel_chunk) if flag == 1: - stream_first_time = time.time() + streaming_first_time = time.time() flag = 0 # get valid wav @@ -233,8 +233,8 @@ def stream_voc_infer(args, voc_infer_info, mel_len): @paddle.no_grad() # 非流式AM / 流式AM + 非流式Voc -def am_nostream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, - part_tone_ids): +def am_nonstreaming_voc(args, am_infer_info, voc_infer_info, part_phone_ids, + part_tone_ids): mel = gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids) am_infer_time = time.time() voc_inference, voc_config = voc_infer_info @@ -248,10 +248,10 @@ def am_nostream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, @paddle.no_grad() # 非流式AM + 流式Voc -def nostream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, - part_tone_ids): +def nonstreaming_am_streaming_voc(args, am_infer_info, voc_infer_info, + part_phone_ids, part_tone_ids): global mel_streaming - global stream_first_time + global streaming_first_time global wav_streaming mel 
= gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids) @@ -260,8 +260,8 @@ def nostream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, # voc streaming mel_streaming = mel mel_len = mel.shape[0] - stream_voc_infer(args, voc_infer_info, mel_len) - first_response_time = stream_first_time + streaming_voc_infer(args, voc_infer_info, mel_len) + first_response_time = streaming_first_time wav = wav_streaming final_response_time = time.time() voc_infer_time = final_response_time @@ -271,12 +271,12 @@ def nostream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, @paddle.no_grad() # 流式AM + 流式 Voc -def stream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, - part_tone_ids): +def streaming_am_streaming_voc(args, am_infer_info, voc_infer_info, + part_phone_ids, part_tone_ids): global mel_streaming - global stream_first_time + global streaming_first_time global wav_streaming - global voc_stream_st + global streaming_voc_st mel_streaming = None #用来表示开启流式voc的线程 flag = 1 @@ -311,15 +311,16 @@ def stream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, if flag and mel_streaming.shape[0] > args.voc_block + args.voc_pad: t = threading.Thread( - target=stream_voc_infer, args=(args, voc_infer_info, mel_len, )) + target=streaming_voc_infer, + args=(args, voc_infer_info, mel_len, )) t.start() - voc_stream_st = time.time() + streaming_voc_st = time.time() flag = 0 t.join() final_response_time = time.time() voc_infer_time = final_response_time - first_response_time = stream_first_time + first_response_time = streaming_first_time wav = wav_streaming return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav @@ -337,11 +338,11 @@ def warm_up(args, logger, frontend, am_infer_info, voc_infer_info): if args.voc_streaming: if args.am_streaming: - infer_func = stream_am_stream_voc + infer_func = streaming_am_streaming_voc else: - infer_func = nostream_am_stream_voc + infer_func = nonstreaming_am_streaming_voc else: - infer_func = am_nostream_voc + infer_func = am_nonstreaming_voc merge_sentences = True get_tone_ids = False @@ -376,11 +377,11 @@ def evaluate(args, logger, frontend, am_infer_info, voc_infer_info): # choose infer function if args.voc_streaming: if args.am_streaming: - infer_func = stream_am_stream_voc + infer_func = streaming_am_streaming_voc else: - infer_func = nostream_am_stream_voc + infer_func = nonstreaming_am_streaming_voc else: - infer_func = am_nostream_voc + infer_func = am_nonstreaming_voc final_up_duration = 0.0 sentence_count = 0 @@ -410,7 +411,7 @@ def evaluate(args, logger, frontend, am_infer_info, voc_infer_info): args, am_infer_info, voc_infer_info, part_phone_ids, part_tone_ids) am_time = am_infer_time - am_st if args.voc_streaming and args.am_streaming: - voc_time = voc_infer_time - voc_stream_st + voc_time = voc_infer_time - streaming_voc_st else: voc_time = voc_infer_time - am_infer_time @@ -482,8 +483,8 @@ def parse_args(): '--am', type=str, default='fastspeech2_csmsc', - choices=['fastspeech2_csmsc', 'fastspeech2-C_csmsc'], - help='Choose acoustic model type of tts task. where fastspeech2-C_csmsc supports streaming inference' + choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc'], + help='Choose acoustic model type of tts task. 
where fastspeech2_cnndecoder_csmsc supports streaming inference' ) parser.add_argument( @@ -576,7 +577,7 @@ def main(): args = parse_args() paddle.set_device(args.device) if args.am_streaming: - assert (args.am == 'fastspeech2-C_csmsc') + assert (args.am == 'fastspeech2_cnndecoder_csmsc') logger = logging.getLogger() fhandler = logging.FileHandler(filename=args.log_file, mode='w') From 9c0ceaacb6aafa1175b0df7372fb411e2fd772fe Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 18 Apr 2022 17:27:45 +0800 Subject: [PATCH 4/7] add streaming am infer, test=doc --- .../server/engine/tts/online/tts_engine.py | 517 ++++++++++++++++-- paddlespeech/server/utils/util.py | 4 + 2 files changed, 462 insertions(+), 59 deletions(-) diff --git a/paddlespeech/server/engine/tts/online/tts_engine.py b/paddlespeech/server/engine/tts/online/tts_engine.py index 25a8bc76..8e76225d 100644 --- a/paddlespeech/server/engine/tts/online/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/tts_engine.py @@ -12,24 +12,322 @@ # See the License for the specific language governing permissions and # limitations under the License. import base64 +import math +import os import time +from typing import Optional import numpy as np import paddle +import yaml +from yacs.config import CfgNode from paddlespeech.cli.log import logger from paddlespeech.cli.tts.infer import TTSExecutor +from paddlespeech.cli.utils import download_and_decompress +from paddlespeech.cli.utils import MODEL_HOME +from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import float2pcm +from paddlespeech.server.utils.util import denorm from paddlespeech.server.utils.util import get_chunks +from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.modules.normalizer import ZScore + +__all__ = ['TTSEngine'] + +# support online model +pretrained_models = { + # fastspeech2 + "fastspeech2_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip', + 'md5': + '637d28a5e53aa60275612ba4393d5f22', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_76000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + "fastspeech2_cnndecoder_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip', + 'md5': + '6eb28e22ace73e0ebe7845f86478f89f', + 'config': + 'cnndecoder.yaml', + 'ckpt': + 'snapshot_iter_153000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + + # mb_melgan + "mb_melgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'ee5f0604e20091f0d495b6ec4618b90d', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_1000000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + + # hifigan + "hifigan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'dd40a3d88dfcf64513fba2f0f961ada6', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, +} + +model_alias = { + # acoustic model + "fastspeech2": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2", + "fastspeech2_inference": + 
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + + # voc + "mb_melgan": + "paddlespeech.t2s.models.melgan:MelGANGenerator", + "mb_melgan_inference": + "paddlespeech.t2s.models.melgan:MelGANInference", + "hifigan": + "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", + "hifigan_inference": + "paddlespeech.t2s.models.hifigan:HiFiGANInference", +} __all__ = ['TTSEngine'] class TTSServerExecutor(TTSExecutor): - def __init__(self): + def __init__(self, am_block, am_pad, voc_block, voc_pad): super().__init__() - pass + self.am_block = am_block + self.am_pad = am_pad + self.voc_block = voc_block + self.voc_pad = voc_pad + + def get_model_info(self, step, model_name, ckpt, stat): + """get model information + + Args: + step (string): am or voc + model_name (string): model type, support fastspeech2, higigan, mb_melgan + ckpt (string): ckpt file + stat (string): stat file, including mean and standard deviation + + Returns: + model, model_mu, model_std + """ + model_class = dynamic_import(model_name, model_alias) + + if step == "am": + odim = self.am_config.n_mels + model = model_class( + idim=self.vocab_size, odim=odim, **self.am_config["model"]) + model.set_state_dict(paddle.load(ckpt)["main_params"]) + + elif step == "voc": + model = model_class(**self.voc_config["generator_params"]) + model.set_state_dict(paddle.load(ckpt)["generator_params"]) + model.remove_weight_norm() + + else: + logger.error("Please set correct step, am or voc") + + model.eval() + model_mu, model_std = np.load(stat) + model_mu = paddle.to_tensor(model_mu) + model_std = paddle.to_tensor(model_std) + + return model, model_mu, model_std + + def _get_pretrained_path(self, tag: str) -> os.PathLike: + """ + Download and returns pretrained resources path of current task. + """ + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(pretrained_models[tag], + res_path) + decompressed_path = os.path.abspath(decompressed_path) + logger.info( + 'Use pretrained model stored in: {}'.format(decompressed_path)) + return decompressed_path + + def _init_from_path( + self, + am: str='fastspeech2_csmsc', + am_config: Optional[os.PathLike]=None, + am_ckpt: Optional[os.PathLike]=None, + am_stat: Optional[os.PathLike]=None, + phones_dict: Optional[os.PathLike]=None, + tones_dict: Optional[os.PathLike]=None, + speaker_dict: Optional[os.PathLike]=None, + voc: str='mb_melgan_csmsc', + voc_config: Optional[os.PathLike]=None, + voc_ckpt: Optional[os.PathLike]=None, + voc_stat: Optional[os.PathLike]=None, + lang: str='zh', ): + """ + Init model and other resources from a specific path. 
+ """ + if hasattr(self, 'am_inference') and hasattr(self, 'voc_inference'): + logger.info('Models had been initialized.') + return + # am model info + am_tag = am + '-' + lang + if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: + am_res_path = self._get_pretrained_path(am_tag) + self.am_res_path = am_res_path + self.am_config = os.path.join(am_res_path, + pretrained_models[am_tag]['config']) + self.am_ckpt = os.path.join(am_res_path, + pretrained_models[am_tag]['ckpt']) + self.am_stat = os.path.join( + am_res_path, pretrained_models[am_tag]['speech_stats']) + # must have phones_dict in acoustic + self.phones_dict = os.path.join( + am_res_path, pretrained_models[am_tag]['phones_dict']) + print("self.phones_dict:", self.phones_dict) + logger.info(am_res_path) + logger.info(self.am_config) + logger.info(self.am_ckpt) + else: + self.am_config = os.path.abspath(am_config) + self.am_ckpt = os.path.abspath(am_ckpt) + self.am_stat = os.path.abspath(am_stat) + self.phones_dict = os.path.abspath(phones_dict) + self.am_res_path = os.path.dirname(os.path.abspath(self.am_config)) + print("self.phones_dict:", self.phones_dict) + + self.tones_dict = None + self.speaker_dict = None + + # voc model info + voc_tag = voc + '-' + lang + if voc_ckpt is None or voc_config is None or voc_stat is None: + voc_res_path = self._get_pretrained_path(voc_tag) + self.voc_res_path = voc_res_path + self.voc_config = os.path.join(voc_res_path, + pretrained_models[voc_tag]['config']) + self.voc_ckpt = os.path.join(voc_res_path, + pretrained_models[voc_tag]['ckpt']) + self.voc_stat = os.path.join( + voc_res_path, pretrained_models[voc_tag]['speech_stats']) + logger.info(voc_res_path) + logger.info(self.voc_config) + logger.info(self.voc_ckpt) + else: + self.voc_config = os.path.abspath(voc_config) + self.voc_ckpt = os.path.abspath(voc_ckpt) + self.voc_stat = os.path.abspath(voc_stat) + self.voc_res_path = os.path.dirname( + os.path.abspath(self.voc_config)) + + # Init body. 
+ with open(self.am_config) as f: + self.am_config = CfgNode(yaml.safe_load(f)) + with open(self.voc_config) as f: + self.voc_config = CfgNode(yaml.safe_load(f)) + + with open(self.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + self.vocab_size = len(phn_id) + print("vocab_size:", self.vocab_size) + + # frontend + if lang == 'zh': + self.frontend = Frontend( + phone_vocab_path=self.phones_dict, + tone_vocab_path=self.tones_dict) + + elif lang == 'en': + self.frontend = English(phone_vocab_path=self.phones_dict) + print("frontend done!") + + # am infer info + self.am_name = am[:am.rindex('_')] + if self.am_name == "fastspeech2_cnndecoder": + self.am_inference, self.am_mu, self.am_std = self.get_model_info( + "am", "fastspeech2", self.am_ckpt, self.am_stat) + else: + am, am_mu, am_std = self.get_model_info("am", self.am_name, + self.am_ckpt, self.am_stat) + am_normalizer = ZScore(am_mu, am_std) + am_inference_class = dynamic_import(self.am_name + '_inference', + model_alias) + self.am_inference = am_inference_class(am_normalizer, am) + self.am_inference.eval() + print("acoustic model done!") + + # voc infer info + self.voc_name = voc[:voc.rindex('_')] + voc, voc_mu, voc_std = self.get_model_info("voc", self.voc_name, + self.voc_ckpt, self.voc_stat) + voc_normalizer = ZScore(voc_mu, voc_std) + voc_inference_class = dynamic_import(self.voc_name + '_inference', + model_alias) + self.voc_inference = voc_inference_class(voc_normalizer, voc) + self.voc_inference.eval() + print("voc done!") + + def get_phone(self, sentence, lang, merge_sentences, get_tone_ids): + tone_ids = None + if lang == 'zh': + input_ids = self.frontend.get_input_ids( + sentence, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids) + phone_ids = input_ids["phone_ids"] + if get_tone_ids: + tone_ids = input_ids["tone_ids"] + elif lang == 'en': + input_ids = self.frontend.get_input_ids( + sentence, merge_sentences=merge_sentences) + phone_ids = input_ids["phone_ids"] + else: + print("lang should in {'zh', 'en'}!") + + def depadding(self, data, chunk_num, chunk_id, block, pad, upsample): + """ + Streaming inference removes the result of pad inference + """ + front_pad = min(chunk_id * block, pad) + # first chunk + if chunk_id == 0: + data = data[:block * upsample] + # last chunk + elif chunk_id == chunk_num - 1: + data = data[front_pad * upsample:] + # middle chunk + else: + data = data[front_pad * upsample:(front_pad + block) * upsample] + + return data @paddle.no_grad() def infer( @@ -37,16 +335,19 @@ class TTSServerExecutor(TTSExecutor): text: str, lang: str='zh', am: str='fastspeech2_csmsc', - spk_id: int=0, - am_block: int=42, - am_pad: int=12, - voc_block: int=14, - voc_pad: int=14, ): + spk_id: int=0, ): """ Model inference and result stored in self.output. 
""" - am_name = am[:am.rindex('_')] - am_dataset = am[am.rindex('_') + 1:] + + am_block = self.am_block + am_pad = self.am_pad + am_upsample = 1 + voc_block = self.voc_block + voc_pad = self.voc_pad + voc_upsample = self.voc_config.n_shift + flag = 1 + get_tone_ids = False merge_sentences = False frontend_st = time.time() @@ -64,43 +365,99 @@ class TTSServerExecutor(TTSExecutor): phone_ids = input_ids["phone_ids"] else: print("lang should in {'zh', 'en'}!") - self.frontend_time = time.time() - frontend_st + frontend_et = time.time() + self.frontend_time = frontend_et - frontend_st for i in range(len(phone_ids)): - am_st = time.time() part_phone_ids = phone_ids[i] - # am - if am_name == 'speedyspeech': - part_tone_ids = tone_ids[i] - mel = self.am_inference(part_phone_ids, part_tone_ids) - # fastspeech2 + voc_chunk_id = 0 + + # fastspeech2_csmsc + if am == "fastspeech2_csmsc": + # am + mel = self.am_inference(part_phone_ids) + if flag == 1: + first_am_et = time.time() + self.first_am_infer = first_am_et - frontend_et + + # voc streaming + mel_chunks = get_chunks(mel, voc_block, voc_pad, "voc") + voc_chunk_num = len(mel_chunks) + voc_st = time.time() + for i, mel_chunk in enumerate(mel_chunks): + sub_wav = self.voc_inference(mel_chunk) + sub_wav = self.depadding(sub_wav, voc_chunk_num, i, + voc_block, voc_pad, voc_upsample) + if flag == 1: + first_voc_et = time.time() + self.first_voc_infer = first_voc_et - first_am_et + self.first_response_time = first_voc_et - frontend_st + flag = 0 + + yield sub_wav + + # fastspeech2_cnndecoder_csmsc + elif am == "fastspeech2_cnndecoder_csmsc": + # am + orig_hs, h_masks = self.am_inference.encoder_infer( + part_phone_ids) + + # streaming voc chunk info + mel_len = orig_hs.shape[1] + voc_chunk_num = math.ceil(mel_len / self.voc_block) + start = 0 + end = min(self.voc_block + self.voc_pad, mel_len) + + # streaming am + hss = get_chunks(orig_hs, self.am_block, self.am_pad, "am") + am_chunk_num = len(hss) + for i, hs in enumerate(hss): + before_outs, _ = self.am_inference.decoder(hs) + after_outs = before_outs + self.am_inference.postnet( + before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) + normalized_mel = after_outs[0] + sub_mel = denorm(normalized_mel, self.am_mu, self.am_std) + sub_mel = self.depadding(sub_mel, am_chunk_num, i, am_block, + am_pad, am_upsample) + + if i == 0: + mel_streaming = sub_mel + else: + mel_streaming = np.concatenate( + (mel_streaming, sub_mel), axis=0) + + # streaming voc + while (mel_streaming.shape[0] >= end and + voc_chunk_id < voc_chunk_num): + if flag == 1: + first_am_et = time.time() + self.first_am_infer = first_am_et - frontend_et + voc_chunk = mel_streaming[start:end, :] + voc_chunk = paddle.to_tensor(voc_chunk) + sub_wav = self.voc_inference(voc_chunk) + + sub_wav = self.depadding(sub_wav, voc_chunk_num, + voc_chunk_id, voc_block, + voc_pad, voc_upsample) + if flag == 1: + first_voc_et = time.time() + self.first_voc_infer = first_voc_et - first_am_et + self.first_response_time = first_voc_et - frontend_st + flag = 0 + + yield sub_wav + + voc_chunk_id += 1 + start = max(0, voc_chunk_id * voc_block - voc_pad) + end = min((voc_chunk_id + 1) * voc_block + voc_pad, + mel_len) + else: - # multi speaker - if am_dataset in {"aishell3", "vctk"}: - mel = self.am_inference( - part_phone_ids, spk_id=paddle.to_tensor(spk_id)) - else: - mel = self.am_inference(part_phone_ids) - am_et = time.time() - - # voc streaming - voc_upsample = self.voc_config.n_shift - mel_chunks = get_chunks(mel, voc_block, voc_pad, "voc") - chunk_num = 
len(mel_chunks) - voc_st = time.time() - for i, mel_chunk in enumerate(mel_chunks): - sub_wav = self.voc_inference(mel_chunk) - front_pad = min(i * voc_block, voc_pad) - - if i == 0: - sub_wav = sub_wav[:voc_block * voc_upsample] - elif i == chunk_num - 1: - sub_wav = sub_wav[front_pad * voc_upsample:] - else: - sub_wav = sub_wav[front_pad * voc_upsample:( - front_pad + voc_block) * voc_upsample] - - yield sub_wav + logger.error( + "Only support fastspeech2_csmsc or fastspeech2_cnndecoder_csmsc on streaming tts." + ) + + self.final_response_time = time.time() - frontend_st class TTSEngine(BaseEngine): @@ -116,11 +473,18 @@ class TTSEngine(BaseEngine): super(TTSEngine, self).__init__() def init(self, config: dict) -> bool: - self.executor = TTSServerExecutor() self.config = config - assert "fastspeech2_csmsc" in config.am and ( - config.voc == "hifigan_csmsc-zh" or config.voc == "mb_melgan_csmsc" + assert ( + config.am == "fastspeech2_csmsc" or + config.am == "fastspeech2_cnndecoder_csmsc" + ) and ( + config.voc == "hifigan_csmsc" or config.voc == "mb_melgan_csmsc" ), 'Please check config, am support: fastspeech2, voc support: hifigan_csmsc-zh or mb_melgan_csmsc.' + + assert ( + config.voc_block > 0 and config.voc_pad > 0 + ), "Please set correct voc_block and voc_pad, they should be more than 0." + try: if self.config.device: self.device = self.config.device @@ -135,6 +499,9 @@ class TTSEngine(BaseEngine): (self.device)) return False + self.executor = TTSServerExecutor(config.am_block, config.am_pad, + config.voc_block, config.voc_pad) + try: self.executor._init_from_path( am=self.config.am, @@ -155,15 +522,42 @@ class TTSEngine(BaseEngine): (self.device)) return False - self.am_block = self.config.am_block - self.am_pad = self.config.am_pad - self.voc_block = self.config.voc_block - self.voc_pad = self.config.voc_pad - logger.info("Initialize TTS server engine successfully on device: %s." % (self.device)) + + # warm up + try: + self.warm_up() + except Exception as e: + logger.error("Failed to warm up on tts engine.") + return False + return True + def warm_up(self): + """warm up + """ + if self.config.lang == 'zh': + sentence = "您好,欢迎使用语音合成服务。" + if self.config.lang == 'en': + sentence = "Hello and welcome to the speech synthesis service." + logger.info( + "*******************************warm up ********************************" + ) + for i in range(3): + for wav in self.executor.infer( + text=sentence, + lang=self.config.lang, + am=self.config.am, + spk_id=0, ): + logger.info( + f"The first response time of the {i} warm up: {self.executor.first_response_time} s" + ) + break + logger.info( + "**********************************************************************" + ) + def preprocess(self, text_bese64: str=None, text_bytes: bytes=None): # Convert byte to text if text_bese64: @@ -195,18 +589,14 @@ class TTSEngine(BaseEngine): wav_base64: The base64 format of the synthesized audio. 
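+            wav_base64 is yielded chunk by chunk as synthesis proceeds
+            (this method is a streaming generator).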
""" - lang = self.config.lang wav_list = [] for wav in self.executor.infer( text=sentence, - lang=lang, + lang=self.config.lang, am=self.config.am, - spk_id=spk_id, - am_block=self.am_block, - am_pad=self.am_pad, - voc_block=self.voc_block, - voc_pad=self.voc_pad): + spk_id=spk_id, ): + # wav type: float32, convert to pcm (base64) wav = float2pcm(wav) # float32 to int16 wav_bytes = wav.tobytes() # to bytes @@ -216,5 +606,14 @@ class TTSEngine(BaseEngine): yield wav_base64 wav_all = np.concatenate(wav_list, axis=0) - logger.info("The durations of audio is: {} s".format( - len(wav_all) / self.executor.am_config.fs)) + duration = len(wav_all) / self.executor.am_config.fs + logger.info(f"sentence: {sentence}") + logger.info(f"The durations of audio is: {duration} s") + logger.info( + f"first response time: {self.executor.first_response_time} s") + logger.info( + f"final response time: {self.executor.final_response_time} s") + logger.info(f"RTF: {self.executor.final_response_time / duration}") + logger.info( + f"Other info: front time: {self.executor.frontend_time} s, first am infer time: {self.executor.first_am_infer} s, first voc infer time: {self.executor.first_voc_infer} s," + ) diff --git a/paddlespeech/server/utils/util.py b/paddlespeech/server/utils/util.py index 0fe70849..72ee0060 100644 --- a/paddlespeech/server/utils/util.py +++ b/paddlespeech/server/utils/util.py @@ -52,6 +52,10 @@ def get_chunks(data, block_size, pad_size, step): Returns: list: chunks list """ + + if block_size == -1: + return [data] + if step == "am": data_len = data.shape[1] elif step == "voc": From 00a6236fe2c0affa3093551c1d88f0a92b2d0a42 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 18 Apr 2022 17:31:47 +0800 Subject: [PATCH 5/7] remove test code, test=doc --- paddlespeech/server/tests/tts/infer/run.sh | 62 -- .../server/tests/tts/infer/test_online_tts.py | 610 ------------------ 2 files changed, 672 deletions(-) delete mode 100644 paddlespeech/server/tests/tts/infer/run.sh delete mode 100644 paddlespeech/server/tests/tts/infer/test_online_tts.py diff --git a/paddlespeech/server/tests/tts/infer/run.sh b/paddlespeech/server/tests/tts/infer/run.sh deleted file mode 100644 index 3733c3fb..00000000 --- a/paddlespeech/server/tests/tts/infer/run.sh +++ /dev/null @@ -1,62 +0,0 @@ -model_path=~/.paddlespeech/models/ -am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ -voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ -testdata=../../../../t2s/exps/csmsc_test.txt - -# get am file -for file in $(ls $am_model_dir) -do - if [[ $file == *"yaml"* ]]; then - am_config_file=$file - elif [[ $file == *"pdz"* ]]; then - am_ckpt_file=$file - elif [[ $file == *"stat"* ]]; then - am_stat_file=$file - elif [[ $file == *"phone"* ]]; then - phones_dict_file=$file - fi - -done - -# get voc file -for file in $(ls $voc_model_dir) -do - if [[ $file == *"yaml"* ]]; then - voc_config_file=$file - elif [[ $file == *"pdz"* ]]; then - voc_ckpt_file=$file - elif [[ $file == *"stat"* ]]; then - voc_stat_file=$file - fi - -done - - -# run test -# am can choose fastspeech2_csmsc or fastspeech2_cnndecoder_csmsc, where fastspeech2_cnndecoder_csmsc supports streaming inference. -# voc can choose hifigan_csmsc and mb_melgan_csmsc, They can both support streaming inference. -# When am is fastspeech2_cnndecoder_csmsc and am_pad is set to 12, there is no diff between streaming and non-streaming inference results. 
-# When voc is mb_melgan_csmsc and voc_pad is set to 14, there is no diff between streaming and non-streaming inference results. -# When voc is hifigan_csmsc and voc_pad is set to 20, there is no diff between streaming and non-streaming inference results. - -python test_online_tts.py --am fastspeech2_cnndecoder_csmsc \ - --am_config $am_model_dir/$am_config_file \ - --am_ckpt $am_model_dir/$am_ckpt_file \ - --am_stat $am_model_dir/$am_stat_file \ - --phones_dict $am_model_dir/$phones_dict_file \ - --voc mb_melgan_csmsc \ - --voc_config $voc_model_dir/$voc_config_file \ - --voc_ckpt $voc_model_dir/$voc_ckpt_file \ - --voc_stat $voc_model_dir/$voc_stat_file \ - --lang zh \ - --device cpu \ - --text $testdata \ - --output_dir ./output \ - --log_file ./result.log \ - --am_streaming True \ - --am_pad 12 \ - --am_block 42 \ - --voc_streaming True \ - --voc_pad 14 \ - --voc_block 14 \ - diff --git a/paddlespeech/server/tests/tts/infer/test_online_tts.py b/paddlespeech/server/tests/tts/infer/test_online_tts.py deleted file mode 100644 index eb5fc80b..00000000 --- a/paddlespeech/server/tests/tts/infer/test_online_tts.py +++ /dev/null @@ -1,610 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import logging -import math -import threading -import time -from pathlib import Path - -import numpy as np -import paddle -import soundfile as sf -import yaml -from yacs.config import CfgNode - -from paddlespeech.s2t.utils.dynamic_import import dynamic_import -from paddlespeech.t2s.exps.syn_utils import get_am_inference -from paddlespeech.t2s.exps.syn_utils import get_frontend -from paddlespeech.t2s.exps.syn_utils import get_sentences -from paddlespeech.t2s.exps.syn_utils import get_voc_inference -from paddlespeech.t2s.exps.syn_utils import model_alias -from paddlespeech.t2s.utils import str2bool - -mel_streaming = None -wav_streaming = None -streaming_first_time = 0.0 -streaming_voc_st = 0.0 -sample_rate = 0 - - -def denorm(data, mean, std): - return data * std + mean - - -def get_chunks(data, block_size, pad_size, step): - if step == "am": - data_len = data.shape[1] - elif step == "voc": - data_len = data.shape[0] - else: - print("Please set correct type to get chunks, am or voc") - - chunks = [] - n = math.ceil(data_len / block_size) - for i in range(n): - start = max(0, i * block_size - pad_size) - end = min((i + 1) * block_size + pad_size, data_len) - if step == "am": - chunks.append(data[:, start:end, :]) - elif step == "voc": - chunks.append(data[start:end, :]) - else: - print("Please set correct type to get chunks, am or voc") - return chunks - - -def get_streaming_am_inference(args, am_config): - with open(args.phones_dict, "r") as f: - phn_id = [line.strip().split() for line in f.readlines()] - vocab_size = len(phn_id) - print("vocab_size:", vocab_size) - - am_name = "fastspeech2" - odim = am_config.n_mels - - am_class = dynamic_import(am_name, model_alias) - am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) - am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) - am.eval() - am_mu, am_std = np.load(args.am_stat) - am_mu = paddle.to_tensor(am_mu) - am_std = paddle.to_tensor(am_std) - - return am, am_mu, am_std - - -def init(args): - global sample_rate - # get config - with open(args.am_config) as f: - am_config = CfgNode(yaml.safe_load(f)) - with open(args.voc_config) as f: - voc_config = CfgNode(yaml.safe_load(f)) - - sample_rate = am_config.fs - - # frontend - frontend = get_frontend(args) - - # acoustic model - if args.am == 'fastspeech2_cnndecoder_csmsc': - am, am_mu, am_std = get_streaming_am_inference(args, am_config) - am_infer_info = [am, am_mu, am_std, am_config] - else: - am_inference, am_name, am_dataset = get_am_inference(args, am_config) - am_infer_info = [am_inference, am_name, am_dataset, am_config] - - # vocoder - voc_inference = get_voc_inference(args, voc_config) - voc_infer_info = [voc_inference, voc_config] - - return frontend, am_infer_info, voc_infer_info - - -def get_phone(args, frontend, sentence, merge_sentences, get_tone_ids): - am_name = args.am[:args.am.rindex('_')] - tone_ids = None - - if args.lang == 'zh': - input_ids = frontend.get_input_ids( - sentence, - merge_sentences=merge_sentences, - get_tone_ids=get_tone_ids) - phone_ids = input_ids["phone_ids"] - if get_tone_ids: - tone_ids = input_ids["tone_ids"] - elif args.lang == 'en': - input_ids = frontend.get_input_ids( - sentence, merge_sentences=merge_sentences) - phone_ids = input_ids["phone_ids"] - else: - print("lang should in {'zh', 'en'}!") - - return phone_ids, tone_ids - - -@paddle.no_grad() -# 生成完整的mel -def gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids): - # 如果是支持流式的AM模型 - if args.am == 'fastspeech2_cnndecoder_csmsc': - am, am_mu, am_std, 
am_config = am_infer_info - orig_hs, h_masks = am.encoder_infer(part_phone_ids) - if args.am_streaming: - am_pad = args.am_pad - am_block = args.am_block - hss = get_chunks(orig_hs, am_block, am_pad, "am") - chunk_num = len(hss) - mel_list = [] - for i, hs in enumerate(hss): - before_outs, _ = am.decoder(hs) - after_outs = before_outs + am.postnet( - before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) - normalized_mel = after_outs[0] - sub_mel = denorm(normalized_mel, am_mu, am_std) - # clip output part of pad - if i == 0: - sub_mel = sub_mel[:-am_pad] - elif i == chunk_num - 1: - # 最后一块的右侧一定没有 pad 够 - sub_mel = sub_mel[am_pad:] - else: - # 倒数几块的右侧也可能没有 pad 够 - sub_mel = sub_mel[am_pad:(am_block + am_pad) - - sub_mel.shape[0]] - mel_list.append(sub_mel) - mel = paddle.concat(mel_list, axis=0) - - else: - orig_hs, h_masks = am.encoder_infer(part_phone_ids) - before_outs, _ = am.decoder(orig_hs) - after_outs = before_outs + am.postnet( - before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) - normalized_mel = after_outs[0] - mel = denorm(normalized_mel, am_mu, am_std) - - else: - am_inference, am_name, am_dataset, am_config = am_infer_info - mel = am_inference(part_phone_ids) - - return mel - - -@paddle.no_grad() -def streaming_voc_infer(args, voc_infer_info, mel_len): - global mel_streaming - global streaming_first_time - global wav_streaming - voc_inference, voc_config = voc_infer_info - block = args.voc_block - pad = args.voc_pad - upsample = voc_config.n_shift - wav_list = [] - flag = 1 - - valid_start = 0 - valid_end = min(valid_start + block, mel_len) - actual_start = 0 - actual_end = min(valid_end + pad, mel_len) - mel_chunk = mel_streaming[actual_start:actual_end, :] - - while valid_end <= mel_len: - sub_wav = voc_inference(mel_chunk) - if flag == 1: - streaming_first_time = time.time() - flag = 0 - - # get valid wav - start = valid_start - actual_start - if valid_end == mel_len: - sub_wav = sub_wav[start * upsample:] - wav_list.append(sub_wav) - break - else: - end = start + block - sub_wav = sub_wav[start * upsample:end * upsample] - wav_list.append(sub_wav) - - # generate new mel chunk - valid_start = valid_end - valid_end = min(valid_start + block, mel_len) - if valid_start - pad < 0: - actual_start = 0 - else: - actual_start = valid_start - pad - actual_end = min(valid_end + pad, mel_len) - mel_chunk = mel_streaming[actual_start:actual_end, :] - - wav = paddle.concat(wav_list, axis=0) - wav_streaming = wav - - -@paddle.no_grad() -# 非流式AM / 流式AM + 非流式Voc -def am_nonstreaming_voc(args, am_infer_info, voc_infer_info, part_phone_ids, - part_tone_ids): - mel = gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids) - am_infer_time = time.time() - voc_inference, voc_config = voc_infer_info - wav = voc_inference(mel) - first_response_time = time.time() - final_response_time = first_response_time - voc_infer_time = first_response_time - - return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav - - -@paddle.no_grad() -# 非流式AM + 流式Voc -def nonstreaming_am_streaming_voc(args, am_infer_info, voc_infer_info, - part_phone_ids, part_tone_ids): - global mel_streaming - global streaming_first_time - global wav_streaming - - mel = gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids) - am_infer_time = time.time() - - # voc streaming - mel_streaming = mel - mel_len = mel.shape[0] - streaming_voc_infer(args, voc_infer_info, mel_len) - first_response_time = streaming_first_time - wav = wav_streaming - final_response_time = time.time() - voc_infer_time = 
final_response_time - - return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav - - -@paddle.no_grad() -# 流式AM + 流式 Voc -def streaming_am_streaming_voc(args, am_infer_info, voc_infer_info, - part_phone_ids, part_tone_ids): - global mel_streaming - global streaming_first_time - global wav_streaming - global streaming_voc_st - mel_streaming = None - #用来表示开启流式voc的线程 - flag = 1 - - am, am_mu, am_std, am_config = am_infer_info - orig_hs, h_masks = am.encoder_infer(part_phone_ids) - mel_len = orig_hs.shape[1] - am_block = args.am_block - am_pad = args.am_pad - hss = get_chunks(orig_hs, am_block, am_pad, "am") - chunk_num = len(hss) - - for i, hs in enumerate(hss): - before_outs, _ = am.decoder(hs) - after_outs = before_outs + am.postnet( - before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) - normalized_mel = after_outs[0] - sub_mel = denorm(normalized_mel, am_mu, am_std) - # clip output part of pad - if i == 0: - sub_mel = sub_mel[:-am_pad] - mel_streaming = sub_mel - elif i == chunk_num - 1: - # 最后一块的右侧一定没有 pad 够 - sub_mel = sub_mel[am_pad:] - mel_streaming = paddle.concat([mel_streaming, sub_mel]) - am_infer_time = time.time() - else: - # 倒数几块的右侧也可能没有 pad 够 - sub_mel = sub_mel[am_pad:(am_block + am_pad) - sub_mel.shape[0]] - mel_streaming = paddle.concat([mel_streaming, sub_mel]) - - if flag and mel_streaming.shape[0] > args.voc_block + args.voc_pad: - t = threading.Thread( - target=streaming_voc_infer, - args=(args, voc_infer_info, mel_len, )) - t.start() - streaming_voc_st = time.time() - flag = 0 - - t.join() - final_response_time = time.time() - voc_infer_time = final_response_time - first_response_time = streaming_first_time - wav = wav_streaming - - return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav - - -def warm_up(args, logger, frontend, am_infer_info, voc_infer_info): - global sample_rate - logger.info( - "Before the formal test, we test a few texts to make the inference speed more stable." - ) - if args.lang == 'zh': - sentence = "您好,欢迎使用语音合成服务。" - if args.lang == 'en': - sentence = "Hello and welcome to the speech synthesis service." 
- - if args.voc_streaming: - if args.am_streaming: - infer_func = streaming_am_streaming_voc - else: - infer_func = nonstreaming_am_streaming_voc - else: - infer_func = am_nonstreaming_voc - - merge_sentences = True - get_tone_ids = False - for i in range(5): # 推理5次 - st = time.time() - phone_ids, tone_ids = get_phone(args, frontend, sentence, - merge_sentences, get_tone_ids) - part_phone_ids = phone_ids[0] - if tone_ids: - part_tone_ids = tone_ids[0] - else: - part_tone_ids = None - - am_infer_time, voc_infer_time, first_response_time, final_response_time, wav = infer_func( - args, am_infer_info, voc_infer_info, part_phone_ids, part_tone_ids) - wav = wav.numpy() - duration = wav.size / sample_rate - logger.info( - f"sentence: {sentence}; duration: {duration} s; first response time: {first_response_time - st} s; final response time: {final_response_time - st} s" - ) - - -def evaluate(args, logger, frontend, am_infer_info, voc_infer_info): - global sample_rate - sentences = get_sentences(args) - - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - get_tone_ids = False - merge_sentences = True - - # choose infer function - if args.voc_streaming: - if args.am_streaming: - infer_func = streaming_am_streaming_voc - else: - infer_func = nonstreaming_am_streaming_voc - else: - infer_func = am_nonstreaming_voc - - final_up_duration = 0.0 - sentence_count = 0 - front_time_list = [] - am_time_list = [] - voc_time_list = [] - first_response_list = [] - final_response_list = [] - sentence_length_list = [] - duration_list = [] - - for utt_id, sentence in sentences: - # front - front_st = time.time() - phone_ids, tone_ids = get_phone(args, frontend, sentence, - merge_sentences, get_tone_ids) - part_phone_ids = phone_ids[0] - if tone_ids: - part_tone_ids = tone_ids[0] - else: - part_tone_ids = None - front_et = time.time() - front_time = front_et - front_st - - am_st = time.time() - am_infer_time, voc_infer_time, first_response_time, final_response_time, wav = infer_func( - args, am_infer_info, voc_infer_info, part_phone_ids, part_tone_ids) - am_time = am_infer_time - am_st - if args.voc_streaming and args.am_streaming: - voc_time = voc_infer_time - streaming_voc_st - else: - voc_time = voc_infer_time - am_infer_time - - first_response = first_response_time - front_st - final_response = final_response_time - front_st - - wav = wav.numpy() - duration = wav.size / sample_rate - sf.write( - str(output_dir / (utt_id + ".wav")), wav, samplerate=sample_rate) - print(f"{utt_id} done!") - - sentence_count += 1 - front_time_list.append(front_time) - am_time_list.append(am_time) - voc_time_list.append(voc_time) - first_response_list.append(first_response) - final_response_list.append(final_response) - sentence_length_list.append(len(sentence)) - duration_list.append(duration) - - logger.info( - f"uttid: {utt_id}; sentence: '{sentence}'; front time: {front_time} s; am time: {am_time} s; voc time: {voc_time} s; \ - first response time: {first_response} s; final response time: {final_response} s; audio duration: {duration} s;" - ) - - if final_response > duration: - final_up_duration += 1 - - all_time_sum = sum(final_response_list) - front_rate = sum(front_time_list) / all_time_sum - am_rate = sum(am_time_list) / all_time_sum - voc_rate = sum(voc_time_list) / all_time_sum - rtf = all_time_sum / sum(duration_list) - - logger.info( - f"The length of test text information, test num: {sentence_count}; text num: {sum(sentence_length_list)}; min: {min(sentence_length_list)}; max: 
{max(sentence_length_list)}; avg: {sum(sentence_length_list)/len(sentence_length_list)}" - ) - logger.info( - f"duration information, min: {min(duration_list)}; max: {max(duration_list)}; avg: {sum(duration_list) / len(duration_list)}; sum: {sum(duration_list)}" - ) - logger.info( - f"Front time information: min: {min(front_time_list)} s; max: {max(front_time_list)} s; avg: {sum(front_time_list)/len(front_time_list)} s; ratio: {front_rate * 100}%" - ) - logger.info( - f"AM time information: min: {min(am_time_list)} s; max: {max(am_time_list)} s; avg: {sum(am_time_list)/len(am_time_list)} s; ratio: {am_rate * 100}%" - ) - logger.info( - f"Vocoder time information: min: {min(voc_time_list)} s, max: {max(voc_time_list)} s; avg: {sum(voc_time_list)/len(voc_time_list)} s; ratio: {voc_rate * 100}%" - ) - logger.info( - f"first response time information: min: {min(first_response_list)} s; max: {max(first_response_list)} s; avg: {sum(first_response_list)/len(first_response_list)} s" - ) - logger.info( - f"final response time information: min: {min(final_response_list)} s; max: {max(final_response_list)} s; avg: {sum(final_response_list)/len(final_response_list)} s" - ) - logger.info(f"RTF is: {rtf}") - logger.info( - f"The number of final_response is greater than duration is {final_up_duration}, ratio: {final_up_duration / sentence_count}%" - ) - - -def parse_args(): - # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser( - description="Synthesize with acoustic model & vocoder") - # acoustic model - parser.add_argument( - '--am', - type=str, - default='fastspeech2_csmsc', - choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc'], - help='Choose acoustic model type of tts task. where fastspeech2_cnndecoder_csmsc supports streaming inference' - ) - - parser.add_argument( - '--am_config', - type=str, - default=None, - help='Config of acoustic model. Use deault config when it is None.') - parser.add_argument( - '--am_ckpt', - type=str, - default=None, - help='Checkpoint file of acoustic model.') - parser.add_argument( - "--am_stat", - type=str, - default=None, - help="mean and standard deviation used to normalize spectrogram when training acoustic model." - ) - parser.add_argument( - "--phones_dict", type=str, default=None, help="phone vocabulary file.") - parser.add_argument( - "--tones_dict", type=str, default=None, help="tone vocabulary file.") - # vocoder - parser.add_argument( - '--voc', - type=str, - default='mb_melgan_csmsc', - choices=['mb_melgan_csmsc', 'hifigan_csmsc'], - help='Choose vocoder type of tts task.') - parser.add_argument( - '--voc_config', - type=str, - default=None, - help='Config of voc. Use deault config when it is None.') - parser.add_argument( - '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.') - parser.add_argument( - "--voc_stat", - type=str, - default=None, - help="mean and standard deviation used to normalize spectrogram when training voc." - ) - # other - parser.add_argument( - '--lang', - type=str, - default='zh', - choices=['zh', 'en'], - help='Choose model language. 
zh or en') - - parser.add_argument( - "--device", type=str, default='cpu', help="set cpu or gpu:id") - - parser.add_argument( - "--text", - type=str, - default="./csmsc_test.txt", - help="text to synthesize, a 'utt_id sentence' pair per line.") - parser.add_argument("--output_dir", type=str, help="output dir.") - parser.add_argument( - "--log_file", type=str, default="result.log", help="log file.") - - parser.add_argument( - "--am_streaming", - type=str2bool, - default=False, - help="whether use streaming acoustic model") - - parser.add_argument("--am_pad", type=int, default=12, help="am pad size.") - - parser.add_argument( - "--am_block", type=int, default=42, help="am block size.") - - parser.add_argument( - "--voc_streaming", - type=str2bool, - default=False, - help="whether use streaming vocoder model") - - parser.add_argument("--voc_pad", type=int, default=14, help="voc pad size.") - - parser.add_argument( - "--voc_block", type=int, default=14, help="voc block size.") - - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - paddle.set_device(args.device) - if args.am_streaming: - assert (args.am == 'fastspeech2_cnndecoder_csmsc') - - logger = logging.getLogger() - fhandler = logging.FileHandler(filename=args.log_file, mode='w') - formatter = logging.Formatter( - '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s' - ) - fhandler.setFormatter(formatter) - logger.addHandler(fhandler) - logger.setLevel(logging.DEBUG) - - # set basic information - logger.info( - f"AM: {args.am}; Vocoder: {args.voc}; device: {args.device}; am streaming: {args.am_streaming}; voc streaming: {args.voc_streaming}" - ) - logger.info( - f"am pad size: {args.am_pad}; am block size: {args.am_block}; voc pad size: {args.voc_pad}; voc block size: {args.voc_block};" - ) - - # get information about model - frontend, am_infer_info, voc_infer_info = init(args) - logger.info( - "************************ warm up *********************************") - warm_up(args, logger, frontend, am_infer_info, voc_infer_info) - logger.info( - "************************ normal test *******************************") - evaluate(args, logger, frontend, am_infer_info, voc_infer_info) - - -if __name__ == "__main__": - main() From 40dde22fc48f41cffdace68847ccbeb00cc1cef4 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Tue, 19 Apr 2022 12:59:48 +0800 Subject: [PATCH 6/7] code format, test=doc --- .../server/engine/tts/online/tts_engine.py | 44 ++++++++++++------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/paddlespeech/server/engine/tts/online/tts_engine.py b/paddlespeech/server/engine/tts/online/tts_engine.py index 8e76225d..a84644e7 100644 --- a/paddlespeech/server/engine/tts/online/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/tts_engine.py @@ -127,33 +127,40 @@ class TTSServerExecutor(TTSExecutor): self.voc_block = voc_block self.voc_pad = voc_pad - def get_model_info(self, step, model_name, ckpt, stat): + def get_model_info(self, + field: str, + model_name: str, + ckpt: Optional[os.PathLike], + stat: Optional[os.PathLike]): """get model information Args: - step (string): am or voc - model_name (string): model type, support fastspeech2, higigan, mb_melgan - ckpt (string): ckpt file - stat (string): stat file, including mean and standard deviation + field (str): am or voc + model_name (str): model type, support fastspeech2, higigan, mb_melgan + ckpt (Optional[os.PathLike]): ckpt file + stat (Optional[os.PathLike]): stat file, including mean and standard deviation 
Returns: - model, model_mu, model_std + [module]: model module + [Tensor]: mean + [Tensor]: standard deviation """ + model_class = dynamic_import(model_name, model_alias) - if step == "am": + if field == "am": odim = self.am_config.n_mels model = model_class( idim=self.vocab_size, odim=odim, **self.am_config["model"]) model.set_state_dict(paddle.load(ckpt)["main_params"]) - elif step == "voc": + elif field == "voc": model = model_class(**self.voc_config["generator_params"]) model.set_state_dict(paddle.load(ckpt)["generator_params"]) model.remove_weight_norm() else: - logger.error("Please set correct step, am or voc") + logger.error("Please set correct field, am or voc") model.eval() model_mu, model_std = np.load(stat) @@ -346,7 +353,8 @@ class TTSServerExecutor(TTSExecutor): voc_block = self.voc_block voc_pad = self.voc_pad voc_upsample = self.voc_config.n_shift - flag = 1 + # first_flag 用于标记首包 + first_flag = 1 get_tone_ids = False merge_sentences = False @@ -376,7 +384,7 @@ class TTSServerExecutor(TTSExecutor): if am == "fastspeech2_csmsc": # am mel = self.am_inference(part_phone_ids) - if flag == 1: + if first_flag == 1: first_am_et = time.time() self.first_am_infer = first_am_et - frontend_et @@ -388,11 +396,11 @@ class TTSServerExecutor(TTSExecutor): sub_wav = self.voc_inference(mel_chunk) sub_wav = self.depadding(sub_wav, voc_chunk_num, i, voc_block, voc_pad, voc_upsample) - if flag == 1: + if first_flag == 1: first_voc_et = time.time() self.first_voc_infer = first_voc_et - first_am_et self.first_response_time = first_voc_et - frontend_st - flag = 0 + first_flag = 0 yield sub_wav @@ -427,9 +435,10 @@ class TTSServerExecutor(TTSExecutor): (mel_streaming, sub_mel), axis=0) # streaming voc + # 当流式AM推理的mel帧数大于流式voc推理的chunk size,开始进行流式voc 推理 while (mel_streaming.shape[0] >= end and voc_chunk_id < voc_chunk_num): - if flag == 1: + if first_flag == 1: first_am_et = time.time() self.first_am_infer = first_am_et - frontend_et voc_chunk = mel_streaming[start:end, :] @@ -439,11 +448,11 @@ class TTSServerExecutor(TTSExecutor): sub_wav = self.depadding(sub_wav, voc_chunk_num, voc_chunk_id, voc_block, voc_pad, voc_upsample) - if flag == 1: + if first_flag == 1: first_voc_et = time.time() self.first_voc_infer = first_voc_et - first_am_et self.first_response_time = first_voc_et - frontend_st - flag = 0 + first_flag = 0 yield sub_wav @@ -470,7 +479,8 @@ class TTSEngine(BaseEngine): def __init__(self, name=None): """Initialize TTS server engine """ - super(TTSEngine, self).__init__() + #super(TTSEngine, self).__init__() + super().__init__() def init(self, config: dict) -> bool: self.config = config From 9e41ac8550b5f53b77ce3656e3561c58e0f25a82 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Tue, 19 Apr 2022 15:51:44 +0800 Subject: [PATCH 7/7] code format, test=doc --- paddlespeech/server/engine/tts/online/tts_engine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paddlespeech/server/engine/tts/online/tts_engine.py b/paddlespeech/server/engine/tts/online/tts_engine.py index a84644e7..c9135b88 100644 --- a/paddlespeech/server/engine/tts/online/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/tts_engine.py @@ -479,7 +479,6 @@ class TTSEngine(BaseEngine): def __init__(self, name=None): """Initialize TTS server engine """ - #super(TTSEngine, self).__init__() super().__init__() def init(self, config: dict) -> bool:
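Editor's note (illustration only, not part of the patch series): the streaming TTS path added above rests on two small helpers, get_chunks() and depadding(). get_chunks() slices the mel spectrogram into blocks of voc_block frames, each widened by up to voc_pad frames of context on both sides; the vocoder runs on every padded chunk; depadding() then drops the samples that were produced only from the pad context before the chunks are concatenated or yielded. The sketch below mirrors that bookkeeping with NumPy alone. The function names follow the patch, but the code is simplified (2-D mel only, no "am"/"voc" step argument), and toy_vocoder is a hypothetical stand-in that merely upsamples each frame, so streaming and non-streaming outputs match exactly.

import math

import numpy as np


def get_chunks(mel, block_size, pad_size):
    # Split mel (frames x bins) into ceil(frames / block_size) chunks,
    # each extended by up to pad_size frames of context left and right.
    n_frames = mel.shape[0]
    n_chunks = math.ceil(n_frames / block_size)
    chunks = []
    for i in range(n_chunks):
        start = max(0, i * block_size - pad_size)
        end = min((i + 1) * block_size + pad_size, n_frames)
        chunks.append(mel[start:end, :])
    return chunks


def depadding(data, chunk_num, chunk_id, block, pad, upsample):
    # Keep only the samples belonging to this chunk's own block of frames,
    # discarding whatever was synthesized from the pad context.
    front_pad = min(chunk_id * block, pad)
    if chunk_id == 0:  # first chunk: nothing was padded on the left
        return data[:block * upsample]
    if chunk_id == chunk_num - 1:  # last chunk: keep everything after the left pad
        return data[front_pad * upsample:]
    return data[front_pad * upsample:(front_pad + block) * upsample]


def toy_vocoder(mel_chunk, upsample):
    # Stand-in "vocoder": one mel frame -> `upsample` identical samples.
    # A real vocoder (HiFi-GAN, MB-MelGAN) maps frames to samples with a wide
    # receptive field, which is exactly why the pad context exists.
    return np.repeat(mel_chunk.mean(axis=1), upsample)


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    frames, bins, upsample = 123, 80, 300  # e.g. n_shift = 300 samples per frame
    block, pad = 14, 14                    # voc_block / voc_pad defaults from run.sh
    mel = rng.standard_normal((frames, bins)).astype("float32")

    reference = toy_vocoder(mel, upsample)  # non-streaming synthesis

    chunks = get_chunks(mel, block, pad)
    parts = [
        depadding(toy_vocoder(c, upsample), len(chunks), i, block, pad, upsample)
        for i, c in enumerate(chunks)
    ]
    streaming = np.concatenate(parts)

    print(streaming.shape == reference.shape)  # True
    print(np.allclose(streaming, reference))   # True for this frame-local vocoder

With a real vocoder the two outputs are not guaranteed to be bit-identical; the pad frames give each chunk enough receptive-field context that the difference becomes negligible, which is what the run.sh comments above mean when they say certain pad sizes leave "no diff" between streaming and non-streaming results.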