From 82992b3ed6eaffd78fa27fae57235488f2ded168 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 11 Apr 2022 11:00:04 +0800 Subject: [PATCH 1/7] add test code, test=doc --- .../server/tests/tts/infer/csmsc_test.txt | 100 +++ paddlespeech/server/tests/tts/infer/run.sh | 64 ++ .../server/tests/tts/infer/test_online_tts.py | 650 ++++++++++++++++++ 3 files changed, 814 insertions(+) create mode 100644 paddlespeech/server/tests/tts/infer/csmsc_test.txt create mode 100644 paddlespeech/server/tests/tts/infer/run.sh create mode 100644 paddlespeech/server/tests/tts/infer/test_online_tts.py diff --git a/paddlespeech/server/tests/tts/infer/csmsc_test.txt b/paddlespeech/server/tests/tts/infer/csmsc_test.txt new file mode 100644 index 00000000..d8cf367c --- /dev/null +++ b/paddlespeech/server/tests/tts/infer/csmsc_test.txt @@ -0,0 +1,100 @@ +009901 昨日,这名伤者与医生全部被警方依法刑事拘留。 +009902 钱伟长想到上海来办学校是经过深思熟虑的。 +009903 她见我一进门就骂,吃饭时也骂,骂得我抬不起头。 +009904 李述德在离开之前,只说了一句柱驼杀父亲了。 +009905 这种车票和保险单捆绑出售属于重复性购买。 +009906 戴佩妮的男友西米露接唱情歌,让她非常开心。 +009907 观大势,谋大局,出大策始终是该院的办院方针。 +009908 他们骑着摩托回家,正好为农忙时的父母帮忙。 +009909 但是因为还没到退休年龄,只能掰着指头捱日子。 +009910 这几天雨水不断,人们恨不得待在家里不出门。 +009911 没想到徐赟,张海翔两人就此玩起了人间蒸发。 +009912 藤村此番发言可能是为了凸显野田的领导能力。 +009913 程长庚,生在清王朝嘉庆年间,安徽的潜山小县。 +009914 南海海域综合补给基地码头项目正在论证中。 +009915 也就是说今晚成都市民极有可能再次看到飘雪。 +009916 随着天气转热,各地的游泳场所开始人头攒动。 +009917 更让徐先生纳闷的是,房客的手机也打不通了。 +009918 遇到颠簸时,应听从乘务员的安全指令,回座位坐好。 +009919 他在后面呆惯了,怕自己一插身后的人会不满,不敢排进去。 +009920 傍晚七个小人回来了,白雪公主说,你们就是我命中的七个小矮人吧。 +009921 他本想说,教育局管这个,他们是一路的,这样一管岂不是妓女起嫖客? +009922 一种表示商品所有权的财物证券,也称商品证券,如提货单,交货单。 +009923 会有很丰富的东西留下来,说都说不完。 +009924 这句话像从天而降,吓得四周一片寂静。 +009925 记者所在的是受害人家属所在的右区。 +009926 不管哈大爷去哪,它都一步不离地跟着。 +009927 大家抬头望去,一只老鼠正趴在吊顶上。 +009928 我决定过年就辞职,接手我爸的废品站! +009929 最终,中国男子乒乓球队获得此奖项。 +009930 防汛抗旱两手抓,抗旱相对抓的不够。 +009931 图们江下游地区开发开放的进展如何? +009932 这要求中国必须有一个坚强的政党领导。 +009933 再说,关于利益上的事俺俩都不好开口。 +009934 明代瓦剌,鞑靼入侵明境也是通过此地。 +009935 咪咪舔着孩子,把它身上的毛舔干净。 +009936 是否这次的国标修订被大企业绑架了? +009937 判决后,姚某妻子胡某不服,提起上诉。 +009938 由此可以看出邯钢的经济效益来自何处。 +009939 琳达说,是瑜伽改变了她和马儿的生活。 +009940 楼下的保安告诉记者,这里不租也不卖。 +009941 习近平说,中斯两国人民传统友谊深厚。 +009942 传闻越来越多,后来连老汉儿自己都怕了。 +009943 我怒吼一声冲上去,举起砖头砸了过去。 +009944 我现在还不会,这就回去问问发明我的人。 +009945 显然,洛阳性奴案不具备上述两个前提。 +009946 另外,杰克逊有文唇线,眼线,眉毛的动作。 +009947 昨晚,华西都市报记者电话采访了尹琪。 +009948 涅拉季科未透露这些航空公司的名称。 +009949 从运行轨迹上来说,它也不可能是星星。 +009950 目前看,如果继续加息也存在两难问题。 +009951 曾宝仪在节目录制现场大爆观众糗事。 +009952 但任凭周某怎么叫,男子仍酣睡不醒。 +009953 老大爷说,小子,你挡我财路了,知道不? +009954 没料到,闯下大头佛的阿伟还不知悔改。 +009955 卡扎菲部落式统治已遭遇部落内讧。 +009956 这个孩子的生命一半来源于另一位女士捐赠的冷冻卵子。 +009957 出现这种泥鳅内阁的局面既是野田有意为之,也实属无奈。 +009958 济青高速济南,华山,章丘,邹平,周村,淄博,临淄站。 +009959 赵凌飞的话,反映了沈阳赛区所有奥运志愿者的共同心声。 +009960 因为,我们所发出的力量必会因难度加大而减弱。 +009961 发生事故的楼梯拐角处仍可看到血迹。 +009962 想过进公安,可能身高不够,老汉儿也不让我进去。 +009963 路上关卡很多,为了方便撤离,只好轻装前进。 +009964 原来比尔盖茨就是美国微软公司联合创始人呀。 +009965 之后他们一家三口将与双方父母往峇里岛旅游。 +009966 谢谢总理,也感谢广大网友的参与,我们明年再见。 +009967 事实上是,从来没有一个欺善怕恶的人能作出过稍大一点的成就。 +009968 我会打开邮件,你可以从那里继续。 +009969 美方对近期东海局势表示关切。 +009970 据悉,奥巴马一家人对这座冬季白宫极为满意。 +009971 打扫完你会很有成就感的,试一试,你就信了。 +009972 诺曼站在滑板车上,各就各位,准备出发啦! +009973 塔河的寒夜,气温降到了零下三十多摄氏度。 +009974 其间,连破六点六,六点五,六点四,六点三五等多个重要关口。 +009975 算命其实只是人们的一种自我安慰和自我暗示而已,我们还是要相信科学才好。 +009976 这一切都令人欢欣鼓舞,阿讷西没理由不坚持到最后。 +009977 直至公元前一万一千年,它又再次出现。 +009978 尽量少玩电脑,少看电视,少打游戏。 +009979 从五到七,前后也就是六个月的时间。 +009980 一进咖啡店,他就遇见一张熟悉的脸。 +009981 好在众弟兄看到了把她追了回来。 +009982 有一个人说,哥们儿我们跑过它才能活。 +009983 捅了她以后,模糊记得她没咋动了。 +009984 从小到大,葛启义没有收到过压岁钱。 +009985 舞台下的你会对舞台上的你说什么? +009986 但考生普遍认为,试题的怪多过难。 +009987 我希望每个人都能够尊重我们的隐私。 +009988 漫天的红霞使劲给两人增添气氛。 +009989 晚上加完班开车回家,太累了,迷迷糊糊开着车,走一半的时候,铛一声! +009990 该车将三人撞倒后,在大雾中逃窜。 +009991 这人一哆嗦,方向盘也把不稳了,差点撞上了高速边道护栏。 +009992 那女孩儿委屈的说,我一回头见你已经进去了我不敢进去啊! 
+009993 小明摇摇头说,不是,我只是美女看多了,想换个口味而已。 +009994 接下来,红娘要求记者交费,记者表示不知表姐身份证号码。 +009995 李东蓊表示,自己当时在法庭上发表了一次独特的公诉意见。 +009996 另一男子扑了上来,手里拿着明晃晃的长刀,向他胸口直刺。 +009997 今天,快递员拿着一个快递在办公室喊,秦王是哪个,有他快递? +009998 这场抗议活动究竟是如何发展演变的,又究竟是谁伤害了谁? +009999 因华国锋肖鸡,墓地设计根据其属相设计。 +010000 在狱中,张明宝悔恨交加,写了一份忏悔书。 diff --git a/paddlespeech/server/tests/tts/infer/run.sh b/paddlespeech/server/tests/tts/infer/run.sh new file mode 100644 index 00000000..fdceec41 --- /dev/null +++ b/paddlespeech/server/tests/tts/infer/run.sh @@ -0,0 +1,64 @@ +model_path=/home/users/liangyunming/.paddlespeech/models/ +#am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_nosil_baker_ckpt_0.4/ ## fastspeech2 +am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ ## fastspeech2_cnn +voc_model_dir=$model_path/hifigan_csmsc-zh/hifigan_csmsc_ckpt_0.1.1/ ## hifigan +#voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ ## mb_melgan + +if [[ $am_model_dir == *"fastspeech2_cnndecoder"* ]]; then + am_support_stream=True +else + am_support_stream=False +fi + +# get am file +for file in $(ls $am_model_dir) +do + if [[ $file == *"yaml"* ]]; then + am_config_file=$file + elif [[ $file == *"pdz"* ]]; then + am_ckpt_file=$file + elif [[ $file == *"stat"* ]]; then + am_stat_file=$file + elif [[ $file == *"phone"* ]]; then + phones_dict_file=$file + fi + +done + +# get voc file +for file in $(ls $voc_model_dir) +do + if [[ $file == *"yaml"* ]]; then + voc_config_file=$file + elif [[ $file == *"pdz"* ]]; then + voc_ckpt_file=$file + elif [[ $file == *"stat"* ]]; then + voc_stat_file=$file + fi + +done + + +#run +python test_online_tts.py --am fastspeech2_csmsc \ + --am_support_stream $am_support_stream \ + --am_config $am_model_dir/$am_config_file \ + --am_ckpt $am_model_dir/$am_ckpt_file \ + --am_stat $am_model_dir/$am_stat_file \ + --phones_dict $am_model_dir/$phones_dict_file \ + --voc hifigan_csmsc \ + --voc_config $voc_model_dir/$voc_config_file \ + --voc_ckpt $voc_model_dir/$voc_ckpt_file \ + --voc_stat $voc_model_dir/$voc_stat_file \ + --lang zh \ + --device cpu \ + --text ./csmsc_test.txt \ + --output_dir ./output \ + --log_file ./result.log \ + --am_streaming False \ + --am_pad 12 \ + --am_block 42 \ + --voc_streaming True \ + --voc_pad 14 \ + --voc_block 14 \ + diff --git a/paddlespeech/server/tests/tts/infer/test_online_tts.py b/paddlespeech/server/tests/tts/infer/test_online_tts.py new file mode 100644 index 00000000..17ac0ea7 --- /dev/null +++ b/paddlespeech/server/tests/tts/infer/test_online_tts.py @@ -0,0 +1,650 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
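+# Benchmark for online (streaming) TTS inference: runs the acoustic model and
+# vocoder in streaming / non-streaming combinations and reports first-response
+# time, final-response time and RTF statistics for each test sentence.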
+import argparse +import logging +import math +import threading +import time +from pathlib import Path + +import numpy as np +import paddle +import soundfile as sf +import yaml +from yacs.config import CfgNode + +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.exps.syn_utils import get_sentences +from paddlespeech.t2s.exps.syn_utils import get_voc_inference +from paddlespeech.t2s.exps.syn_utils import model_alias +from paddlespeech.t2s.utils import str2bool + +mel_streaming = None +wav_streaming = None +stream_first_time = 0.0 +voc_stream_st = 0.0 +sample_rate = 0 + + +def denorm(data, mean, std): + return data * std + mean + + +def get_chunks(data, block_size, pad_size, step): + if step == "am": + data_len = data.shape[1] + elif step == "voc": + data_len = data.shape[0] + else: + print("Please set correct type to get chunks, am or voc") + + chunks = [] + n = math.ceil(data_len / block_size) + for i in range(n): + start = max(0, i * block_size - pad_size) + end = min((i + 1) * block_size + pad_size, data_len) + if step == "am": + chunks.append(data[:, start:end, :]) + elif step == "voc": + chunks.append(data[start:end, :]) + else: + print("Please set correct type to get chunks, am or voc") + return chunks + + +def get_stream_am_inference(args, am_config): + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + odim = am_config.n_mels + + am_class = dynamic_import(am_name, model_alias) + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) + am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) + am.eval() + am_mu, am_std = np.load(args.am_stat) + am_mu = paddle.to_tensor(am_mu) + am_std = paddle.to_tensor(am_std) + + return am, am_mu, am_std + + +def init(args): + global sample_rate + # get config + with open(args.am_config) as f: + am_config = CfgNode(yaml.safe_load(f)) + with open(args.voc_config) as f: + voc_config = CfgNode(yaml.safe_load(f)) + + sample_rate = am_config.fs + + # frontend + frontend = get_frontend(args) + + # acoustic model + if args.am_support_stream: + am, am_mu, am_std = get_stream_am_inference(args, am_config) + am_infer_info = [am, am_mu, am_std, am_config] + else: + am_inference, am_name, am_dataset = get_am_inference(args, am_config) + am_infer_info = [am_inference, am_name, am_dataset, am_config] + + # vocoder + voc_inference = get_voc_inference(args, voc_config) + voc_infer_info = [voc_inference, voc_config] + + return frontend, am_infer_info, voc_infer_info + + +def get_phone(args, frontend, sentence, merge_sentences, get_tone_ids): + am_name = args.am[:args.am.rindex('_')] + tone_ids = None + if am_name == 'speedyspeech': + get_tone_ids = True + + if args.lang == 'zh': + input_ids = frontend.get_input_ids( + sentence, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids) + phone_ids = input_ids["phone_ids"] + if get_tone_ids: + tone_ids = input_ids["tone_ids"] + elif args.lang == 'en': + input_ids = frontend.get_input_ids( + sentence, merge_sentences=merge_sentences) + phone_ids = input_ids["phone_ids"] + else: + print("lang should in {'zh', 'en'}!") + + return phone_ids, tone_ids + + +@paddle.no_grad() +# 生成完整的mel +def gen_mel(args, am_infer_info, part_phone_ids, 
part_tone_ids): + # 如果是支持流式的AM模型 + if args.am_support_stream: + am, am_mu, am_std, am_config = am_infer_info + orig_hs, h_masks = am.encoder_infer(part_phone_ids) + if args.am_streaming: + am_pad = args.am_pad + am_block = args.am_block + hss = get_chunks(orig_hs, am_block, am_pad, "am") + chunk_num = len(hss) + mel_list = [] + for i, hs in enumerate(hss): + before_outs, _ = am.decoder(hs) + after_outs = before_outs + am.postnet( + before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) + normalized_mel = after_outs[0] + sub_mel = denorm(normalized_mel, am_mu, am_std) + # clip output part of pad + if i == 0: + sub_mel = sub_mel[:-am_pad] + elif i == chunk_num - 1: + # 最后一块的右侧一定没有 pad 够 + sub_mel = sub_mel[am_pad:] + else: + # 倒数几块的右侧也可能没有 pad 够 + sub_mel = sub_mel[am_pad:(am_block + am_pad) - + sub_mel.shape[0]] + mel_list.append(sub_mel) + mel = paddle.concat(mel_list, axis=0) + + else: + orig_hs, h_masks = am.encoder_infer(part_phone_ids) + before_outs, _ = am.decoder(orig_hs) + after_outs = before_outs + am.postnet( + before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) + normalized_mel = after_outs[0] + mel = denorm(normalized_mel, am_mu, am_std) + + else: + am_inference, am_name, am_dataset, am_config = am_infer_info + # acoustic model + if am_name == 'fastspeech2': + # multi speaker + if am_dataset in {"aishell3", "vctk"}: + spk_id = paddle.to_tensor(args.spk_id) + mel = am_inference(part_phone_ids, spk_id) + else: + mel = am_inference(part_phone_ids) + elif am_name == 'speedyspeech': + part_tone_ids = tone_ids[i] + if am_dataset in {"aishell3", "vctk"}: + spk_id = paddle.to_tensor(args.spk_id) + mel = am_inference(part_phone_ids, part_tone_ids, spk_id) + else: + mel = am_inference(part_phone_ids, part_tone_ids) + elif am_name == 'tacotron2': + mel = am_inference(part_phone_ids) + + return mel + + +@paddle.no_grad() +def stream_voc_infer(args, voc_infer_info, mel_len): + global mel_streaming + global stream_first_time + global wav_streaming + voc_inference, voc_config = voc_infer_info + block = args.voc_block + pad = args.voc_pad + upsample = voc_config.n_shift + wav_list = [] + flag = 1 + + valid_start = 0 + valid_end = min(valid_start + block, mel_len) + actual_start = 0 + actual_end = min(valid_end + pad, mel_len) + mel_chunk = mel_streaming[actual_start:actual_end, :] + + while valid_end <= mel_len: + sub_wav = voc_inference(mel_chunk) + if flag == 1: + stream_first_time = time.time() + flag = 0 + + # get valid wav + start = valid_start - actual_start + if valid_end == mel_len: + sub_wav = sub_wav[start * upsample:] + wav_list.append(sub_wav) + break + else: + end = start + block + sub_wav = sub_wav[start * upsample:end * upsample] + wav_list.append(sub_wav) + + # generate new mel chunk + valid_start = valid_end + valid_end = min(valid_start + block, mel_len) + if valid_start - pad < 0: + actual_start = 0 + else: + actual_start = valid_start - pad + actual_end = min(valid_end + pad, mel_len) + mel_chunk = mel_streaming[actual_start:actual_end, :] + + wav = paddle.concat(wav_list, axis=0) + wav_streaming = wav + + +@paddle.no_grad() +# 非流式AM / 流式AM + 非流式Voc +def am_nostream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, + part_tone_ids): + mel = gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids) + am_infer_time = time.time() + voc_inference, voc_config = voc_infer_info + wav = voc_inference(mel) + first_response_time = time.time() + final_response_time = first_response_time + voc_infer_time = first_response_time + + return am_infer_time, voc_infer_time, 
first_response_time, final_response_time, wav + + +@paddle.no_grad() +# 非流式AM + 流式Voc +def nostream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, + part_tone_ids): + global mel_streaming + global stream_first_time + global wav_streaming + + mel = gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids) + am_infer_time = time.time() + + # voc streaming + mel_streaming = mel + mel_len = mel.shape[0] + stream_voc_infer(args, voc_infer_info, mel_len) + first_response_time = stream_first_time + wav = wav_streaming + final_response_time = time.time() + voc_infer_time = final_response_time + + return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav + + +@paddle.no_grad() +# 流式AM + 流式 Voc +def stream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, + part_tone_ids): + global mel_streaming + global stream_first_time + global wav_streaming + global voc_stream_st + mel_streaming = None + flag = 1 #用来表示开启流式voc的线程 + + am, am_mu, am_std, am_config = am_infer_info + orig_hs, h_masks = am.encoder_infer(part_phone_ids) + mel_len = orig_hs.shape[1] + am_block = args.am_block + am_pad = args.am_pad + hss = get_chunks(orig_hs, am_block, am_pad, "am") + chunk_num = len(hss) + + for i, hs in enumerate(hss): + before_outs, _ = am.decoder(hs) + after_outs = before_outs + am.postnet( + before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) + normalized_mel = after_outs[0] + sub_mel = denorm(normalized_mel, am_mu, am_std) + # clip output part of pad + if i == 0: + sub_mel = sub_mel[:-am_pad] + mel_streaming = sub_mel + elif i == chunk_num - 1: + # 最后一块的右侧一定没有 pad 够 + sub_mel = sub_mel[am_pad:] + mel_streaming = paddle.concat([mel_streaming, sub_mel]) + am_infer_time = time.time() + else: + # 倒数几块的右侧也可能没有 pad 够 + sub_mel = sub_mel[am_pad:(am_block + am_pad) - sub_mel.shape[0]] + mel_streaming = paddle.concat([mel_streaming, sub_mel]) + + if flag and mel_streaming.shape[0] > args.voc_block + args.voc_pad: + t = threading.Thread( + target=stream_voc_infer, args=(args, voc_infer_info, mel_len, )) + t.start() + voc_stream_st = time.time() + flag = 0 + + t.join() + final_response_time = time.time() + voc_infer_time = final_response_time + first_response_time = stream_first_time + wav = wav_streaming + + return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav + + +def try_infer(args, logger, frontend, am_infer_info, voc_infer_info): + global sample_rate + logger.info( + "Before the formal test, we test a few texts to make the inference speed more stable." + ) + if args.lang == 'zh': + sentence = "您好,欢迎使用语音合成服务。" + if args.lang == 'en': + sentence = "Hello and welcome to the speech synthesis service." 
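+ # pick the inference function according to the am/voc streaming flags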
+ + if args.voc_streaming: + if args.am_streaming: + infer_func = stream_am_stream_voc + else: + infer_func = nostream_am_stream_voc + else: + infer_func = am_nostream_voc + + merge_sentences = True + get_tone_ids = False + for i in range(3): # 推理3次 + st = time.time() + phone_ids, tone_ids = get_phone(args, frontend, sentence, + merge_sentences, get_tone_ids) + part_phone_ids = phone_ids[0] + if tone_ids: + part_tone_ids = tone_ids[0] + else: + part_tone_ids = None + + am_infer_time, voc_infer_time, first_response_time, final_response_time, wav = infer_func( + args, am_infer_info, voc_infer_info, part_phone_ids, part_tone_ids) + wav = wav.numpy() + duration = wav.size / sample_rate + logger.info( + f"sentence: {sentence}; duration: {duration} s; first response time: {first_response_time - st} s; final response time: {final_response_time - st} s" + ) + + +def evaluate(args, logger, frontend, am_infer_info, voc_infer_info): + global sample_rate + sentences = get_sentences(args) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + get_tone_ids = False + merge_sentences = True + + # choose infer function + if args.voc_streaming: + if args.am_streaming: + infer_func = stream_am_stream_voc + else: + infer_func = nostream_am_stream_voc + else: + infer_func = am_nostream_voc + + final_up_duration = 0.0 + sentence_count = 0 + front_time_list = [] + am_time_list = [] + voc_time_list = [] + first_response_list = [] + final_response_list = [] + sentence_length_list = [] + duration_list = [] + + for utt_id, sentence in sentences: + # front + front_st = time.time() + phone_ids, tone_ids = get_phone(args, frontend, sentence, + merge_sentences, get_tone_ids) + part_phone_ids = phone_ids[0] + if tone_ids: + part_tone_ids = tone_ids[0] + else: + part_tone_ids = None + front_et = time.time() + front_time = front_et - front_st + + am_st = time.time() + am_infer_time, voc_infer_time, first_response_time, final_response_time, wav = infer_func( + args, am_infer_info, voc_infer_info, part_phone_ids, part_tone_ids) + am_time = am_infer_time - am_st + if args.voc_streaming and args.am_streaming: + voc_time = voc_infer_time - voc_stream_st + else: + voc_time = voc_infer_time - am_infer_time + + first_response = first_response_time - front_st + final_response = final_response_time - front_st + + wav = wav.numpy() + duration = wav.size / sample_rate + sf.write( + str(output_dir / (utt_id + ".wav")), wav, samplerate=sample_rate) + print(f"{utt_id} done!") + + sentence_count += 1 + front_time_list.append(front_time) + am_time_list.append(am_time) + voc_time_list.append(voc_time) + first_response_list.append(first_response) + final_response_list.append(final_response) + sentence_length_list.append(len(sentence)) + duration_list.append(duration) + + logger.info( + f"uttid: {utt_id}; sentence: '{sentence}'; front time: {front_time} s; am time: {am_time} s; voc time: {voc_time} s; \ + first response time: {first_response} s; final response time: {final_response} s; audio duration: {duration} s;" + ) + + if final_response > duration: + final_up_duration += 1 + + all_time_sum = sum(final_response_list) + front_rate = sum(front_time_list) / all_time_sum + am_rate = sum(am_time_list) / all_time_sum + voc_rate = sum(voc_time_list) / all_time_sum + rtf = all_time_sum / sum(duration_list) + + logger.info( + f"The length of test text information, test num: {sentence_count}; text num: {sum(sentence_length_list)}; min: {min(sentence_length_list)}; max: {max(sentence_length_list)}; avg: 
{sum(sentence_length_list)/len(sentence_length_list)}" + ) + logger.info( + f"duration information, min: {min(duration_list)}; max: {max(duration_list)}; avg: {sum(duration_list) / len(duration_list)}; sum: {sum(duration_list)}" + ) + logger.info( + f"Front time information: min: {min(front_time_list)} s; max: {max(front_time_list)} s; avg: {sum(front_time_list)/len(front_time_list)} s; ratio: {front_rate * 100}%" + ) + logger.info( + f"AM time information: min: {min(am_time_list)} s; max: {max(am_time_list)} s; avg: {sum(am_time_list)/len(am_time_list)} s; ratio: {am_rate * 100}%" + ) + logger.info( + f"Vocoder time information: min: {min(voc_time_list)} s, max: {max(voc_time_list)} s; avg: {sum(voc_time_list)/len(voc_time_list)} s; ratio: {voc_rate * 100}%" + ) + logger.info( + f"first response time information: min: {min(first_response_list)} s; max: {max(first_response_list)} s; avg: {sum(first_response_list)/len(first_response_list)} s" + ) + logger.info( + f"final response time information: min: {min(final_response_list)} s; max: {max(final_response_list)} s; avg: {sum(final_response_list)/len(final_response_list)} s" + ) + logger.info(f"RTF is: {rtf}") + logger.info( + f"The number of final_response is greater than duration is {final_up_duration}, ratio: {final_up_duration / sentence_count}%" + ) + + +def parse_args(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with acoustic model & vocoder") + # acoustic model + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=[ + 'speedyspeech_csmsc', 'speedyspeech_aishell3', 'fastspeech2_csmsc', + 'fastspeech2_ljspeech', 'fastspeech2_aishell3', 'fastspeech2_vctk', + 'tacotron2_csmsc', 'tacotron2_ljspeech' + ], + help='Choose acoustic model type of tts task.') + parser.add_argument( + '--am_support_stream', + type=str2bool, + default=False, + help='if am model is fastspeech2_csmsc, specify whether it supports streaming' + ) + parser.add_argument( + '--am_config', + type=str, + default=None, + help='Config of acoustic model. Use deault config when it is None.') + parser.add_argument( + '--am_ckpt', + type=str, + default=None, + help='Checkpoint file of acoustic model.') + parser.add_argument( + "--am_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training acoustic model." + ) + parser.add_argument( + "--phones_dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--tones_dict", type=str, default=None, help="tone vocabulary file.") + parser.add_argument( + "--speaker_dict", type=str, default=None, help="speaker id map file.") + parser.add_argument( + '--spk_id', + type=int, + default=0, + help='spk id for multi speaker acoustic model') + # vocoder + parser.add_argument( + '--voc', + type=str, + default='mb_melgan_csmsc', + choices=[ + 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', + 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc', + 'wavernn_csmsc' + ], + help='Choose vocoder type of tts task.') + parser.add_argument( + '--voc_config', + type=str, + default=None, + help='Config of voc. Use deault config when it is None.') + parser.add_argument( + '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.') + parser.add_argument( + "--voc_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training voc." 
+ ) + # other + parser.add_argument( + '--lang', + type=str, + default='zh', + choices=['zh', 'en'], + help='Choose model language. zh or en') + + parser.add_argument( + "--device", type=str, default='cpu', help="set cpu or gpu:id") + + parser.add_argument( + "--text", + type=str, + default="./csmsc_test.txt", + help="text to synthesize, a 'utt_id sentence' pair per line.") + parser.add_argument("--output_dir", type=str, help="output dir.") + parser.add_argument( + "--log_file", type=str, default="result.log", help="log file.") + + parser.add_argument( + "--am_streaming", + type=str2bool, + default=False, + help="whether use streaming acoustic model") + + parser.add_argument("--am_pad", type=int, default=12, help="am pad size.") + + parser.add_argument( + "--am_block", type=int, default=42, help="am block size.") + + parser.add_argument( + "--voc_streaming", + type=str2bool, + default=False, + help="whether use streaming vocoder model") + + parser.add_argument("--voc_pad", type=int, default=14, help="voc pad size.") + + parser.add_argument( + "--voc_block", type=int, default=14, help="voc block size.") + + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + paddle.set_device(args.device) + if args.am_support_stream: + assert (args.am == 'fastspeech2_csmsc') + if args.am_streaming: + assert (args.am_support_stream and args.am == 'fastspeech2_csmsc') + if args.voc_streaming: + assert (args.voc == 'mb_melgan_csmsc' or args.voc == 'hifigan_csmsc') + + logger = logging.getLogger() + fhandler = logging.FileHandler(filename=args.log_file, mode='w') + formatter = logging.Formatter( + '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s' + ) + fhandler.setFormatter(formatter) + logger.addHandler(fhandler) + logger.setLevel(logging.DEBUG) + + # set basic information + logger.info( + f"AM: {args.am}; Vocoder: {args.voc}; device: {args.device}; am streaming: {args.am_streaming}; voc streaming: {args.voc_streaming}" + ) + logger.info( + f"am pad size: {args.am_pad}; am block size: {args.am_block}; voc pad size: {args.voc_pad}; voc block size: {args.voc_block};" + ) + + # get information about model + frontend, am_infer_info, voc_infer_info = init(args) + logger.info( + "************************ try infer *********************************") + try_infer(args, logger, frontend, am_infer_info, voc_infer_info) + logger.info( + "************************ normal test *******************************") + evaluate(args, logger, frontend, am_infer_info, voc_infer_info) + + +if __name__ == "__main__": + main() From 4b111146dc959daac319879ba8d89fb9a3f24b75 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 11 Apr 2022 15:31:03 +0800 Subject: [PATCH 2/7] code format, test=doc --- .../server/tests/tts/infer/csmsc_test.txt | 100 ------------------ paddlespeech/server/tests/tts/infer/run.sh | 28 ++--- .../server/tests/tts/infer/test_online_tts.py | 71 +++---------- 3 files changed, 26 insertions(+), 173 deletions(-) delete mode 100644 paddlespeech/server/tests/tts/infer/csmsc_test.txt diff --git a/paddlespeech/server/tests/tts/infer/csmsc_test.txt b/paddlespeech/server/tests/tts/infer/csmsc_test.txt deleted file mode 100644 index d8cf367c..00000000 --- a/paddlespeech/server/tests/tts/infer/csmsc_test.txt +++ /dev/null @@ -1,100 +0,0 @@ -009901 昨日,这名伤者与医生全部被警方依法刑事拘留。 -009902 钱伟长想到上海来办学校是经过深思熟虑的。 -009903 她见我一进门就骂,吃饭时也骂,骂得我抬不起头。 -009904 李述德在离开之前,只说了一句柱驼杀父亲了。 -009905 这种车票和保险单捆绑出售属于重复性购买。 -009906 戴佩妮的男友西米露接唱情歌,让她非常开心。 -009907 观大势,谋大局,出大策始终是该院的办院方针。 -009908 
他们骑着摩托回家,正好为农忙时的父母帮忙。 -009909 但是因为还没到退休年龄,只能掰着指头捱日子。 -009910 这几天雨水不断,人们恨不得待在家里不出门。 -009911 没想到徐赟,张海翔两人就此玩起了人间蒸发。 -009912 藤村此番发言可能是为了凸显野田的领导能力。 -009913 程长庚,生在清王朝嘉庆年间,安徽的潜山小县。 -009914 南海海域综合补给基地码头项目正在论证中。 -009915 也就是说今晚成都市民极有可能再次看到飘雪。 -009916 随着天气转热,各地的游泳场所开始人头攒动。 -009917 更让徐先生纳闷的是,房客的手机也打不通了。 -009918 遇到颠簸时,应听从乘务员的安全指令,回座位坐好。 -009919 他在后面呆惯了,怕自己一插身后的人会不满,不敢排进去。 -009920 傍晚七个小人回来了,白雪公主说,你们就是我命中的七个小矮人吧。 -009921 他本想说,教育局管这个,他们是一路的,这样一管岂不是妓女起嫖客? -009922 一种表示商品所有权的财物证券,也称商品证券,如提货单,交货单。 -009923 会有很丰富的东西留下来,说都说不完。 -009924 这句话像从天而降,吓得四周一片寂静。 -009925 记者所在的是受害人家属所在的右区。 -009926 不管哈大爷去哪,它都一步不离地跟着。 -009927 大家抬头望去,一只老鼠正趴在吊顶上。 -009928 我决定过年就辞职,接手我爸的废品站! -009929 最终,中国男子乒乓球队获得此奖项。 -009930 防汛抗旱两手抓,抗旱相对抓的不够。 -009931 图们江下游地区开发开放的进展如何? -009932 这要求中国必须有一个坚强的政党领导。 -009933 再说,关于利益上的事俺俩都不好开口。 -009934 明代瓦剌,鞑靼入侵明境也是通过此地。 -009935 咪咪舔着孩子,把它身上的毛舔干净。 -009936 是否这次的国标修订被大企业绑架了? -009937 判决后,姚某妻子胡某不服,提起上诉。 -009938 由此可以看出邯钢的经济效益来自何处。 -009939 琳达说,是瑜伽改变了她和马儿的生活。 -009940 楼下的保安告诉记者,这里不租也不卖。 -009941 习近平说,中斯两国人民传统友谊深厚。 -009942 传闻越来越多,后来连老汉儿自己都怕了。 -009943 我怒吼一声冲上去,举起砖头砸了过去。 -009944 我现在还不会,这就回去问问发明我的人。 -009945 显然,洛阳性奴案不具备上述两个前提。 -009946 另外,杰克逊有文唇线,眼线,眉毛的动作。 -009947 昨晚,华西都市报记者电话采访了尹琪。 -009948 涅拉季科未透露这些航空公司的名称。 -009949 从运行轨迹上来说,它也不可能是星星。 -009950 目前看,如果继续加息也存在两难问题。 -009951 曾宝仪在节目录制现场大爆观众糗事。 -009952 但任凭周某怎么叫,男子仍酣睡不醒。 -009953 老大爷说,小子,你挡我财路了,知道不? -009954 没料到,闯下大头佛的阿伟还不知悔改。 -009955 卡扎菲部落式统治已遭遇部落内讧。 -009956 这个孩子的生命一半来源于另一位女士捐赠的冷冻卵子。 -009957 出现这种泥鳅内阁的局面既是野田有意为之,也实属无奈。 -009958 济青高速济南,华山,章丘,邹平,周村,淄博,临淄站。 -009959 赵凌飞的话,反映了沈阳赛区所有奥运志愿者的共同心声。 -009960 因为,我们所发出的力量必会因难度加大而减弱。 -009961 发生事故的楼梯拐角处仍可看到血迹。 -009962 想过进公安,可能身高不够,老汉儿也不让我进去。 -009963 路上关卡很多,为了方便撤离,只好轻装前进。 -009964 原来比尔盖茨就是美国微软公司联合创始人呀。 -009965 之后他们一家三口将与双方父母往峇里岛旅游。 -009966 谢谢总理,也感谢广大网友的参与,我们明年再见。 -009967 事实上是,从来没有一个欺善怕恶的人能作出过稍大一点的成就。 -009968 我会打开邮件,你可以从那里继续。 -009969 美方对近期东海局势表示关切。 -009970 据悉,奥巴马一家人对这座冬季白宫极为满意。 -009971 打扫完你会很有成就感的,试一试,你就信了。 -009972 诺曼站在滑板车上,各就各位,准备出发啦! -009973 塔河的寒夜,气温降到了零下三十多摄氏度。 -009974 其间,连破六点六,六点五,六点四,六点三五等多个重要关口。 -009975 算命其实只是人们的一种自我安慰和自我暗示而已,我们还是要相信科学才好。 -009976 这一切都令人欢欣鼓舞,阿讷西没理由不坚持到最后。 -009977 直至公元前一万一千年,它又再次出现。 -009978 尽量少玩电脑,少看电视,少打游戏。 -009979 从五到七,前后也就是六个月的时间。 -009980 一进咖啡店,他就遇见一张熟悉的脸。 -009981 好在众弟兄看到了把她追了回来。 -009982 有一个人说,哥们儿我们跑过它才能活。 -009983 捅了她以后,模糊记得她没咋动了。 -009984 从小到大,葛启义没有收到过压岁钱。 -009985 舞台下的你会对舞台上的你说什么? -009986 但考生普遍认为,试题的怪多过难。 -009987 我希望每个人都能够尊重我们的隐私。 -009988 漫天的红霞使劲给两人增添气氛。 -009989 晚上加完班开车回家,太累了,迷迷糊糊开着车,走一半的时候,铛一声! -009990 该车将三人撞倒后,在大雾中逃窜。 -009991 这人一哆嗦,方向盘也把不稳了,差点撞上了高速边道护栏。 -009992 那女孩儿委屈的说,我一回头见你已经进去了我不敢进去啊! -009993 小明摇摇头说,不是,我只是美女看多了,想换个口味而已。 -009994 接下来,红娘要求记者交费,记者表示不知表姐身份证号码。 -009995 李东蓊表示,自己当时在法庭上发表了一次独特的公诉意见。 -009996 另一男子扑了上来,手里拿着明晃晃的长刀,向他胸口直刺。 -009997 今天,快递员拿着一个快递在办公室喊,秦王是哪个,有他快递? -009998 这场抗议活动究竟是如何发展演变的,又究竟是谁伤害了谁? 
-009999 因华国锋肖鸡,墓地设计根据其属相设计。 -010000 在狱中,张明宝悔恨交加,写了一份忏悔书。 diff --git a/paddlespeech/server/tests/tts/infer/run.sh b/paddlespeech/server/tests/tts/infer/run.sh index fdceec41..631daddd 100644 --- a/paddlespeech/server/tests/tts/infer/run.sh +++ b/paddlespeech/server/tests/tts/infer/run.sh @@ -1,14 +1,7 @@ -model_path=/home/users/liangyunming/.paddlespeech/models/ -#am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_nosil_baker_ckpt_0.4/ ## fastspeech2 -am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ ## fastspeech2_cnn -voc_model_dir=$model_path/hifigan_csmsc-zh/hifigan_csmsc_ckpt_0.1.1/ ## hifigan -#voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ ## mb_melgan - -if [[ $am_model_dir == *"fastspeech2_cnndecoder"* ]]; then - am_support_stream=True -else - am_support_stream=False -fi +model_path=~/.paddlespeech/models/ +am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ ## fastspeech2_c +voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ ## mb_melgan +testdata=../../../../t2s/exps/csmsc_test.txt # get am file for file in $(ls $am_model_dir) @@ -39,23 +32,24 @@ do done -#run -python test_online_tts.py --am fastspeech2_csmsc \ - --am_support_stream $am_support_stream \ +# run test +# am can choose fastspeech2_csmsc or fastspeech2-C_csmsc, where fastspeech2-C_csmsc supports streaming inference. +# voc can choose hifigan_csmsc and mb_melgan_csmsc, They can both support streaming inference. +python test_online_tts.py --am fastspeech2-C_csmsc \ --am_config $am_model_dir/$am_config_file \ --am_ckpt $am_model_dir/$am_ckpt_file \ --am_stat $am_model_dir/$am_stat_file \ --phones_dict $am_model_dir/$phones_dict_file \ - --voc hifigan_csmsc \ + --voc mb_melgan_csmsc \ --voc_config $voc_model_dir/$voc_config_file \ --voc_ckpt $voc_model_dir/$voc_ckpt_file \ --voc_stat $voc_model_dir/$voc_stat_file \ --lang zh \ --device cpu \ - --text ./csmsc_test.txt \ + --text $testdata \ --output_dir ./output \ --log_file ./result.log \ - --am_streaming False \ + --am_streaming True \ --am_pad 12 \ --am_block 42 \ --voc_streaming True \ diff --git a/paddlespeech/server/tests/tts/infer/test_online_tts.py b/paddlespeech/server/tests/tts/infer/test_online_tts.py index 17ac0ea7..8ccf724b 100644 --- a/paddlespeech/server/tests/tts/infer/test_online_tts.py +++ b/paddlespeech/server/tests/tts/infer/test_online_tts.py @@ -71,8 +71,7 @@ def get_stream_am_inference(args, am_config): vocab_size = len(phn_id) print("vocab_size:", vocab_size) - am_name = args.am[:args.am.rindex('_')] - am_dataset = args.am[args.am.rindex('_') + 1:] + am_name = "fastspeech2" odim = am_config.n_mels am_class = dynamic_import(am_name, model_alias) @@ -100,7 +99,7 @@ def init(args): frontend = get_frontend(args) # acoustic model - if args.am_support_stream: + if args.am == 'fastspeech2-C_csmsc': am, am_mu, am_std = get_stream_am_inference(args, am_config) am_infer_info = [am, am_mu, am_std, am_config] else: @@ -117,8 +116,6 @@ def init(args): def get_phone(args, frontend, sentence, merge_sentences, get_tone_ids): am_name = args.am[:args.am.rindex('_')] tone_ids = None - if am_name == 'speedyspeech': - get_tone_ids = True if args.lang == 'zh': input_ids = frontend.get_input_ids( @@ -142,7 +139,7 @@ def get_phone(args, frontend, sentence, merge_sentences, get_tone_ids): # 生成完整的mel def gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids): # 如果是支持流式的AM模型 - if args.am_support_stream: + if args.am == 'fastspeech2-C_csmsc': am, 
am_mu, am_std, am_config = am_infer_info orig_hs, h_masks = am.encoder_infer(part_phone_ids) if args.am_streaming: @@ -180,23 +177,7 @@ def gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids): else: am_inference, am_name, am_dataset, am_config = am_infer_info - # acoustic model - if am_name == 'fastspeech2': - # multi speaker - if am_dataset in {"aishell3", "vctk"}: - spk_id = paddle.to_tensor(args.spk_id) - mel = am_inference(part_phone_ids, spk_id) - else: - mel = am_inference(part_phone_ids) - elif am_name == 'speedyspeech': - part_tone_ids = tone_ids[i] - if am_dataset in {"aishell3", "vctk"}: - spk_id = paddle.to_tensor(args.spk_id) - mel = am_inference(part_phone_ids, part_tone_ids, spk_id) - else: - mel = am_inference(part_phone_ids, part_tone_ids) - elif am_name == 'tacotron2': - mel = am_inference(part_phone_ids) + mel = am_inference(part_phone_ids) return mel @@ -297,7 +278,8 @@ def stream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, global wav_streaming global voc_stream_st mel_streaming = None - flag = 1 #用来表示开启流式voc的线程 + #用来表示开启流式voc的线程 + flag = 1 am, am_mu, am_std, am_config = am_infer_info orig_hs, h_masks = am.encoder_infer(part_phone_ids) @@ -343,7 +325,7 @@ def stream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav -def try_infer(args, logger, frontend, am_infer_info, voc_infer_info): +def warm_up(args, logger, frontend, am_infer_info, voc_infer_info): global sample_rate logger.info( "Before the formal test, we test a few texts to make the inference speed more stable." @@ -363,7 +345,7 @@ def try_infer(args, logger, frontend, am_infer_info, voc_infer_info): merge_sentences = True get_tone_ids = False - for i in range(3): # 推理3次 + for i in range(5): # 推理5次 st = time.time() phone_ids, tone_ids = get_phone(args, frontend, sentence, merge_sentences, get_tone_ids) @@ -500,18 +482,10 @@ def parse_args(): '--am', type=str, default='fastspeech2_csmsc', - choices=[ - 'speedyspeech_csmsc', 'speedyspeech_aishell3', 'fastspeech2_csmsc', - 'fastspeech2_ljspeech', 'fastspeech2_aishell3', 'fastspeech2_vctk', - 'tacotron2_csmsc', 'tacotron2_ljspeech' - ], - help='Choose acoustic model type of tts task.') - parser.add_argument( - '--am_support_stream', - type=str2bool, - default=False, - help='if am model is fastspeech2_csmsc, specify whether it supports streaming' + choices=['fastspeech2_csmsc', 'fastspeech2-C_csmsc'], + help='Choose acoustic model type of tts task. 
where fastspeech2-C_csmsc supports streaming inference' ) + parser.add_argument( '--am_config', type=str, @@ -532,23 +506,12 @@ def parse_args(): "--phones_dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument( "--tones_dict", type=str, default=None, help="tone vocabulary file.") - parser.add_argument( - "--speaker_dict", type=str, default=None, help="speaker id map file.") - parser.add_argument( - '--spk_id', - type=int, - default=0, - help='spk id for multi speaker acoustic model') # vocoder parser.add_argument( '--voc', type=str, default='mb_melgan_csmsc', - choices=[ - 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', - 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc', - 'wavernn_csmsc' - ], + choices=['mb_melgan_csmsc', 'hifigan_csmsc'], help='Choose vocoder type of tts task.') parser.add_argument( '--voc_config', @@ -612,12 +575,8 @@ def parse_args(): def main(): args = parse_args() paddle.set_device(args.device) - if args.am_support_stream: - assert (args.am == 'fastspeech2_csmsc') if args.am_streaming: - assert (args.am_support_stream and args.am == 'fastspeech2_csmsc') - if args.voc_streaming: - assert (args.voc == 'mb_melgan_csmsc' or args.voc == 'hifigan_csmsc') + assert (args.am == 'fastspeech2-C_csmsc') logger = logging.getLogger() fhandler = logging.FileHandler(filename=args.log_file, mode='w') @@ -639,8 +598,8 @@ def main(): # get information about model frontend, am_infer_info, voc_infer_info = init(args) logger.info( - "************************ try infer *********************************") - try_infer(args, logger, frontend, am_infer_info, voc_infer_info) + "************************ warm up *********************************") + warm_up(args, logger, frontend, am_infer_info, voc_infer_info) logger.info( "************************ normal test *******************************") evaluate(args, logger, frontend, am_infer_info, voc_infer_info) From 9d0224460bec81139fd7d69732dce0f7c7ec36fa Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 11 Apr 2022 15:54:44 +0800 Subject: [PATCH 3/7] code format, test=doc --- paddlespeech/server/tests/tts/infer/run.sh | 12 ++-- .../server/tests/tts/infer/test_online_tts.py | 67 ++++++++++--------- 2 files changed, 42 insertions(+), 37 deletions(-) diff --git a/paddlespeech/server/tests/tts/infer/run.sh b/paddlespeech/server/tests/tts/infer/run.sh index 631daddd..3733c3fb 100644 --- a/paddlespeech/server/tests/tts/infer/run.sh +++ b/paddlespeech/server/tests/tts/infer/run.sh @@ -1,6 +1,6 @@ model_path=~/.paddlespeech/models/ -am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ ## fastspeech2_c -voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ ## mb_melgan +am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ +voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ testdata=../../../../t2s/exps/csmsc_test.txt # get am file @@ -33,9 +33,13 @@ done # run test -# am can choose fastspeech2_csmsc or fastspeech2-C_csmsc, where fastspeech2-C_csmsc supports streaming inference. +# am can choose fastspeech2_csmsc or fastspeech2_cnndecoder_csmsc, where fastspeech2_cnndecoder_csmsc supports streaming inference. # voc can choose hifigan_csmsc and mb_melgan_csmsc, They can both support streaming inference. -python test_online_tts.py --am fastspeech2-C_csmsc \ +# When am is fastspeech2_cnndecoder_csmsc and am_pad is set to 12, there is no diff between streaming and non-streaming inference results. 
+# When voc is mb_melgan_csmsc and voc_pad is set to 14, there is no diff between streaming and non-streaming inference results. +# When voc is hifigan_csmsc and voc_pad is set to 20, there is no diff between streaming and non-streaming inference results. + +python test_online_tts.py --am fastspeech2_cnndecoder_csmsc \ --am_config $am_model_dir/$am_config_file \ --am_ckpt $am_model_dir/$am_ckpt_file \ --am_stat $am_model_dir/$am_stat_file \ diff --git a/paddlespeech/server/tests/tts/infer/test_online_tts.py b/paddlespeech/server/tests/tts/infer/test_online_tts.py index 8ccf724b..eb5fc80b 100644 --- a/paddlespeech/server/tests/tts/infer/test_online_tts.py +++ b/paddlespeech/server/tests/tts/infer/test_online_tts.py @@ -34,8 +34,8 @@ from paddlespeech.t2s.utils import str2bool mel_streaming = None wav_streaming = None -stream_first_time = 0.0 -voc_stream_st = 0.0 +streaming_first_time = 0.0 +streaming_voc_st = 0.0 sample_rate = 0 @@ -65,7 +65,7 @@ def get_chunks(data, block_size, pad_size, step): return chunks -def get_stream_am_inference(args, am_config): +def get_streaming_am_inference(args, am_config): with open(args.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) @@ -99,8 +99,8 @@ def init(args): frontend = get_frontend(args) # acoustic model - if args.am == 'fastspeech2-C_csmsc': - am, am_mu, am_std = get_stream_am_inference(args, am_config) + if args.am == 'fastspeech2_cnndecoder_csmsc': + am, am_mu, am_std = get_streaming_am_inference(args, am_config) am_infer_info = [am, am_mu, am_std, am_config] else: am_inference, am_name, am_dataset = get_am_inference(args, am_config) @@ -139,7 +139,7 @@ def get_phone(args, frontend, sentence, merge_sentences, get_tone_ids): # 生成完整的mel def gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids): # 如果是支持流式的AM模型 - if args.am == 'fastspeech2-C_csmsc': + if args.am == 'fastspeech2_cnndecoder_csmsc': am, am_mu, am_std, am_config = am_infer_info orig_hs, h_masks = am.encoder_infer(part_phone_ids) if args.am_streaming: @@ -183,9 +183,9 @@ def gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids): @paddle.no_grad() -def stream_voc_infer(args, voc_infer_info, mel_len): +def streaming_voc_infer(args, voc_infer_info, mel_len): global mel_streaming - global stream_first_time + global streaming_first_time global wav_streaming voc_inference, voc_config = voc_infer_info block = args.voc_block @@ -203,7 +203,7 @@ def stream_voc_infer(args, voc_infer_info, mel_len): while valid_end <= mel_len: sub_wav = voc_inference(mel_chunk) if flag == 1: - stream_first_time = time.time() + streaming_first_time = time.time() flag = 0 # get valid wav @@ -233,8 +233,8 @@ def stream_voc_infer(args, voc_infer_info, mel_len): @paddle.no_grad() # 非流式AM / 流式AM + 非流式Voc -def am_nostream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, - part_tone_ids): +def am_nonstreaming_voc(args, am_infer_info, voc_infer_info, part_phone_ids, + part_tone_ids): mel = gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids) am_infer_time = time.time() voc_inference, voc_config = voc_infer_info @@ -248,10 +248,10 @@ def am_nostream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, @paddle.no_grad() # 非流式AM + 流式Voc -def nostream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, - part_tone_ids): +def nonstreaming_am_streaming_voc(args, am_infer_info, voc_infer_info, + part_phone_ids, part_tone_ids): global mel_streaming - global stream_first_time + global streaming_first_time global wav_streaming mel 
= gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids) @@ -260,8 +260,8 @@ def nostream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, # voc streaming mel_streaming = mel mel_len = mel.shape[0] - stream_voc_infer(args, voc_infer_info, mel_len) - first_response_time = stream_first_time + streaming_voc_infer(args, voc_infer_info, mel_len) + first_response_time = streaming_first_time wav = wav_streaming final_response_time = time.time() voc_infer_time = final_response_time @@ -271,12 +271,12 @@ def nostream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, @paddle.no_grad() # 流式AM + 流式 Voc -def stream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, - part_tone_ids): +def streaming_am_streaming_voc(args, am_infer_info, voc_infer_info, + part_phone_ids, part_tone_ids): global mel_streaming - global stream_first_time + global streaming_first_time global wav_streaming - global voc_stream_st + global streaming_voc_st mel_streaming = None #用来表示开启流式voc的线程 flag = 1 @@ -311,15 +311,16 @@ def stream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, if flag and mel_streaming.shape[0] > args.voc_block + args.voc_pad: t = threading.Thread( - target=stream_voc_infer, args=(args, voc_infer_info, mel_len, )) + target=streaming_voc_infer, + args=(args, voc_infer_info, mel_len, )) t.start() - voc_stream_st = time.time() + streaming_voc_st = time.time() flag = 0 t.join() final_response_time = time.time() voc_infer_time = final_response_time - first_response_time = stream_first_time + first_response_time = streaming_first_time wav = wav_streaming return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav @@ -337,11 +338,11 @@ def warm_up(args, logger, frontend, am_infer_info, voc_infer_info): if args.voc_streaming: if args.am_streaming: - infer_func = stream_am_stream_voc + infer_func = streaming_am_streaming_voc else: - infer_func = nostream_am_stream_voc + infer_func = nonstreaming_am_streaming_voc else: - infer_func = am_nostream_voc + infer_func = am_nonstreaming_voc merge_sentences = True get_tone_ids = False @@ -376,11 +377,11 @@ def evaluate(args, logger, frontend, am_infer_info, voc_infer_info): # choose infer function if args.voc_streaming: if args.am_streaming: - infer_func = stream_am_stream_voc + infer_func = streaming_am_streaming_voc else: - infer_func = nostream_am_stream_voc + infer_func = nonstreaming_am_streaming_voc else: - infer_func = am_nostream_voc + infer_func = am_nonstreaming_voc final_up_duration = 0.0 sentence_count = 0 @@ -410,7 +411,7 @@ def evaluate(args, logger, frontend, am_infer_info, voc_infer_info): args, am_infer_info, voc_infer_info, part_phone_ids, part_tone_ids) am_time = am_infer_time - am_st if args.voc_streaming and args.am_streaming: - voc_time = voc_infer_time - voc_stream_st + voc_time = voc_infer_time - streaming_voc_st else: voc_time = voc_infer_time - am_infer_time @@ -482,8 +483,8 @@ def parse_args(): '--am', type=str, default='fastspeech2_csmsc', - choices=['fastspeech2_csmsc', 'fastspeech2-C_csmsc'], - help='Choose acoustic model type of tts task. where fastspeech2-C_csmsc supports streaming inference' + choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc'], + help='Choose acoustic model type of tts task. 
where fastspeech2_cnndecoder_csmsc supports streaming inference' ) parser.add_argument( @@ -576,7 +577,7 @@ def main(): args = parse_args() paddle.set_device(args.device) if args.am_streaming: - assert (args.am == 'fastspeech2-C_csmsc') + assert (args.am == 'fastspeech2_cnndecoder_csmsc') logger = logging.getLogger() fhandler = logging.FileHandler(filename=args.log_file, mode='w') From 9c0ceaacb6aafa1175b0df7372fb411e2fd772fe Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 18 Apr 2022 17:27:45 +0800 Subject: [PATCH 4/7] add streaming am infer, test=doc --- .../server/engine/tts/online/tts_engine.py | 517 ++++++++++++++++-- paddlespeech/server/utils/util.py | 4 + 2 files changed, 462 insertions(+), 59 deletions(-) diff --git a/paddlespeech/server/engine/tts/online/tts_engine.py b/paddlespeech/server/engine/tts/online/tts_engine.py index 25a8bc76..8e76225d 100644 --- a/paddlespeech/server/engine/tts/online/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/tts_engine.py @@ -12,24 +12,322 @@ # See the License for the specific language governing permissions and # limitations under the License. import base64 +import math +import os import time +from typing import Optional import numpy as np import paddle +import yaml +from yacs.config import CfgNode from paddlespeech.cli.log import logger from paddlespeech.cli.tts.infer import TTSExecutor +from paddlespeech.cli.utils import download_and_decompress +from paddlespeech.cli.utils import MODEL_HOME +from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import float2pcm +from paddlespeech.server.utils.util import denorm from paddlespeech.server.utils.util import get_chunks +from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.modules.normalizer import ZScore + +__all__ = ['TTSEngine'] + +# support online model +pretrained_models = { + # fastspeech2 + "fastspeech2_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip', + 'md5': + '637d28a5e53aa60275612ba4393d5f22', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_76000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + "fastspeech2_cnndecoder_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip', + 'md5': + '6eb28e22ace73e0ebe7845f86478f89f', + 'config': + 'cnndecoder.yaml', + 'ckpt': + 'snapshot_iter_153000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + + # mb_melgan + "mb_melgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'ee5f0604e20091f0d495b6ec4618b90d', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_1000000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + + # hifigan + "hifigan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'dd40a3d88dfcf64513fba2f0f961ada6', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, +} + +model_alias = { + # acoustic model + "fastspeech2": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2", + "fastspeech2_inference": + 
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + + # voc + "mb_melgan": + "paddlespeech.t2s.models.melgan:MelGANGenerator", + "mb_melgan_inference": + "paddlespeech.t2s.models.melgan:MelGANInference", + "hifigan": + "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", + "hifigan_inference": + "paddlespeech.t2s.models.hifigan:HiFiGANInference", +} __all__ = ['TTSEngine'] class TTSServerExecutor(TTSExecutor): - def __init__(self): + def __init__(self, am_block, am_pad, voc_block, voc_pad): super().__init__() - pass + self.am_block = am_block + self.am_pad = am_pad + self.voc_block = voc_block + self.voc_pad = voc_pad + + def get_model_info(self, step, model_name, ckpt, stat): + """get model information + + Args: + step (string): am or voc + model_name (string): model type, support fastspeech2, higigan, mb_melgan + ckpt (string): ckpt file + stat (string): stat file, including mean and standard deviation + + Returns: + model, model_mu, model_std + """ + model_class = dynamic_import(model_name, model_alias) + + if step == "am": + odim = self.am_config.n_mels + model = model_class( + idim=self.vocab_size, odim=odim, **self.am_config["model"]) + model.set_state_dict(paddle.load(ckpt)["main_params"]) + + elif step == "voc": + model = model_class(**self.voc_config["generator_params"]) + model.set_state_dict(paddle.load(ckpt)["generator_params"]) + model.remove_weight_norm() + + else: + logger.error("Please set correct step, am or voc") + + model.eval() + model_mu, model_std = np.load(stat) + model_mu = paddle.to_tensor(model_mu) + model_std = paddle.to_tensor(model_std) + + return model, model_mu, model_std + + def _get_pretrained_path(self, tag: str) -> os.PathLike: + """ + Download and returns pretrained resources path of current task. + """ + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(pretrained_models[tag], + res_path) + decompressed_path = os.path.abspath(decompressed_path) + logger.info( + 'Use pretrained model stored in: {}'.format(decompressed_path)) + return decompressed_path + + def _init_from_path( + self, + am: str='fastspeech2_csmsc', + am_config: Optional[os.PathLike]=None, + am_ckpt: Optional[os.PathLike]=None, + am_stat: Optional[os.PathLike]=None, + phones_dict: Optional[os.PathLike]=None, + tones_dict: Optional[os.PathLike]=None, + speaker_dict: Optional[os.PathLike]=None, + voc: str='mb_melgan_csmsc', + voc_config: Optional[os.PathLike]=None, + voc_ckpt: Optional[os.PathLike]=None, + voc_stat: Optional[os.PathLike]=None, + lang: str='zh', ): + """ + Init model and other resources from a specific path. 
+ """ + if hasattr(self, 'am_inference') and hasattr(self, 'voc_inference'): + logger.info('Models had been initialized.') + return + # am model info + am_tag = am + '-' + lang + if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: + am_res_path = self._get_pretrained_path(am_tag) + self.am_res_path = am_res_path + self.am_config = os.path.join(am_res_path, + pretrained_models[am_tag]['config']) + self.am_ckpt = os.path.join(am_res_path, + pretrained_models[am_tag]['ckpt']) + self.am_stat = os.path.join( + am_res_path, pretrained_models[am_tag]['speech_stats']) + # must have phones_dict in acoustic + self.phones_dict = os.path.join( + am_res_path, pretrained_models[am_tag]['phones_dict']) + print("self.phones_dict:", self.phones_dict) + logger.info(am_res_path) + logger.info(self.am_config) + logger.info(self.am_ckpt) + else: + self.am_config = os.path.abspath(am_config) + self.am_ckpt = os.path.abspath(am_ckpt) + self.am_stat = os.path.abspath(am_stat) + self.phones_dict = os.path.abspath(phones_dict) + self.am_res_path = os.path.dirname(os.path.abspath(self.am_config)) + print("self.phones_dict:", self.phones_dict) + + self.tones_dict = None + self.speaker_dict = None + + # voc model info + voc_tag = voc + '-' + lang + if voc_ckpt is None or voc_config is None or voc_stat is None: + voc_res_path = self._get_pretrained_path(voc_tag) + self.voc_res_path = voc_res_path + self.voc_config = os.path.join(voc_res_path, + pretrained_models[voc_tag]['config']) + self.voc_ckpt = os.path.join(voc_res_path, + pretrained_models[voc_tag]['ckpt']) + self.voc_stat = os.path.join( + voc_res_path, pretrained_models[voc_tag]['speech_stats']) + logger.info(voc_res_path) + logger.info(self.voc_config) + logger.info(self.voc_ckpt) + else: + self.voc_config = os.path.abspath(voc_config) + self.voc_ckpt = os.path.abspath(voc_ckpt) + self.voc_stat = os.path.abspath(voc_stat) + self.voc_res_path = os.path.dirname( + os.path.abspath(self.voc_config)) + + # Init body. 
+ with open(self.am_config) as f: + self.am_config = CfgNode(yaml.safe_load(f)) + with open(self.voc_config) as f: + self.voc_config = CfgNode(yaml.safe_load(f)) + + with open(self.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + self.vocab_size = len(phn_id) + print("vocab_size:", self.vocab_size) + + # frontend + if lang == 'zh': + self.frontend = Frontend( + phone_vocab_path=self.phones_dict, + tone_vocab_path=self.tones_dict) + + elif lang == 'en': + self.frontend = English(phone_vocab_path=self.phones_dict) + print("frontend done!") + + # am infer info + self.am_name = am[:am.rindex('_')] + if self.am_name == "fastspeech2_cnndecoder": + self.am_inference, self.am_mu, self.am_std = self.get_model_info( + "am", "fastspeech2", self.am_ckpt, self.am_stat) + else: + am, am_mu, am_std = self.get_model_info("am", self.am_name, + self.am_ckpt, self.am_stat) + am_normalizer = ZScore(am_mu, am_std) + am_inference_class = dynamic_import(self.am_name + '_inference', + model_alias) + self.am_inference = am_inference_class(am_normalizer, am) + self.am_inference.eval() + print("acoustic model done!") + + # voc infer info + self.voc_name = voc[:voc.rindex('_')] + voc, voc_mu, voc_std = self.get_model_info("voc", self.voc_name, + self.voc_ckpt, self.voc_stat) + voc_normalizer = ZScore(voc_mu, voc_std) + voc_inference_class = dynamic_import(self.voc_name + '_inference', + model_alias) + self.voc_inference = voc_inference_class(voc_normalizer, voc) + self.voc_inference.eval() + print("voc done!") + + def get_phone(self, sentence, lang, merge_sentences, get_tone_ids): + tone_ids = None + if lang == 'zh': + input_ids = self.frontend.get_input_ids( + sentence, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids) + phone_ids = input_ids["phone_ids"] + if get_tone_ids: + tone_ids = input_ids["tone_ids"] + elif lang == 'en': + input_ids = self.frontend.get_input_ids( + sentence, merge_sentences=merge_sentences) + phone_ids = input_ids["phone_ids"] + else: + print("lang should in {'zh', 'en'}!") + + def depadding(self, data, chunk_num, chunk_id, block, pad, upsample): + """ + Streaming inference removes the result of pad inference + """ + front_pad = min(chunk_id * block, pad) + # first chunk + if chunk_id == 0: + data = data[:block * upsample] + # last chunk + elif chunk_id == chunk_num - 1: + data = data[front_pad * upsample:] + # middle chunk + else: + data = data[front_pad * upsample:(front_pad + block) * upsample] + + return data @paddle.no_grad() def infer( @@ -37,16 +335,19 @@ class TTSServerExecutor(TTSExecutor): text: str, lang: str='zh', am: str='fastspeech2_csmsc', - spk_id: int=0, - am_block: int=42, - am_pad: int=12, - voc_block: int=14, - voc_pad: int=14, ): + spk_id: int=0, ): """ Model inference and result stored in self.output. 
""" - am_name = am[:am.rindex('_')] - am_dataset = am[am.rindex('_') + 1:] + + am_block = self.am_block + am_pad = self.am_pad + am_upsample = 1 + voc_block = self.voc_block + voc_pad = self.voc_pad + voc_upsample = self.voc_config.n_shift + flag = 1 + get_tone_ids = False merge_sentences = False frontend_st = time.time() @@ -64,43 +365,99 @@ class TTSServerExecutor(TTSExecutor): phone_ids = input_ids["phone_ids"] else: print("lang should in {'zh', 'en'}!") - self.frontend_time = time.time() - frontend_st + frontend_et = time.time() + self.frontend_time = frontend_et - frontend_st for i in range(len(phone_ids)): - am_st = time.time() part_phone_ids = phone_ids[i] - # am - if am_name == 'speedyspeech': - part_tone_ids = tone_ids[i] - mel = self.am_inference(part_phone_ids, part_tone_ids) - # fastspeech2 + voc_chunk_id = 0 + + # fastspeech2_csmsc + if am == "fastspeech2_csmsc": + # am + mel = self.am_inference(part_phone_ids) + if flag == 1: + first_am_et = time.time() + self.first_am_infer = first_am_et - frontend_et + + # voc streaming + mel_chunks = get_chunks(mel, voc_block, voc_pad, "voc") + voc_chunk_num = len(mel_chunks) + voc_st = time.time() + for i, mel_chunk in enumerate(mel_chunks): + sub_wav = self.voc_inference(mel_chunk) + sub_wav = self.depadding(sub_wav, voc_chunk_num, i, + voc_block, voc_pad, voc_upsample) + if flag == 1: + first_voc_et = time.time() + self.first_voc_infer = first_voc_et - first_am_et + self.first_response_time = first_voc_et - frontend_st + flag = 0 + + yield sub_wav + + # fastspeech2_cnndecoder_csmsc + elif am == "fastspeech2_cnndecoder_csmsc": + # am + orig_hs, h_masks = self.am_inference.encoder_infer( + part_phone_ids) + + # streaming voc chunk info + mel_len = orig_hs.shape[1] + voc_chunk_num = math.ceil(mel_len / self.voc_block) + start = 0 + end = min(self.voc_block + self.voc_pad, mel_len) + + # streaming am + hss = get_chunks(orig_hs, self.am_block, self.am_pad, "am") + am_chunk_num = len(hss) + for i, hs in enumerate(hss): + before_outs, _ = self.am_inference.decoder(hs) + after_outs = before_outs + self.am_inference.postnet( + before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) + normalized_mel = after_outs[0] + sub_mel = denorm(normalized_mel, self.am_mu, self.am_std) + sub_mel = self.depadding(sub_mel, am_chunk_num, i, am_block, + am_pad, am_upsample) + + if i == 0: + mel_streaming = sub_mel + else: + mel_streaming = np.concatenate( + (mel_streaming, sub_mel), axis=0) + + # streaming voc + while (mel_streaming.shape[0] >= end and + voc_chunk_id < voc_chunk_num): + if flag == 1: + first_am_et = time.time() + self.first_am_infer = first_am_et - frontend_et + voc_chunk = mel_streaming[start:end, :] + voc_chunk = paddle.to_tensor(voc_chunk) + sub_wav = self.voc_inference(voc_chunk) + + sub_wav = self.depadding(sub_wav, voc_chunk_num, + voc_chunk_id, voc_block, + voc_pad, voc_upsample) + if flag == 1: + first_voc_et = time.time() + self.first_voc_infer = first_voc_et - first_am_et + self.first_response_time = first_voc_et - frontend_st + flag = 0 + + yield sub_wav + + voc_chunk_id += 1 + start = max(0, voc_chunk_id * voc_block - voc_pad) + end = min((voc_chunk_id + 1) * voc_block + voc_pad, + mel_len) + else: - # multi speaker - if am_dataset in {"aishell3", "vctk"}: - mel = self.am_inference( - part_phone_ids, spk_id=paddle.to_tensor(spk_id)) - else: - mel = self.am_inference(part_phone_ids) - am_et = time.time() - - # voc streaming - voc_upsample = self.voc_config.n_shift - mel_chunks = get_chunks(mel, voc_block, voc_pad, "voc") - chunk_num = 
len(mel_chunks) - voc_st = time.time() - for i, mel_chunk in enumerate(mel_chunks): - sub_wav = self.voc_inference(mel_chunk) - front_pad = min(i * voc_block, voc_pad) - - if i == 0: - sub_wav = sub_wav[:voc_block * voc_upsample] - elif i == chunk_num - 1: - sub_wav = sub_wav[front_pad * voc_upsample:] - else: - sub_wav = sub_wav[front_pad * voc_upsample:( - front_pad + voc_block) * voc_upsample] - - yield sub_wav + logger.error( + "Only support fastspeech2_csmsc or fastspeech2_cnndecoder_csmsc on streaming tts." + ) + + self.final_response_time = time.time() - frontend_st class TTSEngine(BaseEngine): @@ -116,11 +473,18 @@ class TTSEngine(BaseEngine): super(TTSEngine, self).__init__() def init(self, config: dict) -> bool: - self.executor = TTSServerExecutor() self.config = config - assert "fastspeech2_csmsc" in config.am and ( - config.voc == "hifigan_csmsc-zh" or config.voc == "mb_melgan_csmsc" + assert ( + config.am == "fastspeech2_csmsc" or + config.am == "fastspeech2_cnndecoder_csmsc" + ) and ( + config.voc == "hifigan_csmsc" or config.voc == "mb_melgan_csmsc" ), 'Please check config, am support: fastspeech2, voc support: hifigan_csmsc-zh or mb_melgan_csmsc.' + + assert ( + config.voc_block > 0 and config.voc_pad > 0 + ), "Please set correct voc_block and voc_pad, they should be more than 0." + try: if self.config.device: self.device = self.config.device @@ -135,6 +499,9 @@ class TTSEngine(BaseEngine): (self.device)) return False + self.executor = TTSServerExecutor(config.am_block, config.am_pad, + config.voc_block, config.voc_pad) + try: self.executor._init_from_path( am=self.config.am, @@ -155,15 +522,42 @@ class TTSEngine(BaseEngine): (self.device)) return False - self.am_block = self.config.am_block - self.am_pad = self.config.am_pad - self.voc_block = self.config.voc_block - self.voc_pad = self.config.voc_pad - logger.info("Initialize TTS server engine successfully on device: %s." % (self.device)) + + # warm up + try: + self.warm_up() + except Exception as e: + logger.error("Failed to warm up on tts engine.") + return False + return True + def warm_up(self): + """warm up + """ + if self.config.lang == 'zh': + sentence = "您好,欢迎使用语音合成服务。" + if self.config.lang == 'en': + sentence = "Hello and welcome to the speech synthesis service." + logger.info( + "*******************************warm up ********************************" + ) + for i in range(3): + for wav in self.executor.infer( + text=sentence, + lang=self.config.lang, + am=self.config.am, + spk_id=0, ): + logger.info( + f"The first response time of the {i} warm up: {self.executor.first_response_time} s" + ) + break + logger.info( + "**********************************************************************" + ) + def preprocess(self, text_bese64: str=None, text_bytes: bytes=None): # Convert byte to text if text_bese64: @@ -195,18 +589,14 @@ class TTSEngine(BaseEngine): wav_base64: The base64 format of the synthesized audio. 
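+            wav_base64 is yielded chunk by chunk as synthesis proceeds
+            (this method is a streaming generator).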
""" - lang = self.config.lang wav_list = [] for wav in self.executor.infer( text=sentence, - lang=lang, + lang=self.config.lang, am=self.config.am, - spk_id=spk_id, - am_block=self.am_block, - am_pad=self.am_pad, - voc_block=self.voc_block, - voc_pad=self.voc_pad): + spk_id=spk_id, ): + # wav type: float32, convert to pcm (base64) wav = float2pcm(wav) # float32 to int16 wav_bytes = wav.tobytes() # to bytes @@ -216,5 +606,14 @@ class TTSEngine(BaseEngine): yield wav_base64 wav_all = np.concatenate(wav_list, axis=0) - logger.info("The durations of audio is: {} s".format( - len(wav_all) / self.executor.am_config.fs)) + duration = len(wav_all) / self.executor.am_config.fs + logger.info(f"sentence: {sentence}") + logger.info(f"The durations of audio is: {duration} s") + logger.info( + f"first response time: {self.executor.first_response_time} s") + logger.info( + f"final response time: {self.executor.final_response_time} s") + logger.info(f"RTF: {self.executor.final_response_time / duration}") + logger.info( + f"Other info: front time: {self.executor.frontend_time} s, first am infer time: {self.executor.first_am_infer} s, first voc infer time: {self.executor.first_voc_infer} s," + ) diff --git a/paddlespeech/server/utils/util.py b/paddlespeech/server/utils/util.py index 0fe70849..72ee0060 100644 --- a/paddlespeech/server/utils/util.py +++ b/paddlespeech/server/utils/util.py @@ -52,6 +52,10 @@ def get_chunks(data, block_size, pad_size, step): Returns: list: chunks list """ + + if block_size == -1: + return [data] + if step == "am": data_len = data.shape[1] elif step == "voc": From 00a6236fe2c0affa3093551c1d88f0a92b2d0a42 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 18 Apr 2022 17:31:47 +0800 Subject: [PATCH 5/7] remove test code, test=doc --- paddlespeech/server/tests/tts/infer/run.sh | 62 -- .../server/tests/tts/infer/test_online_tts.py | 610 ------------------ 2 files changed, 672 deletions(-) delete mode 100644 paddlespeech/server/tests/tts/infer/run.sh delete mode 100644 paddlespeech/server/tests/tts/infer/test_online_tts.py diff --git a/paddlespeech/server/tests/tts/infer/run.sh b/paddlespeech/server/tests/tts/infer/run.sh deleted file mode 100644 index 3733c3fb..00000000 --- a/paddlespeech/server/tests/tts/infer/run.sh +++ /dev/null @@ -1,62 +0,0 @@ -model_path=~/.paddlespeech/models/ -am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ -voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ -testdata=../../../../t2s/exps/csmsc_test.txt - -# get am file -for file in $(ls $am_model_dir) -do - if [[ $file == *"yaml"* ]]; then - am_config_file=$file - elif [[ $file == *"pdz"* ]]; then - am_ckpt_file=$file - elif [[ $file == *"stat"* ]]; then - am_stat_file=$file - elif [[ $file == *"phone"* ]]; then - phones_dict_file=$file - fi - -done - -# get voc file -for file in $(ls $voc_model_dir) -do - if [[ $file == *"yaml"* ]]; then - voc_config_file=$file - elif [[ $file == *"pdz"* ]]; then - voc_ckpt_file=$file - elif [[ $file == *"stat"* ]]; then - voc_stat_file=$file - fi - -done - - -# run test -# am can choose fastspeech2_csmsc or fastspeech2_cnndecoder_csmsc, where fastspeech2_cnndecoder_csmsc supports streaming inference. -# voc can choose hifigan_csmsc and mb_melgan_csmsc, They can both support streaming inference. -# When am is fastspeech2_cnndecoder_csmsc and am_pad is set to 12, there is no diff between streaming and non-streaming inference results. 
-# When voc is mb_melgan_csmsc and voc_pad is set to 14, there is no diff between streaming and non-streaming inference results. -# When voc is hifigan_csmsc and voc_pad is set to 20, there is no diff between streaming and non-streaming inference results. - -python test_online_tts.py --am fastspeech2_cnndecoder_csmsc \ - --am_config $am_model_dir/$am_config_file \ - --am_ckpt $am_model_dir/$am_ckpt_file \ - --am_stat $am_model_dir/$am_stat_file \ - --phones_dict $am_model_dir/$phones_dict_file \ - --voc mb_melgan_csmsc \ - --voc_config $voc_model_dir/$voc_config_file \ - --voc_ckpt $voc_model_dir/$voc_ckpt_file \ - --voc_stat $voc_model_dir/$voc_stat_file \ - --lang zh \ - --device cpu \ - --text $testdata \ - --output_dir ./output \ - --log_file ./result.log \ - --am_streaming True \ - --am_pad 12 \ - --am_block 42 \ - --voc_streaming True \ - --voc_pad 14 \ - --voc_block 14 \ - diff --git a/paddlespeech/server/tests/tts/infer/test_online_tts.py b/paddlespeech/server/tests/tts/infer/test_online_tts.py deleted file mode 100644 index eb5fc80b..00000000 --- a/paddlespeech/server/tests/tts/infer/test_online_tts.py +++ /dev/null @@ -1,610 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import logging -import math -import threading -import time -from pathlib import Path - -import numpy as np -import paddle -import soundfile as sf -import yaml -from yacs.config import CfgNode - -from paddlespeech.s2t.utils.dynamic_import import dynamic_import -from paddlespeech.t2s.exps.syn_utils import get_am_inference -from paddlespeech.t2s.exps.syn_utils import get_frontend -from paddlespeech.t2s.exps.syn_utils import get_sentences -from paddlespeech.t2s.exps.syn_utils import get_voc_inference -from paddlespeech.t2s.exps.syn_utils import model_alias -from paddlespeech.t2s.utils import str2bool - -mel_streaming = None -wav_streaming = None -streaming_first_time = 0.0 -streaming_voc_st = 0.0 -sample_rate = 0 - - -def denorm(data, mean, std): - return data * std + mean - - -def get_chunks(data, block_size, pad_size, step): - if step == "am": - data_len = data.shape[1] - elif step == "voc": - data_len = data.shape[0] - else: - print("Please set correct type to get chunks, am or voc") - - chunks = [] - n = math.ceil(data_len / block_size) - for i in range(n): - start = max(0, i * block_size - pad_size) - end = min((i + 1) * block_size + pad_size, data_len) - if step == "am": - chunks.append(data[:, start:end, :]) - elif step == "voc": - chunks.append(data[start:end, :]) - else: - print("Please set correct type to get chunks, am or voc") - return chunks - - -def get_streaming_am_inference(args, am_config): - with open(args.phones_dict, "r") as f: - phn_id = [line.strip().split() for line in f.readlines()] - vocab_size = len(phn_id) - print("vocab_size:", vocab_size) - - am_name = "fastspeech2" - odim = am_config.n_mels - - am_class = dynamic_import(am_name, model_alias) - am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) - am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) - am.eval() - am_mu, am_std = np.load(args.am_stat) - am_mu = paddle.to_tensor(am_mu) - am_std = paddle.to_tensor(am_std) - - return am, am_mu, am_std - - -def init(args): - global sample_rate - # get config - with open(args.am_config) as f: - am_config = CfgNode(yaml.safe_load(f)) - with open(args.voc_config) as f: - voc_config = CfgNode(yaml.safe_load(f)) - - sample_rate = am_config.fs - - # frontend - frontend = get_frontend(args) - - # acoustic model - if args.am == 'fastspeech2_cnndecoder_csmsc': - am, am_mu, am_std = get_streaming_am_inference(args, am_config) - am_infer_info = [am, am_mu, am_std, am_config] - else: - am_inference, am_name, am_dataset = get_am_inference(args, am_config) - am_infer_info = [am_inference, am_name, am_dataset, am_config] - - # vocoder - voc_inference = get_voc_inference(args, voc_config) - voc_infer_info = [voc_inference, voc_config] - - return frontend, am_infer_info, voc_infer_info - - -def get_phone(args, frontend, sentence, merge_sentences, get_tone_ids): - am_name = args.am[:args.am.rindex('_')] - tone_ids = None - - if args.lang == 'zh': - input_ids = frontend.get_input_ids( - sentence, - merge_sentences=merge_sentences, - get_tone_ids=get_tone_ids) - phone_ids = input_ids["phone_ids"] - if get_tone_ids: - tone_ids = input_ids["tone_ids"] - elif args.lang == 'en': - input_ids = frontend.get_input_ids( - sentence, merge_sentences=merge_sentences) - phone_ids = input_ids["phone_ids"] - else: - print("lang should in {'zh', 'en'}!") - - return phone_ids, tone_ids - - -@paddle.no_grad() -# 生成完整的mel -def gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids): - # 如果是支持流式的AM模型 - if args.am == 'fastspeech2_cnndecoder_csmsc': - am, am_mu, am_std, 
am_config = am_infer_info - orig_hs, h_masks = am.encoder_infer(part_phone_ids) - if args.am_streaming: - am_pad = args.am_pad - am_block = args.am_block - hss = get_chunks(orig_hs, am_block, am_pad, "am") - chunk_num = len(hss) - mel_list = [] - for i, hs in enumerate(hss): - before_outs, _ = am.decoder(hs) - after_outs = before_outs + am.postnet( - before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) - normalized_mel = after_outs[0] - sub_mel = denorm(normalized_mel, am_mu, am_std) - # clip output part of pad - if i == 0: - sub_mel = sub_mel[:-am_pad] - elif i == chunk_num - 1: - # 最后一块的右侧一定没有 pad 够 - sub_mel = sub_mel[am_pad:] - else: - # 倒数几块的右侧也可能没有 pad 够 - sub_mel = sub_mel[am_pad:(am_block + am_pad) - - sub_mel.shape[0]] - mel_list.append(sub_mel) - mel = paddle.concat(mel_list, axis=0) - - else: - orig_hs, h_masks = am.encoder_infer(part_phone_ids) - before_outs, _ = am.decoder(orig_hs) - after_outs = before_outs + am.postnet( - before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) - normalized_mel = after_outs[0] - mel = denorm(normalized_mel, am_mu, am_std) - - else: - am_inference, am_name, am_dataset, am_config = am_infer_info - mel = am_inference(part_phone_ids) - - return mel - - -@paddle.no_grad() -def streaming_voc_infer(args, voc_infer_info, mel_len): - global mel_streaming - global streaming_first_time - global wav_streaming - voc_inference, voc_config = voc_infer_info - block = args.voc_block - pad = args.voc_pad - upsample = voc_config.n_shift - wav_list = [] - flag = 1 - - valid_start = 0 - valid_end = min(valid_start + block, mel_len) - actual_start = 0 - actual_end = min(valid_end + pad, mel_len) - mel_chunk = mel_streaming[actual_start:actual_end, :] - - while valid_end <= mel_len: - sub_wav = voc_inference(mel_chunk) - if flag == 1: - streaming_first_time = time.time() - flag = 0 - - # get valid wav - start = valid_start - actual_start - if valid_end == mel_len: - sub_wav = sub_wav[start * upsample:] - wav_list.append(sub_wav) - break - else: - end = start + block - sub_wav = sub_wav[start * upsample:end * upsample] - wav_list.append(sub_wav) - - # generate new mel chunk - valid_start = valid_end - valid_end = min(valid_start + block, mel_len) - if valid_start - pad < 0: - actual_start = 0 - else: - actual_start = valid_start - pad - actual_end = min(valid_end + pad, mel_len) - mel_chunk = mel_streaming[actual_start:actual_end, :] - - wav = paddle.concat(wav_list, axis=0) - wav_streaming = wav - - -@paddle.no_grad() -# 非流式AM / 流式AM + 非流式Voc -def am_nonstreaming_voc(args, am_infer_info, voc_infer_info, part_phone_ids, - part_tone_ids): - mel = gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids) - am_infer_time = time.time() - voc_inference, voc_config = voc_infer_info - wav = voc_inference(mel) - first_response_time = time.time() - final_response_time = first_response_time - voc_infer_time = first_response_time - - return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav - - -@paddle.no_grad() -# 非流式AM + 流式Voc -def nonstreaming_am_streaming_voc(args, am_infer_info, voc_infer_info, - part_phone_ids, part_tone_ids): - global mel_streaming - global streaming_first_time - global wav_streaming - - mel = gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids) - am_infer_time = time.time() - - # voc streaming - mel_streaming = mel - mel_len = mel.shape[0] - streaming_voc_infer(args, voc_infer_info, mel_len) - first_response_time = streaming_first_time - wav = wav_streaming - final_response_time = time.time() - voc_infer_time = 
final_response_time - - return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav - - -@paddle.no_grad() -# 流式AM + 流式 Voc -def streaming_am_streaming_voc(args, am_infer_info, voc_infer_info, - part_phone_ids, part_tone_ids): - global mel_streaming - global streaming_first_time - global wav_streaming - global streaming_voc_st - mel_streaming = None - #用来表示开启流式voc的线程 - flag = 1 - - am, am_mu, am_std, am_config = am_infer_info - orig_hs, h_masks = am.encoder_infer(part_phone_ids) - mel_len = orig_hs.shape[1] - am_block = args.am_block - am_pad = args.am_pad - hss = get_chunks(orig_hs, am_block, am_pad, "am") - chunk_num = len(hss) - - for i, hs in enumerate(hss): - before_outs, _ = am.decoder(hs) - after_outs = before_outs + am.postnet( - before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) - normalized_mel = after_outs[0] - sub_mel = denorm(normalized_mel, am_mu, am_std) - # clip output part of pad - if i == 0: - sub_mel = sub_mel[:-am_pad] - mel_streaming = sub_mel - elif i == chunk_num - 1: - # 最后一块的右侧一定没有 pad 够 - sub_mel = sub_mel[am_pad:] - mel_streaming = paddle.concat([mel_streaming, sub_mel]) - am_infer_time = time.time() - else: - # 倒数几块的右侧也可能没有 pad 够 - sub_mel = sub_mel[am_pad:(am_block + am_pad) - sub_mel.shape[0]] - mel_streaming = paddle.concat([mel_streaming, sub_mel]) - - if flag and mel_streaming.shape[0] > args.voc_block + args.voc_pad: - t = threading.Thread( - target=streaming_voc_infer, - args=(args, voc_infer_info, mel_len, )) - t.start() - streaming_voc_st = time.time() - flag = 0 - - t.join() - final_response_time = time.time() - voc_infer_time = final_response_time - first_response_time = streaming_first_time - wav = wav_streaming - - return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav - - -def warm_up(args, logger, frontend, am_infer_info, voc_infer_info): - global sample_rate - logger.info( - "Before the formal test, we test a few texts to make the inference speed more stable." - ) - if args.lang == 'zh': - sentence = "您好,欢迎使用语音合成服务。" - if args.lang == 'en': - sentence = "Hello and welcome to the speech synthesis service." 
- - if args.voc_streaming: - if args.am_streaming: - infer_func = streaming_am_streaming_voc - else: - infer_func = nonstreaming_am_streaming_voc - else: - infer_func = am_nonstreaming_voc - - merge_sentences = True - get_tone_ids = False - for i in range(5): # 推理5次 - st = time.time() - phone_ids, tone_ids = get_phone(args, frontend, sentence, - merge_sentences, get_tone_ids) - part_phone_ids = phone_ids[0] - if tone_ids: - part_tone_ids = tone_ids[0] - else: - part_tone_ids = None - - am_infer_time, voc_infer_time, first_response_time, final_response_time, wav = infer_func( - args, am_infer_info, voc_infer_info, part_phone_ids, part_tone_ids) - wav = wav.numpy() - duration = wav.size / sample_rate - logger.info( - f"sentence: {sentence}; duration: {duration} s; first response time: {first_response_time - st} s; final response time: {final_response_time - st} s" - ) - - -def evaluate(args, logger, frontend, am_infer_info, voc_infer_info): - global sample_rate - sentences = get_sentences(args) - - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - get_tone_ids = False - merge_sentences = True - - # choose infer function - if args.voc_streaming: - if args.am_streaming: - infer_func = streaming_am_streaming_voc - else: - infer_func = nonstreaming_am_streaming_voc - else: - infer_func = am_nonstreaming_voc - - final_up_duration = 0.0 - sentence_count = 0 - front_time_list = [] - am_time_list = [] - voc_time_list = [] - first_response_list = [] - final_response_list = [] - sentence_length_list = [] - duration_list = [] - - for utt_id, sentence in sentences: - # front - front_st = time.time() - phone_ids, tone_ids = get_phone(args, frontend, sentence, - merge_sentences, get_tone_ids) - part_phone_ids = phone_ids[0] - if tone_ids: - part_tone_ids = tone_ids[0] - else: - part_tone_ids = None - front_et = time.time() - front_time = front_et - front_st - - am_st = time.time() - am_infer_time, voc_infer_time, first_response_time, final_response_time, wav = infer_func( - args, am_infer_info, voc_infer_info, part_phone_ids, part_tone_ids) - am_time = am_infer_time - am_st - if args.voc_streaming and args.am_streaming: - voc_time = voc_infer_time - streaming_voc_st - else: - voc_time = voc_infer_time - am_infer_time - - first_response = first_response_time - front_st - final_response = final_response_time - front_st - - wav = wav.numpy() - duration = wav.size / sample_rate - sf.write( - str(output_dir / (utt_id + ".wav")), wav, samplerate=sample_rate) - print(f"{utt_id} done!") - - sentence_count += 1 - front_time_list.append(front_time) - am_time_list.append(am_time) - voc_time_list.append(voc_time) - first_response_list.append(first_response) - final_response_list.append(final_response) - sentence_length_list.append(len(sentence)) - duration_list.append(duration) - - logger.info( - f"uttid: {utt_id}; sentence: '{sentence}'; front time: {front_time} s; am time: {am_time} s; voc time: {voc_time} s; \ - first response time: {first_response} s; final response time: {final_response} s; audio duration: {duration} s;" - ) - - if final_response > duration: - final_up_duration += 1 - - all_time_sum = sum(final_response_list) - front_rate = sum(front_time_list) / all_time_sum - am_rate = sum(am_time_list) / all_time_sum - voc_rate = sum(voc_time_list) / all_time_sum - rtf = all_time_sum / sum(duration_list) - - logger.info( - f"The length of test text information, test num: {sentence_count}; text num: {sum(sentence_length_list)}; min: {min(sentence_length_list)}; max: 
{max(sentence_length_list)}; avg: {sum(sentence_length_list)/len(sentence_length_list)}" - ) - logger.info( - f"duration information, min: {min(duration_list)}; max: {max(duration_list)}; avg: {sum(duration_list) / len(duration_list)}; sum: {sum(duration_list)}" - ) - logger.info( - f"Front time information: min: {min(front_time_list)} s; max: {max(front_time_list)} s; avg: {sum(front_time_list)/len(front_time_list)} s; ratio: {front_rate * 100}%" - ) - logger.info( - f"AM time information: min: {min(am_time_list)} s; max: {max(am_time_list)} s; avg: {sum(am_time_list)/len(am_time_list)} s; ratio: {am_rate * 100}%" - ) - logger.info( - f"Vocoder time information: min: {min(voc_time_list)} s, max: {max(voc_time_list)} s; avg: {sum(voc_time_list)/len(voc_time_list)} s; ratio: {voc_rate * 100}%" - ) - logger.info( - f"first response time information: min: {min(first_response_list)} s; max: {max(first_response_list)} s; avg: {sum(first_response_list)/len(first_response_list)} s" - ) - logger.info( - f"final response time information: min: {min(final_response_list)} s; max: {max(final_response_list)} s; avg: {sum(final_response_list)/len(final_response_list)} s" - ) - logger.info(f"RTF is: {rtf}") - logger.info( - f"The number of final_response is greater than duration is {final_up_duration}, ratio: {final_up_duration / sentence_count}%" - ) - - -def parse_args(): - # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser( - description="Synthesize with acoustic model & vocoder") - # acoustic model - parser.add_argument( - '--am', - type=str, - default='fastspeech2_csmsc', - choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc'], - help='Choose acoustic model type of tts task. where fastspeech2_cnndecoder_csmsc supports streaming inference' - ) - - parser.add_argument( - '--am_config', - type=str, - default=None, - help='Config of acoustic model. Use deault config when it is None.') - parser.add_argument( - '--am_ckpt', - type=str, - default=None, - help='Checkpoint file of acoustic model.') - parser.add_argument( - "--am_stat", - type=str, - default=None, - help="mean and standard deviation used to normalize spectrogram when training acoustic model." - ) - parser.add_argument( - "--phones_dict", type=str, default=None, help="phone vocabulary file.") - parser.add_argument( - "--tones_dict", type=str, default=None, help="tone vocabulary file.") - # vocoder - parser.add_argument( - '--voc', - type=str, - default='mb_melgan_csmsc', - choices=['mb_melgan_csmsc', 'hifigan_csmsc'], - help='Choose vocoder type of tts task.') - parser.add_argument( - '--voc_config', - type=str, - default=None, - help='Config of voc. Use deault config when it is None.') - parser.add_argument( - '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.') - parser.add_argument( - "--voc_stat", - type=str, - default=None, - help="mean and standard deviation used to normalize spectrogram when training voc." - ) - # other - parser.add_argument( - '--lang', - type=str, - default='zh', - choices=['zh', 'en'], - help='Choose model language. 
zh or en') - - parser.add_argument( - "--device", type=str, default='cpu', help="set cpu or gpu:id") - - parser.add_argument( - "--text", - type=str, - default="./csmsc_test.txt", - help="text to synthesize, a 'utt_id sentence' pair per line.") - parser.add_argument("--output_dir", type=str, help="output dir.") - parser.add_argument( - "--log_file", type=str, default="result.log", help="log file.") - - parser.add_argument( - "--am_streaming", - type=str2bool, - default=False, - help="whether use streaming acoustic model") - - parser.add_argument("--am_pad", type=int, default=12, help="am pad size.") - - parser.add_argument( - "--am_block", type=int, default=42, help="am block size.") - - parser.add_argument( - "--voc_streaming", - type=str2bool, - default=False, - help="whether use streaming vocoder model") - - parser.add_argument("--voc_pad", type=int, default=14, help="voc pad size.") - - parser.add_argument( - "--voc_block", type=int, default=14, help="voc block size.") - - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - paddle.set_device(args.device) - if args.am_streaming: - assert (args.am == 'fastspeech2_cnndecoder_csmsc') - - logger = logging.getLogger() - fhandler = logging.FileHandler(filename=args.log_file, mode='w') - formatter = logging.Formatter( - '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s' - ) - fhandler.setFormatter(formatter) - logger.addHandler(fhandler) - logger.setLevel(logging.DEBUG) - - # set basic information - logger.info( - f"AM: {args.am}; Vocoder: {args.voc}; device: {args.device}; am streaming: {args.am_streaming}; voc streaming: {args.voc_streaming}" - ) - logger.info( - f"am pad size: {args.am_pad}; am block size: {args.am_block}; voc pad size: {args.voc_pad}; voc block size: {args.voc_block};" - ) - - # get information about model - frontend, am_infer_info, voc_infer_info = init(args) - logger.info( - "************************ warm up *********************************") - warm_up(args, logger, frontend, am_infer_info, voc_infer_info) - logger.info( - "************************ normal test *******************************") - evaluate(args, logger, frontend, am_infer_info, voc_infer_info) - - -if __name__ == "__main__": - main() From 40dde22fc48f41cffdace68847ccbeb00cc1cef4 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Tue, 19 Apr 2022 12:59:48 +0800 Subject: [PATCH 6/7] code format, test=doc --- .../server/engine/tts/online/tts_engine.py | 44 ++++++++++++------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/paddlespeech/server/engine/tts/online/tts_engine.py b/paddlespeech/server/engine/tts/online/tts_engine.py index 8e76225d..a84644e7 100644 --- a/paddlespeech/server/engine/tts/online/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/tts_engine.py @@ -127,33 +127,40 @@ class TTSServerExecutor(TTSExecutor): self.voc_block = voc_block self.voc_pad = voc_pad - def get_model_info(self, step, model_name, ckpt, stat): + def get_model_info(self, + field: str, + model_name: str, + ckpt: Optional[os.PathLike], + stat: Optional[os.PathLike]): """get model information Args: - step (string): am or voc - model_name (string): model type, support fastspeech2, higigan, mb_melgan - ckpt (string): ckpt file - stat (string): stat file, including mean and standard deviation + field (str): am or voc + model_name (str): model type, support fastspeech2, higigan, mb_melgan + ckpt (Optional[os.PathLike]): ckpt file + stat (Optional[os.PathLike]): stat file, including mean and standard deviation 
Returns: - model, model_mu, model_std + [module]: model module + [Tensor]: mean + [Tensor]: standard deviation """ + model_class = dynamic_import(model_name, model_alias) - if step == "am": + if field == "am": odim = self.am_config.n_mels model = model_class( idim=self.vocab_size, odim=odim, **self.am_config["model"]) model.set_state_dict(paddle.load(ckpt)["main_params"]) - elif step == "voc": + elif field == "voc": model = model_class(**self.voc_config["generator_params"]) model.set_state_dict(paddle.load(ckpt)["generator_params"]) model.remove_weight_norm() else: - logger.error("Please set correct step, am or voc") + logger.error("Please set correct field, am or voc") model.eval() model_mu, model_std = np.load(stat) @@ -346,7 +353,8 @@ class TTSServerExecutor(TTSExecutor): voc_block = self.voc_block voc_pad = self.voc_pad voc_upsample = self.voc_config.n_shift - flag = 1 + # first_flag 用于标记首包 + first_flag = 1 get_tone_ids = False merge_sentences = False @@ -376,7 +384,7 @@ class TTSServerExecutor(TTSExecutor): if am == "fastspeech2_csmsc": # am mel = self.am_inference(part_phone_ids) - if flag == 1: + if first_flag == 1: first_am_et = time.time() self.first_am_infer = first_am_et - frontend_et @@ -388,11 +396,11 @@ class TTSServerExecutor(TTSExecutor): sub_wav = self.voc_inference(mel_chunk) sub_wav = self.depadding(sub_wav, voc_chunk_num, i, voc_block, voc_pad, voc_upsample) - if flag == 1: + if first_flag == 1: first_voc_et = time.time() self.first_voc_infer = first_voc_et - first_am_et self.first_response_time = first_voc_et - frontend_st - flag = 0 + first_flag = 0 yield sub_wav @@ -427,9 +435,10 @@ class TTSServerExecutor(TTSExecutor): (mel_streaming, sub_mel), axis=0) # streaming voc + # 当流式AM推理的mel帧数大于流式voc推理的chunk size,开始进行流式voc 推理 while (mel_streaming.shape[0] >= end and voc_chunk_id < voc_chunk_num): - if flag == 1: + if first_flag == 1: first_am_et = time.time() self.first_am_infer = first_am_et - frontend_et voc_chunk = mel_streaming[start:end, :] @@ -439,11 +448,11 @@ class TTSServerExecutor(TTSExecutor): sub_wav = self.depadding(sub_wav, voc_chunk_num, voc_chunk_id, voc_block, voc_pad, voc_upsample) - if flag == 1: + if first_flag == 1: first_voc_et = time.time() self.first_voc_infer = first_voc_et - first_am_et self.first_response_time = first_voc_et - frontend_st - flag = 0 + first_flag = 0 yield sub_wav @@ -470,7 +479,8 @@ class TTSEngine(BaseEngine): def __init__(self, name=None): """Initialize TTS server engine """ - super(TTSEngine, self).__init__() + #super(TTSEngine, self).__init__() + super().__init__() def init(self, config: dict) -> bool: self.config = config From 9e41ac8550b5f53b77ce3656e3561c58e0f25a82 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Tue, 19 Apr 2022 15:51:44 +0800 Subject: [PATCH 7/7] code format, test=doc --- paddlespeech/server/engine/tts/online/tts_engine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paddlespeech/server/engine/tts/online/tts_engine.py b/paddlespeech/server/engine/tts/online/tts_engine.py index a84644e7..c9135b88 100644 --- a/paddlespeech/server/engine/tts/online/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/tts_engine.py @@ -479,7 +479,6 @@ class TTSEngine(BaseEngine): def __init__(self, name=None): """Initialize TTS server engine """ - #super(TTSEngine, self).__init__() super().__init__() def init(self, config: dict) -> bool:
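Editor's note (illustration only, not part of the patch series): the streaming TTS path added above rests on two small helpers, get_chunks() and depadding(). get_chunks() slices the mel spectrogram into blocks of voc_block frames, each widened by up to voc_pad frames of context on both sides; the vocoder runs on every padded chunk; depadding() then drops the samples that were produced only from the pad context before the chunks are concatenated or yielded. The sketch below mirrors that bookkeeping with NumPy alone. The function names follow the patch, but the code is simplified (2-D mel only, no "am"/"voc" step argument), and toy_vocoder is a hypothetical stand-in that merely upsamples each frame, so streaming and non-streaming outputs match exactly.

import math

import numpy as np


def get_chunks(mel, block_size, pad_size):
    # Split mel (frames x bins) into ceil(frames / block_size) chunks,
    # each extended by up to pad_size frames of context left and right.
    n_frames = mel.shape[0]
    n_chunks = math.ceil(n_frames / block_size)
    chunks = []
    for i in range(n_chunks):
        start = max(0, i * block_size - pad_size)
        end = min((i + 1) * block_size + pad_size, n_frames)
        chunks.append(mel[start:end, :])
    return chunks


def depadding(data, chunk_num, chunk_id, block, pad, upsample):
    # Keep only the samples belonging to this chunk's own block of frames,
    # discarding whatever was synthesized from the pad context.
    front_pad = min(chunk_id * block, pad)
    if chunk_id == 0:  # first chunk: nothing was padded on the left
        return data[:block * upsample]
    if chunk_id == chunk_num - 1:  # last chunk: keep everything after the left pad
        return data[front_pad * upsample:]
    return data[front_pad * upsample:(front_pad + block) * upsample]


def toy_vocoder(mel_chunk, upsample):
    # Stand-in "vocoder": one mel frame -> `upsample` identical samples.
    # A real vocoder (HiFi-GAN, MB-MelGAN) maps frames to samples with a wide
    # receptive field, which is exactly why the pad context exists.
    return np.repeat(mel_chunk.mean(axis=1), upsample)


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    frames, bins, upsample = 123, 80, 300  # e.g. n_shift = 300 samples per frame
    block, pad = 14, 14                    # voc_block / voc_pad defaults from run.sh
    mel = rng.standard_normal((frames, bins)).astype("float32")

    reference = toy_vocoder(mel, upsample)  # non-streaming synthesis

    chunks = get_chunks(mel, block, pad)
    parts = [
        depadding(toy_vocoder(c, upsample), len(chunks), i, block, pad, upsample)
        for i, c in enumerate(chunks)
    ]
    streaming = np.concatenate(parts)

    print(streaming.shape == reference.shape)  # True
    print(np.allclose(streaming, reference))   # True for this frame-local vocoder

With a real vocoder the two outputs are not guaranteed to be bit-identical; the pad frames give each chunk enough receptive-field context that the difference becomes negligible, which is what the run.sh comments above mean when they say certain pad sizes leave "no diff" between streaming and non-streaming results.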