From 2e51e0da9033c96741eff421d439807e97f04525 Mon Sep 17 00:00:00 2001 From: HuangLiangJie Date: Tue, 27 Dec 2022 18:21:24 +0800 Subject: [PATCH 1/3] [TTS]Fix attention bugs and sort VITS data with feats_lengths (#2770) --- paddlespeech/t2s/exps/vits/normalize.py | 2 +- paddlespeech/t2s/exps/vits/preprocess.py | 2 +- paddlespeech/t2s/exps/vits/train.py | 6 +++--- paddlespeech/t2s/modules/transformer/attention.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddlespeech/t2s/exps/vits/normalize.py b/paddlespeech/t2s/exps/vits/normalize.py index 5881ae95c..514cbef8e 100644 --- a/paddlespeech/t2s/exps/vits/normalize.py +++ b/paddlespeech/t2s/exps/vits/normalize.py @@ -187,7 +187,7 @@ def main(): record["spk_emb"] = str(item["spk_emb"]) output_metadata.append(record) - output_metadata.sort(key=itemgetter('utt_id')) + output_metadata.sort(key=itemgetter('feats_lengths')) output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" with jsonlines.open(output_metadata_path, 'w') as writer: for item in output_metadata: diff --git a/paddlespeech/t2s/exps/vits/preprocess.py b/paddlespeech/t2s/exps/vits/preprocess.py index f89ab356f..2b1a40834 100644 --- a/paddlespeech/t2s/exps/vits/preprocess.py +++ b/paddlespeech/t2s/exps/vits/preprocess.py @@ -166,7 +166,7 @@ def process_sentences(config, if record: results.append(record) - results.sort(key=itemgetter("utt_id")) + results.sort(key=itemgetter("feats_lengths")) with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer: for item in results: writer.write(item) diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index c994faa5a..c0238a98a 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -24,13 +24,13 @@ import yaml from paddle import DataParallel from paddle import distributed as dist from paddle.io import DataLoader -from paddle.io import DistributedBatchSampler from paddle.optimizer import Adam from yacs.config import CfgNode from paddlespeech.t2s.datasets.am_batch_fn import vits_multi_spk_batch_fn from paddlespeech.t2s.datasets.am_batch_fn import vits_single_spk_batch_fn from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.datasets.sampler import ErnieSATSampler from paddlespeech.t2s.models.vits import VITS from paddlespeech.t2s.models.vits import VITSEvaluator from paddlespeech.t2s.models.vits import VITSUpdater @@ -107,12 +107,12 @@ def train_sp(args, config): converters=converters, ) # collate function and dataloader - train_sampler = DistributedBatchSampler( + train_sampler = ErnieSATSampler( train_dataset, batch_size=config.batch_size, shuffle=True, drop_last=True) - dev_sampler = DistributedBatchSampler( + dev_sampler = ErnieSATSampler( dev_dataset, batch_size=config.batch_size, shuffle=False, diff --git a/paddlespeech/t2s/modules/transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py index d7a032445..e3c9a992a 100644 --- a/paddlespeech/t2s/modules/transformer/attention.py +++ b/paddlespeech/t2s/modules/transformer/attention.py @@ -196,7 +196,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): if self.zero_triu: ones = paddle.ones((t1, t2)) - x = x * paddle.tril(ones, t2 - 1)[None, None, :, :] + x = x * paddle.tril(ones, t2 - t1)[None, None, :, :] return x @@ -299,7 +299,7 @@ class LegacyRelPositionMultiHeadedAttention(MultiHeadedAttention): if self.zero_triu: ones = paddle.ones((t1, t2)) - x = x * paddle.tril(ones, t2 - 1)[None, None, :, :] + x = x * paddle.tril(ones, t2 - t1)[None, None, :, :] return x From 1e8394202c38882db82c07fabfcb49ee559dbaac Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 28 Dec 2022 14:51:55 +0800 Subject: [PATCH 2/3] fix onnxruntime's version (#2771) --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e55f8454d..4f6d10d7c 100644 --- a/setup.py +++ b/setup.py @@ -44,8 +44,9 @@ base = [ "loguru", "matplotlib", "nara_wpe", - "onnxruntime==1.11.0", + "onnxruntime>=1.11.0", "opencc", + "opencc-python-reimplemented", "pandas", "paddlenlp>=2.4.8", "paddlespeech_feat", From 96d76c83ade6dad94738ef4ae1a095dec3f858b3 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Wed, 28 Dec 2022 21:58:44 +0800 Subject: [PATCH 3/3] multi-spk tts static model (#2779) * updata readme, test=doc * update yaml and readme, test=tts * fix batch_size, test=tts * update readme, test=doc * chmod, test=tts * add multi-spk tts static model infer on server, test=tts --- .../server/engine/tts/paddleinference/tts_engine.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py index 2878c852b..20b98fae6 100644 --- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py +++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -247,11 +247,13 @@ class TTSServerExecutor(TTSExecutor): else: # multi speaker do not have static model if am_dataset in {"aishell3", "vctk"}: - pass + am_result = run_model( + self.am_predictor, + [part_phone_ids.numpy(), np.array([spk_id])]) else: am_result = run_model(self.am_predictor, [part_phone_ids.numpy()]) - mel = am_result[0] + mel = am_result[0] self.am_time += (time.time() - am_st) # voc