diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py index 2878c852b..20b98fae6 100644 --- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py +++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -247,11 +247,13 @@ class TTSServerExecutor(TTSExecutor): else: # multi speaker do not have static model if am_dataset in {"aishell3", "vctk"}: - pass + am_result = run_model( + self.am_predictor, + [part_phone_ids.numpy(), np.array([spk_id])]) else: am_result = run_model(self.am_predictor, [part_phone_ids.numpy()]) - mel = am_result[0] + mel = am_result[0] self.am_time += (time.time() - am_st) # voc diff --git a/paddlespeech/t2s/exps/vits/normalize.py b/paddlespeech/t2s/exps/vits/normalize.py index 5881ae95c..514cbef8e 100644 --- a/paddlespeech/t2s/exps/vits/normalize.py +++ b/paddlespeech/t2s/exps/vits/normalize.py @@ -187,7 +187,7 @@ def main(): record["spk_emb"] = str(item["spk_emb"]) output_metadata.append(record) - output_metadata.sort(key=itemgetter('utt_id')) + output_metadata.sort(key=itemgetter('feats_lengths')) output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" with jsonlines.open(output_metadata_path, 'w') as writer: for item in output_metadata: diff --git a/paddlespeech/t2s/exps/vits/preprocess.py b/paddlespeech/t2s/exps/vits/preprocess.py index f89ab356f..2b1a40834 100644 --- a/paddlespeech/t2s/exps/vits/preprocess.py +++ b/paddlespeech/t2s/exps/vits/preprocess.py @@ -166,7 +166,7 @@ def process_sentences(config, if record: results.append(record) - results.sort(key=itemgetter("utt_id")) + results.sort(key=itemgetter("feats_lengths")) with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer: for item in results: writer.write(item) diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index c994faa5a..c0238a98a 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -24,13 +24,13 @@ import yaml from paddle import DataParallel from paddle import distributed as dist from paddle.io import DataLoader -from paddle.io import DistributedBatchSampler from paddle.optimizer import Adam from yacs.config import CfgNode from paddlespeech.t2s.datasets.am_batch_fn import vits_multi_spk_batch_fn from paddlespeech.t2s.datasets.am_batch_fn import vits_single_spk_batch_fn from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.datasets.sampler import ErnieSATSampler from paddlespeech.t2s.models.vits import VITS from paddlespeech.t2s.models.vits import VITSEvaluator from paddlespeech.t2s.models.vits import VITSUpdater @@ -107,12 +107,12 @@ def train_sp(args, config): converters=converters, ) # collate function and dataloader - train_sampler = DistributedBatchSampler( + train_sampler = ErnieSATSampler( train_dataset, batch_size=config.batch_size, shuffle=True, drop_last=True) - dev_sampler = DistributedBatchSampler( + dev_sampler = ErnieSATSampler( dev_dataset, batch_size=config.batch_size, shuffle=False, diff --git a/paddlespeech/t2s/modules/transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py index d7a032445..e3c9a992a 100644 --- a/paddlespeech/t2s/modules/transformer/attention.py +++ b/paddlespeech/t2s/modules/transformer/attention.py @@ -196,7 +196,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): if self.zero_triu: ones = paddle.ones((t1, t2)) - x = x * paddle.tril(ones, t2 - 1)[None, None, :, :] + x = x * paddle.tril(ones, t2 - t1)[None, None, :, :] return x @@ -299,7 +299,7 @@ class LegacyRelPositionMultiHeadedAttention(MultiHeadedAttention): if self.zero_triu: ones = paddle.ones((t1, t2)) - x = x * paddle.tril(ones, t2 - 1)[None, None, :, :] + x = x * paddle.tril(ones, t2 - t1)[None, None, :, :] return x diff --git a/setup.py b/setup.py index e55f8454d..4f6d10d7c 100644 --- a/setup.py +++ b/setup.py @@ -44,8 +44,9 @@ base = [ "loguru", "matplotlib", "nara_wpe", - "onnxruntime==1.11.0", + "onnxruntime>=1.11.0", "opencc", + "opencc-python-reimplemented", "pandas", "paddlenlp>=2.4.8", "paddlespeech_feat",