From 02679906e649bf123c15e988bf84facd885aa7ee Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 14 Sep 2022 15:22:25 +0800 Subject: [PATCH 1/7] Update tts_papers.md --- docs/source/tts/tts_papers.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/tts/tts_papers.md b/docs/source/tts/tts_papers.md index 681b2106..f3ca1b62 100644 --- a/docs/source/tts/tts_papers.md +++ b/docs/source/tts/tts_papers.md @@ -5,6 +5,7 @@ - [Disambiguation of Chinese Polyphones in an End-to-End Framework with Semantic Features Extracted by Pre-trained BERT](https://www1.se.cuhk.edu.hk/~hccl/publications/pub/201909_INTERSPEECH_DongyangDAI.pdf) - [Polyphone Disambiguation in Mandarin Chinese with Semi-Supervised Learning](https://www.isca-speech.org/archive/pdfs/interspeech_2021/shi21d_interspeech.pdf) * github: https://github.com/PaperMechanica/SemiPPL +- [WikipediaHomographData](https://github.com/google-research-datasets/WikipediaHomographData) ### Text Normalization #### English - [applenob/text_normalization](https://github.com/applenob/text_normalization) From 324b166c5293323082e2c326d728618fd05fcac0 Mon Sep 17 00:00:00 2001 From: WongLaw <95171490+WongLaw@users.noreply.github.com> Date: Wed, 14 Sep 2022 16:11:12 +0800 Subject: [PATCH 2/7] Removed useless spk_id in speech_server and streaming_tts_server from demos, and support bilingual server engine, test=tts (#2380) * Removed useless spk_id in speech_server and streaming_tts_server from demos, and support bilingual server engine. --- demos/speech_server/conf/application.yaml | 4 ++-- demos/streaming_tts_server/conf/tts_online_application.yaml | 3 +-- .../streaming_tts_server/conf/tts_online_ws_application.yaml | 3 +-- paddlespeech/server/engine/engine_warmup.py | 4 +++- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/demos/speech_server/conf/application.yaml b/demos/speech_server/conf/application.yaml index 9c171c47..b5ee8009 100644 --- a/demos/speech_server/conf/application.yaml +++ b/demos/speech_server/conf/application.yaml @@ -61,7 +61,7 @@ tts_python: phones_dict: tones_dict: speaker_dict: - spk_id: 0 + # voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', # 'pwgan_vctk', 'mb_melgan_csmsc', 'style_melgan_csmsc', @@ -87,7 +87,7 @@ tts_inference: phones_dict: tones_dict: speaker_dict: - spk_id: 0 + am_predictor_conf: device: # set 'gpu:id' or 'cpu' diff --git a/demos/streaming_tts_server/conf/tts_online_application.yaml b/demos/streaming_tts_server/conf/tts_online_application.yaml index e617912f..f5ec9dc8 100644 --- a/demos/streaming_tts_server/conf/tts_online_application.yaml +++ b/demos/streaming_tts_server/conf/tts_online_application.yaml @@ -29,7 +29,7 @@ tts_online: phones_dict: tones_dict: speaker_dict: - spk_id: 0 + # voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc'] # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference @@ -70,7 +70,6 @@ tts_online-onnx: phones_dict: tones_dict: speaker_dict: - spk_id: 0 am_sample_rate: 24000 am_sess_conf: device: "cpu" # set 'gpu:id' or 'cpu' diff --git a/demos/streaming_tts_server/conf/tts_online_ws_application.yaml b/demos/streaming_tts_server/conf/tts_online_ws_application.yaml index 329f882c..c6563391 100644 --- a/demos/streaming_tts_server/conf/tts_online_ws_application.yaml +++ b/demos/streaming_tts_server/conf/tts_online_ws_application.yaml @@ -29,7 +29,7 @@ tts_online: phones_dict: tones_dict: speaker_dict: - spk_id: 0 + # voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc'] # Both mb_melgan_csmsc and hifigan_csmsc support 
streaming voc inference
@@ -70,7 +70,6 @@ tts_online-onnx:
     phones_dict:
     tones_dict:
     speaker_dict:
-    spk_id: 0
     am_sample_rate: 24000
     am_sess_conf:
         device: "cpu" # set 'gpu:id' or 'cpu'
diff --git a/paddlespeech/server/engine/engine_warmup.py b/paddlespeech/server/engine/engine_warmup.py
index 3751554c..ff65dff9 100644
--- a/paddlespeech/server/engine/engine_warmup.py
+++ b/paddlespeech/server/engine/engine_warmup.py
@@ -27,8 +27,10 @@ def warm_up(engine_and_type: str, warm_up_time: int=3) -> bool:
         sentence = "您好,欢迎使用语音合成服务。"
     elif tts_engine.lang == 'en':
         sentence = "Hello and welcome to the speech synthesis service."
+    elif tts_engine.lang == 'mix':
+        sentence = "您好,欢迎使用TTS多语种服务。"
     else:
-        logger.error("tts engine only support lang: zh or en.")
+        logger.error("tts engine only supports lang: zh, en or mix.")
         sys.exit(-1)
 
     if engine_and_type == "tts_python":

From cdf095595f0398ac0fb20d9cd6f80672c5c00d0c Mon Sep 17 00:00:00 2001
From: liangym <34430015+lym0302@users.noreply.github.com>
Date: Thu, 15 Sep 2022 15:47:59 +0800
Subject: [PATCH 3/7] [tts] finetune add frozen (#2385)

* finetune add frozen
---
 examples/other/tts_finetune/tts3/README.md    |   9 +
 examples/other/tts_finetune/tts3/finetune.py  |  43 ++++-
 .../other/tts_finetune/tts3/finetune.yaml     |  12 ++
 .../other/tts_finetune/tts3/local/extract.py  |   7 +-
 .../other/tts_finetune/tts3/local/train.py    | 178 ++++++++++++++++++
 examples/other/tts_finetune/tts3/run.sh       |  12 +-
 6 files changed, 242 insertions(+), 19 deletions(-)
 create mode 100644 examples/other/tts_finetune/tts3/finetune.yaml
 create mode 100644 examples/other/tts_finetune/tts3/local/train.py

diff --git a/examples/other/tts_finetune/tts3/README.md b/examples/other/tts_finetune/tts3/README.md
index 1ad30328..192ee7ff 100644
--- a/examples/other/tts_finetune/tts3/README.md
+++ b/examples/other/tts_finetune/tts3/README.md
@@ -75,6 +75,15 @@ When "Prepare" done. The structure of the current directory is listed below.
 ```
 
+### Set finetune.yaml
+`finetune.yaml` contains some configurations for fine-tuning. You can try various options to get a better fine-tuning result.
+Arguments:
+ - `batch_size`: fine-tuning batch size. Default: -1, which means 64, the same as the pretrained model.
+ - `learning_rate`: learning rate. Default: 0.0001.
+ - `num_snapshots`: number of saved models. Default: -1, which means 5, the same as the pretrained model.
+ - `frozen_layers`: layers to freeze. Must be a list. If you don't want to freeze any layer, set it to [].
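For context, the freezing requested by `frozen_layers` is implemented by the `freeze_layer` helper added in `local/train.py` later in this patch: every parameter under the named sublayers is marked non-trainable, so the optimizer leaves those weights untouched during fine-tuning. Below is a minimal sketch of that idea, not the patch's exact code: it uses a `getattr`-based lookup instead of the patch's `eval`-based one (which also accepts dotted paths such as `encoder.embed`), and the `freeze_layers` name and example layer names are only illustrative.

```python
from typing import List

import paddle


def freeze_layers(model: paddle.nn.Layer, layer_names: List[str]) -> None:
    # Mark every parameter under each named sublayer as non-trainable;
    # the optimizer then skips these weights during fine-tuning.
    for name in layer_names:
        sublayer = getattr(model, name)  # e.g. model.encoder
        for param in sublayer.parameters():
            param.trainable = False


# Usage sketch, matching the defaults in finetune.yaml:
# freeze_layers(model, ["encoder", "duration_predictor"])
```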
+
+
 ## Get Started
 Run the command below to
diff --git a/examples/other/tts_finetune/tts3/finetune.py b/examples/other/tts_finetune/tts3/finetune.py
index 0f060b44..207e2dbc 100644
--- a/examples/other/tts_finetune/tts3/finetune.py
+++ b/examples/other/tts_finetune/tts3/finetune.py
@@ -14,6 +14,7 @@
 import argparse
 import os
 from pathlib import Path
+from typing import List
 from typing import Union
 
 import yaml
@@ -21,10 +22,10 @@ from local.check_oov import get_check_result
 from local.extract import extract_feature
 from local.label_process import get_single_label
 from local.prepare_env import generate_finetune_env
+from local.train import train_sp
 from paddle import distributed as dist
 from yacs.config import CfgNode
 
-from paddlespeech.t2s.exps.fastspeech2.train import train_sp
 from utils.gen_duration_from_textgrid import gen_duration_from_textgrid
 
 DICT_EN = 'tools/aligner/cmudict-0.7b'
@@ -38,15 +39,24 @@ os.environ['PATH'] = MFA_PATH + '/:' + os.environ['PATH']
 
 
 class TrainArgs():
-    def __init__(self, ngpu, config_file, dump_dir: Path, output_dir: Path):
+    def __init__(self,
+                 ngpu,
+                 config_file,
+                 dump_dir: Path,
+                 output_dir: Path,
+                 frozen_layers: List[str]):
+        # config: fastspeech2 config file.
         self.config = str(config_file)
         self.train_metadata = str(dump_dir / "train/norm/metadata.jsonl")
         self.dev_metadata = str(dump_dir / "dev/norm/metadata.jsonl")
+        # model output dir.
         self.output_dir = str(output_dir)
         self.ngpu = ngpu
         self.phones_dict = str(dump_dir / "phone_id_map.txt")
         self.speaker_dict = str(dump_dir / "speaker_id_map.txt")
         self.voice_cloning = False
+        # frozen layers
+        self.frozen_layers = frozen_layers
 
 
 def get_mfa_result(
@@ -122,12 +132,12 @@ if __name__ == '__main__':
         "--ngpu", type=int, default=2, help="if ngpu=0, use cpu.")
     parser.add_argument("--epoch", type=int, default=100, help="finetune epoch")
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=-1,
-        help="batch size, default -1 means same as pretrained model")
+    parser.add_argument(
+        "--finetune_config",
+        type=str,
+        default="./finetune.yaml",
+        help="Path to finetune config file")
 
     args = parser.parse_args()
 
@@ -147,8 +156,14 @@ if __name__ == '__main__':
     with open(config_file) as f:
         config = CfgNode(yaml.safe_load(f))
     config.max_epoch = config.max_epoch + args.epoch
-    if args.batch_size > 0:
-        config.batch_size = args.batch_size
+
+    with open(args.finetune_config) as f2:
+        finetune_config = CfgNode(yaml.safe_load(f2))
+    config.batch_size = finetune_config.batch_size if finetune_config.batch_size > 0 else config.batch_size
+    config.optimizer.learning_rate = finetune_config.learning_rate if finetune_config.learning_rate > 0 else config.optimizer.learning_rate
+    config.num_snapshots = finetune_config.num_snapshots if finetune_config.num_snapshots > 0 else config.num_snapshots
+    frozen_layers = finetune_config.frozen_layers
+    assert isinstance(frozen_layers, list), "frozen_layers should be a list."
if args.lang == 'en': lexicon_file = DICT_EN @@ -158,6 +173,13 @@ if __name__ == '__main__': mfa_phone_file = MFA_PHONE_ZH else: print('please input right lang!!') + + print(f"finetune max_epoch: {config.max_epoch}") + print(f"finetune batch_size: {config.batch_size}") + print(f"finetune learning_rate: {config.optimizer.learning_rate}") + print(f"finetune num_snapshots: {config.num_snapshots}") + print(f"finetune frozen_layers: {frozen_layers}") + am_phone_file = pretrained_model_dir / "phone_id_map.txt" label_file = input_dir / "labels.txt" @@ -181,7 +203,8 @@ if __name__ == '__main__': generate_finetune_env(output_dir, pretrained_model_dir) # create a new args for training - train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir) + train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir, + frozen_layers) # finetune models # dispatch diff --git a/examples/other/tts_finetune/tts3/finetune.yaml b/examples/other/tts_finetune/tts3/finetune.yaml new file mode 100644 index 00000000..374a69f3 --- /dev/null +++ b/examples/other/tts_finetune/tts3/finetune.yaml @@ -0,0 +1,12 @@ +########################################################### +# PARAS SETTING # +########################################################### +# Set to -1 to indicate that the parameter is the same as the pretrained model configuration + +batch_size: -1 +learning_rate: 0.0001 # learning rate +num_snapshots: -1 + +# frozen_layers should be a list +# if you don't need to freeze, set frozen_layers to [] +frozen_layers: ["encoder", "duration_predictor"] diff --git a/examples/other/tts_finetune/tts3/local/extract.py b/examples/other/tts_finetune/tts3/local/extract.py index edd92420..630b58ce 100644 --- a/examples/other/tts_finetune/tts3/local/extract.py +++ b/examples/other/tts_finetune/tts3/local/extract.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -import math import os from operator import itemgetter from pathlib import Path @@ -211,9 +210,9 @@ def extract_feature(duration_file: str, mel_extractor, pitch_extractor, energy_extractor = get_extractor(config) wav_files = sorted(list((input_dir).rglob("*.wav"))) - # split data into 3 sections, train: 80%, dev: 10%, test: 10% - num_train = math.ceil(len(wav_files) * 0.8) - num_dev = math.ceil(len(wav_files) * 0.1) + # split data into 3 sections, train: len(wav_files) - 2, dev: 1, test: 1 + num_train = len(wav_files) - 2 + num_dev = 1 print(num_train, num_dev) train_wav_files = wav_files[:num_train] diff --git a/examples/other/tts_finetune/tts3/local/train.py b/examples/other/tts_finetune/tts3/local/train.py new file mode 100644 index 00000000..d065ae59 --- /dev/null +++ b/examples/other/tts_finetune/tts3/local/train.py @@ -0,0 +1,178 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +import os +import shutil +from pathlib import Path +from typing import List + +import jsonlines +import numpy as np +import paddle +from paddle import DataParallel +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler + +from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_multi_spk_batch_fn +from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Evaluator +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Updater +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.optimizer import build_optimizers +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer + + +def freeze_layer(model, layers: List[str]): + """freeze layers + + Args: + layers (List[str]): frozen layers + """ + for layer in layers: + for param in eval("model." + layer + ".parameters()"): + param.trainable = False + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("gpu") + world_size = paddle.distributed.get_world_size() + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + fields = [ + "text", "text_lengths", "speech", "speech_lengths", "durations", + "pitch", "energy" + ] + converters = {"speech": np.load, "pitch": np.load, "energy": np.load} + spk_num = None + if args.speaker_dict is not None: + print("multiple speaker fastspeech2!") + collate_fn = fastspeech2_multi_spk_batch_fn + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id) + fields += ["spk_id"] + elif args.voice_cloning: + print("Training voice cloning!") + collate_fn = fastspeech2_multi_spk_batch_fn + fields += ["spk_emb"] + converters["spk_emb"] = np.load + else: + print("single speaker fastspeech2!") + collate_fn = fastspeech2_single_spk_batch_fn + print("spk_num:", spk_num) + + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + # construct dataset for training and validation + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) + train_dataset = DataTable( + data=train_metadata, + fields=fields, + converters=converters, ) + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) + dev_dataset = DataTable( + data=dev_metadata, + fields=fields, + converters=converters, ) + + # collate function and dataloader + + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + + print("samplers done!") + + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=collate_fn, + num_workers=config.num_workers) + + dev_dataloader = DataLoader( + dev_dataset, + shuffle=False, + drop_last=False, + batch_size=config.batch_size, + 
collate_fn=collate_fn, + num_workers=config.num_workers) + print("dataloaders done!") + + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + odim = config.n_mels + model = FastSpeech2( + idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"]) + + # freeze layer + if args.frozen_layers != []: + freeze_layer(model, args.frozen_layers) + + if world_size > 1: + model = DataParallel(model) + print("model done!") + + optimizer = build_optimizers(model, **config["optimizer"]) + print("optimizer done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / config_name) + + updater = FastSpeech2Updater( + model=model, + optimizer=optimizer, + dataloader=train_dataloader, + output_dir=output_dir, + **config["updater"]) + + trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) + + evaluator = FastSpeech2Evaluator( + model, dev_dataloader, output_dir=output_dir, **config["updater"]) + + if dist.get_rank() == 0: + trainer.extend(evaluator, trigger=(1, "epoch")) + trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) + trainer.run() diff --git a/examples/other/tts_finetune/tts3/run.sh b/examples/other/tts_finetune/tts3/run.sh index 9bb7ec6f..9c877e64 100755 --- a/examples/other/tts_finetune/tts3/run.sh +++ b/examples/other/tts_finetune/tts3/run.sh @@ -10,11 +10,12 @@ mfa_dir=./mfa_result dump_dir=./dump output_dir=./exp/default lang=zh -ngpu=2 +ngpu=1 +finetune_config=./finetune.yaml -ckpt=snapshot_iter_96600 +ckpt=snapshot_iter_96699 -gpus=0,1 +gpus=1 CUDA_VISIBLE_DEVICES=${gpus} stage=0 stop_stage=100 @@ -35,7 +36,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --output_dir=${output_dir} \ --lang=${lang} \ --ngpu=${ngpu} \ - --epoch=100 + --epoch=100 \ + --finetune_config=${finetune_config} fi @@ -54,7 +56,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ - --output_dir=./test_e2e \ + --output_dir=./test_e2e/ \ --phones_dict=${dump_dir}/phone_id_map.txt \ --speaker_dict=${dump_dir}/speaker_id_map.txt \ --spk_id=0 From 4ac206e22ff2c7c669e4b4c2b6f74f842020aca6 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Fri, 16 Sep 2022 02:38:17 +0000 Subject: [PATCH 4/7] update wenetspeech RESULT.md, test=doc --- examples/wenetspeech/asr1/RESULTS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md index af84a5f6..f22c652e 100644 --- a/examples/wenetspeech/asr1/RESULTS.md +++ b/examples/wenetspeech/asr1/RESULTS.md @@ -46,3 +46,10 @@ Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1 | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | 16 | 0.078918 | | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | 16 | 0.079080 | | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | 16 | 0.054401 | + +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER | +| --- | --- | --- | --- | --- | --- | 
--- | --- | +| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | -1 | 0.050767 | +| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | -1 | 0.061884 | +| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | -1 | 0.062056 | +| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | -1 | 0.052110 | From eac362057c3db60a2b60ef49eb51867187050a18 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 16 Sep 2022 16:00:52 +0800 Subject: [PATCH 5/7] add typehint for g2pw (#2390) --- paddlespeech/t2s/frontend/g2pw/__init__.py | 2 +- paddlespeech/t2s/frontend/g2pw/dataset.py | 66 +++++++++++----------- paddlespeech/t2s/frontend/g2pw/onnx_api.py | 50 +++++++++------- paddlespeech/t2s/frontend/g2pw/utils.py | 11 ++-- 4 files changed, 71 insertions(+), 58 deletions(-) diff --git a/paddlespeech/t2s/frontend/g2pw/__init__.py b/paddlespeech/t2s/frontend/g2pw/__init__.py index 0eaeee5d..89b3af3c 100644 --- a/paddlespeech/t2s/frontend/g2pw/__init__.py +++ b/paddlespeech/t2s/frontend/g2pw/__init__.py @@ -1 +1 @@ -from paddlespeech.t2s.frontend.g2pw.onnx_api import G2PWOnnxConverter +from .onnx_api import G2PWOnnxConverter diff --git a/paddlespeech/t2s/frontend/g2pw/dataset.py b/paddlespeech/t2s/frontend/g2pw/dataset.py index 98af5f46..8a1c2e0b 100644 --- a/paddlespeech/t2s/frontend/g2pw/dataset.py +++ b/paddlespeech/t2s/frontend/g2pw/dataset.py @@ -15,6 +15,10 @@ Credits This code is modified from https://github.com/GitYCC/g2pW """ +from typing import Dict +from typing import List +from typing import Tuple + import numpy as np from paddlespeech.t2s.frontend.g2pw.utils import tokenize_and_map @@ -23,22 +27,17 @@ ANCHOR_CHAR = '▁' def prepare_onnx_input(tokenizer, - labels, - char2phonemes, - chars, - texts, - query_ids, - phonemes=None, - pos_tags=None, - use_mask=False, - use_char_phoneme=False, - use_pos=False, - window_size=None, - max_len=512): + labels: List[str], + char2phonemes: Dict[str, List[int]], + chars: List[str], + texts: List[str], + query_ids: List[int], + use_mask: bool=False, + window_size: int=None, + max_len: int=512) -> Dict[str, np.array]: if window_size is not None: - truncated_texts, truncated_query_ids = _truncate_texts(window_size, - texts, query_ids) - + truncated_texts, truncated_query_ids = _truncate_texts( + window_size=window_size, texts=texts, query_ids=query_ids) input_ids = [] token_type_ids = [] attention_masks = [] @@ -51,13 +50,19 @@ def prepare_onnx_input(tokenizer, query_id = (truncated_query_ids if window_size else query_ids)[idx] try: - tokens, text2token, token2text = tokenize_and_map(tokenizer, text) + tokens, text2token, token2text = tokenize_and_map( + tokenizer=tokenizer, text=text) except Exception: print(f'warning: text "{text}" is invalid') return {} text, query_id, tokens, text2token, token2text = _truncate( - max_len, text, query_id, tokens, text2token, token2text) + max_len=max_len, + text=text, + query_id=query_id, + tokens=tokens, + text2token=text2token, + token2text=token2text) processed_tokens = ['[CLS]'] + tokens + ['[SEP]'] @@ -91,7 +96,8 @@ def prepare_onnx_input(tokenizer, return outputs -def _truncate_texts(window_size, texts, query_ids): +def _truncate_texts(window_size: int, texts: List[str], + query_ids: List[int]) -> Tuple[List[str], List[int]]: truncated_texts = [] truncated_query_ids = [] for text, query_id in zip(texts, query_ids): @@ -105,7 +111,12 @@ def _truncate_texts(window_size, 
texts, query_ids): return truncated_texts, truncated_query_ids -def _truncate(max_len, text, query_id, tokens, text2token, token2text): +def _truncate(max_len: int, + text: str, + query_id: int, + tokens: List[str], + text2token: List[int], + token2text: List[Tuple[int]]): truncate_len = max_len - 2 if len(tokens) <= truncate_len: return (text, query_id, tokens, text2token, token2text) @@ -132,18 +143,8 @@ def _truncate(max_len, text, query_id, tokens, text2token, token2text): ], [(s - start, e - start) for s, e in token2text[token_start:token_end]]) -def prepare_data(sent_path, lb_path=None): - raw_texts = open(sent_path).read().rstrip().split('\n') - query_ids = [raw.index(ANCHOR_CHAR) for raw in raw_texts] - texts = [raw.replace(ANCHOR_CHAR, '') for raw in raw_texts] - if lb_path is None: - return texts, query_ids - else: - phonemes = open(lb_path).read().rstrip().split('\n') - return texts, query_ids, phonemes - - -def get_phoneme_labels(polyphonic_chars): +def get_phoneme_labels(polyphonic_chars: List[List[str]] + ) -> Tuple[List[str], Dict[str, List[int]]]: labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars]))) char2phonemes = {} for char, phoneme in polyphonic_chars: @@ -153,7 +154,8 @@ def get_phoneme_labels(polyphonic_chars): return labels, char2phonemes -def get_char_phoneme_labels(polyphonic_chars): +def get_char_phoneme_labels(polyphonic_chars: List[List[str]] + ) -> Tuple[List[str], Dict[str, List[int]]]: labels = sorted( list(set([f'{char} {phoneme}' for char, phoneme in polyphonic_chars]))) char2phonemes = {} diff --git a/paddlespeech/t2s/frontend/g2pw/onnx_api.py b/paddlespeech/t2s/frontend/g2pw/onnx_api.py index 180e8ae1..ad32c405 100644 --- a/paddlespeech/t2s/frontend/g2pw/onnx_api.py +++ b/paddlespeech/t2s/frontend/g2pw/onnx_api.py @@ -17,6 +17,10 @@ Credits """ import json import os +from typing import Any +from typing import Dict +from typing import List +from typing import Tuple import numpy as np import onnxruntime @@ -37,7 +41,8 @@ from paddlespeech.utils.env import MODEL_HOME model_version = '1.1' -def predict(session, onnx_input, labels): +def predict(session, onnx_input: Dict[str, Any], + labels: List[str]) -> Tuple[List[str], List[float]]: all_preds = [] all_confidences = [] probs = session.run([], { @@ -61,10 +66,10 @@ def predict(session, onnx_input, labels): class G2PWOnnxConverter: def __init__(self, - model_dir=MODEL_HOME, - style='bopomofo', - model_source=None, - enable_non_tradional_chinese=False): + model_dir: os.PathLike=MODEL_HOME, + style: str='bopomofo', + model_source: str=None, + enable_non_tradional_chinese: bool=False): uncompress_path = download_and_decompress( g2pw_onnx_models['G2PWModel'][model_version], model_dir) @@ -76,7 +81,8 @@ class G2PWOnnxConverter: os.path.join(uncompress_path, 'g2pW.onnx'), sess_options=sess_options) self.config = load_config( - os.path.join(uncompress_path, 'config.py'), use_default=True) + config_path=os.path.join(uncompress_path, 'config.py'), + use_default=True) self.model_source = model_source if model_source else self.config.model_source self.enable_opencc = enable_non_tradional_chinese @@ -103,9 +109,9 @@ class G2PWOnnxConverter: .strip().split('\n') ] self.labels, self.char2phonemes = get_char_phoneme_labels( - self.polyphonic_chars + polyphonic_chars=self.polyphonic_chars ) if self.config.use_char_phoneme else get_phoneme_labels( - self.polyphonic_chars) + polyphonic_chars=self.polyphonic_chars) self.chars = sorted(list(self.char2phonemes.keys())) @@ -146,7 +152,7 @@ class 
G2PWOnnxConverter: if self.enable_opencc: self.cc = OpenCC('s2tw') - def _convert_bopomofo_to_pinyin(self, bopomofo): + def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str: tone = bopomofo[-1] assert tone in '12345' component = self.bopomofo_convert_dict.get(bopomofo[:-1]) @@ -156,7 +162,7 @@ class G2PWOnnxConverter: print(f'Warning: "{bopomofo}" cannot convert to pinyin') return None - def __call__(self, sentences): + def __call__(self, sentences: List[str]) -> List[List[str]]: if isinstance(sentences, str): sentences = [sentences] @@ -169,23 +175,25 @@ class G2PWOnnxConverter: sentences = translated_sentences texts, query_ids, sent_ids, partial_results = self._prepare_data( - sentences) + sentences=sentences) if len(texts) == 0: # sentences no polyphonic words return partial_results onnx_input = prepare_onnx_input( - self.tokenizer, - self.labels, - self.char2phonemes, - self.chars, - texts, - query_ids, + tokenizer=self.tokenizer, + labels=self.labels, + char2phonemes=self.char2phonemes, + chars=self.chars, + texts=texts, + query_ids=query_ids, use_mask=self.config.use_mask, - use_char_phoneme=self.config.use_char_phoneme, window_size=None) - preds, confidences = predict(self.session_g2pW, onnx_input, self.labels) + preds, confidences = predict( + session=self.session_g2pW, + onnx_input=onnx_input, + labels=self.labels) if self.config.use_char_phoneme: preds = [pred.split(' ')[1] for pred in preds] @@ -195,7 +203,9 @@ class G2PWOnnxConverter: return results - def _prepare_data(self, sentences): + def _prepare_data( + self, sentences: List[str] + ) -> Tuple[List[str], List[int], List[int], List[List[str]]]: texts, query_ids, sent_ids, partial_results = [], [], [], [] for sent_id, sent in enumerate(sentences): # pypinyin works well for Simplified Chinese than Traditional Chinese diff --git a/paddlespeech/t2s/frontend/g2pw/utils.py b/paddlespeech/t2s/frontend/g2pw/utils.py index ad02c4c1..ba9ce51b 100644 --- a/paddlespeech/t2s/frontend/g2pw/utils.py +++ b/paddlespeech/t2s/frontend/g2pw/utils.py @@ -15,10 +15,11 @@ Credits This code is modified from https://github.com/GitYCC/g2pW """ +import os import re -def wordize_and_map(text): +def wordize_and_map(text: str): words = [] index_map_from_text_to_word = [] index_map_from_word_to_text = [] @@ -54,8 +55,8 @@ def wordize_and_map(text): return words, index_map_from_text_to_word, index_map_from_word_to_text -def tokenize_and_map(tokenizer, text): - words, text2word, word2text = wordize_and_map(text) +def tokenize_and_map(tokenizer, text: str): + words, text2word, word2text = wordize_and_map(text=text) tokens = [] index_map_from_token_to_text = [] @@ -82,7 +83,7 @@ def tokenize_and_map(tokenizer, text): return tokens, index_map_from_text_to_token, index_map_from_token_to_text -def _load_config(config_path): +def _load_config(config_path: os.PathLike): import importlib.util spec = importlib.util.spec_from_file_location('__init__', config_path) config = importlib.util.module_from_spec(spec) @@ -130,7 +131,7 @@ default_config_dict = { } -def load_config(config_path, use_default=False): +def load_config(config_path: os.PathLike, use_default: bool=False): config = _load_config(config_path) if use_default: for attr, val in default_config_dict.items(): From e6cbcca3e220b3b2ae869055f0771b48958b512b Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 16 Sep 2022 16:23:47 +0800 Subject: [PATCH 6/7] fix ERNIE-SAT README, test=doc (#2392) --- examples/aishell3/ernie_sat/README.md | 13 ++++++------- examples/aishell3_vctk/ernie_sat/README.md | 13 
++++++-------
 examples/vctk/ernie_sat/README.md          | 11 +++++------
 3 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/examples/aishell3/ernie_sat/README.md b/examples/aishell3/ernie_sat/README.md
index 707ee138..eb867ab7 100644
--- a/examples/aishell3/ernie_sat/README.md
+++ b/examples/aishell3/ernie_sat/README.md
@@ -1,11 +1,10 @@
-# ERNIE-SAT with AISHELL3 dataset
+# ERNIE-SAT with AISHELL3 dataset
+ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as speech editing, personalized speech synthesis, and voice cloning.
 
-ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型,其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景,该项目供研究使用。
-
-## 模型框架
-ERNIE-SAT 中我们提出了两项创新:
-- 在预训练过程中将中英双语对应的音素作为输入,实现了跨语言、个性化的软音素映射
-- 采用语言和语音的联合掩码学习实现了语言和语音的对齐
+## Model Framework
+In ERNIE-SAT, we propose two innovations:
+- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-lingual, personalized soft phoneme mapping
+- Joint mask learning of speech and text is used to realize the alignment of speech and text
diff --git a/examples/aishell3_vctk/ernie_sat/README.md b/examples/aishell3_vctk/ernie_sat/README.md
index a849488d..d55af675 100644
--- a/examples/aishell3_vctk/ernie_sat/README.md
+++ b/examples/aishell3_vctk/ernie_sat/README.md
@@ -1,11 +1,10 @@
-# ERNIE-SAT with AISHELL3 and VCTK dataset
+# ERNIE-SAT with AISHELL3 and VCTK dataset
+ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as speech editing, personalized speech synthesis, and voice cloning.
 
-ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型,其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景,该项目供研究使用。
-
-## 模型框架
-ERNIE-SAT 中我们提出了两项创新:
-- 在预训练过程中将中英双语对应的音素作为输入,实现了跨语言、个性化的软音素映射
-- 采用语言和语音的联合掩码学习实现了语言和语音的对齐
+## Model Framework
+In ERNIE-SAT, we propose two innovations:
+- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-lingual, personalized soft phoneme mapping
+- Joint mask learning of speech and text is used to realize the alignment of speech and text
diff --git a/examples/vctk/ernie_sat/README.md b/examples/vctk/ernie_sat/README.md
index 0a2f9359..94c7ae25 100644
--- a/examples/vctk/ernie_sat/README.md
+++ b/examples/vctk/ernie_sat/README.md
@@ -1,11 +1,10 @@
 # ERNIE-SAT with VCTK dataset
+ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as speech editing, personalized speech synthesis, and voice cloning.
 
-ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型,其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景,该项目供研究使用。
-
-## 模型框架
-ERNIE-SAT 中我们提出了两项创新:
-- 在预训练过程中将中英双语对应的音素作为输入,实现了跨语言、个性化的软音素映射
-- 采用语言和语音的联合掩码学习实现了语言和语音的对齐
+## Model Framework
+In ERNIE-SAT, we propose two innovations:
+- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-lingual, personalized soft phoneme mapping
+- Joint mask learning of speech and text is used to realize the alignment of speech and text
From 5e714ecb4a40561c2a2e6a54ff8c4d787cea4ec4 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 19 Sep 2022 18:35:08 +0800 Subject: [PATCH 7/7] [doc]update api docs (#2406) * update apt docs, test=doc --- docs/source/api/paddlespeech.audio.rst | 3 + ...ddlespeech.audio.streamdata.autodecode.rst | 7 + .../paddlespeech.audio.streamdata.cache.rst | 7 + .../paddlespeech.audio.streamdata.compat.rst | 7 + ...espeech.audio.streamdata.extradatasets.rst | 7 + .../paddlespeech.audio.streamdata.filters.rst | 7 + .../paddlespeech.audio.streamdata.gopen.rst | 7 + ...paddlespeech.audio.streamdata.handlers.rst | 7 + .../api/paddlespeech.audio.streamdata.mix.rst | 7 + ...lespeech.audio.streamdata.paddle_utils.rst | 7 + ...paddlespeech.audio.streamdata.pipeline.rst | 7 + .../api/paddlespeech.audio.streamdata.rst | 28 ++ ...ddlespeech.audio.streamdata.shardlists.rst | 7 + ...lespeech.audio.streamdata.tariterators.rst | 7 + .../paddlespeech.audio.streamdata.utils.rst | 7 + .../paddlespeech.audio.streamdata.writer.rst | 7 + docs/source/api/paddlespeech.audio.text.rst | 16 + ...addlespeech.audio.text.text_featurizer.rst | 7 + .../api/paddlespeech.audio.text.utility.rst | 7 + ...addlespeech.audio.transform.add_deltas.rst | 7 + ...peech.audio.transform.channel_selector.rst | 7 + .../api/paddlespeech.audio.transform.cmvn.rst | 7 + ...addlespeech.audio.transform.functional.rst | 7 + .../paddlespeech.audio.transform.perturb.rst | 7 + .../api/paddlespeech.audio.transform.rst | 24 ++ ...dlespeech.audio.transform.spec_augment.rst | 7 + ...ddlespeech.audio.transform.spectrogram.rst | 7 + ...ch.audio.transform.transform_interface.rst | 7 + ...espeech.audio.transform.transformation.rst | 7 + .../api/paddlespeech.audio.transform.wpe.rst | 7 + .../paddlespeech.audio.utils.check_kwargs.rst | 7 + ...addlespeech.audio.utils.dynamic_import.rst | 7 + docs/source/api/paddlespeech.audio.utils.rst | 3 + .../paddlespeech.audio.utils.tensor_utils.rst | 7 + .../paddlespeech.kws.exps.mdtc.collate.rst | 7 + ...paddlespeech.kws.exps.mdtc.compute_det.rst | 7 + ...dlespeech.kws.exps.mdtc.plot_det_curve.rst | 7 + .../source/api/paddlespeech.kws.exps.mdtc.rst | 19 ++ .../api/paddlespeech.kws.exps.mdtc.score.rst | 7 + .../api/paddlespeech.kws.exps.mdtc.train.rst | 7 + docs/source/api/paddlespeech.kws.exps.rst | 15 + docs/source/api/paddlespeech.kws.rst | 1 + .../api/paddlespeech.resource.model_alias.rst | 7 + ...addlespeech.resource.pretrained_models.rst | 7 + .../api/paddlespeech.resource.resource.rst | 7 + docs/source/api/paddlespeech.resource.rst | 17 + docs/source/api/paddlespeech.rst | 2 + docs/source/api/paddlespeech.s2t.rst | 1 - docs/source/api/paddlespeech.server.utils.rst | 1 - docs/source/api/paddlespeech.t2s.datasets.rst | 1 + .../api/paddlespeech.t2s.datasets.sampler.rst | 7 + .../paddlespeech.t2s.exps.ernie_sat.align.rst | 7 + ...dlespeech.t2s.exps.ernie_sat.normalize.rst | 7 + ...lespeech.t2s.exps.ernie_sat.preprocess.rst | 7 + .../api/paddlespeech.t2s.exps.ernie_sat.rst | 21 ++ ...lespeech.t2s.exps.ernie_sat.synthesize.rst | 7 + ...eech.t2s.exps.ernie_sat.synthesize_e2e.rst | 7 + .../paddlespeech.t2s.exps.ernie_sat.train.rst | 7 + .../paddlespeech.t2s.exps.ernie_sat.utils.rst | 7 + .../api/paddlespeech.t2s.exps.fastspeech2.rst | 1 + ...espeech.t2s.exps.fastspeech2.vc2_infer.rst | 7 + docs/source/api/paddlespeech.t2s.exps.rst | 3 + .../paddlespeech.t2s.exps.stream_play_tts.rst | 7 + .../paddlespeech.t2s.exps.vits.normalize.rst | 7 + .../paddlespeech.t2s.exps.vits.preprocess.rst | 7 + 
.../source/api/paddlespeech.t2s.exps.vits.rst | 20 ++ .../paddlespeech.t2s.exps.vits.synthesize.rst | 7 + ...dlespeech.t2s.exps.vits.synthesize_e2e.rst | 7 + .../api/paddlespeech.t2s.exps.vits.train.rst | 7 + ...ddlespeech.t2s.exps.vits.voice_cloning.rst | 7 + ...paddlespeech.t2s.frontend.g2pw.dataset.rst | 7 + ...addlespeech.t2s.frontend.g2pw.onnx_api.rst | 7 + .../api/paddlespeech.t2s.frontend.g2pw.rst | 17 + .../paddlespeech.t2s.frontend.g2pw.utils.rst | 7 + ...paddlespeech.t2s.frontend.mix_frontend.rst | 7 + docs/source/api/paddlespeech.t2s.frontend.rst | 2 + ...espeech.t2s.models.ernie_sat.ernie_sat.rst | 7 + ...t2s.models.ernie_sat.ernie_sat_updater.rst | 7 + .../api/paddlespeech.t2s.models.ernie_sat.rst | 3 +- ...h.t2s.models.vits.monotonic_align.core.rst | 7 + ...speech.t2s.models.vits.monotonic_align.rst | 16 + ....t2s.models.vits.monotonic_align.setup.rst | 7 + .../api/paddlespeech.utils.dynamic_import.rst | 7 + docs/source/api/paddlespeech.utils.env.rst | 7 + docs/source/api/paddlespeech.utils.rst | 16 + docs/source/index.rst | 2 + .../t2s/models/ernie_sat/ernie_sat.py | 108 ++++--- .../t2s/models/vits/duration_predictor.py | 39 ++- paddlespeech/t2s/models/vits/flow.py | 111 ++++--- paddlespeech/t2s/models/vits/generator.py | 301 +++++++++++------- .../t2s/models/vits/posterior_encoder.py | 54 ++-- .../t2s/models/vits/residual_coupling.py | 99 ++++-- paddlespeech/t2s/models/vits/text_encoder.py | 69 ++-- paddlespeech/t2s/models/vits/vits.py | 153 ++++++--- .../t2s/models/vits/wavenet/residual_block.py | 24 +- .../t2s/models/vits/wavenet/wavenet.py | 72 +++-- paddlespeech/t2s/models/wavernn/wavernn.py | 20 +- 97 files changed, 1348 insertions(+), 375 deletions(-) create mode 100644 docs/source/api/paddlespeech.audio.streamdata.autodecode.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.cache.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.compat.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.filters.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.gopen.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.handlers.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.mix.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.pipeline.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.shardlists.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.tariterators.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.utils.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.writer.rst create mode 100644 docs/source/api/paddlespeech.audio.text.rst create mode 100644 docs/source/api/paddlespeech.audio.text.text_featurizer.rst create mode 100644 docs/source/api/paddlespeech.audio.text.utility.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.add_deltas.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.channel_selector.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.cmvn.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.functional.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.perturb.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.rst create mode 100644 
docs/source/api/paddlespeech.audio.transform.spec_augment.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.spectrogram.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.transform_interface.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.transformation.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.wpe.rst create mode 100644 docs/source/api/paddlespeech.audio.utils.check_kwargs.rst create mode 100644 docs/source/api/paddlespeech.audio.utils.dynamic_import.rst create mode 100644 docs/source/api/paddlespeech.audio.utils.tensor_utils.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.score.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.train.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.rst create mode 100644 docs/source/api/paddlespeech.resource.model_alias.rst create mode 100644 docs/source/api/paddlespeech.resource.pretrained_models.rst create mode 100644 docs/source/api/paddlespeech.resource.resource.rst create mode 100644 docs/source/api/paddlespeech.resource.rst create mode 100644 docs/source/api/paddlespeech.t2s.datasets.sampler.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.train.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.rst create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst create mode 100644 docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst create mode 100644 docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst create mode 100644 docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst create mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst create mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst create mode 100644 
docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst create mode 100644 docs/source/api/paddlespeech.utils.dynamic_import.rst create mode 100644 docs/source/api/paddlespeech.utils.env.rst create mode 100644 docs/source/api/paddlespeech.utils.rst diff --git a/docs/source/api/paddlespeech.audio.rst b/docs/source/api/paddlespeech.audio.rst index 5a3867f9..4ed7e467 100644 --- a/docs/source/api/paddlespeech.audio.rst +++ b/docs/source/api/paddlespeech.audio.rst @@ -20,4 +20,7 @@ Subpackages paddlespeech.audio.io paddlespeech.audio.metric paddlespeech.audio.sox_effects + paddlespeech.audio.streamdata + paddlespeech.audio.text + paddlespeech.audio.transform paddlespeech.audio.utils diff --git a/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst b/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst new file mode 100644 index 00000000..1e45c137 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.autodecode module +=============================================== + +.. automodule:: paddlespeech.audio.streamdata.autodecode + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.cache.rst b/docs/source/api/paddlespeech.audio.streamdata.cache.rst new file mode 100644 index 00000000..393055e5 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.cache.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.cache module +========================================== + +.. automodule:: paddlespeech.audio.streamdata.cache + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.compat.rst b/docs/source/api/paddlespeech.audio.streamdata.compat.rst new file mode 100644 index 00000000..760695b2 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.compat.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.compat module +=========================================== + +.. automodule:: paddlespeech.audio.streamdata.compat + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst b/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst new file mode 100644 index 00000000..74628e96 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.extradatasets module +================================================== + +.. automodule:: paddlespeech.audio.streamdata.extradatasets + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.filters.rst b/docs/source/api/paddlespeech.audio.streamdata.filters.rst new file mode 100644 index 00000000..d2610427 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.filters.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.filters module +============================================ + +.. automodule:: paddlespeech.audio.streamdata.filters + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.gopen.rst b/docs/source/api/paddlespeech.audio.streamdata.gopen.rst new file mode 100644 index 00000000..1cccb776 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.gopen.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.gopen module +========================================== + +.. 
automodule:: paddlespeech.audio.streamdata.gopen + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.handlers.rst b/docs/source/api/paddlespeech.audio.streamdata.handlers.rst new file mode 100644 index 00000000..7a4b3ce8 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.handlers.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.handlers module +============================================= + +.. automodule:: paddlespeech.audio.streamdata.handlers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.mix.rst b/docs/source/api/paddlespeech.audio.streamdata.mix.rst new file mode 100644 index 00000000..908b35dd --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.mix.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.mix module +======================================== + +.. automodule:: paddlespeech.audio.streamdata.mix + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst b/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst new file mode 100644 index 00000000..20334300 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.paddle\_utils module +================================================== + +.. automodule:: paddlespeech.audio.streamdata.paddle_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst b/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst new file mode 100644 index 00000000..ae05fbec --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.pipeline module +============================================= + +.. automodule:: paddlespeech.audio.streamdata.pipeline + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.rst b/docs/source/api/paddlespeech.audio.streamdata.rst new file mode 100644 index 00000000..a1f4560a --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.rst @@ -0,0 +1,28 @@ +paddlespeech.audio.streamdata package +===================================== + +.. automodule:: paddlespeech.audio.streamdata + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.audio.streamdata.autodecode + paddlespeech.audio.streamdata.cache + paddlespeech.audio.streamdata.compat + paddlespeech.audio.streamdata.extradatasets + paddlespeech.audio.streamdata.filters + paddlespeech.audio.streamdata.gopen + paddlespeech.audio.streamdata.handlers + paddlespeech.audio.streamdata.mix + paddlespeech.audio.streamdata.paddle_utils + paddlespeech.audio.streamdata.pipeline + paddlespeech.audio.streamdata.shardlists + paddlespeech.audio.streamdata.tariterators + paddlespeech.audio.streamdata.utils + paddlespeech.audio.streamdata.writer diff --git a/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst b/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst new file mode 100644 index 00000000..ec1fe823 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.shardlists module +=============================================== + +.. 
automodule:: paddlespeech.audio.streamdata.shardlists + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst b/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst new file mode 100644 index 00000000..b003b2d4 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.tariterators module +================================================= + +.. automodule:: paddlespeech.audio.streamdata.tariterators + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.utils.rst b/docs/source/api/paddlespeech.audio.streamdata.utils.rst new file mode 100644 index 00000000..f248b113 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.utils.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.utils module +========================================== + +.. automodule:: paddlespeech.audio.streamdata.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.writer.rst b/docs/source/api/paddlespeech.audio.streamdata.writer.rst new file mode 100644 index 00000000..7437241f --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.writer.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.writer module +=========================================== + +.. automodule:: paddlespeech.audio.streamdata.writer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.text.rst b/docs/source/api/paddlespeech.audio.text.rst new file mode 100644 index 00000000..a2018050 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.text.rst @@ -0,0 +1,16 @@ +paddlespeech.audio.text package +=============================== + +.. automodule:: paddlespeech.audio.text + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.audio.text.text_featurizer + paddlespeech.audio.text.utility diff --git a/docs/source/api/paddlespeech.audio.text.text_featurizer.rst b/docs/source/api/paddlespeech.audio.text.text_featurizer.rst new file mode 100644 index 00000000..1a8262d0 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.text.text_featurizer.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.text.text\_featurizer module +=============================================== + +.. automodule:: paddlespeech.audio.text.text_featurizer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.text.utility.rst b/docs/source/api/paddlespeech.audio.text.utility.rst new file mode 100644 index 00000000..90fcb25f --- /dev/null +++ b/docs/source/api/paddlespeech.audio.text.utility.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.text.utility module +====================================== + +.. automodule:: paddlespeech.audio.text.utility + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.add_deltas.rst b/docs/source/api/paddlespeech.audio.transform.add_deltas.rst new file mode 100644 index 00000000..b4b596d6 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.add_deltas.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.add\_deltas module +=============================================== + +.. 
automodule:: paddlespeech.audio.transform.add_deltas + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.channel_selector.rst b/docs/source/api/paddlespeech.audio.transform.channel_selector.rst new file mode 100644 index 00000000..4828b590 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.channel_selector.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.channel\_selector module +===================================================== + +.. automodule:: paddlespeech.audio.transform.channel_selector + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.cmvn.rst b/docs/source/api/paddlespeech.audio.transform.cmvn.rst new file mode 100644 index 00000000..44655a1e --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.cmvn.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.cmvn module +======================================== + +.. automodule:: paddlespeech.audio.transform.cmvn + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.functional.rst b/docs/source/api/paddlespeech.audio.transform.functional.rst new file mode 100644 index 00000000..7877d249 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.functional.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.functional module +============================================== + +.. automodule:: paddlespeech.audio.transform.functional + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.perturb.rst b/docs/source/api/paddlespeech.audio.transform.perturb.rst new file mode 100644 index 00000000..e3615a5d --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.perturb.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.perturb module +=========================================== + +.. automodule:: paddlespeech.audio.transform.perturb + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.rst b/docs/source/api/paddlespeech.audio.transform.rst new file mode 100644 index 00000000..47a7303b --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.rst @@ -0,0 +1,24 @@ +paddlespeech.audio.transform package +==================================== + +.. automodule:: paddlespeech.audio.transform + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.audio.transform.add_deltas + paddlespeech.audio.transform.channel_selector + paddlespeech.audio.transform.cmvn + paddlespeech.audio.transform.functional + paddlespeech.audio.transform.perturb + paddlespeech.audio.transform.spec_augment + paddlespeech.audio.transform.spectrogram + paddlespeech.audio.transform.transform_interface + paddlespeech.audio.transform.transformation + paddlespeech.audio.transform.wpe diff --git a/docs/source/api/paddlespeech.audio.transform.spec_augment.rst b/docs/source/api/paddlespeech.audio.transform.spec_augment.rst new file mode 100644 index 00000000..f11a3224 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.spec_augment.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.spec\_augment module +================================================= + +.. 
automodule:: paddlespeech.audio.transform.spec_augment + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.spectrogram.rst b/docs/source/api/paddlespeech.audio.transform.spectrogram.rst new file mode 100644 index 00000000..6be0c32e --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.spectrogram.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.spectrogram module +=============================================== + +.. automodule:: paddlespeech.audio.transform.spectrogram + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.transform_interface.rst b/docs/source/api/paddlespeech.audio.transform.transform_interface.rst new file mode 100644 index 00000000..ec8b2085 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.transform_interface.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.transform\_interface module +======================================================== + +.. automodule:: paddlespeech.audio.transform.transform_interface + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.transformation.rst b/docs/source/api/paddlespeech.audio.transform.transformation.rst new file mode 100644 index 00000000..94629b9a --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.transformation.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.transformation module +================================================== + +.. automodule:: paddlespeech.audio.transform.transformation + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.wpe.rst b/docs/source/api/paddlespeech.audio.transform.wpe.rst new file mode 100644 index 00000000..85c75811 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.wpe.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.wpe module +======================================= + +.. automodule:: paddlespeech.audio.transform.wpe + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.utils.check_kwargs.rst b/docs/source/api/paddlespeech.audio.utils.check_kwargs.rst new file mode 100644 index 00000000..a18f27e6 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.utils.check_kwargs.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.utils.check\_kwargs module +============================================= + +.. automodule:: paddlespeech.audio.utils.check_kwargs + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst b/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst new file mode 100644 index 00000000..5d060ee1 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.utils.dynamic\_import module +=============================================== + +.. automodule:: paddlespeech.audio.utils.dynamic_import + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.utils.rst b/docs/source/api/paddlespeech.audio.utils.rst index db15927d..217afa8f 100644 --- a/docs/source/api/paddlespeech.audio.utils.rst +++ b/docs/source/api/paddlespeech.audio.utils.rst @@ -12,8 +12,11 @@ Submodules .. 
toctree:: :maxdepth: 4 + paddlespeech.audio.utils.check_kwargs paddlespeech.audio.utils.download + paddlespeech.audio.utils.dynamic_import paddlespeech.audio.utils.error paddlespeech.audio.utils.log paddlespeech.audio.utils.numeric + paddlespeech.audio.utils.tensor_utils paddlespeech.audio.utils.time diff --git a/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst b/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst new file mode 100644 index 00000000..93a1f70e --- /dev/null +++ b/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.utils.tensor\_utils module +============================================= + +.. automodule:: paddlespeech.audio.utils.tensor_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst new file mode 100644 index 00000000..b533e8c4 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.collate module +========================================= + +.. automodule:: paddlespeech.kws.exps.mdtc.collate + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst new file mode 100644 index 00000000..45e09455 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.compute\_det module +============================================== + +.. automodule:: paddlespeech.kws.exps.mdtc.compute_det + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst new file mode 100644 index 00000000..46a149b0 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.plot\_det\_curve module +================================================== + +.. automodule:: paddlespeech.kws.exps.mdtc.plot_det_curve + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.rst new file mode 100644 index 00000000..f6cad64e --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.rst @@ -0,0 +1,19 @@ +paddlespeech.kws.exps.mdtc package +================================== + +.. automodule:: paddlespeech.kws.exps.mdtc + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.kws.exps.mdtc.collate + paddlespeech.kws.exps.mdtc.compute_det + paddlespeech.kws.exps.mdtc.plot_det_curve + paddlespeech.kws.exps.mdtc.score + paddlespeech.kws.exps.mdtc.train diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst new file mode 100644 index 00000000..aa956b4c --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.score module +======================================= + +.. 
automodule:: paddlespeech.kws.exps.mdtc.score + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst new file mode 100644 index 00000000..5e4ca401 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.train module +======================================= + +.. automodule:: paddlespeech.kws.exps.mdtc.train + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.rst b/docs/source/api/paddlespeech.kws.exps.rst new file mode 100644 index 00000000..bf10d2c9 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.rst @@ -0,0 +1,15 @@ +paddlespeech.kws.exps package +============================= + +.. automodule:: paddlespeech.kws.exps + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.kws.exps.mdtc diff --git a/docs/source/api/paddlespeech.kws.rst b/docs/source/api/paddlespeech.kws.rst index c2829a42..d21d094c 100644 --- a/docs/source/api/paddlespeech.kws.rst +++ b/docs/source/api/paddlespeech.kws.rst @@ -12,4 +12,5 @@ Subpackages .. toctree:: :maxdepth: 4 + paddlespeech.kws.exps paddlespeech.kws.models diff --git a/docs/source/api/paddlespeech.resource.model_alias.rst b/docs/source/api/paddlespeech.resource.model_alias.rst new file mode 100644 index 00000000..b78e643a --- /dev/null +++ b/docs/source/api/paddlespeech.resource.model_alias.rst @@ -0,0 +1,7 @@ +paddlespeech.resource.model\_alias module +========================================= + +.. automodule:: paddlespeech.resource.model_alias + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.resource.pretrained_models.rst b/docs/source/api/paddlespeech.resource.pretrained_models.rst new file mode 100644 index 00000000..a0206169 --- /dev/null +++ b/docs/source/api/paddlespeech.resource.pretrained_models.rst @@ -0,0 +1,7 @@ +paddlespeech.resource.pretrained\_models module +=============================================== + +.. automodule:: paddlespeech.resource.pretrained_models + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.resource.resource.rst b/docs/source/api/paddlespeech.resource.resource.rst new file mode 100644 index 00000000..8b51eda3 --- /dev/null +++ b/docs/source/api/paddlespeech.resource.resource.rst @@ -0,0 +1,7 @@ +paddlespeech.resource.resource module +===================================== + +.. automodule:: paddlespeech.resource.resource + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.resource.rst b/docs/source/api/paddlespeech.resource.rst new file mode 100644 index 00000000..61fdd531 --- /dev/null +++ b/docs/source/api/paddlespeech.resource.rst @@ -0,0 +1,17 @@ +paddlespeech.resource package +============================= + +.. automodule:: paddlespeech.resource + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. 
toctree:: + :maxdepth: 4 + + paddlespeech.resource.model_alias + paddlespeech.resource.pretrained_models + paddlespeech.resource.resource diff --git a/docs/source/api/paddlespeech.rst b/docs/source/api/paddlespeech.rst index e7a01bf7..d06cd2c7 100644 --- a/docs/source/api/paddlespeech.rst +++ b/docs/source/api/paddlespeech.rst @@ -16,8 +16,10 @@ Subpackages paddlespeech.cli paddlespeech.cls paddlespeech.kws + paddlespeech.resource paddlespeech.s2t paddlespeech.server paddlespeech.t2s paddlespeech.text + paddlespeech.utils paddlespeech.vector diff --git a/docs/source/api/paddlespeech.s2t.rst b/docs/source/api/paddlespeech.s2t.rst index 4be22cb8..be9ef52f 100644 --- a/docs/source/api/paddlespeech.s2t.rst +++ b/docs/source/api/paddlespeech.s2t.rst @@ -19,5 +19,4 @@ Subpackages paddlespeech.s2t.models paddlespeech.s2t.modules paddlespeech.s2t.training - paddlespeech.s2t.transform paddlespeech.s2t.utils diff --git a/docs/source/api/paddlespeech.server.utils.rst b/docs/source/api/paddlespeech.server.utils.rst index 9d116639..b4051aee 100644 --- a/docs/source/api/paddlespeech.server.utils.rst +++ b/docs/source/api/paddlespeech.server.utils.rst @@ -18,7 +18,6 @@ Submodules paddlespeech.server.utils.config paddlespeech.server.utils.errors paddlespeech.server.utils.exception - paddlespeech.server.utils.log paddlespeech.server.utils.onnx_infer paddlespeech.server.utils.paddle_predictor paddlespeech.server.utils.util diff --git a/docs/source/api/paddlespeech.t2s.datasets.rst b/docs/source/api/paddlespeech.t2s.datasets.rst index b40eb2bf..dfbdb0b4 100644 --- a/docs/source/api/paddlespeech.t2s.datasets.rst +++ b/docs/source/api/paddlespeech.t2s.datasets.rst @@ -19,4 +19,5 @@ Submodules paddlespeech.t2s.datasets.get_feats paddlespeech.t2s.datasets.ljspeech paddlespeech.t2s.datasets.preprocess_utils + paddlespeech.t2s.datasets.sampler paddlespeech.t2s.datasets.vocoder_batch_fn diff --git a/docs/source/api/paddlespeech.t2s.datasets.sampler.rst b/docs/source/api/paddlespeech.t2s.datasets.sampler.rst new file mode 100644 index 00000000..ed29c28d --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.datasets.sampler.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.datasets.sampler module +======================================== + +.. automodule:: paddlespeech.t2s.datasets.sampler + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst new file mode 100644 index 00000000..a5e07aac --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.align module +============================================= + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.align + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst new file mode 100644 index 00000000..3771311c --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.normalize module +================================================= + +.. 
automodule:: paddlespeech.t2s.exps.ernie_sat.normalize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst new file mode 100644 index 00000000..8d4c24ff --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.preprocess module +================================================== + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.preprocess + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst new file mode 100644 index 00000000..a6115842 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst @@ -0,0 +1,21 @@ +paddlespeech.t2s.exps.ernie\_sat package +======================================== + +.. automodule:: paddlespeech.t2s.exps.ernie_sat + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.exps.ernie_sat.align + paddlespeech.t2s.exps.ernie_sat.normalize + paddlespeech.t2s.exps.ernie_sat.preprocess + paddlespeech.t2s.exps.ernie_sat.synthesize + paddlespeech.t2s.exps.ernie_sat.synthesize_e2e + paddlespeech.t2s.exps.ernie_sat.train + paddlespeech.t2s.exps.ernie_sat.utils diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst new file mode 100644 index 00000000..ecda2a51 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.synthesize module +================================================== + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst new file mode 100644 index 00000000..00fc4495 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.synthesize\_e2e module +======================================================= + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize_e2e + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst new file mode 100644 index 00000000..ba9a3334 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.train module +============================================= + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.train + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst new file mode 100644 index 00000000..a2dd26c3 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.utils module +============================================= + +.. 
automodule:: paddlespeech.t2s.exps.ernie_sat.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst index 3c98aa88..fad1fd87 100644 --- a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst +++ b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst @@ -16,3 +16,4 @@ Submodules paddlespeech.t2s.exps.fastspeech2.normalize paddlespeech.t2s.exps.fastspeech2.preprocess paddlespeech.t2s.exps.fastspeech2.train + paddlespeech.t2s.exps.fastspeech2.vc2_infer diff --git a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst new file mode 100644 index 00000000..70a9d6e1 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.fastspeech2.vc2\_infer module +=================================================== + +.. automodule:: paddlespeech.t2s.exps.fastspeech2.vc2_infer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.rst b/docs/source/api/paddlespeech.t2s.exps.rst index a688435e..bee18a97 100644 --- a/docs/source/api/paddlespeech.t2s.exps.rst +++ b/docs/source/api/paddlespeech.t2s.exps.rst @@ -12,11 +12,13 @@ Subpackages .. toctree:: :maxdepth: 4 + paddlespeech.t2s.exps.ernie_sat paddlespeech.t2s.exps.fastspeech2 paddlespeech.t2s.exps.gan_vocoder paddlespeech.t2s.exps.speedyspeech paddlespeech.t2s.exps.tacotron2 paddlespeech.t2s.exps.transformer_tts + paddlespeech.t2s.exps.vits paddlespeech.t2s.exps.waveflow paddlespeech.t2s.exps.wavernn @@ -31,6 +33,7 @@ Submodules paddlespeech.t2s.exps.ort_predict paddlespeech.t2s.exps.ort_predict_e2e paddlespeech.t2s.exps.ort_predict_streaming + paddlespeech.t2s.exps.stream_play_tts paddlespeech.t2s.exps.syn_utils paddlespeech.t2s.exps.synthesize paddlespeech.t2s.exps.synthesize_e2e diff --git a/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst b/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst new file mode 100644 index 00000000..cb22dde0 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.stream\_play\_tts module +============================================== + +.. automodule:: paddlespeech.t2s.exps.stream_play_tts + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst b/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst new file mode 100644 index 00000000..c5606f99 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.normalize module +=========================================== + +.. automodule:: paddlespeech.t2s.exps.vits.normalize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst new file mode 100644 index 00000000..50633c62 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.preprocess module +============================================ + +.. 
automodule:: paddlespeech.t2s.exps.vits.preprocess + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.rst b/docs/source/api/paddlespeech.t2s.exps.vits.rst new file mode 100644 index 00000000..51a9418d --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.rst @@ -0,0 +1,20 @@ +paddlespeech.t2s.exps.vits package +================================== + +.. automodule:: paddlespeech.t2s.exps.vits + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.exps.vits.normalize + paddlespeech.t2s.exps.vits.preprocess + paddlespeech.t2s.exps.vits.synthesize + paddlespeech.t2s.exps.vits.synthesize_e2e + paddlespeech.t2s.exps.vits.train + paddlespeech.t2s.exps.vits.voice_cloning diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst new file mode 100644 index 00000000..4b22d069 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.synthesize module +============================================ + +.. automodule:: paddlespeech.t2s.exps.vits.synthesize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst new file mode 100644 index 00000000..053ddfc8 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.synthesize\_e2e module +================================================= + +.. automodule:: paddlespeech.t2s.exps.vits.synthesize_e2e + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.train.rst b/docs/source/api/paddlespeech.t2s.exps.vits.train.rst new file mode 100644 index 00000000..31bd3a48 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.train.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.train module +======================================= + +.. automodule:: paddlespeech.t2s.exps.vits.train + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst b/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst new file mode 100644 index 00000000..d9be0f31 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.voice\_cloning module +================================================ + +.. automodule:: paddlespeech.t2s.exps.vits.voice_cloning + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst new file mode 100644 index 00000000..1635ec28 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.g2pw.dataset module +============================================= + +.. 
automodule:: paddlespeech.t2s.frontend.g2pw.dataset + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst new file mode 100644 index 00000000..b7d54907 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.g2pw.onnx\_api module +=============================================== + +.. automodule:: paddlespeech.t2s.frontend.g2pw.onnx_api + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst new file mode 100644 index 00000000..10a118b7 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst @@ -0,0 +1,17 @@ +paddlespeech.t2s.frontend.g2pw package +====================================== + +.. automodule:: paddlespeech.t2s.frontend.g2pw + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.frontend.g2pw.dataset + paddlespeech.t2s.frontend.g2pw.onnx_api + paddlespeech.t2s.frontend.g2pw.utils diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst new file mode 100644 index 00000000..ce942803 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.g2pw.utils module +=========================================== + +.. automodule:: paddlespeech.t2s.frontend.g2pw.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst b/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst new file mode 100644 index 00000000..4505dddb --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.mix\_frontend module +============================================== + +.. automodule:: paddlespeech.t2s.frontend.mix_frontend + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.rst b/docs/source/api/paddlespeech.t2s.frontend.rst index 8fbf1e6e..b6106861 100644 --- a/docs/source/api/paddlespeech.t2s.frontend.rst +++ b/docs/source/api/paddlespeech.t2s.frontend.rst @@ -12,6 +12,7 @@ Subpackages .. toctree:: :maxdepth: 4 + paddlespeech.t2s.frontend.g2pw paddlespeech.t2s.frontend.normalizer paddlespeech.t2s.frontend.zh_normalization @@ -23,6 +24,7 @@ Submodules paddlespeech.t2s.frontend.arpabet paddlespeech.t2s.frontend.generate_lexicon + paddlespeech.t2s.frontend.mix_frontend paddlespeech.t2s.frontend.phonectic paddlespeech.t2s.frontend.punctuation paddlespeech.t2s.frontend.tone_sandhi diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst new file mode 100644 index 00000000..fce5a83c --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.ernie\_sat.ernie\_sat module +==================================================== + +.. 
automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst new file mode 100644 index 00000000..8a697d6c --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.ernie\_sat.ernie\_sat\_updater module +============================================================= + +.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat_updater + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst index 680a85de..aff7489c 100644 --- a/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst +++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst @@ -12,4 +12,5 @@ Submodules .. toctree:: :maxdepth: 4 - paddlespeech.t2s.models.ernie_sat.mlm + paddlespeech.t2s.models.ernie_sat.ernie_sat + paddlespeech.t2s.models.ernie_sat.ernie_sat_updater diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst new file mode 100644 index 00000000..7aaba795 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.vits.monotonic\_align.core module +========================================================= + +.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.core + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst new file mode 100644 index 00000000..25c819a7 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst @@ -0,0 +1,16 @@ +paddlespeech.t2s.models.vits.monotonic\_align package +===================================================== + +.. automodule:: paddlespeech.t2s.models.vits.monotonic_align + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.models.vits.monotonic_align.core + paddlespeech.t2s.models.vits.monotonic_align.setup diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst new file mode 100644 index 00000000..a93c3b8b --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.vits.monotonic\_align.setup module +========================================================== + +.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.setup + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.utils.dynamic_import.rst b/docs/source/api/paddlespeech.utils.dynamic_import.rst new file mode 100644 index 00000000..daa4e6e7 --- /dev/null +++ b/docs/source/api/paddlespeech.utils.dynamic_import.rst @@ -0,0 +1,7 @@ +paddlespeech.utils.dynamic\_import module +========================================= + +.. 
automodule:: paddlespeech.utils.dynamic_import + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.utils.env.rst b/docs/source/api/paddlespeech.utils.env.rst new file mode 100644 index 00000000..e51278f8 --- /dev/null +++ b/docs/source/api/paddlespeech.utils.env.rst @@ -0,0 +1,7 @@ +paddlespeech.utils.env module +============================= + +.. automodule:: paddlespeech.utils.env + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.utils.rst b/docs/source/api/paddlespeech.utils.rst new file mode 100644 index 00000000..3d47626b --- /dev/null +++ b/docs/source/api/paddlespeech.utils.rst @@ -0,0 +1,16 @@ +paddlespeech.utils package +========================== + +.. automodule:: paddlespeech.utils + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.utils.dynamic_import + paddlespeech.utils.env diff --git a/docs/source/index.rst b/docs/source/index.rst index 83474c52..8540d3fc 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -74,8 +74,10 @@ Contents paddlespeech.cli paddlespeech.cls paddlespeech.kws + paddlespeech.resource paddlespeech.s2t paddlespeech.server paddlespeech.t2s paddlespeech.text + paddlespeech.utils paddlespeech.vector diff --git a/paddlespeech/t2s/models/ernie_sat/ernie_sat.py b/paddlespeech/t2s/models/ernie_sat/ernie_sat.py index 08c43dc5..eb42b33e 100644 --- a/paddlespeech/t2s/models/ernie_sat/ernie_sat.py +++ b/paddlespeech/t2s/models/ernie_sat/ernie_sat.py @@ -71,31 +71,53 @@ class MLMEncoder(nn.Layer): """Conformer encoder module. Args: - idim (int): Input dimension. - attention_dim (int): Dimension of attention. - attention_heads (int): The number of heads of multi head attention. - linear_units (int): The number of units of position-wise feed forward. - num_blocks (int): The number of decoder blocks. - dropout_rate (float): Dropout rate. - positional_dropout_rate (float): Dropout rate after adding positional encoding. - attention_dropout_rate (float): Dropout rate in attention. - input_layer (Union[str, paddle.nn.Layer]): Input layer type. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. + idim (int): + Input dimension. + attention_dim (int): + Dimension of attention. + attention_heads (int): + The number of heads of multi head attention. + linear_units (int): + The number of units of position-wise feed forward. + num_blocks (int): + The number of decoder blocks. + dropout_rate (float): + Dropout rate. + positional_dropout_rate (float): + Dropout rate after adding positional encoding. + attention_dropout_rate (float): + Dropout rate in attention. + input_layer (Union[str, paddle.nn.Layer]): + Input layer type. + normalize_before (bool): + Whether to use layer_norm before the first block. + concat_after (bool): + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. - macaron_style (bool): Whether to use macaron style for positionwise layer. - pos_enc_layer_type (str): Encoder positional encoding layer type. - selfattention_layer_type (str): Encoder attention layer type. 
- activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - zero_triu (bool): Whether to zero the upper triangular part of attention matrix. - cnn_module_kernel (int): Kernerl size of convolution module. - padding_idx (int): Padding idx for input_layer=embed. - stochastic_depth_rate (float): Maximum probability to skip the encoder layer. + positionwise_layer_type (str): + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): + Kernel size of positionwise conv1d layer. + macaron_style (bool): + Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): + Encoder positional encoding layer type. + selfattention_layer_type (str): + Encoder attention layer type. + activation_type (str): + Encoder activation function type. + use_cnn_module (bool): + Whether to use convolution module. + zero_triu (bool): + Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel (int): + Kernel size of convolution module. + padding_idx (int): + Padding idx for input_layer=embed. + stochastic_depth_rate (float): + Maximum probability to skip the encoder layer. """ @@ -320,12 +342,16 @@ class MLMDecoder(MLMEncoder): """Encode input sequence. Args: - xs (paddle.Tensor): Input tensor (#batch, time, idim). - masks (paddle.Tensor): Mask tensor (#batch, time). + xs (paddle.Tensor): + Input tensor (#batch, time, idim). + masks (paddle.Tensor): + Mask tensor (#batch, time). Returns: - paddle.Tensor: Output tensor (#batch, time, attention_dim). - paddle.Tensor: Mask tensor (#batch, time). + paddle.Tensor: + Output tensor (#batch, time, attention_dim). + paddle.Tensor: + Mask tensor (#batch, time). """ xs = self.embed(xs) @@ -392,19 +418,27 @@ class MLM(nn.Layer): use_teacher_forcing: bool=True, ) -> List[paddle.Tensor]: ''' Args: - speech (paddle.Tensor): input speech (1, Tmax, D). - text (paddle.Tensor): input text (1, Tmax2). - masked_pos (paddle.Tensor): masked position of input speech (1, Tmax) - speech_mask (paddle.Tensor): mask of speech (1, 1, Tmax). - text_mask (paddle.Tensor): mask of text (1, 1, Tmax2). - speech_seg_pos (paddle.Tensor): n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax). - text_seg_pos (paddle.Tensor): n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2). - span_bdy (List[int]): masked mel boundary of input speech (2,) - use_teacher_forcing (bool): whether to use teacher forcing + speech (paddle.Tensor): + input speech (1, Tmax, D). + text (paddle.Tensor): + input text (1, Tmax2). + masked_pos (paddle.Tensor): + masked position of input speech (1, Tmax) + speech_mask (paddle.Tensor): + mask of speech (1, 1, Tmax). + text_mask (paddle.Tensor): + mask of text (1, 1, Tmax2). + speech_seg_pos (paddle.Tensor): + n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax). + text_seg_pos (paddle.Tensor): + n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2).
+ span_bdy (List[int]): + masked mel boundary of input speech (2,) + use_teacher_forcing (bool): + whether to use teacher forcing Returns: List[Tensor]: - eg: - [Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])] + eg: [Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])] ''' z_cache = None diff --git a/paddlespeech/t2s/models/vits/duration_predictor.py b/paddlespeech/t2s/models/vits/duration_predictor.py index 6197d569..b0bb68d0 100644 --- a/paddlespeech/t2s/models/vits/duration_predictor.py +++ b/paddlespeech/t2s/models/vits/duration_predictor.py @@ -48,12 +48,18 @@ class StochasticDurationPredictor(nn.Layer): global_channels: int=-1, ): """Initialize StochasticDurationPredictor module. Args: - channels (int): Number of channels. - kernel_size (int): Kernel size. - dropout_rate (float): Dropout rate. - flows (int): Number of flows. - dds_conv_layers (int): Number of conv layers in DDS conv. - global_channels (int): Number of global conditioning channels. + channels (int): + Number of channels. + kernel_size (int): + Kernel size. + dropout_rate (float): + Dropout rate. + flows (int): + Number of flows. + dds_conv_layers (int): + Number of conv layers in DDS conv. + global_channels (int): + Number of global conditioning channels. """ super().__init__() @@ -108,14 +114,21 @@ class StochasticDurationPredictor(nn.Layer): noise_scale: float=1.0, ) -> paddle.Tensor: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, channels, T_text). - x_mask (Tensor): Mask tensor (B, 1, T_text). - w (Optional[Tensor]): Duration tensor (B, 1, T_text). - g (Optional[Tensor]): Global conditioning tensor (B, channels, 1) - inverse (bool): Whether to inverse the flow. - noise_scale (float): Noise scale value. + x (Tensor): + Input tensor (B, channels, T_text). + x_mask (Tensor): + Mask tensor (B, 1, T_text). + w (Optional[Tensor]): + Duration tensor (B, 1, T_text). + g (Optional[Tensor]): + Global conditioning tensor (B, channels, 1) + inverse (bool): + Whether to inverse the flow. + noise_scale (float): + Noise scale value. Returns: - Tensor: If not inverse, negative log-likelihood (NLL) tensor (B,). + Tensor: + If not inverse, negative log-likelihood (NLL) tensor (B,). If inverse, log-duration tensor (B, 1, T_text). """ # stop gradient diff --git a/paddlespeech/t2s/models/vits/flow.py b/paddlespeech/t2s/models/vits/flow.py index 3c8f8935..7593eb72 100644 --- a/paddlespeech/t2s/models/vits/flow.py +++ b/paddlespeech/t2s/models/vits/flow.py @@ -34,11 +34,15 @@ class FlipFlow(nn.Layer): ) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, channels, T). - inverse (bool): Whether to inverse the flow. + x (Tensor): + Input tensor (B, channels, T). + inverse (bool): + Whether to inverse the flow. Returns: - Tensor: Flipped tensor (B, channels, T). - Tensor: Log-determinant tensor for NLL (B,) if not inverse. + Tensor: + Flipped tensor (B, channels, T). + Tensor: + Log-determinant tensor for NLL (B,) if not inverse. """ x = paddle.flip(x, [1]) if not inverse: @@ -60,13 +64,19 @@ class LogFlow(nn.Layer): ) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, channels, T). - x_mask (Tensor): Mask tensor (B, 1, T). - inverse (bool): Whether to inverse the flow. - eps (float): Epsilon for log. + x (Tensor): + Input tensor (B, channels, T). + x_mask (Tensor): + Mask tensor (B, 1, T). 
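All of the flow modules in this file follow one calling convention, which the reformatted docstrings now state uniformly: the forward direction returns the transformed tensor together with a log-determinant term for the NLL, while inverse=True returns only the recovered input. A minimal sketch of that convention with FlipFlow, assuming the forward signature shown in these docstrings (shapes illustrative):

import paddle
from paddlespeech.t2s.models.vits.flow import FlipFlow

flow = FlipFlow()
x = paddle.randn([2, 4, 10])   # (B, channels, T)
y, logdet = flow(x)            # forward: flipped tensor plus log-determinant (B,)
x_rec = flow(y, inverse=True)  # inverse: only the recovered tensor is returned
# flipping the channel axis twice is the identity, so x_rec equals x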
+ inverse (bool): + Whether to inverse the flow. + eps (float): + Epsilon for log. Returns: - Tensor: Output tensor (B, channels, T). - Tensor: Log-determinant tensor for NLL (B,) if not inverse. + Tensor: + Output tensor (B, channels, T). + Tensor: + Log-determinant tensor for NLL (B,) if not inverse. """ if not inverse: y = paddle.log(paddle.clip(x, min=eps)) * x_mask @@ -83,7 +93,8 @@ class ElementwiseAffineFlow(nn.Layer): def __init__(self, channels: int): """Initialize ElementwiseAffineFlow module. Args: - channels (int): Number of channels. + channels (int): + Number of channels. """ super().__init__() self.channels = channels @@ -107,12 +118,17 @@ class ElementwiseAffineFlow(nn.Layer): ) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, channels, T). - x_mask (Tensor): Mask tensor (B, 1, T). - inverse (bool): Whether to inverse the flow. + x (Tensor): + Input tensor (B, channels, T). + x_mask (Tensor): + Mask tensor (B, 1, T). + inverse (bool): + Whether to inverse the flow. Returns: - Tensor: Output tensor (B, channels, T). - Tensor: Log-determinant tensor for NLL (B,) if not inverse. + Tensor: + Output tensor (B, channels, T). + Tensor: + Log-determinant tensor for NLL (B,) if not inverse. """ if not inverse: y = self.m + paddle.exp(self.logs) * x @@ -157,11 +173,16 @@ class DilatedDepthSeparableConv(nn.Layer): eps: float=1e-5, ): """Initialize DilatedDepthSeparableConv module. Args: - channels (int): Number of channels. - kernel_size (int): Kernel size. - layers (int): Number of layers. - dropout_rate (float): Dropout rate. - eps (float): Epsilon for layer norm. + channels (int): + Number of channels. + kernel_size (int): + Kernel size. + layers (int): + Number of layers. + dropout_rate (float): + Dropout rate. + eps (float): + Epsilon for layer norm. """ super().__init__() @@ -198,11 +219,15 @@ class DilatedDepthSeparableConv(nn.Layer): g: Optional[paddle.Tensor]=None) -> paddle.Tensor: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T). - x_mask (Tensor): Mask tensor (B, 1, T). - g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1). + x (Tensor): + Input tensor (B, in_channels, T). + x_mask (Tensor): + Mask tensor (B, 1, T). + g (Optional[Tensor]): + Global conditioning tensor (B, global_channels, 1). Returns: - Tensor: Output tensor (B, channels, T). + Tensor: + Output tensor (B, channels, T). """ if g is not None: x = x + g @@ -225,12 +250,18 @@ class ConvFlow(nn.Layer): tail_bound: float=5.0, ): """Initialize ConvFlow module. Args: - in_channels (int): Number of input channels. - hidden_channels (int): Number of hidden channels. - kernel_size (int): Kernel size. - layers (int): Number of layers. - bins (int): Number of bins. - tail_bound (float): Tail bound value. + in_channels (int): + Number of input channels. + hidden_channels (int): + Number of hidden channels. + kernel_size (int): + Kernel size. + layers (int): + Number of layers. + bins (int): + Number of bins. + tail_bound (float): + Tail bound value. """ super().__init__() self.half_channels = in_channels // 2 @@ -275,13 +306,19 @@ class ConvFlow(nn.Layer): ) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, channels, T). - x_mask (Tensor): Mask tensor (B, 1, T). - g (Optional[Tensor]): Global conditioning tensor (B, channels, 1). - inverse (bool): Whether to inverse the flow. 
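ElementwiseAffineFlow is the simplest invertible map in this family and makes the two directions concrete. The sketch below restates them under the documented (B, channels, T) layout; m and logs stand in for the module's learned per-channel parameters, so this illustrates the math rather than reproducing the module's code verbatim:

import paddle

def elementwise_affine(x, x_mask, m, logs, inverse=False):
    # forward: y = m + exp(logs) * x, masked; logdet sums logs over masked positions
    if not inverse:
        y = (m + paddle.exp(logs) * x) * x_mask
        logdet = paddle.sum(logs * x_mask, [1, 2])
        return y, logdet
    # inverse: undo the affine map; no log-determinant is returned
    return (x - m) * paddle.exp(-logs) * x_mask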
+ x (Tensor): + Input tensor (B, channels, T). + x_mask (Tensor): + Mask tensor (B, 1, T). + g (Optional[Tensor]): + Global conditioning tensor (B, channels, 1). + inverse (bool): + Whether to inverse the flow. Returns: - Tensor: Output tensor (B, channels, T). - Tensor: Log-determinant tensor for NLL (B,) if not inverse. + Tensor: + Output tensor (B, channels, T). + Tensor: + Log-determinant tensor for NLL (B,) if not inverse. """ xa, xb = x.split(2, 1) h = self.input_conv(xa) diff --git a/paddlespeech/t2s/models/vits/generator.py b/paddlespeech/t2s/models/vits/generator.py index 359b6625..7ecc5161 100644 --- a/paddlespeech/t2s/models/vits/generator.py +++ b/paddlespeech/t2s/models/vits/generator.py @@ -97,81 +97,104 @@ class VITSGenerator(nn.Layer): stochastic_duration_predictor_dds_conv_layers: int=3, ): """Initialize VITS generator module. Args: - vocabs (int): Input vocabulary size. - aux_channels (int): Number of acoustic feature channels. - hidden_channels (int): Number of hidden channels. - spks (Optional[int]): Number of speakers. If set to > 1, assume that the + vocabs (int): + Input vocabulary size. + aux_channels (int): + Number of acoustic feature channels. + hidden_channels (int): + Number of hidden channels. + spks (Optional[int]): + Number of speakers. If set to > 1, assume that the sids will be provided as the input and use sid embedding layer. - langs (Optional[int]): Number of languages. If set to > 1, assume that the + langs (Optional[int]): + Number of languages. If set to > 1, assume that the lids will be provided as the input and use sid embedding layer. - spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0, + spk_embed_dim (Optional[int]): + Speaker embedding dimension. If set to > 0, assume that spembs will be provided as the input. - global_channels (int): Number of global conditioning channels. - segment_size (int): Segment size for decoder. - text_encoder_attention_heads (int): Number of heads in conformer block - of text encoder. - text_encoder_ffn_expand (int): Expansion ratio of FFN in conformer block - of text encoder. - text_encoder_blocks (int): Number of conformer blocks in text encoder. - text_encoder_positionwise_layer_type (str): Position-wise layer type in - conformer block of text encoder. - text_encoder_positionwise_conv_kernel_size (int): Position-wise convolution - kernel size in conformer block of text encoder. Only used when the - above layer type is conv1d or conv1d-linear. - text_encoder_positional_encoding_layer_type (str): Positional encoding layer - type in conformer block of text encoder. - text_encoder_self_attention_layer_type (str): Self-attention layer type in - conformer block of text encoder. - text_encoder_activation_type (str): Activation function type in conformer - block of text encoder. - text_encoder_normalize_before (bool): Whether to apply layer norm before - self-attention in conformer block of text encoder. - text_encoder_dropout_rate (float): Dropout rate in conformer block of - text encoder. - text_encoder_positional_dropout_rate (float): Dropout rate for positional - encoding in conformer block of text encoder. - text_encoder_attention_dropout_rate (float): Dropout rate for attention in - conformer block of text encoder. - text_encoder_conformer_kernel_size (int): Conformer conv kernel size. It - will be used when only use_conformer_conv_in_text_encoder = True. - use_macaron_style_in_text_encoder (bool): Whether to use macaron style FFN - in conformer block of text encoder. 
- use_conformer_conv_in_text_encoder (bool): Whether to use covolution in - conformer block of text encoder. - decoder_kernel_size (int): Decoder kernel size. - decoder_channels (int): Number of decoder initial channels. - decoder_upsample_scales (List[int]): List of upsampling scales in decoder. - decoder_upsample_kernel_sizes (List[int]): List of kernel size for - upsampling layers in decoder. - decoder_resblock_kernel_sizes (List[int]): List of kernel size for resblocks - in decoder. - decoder_resblock_dilations (List[List[int]]): List of list of dilations for - resblocks in decoder. - use_weight_norm_in_decoder (bool): Whether to apply weight normalization in - decoder. - posterior_encoder_kernel_size (int): Posterior encoder kernel size. - posterior_encoder_layers (int): Number of layers of posterior encoder. - posterior_encoder_stacks (int): Number of stacks of posterior encoder. - posterior_encoder_base_dilation (int): Base dilation of posterior encoder. - posterior_encoder_dropout_rate (float): Dropout rate for posterior encoder. - use_weight_norm_in_posterior_encoder (bool): Whether to apply weight - normalization in posterior encoder. - flow_flows (int): Number of flows in flow. - flow_kernel_size (int): Kernel size in flow. - flow_base_dilation (int): Base dilation in flow. - flow_layers (int): Number of layers in flow. - flow_dropout_rate (float): Dropout rate in flow - use_weight_norm_in_flow (bool): Whether to apply weight normalization in - flow. - use_only_mean_in_flow (bool): Whether to use only mean in flow. - stochastic_duration_predictor_kernel_size (int): Kernel size in stochastic - duration predictor. - stochastic_duration_predictor_dropout_rate (float): Dropout rate in - stochastic duration predictor. - stochastic_duration_predictor_flows (int): Number of flows in stochastic - duration predictor. - stochastic_duration_predictor_dds_conv_layers (int): Number of DDS conv - layers in stochastic duration predictor. + global_channels (int): + Number of global conditioning channels. + segment_size (int): + Segment size for decoder. + text_encoder_attention_heads (int): + Number of heads in conformer block of text encoder. + text_encoder_ffn_expand (int): + Expansion ratio of FFN in conformer block of text encoder. + text_encoder_blocks (int): + Number of conformer blocks in text encoder. + text_encoder_positionwise_layer_type (str): + Position-wise layer type in conformer block of text encoder. + text_encoder_positionwise_conv_kernel_size (int): + Position-wise convolution kernel size in conformer block of text encoder. + Only used when the above layer type is conv1d or conv1d-linear. + text_encoder_positional_encoding_layer_type (str): + Positional encoding layer type in conformer block of text encoder. + text_encoder_self_attention_layer_type (str): + Self-attention layer type in conformer block of text encoder. + text_encoder_activation_type (str): + Activation function type in conformer block of text encoder. + text_encoder_normalize_before (bool): + Whether to apply layer norm before self-attention in conformer block of text encoder. + text_encoder_dropout_rate (float): + Dropout rate in conformer block of text encoder. + text_encoder_positional_dropout_rate (float): + Dropout rate for positional encoding in conformer block of text encoder. + text_encoder_attention_dropout_rate (float): + Dropout rate for attention in conformer block of text encoder. + text_encoder_conformer_kernel_size (int): + Conformer conv kernel size. 
+ It will be used only when use_conformer_conv_in_text_encoder = True. + use_macaron_style_in_text_encoder (bool): + Whether to use macaron style FFN in conformer block of text encoder. + use_conformer_conv_in_text_encoder (bool): + Whether to use convolution in conformer block of text encoder. + decoder_kernel_size (int): + Decoder kernel size. + decoder_channels (int): + Number of decoder initial channels. + decoder_upsample_scales (List[int]): + List of upsampling scales in decoder. + decoder_upsample_kernel_sizes (List[int]): + List of kernel size for upsampling layers in decoder. + decoder_resblock_kernel_sizes (List[int]): + List of kernel size for resblocks in decoder. + decoder_resblock_dilations (List[List[int]]): + List of list of dilations for resblocks in decoder. + use_weight_norm_in_decoder (bool): + Whether to apply weight normalization in decoder. + posterior_encoder_kernel_size (int): + Posterior encoder kernel size. + posterior_encoder_layers (int): + Number of layers of posterior encoder. + posterior_encoder_stacks (int): + Number of stacks of posterior encoder. + posterior_encoder_base_dilation (int): + Base dilation of posterior encoder. + posterior_encoder_dropout_rate (float): + Dropout rate for posterior encoder. + use_weight_norm_in_posterior_encoder (bool): + Whether to apply weight normalization in posterior encoder. + flow_flows (int): + Number of flows in flow. + flow_kernel_size (int): + Kernel size in flow. + flow_base_dilation (int): + Base dilation in flow. + flow_layers (int): + Number of layers in flow. + flow_dropout_rate (float): + Dropout rate in flow + use_weight_norm_in_flow (bool): + Whether to apply weight normalization in flow. + use_only_mean_in_flow (bool): + Whether to use only mean in flow. + stochastic_duration_predictor_kernel_size (int): + Kernel size in stochastic duration predictor. + stochastic_duration_predictor_dropout_rate (float): + Dropout rate in stochastic duration predictor. + stochastic_duration_predictor_flows (int): + Number of flows in stochastic duration predictor. + stochastic_duration_predictor_dds_conv_layers (int): + Number of DDS conv layers in stochastic duration predictor. """ super().__init__() self.segment_size = segment_size @@ -272,27 +295,40 @@ class VITSGenerator(nn.Layer): paddle.Tensor, paddle.Tensor, ], ]: """Calculate forward propagation. Args: - text (Tensor): Text index tensor (B, T_text). - text_lengths (Tensor): Text length tensor (B,). - feats (Tensor): Feature tensor (B, aux_channels, T_feats). - feats_lengths (Tensor): Feature length tensor (B,). - sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1). - spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim). - lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). + text (Tensor): + Text index tensor (B, T_text). + text_lengths (Tensor): + Text length tensor (B,). + feats (Tensor): + Feature tensor (B, aux_channels, T_feats). + feats_lengths (Tensor): + Feature length tensor (B,). + sids (Optional[Tensor]): + Speaker index tensor (B,) or (B, 1). + spembs (Optional[Tensor]): + Speaker embedding tensor (B, spk_embed_dim). + lids (Optional[Tensor]): + Language index tensor (B,) or (B, 1). Returns: - Tensor: Waveform tensor (B, 1, segment_size * upsample_factor). - Tensor: Duration negative log-likelihood (NLL) tensor (B,). - Tensor: Monotonic attention weight tensor (B, 1, T_feats, T_text). - Tensor: Segments start index tensor (B,). - Tensor: Text mask tensor (B, 1, T_text). - Tensor: Feature mask tensor (B, 1, T_feats).
- tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: - - Tensor: Posterior encoder hidden representation (B, H, T_feats). - - Tensor: Flow hidden representation (B, H, T_feats). - - Tensor: Expanded text encoder projected mean (B, H, T_feats). - - Tensor: Expanded text encoder projected scale (B, H, T_feats). - - Tensor: Posterior encoder projected mean (B, H, T_feats). - - Tensor: Posterior encoder projected scale (B, H, T_feats). + Tensor: + Waveform tensor (B, 1, segment_size * upsample_factor). + Tensor: + Duration negative log-likelihood (NLL) tensor (B,). + Tensor: + Monotonic attention weight tensor (B, 1, T_feats, T_text). + Tensor: + Segments start index tensor (B,). + Tensor: + Text mask tensor (B, 1, T_text). + Tensor: + Feature mask tensor (B, 1, T_feats). + tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: + - Tensor: Posterior encoder hidden representation (B, H, T_feats). + - Tensor: Flow hidden representation (B, H, T_feats). + - Tensor: Expanded text encoder projected mean (B, H, T_feats). + - Tensor: Expanded text encoder projected scale (B, H, T_feats). + - Tensor: Posterior encoder projected mean (B, H, T_feats). + - Tensor: Posterior encoder projected scale (B, H, T_feats). """ # forward text encoder x, m_p, logs_p, x_mask = self.text_encoder(text, text_lengths) @@ -402,24 +438,40 @@ class VITSGenerator(nn.Layer): ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Run inference. Args: - text (Tensor): Input text index tensor (B, T_text,). - text_lengths (Tensor): Text length tensor (B,). - feats (Tensor): Feature tensor (B, aux_channels, T_feats,). - feats_lengths (Tensor): Feature length tensor (B,). - sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1). - spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim). - lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). - dur (Optional[Tensor]): Ground-truth duration (B, T_text,). If provided, + text (Tensor): + Input text index tensor (B, T_text,). + text_lengths (Tensor): + Text length tensor (B,). + feats (Tensor): + Feature tensor (B, aux_channels, T_feats,). + feats_lengths (Tensor): + Feature length tensor (B,). + sids (Optional[Tensor]): + Speaker index tensor (B,) or (B, 1). + spembs (Optional[Tensor]): + Speaker embedding tensor (B, spk_embed_dim). + lids (Optional[Tensor]): + Language index tensor (B,) or (B, 1). + dur (Optional[Tensor]): + Ground-truth duration (B, T_text,). If provided, skip the prediction of durations (i.e., teacher forcing). - noise_scale (float): Noise scale parameter for flow. - noise_scale_dur (float): Noise scale parameter for duration predictor. - alpha (float): Alpha parameter to control the speed of generated speech. - max_len (Optional[int]): Maximum length of acoustic feature sequence. - use_teacher_forcing (bool): Whether to use teacher forcing. + noise_scale (float): + Noise scale parameter for flow. + noise_scale_dur (float): + Noise scale parameter for duration predictor. + alpha (float): + Alpha parameter to control the speed of generated speech. + max_len (Optional[int]): + Maximum length of acoustic feature sequence. + use_teacher_forcing (bool): + Whether to use teacher forcing. Returns: - Tensor: Generated waveform tensor (B, T_wav). - Tensor: Monotonic attention weight tensor (B, T_feats, T_text). - Tensor: Duration tensor (B, T_text). + Tensor: + Generated waveform tensor (B, T_wav). + Tensor: + Monotonic attention weight tensor (B, T_feats, T_text). + Tensor: + Duration tensor (B, T_text). 
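With the Returns block itemized, the inference contract can be read off directly. A usage sketch, assuming an already-constructed VITSGenerator and the parameter names shown in this hunk (construction and checkpoint loading omitted; values are illustrative):

import paddle

# `generator` is an already-built VITSGenerator instance (hypothetical setup).
text = paddle.randint(0, 100, shape=[1, 50])  # (B, T_text) token ids
text_lengths = paddle.to_tensor([50])         # (B,)
wav, att_w, dur = generator.inference(
    text=text,
    text_lengths=text_lengths,
    noise_scale=0.667,    # noise for the flow
    noise_scale_dur=0.8,  # noise for the stochastic duration predictor
    alpha=1.0)            # speech-speed control
# wav: (B, T_wav), att_w: (B, T_feats, T_text), dur: (B, T_text)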
""" # encoder x, m_p, logs_p, x_mask = self.text_encoder(text, text_lengths) @@ -533,15 +585,23 @@ class VITSGenerator(nn.Layer): lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor: """Run voice conversion. Args: - feats (Tensor): Feature tensor (B, aux_channels, T_feats,). - feats_lengths (Tensor): Feature length tensor (B,). - sids_src (Optional[Tensor]): Speaker index tensor of source feature (B,) or (B, 1). - sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (B,) or (B, 1). - spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (B, spk_embed_dim). - spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (B, spk_embed_dim). - lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). + feats (Tensor): + Feature tensor (B, aux_channels, T_feats,). + feats_lengths (Tensor): + Feature length tensor (B,). + sids_src (Optional[Tensor]): + Speaker index tensor of source feature (B,) or (B, 1). + sids_tgt (Optional[Tensor]): + Speaker index tensor of target feature (B,) or (B, 1). + spembs_src (Optional[Tensor]): + Speaker embedding tensor of source feature (B, spk_embed_dim). + spembs_tgt (Optional[Tensor]): + Speaker embedding tensor of target feature (B, spk_embed_dim). + lids (Optional[Tensor]): + Language index tensor (B,) or (B, 1). Returns: - Tensor: Generated waveform tensor (B, T_wav). + Tensor: + Generated waveform tensor (B, T_wav). """ # encoder g_src = None @@ -602,10 +662,13 @@ class VITSGenerator(nn.Layer): mask: paddle.Tensor) -> paddle.Tensor: """Generate path a.k.a. monotonic attention. Args: - dur (Tensor): Duration tensor (B, 1, T_text). - mask (Tensor): Attention mask tensor (B, 1, T_feats, T_text). + dur (Tensor): + Duration tensor (B, 1, T_text). + mask (Tensor): + Attention mask tensor (B, 1, T_feats, T_text). Returns: - Tensor: Path tensor (B, 1, T_feats, T_text). + Tensor: + Path tensor (B, 1, T_feats, T_text). """ b, _, t_y, t_x = paddle.shape(mask) cum_dur = paddle.cumsum(dur, -1) diff --git a/paddlespeech/t2s/models/vits/posterior_encoder.py b/paddlespeech/t2s/models/vits/posterior_encoder.py index 85323755..5e3d6b9c 100644 --- a/paddlespeech/t2s/models/vits/posterior_encoder.py +++ b/paddlespeech/t2s/models/vits/posterior_encoder.py @@ -52,17 +52,28 @@ class PosteriorEncoder(nn.Layer): """Initilialize PosteriorEncoder module. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - hidden_channels (int): Number of hidden channels. - kernel_size (int): Kernel size in WaveNet. - layers (int): Number of layers of WaveNet. - stacks (int): Number of repeat stacking of WaveNet. - base_dilation (int): Base dilation factor. - global_channels (int): Number of global conditioning channels. - dropout_rate (float): Dropout rate. - bias (bool): Whether to use bias parameters in conv. - use_weight_norm (bool): Whether to apply weight norm. + in_channels (int): + Number of input channels. + out_channels (int): + Number of output channels. + hidden_channels (int): + Number of hidden channels. + kernel_size (int): + Kernel size in WaveNet. + layers (int): + Number of layers of WaveNet. + stacks (int): + Number of repeat stacking of WaveNet. + base_dilation (int): + Base dilation factor. + global_channels (int): + Number of global conditioning channels. + dropout_rate (float): + Dropout rate. + bias (bool): + Whether to use bias parameters in conv. + use_weight_norm (bool): + Whether to apply weight norm. 
""" super().__init__() @@ -99,15 +110,22 @@ class PosteriorEncoder(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T_feats). - x_lengths (Tensor): Length tensor (B,). - g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1). + x (Tensor): + Input tensor (B, in_channels, T_feats). + x_lengths (Tensor): + Length tensor (B,). + g (Optional[Tensor]): + Global conditioning tensor (B, global_channels, 1). Returns: - Tensor: Encoded hidden representation tensor (B, out_channels, T_feats). - Tensor: Projected mean tensor (B, out_channels, T_feats). - Tensor: Projected scale tensor (B, out_channels, T_feats). - Tensor: Mask tensor for input tensor (B, 1, T_feats). + Tensor: + Encoded hidden representation tensor (B, out_channels, T_feats). + Tensor: + Projected mean tensor (B, out_channels, T_feats). + Tensor: + Projected scale tensor (B, out_channels, T_feats). + Tensor: + Mask tensor for input tensor (B, 1, T_feats). """ x_mask = make_non_pad_mask(x_lengths).unsqueeze(1) diff --git a/paddlespeech/t2s/models/vits/residual_coupling.py b/paddlespeech/t2s/models/vits/residual_coupling.py index c18beedd..afa6d1fa 100644 --- a/paddlespeech/t2s/models/vits/residual_coupling.py +++ b/paddlespeech/t2s/models/vits/residual_coupling.py @@ -55,18 +55,30 @@ class ResidualAffineCouplingBlock(nn.Layer): """Initilize ResidualAffineCouplingBlock module. Args: - in_channels (int): Number of input channels. - hidden_channels (int): Number of hidden channels. - flows (int): Number of flows. - kernel_size (int): Kernel size for WaveNet. - base_dilation (int): Base dilation factor for WaveNet. - layers (int): Number of layers of WaveNet. - stacks (int): Number of stacks of WaveNet. - global_channels (int): Number of global channels. - dropout_rate (float): Dropout rate. - use_weight_norm (bool): Whether to use weight normalization in WaveNet. - bias (bool): Whether to use bias paramters in WaveNet. - use_only_mean (bool): Whether to estimate only mean. + in_channels (int): + Number of input channels. + hidden_channels (int): + Number of hidden channels. + flows (int): + Number of flows. + kernel_size (int): + Kernel size for WaveNet. + base_dilation (int): + Base dilation factor for WaveNet. + layers (int): + Number of layers of WaveNet. + stacks (int): + Number of stacks of WaveNet. + global_channels (int): + Number of global channels. + dropout_rate (float): + Dropout rate. + use_weight_norm (bool): + Whether to use weight normalization in WaveNet. + bias (bool): + Whether to use bias paramters in WaveNet. + use_only_mean (bool): + Whether to estimate only mean. """ super().__init__() @@ -97,10 +109,14 @@ class ResidualAffineCouplingBlock(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T). - x_mask (Tensor): Length tensor (B, 1, T). - g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1). - inverse (bool): Whether to inverse the flow. + x (Tensor): + Input tensor (B, in_channels, T). + x_mask (Tensor): + Length tensor (B, 1, T). + g (Optional[Tensor]): + Global conditioning tensor (B, global_channels, 1). + inverse (bool): + Whether to inverse the flow. Returns: Tensor: Output tensor (B, in_channels, T). @@ -134,17 +150,28 @@ class ResidualAffineCouplingLayer(nn.Layer): """Initialzie ResidualAffineCouplingLayer module. Args: - in_channels (int): Number of input channels. - hidden_channels (int): Number of hidden channels. - kernel_size (int): Kernel size for WaveNet. 
-            base_dilation (int): Base dilation factor for WaveNet.
-            layers (int): Number of layers of WaveNet.
-            stacks (int): Number of stacks of WaveNet.
-            global_channels (int): Number of global channels.
-            dropout_rate (float): Dropout rate.
-            use_weight_norm (bool): Whether to use weight normalization in WaveNet.
-            bias (bool): Whether to use bias paramters in WaveNet.
-            use_only_mean (bool): Whether to estimate only mean.
+            in_channels (int):
+                Number of input channels.
+            hidden_channels (int):
+                Number of hidden channels.
+            kernel_size (int):
+                Kernel size for WaveNet.
+            base_dilation (int):
+                Base dilation factor for WaveNet.
+            layers (int):
+                Number of layers of WaveNet.
+            stacks (int):
+                Number of stacks of WaveNet.
+            global_channels (int):
+                Number of global channels.
+            dropout_rate (float):
+                Dropout rate.
+            use_weight_norm (bool):
+                Whether to use weight normalization in WaveNet.
+            bias (bool):
+                Whether to use bias parameters in WaveNet.
+            use_only_mean (bool):
+                Whether to estimate only mean.
         """
         assert in_channels % 2 == 0, "in_channels should be divisible by 2"
@@ -211,14 +238,20 @@ class ResidualAffineCouplingLayer(nn.Layer):
         """Calculate forward propagation.
         Args:
-            x (Tensor): Input tensor (B, in_channels, T).
-            x_lengths (Tensor): Length tensor (B,).
-            g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1).
-            inverse (bool): Whether to inverse the flow.
+            x (Tensor):
+                Input tensor (B, in_channels, T).
+            x_lengths (Tensor):
+                Length tensor (B,).
+            g (Optional[Tensor]):
+                Global conditioning tensor (B, global_channels, 1).
+            inverse (bool):
+                Whether to invert the flow.
         Returns:
-            Tensor: Output tensor (B, in_channels, T).
-            Tensor: Log-determinant tensor for NLL (B,) if not inverse.
+            Tensor:
+                Output tensor (B, in_channels, T).
+            Tensor:
+                Log-determinant tensor for NLL (B,) if not inverse.
         """
         xa, xb = paddle.split(x, 2, axis=1)
diff --git a/paddlespeech/t2s/models/vits/text_encoder.py b/paddlespeech/t2s/models/vits/text_encoder.py
index 3afc7831..799e0c75 100644
--- a/paddlespeech/t2s/models/vits/text_encoder.py
+++ b/paddlespeech/t2s/models/vits/text_encoder.py
@@ -62,23 +62,40 @@
         """Initialize TextEncoder module.
         Args:
-            vocabs (int): Vocabulary size.
-            attention_dim (int): Attention dimension.
-            attention_heads (int): Number of attention heads.
-            linear_units (int): Number of linear units of positionwise layers.
-            blocks (int): Number of encoder blocks.
-            positionwise_layer_type (str): Positionwise layer type.
-            positionwise_conv_kernel_size (int): Positionwise layer's kernel size.
-            positional_encoding_layer_type (str): Positional encoding layer type.
-            self_attention_layer_type (str): Self-attention layer type.
-            activation_type (str): Activation function type.
-            normalize_before (bool): Whether to apply LayerNorm before attention.
-            use_macaron_style (bool): Whether to use macaron style components.
-            use_conformer_conv (bool): Whether to use conformer conv layers.
-            conformer_kernel_size (int): Conformer's conv kernel size.
-            dropout_rate (float): Dropout rate.
-            positional_dropout_rate (float): Dropout rate for positional encoding.
-            attention_dropout_rate (float): Dropout rate for attention.
+            vocabs (int):
+                Vocabulary size.
+            attention_dim (int):
+                Attention dimension.
+            attention_heads (int):
+                Number of attention heads.
+            linear_units (int):
+                Number of linear units of positionwise layers.
+            blocks (int):
+                Number of encoder blocks.
+            positionwise_layer_type (str):
+                Positionwise layer type.
+            positionwise_conv_kernel_size (int):
+                Positionwise layer's kernel size.
+            positional_encoding_layer_type (str):
+                Positional encoding layer type.
+            self_attention_layer_type (str):
+                Self-attention layer type.
+            activation_type (str):
+                Activation function type.
+            normalize_before (bool):
+                Whether to apply LayerNorm before attention.
+            use_macaron_style (bool):
+                Whether to use macaron style components.
+            use_conformer_conv (bool):
+                Whether to use conformer conv layers.
+            conformer_kernel_size (int):
+                Conformer's conv kernel size.
+            dropout_rate (float):
+                Dropout rate.
+            positional_dropout_rate (float):
+                Dropout rate for positional encoding.
+            attention_dropout_rate (float):
+                Dropout rate for attention.
         """
         super().__init__()
@@ -121,14 +138,20 @@ class TextEncoder(nn.Layer):
         """Calculate forward propagation.
         Args:
-            x (Tensor): Input index tensor (B, T_text).
-            x_lengths (Tensor): Length tensor (B,).
+            x (Tensor):
+                Input index tensor (B, T_text).
+            x_lengths (Tensor):
+                Length tensor (B,).
         Returns:
-            Tensor: Encoded hidden representation (B, attention_dim, T_text).
-            Tensor: Projected mean tensor (B, attention_dim, T_text).
-            Tensor: Projected scale tensor (B, attention_dim, T_text).
-            Tensor: Mask tensor for input tensor (B, 1, T_text).
+            Tensor:
+                Encoded hidden representation (B, attention_dim, T_text).
+            Tensor:
+                Projected mean tensor (B, attention_dim, T_text).
+            Tensor:
+                Projected scale tensor (B, attention_dim, T_text).
+            Tensor:
+                Mask tensor for input tensor (B, 1, T_text).
         """
         x = self.emb(x) * math.sqrt(self.attention_dim)
diff --git a/paddlespeech/t2s/models/vits/vits.py b/paddlespeech/t2s/models/vits/vits.py
index 983bf0a3..0ff3a546 100644
--- a/paddlespeech/t2s/models/vits/vits.py
+++ b/paddlespeech/t2s/models/vits/vits.py
@@ -156,17 +156,25 @@ class VITS(nn.Layer):
             init_type: str="xavier_uniform", ):
         """Initialize VITS module.
         Args:
-            idim (int): Input vocabrary size.
-            odim (int): Acoustic feature dimension. The actual output channels will
+            idim (int):
+                Input vocabulary size.
+            odim (int):
+                Acoustic feature dimension. The actual output channels will
                 be 1 since VITS is the end-to-end text-to-wave model but for the compatibility odim is used to indicate the acoustic feature dimension.
-            sampling_rate (int): Sampling rate, not used for the training but it will
+            sampling_rate (int):
+                Sampling rate, not used for the training but it will
                 be referred in saving waveform during the inference.
-            generator_type (str): Generator type.
-            generator_params (Dict[str, Any]): Parameter dict for generator.
-            discriminator_type (str): Discriminator type.
-            discriminator_params (Dict[str, Any]): Parameter dict for discriminator.
-            cache_generator_outputs (bool): Whether to cache generator outputs.
+            generator_type (str):
+                Generator type.
+            generator_params (Dict[str, Any]):
+                Parameter dict for generator.
+            discriminator_type (str):
+                Discriminator type.
+            discriminator_params (Dict[str, Any]):
+                Parameter dict for discriminator.
+            cache_generator_outputs (bool):
+                Whether to cache generator outputs.
         """
         assert check_argument_types()
         super().__init__()
@@ -218,14 +226,22 @@ class VITS(nn.Layer):
             forward_generator: bool=True, ) -> Dict[str, Any]:
         """Perform generator forward.
         Args:
-            text (Tensor): Text index tensor (B, T_text).
-            text_lengths (Tensor): Text length tensor (B,).
-            feats (Tensor): Feature tensor (B, T_feats, aux_channels).
-            feats_lengths (Tensor): Feature length tensor (B,).
-            sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
-            spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
-            lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
-            forward_generator (bool): Whether to forward generator.
+            text (Tensor):
+                Text index tensor (B, T_text).
+            text_lengths (Tensor):
+                Text length tensor (B,).
+            feats (Tensor):
+                Feature tensor (B, T_feats, aux_channels).
+            feats_lengths (Tensor):
+                Feature length tensor (B,).
+            sids (Optional[Tensor]):
+                Speaker index tensor (B,) or (B, 1).
+            spembs (Optional[Tensor]):
+                Speaker embedding tensor (B, spk_embed_dim).
+            lids (Optional[Tensor]):
+                Language index tensor (B,) or (B, 1).
+            forward_generator (bool):
+                Whether to forward generator.
         Returns:
         """
@@ -259,13 +275,20 @@ class VITS(nn.Layer):
             lids: Optional[paddle.Tensor]=None, ) -> Dict[str, Any]:
         """Perform generator forward.
         Args:
-            text (Tensor): Text index tensor (B, T_text).
-            text_lengths (Tensor): Text length tensor (B,).
-            feats (Tensor): Feature tensor (B, T_feats, aux_channels).
-            feats_lengths (Tensor): Feature length tensor (B,).
-            sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
-            spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
-            lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
+            text (Tensor):
+                Text index tensor (B, T_text).
+            text_lengths (Tensor):
+                Text length tensor (B,).
+            feats (Tensor):
+                Feature tensor (B, T_feats, aux_channels).
+            feats_lengths (Tensor):
+                Feature length tensor (B,).
+            sids (Optional[Tensor]):
+                Speaker index tensor (B,) or (B, 1).
+            spembs (Optional[Tensor]):
+                Speaker embedding tensor (B, spk_embed_dim).
+            lids (Optional[Tensor]):
+                Language index tensor (B,) or (B, 1).
         Returns:
         """
@@ -304,13 +327,20 @@ class VITS(nn.Layer):
             lids: Optional[paddle.Tensor]=None, ) -> Dict[str, Any]:
         """Perform discriminator forward.
         Args:
-            text (Tensor): Text index tensor (B, T_text).
-            text_lengths (Tensor): Text length tensor (B,).
-            feats (Tensor): Feature tensor (B, T_feats, aux_channels).
-            feats_lengths (Tensor): Feature length tensor (B,).
-            sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
-            spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
-            lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
+            text (Tensor):
+                Text index tensor (B, T_text).
+            text_lengths (Tensor):
+                Text length tensor (B,).
+            feats (Tensor):
+                Feature tensor (B, T_feats, aux_channels).
+            feats_lengths (Tensor):
+                Feature length tensor (B,).
+            sids (Optional[Tensor]):
+                Speaker index tensor (B,) or (B, 1).
+            spembs (Optional[Tensor]):
+                Speaker embedding tensor (B, spk_embed_dim).
+            lids (Optional[Tensor]):
+                Language index tensor (B,) or (B, 1).
         Returns:
         """
@@ -353,22 +383,36 @@ class VITS(nn.Layer):
             use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
         """Run inference.
         Args:
-            text (Tensor): Input text index tensor (T_text,).
-            feats (Tensor): Feature tensor (T_feats, aux_channels).
-            sids (Tensor): Speaker index tensor (1,).
-            spembs (Optional[Tensor]): Speaker embedding tensor (spk_embed_dim,).
-            lids (Tensor): Language index tensor (1,).
-            durations (Tensor): Ground-truth duration tensor (T_text,).
-            noise_scale (float): Noise scale value for flow.
-            noise_scale_dur (float): Noise scale value for duration predictor.
-            alpha (float): Alpha parameter to control the speed of generated speech.
-            max_len (Optional[int]): Maximum length.
-            use_teacher_forcing (bool): Whether to use teacher forcing.
+            text (Tensor):
+                Input text index tensor (T_text,).
+            feats (Tensor):
+                Feature tensor (T_feats, aux_channels).
+            sids (Tensor):
+                Speaker index tensor (1,).
+            spembs (Optional[Tensor]):
+                Speaker embedding tensor (spk_embed_dim,).
+            lids (Tensor):
+                Language index tensor (1,).
+            durations (Tensor):
+                Ground-truth duration tensor (T_text,).
+            noise_scale (float):
+                Noise scale value for flow.
+            noise_scale_dur (float):
+                Noise scale value for duration predictor.
+            alpha (float):
+                Alpha parameter to control the speed of generated speech.
+            max_len (Optional[int]):
+                Maximum length.
+            use_teacher_forcing (bool):
+                Whether to use teacher forcing.
         Returns:
             Dict[str, Tensor]:
-                * wav (Tensor): Generated waveform tensor (T_wav,).
-                * att_w (Tensor): Monotonic attention weight tensor (T_feats, T_text).
-                * duration (Tensor): Predicted duration tensor (T_text,).
+                * wav (Tensor):
+                    Generated waveform tensor (T_wav,).
+                * att_w (Tensor):
+                    Monotonic attention weight tensor (T_feats, T_text).
+                * duration (Tensor):
+                    Predicted duration tensor (T_text,).
         """
         # setup
         text = text[None]
@@ -417,15 +461,22 @@ class VITS(nn.Layer):
             lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor:
         """Run voice conversion.
         Args:
-            feats (Tensor): Feature tensor (T_feats, aux_channels).
-            sids_src (Optional[Tensor]): Speaker index tensor of source feature (1,).
-            sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (1,).
-            spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (spk_embed_dim,).
-            spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (spk_embed_dim,).
-            lids (Optional[Tensor]): Language index tensor (1,).
+            feats (Tensor):
+                Feature tensor (T_feats, aux_channels).
+            sids_src (Optional[Tensor]):
+                Speaker index tensor of source feature (1,).
+            sids_tgt (Optional[Tensor]):
+                Speaker index tensor of target feature (1,).
+            spembs_src (Optional[Tensor]):
+                Speaker embedding tensor of source feature (spk_embed_dim,).
+            spembs_tgt (Optional[Tensor]):
+                Speaker embedding tensor of target feature (spk_embed_dim,).
+            lids (Optional[Tensor]):
+                Language index tensor (1,).
         Returns:
             Dict[str, Tensor]:
-                * wav (Tensor): Generated waveform tensor (T_wav,).
+                * wav (Tensor):
+                    Generated waveform tensor (T_wav,).
         """
         assert feats is not None
         feats = feats[None].transpose([0, 2, 1])
diff --git a/paddlespeech/t2s/models/vits/wavenet/residual_block.py b/paddlespeech/t2s/models/vits/wavenet/residual_block.py
index 197e7497..b5095e16 100644
--- a/paddlespeech/t2s/models/vits/wavenet/residual_block.py
+++ b/paddlespeech/t2s/models/vits/wavenet/residual_block.py
@@ -39,14 +39,22 @@
         """Initialize ResidualBlock module.
         Args:
-            kernel_size (int): Kernel size of dilation convolution layer.
-            residual_channels (int): Number of channels for residual connection.
-            skip_channels (int): Number of channels for skip connection.
-            aux_channels (int): Number of local conditioning channels.
-            dropout (float): Dropout probability.
-            dilation (int): Dilation factor.
-            bias (bool): Whether to add bias parameter in convolution layers.
-            scale_residual (bool): Whether to scale the residual outputs.
+            kernel_size (int):
+                Kernel size of dilation convolution layer.
+            residual_channels (int):
+                Number of channels for residual connection.
+            skip_channels (int):
+                Number of channels for skip connection.
+            aux_channels (int):
+                Number of local conditioning channels.
+            dropout (float):
+                Dropout probability.
+            dilation (int):
+                Dilation factor.
+            bias (bool):
+                Whether to add bias parameter in convolution layers.
+            scale_residual (bool):
+                Whether to scale the residual outputs.
         """
         super().__init__()
diff --git a/paddlespeech/t2s/models/vits/wavenet/wavenet.py b/paddlespeech/t2s/models/vits/wavenet/wavenet.py
index 44693dac..04422939 100644
--- a/paddlespeech/t2s/models/vits/wavenet/wavenet.py
+++ b/paddlespeech/t2s/models/vits/wavenet/wavenet.py
@@ -47,25 +47,42 @@
         """Initialize WaveNet module.
         Args:
-            in_channels (int): Number of input channels.
-            out_channels (int): Number of output channels.
-            kernel_size (int): Kernel size of dilated convolution.
-            layers (int): Number of residual block layers.
-            stacks (int): Number of stacks i.e., dilation cycles.
-            base_dilation (int): Base dilation factor.
-            residual_channels (int): Number of channels in residual conv.
-            gate_channels (int): Number of channels in gated conv.
-            skip_channels (int): Number of channels in skip conv.
-            aux_channels (int): Number of channels for local conditioning feature.
-            global_channels (int): Number of channels for global conditioning feature.
-            dropout_rate (float): Dropout rate. 0.0 means no dropout applied.
-            bias (bool): Whether to use bias parameter in conv layer.
-            use_weight_norm (bool): Whether to use weight norm. If set to true, it will
-                be applied to all of the conv layers.
-            use_first_conv (bool): Whether to use the first conv layers.
-            use_last_conv (bool): Whether to use the last conv layers.
-            scale_residual (bool): Whether to scale the residual outputs.
-            scale_skip_connect (bool): Whether to scale the skip connection outputs.
+            in_channels (int):
+                Number of input channels.
+            out_channels (int):
+                Number of output channels.
+            kernel_size (int):
+                Kernel size of dilated convolution.
+            layers (int):
+                Number of residual block layers.
+            stacks (int):
+                Number of stacks i.e., dilation cycles.
+            base_dilation (int):
+                Base dilation factor.
+            residual_channels (int):
+                Number of channels in residual conv.
+            gate_channels (int):
+                Number of channels in gated conv.
+            skip_channels (int):
+                Number of channels in skip conv.
+            aux_channels (int):
+                Number of channels for local conditioning feature.
+            global_channels (int):
+                Number of channels for global conditioning feature.
+            dropout_rate (float):
+                Dropout rate. 0.0 means no dropout applied.
+            bias (bool):
+                Whether to use bias parameter in conv layer.
+            use_weight_norm (bool):
+                Whether to use weight norm. If set to true, it will be applied to all of the conv layers.
+            use_first_conv (bool):
+                Whether to use the first conv layers.
+            use_last_conv (bool):
+                Whether to use the last conv layers.
+            scale_residual (bool):
+                Whether to scale the residual outputs.
+            scale_skip_connect (bool):
+                Whether to scale the skip connection outputs.
         """
         super().__init__()
@@ -128,15 +145,18 @@ class WaveNet(nn.Layer):
         """Calculate forward propagation.
         Args:
-            x (Tensor): Input noise signal (B, 1, T) if use_first_conv else
-                (B, residual_channels, T).
-            x_mask (Optional[Tensor]): Mask tensor (B, 1, T).
-            c (Optional[Tensor]): Local conditioning features (B, aux_channels, T).
-            g (Optional[Tensor]): Global conditioning features (B, global_channels, 1).
+            x (Tensor):
+                Input noise signal (B, 1, T) if use_first_conv else (B, residual_channels, T).
+            x_mask (Optional[Tensor]):
+                Mask tensor (B, 1, T).
+            c (Optional[Tensor]):
+                Local conditioning features (B, aux_channels, T).
+            g (Optional[Tensor]):
+                Global conditioning features (B, global_channels, 1).
         Returns:
-            Tensor: Output tensor (B, out_channels, T) if use_last_conv else
-                (B, residual_channels, T).
+            Tensor:
+                Output tensor (B, out_channels, T) if use_last_conv else (B, residual_channels, T).
         """
         # encode to hidden representation
diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py
index 254edbb2..44e9f2d8 100644
--- a/paddlespeech/t2s/models/wavernn/wavernn.py
+++ b/paddlespeech/t2s/models/wavernn/wavernn.py
@@ -69,9 +69,11 @@ class MelResNet(nn.Layer):
     def forward(self, x):
         '''
         Args:
-            x (Tensor): Input tensor (B, in_dims, T).
+            x (Tensor):
+                Input tensor (B, in_dims, T).
         Returns:
-            Tensor: Output tensor (B, res_out_dims, T).
+            Tensor:
+                Output tensor (B, res_out_dims, T).
         '''
         x = self.conv_in(x)
@@ -119,10 +121,13 @@ class UpsampleNetwork(nn.Layer):
     def forward(self, m):
         '''
         Args:
-            c (Tensor): Input tensor (B, C_aux, T).
+            c (Tensor):
+                Input tensor (B, C_aux, T).
         Returns:
-            Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), C_aux).
-            Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), res_out_dims).
+            Tensor:
+                Output tensor (B, (T - 2 * pad) * prod(upsample_scales), C_aux).
+            Tensor:
+                Output tensor (B, (T - 2 * pad) * prod(upsample_scales), res_out_dims).
         '''
         # aux: [B, C_aux, T]
         # -> [B, res_out_dims, T - 2 * aux_context_window]
@@ -302,7 +307,8 @@ class WaveRNN(nn.Layer):
                 number of samples for crossfading between batches
             mu_law(bool)
         Returns:
-            wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out).
+            wav sequence:
+                Output (T' * prod(upsample_scales), out_channels, C_out).
         """
         self.eval()
@@ -423,7 +429,7 @@ class WaveRNN(nn.Layer):
             x(Tensor): mel, [1, n_frames, 80]
             pad(int):
-            side(str, optional): (Default value = 'both')
+            side(str, optional): (Default value = 'both')
         Returns:
             Tensor
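
A note on the `_generate_path` docstring reformatted above: the construction it documents (cumulative durations expanded into a one-hot monotonic attention path) is compact enough to sketch outside the framework. The following is a minimal NumPy sketch, assuming integer durations and the shapes given in the docstring; the helper name `generate_path_sketch` and the NumPy phrasing are illustrative, not part of PaddleSpeech.

    import numpy as np

    def generate_path_sketch(dur, mask):
        # dur:  (B, 1, T_text) integer duration of each text token
        # mask: (B, 1, T_feats, T_text) attention mask
        # returns a one-hot path of shape (B, 1, T_feats, T_text)
        b, _, t_feats, t_text = mask.shape
        # cumulative end frame of each token, mirroring `cum_dur = paddle.cumsum(dur, -1)` above
        cum_dur = np.cumsum(dur, axis=-1)
        frame = np.arange(t_feats).reshape(1, 1, t_feats, 1)
        # frame t is assigned to token j iff cum_dur[j - 1] <= t < cum_dur[j]
        before_end = (frame < cum_dur[:, :, None, :]).astype(np.float32)
        shifted = np.pad(before_end, ((0, 0), (0, 0), (0, 0), (1, 0)))[..., :t_text]
        return (before_end - shifted) * mask

    # toy check: durations [2, 3] spread two tokens over five frames
    dur = np.array([[[2, 3]]])
    mask = np.ones((1, 1, 5, 2), dtype=np.float32)
    print(generate_path_sketch(dur, mask)[0, 0])
    # frames 0-1 attend to token 0, frames 2-4 attend to token 1

Taking the difference of two cumulative masks avoids any Python loop over frames, which is presumably why the hunk above begins with `cum_dur = paddle.cumsum(dur, -1)` rather than repeating tokens one at a time.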