From 683679bec72009517f6352395b6a133018cc92dd Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 10 Feb 2022 12:41:24 +0000 Subject: [PATCH 1/2] merge data and datasets, test=tts --- paddlespeech/t2s/__init__.py | 1 - paddlespeech/t2s/data/__init__.py | 17 -- paddlespeech/t2s/data/dataset.py | 261 ------------------ paddlespeech/t2s/datasets/__init__.py | 1 - paddlespeech/t2s/datasets/am_batch_fn.py | 2 +- paddlespeech/t2s/{data => datasets}/batch.py | 0 paddlespeech/t2s/datasets/common.py | 92 ------ .../t2s/{data => datasets}/get_feats.py | 0 .../t2s/exps/fastspeech2/preprocess.py | 6 +- .../parallelwave_gan/synthesize_from_wav.py | 2 +- .../t2s/exps/gan_vocoder/preprocess.py | 2 +- .../t2s/exps/speedyspeech/preprocess.py | 2 +- paddlespeech/t2s/exps/tacotron2/preprocess.py | 2 +- .../t2s/exps/transformer_tts/preprocess.py | 2 +- paddlespeech/t2s/exps/waveflow/ljspeech.py | 4 +- 15 files changed, 11 insertions(+), 383 deletions(-) delete mode 100644 paddlespeech/t2s/data/__init__.py delete mode 100644 paddlespeech/t2s/data/dataset.py rename paddlespeech/t2s/{data => datasets}/batch.py (100%) delete mode 100644 paddlespeech/t2s/datasets/common.py rename paddlespeech/t2s/{data => datasets}/get_feats.py (100%) diff --git a/paddlespeech/t2s/__init__.py b/paddlespeech/t2s/__init__.py index 8a0acc48..7d93c026 100644 --- a/paddlespeech/t2s/__init__.py +++ b/paddlespeech/t2s/__init__.py @@ -13,7 +13,6 @@ # limitations under the License. import logging -from . import data from . import datasets from . import exps from . import frontend diff --git a/paddlespeech/t2s/data/__init__.py b/paddlespeech/t2s/data/__init__.py deleted file mode 100644 index c605205d..00000000 --- a/paddlespeech/t2s/data/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""t2s's infrastructure for data processing. -""" -from .batch import * -from .dataset import * diff --git a/paddlespeech/t2s/data/dataset.py b/paddlespeech/t2s/data/dataset.py deleted file mode 100644 index 2d6c03cb..00000000 --- a/paddlespeech/t2s/data/dataset.py +++ /dev/null @@ -1,261 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import six -from paddle.io import Dataset - -__all__ = [ - "split", - "TransformDataset", - "CacheDataset", - "TupleDataset", - "DictDataset", - "SliceDataset", - "SubsetDataset", - "FilterDataset", - "ChainDataset", -] - - -def split(dataset, first_size): - """A utility function to split a dataset into two datasets.""" - first = SliceDataset(dataset, 0, first_size) - second = SliceDataset(dataset, first_size, len(dataset)) - return first, second - - -class TransformDataset(Dataset): - def __init__(self, dataset, transform): - """Dataset which is transformed from another with a transform. - - Args: - dataset (Dataset): the base dataset. - transform (callable): the transform which takes an example of the base dataset as parameter and return a new example. - """ - self._dataset = dataset - self._transform = transform - - def __len__(self): - return len(self._dataset) - - def __getitem__(self, i): - in_data = self._dataset[i] - return self._transform(in_data) - - -class CacheDataset(Dataset): - def __init__(self, dataset): - """A lazy cache of the base dataset. - - Args: - dataset (Dataset): the base dataset to cache. - """ - self._dataset = dataset - self._cache = dict() - - def __len__(self): - return len(self._dataset) - - def __getitem__(self, i): - if i not in self._cache: - self._cache[i] = self._dataset[i] - return self._cache[i] - - -class TupleDataset(Dataset): - def __init__(self, *datasets): - """A compound dataset made from several datasets of the same length. An example of the `TupleDataset` is a tuple of examples from the constituent datasets. - - Args: - datasets: tuple[Dataset], the constituent datasets. - """ - if not datasets: - raise ValueError("no datasets are given") - length = len(datasets[0]) - for i, dataset in enumerate(datasets): - if len(dataset) != length: - raise ValueError("all the datasets should have the same length." - "dataset {} has a different length".format(i)) - self._datasets = datasets - self._length = length - - def __getitem__(self, index): - # SOA - batches = [dataset[index] for dataset in self._datasets] - if isinstance(index, slice): - length = len(batches[0]) - # AOS - return [ - tuple([batch[i] for batch in batches]) - for i in six.moves.range(length) - ] - else: - return tuple(batches) - - def __len__(self): - return self._length - - -class DictDataset(Dataset): - def __init__(self, **datasets): - """ - A compound dataset made from several datasets of the same length. An - example of the `DictDataset` is a dict of examples from the constituent - datasets. - - WARNING: paddle does not have a good support for DictDataset, because - every batch yield from a DataLoader is a list, but it cannot be a dict. - So you have to provide a collate function because you cannot use the - default one. - - Args: - datasets: Dict[Dataset], the constituent datasets. - """ - if not datasets: - raise ValueError("no datasets are given") - length = None - for key, dataset in six.iteritems(datasets): - if length is None: - length = len(dataset) - elif len(dataset) != length: - raise ValueError( - "all the datasets should have the same length." 
- "dataset {} has a different length".format(key)) - self._datasets = datasets - self._length = length - - def __getitem__(self, index): - batches = { - key: dataset[index] - for key, dataset in six.iteritems(self._datasets) - } - if isinstance(index, slice): - length = len(six.next(six.itervalues(batches))) - return [{key: batch[i] - for key, batch in six.iteritems(batches)} - for i in six.moves.range(length)] - else: - return batches - - def __len__(self): - return self._length - - -class SliceDataset(Dataset): - def __init__(self, dataset, start, finish, order=None): - """A Dataset which is a slice of the base dataset. - - Args: - dataset (Dataset): the base dataset. - start (int): the start of the slice. - finish (int): the end of the slice, not inclusive. - order (List[int], optional): the order, it is a permutation of the valid example ids of the base dataset. If `order` is provided, the slice is taken in `order`. Defaults to None. - """ - if start < 0 or finish > len(dataset): - raise ValueError("subset overruns the dataset.") - self._dataset = dataset - self._start = start - self._finish = finish - self._size = finish - start - - if order is not None and len(order) != len(dataset): - raise ValueError( - "order should have the same length as the dataset" - "len(order) = {} which does not euqals len(dataset) = {} ". - format(len(order), len(dataset))) - self._order = order - - def __len__(self): - return self._size - - def __getitem__(self, i): - if i >= 0: - if i >= self._size: - raise IndexError('dataset index out of range') - index = self._start + i - else: - if i < -self._size: - raise IndexError('dataset index out of range') - index = self._finish + i - - if self._order is not None: - index = self._order[index] - return self._dataset[index] - - -class SubsetDataset(Dataset): - def __init__(self, dataset, indices): - """A Dataset which is a subset of the base dataset. - - Args: - dataset (Dataset): the base dataset. - indices (Iterable[int]): the indices of the examples to pick. - """ - self._dataset = dataset - if len(indices) > len(dataset): - raise ValueError("subset's size larger that dataset's size!") - self._indices = indices - self._size = len(indices) - - def __len__(self): - return self._size - - def __getitem__(self, i): - index = self._indices[i] - return self._dataset[index] - - -class FilterDataset(Dataset): - def __init__(self, dataset, filter_fn): - """A filtered dataset. - - Args: - dataset (Dataset): the base dataset. - filter_fn (callable): a callable which takes an example of the base dataset and return a boolean. - """ - self._dataset = dataset - self._indices = [ - i for i in range(len(dataset)) if filter_fn(dataset[i]) - ] - self._size = len(self._indices) - - def __len__(self): - return self._size - - def __getitem__(self, i): - index = self._indices[i] - return self._dataset[index] - - -class ChainDataset(Dataset): - def __init__(self, *datasets): - """A concatenation of the several datasets which the same structure. - - Args: - datasets (Iterable[Dataset]): datasets to concat. 
- """ - self._datasets = datasets - - def __len__(self): - return sum(len(dataset) for dataset in self._datasets) - - def __getitem__(self, i): - if i < 0: - raise IndexError("ChainDataset doesnot support negative indexing.") - - for dataset in self._datasets: - if i < len(dataset): - return dataset[i] - i -= len(dataset) - - raise IndexError("dataset index out of range") diff --git a/paddlespeech/t2s/datasets/__init__.py b/paddlespeech/t2s/datasets/__init__.py index fc64a82f..caf20aac 100644 --- a/paddlespeech/t2s/datasets/__init__.py +++ b/paddlespeech/t2s/datasets/__init__.py @@ -11,5 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .common import * from .ljspeech import * diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index 655e06e3..4e3ad3c1 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -14,7 +14,7 @@ import numpy as np import paddle -from paddlespeech.t2s.data.batch import batch_sequences +from paddlespeech.t2s.datasets.batch import batch_sequences def tacotron2_single_spk_batch_fn(examples): diff --git a/paddlespeech/t2s/data/batch.py b/paddlespeech/t2s/datasets/batch.py similarity index 100% rename from paddlespeech/t2s/data/batch.py rename to paddlespeech/t2s/datasets/batch.py diff --git a/paddlespeech/t2s/datasets/common.py b/paddlespeech/t2s/datasets/common.py deleted file mode 100644 index 122a35ae..00000000 --- a/paddlespeech/t2s/datasets/common.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pathlib import Path -from typing import List - -import librosa -import numpy as np -from paddle.io import Dataset - -__all__ = ["AudioSegmentDataset", "AudioDataset", "AudioFolderDataset"] - - -class AudioSegmentDataset(Dataset): - """A simple dataset adaptor for audio files to train vocoders. - Read -> trim silence -> normalize -> extract a segment - """ - - def __init__(self, - file_paths: List[Path], - sample_rate: int, - length: int, - top_db: float): - self.file_paths = file_paths - self.sr = sample_rate - self.top_db = top_db - self.length = length # samples in the clip - - def __getitem__(self, i): - fpath = self.file_paths[i] - y, sr = librosa.load(fpath, sr=self.sr) - y, _ = librosa.effects.trim(y, top_db=self.top_db) - y = librosa.util.normalize(y) - y = y.astype(np.float32) - - # pad or trim - if y.size <= self.length: - y = np.pad(y, [0, self.length - len(y)], mode='constant') - else: - start = np.random.randint(0, 1 + len(y) - self.length) - y = y[start:start + self.length] - return y - - def __len__(self): - return len(self.file_paths) - - -class AudioDataset(Dataset): - """A simple dataset adaptor for the audio files. 
- Read -> trim silence -> normalize - """ - - def __init__(self, - file_paths: List[Path], - sample_rate: int, - top_db: float=60): - self.file_paths = file_paths - self.sr = sample_rate - self.top_db = top_db - - def __getitem__(self, i): - fpath = self.file_paths[i] - y, sr = librosa.load(fpath, sr=self.sr) - y, _ = librosa.effects.trim(y, top_db=self.top_db) - y = librosa.util.normalize(y) - y = y.astype(np.float32) - return y - - def __len__(self): - return len(self.file_paths) - - -class AudioFolderDataset(AudioDataset): - def __init__( - self, - root, - sample_rate, - top_db=60, - extension=".wav", ): - root = Path(root).expanduser() - file_paths = sorted(list(root.rglob("*{}".format(extension)))) - super().__init__(file_paths, sample_rate, top_db) diff --git a/paddlespeech/t2s/data/get_feats.py b/paddlespeech/t2s/datasets/get_feats.py similarity index 100% rename from paddlespeech/t2s/data/get_feats.py rename to paddlespeech/t2s/datasets/get_feats.py diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py index fd6da2cb..5bda7545 100644 --- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py +++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py @@ -27,9 +27,9 @@ import tqdm import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import Energy -from paddlespeech.t2s.data.get_feats import LogMelFBank -from paddlespeech.t2s.data.get_feats import Pitch +from paddlespeech.t2s.datasets.get_feats import Energy +from paddlespeech.t2s.datasets.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import Pitch from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length from paddlespeech.t2s.datasets.preprocess_utils import get_input_token from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py index f5affb50..def30e67 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py @@ -23,7 +23,7 @@ import soundfile as sf import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator from paddlespeech.t2s.models.parallel_wavegan import PWGInference from paddlespeech.t2s.modules.normalizer import ZScore diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index 47d0a292..4871bca7 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -27,7 +27,7 @@ import tqdm import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import merge_silence from paddlespeech.t2s.utils import str2bool diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py index db888fba..3f81c4e1 100644 --- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py +++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py @@ -27,7 +27,7 @@ import tqdm import yaml from yacs.config import CfgNode 
-from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import get_phones_tones diff --git a/paddlespeech/t2s/exps/tacotron2/preprocess.py b/paddlespeech/t2s/exps/tacotron2/preprocess.py index ffbeaad9..7f41089e 100644 --- a/paddlespeech/t2s/exps/tacotron2/preprocess.py +++ b/paddlespeech/t2s/exps/tacotron2/preprocess.py @@ -27,7 +27,7 @@ import tqdm import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length from paddlespeech.t2s.datasets.preprocess_utils import get_input_token from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur diff --git a/paddlespeech/t2s/exps/transformer_tts/preprocess.py b/paddlespeech/t2s/exps/transformer_tts/preprocess.py index 93158b67..7cfa91b9 100644 --- a/paddlespeech/t2s/exps/transformer_tts/preprocess.py +++ b/paddlespeech/t2s/exps/transformer_tts/preprocess.py @@ -26,7 +26,7 @@ import tqdm import yaml from yacs.config import CfgNode as Configuration -from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.frontend import English diff --git a/paddlespeech/t2s/exps/waveflow/ljspeech.py b/paddlespeech/t2s/exps/waveflow/ljspeech.py index 655b63da..a6efa9ec 100644 --- a/paddlespeech/t2s/exps/waveflow/ljspeech.py +++ b/paddlespeech/t2s/exps/waveflow/ljspeech.py @@ -17,8 +17,8 @@ import numpy as np import pandas from paddle.io import Dataset -from paddlespeech.t2s.data.batch import batch_spec -from paddlespeech.t2s.data.batch import batch_wav +from paddlespeech.t2s.datasets.batch import batch_spec +from paddlespeech.t2s.datasets.batch import batch_wav class LJSpeech(Dataset): From 9699c00769e90fcfcd297240e87b12adb21e8caf Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 11 Feb 2022 14:11:40 +0000 Subject: [PATCH 2/2] change the docstring style from numpydoc to google, test=tts --- paddlespeech/t2s/datasets/data_table.py | 56 +- paddlespeech/t2s/datasets/preprocess_utils.py | 51 +- paddlespeech/t2s/datasets/vocoder_batch_fn.py | 64 +-- .../t2s/exps/transformer_tts/preprocess.py | 28 +- paddlespeech/t2s/frontend/arpabet.py | 104 ++-- paddlespeech/t2s/frontend/phonectic.py | 145 ++---- paddlespeech/t2s/frontend/vocab.py | 22 +- .../frontend/zh_normalization/chronology.py | 30 +- .../t2s/frontend/zh_normalization/num.py | 70 +-- .../frontend/zh_normalization/phonecode.py | 20 +- .../frontend/zh_normalization/quantifier.py | 10 +- .../zh_normalization/text_normlization.py | 12 +- .../t2s/models/fastspeech2/fastspeech2.py | 473 ++++++----------- paddlespeech/t2s/models/hifigan/hifigan.py | 295 ++++------- paddlespeech/t2s/models/melgan/melgan.py | 199 +++----- .../t2s/models/melgan/style_melgan.py | 109 ++-- .../parallel_wavegan/parallel_wavegan.py | 227 +++----- .../t2s/models/tacotron2/tacotron2.py | 207 +++----- .../models/transformer_tts/transformer_tts.py | 333 +++++------- paddlespeech/t2s/models/waveflow.py | 483 ++++++------------ paddlespeech/t2s/models/wavernn/wavernn.py | 240 ++++----- paddlespeech/t2s/modules/causal_conv.py | 24 +- .../t2s/modules/conformer/convolution.py | 23 +- 
.../t2s/modules/conformer/encoder_layer.py | 82 ++- paddlespeech/t2s/modules/conv.py | 164 +++--- paddlespeech/t2s/modules/geometry.py | 28 +- paddlespeech/t2s/modules/layer_norm.py | 22 +- paddlespeech/t2s/modules/losses.py | 434 ++++++---------- paddlespeech/t2s/modules/nets_utils.py | 121 ++--- paddlespeech/t2s/modules/pqmf.py | 64 +-- .../modules/predictor/duration_predictor.py | 87 ++-- .../t2s/modules/predictor/length_regulator.py | 24 +- .../modules/predictor/variance_predictor.py | 33 +- paddlespeech/t2s/modules/residual_block.py | 90 ++-- paddlespeech/t2s/modules/residual_stack.py | 44 +- paddlespeech/t2s/modules/style_encoder.py | 124 ++--- .../t2s/modules/tacotron2/attentions.py | 213 +++----- paddlespeech/t2s/modules/tacotron2/decoder.py | 271 ++++------ paddlespeech/t2s/modules/tacotron2/encoder.py | 75 +-- paddlespeech/t2s/modules/tade_res_block.py | 37 +- .../t2s/modules/transformer/attention.py | 141 ++--- .../t2s/modules/transformer/decoder.py | 150 ++---- .../t2s/modules/transformer/decoder_layer.py | 77 ++- .../t2s/modules/transformer/embedding.py | 83 +-- .../t2s/modules/transformer/encoder.py | 316 ++++-------- .../t2s/modules/transformer/encoder_layer.py | 52 +- .../t2s/modules/transformer/lightconv.py | 44 +- paddlespeech/t2s/modules/transformer/mask.py | 41 +- .../modules/transformer/multi_layer_conv.py | 54 +- .../transformer/positionwise_feed_forward.py | 12 +- .../t2s/modules/transformer/repeat.py | 15 +- .../t2s/modules/transformer/subsampling.py | 36 +- paddlespeech/t2s/modules/upsample.py | 141 ++--- paddlespeech/t2s/training/experiment.py | 53 +- .../t2s/training/extensions/snapshot.py | 6 +- paddlespeech/t2s/utils/error_rate.py | 109 ++-- paddlespeech/t2s/utils/h5_utils.py | 32 +- 57 files changed, 2350 insertions(+), 4150 deletions(-) diff --git a/paddlespeech/t2s/datasets/data_table.py b/paddlespeech/t2s/datasets/data_table.py index b0e4c891..c9815af2 100644 --- a/paddlespeech/t2s/datasets/data_table.py +++ b/paddlespeech/t2s/datasets/data_table.py @@ -22,26 +22,17 @@ from paddle.io import Dataset class DataTable(Dataset): """Dataset to load and convert data for general purpose. - - Parameters - ---------- - data : List[Dict[str, Any]] - Metadata, a list of meta datum, each of which is composed of - several fields - fields : List[str], optional - Fields to use, if not specified, all the fields in the data are - used, by default None - converters : Dict[str, Callable], optional - Converters used to process each field, by default None - use_cache : bool, optional - Whether to use cache, by default False - - Raises - ------ - ValueError - If there is some field that does not exist in data. - ValueError - If there is some field in converters that does not exist in fields. + Args: + data (List[Dict[str, Any]]): Metadata, a list of meta datum, each of which is composed of several fields + fields (List[str], optional): Fields to use, if not specified, all the fields in the data are used, by default None + converters (Dict[str, Callable], optional): Converters used to process each field, by default None + use_cache (bool, optional): Whether to use cache, by default False + + Raises: + ValueError: + If there is some field that does not exist in data. + ValueError: + If there is some field in converters that does not exist in fields. """ def __init__(self, @@ -95,15 +86,11 @@ class DataTable(Dataset): """Convert a meta datum to an example by applying the corresponding converters to each fields requested. 
- Parameters
- ----------
- meta_datum : Dict[str, Any]
- Meta datum
+ Args:
+ meta_datum (Dict[str, Any]): Meta datum

- Returns
- -------
- Dict[str, Any]
- Converted example
+ Returns:
+ Dict[str, Any]: Converted example
 """
 example = {}
 for field in self.fields:
@@ -118,16 +105,11 @@ class DataTable(Dataset):
 def __getitem__(self, idx: int) -> Dict[str, Any]:
 """Get an example given an index.
+ Args:
+ idx (int): Index of the example to get

- Parameters
- ----------
- idx : int
- Index of the example to get
-
- Returns
- -------
- Dict[str, Any]
- A converted example
+ Returns:
+ Dict[str, Any]: A converted example
 """
 if self.use_cache and self.caches[idx] is not None:
 return self.caches[idx]
diff --git a/paddlespeech/t2s/datasets/preprocess_utils.py b/paddlespeech/t2s/datasets/preprocess_utils.py
index 8b01f6c3..445b69bd 100644
--- a/paddlespeech/t2s/datasets/preprocess_utils.py
+++ b/paddlespeech/t2s/datasets/preprocess_utils.py
@@ -18,14 +18,10 @@ import re
 def get_phn_dur(file_name):
 '''
 read MFA duration.txt
- Parameters
- ----------
- file_name : str or Path
- path of gen_duration_from_textgrid.py's result
- Returns
- ----------
- Dict
- sentence: {'utt': ([char], [int])}
+ Args:
+ file_name (str or Path): path of gen_duration_from_textgrid.py's result
+ Returns:
+ Dict: sentence: {'utt': ([char], [int])}
 '''
 f = open(file_name, 'r')
 sentence = {}
@@ -48,10 +44,8 @@ def get_phn_dur(file_name):
 def merge_silence(sentence):
 ''' merge silences
- Parameters
- ----------
- sentence : Dict
- sentence: {'utt': (([char], [int]), str)}
+ Args:
+ sentence (Dict): sentence: {'utt': (([char], [int]), str)}
 '''
 for utt in sentence:
 cur_phn, cur_dur, speaker = sentence[utt]
@@ -81,12 +75,9 @@ def merge_silence(sentence):
 def get_input_token(sentence, output_path, dataset="baker"):
 '''
 get phone set from training data and save it
- Parameters
- ----------
- sentence : Dict
- sentence: {'utt': ([char], [int])}
- output_path : str or path
- path to save phone_id_map
+ Args:
+ sentence (Dict): sentence: {'utt': ([char], [int])}
+ output_path (str or path): path to save phone_id_map
 '''
 phn_token = set()
 for utt in sentence:
@@ -112,14 +103,10 @@ def get_phones_tones(sentence,
 dataset="baker"):
 '''
 get phone set and tone set from training data and save it
- Parameters
- ----------
- sentence : Dict
- sentence: {'utt': ([char], [int])}
- phones_output_path : str or path
- path to save phone_id_map
- tones_output_path : str or path
- path to save tone_id_map
+ Args:
+ sentence (Dict): sentence: {'utt': ([char], [int])}
+ phones_output_path (str or path): path to save phone_id_map
+ tones_output_path (str or path): path to save tone_id_map
 '''
 phn_token = set()
 tone_token = set()
@@ -162,14 +149,10 @@ def get_spk_id_map(speaker_set, output_path):
 def compare_duration_and_mel_length(sentences, utt, mel):
 '''
 check duration error, correct sentences[utt] if possible, else pop sentences[utt]
- Parameters
- ----------
- sentences : Dict
- sentences[utt] = [phones_list ,durations_list]
- utt : str
- utt_id
- mel : np.ndarry
- features (num_frames, n_mels)
+ Args:
+ sentences (Dict): sentences[utt] = [phones_list ,durations_list]
+ utt (str): utt_id
+ mel (np.ndarray): features (num_frames, n_mels)
 '''
 if utt in sentences:
diff --git a/paddlespeech/t2s/datasets/vocoder_batch_fn.py b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
index d969a1d3..08748de0 100644
--- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py
+++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
@@ -29,15 +29,11 @@ class Clip(object):
hop_size=256, aux_context_window=0, ): """Initialize customized collater for DataLoader. + Args: - Parameters - ---------- - batch_max_steps : int - The maximum length of input signal in batch. - hop_size : int - Hop size of auxiliary features. - aux_context_window : int - Context window size for auxiliary feature conv. + batch_max_steps (int): The maximum length of input signal in batch. + hop_size (int): Hop size of auxiliary features. + aux_context_window (int): Context window size for auxiliary feature conv. """ if batch_max_steps % hop_size != 0: @@ -56,18 +52,15 @@ class Clip(object): def __call__(self, batch): """Convert into batch tensors. - Parameters - ---------- - batch : list - list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C). + Args: + batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C). - Returns - ---------- - Tensor - Auxiliary feature batch (B, C, T'), where - T = (T' - 2 * aux_context_window) * hop_size. - Tensor - Target signal batch (B, 1, T). + Returns: + Tensor: + Auxiliary feature batch (B, C, T'), where + T = (T' - 2 * aux_context_window) * hop_size. + Tensor: + Target signal batch (B, 1, T). """ # check length @@ -104,11 +97,10 @@ class Clip(object): def _adjust_length(self, x, c): """Adjust the audio and feature lengths. - Note - ------- - Basically we assume that the length of x and c are adjusted - through preprocessing stage, but if we use other library processed - features, this process will be needed. + Note: + Basically we assume that the length of x and c are adjusted + through preprocessing stage, but if we use other library processed + features, this process will be needed. """ if len(x) < c.shape[0] * self.hop_size: @@ -162,22 +154,14 @@ class WaveRNNClip(Clip): # voc_pad = 2 this will pad the input so that the resnet can 'see' wider than input length # max_offsets = n_frames - 2 - (mel_win + 2 * hp.voc_pad) = n_frames - 15 """Convert into batch tensors. - - Parameters - ---------- - batch : list - list of tuple of the pair of audio and features. - Audio shape (T, ), features shape(T', C). - - Returns - ---------- - Tensor - Input signal batch (B, 1, T). - Tensor - Target signal batch (B, 1, T). - Tensor - Auxiliary feature batch (B, C, T'), where - T = (T' - 2 * aux_context_window) * hop_size. + Args: + batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C). + + Returns: + Tensor: Input signal batch (B, 1, T). + Tensor: Target signal batch (B, 1, T). + Tensor: Auxiliary feature batch (B, C, T'), + where T = (T' - 2 * aux_context_window) * hop_size. 
""" # check length diff --git a/paddlespeech/t2s/exps/transformer_tts/preprocess.py b/paddlespeech/t2s/exps/transformer_tts/preprocess.py index 7cfa91b9..9aa87e91 100644 --- a/paddlespeech/t2s/exps/transformer_tts/preprocess.py +++ b/paddlespeech/t2s/exps/transformer_tts/preprocess.py @@ -31,15 +31,12 @@ from paddlespeech.t2s.frontend import English def get_lj_sentences(file_name, frontend): - ''' - read MFA duration.txt - Parameters - ---------- - file_name : str or Path - Returns - ---------- - Dict - sentence: {'utt': ([char], [int])} + '''read MFA duration.txt + + Args: + file_name (str or Path) + Returns: + Dict: sentence: {'utt': ([char], [int])} ''' f = open(file_name, 'r') sentence = {} @@ -59,14 +56,11 @@ def get_lj_sentences(file_name, frontend): def get_input_token(sentence, output_path): - ''' - get phone set from training data and save it - Parameters - ---------- - sentence : Dict - sentence: {'utt': ([char], str)} - output_path : str or path - path to save phone_id_map + '''get phone set from training data and save it + + Args: + sentence (Dict): sentence: {'utt': ([char], str)} + output_path (str or path): path to save phone_id_map ''' phn_token = set() for utt in sentence: diff --git a/paddlespeech/t2s/frontend/arpabet.py b/paddlespeech/t2s/frontend/arpabet.py index 094a2bfa..7a81b645 100644 --- a/paddlespeech/t2s/frontend/arpabet.py +++ b/paddlespeech/t2s/frontend/arpabet.py @@ -133,16 +133,11 @@ class ARPABET(Phonetics): def phoneticize(self, sentence, add_start_end=False): """ Normalize the input text sequence and convert it into pronunciation sequence. + Args: + sentence (str): The input text sequence. - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation sequence. + Returns: + List[str]: The list of pronunciation sequence. """ phonemes = [ self._remove_vowels(item) for item in self.backend(sentence) @@ -156,16 +151,12 @@ class ARPABET(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. + + Args: + phonemes (List[str]): The list of pronunciation sequence. - Returns - ---------- - List[int] - The list of pronunciation id sequence. + Returns: + List[int]: The list of pronunciation id sequence. """ ids = [self.vocab.lookup(item) for item in phonemes] return ids @@ -173,30 +164,23 @@ class ARPABET(Phonetics): def reverse(self, ids): """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. - Parameters - ----------- - ids: List[int] - The list of pronunciation id sequence. + Args: + ids( List[int]): The list of pronunciation id sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Returns: + List[str]: + The list of pronunciation sequence. """ return [self.vocab.reverse(i) for i in ids] def __call__(self, sentence, add_start_end=False): """ Convert the input text sequence into pronunciation id sequence. - Parameters - ----------- - sentence: str - The input text sequence. + Args: + sentence (str): The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation id sequence. + Returns: + List[str]: The list of pronunciation id sequence. 
""" return self.numericalize( self.phoneticize(sentence, add_start_end=add_start_end)) @@ -229,15 +213,11 @@ class ARPABETWithStress(Phonetics): def phoneticize(self, sentence, add_start_end=False): """ Normalize the input text sequence and convert it into pronunciation sequence. - Parameters - ----------- - sentence: str - The input text sequence. + Args: + sentence (str): The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Returns: + List[str]: The list of pronunciation sequence. """ phonemes = self.backend(sentence) if add_start_end: @@ -249,47 +229,33 @@ class ARPABETWithStress(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. + + Args: + phonemes (List[str]): The list of pronunciation sequence. - Returns - ---------- - List[int] - The list of pronunciation id sequence. + Returns: + List[int]: The list of pronunciation id sequence. """ ids = [self.vocab.lookup(item) for item in phonemes] return ids def reverse(self, ids): """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. - - Parameters - ----------- - ids: List[int] - The list of pronunciation id sequence. + Args: + ids (List[int]): The list of pronunciation id sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Returns: + List[str]: The list of pronunciation sequence. """ return [self.vocab.reverse(i) for i in ids] def __call__(self, sentence, add_start_end=False): """ Convert the input text sequence into pronunciation id sequence. + Args: + sentence (str): The input text sequence. - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation id sequence. + Returns: + List[str]: The list of pronunciation id sequence. """ return self.numericalize( self.phoneticize(sentence, add_start_end=add_start_end)) diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py index a488a6fc..8e9f1173 100644 --- a/paddlespeech/t2s/frontend/phonectic.py +++ b/paddlespeech/t2s/frontend/phonectic.py @@ -65,14 +65,10 @@ class English(Phonetics): def phoneticize(self, sentence): """ Normalize the input text sequence and convert it into pronunciation sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Args: + sentence (str): The input text sequence. + Returns: + List[str]: The list of pronunciation sequence. """ start = self.vocab.start_symbol end = self.vocab.end_symbol @@ -123,14 +119,10 @@ class English(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. - Returns - ---------- - List[int] - The list of pronunciation id sequence. + Args: + phonemes (List[str]): The list of pronunciation sequence. + Returns: + List[int]: The list of pronunciation id sequence. """ ids = [ self.vocab.lookup(item) for item in phonemes @@ -140,27 +132,19 @@ class English(Phonetics): def reverse(self, ids): """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. - Parameters - ----------- - ids: List[int] - The list of pronunciation id sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. 
+ Args: + ids (List[int]): The list of pronunciation id sequence. + Returns: + List[str]: The list of pronunciation sequence. """ return [self.vocab.reverse(i) for i in ids] def __call__(self, sentence): """ Convert the input text sequence into pronunciation id sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation id sequence. + Args: + sentence(str): The input text sequence. + Returns: + List[str]: The list of pronunciation id sequence. """ return self.numericalize(self.phoneticize(sentence)) @@ -183,28 +167,21 @@ class EnglishCharacter(Phonetics): def phoneticize(self, sentence): """ Normalize the input text sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - str - A text sequence after normalize. + Args: + sentence(str): The input text sequence. + Returns: + str: A text sequence after normalize. """ words = normalize(sentence) return words def numericalize(self, sentence): """ Convert a text sequence into ids. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[int] - List of a character id sequence. + Args: + sentence (str): The input text sequence. + Returns: + List[int]: + List of a character id sequence. """ ids = [ self.vocab.lookup(item) for item in sentence @@ -214,27 +191,19 @@ class EnglishCharacter(Phonetics): def reverse(self, ids): """ Convert a character id sequence into text. - Parameters - ----------- - ids: List[int] - List of a character id sequence. - Returns - ---------- - str - The input text sequence. + Args: + ids (List[int]): List of a character id sequence. + Returns: + str: The input text sequence. """ return [self.vocab.reverse(i) for i in ids] def __call__(self, sentence): """ Normalize the input text sequence and convert it into character id sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[int] - List of a character id sequence. + Args: + sentence (str): The input text sequence. + Returns: + List[int]: List of a character id sequence. """ return self.numericalize(self.phoneticize(sentence)) @@ -264,14 +233,10 @@ class Chinese(Phonetics): def phoneticize(self, sentence): """ Normalize the input text sequence and convert it into pronunciation sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Args: + sentence(str): The input text sequence. + Returns: + List[str]: The list of pronunciation sequence. """ # simplified = self.opencc_backend.convert(sentence) simplified = sentence @@ -296,28 +261,20 @@ class Chinese(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. - Returns - ---------- - List[int] - The list of pronunciation id sequence. + Args: + phonemes(List[str]): The list of pronunciation sequence. + Returns: + List[int]: The list of pronunciation id sequence. """ ids = [self.vocab.lookup(item) for item in phonemes] return ids def __call__(self, sentence): """ Convert the input text sequence into pronunciation id sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation id sequence. + Args: + sentence (str): The input text sequence. 
+ Returns:
+ List[str]: The list of pronunciation id sequence.
 """
 return self.numericalize(self.phoneticize(sentence))
@@ -329,13 +286,9 @@ class Chinese(Phonetics):
 def reverse(self, ids):
 """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
- Parameters
- -----------
- ids: List[int]
- The list of pronunciation id sequence.
- Returns
- ----------
- List[str]
- The list of pronunciation sequence.
+ Args:
+ ids (List[int]): The list of pronunciation id sequence.
+ Returns:
+ List[str]: The list of pronunciation sequence.
 """
 return [self.vocab.reverse(i) for i in ids]
diff --git a/paddlespeech/t2s/frontend/vocab.py b/paddlespeech/t2s/frontend/vocab.py
index 9ef6b137..76bb3c7b 100644
--- a/paddlespeech/t2s/frontend/vocab.py
+++ b/paddlespeech/t2s/frontend/vocab.py
@@ -20,22 +20,12 @@ __all__ = ["Vocab"]
 class Vocab(object):
 """ Vocabulary.
- Parameters
- -----------
- symbols: Iterable[str]
- Common symbols.
-
- padding_symbol: str, optional
- Symbol for pad. Defaults to "".
-
- unk_symbol: str, optional
- Symbol for unknow. Defaults to ""
-
- start_symbol: str, optional
- Symbol for start. Defaults to ""
-
- end_symbol: str, optional
- Symbol for end. Defaults to ""
+ Args:
+ symbols (Iterable[str]): Common symbols.
+ padding_symbol (str, optional): Symbol for pad. Defaults to "".
+ unk_symbol (str, optional): Symbol for unknown. Defaults to ""
+ start_symbol (str, optional): Symbol for start. Defaults to ""
+ end_symbol (str, optional): Symbol for end. Defaults to ""
 """
 def __init__(self,
diff --git a/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
index 8801baa0..bfa7d2b1 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
@@ -44,12 +44,10 @@ RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
 def replace_time(match) -> str:
 """
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
 """
 is_range = len(match.groups()) > 5
@@ -87,12 +85,10 @@ RE_DATE = re.compile(r'(\d{4}|\d{2})年'
 def replace_date(match) -> str:
 """
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
 """
 year = match.group(1)
 month = match.group(3)
@@ -114,12 +110,10 @@ RE_DATE2 = re.compile(
 def replace_date2(match) -> str:
 """
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
 """
 year = match.group(1)
 month = match.group(3)
diff --git a/paddlespeech/t2s/frontend/zh_normalization/num.py b/paddlespeech/t2s/frontend/zh_normalization/num.py
index 1e575c08..416edfb1 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/num.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/num.py
@@ -36,12 +36,10 @@ RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
 def replace_frac(match) -> str:
 """
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
 """
 sign = match.group(1)
 nominator = match.group(2)
@@ -59,12 +57,10 @@ RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')
 def replace_percentage(match) -> str:
 """
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
 """
 sign = match.group(1)
 percent = match.group(2)
@@ -81,12 +77,10 @@ RE_INTEGER = re.compile(r'(-)'
 r'(\d+)')
 def replace_negative_num(match) -> str:
 """
- Parameters
- ----------
- 
match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ sign = match.group(1) number = match.group(2) @@ -103,12 +97,10 @@ RE_DEFAULT_NUM = re.compile(r'\d{3}\d*') def replace_default_num(match): """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ number = match.group(0) return verbalize_digit(number) @@ -124,12 +116,10 @@ RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))') def replace_positive_quantifier(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ number = match.group(1) match_2 = match.group(2) @@ -142,12 +132,10 @@ def replace_positive_quantifier(match) -> str: def replace_number(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ sign = match.group(1) number = match.group(2) @@ -169,12 +157,10 @@ RE_RANGE = re.compile( def replace_range(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ first, second = match.group(1), match.group(8) first = RE_NUMBER.sub(replace_number, first) diff --git a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py index b7b69b41..06b5d41b 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py +++ b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py @@ -45,23 +45,19 @@ def phone2str(phone_string: str, mobile=True) -> str: def replace_phone(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ return phone2str(match.group(0), mobile=False) def replace_mobile(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ return phone2str(match.group(0)) diff --git a/paddlespeech/t2s/frontend/zh_normalization/quantifier.py b/paddlespeech/t2s/frontend/zh_normalization/quantifier.py index d3805a32..268d7229 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/quantifier.py +++ b/paddlespeech/t2s/frontend/zh_normalization/quantifier.py @@ -22,12 +22,10 @@ RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') def replace_temperature(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ sign = match.group(1) temperature = match.group(2) diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py index 9794a700..f9d1b8cb 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py +++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -55,14 +55,10 @@ class TextNormalizer(): def _split(self, text: str, lang="zh") -> List[str]: """Split long text into sentences with sentence-splitting punctuations. - Parameters - ---------- - text : str - The input text. - Returns - ------- - List[str] - Sentences. + Args: + text (str): The input text. + Returns: + List[str]: Sentences. 
""" # Only for pure Chinese here if lang == "zh": diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 3e952c20..73f5498e 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -38,17 +38,21 @@ from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder class FastSpeech2(nn.Layer): """FastSpeech2 module. - + This is a module of FastSpeech2 described in `FastSpeech 2: Fast and High-Quality End-to-End Text to Speech`_. Instead of quantized pitch and energy, we use token-averaged value introduced in `FastPitch: Parallel Text-to-speech with Pitch Prediction`_. - + .. _`FastSpeech 2: Fast and High-Quality End-to-End Text to Speech`: https://arxiv.org/abs/2006.04558 .. _`FastPitch: Parallel Text-to-speech with Pitch Prediction`: https://arxiv.org/abs/2006.06873 + Args: + + Returns: + """ def __init__( @@ -127,136 +131,72 @@ class FastSpeech2(nn.Layer): init_enc_alpha: float=1.0, init_dec_alpha: float=1.0, ): """Initialize FastSpeech2 module. - Parameters - ---------- - idim : int - Dimension of the inputs. - odim : int - Dimension of the outputs. - adim : int - Attention dimension. - aheads : int - Number of attention heads. - elayers : int - Number of encoder layers. - eunits : int - Number of encoder hidden units. - dlayers : int - Number of decoder layers. - dunits : int - Number of decoder hidden units. - postnet_layers : int - Number of postnet layers. - postnet_chans : int - Number of postnet channels. - postnet_filts : int - Kernel size of postnet. - postnet_dropout_rate : float - Dropout rate in postnet. - use_scaled_pos_enc : bool - Whether to use trainable scaled pos encoding. - use_batch_norm : bool - Whether to use batch normalization in encoder prenet. - encoder_normalize_before : bool - Whether to apply layernorm layer before encoder block. - decoder_normalize_before : bool - Whether to apply layernorm layer before - decoder block. - encoder_concat_after : bool - Whether to concatenate attention layer's input and output in encoder. - decoder_concat_after : bool - Whether to concatenate attention layer's input and output in decoder. - reduction_factor : int - Reduction factor. - encoder_type : str - Encoder type ("transformer" or "conformer"). - decoder_type : str - Decoder type ("transformer" or "conformer"). - transformer_enc_dropout_rate : float - Dropout rate in encoder except attention and positional encoding. - transformer_enc_positional_dropout_rate (float): Dropout rate after encoder - positional encoding. - transformer_enc_attn_dropout_rate (float): Dropout rate in encoder - self-attention module. - transformer_dec_dropout_rate (float): Dropout rate in decoder except - attention & positional encoding. - transformer_dec_positional_dropout_rate (float): Dropout rate after decoder - positional encoding. - transformer_dec_attn_dropout_rate (float): Dropout rate in decoder - self-attention module. - conformer_pos_enc_layer_type : str - Pos encoding layer type in conformer. - conformer_self_attn_layer_type : str - Self-attention layer type in conformer - conformer_activation_type : str - Activation function type in conformer. - use_macaron_style_in_conformer : bool - Whether to use macaron style FFN. - use_cnn_in_conformer : bool - Whether to use CNN in conformer. - zero_triu : bool - Whether to use zero triu in relative self-attention module. - conformer_enc_kernel_size : int - Kernel size of encoder conformer. 
- conformer_dec_kernel_size : int - Kernel size of decoder conformer. - duration_predictor_layers : int - Number of duration predictor layers. - duration_predictor_chans : int - Number of duration predictor channels. - duration_predictor_kernel_size : int - Kernel size of duration predictor. - duration_predictor_dropout_rate : float - Dropout rate in duration predictor. - pitch_predictor_layers : int - Number of pitch predictor layers. - pitch_predictor_chans : int - Number of pitch predictor channels. - pitch_predictor_kernel_size : int - Kernel size of pitch predictor. - pitch_predictor_dropout_rate : float - Dropout rate in pitch predictor. - pitch_embed_kernel_size : float - Kernel size of pitch embedding. - pitch_embed_dropout_rate : float - Dropout rate for pitch embedding. - stop_gradient_from_pitch_predictor : bool - Whether to stop gradient from pitch predictor to encoder. - energy_predictor_layers : int - Number of energy predictor layers. - energy_predictor_chans : int - Number of energy predictor channels. - energy_predictor_kernel_size : int - Kernel size of energy predictor. - energy_predictor_dropout_rate : float - Dropout rate in energy predictor. - energy_embed_kernel_size : float - Kernel size of energy embedding. - energy_embed_dropout_rate : float - Dropout rate for energy embedding. - stop_gradient_from_energy_predictor : bool - Whether to stop gradient from energy predictor to encoder. - spk_num : Optional[int] - Number of speakers. If not None, assume that the spk_embed_dim is not None, - spk_ids will be provided as the input and use spk_embedding_table. - spk_embed_dim : Optional[int] - Speaker embedding dimension. If not None, - assume that spk_emb will be provided as the input or spk_num is not None. - spk_embed_integration_type : str - How to integrate speaker embedding. - tone_num : Optional[int] - Number of tones. If not None, assume that the - tone_ids will be provided as the input and use tone_embedding_table. - tone_embed_dim : Optional[int] - Tone embedding dimension. If not None, assume that tone_num is not None. - tone_embed_integration_type : str - How to integrate tone embedding. - init_type : str - How to initialize transformer parameters. - init_enc_alpha : float - Initial value of alpha in scaled pos encoding of the encoder. - init_dec_alpha : float - Initial value of alpha in scaled pos encoding of the decoder. + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + adim (int): Attention dimension. + aheads (int): Number of attention heads. + elayers (int): Number of encoder layers. + eunits (int): Number of encoder hidden units. + dlayers (int): Number of decoder layers. + dunits (int): Number of decoder hidden units. + postnet_layers (int): Number of postnet layers. + postnet_chans (int): Number of postnet channels. + postnet_filts (int): Kernel size of postnet. + postnet_dropout_rate (float): Dropout rate in postnet. + use_scaled_pos_enc (bool): Whether to use trainable scaled pos encoding. + use_batch_norm (bool): Whether to use batch normalization in encoder prenet. + encoder_normalize_before (bool): Whether to apply layernorm layer before encoder block. + decoder_normalize_before (bool): Whether to apply layernorm layer before decoder block. + encoder_concat_after (bool): Whether to concatenate attention layer's input and output in encoder. + decoder_concat_after (bool): Whether to concatenate attention layer's input and output in decoder. + reduction_factor (int): Reduction factor. 
+ encoder_type (str): Encoder type ("transformer" or "conformer"). + decoder_type (str): Decoder type ("transformer" or "conformer"). + transformer_enc_dropout_rate (float): Dropout rate in encoder except attention and positional encoding. + transformer_enc_positional_dropout_rate (float): Dropout rate after encoder positional encoding. + transformer_enc_attn_dropout_rate (float): Dropout rate in encoder self-attention module. + transformer_dec_dropout_rate (float): Dropout rate in decoder except attention & positional encoding. + transformer_dec_positional_dropout_rate (float): Dropout rate after decoder positional encoding. + transformer_dec_attn_dropout_rate (float): Dropout rate in decoder self-attention module. + conformer_pos_enc_layer_type (str): Pos encoding layer type in conformer. + conformer_self_attn_layer_type (str): Self-attention layer type in conformer + conformer_activation_type (str): Activation function type in conformer. + use_macaron_style_in_conformer (bool): Whether to use macaron style FFN. + use_cnn_in_conformer (bool): Whether to use CNN in conformer. + zero_triu (bool): Whether to use zero triu in relative self-attention module. + conformer_enc_kernel_size (int): Kernel size of encoder conformer. + conformer_dec_kernel_size (int): Kernel size of decoder conformer. + duration_predictor_layers (int): Number of duration predictor layers. + duration_predictor_chans (int): Number of duration predictor channels. + duration_predictor_kernel_size (int): Kernel size of duration predictor. + duration_predictor_dropout_rate (float): Dropout rate in duration predictor. + pitch_predictor_layers (int): Number of pitch predictor layers. + pitch_predictor_chans (int): Number of pitch predictor channels. + pitch_predictor_kernel_size (int): Kernel size of pitch predictor. + pitch_predictor_dropout_rate (float): Dropout rate in pitch predictor. + pitch_embed_kernel_size (float): Kernel size of pitch embedding. + pitch_embed_dropout_rate (float): Dropout rate for pitch embedding. + stop_gradient_from_pitch_predictor (bool): Whether to stop gradient from pitch predictor to encoder. + energy_predictor_layers (int): Number of energy predictor layers. + energy_predictor_chans (int): Number of energy predictor channels. + energy_predictor_kernel_size (int): Kernel size of energy predictor. + energy_predictor_dropout_rate (float): Dropout rate in energy predictor. + energy_embed_kernel_size (float): Kernel size of energy embedding. + energy_embed_dropout_rate (float): Dropout rate for energy embedding. + stop_gradient_from_energy_predictor(bool): Whether to stop gradient from energy predictor to encoder. + spk_num (Optional[int]): Number of speakers. If not None, assume that the spk_embed_dim is not None, + spk_ids will be provided as the input and use spk_embedding_table. + spk_embed_dim (Optional[int]): Speaker embedding dimension. If not None, + assume that spk_emb will be provided as the input or spk_num is not None. + spk_embed_integration_type (str): How to integrate speaker embedding. + tone_num (Optional[int]): Number of tones. If not None, assume that the + tone_ids will be provided as the input and use tone_embedding_table. + tone_embed_dim (Optional[int]): Tone embedding dimension. If not None, assume that tone_num is not None. + tone_embed_integration_type (str): How to integrate tone embedding. + init_type (str): How to initialize transformer parameters. + init_enc_alpha (float): Initial value of alpha in scaled pos encoding of the encoder. 
+            init_dec_alpha (float): Initial value of alpha in scaled pos encoding of the decoder.
         """
         assert check_argument_types()
@@ -489,45 +429,21 @@ class FastSpeech2(nn.Layer):
     ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
         """Calculate forward propagation.

-        Parameters
-        ----------
-        text : Tensor(int64)
-            Batch of padded token ids (B, Tmax).
-        text_lengths : Tensor(int64)
-            Batch of lengths of each input (B,).
-        speech : Tensor
-            Batch of padded target features (B, Lmax, odim).
-        speech_lengths : Tensor(int64)
-            Batch of the lengths of each target (B,).
-        durations : Tensor(int64)
-            Batch of padded durations (B, Tmax).
-        pitch : Tensor
-            Batch of padded token-averaged pitch (B, Tmax, 1).
-        energy : Tensor
-            Batch of padded token-averaged energy (B, Tmax, 1).
-        tone_id : Tensor, optional(int64)
-            Batch of padded tone ids (B, Tmax).
-        spk_emb : Tensor, optional
-            Batch of speaker embeddings (B, spk_embed_dim).
-        spk_id : Tnesor, optional(int64)
-            Batch of speaker ids (B,)
-
-        Returns
-        ----------
-        Tensor
-            mel outs before postnet
-        Tensor
-            mel outs after postnet
-        Tensor
-            duration predictor's output
-        Tensor
-            pitch predictor's output
-        Tensor
-            energy predictor's output
-        Tensor
-            speech
-        Tensor
-            speech_lengths, modified if reduction_factor > 1
+        Args:
+            text(Tensor(int64)): Batch of padded token ids (B, Tmax).
+            text_lengths(Tensor(int64)): Batch of lengths of each input (B,).
+            speech(Tensor): Batch of padded target features (B, Lmax, odim).
+            speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,).
+            durations(Tensor(int64)): Batch of padded durations (B, Tmax).
+            pitch(Tensor): Batch of padded token-averaged pitch (B, Tmax, 1).
+            energy(Tensor): Batch of padded token-averaged energy (B, Tmax, 1).
+            tone_id(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax).
+            spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim).
+            spk_id(Tensor, optional(int64)): Batch of speaker ids (B,).
+
+        Returns:
+            Tensor: mel outs before postnet.
+            Tensor: mel outs after postnet.
+            Tensor: duration predictor's output.
+            Tensor: pitch predictor's output.
+            Tensor: energy predictor's output.
+            Tensor: speech.
+            Tensor: speech_lengths, modified if reduction_factor > 1.
         """
         # input of embedding must be int64
@@ -680,34 +596,22 @@ class FastSpeech2(nn.Layer):
     ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
         """Generate the sequence of features given the sequences of characters.

-        Parameters
-        ----------
-        text : Tensor(int64)
-            Input sequence of characters (T,).
-        speech : Tensor, optional
-            Feature sequence to extract style (N, idim).
-        durations : Tensor, optional (int64)
-            Groundtruth of duration (T,).
-        pitch : Tensor, optional
-            Groundtruth of token-averaged pitch (T, 1).
-        energy : Tensor, optional
-            Groundtruth of token-averaged energy (T, 1).
-        alpha : float, optional
-            Alpha to control the speed.
-        use_teacher_forcing : bool, optional
-            Whether to use teacher forcing.
-            If true, groundtruth of duration, pitch and energy will be used.
-        spk_emb : Tensor, optional
-            peaker embedding vector (spk_embed_dim,).
-        spk_id : Tensor, optional(int64)
-            Batch of padded spk ids (1,).
-        tone_id : Tensor, optional(int64)
-            Batch of padded tone ids (T,).
-
-        Returns
-        ----------
-        Tensor
-            Output sequence of features (L, odim).
+        Args:
+            text(Tensor(int64)): Input sequence of characters (T,).
+            speech(Tensor, optional): Feature sequence to extract style (N, idim).
+            durations(Tensor, optional (int64)): Groundtruth of duration (T,).
+            pitch(Tensor, optional): Groundtruth of token-averaged pitch (T, 1).
+            energy(Tensor, optional): Groundtruth of token-averaged energy (T, 1).
+            alpha(float, optional): Alpha to control the speed.
+            use_teacher_forcing(bool, optional): Whether to use teacher forcing.
+                If true, groundtruth of duration, pitch and energy will be used.
+            spk_emb(Tensor, optional): Speaker embedding vector (spk_embed_dim,). (Default value = None)
+            spk_id(Tensor, optional(int64)): Batch of padded spk ids (1,). (Default value = None)
+            tone_id(Tensor, optional(int64)): Batch of padded tone ids (T,). (Default value = None)
+
+        Returns:
+            Tensor: Output sequence of features (L, odim).
+
         """
         # input of embedding must be int64
         x = paddle.cast(text, 'int64')
@@ -761,17 +665,13 @@ class FastSpeech2(nn.Layer):
     def _integrate_with_spk_embed(self, hs, spk_emb):
         """Integrate speaker embedding with hidden states.

-        Parameters
-        ----------
-        hs : Tensor
-            Batch of hidden state sequences (B, Tmax, adim).
-        spk_emb : Tensor
-            Batch of speaker embeddings (B, spk_embed_dim).
-
-        Returns
-        ----------
-        Tensor
-            Batch of integrated hidden state sequences (B, Tmax, adim)
+        Args:
+            hs(Tensor): Batch of hidden state sequences (B, Tmax, adim).
+            spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim).
+
+        Returns:
+            Tensor: Batch of integrated hidden state sequences (B, Tmax, adim).
+
         """
         if self.spk_embed_integration_type == "add":
             # apply projection and then add to hidden states
@@ -790,17 +690,13 @@ class FastSpeech2(nn.Layer):
     def _integrate_with_tone_embed(self, hs, tone_embs):
         """Integrate speaker embedding with hidden states.

-        Parameters
-        ----------
-        hs : Tensor
-            Batch of hidden state sequences (B, Tmax, adim).
-        tone_embs : Tensor
-            Batch of speaker embeddings (B, Tmax, tone_embed_dim).
-
-        Returns
-        ----------
-        Tensor
-            Batch of integrated hidden state sequences (B, Tmax, adim)
+        Args:
+            hs(Tensor): Batch of hidden state sequences (B, Tmax, adim).
+            tone_embs(Tensor): Batch of tone embeddings (B, Tmax, tone_embed_dim).
+
+        Returns:
+            Tensor: Batch of integrated hidden state sequences (B, Tmax, adim).
+
         """
         if self.tone_embed_integration_type == "add":
             # apply projection and then add to hidden states
@@ -819,24 +715,17 @@ class FastSpeech2(nn.Layer):
     def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
         """Make masks for self-attention.

-        Parameters
-        ----------
-        ilens : Tensor
-            Batch of lengths (B,).
+        Args:
+            ilens(Tensor): Batch of lengths (B,).

-        Returns
-        -------
-        Tensor
-            Mask tensor for self-attention.
-            dtype=paddle.bool
-
-        Examples
-        -------
-        >>> ilens = [5, 3]
-        >>> self._source_mask(ilens)
-        tensor([[[1, 1, 1, 1, 1],
-                 [1, 1, 1, 0, 0]]]) bool
+        Returns:
+            Tensor: Mask tensor for self-attention. dtype=paddle.bool
+        Examples:
+            >>> ilens = [5, 3]
+            >>> self._source_mask(ilens)
+            tensor([[[1, 1, 1, 1, 1],
+                    [1, 1, 1, 0, 0]]]) bool
         """
         x_masks = make_non_pad_mask(ilens)
         return x_masks.unsqueeze(-2)
@@ -910,34 +799,26 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
             spk_emb=None,
             spk_id=None):
         """
-        Parameters
-        ----------
-        text : Tensor(int64)
-            Input sequence of characters (T,).
-        speech : Tensor, optional
-            Feature sequence to extract style (N, idim).
-        durations : paddle.Tensor/np.ndarray, optional (int64)
-            Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
-        durations_scale: int/float, optional
-        durations_bias: int/float, optional
-        pitch : paddle.Tensor/np.ndarray, optional
-            Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
-        pitch_scale: int/float, optional
-            In denormed HZ domain.
-        pitch_bias: int/float, optional
-            In denormed HZ domain.
-        energy : paddle.Tensor/np.ndarray, optional
-            Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
-        energy_scale: int/float, optional
-            In denormed domain.
-        energy_bias: int/float, optional
-            In denormed domain.
-        robot : bool, optional
-            Weather output robot style
-        Returns
-        ----------
-        Tensor
-            Output sequence of features (L, odim).
+
+        Args:
+            text(Tensor(int64)): Input sequence of characters (T,).
+            speech(Tensor, optional): Feature sequence to extract style (N, idim).
+            durations(paddle.Tensor/np.ndarray, optional (int64)): Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
+            durations_scale(int/float, optional): Scale factor for durations.
+            durations_bias(int/float, optional): Bias added to durations.
+            pitch(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
+            pitch_scale(int/float, optional): In denormed HZ domain.
+            pitch_bias(int/float, optional): In denormed HZ domain.
+            energy(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
+            energy_scale(int/float, optional): In denormed domain.
+            energy_bias(int/float, optional): In denormed domain.
+            robot(bool, optional): Whether to output robot style. (Default value = False)
+            spk_emb(Tensor, optional): Speaker embedding vector (spk_embed_dim,). (Default value = None)
+            spk_id(Tensor, optional(int64)): Batch of padded spk ids (1,). (Default value = None)
+
+        Returns:
+            Tensor: Output sequence of logmel features (L, odim).
+
         """
         normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
             text,
@@ -1011,13 +892,9 @@ class FastSpeech2Loss(nn.Layer):
    def __init__(self, use_masking: bool=True, use_weighted_masking: bool=False):
         """Initialize feed-forward Transformer loss module.
-
-        Parameters
-        ----------
-        use_masking : bool
-            Whether to apply masking for padded part in loss calculation.
-        use_weighted_masking : bool
-            Whether to weighted masking in loss calculation.
+        Args:
+            use_masking (bool): Whether to apply masking for padded part in loss calculation.
+            use_weighted_masking (bool): Whether to apply weighted masking in loss calculation.
         """
         assert check_argument_types()
         super().__init__()
@@ -1048,42 +925,22 @@ class FastSpeech2Loss(nn.Layer):
     ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
         """Calculate forward propagation.

-        Parameters
-        ----------
-        after_outs : Tensor
-            Batch of outputs after postnets (B, Lmax, odim).
-        before_outs : Tensor
-            Batch of outputs before postnets (B, Lmax, odim).
-        d_outs : Tensor
-            Batch of outputs of duration predictor (B, Tmax).
-        p_outs : Tensor
-            Batch of outputs of pitch predictor (B, Tmax, 1).
-        e_outs : Tensor
-            Batch of outputs of energy predictor (B, Tmax, 1).
-        ys : Tensor
-            Batch of target features (B, Lmax, odim).
-        ds : Tensor
-            Batch of durations (B, Tmax).
-        ps : Tensor
-            Batch of target token-averaged pitch (B, Tmax, 1).
-        es : Tensor
-            Batch of target token-averaged energy (B, Tmax, 1).
-        ilens : Tensor
-            Batch of the lengths of each input (B,).
-        olens : Tensor
-            Batch of the lengths of each target (B,).
-
-        Returns
-        ----------
-        Tensor
-            L1 loss value.
-        Tensor
-            Duration predictor loss value.
-        Tensor
-            Pitch predictor loss value.
-        Tensor
-            Energy predictor loss value.
-
+        Args:
+            after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim).
+            before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim).
+            d_outs(Tensor): Batch of outputs of duration predictor (B, Tmax).
+            p_outs(Tensor): Batch of outputs of pitch predictor (B, Tmax, 1).
+            e_outs(Tensor): Batch of outputs of energy predictor (B, Tmax, 1).
+            ys(Tensor): Batch of target features (B, Lmax, odim).
+            ds(Tensor): Batch of durations (B, Tmax).
+            ps(Tensor): Batch of target token-averaged pitch (B, Tmax, 1).
+            es(Tensor): Batch of target token-averaged energy (B, Tmax, 1).
+            ilens(Tensor): Batch of the lengths of each input (B,).
+            olens(Tensor): Batch of the lengths of each target (B,).
+
+        Returns:
+            Tensor: L1 loss value.
+            Tensor: Duration predictor loss value.
+            Tensor: Pitch predictor loss value.
+            Tensor: Energy predictor loss value.
+
         """
         # apply mask to remove padded part
         if self.use_masking:
diff --git a/paddlespeech/t2s/models/hifigan/hifigan.py b/paddlespeech/t2s/models/hifigan/hifigan.py
index 82dd66c1..116376ec 100644
--- a/paddlespeech/t2s/models/hifigan/hifigan.py
+++ b/paddlespeech/t2s/models/hifigan/hifigan.py
@@ -37,35 +37,21 @@ class HiFiGANGenerator(nn.Layer):
             use_weight_norm: bool=True,
             init_type: str="xavier_uniform", ):
         """Initialize HiFiGANGenerator module.
-        Parameters
-        ----------
-        in_channels : int
-            Number of input channels.
-        out_channels : int
-            Number of output channels.
-        channels : int
-            Number of hidden representation channels.
-        kernel_size : int
-            Kernel size of initial and final conv layer.
-        upsample_scales : list
-            List of upsampling scales.
-        upsample_kernel_sizes : list
-            List of kernel sizes for upsampling layers.
-        resblock_kernel_sizes : list
-            List of kernel sizes for residual blocks.
-        resblock_dilations : list
-            List of dilation list for residual blocks.
-        use_additional_convs : bool
-            Whether to use additional conv layers in residual blocks.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        nonlinear_activation : str
-            Activation function module name.
-        nonlinear_activation_params : dict
-            Hyperparameters for activation function.
-        use_weight_norm : bool
-            Whether to use weight norm.
-            If set to true, it will be applied to all of the conv layers.
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            channels (int): Number of hidden representation channels.
+            kernel_size (int): Kernel size of initial and final conv layer.
+            upsample_scales (list): List of upsampling scales.
+            upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
+            resblock_kernel_sizes (list): List of kernel sizes for residual blocks.
+            resblock_dilations (list): List of dilation list for residual blocks.
+            use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+            use_weight_norm (bool): Whether to use weight norm.
+                If set to true, it will be applied to all of the conv layers.
         """
         super().__init__()
@@ -134,14 +120,11 @@ class HiFiGANGenerator(nn.Layer):
     def forward(self, c):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (B, in_channels, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, out_channels, T).
+
+        Args:
+            c (Tensor): Input tensor (B, in_channels, T).
+        Returns:
+            Tensor: Output tensor (B, out_channels, T).
         """
         c = self.input_conv(c)
         for i in range(self.num_upsamples):
@@ -196,15 +179,12 @@ class HiFiGANGenerator(nn.Layer):
     def inference(self, c):
         """Perform inference.
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (T, in_channels).
-            normalize_before (bool): Whether to perform normalization.
-        Returns
-        ----------
-        Tensor
-            Output tensor (T ** prod(upsample_scales), out_channels).
+        Args:
+            c (Tensor): Input tensor (T, in_channels).
+        Returns:
+            Tensor:
+                Output tensor (T ** prod(upsample_scales), out_channels).
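+
+        Examples:
+            A minimal usage sketch, not from the original code; the shapes
+            below assume a generator built with the defaults (in_channels=80,
+            out_channels=1), so adjust them to your own config:
+
+            >>> import paddle
+            >>> model = HiFiGANGenerator()
+            >>> c = paddle.randn([100, 80])  # hypothetical (T, in_channels) mel input
+            >>> wav = model.inference(c)  # (T * prod(upsample_scales), out_channels)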
""" c = self.forward(c.transpose([1, 0]).unsqueeze(0)) return c.squeeze(0).transpose([1, 0]) @@ -229,36 +209,23 @@ class HiFiGANPeriodDiscriminator(nn.Layer): use_spectral_norm: bool=False, init_type: str="xavier_uniform", ): """Initialize HiFiGANPeriodDiscriminator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - period : int - Period. - kernel_sizes : list - Kernel sizes of initial conv layers and the final conv layer. - channels : int - Number of initial channels. - downsample_scales : list - List of downsampling scales. - max_downsample_channels : int - Number of maximum downsampling channels. - use_additional_convs : bool - Whether to use additional conv layers in residual blocks. - bias : bool - Whether to add bias parameter in convolution layers. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - use_weight_norm : bool - Whether to use weight norm. - If set to true, it will be applied to all of the conv layers. - use_spectral_norm : bool - Whether to use spectral norm. - If set to true, it will be applied to all of the conv layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + period (int): Period. + kernel_sizes (list): Kernel sizes of initial conv layers and the final conv layer. + channels (int): Number of initial channels. + downsample_scales (list): List of downsampling scales. + max_downsample_channels (int): Number of maximum downsampling channels. + use_additional_convs (bool): Whether to use additional conv layers in residual blocks. + bias (bool): Whether to add bias parameter in convolution layers. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_spectral_norm (bool): Whether to use spectral norm. + If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -307,14 +274,11 @@ class HiFiGANPeriodDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - c : Tensor - Input tensor (B, in_channels, T). - Returns - ---------- - list - List of each layer's tensors. + + Args: + c (Tensor): Input tensor (B, in_channels, T). + Returns: + list: List of each layer's tensors. """ # transform 1d to 2d -> (B, C, T/P, P) b, c, t = paddle.shape(x) @@ -379,13 +343,11 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer): }, init_type: str="xavier_uniform", ): """Initialize HiFiGANMultiPeriodDiscriminator module. - Parameters - ---------- - periods : list - List of periods. - discriminator_params : dict - Parameters for hifi-gan period discriminator module. - The period parameter will be overwritten. + + Args: + periods (list): List of periods. + discriminator_params (dict): Parameters for hifi-gan period discriminator module. + The period parameter will be overwritten. """ super().__init__() # initialize parameters @@ -399,14 +361,11 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of list of each discriminator outputs, which consists of each layer output tensors. 
+ + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of list of each discriminator outputs, which consists of each layer output tensors. """ outs = [] for f in self.discriminators: @@ -434,33 +393,22 @@ class HiFiGANScaleDiscriminator(nn.Layer): use_spectral_norm: bool=False, init_type: str="xavier_uniform", ): """Initilize HiFiGAN scale discriminator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - kernel_sizes : list - List of four kernel sizes. The first will be used for the first conv layer, - and the second is for downsampling part, and the remaining two are for output layers. - channels : int - Initial number of channels for conv layer. - max_downsample_channels : int - Maximum number of channels for downsampling layers. - bias : bool - Whether to add bias parameter in convolution layers. - downsample_scales : list - List of downsampling scales. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - use_weight_norm : bool - Whether to use weight norm. - If set to true, it will be applied to all of the conv layers. - use_spectral_norm : bool - Whether to use spectral norm. - If set to true, it will be applied to all of the conv layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_sizes (list): List of four kernel sizes. The first will be used for the first conv layer, + and the second is for downsampling part, and the remaining two are for output layers. + channels (int): Initial number of channels for conv layer. + max_downsample_channels (int): Maximum number of channels for downsampling layers. + bias (bool): Whether to add bias parameter in convolution layers. + downsample_scales (list): List of downsampling scales. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_spectral_norm (bool): Whether to use spectral norm. + If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -546,14 +494,11 @@ class HiFiGANScaleDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of output tensors of each layer. + + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of output tensors of each layer. """ outs = [] for f in self.layers: @@ -613,20 +558,14 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer): follow_official_norm: bool=False, init_type: str="xavier_uniform", ): """Initilize HiFiGAN multi-scale discriminator module. - Parameters - ---------- - scales : int - Number of multi-scales. - downsample_pooling : str - Pooling module name for downsampling of the inputs. - downsample_pooling_params : dict - Parameters for the above pooling module. - discriminator_params : dict - Parameters for hifi-gan scale discriminator module. - follow_official_norm : bool - Whether to follow the norm setting of the official - implementaion. The first discriminator uses spectral norm and the other - discriminators use weight norm. + + Args: + scales (int): Number of multi-scales. 
+            downsample_pooling (str): Pooling module name for downsampling of the inputs.
+            downsample_pooling_params (dict): Parameters for the above pooling module.
+            discriminator_params (dict): Parameters for hifi-gan scale discriminator module.
+            follow_official_norm (bool): Whether to follow the norm setting of the official
+                implementation. The first discriminator uses spectral norm and the other discriminators use weight norm.
         """
         super().__init__()
@@ -651,14 +590,11 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer):
     def forward(self, x):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input noise signal (B, 1, T).
-        Returns
-        ----------
-        List
-            List of list of each discriminator outputs, which consists of each layer output tensors.
+
+        Args:
+            x (Tensor): Input noise signal (B, 1, T).
+        Returns:
+            List: List of list of each discriminator outputs, which consists of each layer output tensors.
         """
         outs = []
         for f in self.discriminators:
@@ -715,24 +651,17 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):
             },
             init_type: str="xavier_uniform", ):
         """Initilize HiFiGAN multi-scale + multi-period discriminator module.
-        Parameters
-        ----------
-        scales : int
-            Number of multi-scales.
-        scale_downsample_pooling : str
-            Pooling module name for downsampling of the inputs.
-        scale_downsample_pooling_params : dict
-            Parameters for the above pooling module.
-        scale_discriminator_params : dict
-            Parameters for hifi-gan scale discriminator module.
-        follow_official_norm : bool): Whether to follow the norm setting of the official
-            implementaion. The first discriminator uses spectral norm and the other
-            discriminators use weight norm.
-        periods : list
-            List of periods.
-        period_discriminator_params : dict
-            Parameters for hifi-gan period discriminator module.
-            The period parameter will be overwritten.
+
+        Args:
+            scales (int): Number of multi-scales.
+            scale_downsample_pooling (str): Pooling module name for downsampling of the inputs.
+            scale_downsample_pooling_params (dict): Parameters for the above pooling module.
+            scale_discriminator_params (dict): Parameters for hifi-gan scale discriminator module.
+            follow_official_norm (bool): Whether to follow the norm setting of the official implementation.
+                The first discriminator uses spectral norm and the other discriminators use weight norm.
+            periods (list): List of periods.
+            period_discriminator_params (dict): Parameters for hifi-gan period discriminator module.
+                The period parameter will be overwritten.
         """
         super().__init__()
@@ -751,16 +680,14 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):
     def forward(self, x):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input noise signal (B, 1, T).
-        Returns
-        ----------
-        List:
-            List of list of each discriminator outputs,
-            which consists of each layer output tensors.
-            Multi scale and multi period ones are concatenated.
+
+        Args:
+            x (Tensor): Input noise signal (B, 1, T).
+        Returns:
+            List:
+                List of list of each discriminator outputs,
+                which consists of each layer output tensors.
+                Multi scale and multi period ones are concatenated.
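+
+        Examples:
+            A minimal usage sketch, not from the original code (the waveform
+            shape below is an assumption):
+
+            >>> import paddle
+            >>> model = HiFiGANMultiScaleMultiPeriodDiscriminator()
+            >>> x = paddle.randn([1, 1, 6000])  # hypothetical (B, 1, T) signal
+            >>> outs = model(x)  # one list of per-layer tensors per sub-discriminator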
""" msd_outs = self.msd(x) mpd_outs = self.mpd(x) diff --git a/paddlespeech/t2s/models/melgan/melgan.py b/paddlespeech/t2s/models/melgan/melgan.py index 3e90b691..6a139659 100644 --- a/paddlespeech/t2s/models/melgan/melgan.py +++ b/paddlespeech/t2s/models/melgan/melgan.py @@ -51,41 +51,26 @@ class MelGANGenerator(nn.Layer): use_causal_conv: bool=False, init_type: str="xavier_uniform", ): """Initialize MelGANGenerator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels, - the number of sub-band is out_channels in multi-band melgan. - kernel_size : int - Kernel size of initial and final conv layer. - channels : int - Initial number of channels for conv layer. - bias : bool - Whether to add bias parameter in convolution layers. - upsample_scales : List[int] - List of upsampling scales. - stack_kernel_size : int - Kernel size of dilated conv layers in residual stack. - stacks : int - Number of stacks in a single residual stack. - nonlinear_activation : Optional[str], optional - Non linear activation in upsample network, by default None - nonlinear_activation_params : Dict[str, Any], optional - Parameters passed to the linear activation in the upsample network, - by default {} - pad : str - Padding function module name before dilated convolution layer. - pad_params : dict - Hyperparameters for padding function. - use_final_nonlinear_activation : nn.Layer - Activation function for the final layer. - use_weight_norm : bool - Whether to use weight norm. - If set to true, it will be applied to all of the conv layers. - use_causal_conv : bool - Whether to use causal convolution. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels, + the number of sub-band is out_channels in multi-band melgan. + kernel_size (int): Kernel size of initial and final conv layer. + channels (int): Initial number of channels for conv layer. + bias (bool): Whether to add bias parameter in convolution layers. + upsample_scales (List[int]): List of upsampling scales. + stack_kernel_size (int): Kernel size of dilated conv layers in residual stack. + stacks (int): Number of stacks in a single residual stack. + nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None + nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network, + by default {} + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. + use_final_nonlinear_activation (nn.Layer): Activation function for the final layer. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_causal_conv (bool): Whether to use causal convolution. """ super().__init__() @@ -207,14 +192,11 @@ class MelGANGenerator(nn.Layer): def forward(self, c): """Calculate forward propagation. - Parameters - ---------- - c : Tensor - Input tensor (B, in_channels, T). - Returns - ---------- - Tensor - Output tensor (B, out_channels, T ** prod(upsample_scales)). + + Args: + c (Tensor): Input tensor (B, in_channels, T). + Returns: + Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)). """ out = self.melgan(c) return out @@ -260,14 +242,11 @@ class MelGANGenerator(nn.Layer): def inference(self, c): """Perform inference. 
- Parameters - ---------- - c : Union[Tensor, ndarray] - Input tensor (T, in_channels). - Returns - ---------- - Tensor - Output tensor (out_channels*T ** prod(upsample_scales), 1). + + Args: + c (Union[Tensor, ndarray]): Input tensor (T, in_channels). + Returns: + Tensor: Output tensor (out_channels*T ** prod(upsample_scales), 1). """ # pseudo batch c = c.transpose([1, 0]).unsqueeze(0) @@ -298,33 +277,22 @@ class MelGANDiscriminator(nn.Layer): pad_params: Dict[str, Any]={"mode": "reflect"}, init_type: str="xavier_uniform", ): """Initilize MelGAN discriminator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - kernel_sizes : List[int] - List of two kernel sizes. The prod will be used for the first conv layer, - and the first and the second kernel sizes will be used for the last two layers. - For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15, - the last two layers' kernel size will be 5 and 3, respectively. - channels : int - Initial number of channels for conv layer. - max_downsample_channels : int - Maximum number of channels for downsampling layers. - bias : bool - Whether to add bias parameter in convolution layers. - downsample_scales : List[int] - List of downsampling scales. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - pad : str - Padding function module name before dilated convolution layer. - pad_params : dict - Hyperparameters for padding function. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_sizes (List[int]): List of two kernel sizes. The prod will be used for the first conv layer, + and the first and the second kernel sizes will be used for the last two layers. + For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15, + the last two layers' kernel size will be 5 and 3, respectively. + channels (int): Initial number of channels for conv layer. + max_downsample_channels (int): Maximum number of channels for downsampling layers. + bias (bool): Whether to add bias parameter in convolution layers. + downsample_scales (List[int]): List of downsampling scales. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. """ super().__init__() @@ -395,14 +363,10 @@ class MelGANDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of output tensors of each layer (for feat_match_loss). + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of output tensors of each layer (for feat_match_loss). """ outs = [] for f in self.layers: @@ -440,39 +404,24 @@ class MelGANMultiScaleDiscriminator(nn.Layer): use_weight_norm: bool=True, init_type: str="xavier_uniform", ): """Initilize MelGAN multi-scale discriminator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - scales : int - Number of multi-scales. - downsample_pooling : str - Pooling module name for downsampling of the inputs. 
- downsample_pooling_params : dict - Parameters for the above pooling module. - kernel_sizes : List[int] - List of two kernel sizes. The sum will be used for the first conv layer, - and the first and the second kernel sizes will be used for the last two layers. - channels : int - Initial number of channels for conv layer. - max_downsample_channels : int - Maximum number of channels for downsampling layers. - bias : bool - Whether to add bias parameter in convolution layers. - downsample_scales : List[int] - List of downsampling scales. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - pad : str - Padding function module name before dilated convolution layer. - pad_params : dict - Hyperparameters for padding function. - use_causal_conv : bool - Whether to use causal convolution. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + scales (int): Number of multi-scales. + downsample_pooling (str): Pooling module name for downsampling of the inputs. + downsample_pooling_params (dict): Parameters for the above pooling module. + kernel_sizes (List[int]): List of two kernel sizes. The sum will be used for the first conv layer, + and the first and the second kernel sizes will be used for the last two layers. + channels (int): Initial number of channels for conv layer. + max_downsample_channels (int): Maximum number of channels for downsampling layers. + bias (bool): Whether to add bias parameter in convolution layers. + downsample_scales (List[int]): List of downsampling scales. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. + use_causal_conv (bool): Whether to use causal convolution. """ super().__init__() @@ -514,14 +463,10 @@ class MelGANMultiScaleDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of list of each discriminator outputs, which consists of each layer output tensors. + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of list of each discriminator outputs, which consists of each layer output tensors. """ outs = [] for f in self.discriminators: diff --git a/paddlespeech/t2s/models/melgan/style_melgan.py b/paddlespeech/t2s/models/melgan/style_melgan.py index bd451e1f..40a2f100 100644 --- a/paddlespeech/t2s/models/melgan/style_melgan.py +++ b/paddlespeech/t2s/models/melgan/style_melgan.py @@ -52,37 +52,23 @@ class StyleMelGANGenerator(nn.Layer): use_weight_norm: bool=True, init_type: str="xavier_uniform", ): """Initilize Style MelGAN generator. - Parameters - ---------- - in_channels : int - Number of input noise channels. - aux_channels : int - Number of auxiliary input channels. - channels : int - Number of channels for conv layer. - out_channels : int - Number of output channels. - kernel_size : int - Kernel size of conv layers. - dilation : int - Dilation factor for conv layers. - bias : bool - Whether to add bias parameter in convolution layers. - noise_upsample_scales : list - List of noise upsampling scales. - noise_upsample_activation : str - Activation function module name for noise upsampling. 
-        noise_upsample_activation_params : dict
-            Hyperparameters for the above activation function.
-        upsample_scales : list
-            List of upsampling scales.
-        upsample_mode : str
-            Upsampling mode in TADE layer.
-        gated_function : str
-            Gated function in TADEResBlock ("softmax" or "sigmoid").
-        use_weight_norm : bool
-            Whether to use weight norm.
-            If set to true, it will be applied to all of the conv layers.
+
+        Args:
+            in_channels (int): Number of input noise channels.
+            aux_channels (int): Number of auxiliary input channels.
+            channels (int): Number of channels for conv layer.
+            out_channels (int): Number of output channels.
+            kernel_size (int): Kernel size of conv layers.
+            dilation (int): Dilation factor for conv layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            noise_upsample_scales (list): List of noise upsampling scales.
+            noise_upsample_activation (str): Activation function module name for noise upsampling.
+            noise_upsample_activation_params (dict): Hyperparameters for the above activation function.
+            upsample_scales (list): List of upsampling scales.
+            upsample_mode (str): Upsampling mode in TADE layer.
+            gated_function (str): Gated function in TADEResBlock ("softmax" or "sigmoid").
+            use_weight_norm (bool): Whether to use weight norm.
+                If set to true, it will be applied to all of the conv layers.
         """
         super().__init__()
@@ -147,16 +133,12 @@ class StyleMelGANGenerator(nn.Layer):
     def forward(self, c, z=None):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        c : Tensor
-            Auxiliary input tensor (B, channels, T).
-        z : Tensor
-            Input noise tensor (B, in_channels, 1).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, out_channels, T ** prod(upsample_scales)).
+
+        Args:
+            c (Tensor): Auxiliary input tensor (B, channels, T).
+            z (Tensor): Input noise tensor (B, in_channels, 1).
+        Returns:
+            Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)).
         """
         # batch_max_steps(24000) == noise_upsample_factor(80) * upsample_factor(300)
         if z is None:
@@ -211,14 +193,10 @@ class StyleMelGANGenerator(nn.Layer):
     def inference(self, c):
         """Perform inference.
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (T, in_channels).
-        Returns
-        ----------
-        Tensor
-            Output tensor (T ** prod(upsample_scales), out_channels).
+        Args:
+            c (Tensor): Input tensor (T, in_channels).
+        Returns:
+            Tensor: Output tensor (T ** prod(upsample_scales), out_channels).
         """
         # (1, in_channels, T)
         c = c.transpose([1, 0]).unsqueeze(0)
@@ -278,18 +256,13 @@ class StyleMelGANDiscriminator(nn.Layer):
             use_weight_norm: bool=True,
             init_type: str="xavier_uniform", ):
         """Initilize Style MelGAN discriminator.
-        Parameters
-        ----------
-        repeats : int
-            Number of repititons to apply RWD.
-        window_sizes : list
-            List of random window sizes.
-        pqmf_params : list
-            List of list of Parameters for PQMF modules
-        discriminator_params : dict
-            Parameters for base discriminator module.
-        use_weight_nom : bool
-            Whether to apply weight normalization.
+
+        Args:
+            repeats (int): Number of repetitions to apply RWD.
+            window_sizes (list): List of random window sizes.
+            pqmf_params (list): List of list of parameters for PQMF modules.
+            discriminator_params (dict): Parameters for base discriminator module.
+            use_weight_norm (bool): Whether to apply weight normalization.
         """
         super().__init__()
@@ -325,15 +298,11 @@ class StyleMelGANDiscriminator(nn.Layer):
     def forward(self, x):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, 1, T).
- Returns - ---------- - List - List of discriminator outputs, #items in the list will be - equal to repeats * #discriminators. + Args: + x (Tensor): Input tensor (B, 1, T). + Returns: + List: List of discriminator outputs, #items in the list will be + equal to repeats * #discriminators. """ outs = [] for _ in range(self.repeats): diff --git a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py index 9eff4497..cc8460e4 100644 --- a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py +++ b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py @@ -31,51 +31,30 @@ from paddlespeech.t2s.modules.upsample import ConvInUpsampleNet class PWGGenerator(nn.Layer): """Wave Generator for Parallel WaveGAN - Parameters - ---------- - in_channels : int, optional - Number of channels of the input waveform, by default 1 - out_channels : int, optional - Number of channels of the output waveform, by default 1 - kernel_size : int, optional - Kernel size of the residual blocks inside, by default 3 - layers : int, optional - Number of residual blocks inside, by default 30 - stacks : int, optional - The number of groups to split the residual blocks into, by default 3 - Within each group, the dilation of the residual block grows - exponentially. - residual_channels : int, optional - Residual channel of the residual blocks, by default 64 - gate_channels : int, optional - Gate channel of the residual blocks, by default 128 - skip_channels : int, optional - Skip channel of the residual blocks, by default 64 - aux_channels : int, optional - Auxiliary channel of the residual blocks, by default 80 - aux_context_window : int, optional - The context window size of the first convolution applied to the - auxiliary input, by default 2 - dropout : float, optional - Dropout of the residual blocks, by default 0. - bias : bool, optional - Whether to use bias in residual blocks, by default True - use_weight_norm : bool, optional - Whether to use weight norm in all convolutions, by default True - use_causal_conv : bool, optional - Whether to use causal padding in the upsample network and residual - blocks, by default False - upsample_scales : List[int], optional - Upsample scales of the upsample network, by default [4, 4, 4, 4] - nonlinear_activation : Optional[str], optional - Non linear activation in upsample network, by default None - nonlinear_activation_params : Dict[str, Any], optional - Parameters passed to the linear activation in the upsample network, - by default {} - interpolate_mode : str, optional - Interpolation mode of the upsample network, by default "nearest" - freq_axis_kernel_size : int, optional - Kernel size along the frequency axis of the upsample network, by default 1 + Args: + in_channels (int, optional): Number of channels of the input waveform, by default 1 + out_channels (int, optional): Number of channels of the output waveform, by default 1 + kernel_size (int, optional): Kernel size of the residual blocks inside, by default 3 + layers (int, optional): Number of residual blocks inside, by default 30 + stacks (int, optional): The number of groups to split the residual blocks into, by default 3 + Within each group, the dilation of the residual block grows exponentially. 
+        residual_channels (int, optional): Residual channel of the residual blocks, by default 64
+        gate_channels (int, optional): Gate channel of the residual blocks, by default 128
+        skip_channels (int, optional): Skip channel of the residual blocks, by default 64
+        aux_channels (int, optional): Auxiliary channel of the residual blocks, by default 80
+        aux_context_window (int, optional): The context window size of the first convolution applied to the
+            auxiliary input, by default 2
+        dropout (float, optional): Dropout of the residual blocks, by default 0.
+        bias (bool, optional): Whether to use bias in residual blocks, by default True
+        use_weight_norm (bool, optional): Whether to use weight norm in all convolutions, by default True
+        use_causal_conv (bool, optional): Whether to use causal padding in the upsample network and residual
+            blocks, by default False
+        upsample_scales (List[int], optional): Upsample scales of the upsample network, by default [4, 4, 4, 4]
+        nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None
+        nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network,
+            by default {}
+        interpolate_mode (str, optional): Interpolation mode of the upsample network, by default "nearest"
+        freq_axis_kernel_size (int, optional): Kernel size along the frequency axis of the upsample network, by default 1
     """

     def __init__(
@@ -167,18 +146,13 @@ class PWGGenerator(nn.Layer):
     def forward(self, x, c):
         """Generate waveform.
-        Parameters
-        ----------
-        x : Tensor
-            Shape (N, C_in, T), The input waveform.
-        c : Tensor
-            Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It
+        Args:
+            x(Tensor): Shape (N, C_in, T), the input waveform.
+            c(Tensor): Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It
             is upsampled to match the time resolution of the input.
-        Returns
-        -------
-        Tensor
-            Shape (N, C_out, T), the generated waveform.
+        Returns:
+            Tensor: Shape (N, C_out, T), the generated waveform.
         """
         c = self.upsample_net(c)
         assert c.shape[-1] == x.shape[-1]
@@ -218,19 +192,14 @@ class PWGGenerator(nn.Layer):
         self.apply(_remove_weight_norm)

     def inference(self, c=None):
-        """Waveform generation. This function is used for single instance
-        inference.
-        Parameters
-        ----------
-        c : Tensor, optional
-            Shape (T', C_aux), the auxiliary input, by default None
-        x : Tensor, optional
-            Shape (T, C_in), the noise waveform, by default None
-            If not provided, a sample is drawn from a gaussian distribution.
-        Returns
-        -------
-        Tensor
-            Shape (T, C_out), the generated waveform
+        """Waveform generation. This function is used for single instance inference.
+
+        Args:
+            c(Tensor, optional): Shape (T', C_aux), the auxiliary input, by default None.
+                The noise input is drawn internally from a gaussian distribution.
+
+        Returns:
+            Tensor: Shape (T, C_out), the generated waveform.
         """
         # when to static, can not input x, see https://github.com/PaddlePaddle/Parakeet/pull/132/files
         x = paddle.randn(
@@ -244,32 +213,21 @@ class PWGDiscriminator(nn.Layer):

 class PWGDiscriminator(nn.Layer):
     """A convolutional discriminator for audio.
- Parameters - ---------- - in_channels : int, optional - Number of channels of the input audio, by default 1 - out_channels : int, optional - Output feature size, by default 1 - kernel_size : int, optional - Kernel size of convolutional sublayers, by default 3 - layers : int, optional - Number of layers, by default 10 - conv_channels : int, optional - Feature size of the convolutional sublayers, by default 64 - dilation_factor : int, optional - The factor with which dilation of each convolutional sublayers grows - exponentially if it is greater than 1, else the dilation of each - convolutional sublayers grows linearly, by default 1 - nonlinear_activation : str, optional - The activation after each convolutional sublayer, by default "leakyrelu" - nonlinear_activation_params : Dict[str, Any], optional - The parameters passed to the activation's initializer, by default - {"negative_slope": 0.2} - bias : bool, optional - Whether to use bias in convolutional sublayers, by default True - use_weight_norm : bool, optional - Whether to use weight normalization at all convolutional sublayers, - by default True + Args: + in_channels (int, optional): Number of channels of the input audio, by default 1 + out_channels (int, optional): Output feature size, by default 1 + kernel_size (int, optional): Kernel size of convolutional sublayers, by default 3 + layers (int, optional): Number of layers, by default 10 + conv_channels (int, optional): Feature size of the convolutional sublayers, by default 64 + dilation_factor (int, optional): The factor with which dilation of each convolutional sublayers grows + exponentially if it is greater than 1, else the dilation of each convolutional sublayers grows linearly, + by default 1 + nonlinear_activation (str, optional): The activation after each convolutional sublayer, by default "leakyrelu" + nonlinear_activation_params (Dict[str, Any], optional): The parameters passed to the activation's initializer, by default + {"negative_slope": 0.2} + bias (bool, optional): Whether to use bias in convolutional sublayers, by default True + use_weight_norm (bool, optional): Whether to use weight normalization at all convolutional sublayers, + by default True """ def __init__( @@ -330,15 +288,12 @@ class PWGDiscriminator(nn.Layer): def forward(self, x): """ - Parameters - ---------- - x : Tensor - Shape (N, in_channels, num_samples), the input audio. - - Returns - ------- - Tensor - Shape (N, out_channels, num_samples), the predicted logits. + + Args: + x (Tensor): Shape (N, in_channels, num_samples), the input audio. + + Returns: + Tensor: Shape (N, out_channels, num_samples), the predicted logits. """ return self.conv_layers(x) @@ -362,39 +317,25 @@ class PWGDiscriminator(nn.Layer): class ResidualPWGDiscriminator(nn.Layer): """A wavenet-style discriminator for audio. 
-    Parameters
-    ----------
-    in_channels : int, optional
-        Number of channels of the input audio, by default 1
-    out_channels : int, optional
-        Output feature size, by default 1
-    kernel_size : int, optional
-        Kernel size of residual blocks, by default 3
-    layers : int, optional
-        Number of residual blocks, by default 30
-    stacks : int, optional
-        Number of groups of residual blocks, within which the dilation
-        of each residual blocks grows exponentially, by default 3
-    residual_channels : int, optional
-        Residual channels of residual blocks, by default 64
-    gate_channels : int, optional
-        Gate channels of residual blocks, by default 128
-    skip_channels : int, optional
-        Skip channels of residual blocks, by default 64
-    dropout : float, optional
-        Dropout probability of residual blocks, by default 0.
-    bias : bool, optional
-        Whether to use bias in residual blocks, by default True
-    use_weight_norm : bool, optional
-        Whether to use weight normalization in all convolutional layers,
-        by default True
-    use_causal_conv : bool, optional
-        Whether to use causal convolution in residual blocks, by default False
-    nonlinear_activation : str, optional
-        Activation after convolutions other than those in residual blocks,
-        by default "leakyrelu"
-    nonlinear_activation_params : Dict[str, Any], optional
-        Parameters to pass to the activation, by default {"negative_slope": 0.2}
+    Args:
+        in_channels (int, optional): Number of channels of the input audio, by default 1
+        out_channels (int, optional): Output feature size, by default 1
+        kernel_size (int, optional): Kernel size of residual blocks, by default 3
+        layers (int, optional): Number of residual blocks, by default 30
+        stacks (int, optional): Number of groups of residual blocks, within which the dilation
+            of each residual blocks grows exponentially, by default 3
+        residual_channels (int, optional): Residual channels of residual blocks, by default 64
+        gate_channels (int, optional): Gate channels of residual blocks, by default 128
+        skip_channels (int, optional): Skip channels of residual blocks, by default 64
+        dropout (float, optional): Dropout probability of residual blocks, by default 0.
+        bias (bool, optional): Whether to use bias in residual blocks, by default True
+        use_weight_norm (bool, optional): Whether to use weight normalization in all convolutional layers,
+            by default True
+        use_causal_conv (bool, optional): Whether to use causal convolution in residual blocks, by default False
+        nonlinear_activation (str, optional): Activation after convolutions other than those in residual blocks,
+            by default "leakyrelu"
+        nonlinear_activation_params (Dict[str, Any], optional): Parameters to pass to the activation,
+            by default {"negative_slope": 0.2}
     """

     def __init__(
@@ -463,15 +404,11 @@ class ResidualPWGDiscriminator(nn.Layer):
     def forward(self, x):
         """
-        Parameters
-        ----------
-        x : Tensor
-            Shape (N, in_channels, num_samples), the input audio.
-
-        Returns
-        -------
-        Tensor
-            Shape (N, out_channels, num_samples), the predicted logits.
+        Args:
+            x(Tensor): Shape (N, in_channels, num_samples), the input audio.
+
+        Returns:
+            Tensor: Shape (N, out_channels, num_samples), the predicted logits.
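+
+        Examples:
+            A minimal usage sketch, not from the original code (the input
+            shape is an assumption): with the defaults, in_channels and
+            out_channels are both 1, so the logits keep the input length:
+
+            >>> import paddle
+            >>> model = ResidualPWGDiscriminator()
+            >>> x = paddle.randn([1, 1, 16000])  # hypothetical (N, 1, num_samples) audio
+            >>> logits = model(x)  # (1, 1, 16000)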
""" x = self.first_conv(x) skip = 0 diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2.py b/paddlespeech/t2s/models/tacotron2/tacotron2.py index da71077f..abb691b4 100644 --- a/paddlespeech/t2s/models/tacotron2/tacotron2.py +++ b/paddlespeech/t2s/models/tacotron2/tacotron2.py @@ -81,69 +81,39 @@ class Tacotron2(nn.Layer): # training related init_type: str="xavier_uniform", ): """Initialize Tacotron2 module. - Parameters - ---------- - idim : int - Dimension of the inputs. - odim : int - Dimension of the outputs. - embed_dim : int - Dimension of the token embedding. - elayers : int - Number of encoder blstm layers. - eunits : int - Number of encoder blstm units. - econv_layers : int - Number of encoder conv layers. - econv_filts : int - Number of encoder conv filter size. - econv_chans : int - Number of encoder conv filter channels. - dlayers : int - Number of decoder lstm layers. - dunits : int - Number of decoder lstm units. - prenet_layers : int - Number of prenet layers. - prenet_units : int - Number of prenet units. - postnet_layers : int - Number of postnet layers. - postnet_filts : int - Number of postnet filter size. - postnet_chans : int - Number of postnet filter channels. - output_activation : str - Name of activation function for outputs. - adim : int - Number of dimension of mlp in attention. - aconv_chans : int - Number of attention conv filter channels. - aconv_filts : int - Number of attention conv filter size. - cumulate_att_w : bool - Whether to cumulate previous attention weight. - use_batch_norm : bool - Whether to use batch normalization. - use_concate : bool - Whether to concat enc outputs w/ dec lstm outputs. - reduction_factor : int - Reduction factor. - spk_num : Optional[int] - Number of speakers. If set to > 1, assume that the - sids will be provided as the input and use sid embedding layer. - lang_num : Optional[int] - Number of languages. If set to > 1, assume that the - lids will be provided as the input and use sid embedding layer. - spk_embed_dim : Optional[int] - Speaker embedding dimension. If set to > 0, - assume that spk_emb will be provided as the input. - spk_embed_integration_type : str - How to integrate speaker embedding. - dropout_rate : float - Dropout rate. - zoneout_rate : float - Zoneout rate. + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + embed_dim (int): Dimension of the token embedding. + elayers (int): Number of encoder blstm layers. + eunits (int): Number of encoder blstm units. + econv_layers (int): Number of encoder conv layers. + econv_filts (int): Number of encoder conv filter size. + econv_chans (int): Number of encoder conv filter channels. + dlayers (int): Number of decoder lstm layers. + dunits (int): Number of decoder lstm units. + prenet_layers (int): Number of prenet layers. + prenet_units (int): Number of prenet units. + postnet_layers (int): Number of postnet layers. + postnet_filts (int): Number of postnet filter size. + postnet_chans (int): Number of postnet filter channels. + output_activation (str): Name of activation function for outputs. + adim (int): Number of dimension of mlp in attention. + aconv_chans (int): Number of attention conv filter channels. + aconv_filts (int): Number of attention conv filter size. + cumulate_att_w (bool): Whether to cumulate previous attention weight. + use_batch_norm (bool): Whether to use batch normalization. + use_concate (bool): Whether to concat enc outputs w/ dec lstm outputs. + reduction_factor (int): Reduction factor. 
+            spk_num (Optional[int]): Number of speakers. If set to > 1, assume that the
+                sids will be provided as the input and use sid embedding layer.
+            lang_num (Optional[int]): Number of languages. If set to > 1, assume that the
+                lids will be provided as the input and use lid embedding layer.
+            spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
+                assume that spk_emb will be provided as the input.
+            spk_embed_integration_type (str): How to integrate speaker embedding.
+            dropout_rate (float): Dropout rate.
+            zoneout_rate (float): Zoneout rate.
         """
         assert check_argument_types()
         super().__init__()
@@ -258,31 +228,19 @@ class Tacotron2(nn.Layer):
     ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
         """Calculate forward propagation.

-        Parameters
-        ----------
-        text : Tensor(int64)
-            Batch of padded character ids (B, T_text).
-        text_lengths : Tensor(int64)
-            Batch of lengths of each input batch (B,).
-        speech : Tensor
-            Batch of padded target features (B, T_feats, odim).
-        speech_lengths : Tensor(int64)
-            Batch of the lengths of each target (B,).
-        spk_emb : Optional[Tensor]
-            Batch of speaker embeddings (B, spk_embed_dim).
-        spk_id : Optional[Tensor]
-            Batch of speaker IDs (B, 1).
-        lang_id : Optional[Tensor]
-            Batch of language IDs (B, 1).
-
-        Returns
-        ----------
-        Tensor
-            Loss scalar value.
-        Dict
-            Statistics to be monitored.
-        Tensor
-            Weight value if not joint training else model outputs.
+        Args:
+            text (Tensor(int64)): Batch of padded character ids (B, T_text).
+            text_lengths (Tensor(int64)): Batch of lengths of each input batch (B,).
+            speech (Tensor): Batch of padded target features (B, T_feats, odim).
+            speech_lengths (Tensor(int64)): Batch of the lengths of each target (B,).
+            spk_emb (Optional[Tensor]): Batch of speaker embeddings (B, spk_embed_dim).
+            spk_id (Optional[Tensor]): Batch of speaker IDs (B, 1).
+            lang_id (Optional[Tensor]): Batch of language IDs (B, 1).
+
+        Returns:
+            Tensor: Loss scalar value.
+            Dict: Statistics to be monitored.
+            Tensor: Weight value if not joint training else model outputs.
         """
         text = text[:, :text_lengths.max()]
@@ -369,40 +327,26 @@ class Tacotron2(nn.Layer):
             use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
         """Generate the sequence of features given the sequences of characters.
-        Parameters
-        ----------
-        text Tensor(int64)
-            Input sequence of characters (T_text,).
-        speech : Optional[Tensor]
-            Feature sequence to extract style (N, idim).
-        spk_emb : ptional[Tensor]
-            Speaker embedding (spk_embed_dim,).
-        spk_id : Optional[Tensor]
-            Speaker ID (1,).
-        lang_id : Optional[Tensor]
-            Language ID (1,).
-        threshold : float
-            Threshold in inference.
-        minlenratio : float
-            Minimum length ratio in inference.
-        maxlenratio : float
-            Maximum length ratio in inference.
-        use_att_constraint : bool
-            Whether to apply attention constraint.
-        backward_window : int
-            Backward window in attention constraint.
-        forward_window : int
-            Forward window in attention constraint.
-        use_teacher_forcing : bool
-            Whether to use teacher forcing.
-
-        Return
-        ----------
-        Dict[str, Tensor]
-            Output dict including the following items:
-            * feat_gen (Tensor): Output sequence of features (T_feats, odim).
-            * prob (Tensor): Output sequence of stop probabilities (T_feats,).
-            * att_w (Tensor): Attention weights (T_feats, T).
+        Args:
+            text (Tensor(int64)): Input sequence of characters (T_text,).
+            speech (Optional[Tensor]): Feature sequence to extract style (N, idim).
+            spk_emb (Optional[Tensor]): Speaker embedding (spk_embed_dim,).
+ spk_id (Optional[Tensor]): Speaker ID (1,). + lang_id (Optional[Tensor]): Language ID (1,). + threshold (float): Threshold in inference. + minlenratio (float): Minimum length ratio in inference. + maxlenratio (float): Maximum length ratio in inference. + use_att_constraint (bool): Whether to apply attention constraint. + backward_window (int): Backward window in attention constraint. + forward_window (int): Forward window in attention constraint. + use_teacher_forcing (bool): Whether to use teacher forcing. + + Returns: + Dict[str, Tensor] + Output dict including the following items: + * feat_gen (Tensor): Output sequence of features (T_feats, odim). + * prob (Tensor): Output sequence of stop probabilities (T_feats,). + * att_w (Tensor): Attention weights (T_feats, T). """ x = text @@ -458,18 +402,13 @@ class Tacotron2(nn.Layer): spk_emb: paddle.Tensor) -> paddle.Tensor: """Integrate speaker embedding with hidden states. - Parameters - ---------- - hs : Tensor - Batch of hidden state sequences (B, Tmax, eunits). - spk_emb : Tensor - Batch of speaker embeddings (B, spk_embed_dim). - - Returns - ---------- - Tensor - Batch of integrated hidden state sequences (B, Tmax, eunits) if - integration_type is "add" else (B, Tmax, eunits + spk_embed_dim). + Args: + hs (Tensor): Batch of hidden state sequences (B, Tmax, eunits). + spk_emb (Tensor): Batch of speaker embeddings (B, spk_embed_dim). + + Returns: + Tensor: Batch of integrated hidden state sequences (B, Tmax, eunits) if + integration_type is "add" else (B, Tmax, eunits + spk_embed_dim). """ if self.spk_embed_integration_type == "add": diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index 4babe283..92754c30 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -48,127 +48,67 @@ class TransformerTTS(nn.Layer): .. _`Neural Speech Synthesis with Transformer Network`: https://arxiv.org/pdf/1809.08895.pdf - Parameters - ---------- - idim : int - Dimension of the inputs. - odim : int - Dimension of the outputs. - embed_dim : int, optional - Dimension of character embedding. - eprenet_conv_layers : int, optional - Number of encoder prenet convolution layers. - eprenet_conv_chans : int, optional - Number of encoder prenet convolution channels. - eprenet_conv_filts : int, optional - Filter size of encoder prenet convolution. - dprenet_layers : int, optional - Number of decoder prenet layers. - dprenet_units : int, optional - Number of decoder prenet hidden units. - elayers : int, optional - Number of encoder layers. - eunits : int, optional - Number of encoder hidden units. - adim : int, optional - Number of attention transformation dimensions. - aheads : int, optional - Number of heads for multi head attention. - dlayers : int, optional - Number of decoder layers. - dunits : int, optional - Number of decoder hidden units. - postnet_layers : int, optional - Number of postnet layers. - postnet_chans : int, optional - Number of postnet channels. - postnet_filts : int, optional - Filter size of postnet. - use_scaled_pos_enc : pool, optional - Whether to use trainable scaled positional encoding. - use_batch_norm : bool, optional - Whether to use batch normalization in encoder prenet. - encoder_normalize_before : bool, optional - Whether to perform layer normalization before encoder block. 
- decoder_normalize_before : bool, optional - Whether to perform layer normalization before decoder block. - encoder_concat_after : bool, optional - Whether to concatenate attention layer's input and output in encoder. - decoder_concat_after : bool, optional - Whether to concatenate attention layer's input and output in decoder. - positionwise_layer_type : str, optional - Position-wise operation type. - positionwise_conv_kernel_size : int, optional - Kernel size in position wise conv 1d. - reduction_factor : int, optional - Reduction factor. - spk_embed_dim : int, optional - Number of speaker embedding dimenstions. - spk_embed_integration_type : str, optional - How to integrate speaker embedding. - use_gst : str, optional - Whether to use global style token. - gst_tokens : int, optional - The number of GST embeddings. - gst_heads : int, optional - The number of heads in GST multihead attention. - gst_conv_layers : int, optional - The number of conv layers in GST. - gst_conv_chans_list : Sequence[int], optional - List of the number of channels of conv layers in GST. - gst_conv_kernel_size : int, optional - Kernal size of conv layers in GST. - gst_conv_stride : int, optional - Stride size of conv layers in GST. - gst_gru_layers : int, optional - The number of GRU layers in GST. - gst_gru_units : int, optional - The number of GRU units in GST. - transformer_lr : float, optional - Initial value of learning rate. - transformer_warmup_steps : int, optional - Optimizer warmup steps. - transformer_enc_dropout_rate : float, optional - Dropout rate in encoder except attention and positional encoding. - transformer_enc_positional_dropout_rate : float, optional - Dropout rate after encoder positional encoding. - transformer_enc_attn_dropout_rate : float, optional - Dropout rate in encoder self-attention module. - transformer_dec_dropout_rate : float, optional - Dropout rate in decoder except attention & positional encoding. - transformer_dec_positional_dropout_rate : float, optional - Dropout rate after decoder positional encoding. - transformer_dec_attn_dropout_rate : float, optional - Dropout rate in deocoder self-attention module. - transformer_enc_dec_attn_dropout_rate : float, optional - Dropout rate in encoder-deocoder attention module. - init_type : str, optional - How to initialize transformer parameters. - init_enc_alpha : float, optional - Initial value of alpha in scaled pos encoding of the encoder. - init_dec_alpha : float, optional - Initial value of alpha in scaled pos encoding of the decoder. - eprenet_dropout_rate : float, optional - Dropout rate in encoder prenet. - dprenet_dropout_rate : float, optional - Dropout rate in decoder prenet. - postnet_dropout_rate : float, optional - Dropout rate in postnet. - use_masking : bool, optional - Whether to apply masking for padded part in loss calculation. - use_weighted_masking : bool, optional - Whether to apply weighted masking in loss calculation. - bce_pos_weight : float, optional - Positive sample weight in bce calculation (only for use_masking=true). - loss_type : str, optional - How to calculate loss. - use_guided_attn_loss : bool, optional - Whether to use guided attention loss. - num_heads_applied_guided_attn : int, optional - Number of heads in each layer to apply guided attention loss. - num_layers_applied_guided_attn : int, optional - Number of layers to apply guided attention loss. - List of module names to apply guided attention loss. + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. 
+        embed_dim (int, optional): Dimension of character embedding.
+        eprenet_conv_layers (int, optional): Number of encoder prenet convolution layers.
+        eprenet_conv_chans (int, optional): Number of encoder prenet convolution channels.
+        eprenet_conv_filts (int, optional): Filter size of encoder prenet convolution.
+        dprenet_layers (int, optional): Number of decoder prenet layers.
+        dprenet_units (int, optional): Number of decoder prenet hidden units.
+        elayers (int, optional): Number of encoder layers.
+        eunits (int, optional): Number of encoder hidden units.
+        adim (int, optional): Number of attention transformation dimensions.
+        aheads (int, optional): Number of heads for multi head attention.
+        dlayers (int, optional): Number of decoder layers.
+        dunits (int, optional): Number of decoder hidden units.
+        postnet_layers (int, optional): Number of postnet layers.
+        postnet_chans (int, optional): Number of postnet channels.
+        postnet_filts (int, optional): Filter size of postnet.
+        use_scaled_pos_enc (bool, optional): Whether to use trainable scaled positional encoding.
+        use_batch_norm (bool, optional): Whether to use batch normalization in encoder prenet.
+        encoder_normalize_before (bool, optional): Whether to perform layer normalization before encoder block.
+        decoder_normalize_before (bool, optional): Whether to perform layer normalization before decoder block.
+        encoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in encoder.
+        decoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in decoder.
+        positionwise_layer_type (str, optional): Position-wise operation type.
+        positionwise_conv_kernel_size (int, optional): Kernel size in position wise conv 1d.
+        reduction_factor (int, optional): Reduction factor.
+        spk_embed_dim (int, optional): Number of speaker embedding dimensions.
+        spk_embed_integration_type (str, optional): How to integrate speaker embedding.
+        use_gst (bool, optional): Whether to use global style token.
+        gst_tokens (int, optional): The number of GST embeddings.
+        gst_heads (int, optional): The number of heads in GST multihead attention.
+        gst_conv_layers (int, optional): The number of conv layers in GST.
+        gst_conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in GST.
+        gst_conv_kernel_size (int, optional): Kernel size of conv layers in GST.
+        gst_conv_stride (int, optional): Stride size of conv layers in GST.
+        gst_gru_layers (int, optional): The number of GRU layers in GST.
+        gst_gru_units (int, optional): The number of GRU units in GST.
+        transformer_lr (float, optional): Initial value of learning rate.
+        transformer_warmup_steps (int, optional): Optimizer warmup steps.
+        transformer_enc_dropout_rate (float, optional): Dropout rate in encoder except attention and positional encoding.
+        transformer_enc_positional_dropout_rate (float, optional): Dropout rate after encoder positional encoding.
+        transformer_enc_attn_dropout_rate (float, optional): Dropout rate in encoder self-attention module.
+        transformer_dec_dropout_rate (float, optional): Dropout rate in decoder except attention & positional encoding.
+        transformer_dec_positional_dropout_rate (float, optional): Dropout rate after decoder positional encoding.
+        transformer_dec_attn_dropout_rate (float, optional): Dropout rate in decoder self-attention module.
+        transformer_enc_dec_attn_dropout_rate (float, optional): Dropout rate in encoder-decoder attention module.
+ init_type (str, optional): How to initialize transformer parameters. + init_enc_alpha (float, optional): Initial value of alpha in scaled pos encoding of the encoder. + init_dec_alpha (float, optional): Initial value of alpha in scaled pos encoding of the decoder. + eprenet_dropout_rate (float, optional): Dropout rate in encoder prenet. + dprenet_dropout_rate (float, optional): Dropout rate in decoder prenet. + postnet_dropout_rate (float, optional): Dropout rate in postnet. + use_masking (bool, optional): Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool, optional): Whether to apply weighted masking in loss calculation. + bce_pos_weight (float, optional): Positive sample weight in bce calculation (only for use_masking=true). + loss_type (str, optional): How to calculate loss. + use_guided_attn_loss (bool, optional): Whether to use guided attention loss. + num_heads_applied_guided_attn (int, optional): Number of heads in each layer to apply guided attention loss. + num_layers_applied_guided_attn (int, optional): Number of layers to apply guided attention loss. + List of module names to apply guided attention loss. """ def __init__( @@ -398,25 +338,16 @@ class TransformerTTS(nn.Layer): ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: """Calculate forward propagation. - Parameters - ---------- - text : Tensor(int64) - Batch of padded character ids (B, Tmax). - text_lengths : Tensor(int64) - Batch of lengths of each input batch (B,). - speech : Tensor - Batch of padded target features (B, Lmax, odim). - speech_lengths : Tensor(int64) - Batch of the lengths of each target (B,). - spk_emb : Tensor, optional - Batch of speaker embeddings (B, spk_embed_dim). - - Returns - ---------- - Tensor - Loss scalar value. - Dict - Statistics to be monitored. + Args: + text(Tensor(int64)): Batch of padded character ids (B, Tmax). + text_lengths(Tensor(int64)): Batch of lengths of each input batch (B,). + speech(Tensor): Batch of padded target features (B, Lmax, odim). + speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,). + spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim). + + Returns: + Tensor: Loss scalar value. + Dict: Statistics to be monitored. """ # input of embedding must be int64 @@ -525,31 +456,19 @@ class TransformerTTS(nn.Layer): ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Generate the sequence of features given the sequences of characters. - Parameters - ---------- - text : Tensor(int64) - Input sequence of characters (T,). - speech : Tensor, optional - Feature sequence to extract style (N, idim). - spk_emb : Tensor, optional - Speaker embedding vector (spk_embed_dim,). - threshold : float, optional - Threshold in inference. - minlenratio : float, optional - Minimum length ratio in inference. - maxlenratio : float, optional - Maximum length ratio in inference. - use_teacher_forcing : bool, optional - Whether to use teacher forcing. - - Returns - ---------- - Tensor - Output sequence of features (L, odim). - Tensor - Output sequence of stop probabilities (L,). - Tensor - Encoder-decoder (source) attention weights (#layers, #heads, L, T). + Args: + text(Tensor(int64)): Input sequence of characters (T,). + speech(Tensor, optional): Feature sequence to extract style (N, idim). + spk_emb(Tensor, optional): Speaker embedding vector (spk_embed_dim,). + threshold(float, optional): Threshold in inference. + minlenratio(float, optional): Minimum length ratio in inference. 
+ maxlenratio(float, optional): Maximum length ratio in inference. + use_teacher_forcing(bool, optional): Whether to use teacher forcing. + + Returns: + Tensor: Output sequence of features (L, odim). + Tensor: Output sequence of stop probabilities (L,). + Tensor: Encoder-decoder (source) attention weights (#layers, #heads, L, T). """ # input of embedding must be int64 @@ -671,23 +590,17 @@ class TransformerTTS(nn.Layer): def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor: """Make masks for self-attention. - Parameters - ---------- - ilens : Tensor - Batch of lengths (B,). + Args: + ilens(Tensor): Batch of lengths (B,). - Returns - ------- - Tensor - Mask tensor for self-attention. - dtype=paddle.bool + Returns: + Tensor: Mask tensor for self-attention. dtype=paddle.bool - Examples - ------- - >>> ilens = [5, 3] - >>> self._source_mask(ilens) - tensor([[[1, 1, 1, 1, 1], - [1, 1, 1, 0, 0]]]) bool + Examples: + >>> ilens = [5, 3] + >>> self._source_mask(ilens) + tensor([[[1, 1, 1, 1, 1], + [1, 1, 1, 0, 0]]]) bool """ x_masks = make_non_pad_mask(ilens) @@ -696,30 +609,25 @@ class TransformerTTS(nn.Layer): def _target_mask(self, olens: paddle.Tensor) -> paddle.Tensor: """Make masks for masked self-attention. - Parameters - ---------- - olens : LongTensor - Batch of lengths (B,). - - Returns - ---------- - Tensor - Mask tensor for masked self-attention. - - Examples - ---------- - >>> olens = [5, 3] - >>> self._target_mask(olens) - tensor([[[1, 0, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 1, 0, 0], - [1, 1, 1, 1, 0], - [1, 1, 1, 1, 1]], - [[1, 0, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 1, 0, 0], - [1, 1, 1, 0, 0], - [1, 1, 1, 0, 0]]], dtype=paddle.uint8) + Args: + olens (Tensor(int64)): Batch of lengths (B,). + + Returns: + Tensor: Mask tensor for masked self-attention. + + Examples: + >>> olens = [5, 3] + >>> self._target_mask(olens) + tensor([[[1, 0, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 1, 0, 0], + [1, 1, 1, 1, 0], + [1, 1, 1, 1, 1]], + [[1, 0, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 1, 0, 0], + [1, 1, 1, 0, 0], + [1, 1, 1, 0, 0]]], dtype=paddle.uint8) """ y_masks = make_non_pad_mask(olens) @@ -731,17 +639,12 @@ class TransformerTTS(nn.Layer): spk_emb: paddle.Tensor) -> paddle.Tensor: """Integrate speaker embedding with hidden states. - Parameters - ---------- - hs : Tensor - Batch of hidden state sequences (B, Tmax, adim). - spk_emb : Tensor - Batch of speaker embeddings (B, spk_embed_dim). - - Returns - ---------- - Tensor - Batch of integrated hidden state sequences (B, Tmax, adim). + Args: + hs(Tensor): Batch of hidden state sequences (B, Tmax, adim). + spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim). + + Returns: + Tensor: Batch of integrated hidden state sequences (B, Tmax, adim). """ if self.spk_embed_integration_type == "add": diff --git a/paddlespeech/t2s/models/waveflow.py b/paddlespeech/t2s/models/waveflow.py index e519e0c5..2c2f7ebb 100644 --- a/paddlespeech/t2s/models/waveflow.py +++ b/paddlespeech/t2s/models/waveflow.py @@ -30,20 +30,14 @@ __all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"] def fold(x, n_group): - r"""Fold audio or spectrogram's temporal dimension in to groups. + """Fold audio or spectrogram's temporal dimension in to groups. - Parameters - ---------- - x : Tensor [shape=(\*, time_steps) - The input tensor. + Args: + x(Tensor): The input tensor. shape=(\*, time_steps) + n_group(int): The size of a group. - n_group : int - The size of a group. 
-
-        Returns
-        ---------
-        Tensor : [shape=(\*, time_steps // n_group, group)]
-            Folded tensor.
+        Returns:
+            Tensor: Folded tensor. shape=(\*, time_steps // n_group, group)
     """
     spatial_shape = list(x.shape[:-1])
     time_steps = paddle.shape(x)[-1]
@@ -58,27 +52,23 @@ class UpsampleNet(nn.LayerList):
     It consists of several conv2dtranspose layers which perform deconvolution
     on mel and time dimension.
-    Parameters
-    ----------
-    upscale_factors : List[int], optional
-        Time upsampling factors for each Conv2DTranspose Layer.
-
-        The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose
-        Layers. Each upscale_factor is used as the ``stride`` for the
-        corresponding Conv2DTranspose. Defaults to [16, 16], this the default
-        upsampling factor is 256.
+    Args:
+        upscale_factors(List[int], optional): Time upsampling factors for each Conv2DTranspose Layer.
+            The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose
+            Layers. Each upscale_factor is used as the ``stride`` for the
+            corresponding Conv2DTranspose. Defaults to [16, 16], thus the default
+            upsampling factor is 256.
-    Notes
-    ------
-    ``np.prod(upscale_factors)`` should equals the ``hop_length`` of the stft
-    transformation used to extract spectrogram features from audio.
+    Notes:
+        ``np.prod(upscale_factors)`` should equal the ``hop_length`` of the stft
+        transformation used to extract spectrogram features from audio.
-    For example, ``16 * 16 = 256``, then the spectrogram extracted with a stft
-    transformation whose ``hop_length`` equals 256 is suitable.
+        For example, ``16 * 16 = 256``, then the spectrogram extracted with a stft
+        transformation whose ``hop_length`` equals 256 is suitable.
-    See Also
-    ---------
-    ``librosa.core.stft``
+    See Also:
+        ``librosa.core.stft``
     """

    def __init__(self, upsample_factors):
@@ -101,25 +91,18 @@ class UpsampleNet(nn.LayerList):
         self.upsample_factors = upsample_factors

     def forward(self, x, trim_conv_artifact=False):
-        r"""Forward pass of the ``UpsampleNet``.
+        """Forward pass of the ``UpsampleNet``.
-        Parameters
-        -----------
-        x : Tensor [shape=(batch_size, input_channels, time_steps)]
-            The input spectrogram.
+        Args:
+            x(Tensor): The input spectrogram. shape=(batch_size, input_channels, time_steps)
+            trim_conv_artifact(bool, optional): Trim deconvolution artifact at each layer. Defaults to False.
-        trim_conv_artifact : bool, optional
-            Trim deconvolution artifact at each layer. Defaults to False.
+        Returns:
+            Tensor: The upsampled spectrogram. shape=(batch_size, input_channels, time_steps \* upsample_factor)
-        Returns
-        --------
-        Tensor: [shape=(batch_size, input_channels, time_steps \* upsample_factor)]
-            The upsampled spectrogram.
-
-        Notes
-        --------
-        If trim_conv_artifact is ``True``, the output time steps is less
-        than ``time_steps \* upsample_factors``.
+        Notes:
+            If trim_conv_artifact is ``True``, the output time steps are less
+            than ``time_steps \* upsample_factors``.
         """
         x = paddle.unsqueeze(x, 1)  # (B, C, T) -> (B, 1, C, T)
         for layer in self:
@@ -139,19 +122,11 @@ class ResidualBlock(nn.Layer):
     same paddign in width dimension. It also has projection for the condition
     and output.
-    Parameters
-    ----------
-    channels : int
-        Feature size of the input.
-
-    cond_channels : int
-        Featuer size of the condition.
-
-    kernel_size : Tuple[int]
-        Kernel size of the Convolution2d applied to the input.
-
-    dilations : int
-        Dilations of the Convolution2d applied to the input.
+    Args:
+        channels (int): Feature size of the input.
+        cond_channels (int): Feature size of the condition.
+        kernel_size (Tuple[int]): Kernel size of the Convolution2d applied to the input.
+        dilations (int): Dilations of the Convolution2d applied to the input.
     """

     def __init__(self, channels, cond_channels, kernel_size, dilations):
@@ -197,21 +172,13 @@ class ResidualBlock(nn.Layer):
     def forward(self, x, condition):
         """Compute output for a whole folded sequence.
-        Parameters
-        ----------
-        x : Tensor [shape=(batch_size, channel, height, width)]
-            The input.
-
-        condition : Tensor [shape=(batch_size, condition_channel, height, width)]
-            The local condition.
+        Args:
+            x (Tensor): The input. [shape=(batch_size, channel, height, width)]
+            condition (Tensor): The local condition. [shape=(batch_size, condition_channel, height, width)]
-        Returns
-        -------
-        res : Tensor [shape=(batch_size, channel, height, width)]
-            The residual output.
-
-        skip : Tensor [shape=(batch_size, channel, height, width)]
-            The skip output.
+        Returns:
+            res (Tensor): The residual output. [shape=(batch_size, channel, height, width)]
+            skip (Tensor): The skip output. [shape=(batch_size, channel, height, width)]
         """
         x_in = x
         x = self.conv(x)
@@ -248,21 +215,14 @@ class ResidualBlock(nn.Layer):
     def add_input(self, x_row, condition_row):
         """Compute the output for a row and update the buffer.
-        Parameters
-        ----------
-        x_row : Tensor [shape=(batch_size, channel, 1, width)]
-            A row of the input.
-
-        condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)]
-            A row of the condition.
+        Args:
+            x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width)
+            condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width)
-        Returns
-        -------
-        res : Tensor [shape=(batch_size, channel, 1, width)]
-            A row of the the residual output.
+        Returns:
+            res (Tensor): A row of the residual output. shape=(batch_size, channel, 1, width)
+            skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width)
-        skip : Tensor [shape=(batch_size, channel, 1, width)]
-            A row of the skip output.
         """
         x_row_in = x_row
         if len(paddle.shape(self._conv_buffer)) == 1:
@@ -297,27 +257,15 @@ class ResidualBlock(nn.Layer):
 class ResidualNet(nn.LayerList):
     """A stack of several ResidualBlocks. It merges condition at each layer.
-    Parameters
-    ----------
-    n_layer : int
-        Number of ResidualBlocks in the ResidualNet.
-
-    residual_channels : int
-        Feature size of each ResidualBlocks.
-
-    condition_channels : int
-        Feature size of the condition.
+    Args:
+        n_layer (int): Number of ResidualBlocks in the ResidualNet.
+        residual_channels (int): Feature size of each ResidualBlock.
+        condition_channels (int): Feature size of the condition.
+        kernel_size (Tuple[int]): Kernel size of each ResidualBlock.
+        dilations_h (List[int]): Dilation in height dimension of every ResidualBlock.
-    kernel_size : Tuple[int]
-        Kernel size of each ResidualBlock.
-
-    dilations_h : List[int]
-        Dilation in height dimension of every ResidualBlock.
-
-    Raises
-    ------
-    ValueError
-        If the length of dilations_h does not equals n_layers.
+    Raises:
+        ValueError: If the length of dilations_h does not equal n_layers.
     """

     def __init__(self,
@@ -339,18 +287,13 @@ class ResidualNet(nn.LayerList):
     def forward(self, x, condition):
         """Comput the output of given the input and the condition.
-        Parameters
-        -----------
-        x : Tensor [shape=(batch_size, channel, height, width)]
-            The input.
- - condition : Tensor [shape=(batch_size, condition_channel, height, width)] - The local condition. - - Returns - -------- - Tensor : [shape=(batch_size, channel, height, width)] - The output, which is an aggregation of all the skip outputs. + Args: + x (Tensor): The input. shape=(batch_size, channel, height, width) + condition (Tensor): The local condition. shape=(batch_size, condition_channel, height, width) + + Returns: + Tensor : The output, which is an aggregation of all the skip outputs. shape=(batch_size, channel, height, width) + """ skip_connections = [] for layer in self: @@ -368,21 +311,14 @@ class ResidualNet(nn.LayerList): def add_input(self, x_row, condition_row): """Compute the output for a row and update the buffers. - Parameters - ---------- - x_row : Tensor [shape=(batch_size, channel, 1, width)] - A row of the input. - - condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)] - A row of the condition. - - Returns - ------- - res : Tensor [shape=(batch_size, channel, 1, width)] - A row of the the residual output. - - skip : Tensor [shape=(batch_size, channel, 1, width)] - A row of the skip output. + Args: + x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width) + condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width) + + Returns: + res (Tensor): A row of the the residual output. shape=(batch_size, channel, 1, width) + skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width) + """ skip_connections = [] for layer in self: @@ -400,22 +336,12 @@ class Flow(nn.Layer): probability density estimation. The ``inverse`` method implements the sampling. - Parameters - ---------- - n_layers : int - Number of ResidualBlocks in the Flow. - - channels : int - Feature size of the ResidualBlocks. - - mel_bands : int - Feature size of the mel spectrogram (mel bands). - - kernel_size : Tuple[int] - Kernel size of each ResisualBlocks in the Flow. - - n_group : int - Number of timesteps to the folded into a group. + Args: + n_layers (int): Number of ResidualBlocks in the Flow. + channels (int): Feature size of the ResidualBlocks. + mel_bands (int): Feature size of the mel spectrogram (mel bands). + kernel_size (Tuple[int]): Kernel size of each ResisualBlocks in the Flow. + n_group (int): Number of timesteps to the folded into a group. """ dilations_dict = { 8: [1, 1, 1, 1, 1, 1, 1, 1], @@ -466,26 +392,16 @@ class Flow(nn.Layer): """Probability density estimation. It is done by inversely transform a sample from p(X) into a sample from p(Z). - Parameters - ----------- - x : Tensor [shape=(batch, 1, height, width)] - A input sample of the distribution p(X). - - condition : Tensor [shape=(batch, condition_channel, height, width)] - The local condition. - - Returns - -------- - z (Tensor): shape(batch, 1, height, width), the transformed sample. - - Tuple[Tensor, Tensor] - The parameter of the transformation. - - logs (Tensor): shape(batch, 1, height - 1, width), the log scale - of the transformation from x to z. - - b (Tensor): shape(batch, 1, height - 1, width), the shift of the - transformation from x to z. + Args: + x (Tensor): A input sample of the distribution p(X). shape=(batch, 1, height, width) + condition (Tensor): The local condition. shape=(batch, condition_channel, height, width) + + Returns: + z (Tensor): shape(batch, 1, height, width), the transformed sample. + Tuple[Tensor, Tensor]: + The parameter of the transformation. 
+ logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the transformation from x to z. + b (Tensor): shape(batch, 1, height - 1, width), the shift of the transformation from x to z. """ # (B, C, H-1, W) logs, b = self._predict_parameters(x[:, :, :-1, :], @@ -516,27 +432,12 @@ class Flow(nn.Layer): """Sampling from the the distrition p(X). It is done by sample form p(Z) and transform the sample. It is a auto regressive transformation. - Parameters - ----------- - z : Tensor [shape=(batch, 1, height, width)] - A sample of the distribution p(Z). - - condition : Tensor [shape=(batch, condition_channel, height, width)] - The local condition. - - Returns - --------- - x : Tensor [shape=(batch, 1, height, width)] - The transformed sample. - - Tuple[Tensor, Tensor] - The parameter of the transformation. - - logs (Tensor): shape(batch, 1, height - 1, width), the log scale - of the transformation from x to z. - - b (Tensor): shape(batch, 1, height - 1, width), the shift of the - transformation from x to z. + Args: + z(Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps + condition(Tensor): The local condition. shape=(batch, condition_channel, time_steps) + Returns: + Tensor: + The transformed sample. shape=(batch, 1, height, width) """ z_0 = z[:, :, :1, :] x = paddle.zeros_like(z) @@ -560,25 +461,13 @@ class WaveFlow(nn.LayerList): """An Deep Reversible layer that is composed of severel auto regressive flows. - Parameters - ----------- - n_flows : int - Number of flows in the WaveFlow model. - - n_layers : int - Number of ResidualBlocks in each Flow. - - n_group : int - Number of timesteps to fold as a group. - - channels : int - Feature size of each ResidualBlock. - - mel_bands : int - Feature size of mel spectrogram (mel bands). - - kernel_size : Union[int, List[int]] - Kernel size of the convolution layer in each ResidualBlock. + Args: + n_flows (int): Number of flows in the WaveFlow model. + n_layers (int): Number of ResidualBlocks in each Flow. + n_group (int): Number of timesteps to fold as a group. + channels (int): Feature size of each ResidualBlock. + mel_bands (int): Feature size of mel spectrogram (mel bands). + kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock. """ def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, @@ -628,22 +517,13 @@ class WaveFlow(nn.LayerList): """Probability density estimation of random variable x given the condition. - Parameters - ----------- - x : Tensor [shape=(batch_size, time_steps)] - The audio. - - condition : Tensor [shape=(batch_size, condition channel, time_steps)] - The local condition (mel spectrogram here). - - Returns - -------- - z : Tensor [shape=(batch_size, time_steps)] - The transformed random variable. - - log_det_jacobian: Tensor [shape=(1,)] - The log determinant of the jacobian of the transformation from x - to z. + Args: + x (Tensor): The audio. shape=(batch_size, time_steps) + condition (Tensor): The local condition (mel spectrogram here). shape=(batch_size, condition channel, time_steps) + + Returns: + Tensor: The transformed random variable. shape=(batch_size, time_steps) + Tensor: The log determinant of the jacobian of the transformation from x to z. shape=(1,) """ # x: (B, T) # condition: (B, C, T) upsampled condition @@ -678,18 +558,13 @@ class WaveFlow(nn.LayerList): Each Flow transform .. math:: `z_{i-1}` to .. math:: `z_{i}` in an autoregressive manner. 
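For reference: the `logs`/`b` pair documented in the hunk above parameterizes an elementwise affine map, so its Jacobian is diagonal and the log-determinant that `WaveFlowLoss` later consumes is just a sum of log scales. A minimal numpy sketch, assuming the convention z = (x - b) * exp(-logs); the sign convention in the actual implementation may differ:

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=8)           # stands in for one folded row of audio
logs = 0.1 * rng.normal(size=8)  # log scale predicted by the ResidualNet
b = rng.normal(size=8)           # shift predicted by the ResidualNet

# Assumed affine map from x to z (elementwise).
z = (x - b) * np.exp(-logs)

# dz/dx is diagonal, so log|det J| = sum of the elementwise log scales,
# here -logs.sum(); verify against an explicit diagonal Jacobian.
log_det = np.linalg.slogdet(np.diag(np.exp(-logs)))[1]
assert np.isclose(log_det, -logs.sum())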
-        Parameters
-        ----------
-        z : Tensor [shape=(batch, 1, time_steps]
-            A sample of the distribution p(Z).
-
-        condition : Tensor [shape=(batch, condition_channel, time_steps)]
-            The local condition.
+        Args:
+            z (Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps)
+            condition (Tensor): The local condition. shape=(batch, condition_channel, time_steps)
-        Returns
-        --------
-        x : Tensor [shape=(batch_size, time_steps)]
-            The transformed sample (audio here).
+        Returns:
+            Tensor: The transformed sample (audio here). shape=(batch_size, time_steps)
+
         """
         z, condition = self._trim(z, condition)
@@ -714,29 +589,15 @@ class WaveFlow(nn.LayerList):
 class ConditionalWaveFlow(nn.LayerList):
     """ConditionalWaveFlow, a UpsampleNet with a WaveFlow model.
-    Parameters
-    ----------
-    upsample_factors : List[int]
-        Upsample factors for the upsample net.
-
-    n_flows : int
-        Number of flows in the WaveFlow model.
-
-    n_layers : int
-        Number of ResidualBlocks in each Flow.
-
-    n_group : int
-        Number of timesteps to fold as a group.
-
-    channels : int
-        Feature size of each ResidualBlock.
-
-    n_mels : int
-        Feature size of mel spectrogram (mel bands).
-
-    kernel_size : Union[int, List[int]]
-        Kernel size of the convolution layer in each ResidualBlock.
-    """
+    Args:
+        upsample_factors (List[int]): Upsample factors for the upsample net.
+        n_flows (int): Number of flows in the WaveFlow model.
+        n_layers (int): Number of ResidualBlocks in each Flow.
+        n_group (int): Number of timesteps to fold as a group.
+        channels (int): Feature size of each ResidualBlock.
+        n_mels (int): Feature size of mel spectrogram (mel bands).
+        kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock.
+    """

     def __init__(self,
                  upsample_factors: List[int],
@@ -760,22 +621,13 @@ class ConditionalWaveFlow(nn.LayerList):
         """Compute the transformed random variable z (x to z) and the log of
         the determinant of the jacobian of the transformation from x to z.
-        Parameters
-        ----------
-        audio : Tensor [shape=(B, T)]
-            The audio.
+        Args:
+            audio(Tensor): The audio. shape=(B, T)
+            mel(Tensor): The mel spectrogram. shape=(B, C_mel, T_mel)
-        mel : Tensor [shape=(B, C_mel, T_mel)]
-            The mel spectrogram.
-
-        Returns
-        -------
-        z : Tensor [shape=(B, T)]
-            The inversely transformed random variable z (x to z)
-
-        log_det_jacobian: Tensor [shape=(1,)]
-            the log of the determinant of the jacobian of the transformation
-            from x to z.
+        Returns:
+            Tensor: The inversely transformed random variable z (x to z). shape=(B, T)
+            Tensor: The log of the determinant of the jacobian of the transformation from x to z. shape=(1,)
         """
         condition = self.encoder(mel)
         z, log_det_jacobian = self.decoder(audio, condition)
@@ -783,17 +635,13 @@ class ConditionalWaveFlow(nn.LayerList):
     @paddle.no_grad()
     def infer(self, mel):
-        r"""Generate raw audio given mel spectrogram.
+        """Generate raw audio given mel spectrogram.
-        Parameters
-        ----------
-        mel : Tensor [shape=(B, C_mel, T_mel)]
-            Mel spectrogram (in log-magnitude).
+        Args:
+            mel(Tensor): Mel spectrogram (in log-magnitude). shape=(B, C_mel, T_mel)
-        Returns
-        -------
-        Tensor : [shape=(B, T)]
-            The synthesized audio, where``T <= T_mel \* upsample_factors``.
+        Returns:
+            Tensor: The synthesized audio, where ``T <= T_mel \* upsample_factors``.
shape=(B, T) """ start = time.time() condition = self.encoder(mel, trim_conv_artifact=True) # (B, C, T) @@ -808,15 +656,11 @@ class ConditionalWaveFlow(nn.LayerList): def predict(self, mel): """Generate raw audio given mel spectrogram. - Parameters - ---------- - mel : np.ndarray [shape=(C_mel, T_mel)] - Mel spectrogram of an utterance(in log-magnitude). + Args: + mel(np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) - Returns - ------- - np.ndarray [shape=(T,)] - The synthesized audio. + Returns: + np.ndarray: The synthesized audio. shape=(T,) """ mel = paddle.to_tensor(mel) mel = paddle.unsqueeze(mel, 0) @@ -828,18 +672,12 @@ class ConditionalWaveFlow(nn.LayerList): def from_pretrained(cls, config, checkpoint_path): """Build a ConditionalWaveFlow model from a pretrained model. - Parameters - ---------- - config: yacs.config.CfgNode - model configs + Args: + config(yacs.config.CfgNode): model configs + checkpoint_path(Path or str): the path of pretrained model checkpoint, without extension name - checkpoint_path: Path or str - the path of pretrained model checkpoint, without extension name - - Returns - ------- - ConditionalWaveFlow - The model built from pretrained result. + Returns: + ConditionalWaveFlow The model built from pretrained result. """ model = cls(upsample_factors=config.model.upsample_factors, n_flows=config.model.n_flows, @@ -855,11 +693,9 @@ class ConditionalWaveFlow(nn.LayerList): class WaveFlowLoss(nn.Layer): """Criterion of a WaveFlow model. - Parameters - ---------- - sigma : float - The standard deviation of the gaussian noise used in WaveFlow, by - default 1.0. + Args: + sigma (float): The standard deviation of the gaussian noise used in WaveFlow, + by default 1.0. """ def __init__(self, sigma=1.0): @@ -871,19 +707,13 @@ class WaveFlowLoss(nn.Layer): """Compute the loss given the transformed random variable z and the log_det_jacobian of transformation from x to z. - Parameters - ---------- - z : Tensor [shape=(B, T)] - The transformed random variable (x to z). - - log_det_jacobian : Tensor [shape=(1,)] - The log of the determinant of the jacobian matrix of the - transformation from x to z. + Args: + z(Tensor): The transformed random variable (x to z). shape=(B, T) + log_det_jacobian(Tensor): The log of the determinant of the jacobian matrix of the + transformation from x to z. shape=(1,) - Returns - ------- - Tensor [shape=(1,)] - The loss. + Returns: + Tensor: The loss. shape=(1,) """ loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma ) - log_det_jacobian @@ -895,15 +725,12 @@ class ConditionalWaveFlow2Infer(ConditionalWaveFlow): def forward(self, mel): """Generate raw audio given mel spectrogram. - Parameters - ---------- - mel : np.ndarray [shape=(C_mel, T_mel)] - Mel spectrogram of an utterance(in log-magnitude). - - Returns - ------- - np.ndarray [shape=(T,)] - The synthesized audio. + Args: + mel (np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) + + Returns: + np.ndarray: The synthesized audio. shape=(T,) + """ audio = self.predict(mel) return audio diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py index fcf39a48..1320ffa3 100644 --- a/paddlespeech/t2s/models/wavernn/wavernn.py +++ b/paddlespeech/t2s/models/wavernn/wavernn.py @@ -67,14 +67,10 @@ class MelResNet(nn.Layer): def forward(self, x): ''' - Parameters - ---------- - x : Tensor - Input tensor (B, in_dims, T). - Returns - ---------- - Tensor - Output tensor (B, res_out_dims, T). 
+        Args:
+            x (Tensor): Input tensor (B, in_dims, T).
+        Returns:
+            Tensor: Output tensor (B, res_out_dims, T).
         '''
         x = self.conv_in(x)
@@ -121,16 +117,11 @@ class UpsampleNetwork(nn.Layer):
     def forward(self, m):
         '''
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (B, C_aux, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, (T - 2 * pad) * prob(upsample_scales), C_aux).
-        Tensor
-            Output tensor (B, (T - 2 * pad) * prob(upsample_scales), res_out_dims).
+        Args:
+            c (Tensor): Input tensor (B, C_aux, T).
+        Returns:
+            Tensor: Output tensor (B, (T - 2 * pad) * prod(upsample_scales), C_aux).
+            Tensor: Output tensor (B, (T - 2 * pad) * prod(upsample_scales), res_out_dims).
         '''
         # aux: [B, C_aux, T]
         # -> [B, res_out_dims, T - 2 * aux_context_window]
@@ -172,32 +163,20 @@ class WaveRNN(nn.Layer):
             mode='RAW',
             init_type: str="xavier_uniform", ):
         '''
-        Parameters
-        ----------
-        rnn_dims : int, optional
-            Hidden dims of RNN Layers.
-        fc_dims : int, optional
-            Dims of FC Layers.
-        bits : int, optional
-            bit depth of signal.
-        aux_context_window : int, optional
-            The context window size of the first convolution applied to the
-            auxiliary input, by default 2
-        upsample_scales : List[int], optional
-            Upsample scales of the upsample network.
-        aux_channels : int, optional
-            Auxiliary channel of the residual blocks.
-        compute_dims : int, optional
-            Dims of Conv1D in MelResNet.
-        res_out_dims : int, optional
-            Dims of output in MelResNet.
-        res_blocks : int, optional
-            Number of residual blocks.
-        mode : str, optional
-            Output mode of the WaveRNN vocoder. `MOL` for Mixture of Logistic Distribution,
-            and `RAW` for quantized bits as the model's output.
-        init_type : str
-            How to initialize parameters.
+        Args:
+            rnn_dims (int, optional): Hidden dims of RNN Layers.
+            fc_dims (int, optional): Dims of FC Layers.
+            bits (int, optional): Bit depth of the signal.
+            aux_context_window (int, optional): The context window size of the first convolution applied to the
+                auxiliary input, by default 2
+            upsample_scales (List[int], optional): Upsample scales of the upsample network.
+            aux_channels (int, optional): Auxiliary channel of the residual blocks.
+            compute_dims (int, optional): Dims of Conv1D in MelResNet.
+            res_out_dims (int, optional): Dims of output in MelResNet.
+            res_blocks (int, optional): Number of residual blocks.
+            mode (str, optional): Output mode of the WaveRNN vocoder.
+                `MOL` for Mixture of Logistic Distribution, and `RAW` for quantized bits as the model's output.
+            init_type (str): How to initialize parameters.
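For reference on the `bits` and `mu_law` options documented above: `RAW` mode predicts one of 2**bits quantized classes, usually after mu-law companding. A minimal numpy sketch of the standard mu-law encode/decode pair; this illustrates the usual formulation, not necessarily the exact variant used in this repo:

import numpy as np

def mu_law_encode(x, bits=9):
    # Compand a [-1, 1] waveform, then quantize to 2**bits classes.
    mu = 2**bits - 1
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    return np.floor((y + 1) / 2 * mu + 0.5).astype(np.int64)

def mu_law_decode(q, bits=9):
    # Map class indices back to [-1, 1] and expand.
    mu = 2**bits - 1
    y = 2 * q.astype(np.float64) / mu - 1
    return np.sign(y) * ((1 + mu)**np.abs(y) - 1) / mu

wav = 0.8 * np.sin(np.linspace(0, 8 * np.pi, 1000))
# Round trip is lossy only up to the quantization step.
assert np.max(np.abs(mu_law_decode(mu_law_encode(wav)) - wav)) < 0.02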
''' super().__init__() self.mode = mode @@ -245,18 +224,13 @@ class WaveRNN(nn.Layer): def forward(self, x, c): ''' - Parameters - ---------- - x : Tensor - wav sequence, [B, T] - c : Tensor - mel spectrogram [B, C_aux, T'] - - T = (T' - 2 * aux_context_window ) * hop_length - Returns - ---------- - Tensor - [B, T, n_classes] + Args: + x (Tensor): wav sequence, [B, T] + c (Tensor): mel spectrogram [B, C_aux, T'] + + T = (T' - 2 * aux_context_window ) * hop_length + Returns: + Tensor: [B, T, n_classes] ''' # Although we `_flatten_parameters()` on init, when using DataParallel # the model gets replicated, making it no longer guaranteed that the @@ -304,22 +278,14 @@ class WaveRNN(nn.Layer): mu_law: bool=True, gen_display: bool=False): """ - Parameters - ---------- - c : Tensor - input mels, (T', C_aux) - batched : bool - generate in batch or not - target : int - target number of samples to be generated in each batch entry - overlap : int - number of samples for crossfading between batches - mu_law : bool - use mu law or not - Returns - ---------- - wav sequence - Output (T' * prod(upsample_scales), out_channels, C_out). + Args: + c(Tensor): input mels, (T', C_aux) + batched(bool): generate in batch or not + target(int): target number of samples to be generated in each batch entry + overlap(int): number of samples for crossfading between batches + mu_law(bool) + Returns: + wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out). """ self.eval() @@ -434,16 +400,13 @@ class WaveRNN(nn.Layer): def pad_tensor(self, x, pad, side='both'): ''' - Parameters - ---------- - x : Tensor - mel, [1, n_frames, 80] - pad : int - side : str - 'both', 'before' or 'after' - Returns - ---------- - Tensor + Args: + x(Tensor): mel, [1, n_frames, 80] + pad(int): + side(str, optional): (Default value = 'both') + + Returns: + Tensor ''' b, t, _ = paddle.shape(x) # for dygraph to static graph @@ -461,38 +424,29 @@ class WaveRNN(nn.Layer): Fold the tensor with overlap for quick batched inference. Overlap will be used for crossfading in xfade_and_unfold() - Parameters - ---------- - x : Tensor - Upsampled conditioning features. mels or aux - shape=(1, T, features) - mels: [1, T, 80] - aux: [1, T, 128] - target : int - Target timesteps for each index of batch - overlap : int - Timesteps for both xfade and rnn warmup - overlap = hop_length * 2 - - Returns - ---------- - Tensor - shape=(num_folds, target + 2 * overlap, features) - num_flods = (time_seq - overlap) // (target + overlap) - mel: [num_folds, target + 2 * overlap, 80] - aux: [num_folds, target + 2 * overlap, 128] - - Details - ---------- - x = [[h1, h2, ... hn]] - - Where each h is a vector of conditioning features - - Eg: target=2, overlap=1 with x.size(1)=10 - - folded = [[h1, h2, h3, h4], - [h4, h5, h6, h7], - [h7, h8, h9, h10]] + Args: + x(Tensor): Upsampled conditioning features. mels or aux + shape=(1, T, features) + mels: [1, T, 80] + aux: [1, T, 128] + target(int): Target timesteps for each index of batch + overlap(int): Timesteps for both xfade and rnn warmup + + Returns: + Tensor: + shape=(num_folds, target + 2 * overlap, features) + num_flods = (time_seq - overlap) // (target + overlap) + mel: [num_folds, target + 2 * overlap, 80] + aux: [num_folds, target + 2 * overlap, 128] + + Details: + x = [[h1, h2, ... 
hn]]
+            Where each h is a vector of conditioning features
+            Eg: target=2, overlap=1 with x.size(1)=10
+
+            folded = [[h1, h2, h3, h4],
+                      [h4, h5, h6, h7],
+                      [h7, h8, h9, h10]]
         '''
         _, total_len, features = paddle.shape(x)
@@ -520,37 +474,33 @@ class WaveRNN(nn.Layer):
     def xfade_and_unfold(self, y, target: int=12000, overlap: int=600):
         ''' Applies a crossfade and unfolds into a 1d array.
-        Parameters
-        ----------
-        y : Tensor
-            Batched sequences of audio samples
-            shape=(num_folds, target + 2 * overlap)
-            dtype=paddle.float32
-        overlap : int
-            Timesteps for both xfade and rnn warmup
-
-        Returns
-        ----------
-        Tensor
-            audio samples in a 1d array
-            shape=(total_len)
-            dtype=paddle.float32
-
-        Details
-        ----------
-        y = [[seq1],
-             [seq2],
-             [seq3]]
-
-        Apply a gain envelope at both ends of the sequences
-
-        y = [[seq1_in, seq1_target, seq1_out],
-             [seq2_in, seq2_target, seq2_out],
-             [seq3_in, seq3_target, seq3_out]]
-
-        Stagger and add up the groups of samples:
-
-        [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]
+        Args:
+            y (Tensor):
+                Batched sequences of audio samples
+                shape=(num_folds, target + 2 * overlap)
+                dtype=paddle.float32
+            overlap (int): Timesteps for both xfade and rnn warmup
+
+        Returns:
+            Tensor:
+                audio samples in a 1d array
+                shape=(total_len)
+                dtype=paddle.float32
+
+        Details:
+            y = [[seq1],
+                 [seq2],
+                 [seq3]]
+
+            Apply a gain envelope at both ends of the sequences
+
+            y = [[seq1_in, seq1_target, seq1_out],
+                 [seq2_in, seq2_target, seq2_out],
+                 [seq3_in, seq3_target, seq3_out]]
+
+            Stagger and add up the groups of samples:
+
+            [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]
         '''
         # num_folds = (total_len - overlap) // (target + overlap)
diff --git a/paddlespeech/t2s/modules/causal_conv.py b/paddlespeech/t2s/modules/causal_conv.py
index c0d4f955..3abccc15 100644
--- a/paddlespeech/t2s/modules/causal_conv.py
+++ b/paddlespeech/t2s/modules/causal_conv.py
@@ -41,14 +41,10 @@ class CausalConv1D(nn.Layer):
     def forward(self, x):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, in_channels, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, out_channels, T).
+        Args:
+            x (Tensor): Input tensor (B, in_channels, T).
+        Returns:
+            Tensor: Output tensor (B, out_channels, T).
         """
         return self.conv(self.pad(x))[:, :, :x.shape[2]]
@@ -70,13 +66,9 @@ class CausalConv1DTranspose(nn.Layer):
     def forward(self, x):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, in_channels, T_in).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, out_channels, T_out).
+        Args:
+            x (Tensor): Input tensor (B, in_channels, T_in).
+        Returns:
+            Tensor: Output tensor (B, out_channels, T_out).
         """
         return self.deconv(x)[:, :, :-self.stride]
diff --git a/paddlespeech/t2s/modules/conformer/convolution.py b/paddlespeech/t2s/modules/conformer/convolution.py
index e4a6c8c6..185c62fb 100644
--- a/paddlespeech/t2s/modules/conformer/convolution.py
+++ b/paddlespeech/t2s/modules/conformer/convolution.py
@@ -18,12 +18,10 @@ from paddle import nn
 class ConvolutionModule(nn.Layer):
     """ConvolutionModule in Conformer model.
-    Parameters
-    ----------
-    channels : int
-        The number of channels of conv layers.
-    kernel_size : int
-        Kernerl size of conv layers.
+
+    Args:
+        channels (int): The number of channels of conv layers.
+        kernel_size (int): Kernel size of conv layers.
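The CausalConv1D hunk above keeps the pad-then-trim idiom `self.conv(self.pad(x))[:, :, :x.shape[2]]`. A minimal paddle sketch of that pattern, with hypothetical sizes (kernel_size=3, dilation=1, so the left pad is (kernel_size - 1) * dilation = 2):

import paddle
import paddle.nn as nn

pad = nn.Pad1D([2, 0], mode="constant")   # pad only on the left
conv = nn.Conv1D(in_channels=1, out_channels=1, kernel_size=3)

x = paddle.randn([1, 1, 8])
y = conv(pad(x))[:, :, :x.shape[2]]       # same trim as CausalConv1D.forward

# Output length matches input length, and y[t] only depends on x[0..t],
# which is what makes the convolution causal.
assert y.shape == x.shape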
""" def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): @@ -59,14 +57,11 @@ class ConvolutionModule(nn.Layer): def forward(self, x): """Compute convolution module. - Parameters - ---------- - x : paddle.Tensor - Input tensor (#batch, time, channels). - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, channels). + + Args: + x (Tensor): Input tensor (#batch, time, channels). + Returns: + Tensor: Output tensor (#batch, time, channels). """ # exchange the temporal dimension and the feature dimension x = x.transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py b/paddlespeech/t2s/modules/conformer/encoder_layer.py index 2949dc37..61c32612 100644 --- a/paddlespeech/t2s/modules/conformer/encoder_layer.py +++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py @@ -21,38 +21,29 @@ from paddlespeech.t2s.modules.layer_norm import LayerNorm class EncoderLayer(nn.Layer): """Encoder layer module. - Parameters - ---------- - size : int - Input dimension. - self_attn : nn.Layer - Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance - can be used as the argument. - feed_forward : nn.Layer - Feed-forward module instance. - `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance - can be used as the argument. - feed_forward_macaron : nn.Layer - Additional feed-forward module instance. - `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance - can be used as the argument. - conv_module : nn.Layer - Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate : float - Dropout rate. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - stochastic_depth_rate : float - Proability to skip this layer. - During training, the layer may skip residual computation and return input - as-is with given probability. + + Args: + size (int): Input dimension. + self_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance + can be used as the argument. + feed_forward (nn.Layer): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + feed_forward_macaron (nn.Layer): Additional feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + conv_module (nn.Layer): Convolution module instance. + `ConvlutionModule` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + stochastic_depth_rate (float): Proability to skip this layer. + During training, the layer may skip residual computation and return input + as-is with given probability. """ def __init__( @@ -93,22 +84,17 @@ class EncoderLayer(nn.Layer): def forward(self, x_input, mask, cache=None): """Compute encoded features. 
- Parameters - ---------- - x_input : Union[Tuple, paddle.Tensor] - Input tensor w/ or w/o pos emb. - - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. - - w/o pos emb: Tensor (#batch, time, size). - mask : paddle.Tensor - Mask tensor for the input (#batch, time). - cache paddle.Tensor - Cache tensor of the input (#batch, time - 1, size). - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, size). - paddle.Tensor - Mask tensor (#batch, time). + + Args: + x_input(Union[Tuple, Tensor]): Input tensor w/ or w/o pos emb. + - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. + - w/o pos emb: Tensor (#batch, time, size). + mask(Tensor): Mask tensor for the input (#batch, time). + cache (Tensor): + + Returns: + Tensor: Output tensor (#batch, time, size). + Tensor: Mask tensor (#batch, time). """ if isinstance(x_input, tuple): x, pos_emb = x_input[0], x_input[1] diff --git a/paddlespeech/t2s/modules/conv.py b/paddlespeech/t2s/modules/conv.py index 68766d5e..aa875bd5 100644 --- a/paddlespeech/t2s/modules/conv.py +++ b/paddlespeech/t2s/modules/conv.py @@ -40,36 +40,29 @@ class Conv1dCell(nn.Conv1D): 2. padding must be a causal padding (recpetive_field - 1, 0). Thus, these arguments are removed from the ``__init__`` method of this class. - - Parameters - ---------- - in_channels: int - The feature size of the input. - out_channels: int - The feature size of the output. - kernel_size: int or Tuple[int] - The size of the kernel. - dilation: int or Tuple[int] - The dilation of the convolution, by default 1 - weight_attr: ParamAttr, Initializer, str or bool, optional - The parameter attribute of the convolution kernel, by default None. - bias_attr: ParamAttr, Initializer, str or bool, optional - The parameter attribute of the bias. If ``False``, this layer does not - have a bias, by default None. - - Examples - -------- - >>> cell = Conv1dCell(3, 4, kernel_size=5) - >>> inputs = [paddle.randn([4, 3]) for _ in range(16)] - >>> outputs = [] - >>> cell.eval() - >>> cell.start_sequence() - >>> for xt in inputs: - >>> outputs.append(cell.add_input(xt)) - >>> len(outputs)) - 16 - >>> outputs[0].shape - [4, 4] + + Args: + in_channels (int): The feature size of the input. + out_channels (int): The feature size of the output. + kernel_size (int or Tuple[int]): The size of the kernel. + dilation (int or Tuple[int]): The dilation of the convolution, by default 1 + weight_attr (ParamAttr, Initializer, str or bool, optional) : The parameter attribute of the convolution kernel, + by default None. + bias_attr (ParamAttr, Initializer, str or bool, optional):The parameter attribute of the bias. + If ``False``, this layer does not have a bias, by default None. + + Examples: + >>> cell = Conv1dCell(3, 4, kernel_size=5) + >>> inputs = [paddle.randn([4, 3]) for _ in range(16)] + >>> outputs = [] + >>> cell.eval() + >>> cell.start_sequence() + >>> for xt in inputs: + >>> outputs.append(cell.add_input(xt)) + >>> len(outputs)) + 16 + >>> outputs[0].shape + [4, 4] """ def __init__(self, @@ -103,15 +96,13 @@ class Conv1dCell(nn.Conv1D): def start_sequence(self): """Prepare the layer for a series of incremental forward. - Warnings - --------- - This method should be called before a sequence of calls to - ``add_input``. + Warnings: + This method should be called before a sequence of calls to + ``add_input``. - Raises - ------ - Exception - If this method is called when the layer is in training mode. 
+        Raises:
+            Exception: If this method is called when the layer is in training mode.
         """
         if self.training:
             raise Exception("only use start_sequence in evaluation")
@@ -130,10 +121,9 @@ class Conv1dCell(nn.Conv1D):
     def initialize_buffer(self, x_t):
         """Initialize the buffer for the step input.
-        Parameters
-        ----------
-        x_t : Tensor [shape=(batch_size, in_channels)]
-            The step input.
+        Args:
+            x_t (Tensor): The step input. shape=(batch_size, in_channels)
+
         """
         batch_size, _ = x_t.shape
         self._buffer = paddle.zeros(
@@ -143,26 +133,22 @@ class Conv1dCell(nn.Conv1D):
     def update_buffer(self, x_t):
         """Shift the buffer by one step.
-        Parameters
-        ----------
-        x_t : Tensor [shape=(batch_size, in_channels)]
-            The step input.
+        Args:
+            x_t (Tensor): The step input. shape=(batch_size, in_channels)
+
         """
         self._buffer = paddle.concat(
             [self._buffer[:, :, 1:], paddle.unsqueeze(x_t, -1)], -1)

     def add_input(self, x_t):
         """Add step input and compute step output.
-
-        Parameters
-        -----------
-        x_t : Tensor [shape=(batch_size, in_channels)]
-            The step input.
-
-        Returns
-        -------
-        y_t :Tensor [shape=(batch_size, out_channels)]
-            The step output.
+
+        Args:
+            x_t (Tensor): The step input. shape=(batch_size, in_channels)
+
+        Returns:
+            y_t (Tensor): The step output. shape=(batch_size, out_channels)
+
         """
         batch_size = x_t.shape[0]
         if self.receptive_field > 1:
@@ -186,33 +172,26 @@ class Conv1dCell(nn.Conv1D):
 class Conv1dBatchNorm(nn.Layer):
     """A Conv1D Layer followed by a BatchNorm1D.
-    Parameters
-    ----------
-    in_channels : int
-        The feature size of the input.
-    out_channels : int
-        The feature size of the output.
-    kernel_size : int
-        The size of the convolution kernel.
-    stride : int, optional
-        The stride of the convolution, by default 1.
-    padding : int, str or Tuple[int], optional
-        The padding of the convolution.
-        If int, a symmetrical padding is applied before convolution;
-        If str, it should be "same" or "valid";
-        If Tuple[int], its length should be 2, meaning
-        ``(pad_before, pad_after)``, by default 0.
-    weight_attr : ParamAttr, Initializer, str or bool, optional
-        The parameter attribute of the convolution kernel, by default None.
-    bias_attr : ParamAttr, Initializer, str or bool, optional
-        The parameter attribute of the bias of the convolution, by default
-        None.
-    data_format : str ["NCL" or "NLC"], optional
-        The data layout of the input, by default "NCL"
-    momentum : float, optional
-        The momentum of the BatchNorm1D layer, by default 0.9
-    epsilon : [type], optional
-        The epsilon of the BatchNorm1D layer, by default 1e-05
+    Args:
+        in_channels (int): The feature size of the input.
+        out_channels (int): The feature size of the output.
+        kernel_size (int): The size of the convolution kernel.
+        stride (int, optional): The stride of the convolution, by default 1.
+        padding (int, str or Tuple[int], optional):
+            The padding of the convolution.
+            If int, a symmetrical padding is applied before convolution;
+            If str, it should be "same" or "valid";
+            If Tuple[int], its length should be 2, meaning
+            ``(pad_before, pad_after)``, by default 0.
+        weight_attr (ParamAttr, Initializer, str or bool, optional):
+            The parameter attribute of the convolution kernel,
+            by default None.
+        bias_attr (ParamAttr, Initializer, str or bool, optional):
+            The parameter attribute of the bias of the convolution,
+            by default None.
+        data_format (str ["NCL" or "NLC"], optional): The data layout of the input, by default "NCL".
+        momentum (float, optional): The momentum of the BatchNorm1D layer, by default 0.9.
+        epsilon (float, optional): The epsilon of the BatchNorm1D layer, by default 1e-05.
    """

    def __init__(self,
@@ -244,16 +223,15 @@ class Conv1dBatchNorm(nn.Layer):
    def forward(self, x):
        """Forward pass of the Conv1dBatchNorm layer.
-
-        Parameters
-        ----------
-        x : Tensor [shape=(B, C_in, T_in) or (B, T_in, C_in)]
-            The input tensor. Its data layout depends on ``data_format``.
-
-        Returns
-        -------
-        Tensor [shape=(B, C_out, T_out) or (B, T_out, C_out)]
-            The output tensor.
+
+        Args:
+            x (Tensor): The input tensor. Its data layout depends on ``data_format``.
+                shape=(B, C_in, T_in) or (B, T_in, C_in)
+
+        Returns:
+            Tensor: The output tensor.
+                shape=(B, C_out, T_out) or (B, T_out, C_out)
+
        """
        x = self.conv(x)
        x = self.bn(x)
diff --git a/paddlespeech/t2s/modules/geometry.py b/paddlespeech/t2s/modules/geometry.py
index a3d56f7d..01eb5ad0 100644
--- a/paddlespeech/t2s/modules/geometry.py
+++ b/paddlespeech/t2s/modules/geometry.py
@@ -17,24 +17,18 @@ import paddle

 def shuffle_dim(x, axis, perm=None):
    """Permute input tensor along aixs given the permutation or randomly.
+
+    Args:
+        x (Tensor): The input tensor.
+        axis (int): The axis to shuffle.
+        perm (List[int], ndarray, optional):
+            The order to reorder the tensor along the ``axis``-th dimension.
+            It is a permutation of ``[0, d)``, where d is the size of the
+            ``axis``-th dimension of the input tensor. If not provided,
+            a random permutation is used. Defaults to None.

-    Parameters
-    ----------
-    x : Tensor
-        The input tensor.
-    axis : int
-        The axis to shuffle.
-    perm : List[int], ndarray, optional
-        The order to reorder the tensor along the ``axis``-th dimension.
-
-        It is a permutation of ``[0, d)``, where d is the size of the
-        ``axis``-th dimension of the input tensor. If not provided,
-        a random permutation is used. Defaults to None.
-
-    Returns
-    ---------
-    Tensor
-        The shuffled tensor, which has the same shape as x does.
+    Returns:
+        Tensor: The shuffled tensor, which has the same shape as x does.
    """
    size = x.shape[axis]
    if perm is not None and len(perm) != size:
diff --git a/paddlespeech/t2s/modules/layer_norm.py b/paddlespeech/t2s/modules/layer_norm.py
index 4edd22c9..088b98e0 100644
--- a/paddlespeech/t2s/modules/layer_norm.py
+++ b/paddlespeech/t2s/modules/layer_norm.py
@@ -18,13 +18,9 @@ from paddle import nn

 class LayerNorm(nn.LayerNorm):
    """Layer normalization module.
-
-    Parameters
-    ----------
-    nout : int
-        Output dim size.
-    dim : int
-        Dimension to be normalized.
+    Args:
+        nout (int): Output dim size.
+        dim (int): Dimension to be normalized.
    """

    def __init__(self, nout, dim=-1):
@@ -35,15 +31,11 @@ class LayerNorm(nn.LayerNorm):
    def forward(self, x):
        """Apply layer normalization.

-        Parameters
-        ----------
-        x : paddle.Tensor
-            Input tensor.
+        Args:
+            x (Tensor): Input tensor.

-        Returns
-        ----------
-        paddle.Tensor
-            Normalized tensor.
+        Returns:
+            Tensor: Normalized tensor.
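A minimal sketch of what the LayerNorm subclass above does for ``dim != -1`` (illustrative, assuming only the public paddle API): the target axis is transposed to the end, the built-in last-axis LayerNorm is applied, and the layout is restored.

    import paddle
    from paddle import nn

    x = paddle.randn([2, 80, 100])   # (B, C, T)
    norm = nn.LayerNorm(80)          # paddle's LayerNorm normalizes the last axis
    # normalize axis 1 by transposing it to the end and back
    y = norm(x.transpose([0, 2, 1])).transpose([0, 2, 1])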
""" if self.dim == -1: diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index 618f444a..93644e24 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -118,16 +118,13 @@ def discretized_mix_logistic_loss(y_hat, def sample_from_discretized_mix_logistic(y, log_scale_min=None): """ Sample from discretized mixture of logistic distributions - Parameters - ---------- - y : Tensor - (B, C, T) - log_scale_min : float - Log scale minimum value - Returns - ---------- - Tensor - sample in range of [-1, 1]. + + Args: + y(Tensor): (B, C, T) + log_scale_min(float, optional): (Default value = None) + + Returns: + Tensor: sample in range of [-1, 1]. """ if log_scale_min is None: log_scale_min = float(np.log(1e-14)) @@ -181,14 +178,10 @@ class GuidedAttentionLoss(nn.Layer): def __init__(self, sigma=0.4, alpha=1.0, reset_always=True): """Initialize guided attention loss module. - Parameters - ---------- - sigma : float, optional - Standard deviation to control how close attention to a diagonal. - alpha : float, optional - Scaling coefficient (lambda). - reset_always : bool, optional - Whether to always reset masks. + Args: + sigma (float, optional): Standard deviation to control how close attention to a diagonal. + alpha (float, optional): Scaling coefficient (lambda). + reset_always (bool, optional): Whether to always reset masks. """ super().__init__() @@ -205,19 +198,13 @@ class GuidedAttentionLoss(nn.Layer): def forward(self, att_ws, ilens, olens): """Calculate forward propagation. - Parameters - ---------- - att_ws : Tensor - Batch of attention weights (B, T_max_out, T_max_in). - ilens : Tensor(int64) - Batch of input lenghts (B,). - olens : Tensor(int64) - Batch of output lenghts (B,). - - Returns - ---------- - Tensor - Guided attention loss value. + Args: + att_ws(Tensor): Batch of attention weights (B, T_max_out, T_max_in). + ilens(Tensor(int64)): Batch of input lenghts (B,). + olens(Tensor(int64)): Batch of output lenghts (B,). + + Returns: + Tensor: Guided attention loss value. """ if self.guided_attn_masks is None: @@ -282,39 +269,33 @@ class GuidedAttentionLoss(nn.Layer): def _make_masks(ilens, olens): """Make masks indicating non-padded part. - Parameters - ---------- - ilens : Tensor(int64) or List - Batch of lengths (B,). - olens : Tensor(int64) or List - Batch of lengths (B,). - - Returns - ---------- - Tensor - Mask tensor indicating non-padded part. - - Examples - ---------- - >>> ilens, olens = [5, 2], [8, 5] - >>> _make_mask(ilens, olens) - tensor([[[1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1]], - - [[1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]]], dtype=paddle.uint8) + Args: + ilens(Tensor(int64) or List): Batch of lengths (B,). + olens(Tensor(int64) or List): Batch of lengths (B,). + + Returns: + Tensor: Mask tensor indicating non-padded part. 
+
+        Examples:
+            >>> ilens, olens = [5, 2], [8, 5]
+            >>> _make_masks(ilens, olens)
+            tensor([[[1, 1, 1, 1, 1],
+                     [1, 1, 1, 1, 1],
+                     [1, 1, 1, 1, 1],
+                     [1, 1, 1, 1, 1],
+                     [1, 1, 1, 1, 1],
+                     [1, 1, 1, 1, 1],
+                     [1, 1, 1, 1, 1],
+                     [1, 1, 1, 1, 1]],
+
+                    [[1, 1, 0, 0, 0],
+                     [1, 1, 0, 0, 0],
+                     [1, 1, 0, 0, 0],
+                     [1, 1, 0, 0, 0],
+                     [1, 1, 0, 0, 0],
+                     [0, 0, 0, 0, 0],
+                     [0, 0, 0, 0, 0],
+                     [0, 0, 0, 0, 0]]], dtype=paddle.uint8)
        """
        # (B, T_in)
@@ -330,34 +311,24 @@ class GuidedAttentionLoss(nn.Layer):
 class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
    """Guided attention loss function module for multi head attention.

-    Parameters
-    ----------
-    sigma : float, optional
-        Standard deviation to controlGuidedAttentionLoss
-        how close attention to a diagonal.
-    alpha : float, optional
-        Scaling coefficient (lambda).
-    reset_always : bool, optional
-        Whether to always reset masks.
+    Args:
+        sigma (float, optional): Standard deviation to control
+            how close the attention is to a diagonal.
+        alpha (float, optional): Scaling coefficient (lambda).
+        reset_always (bool, optional): Whether to always reset masks.
    """

    def forward(self, att_ws, ilens, olens):
        """Calculate forward propagation.

-        Parameters
-        ----------
-        att_ws : Tensor
-            Batch of multi head attention weights (B, H, T_max_out, T_max_in).
-        ilens : Tensor
-            Batch of input lenghts (B,).
-        olens : Tensor
-            Batch of output lenghts (B,).
-
-        Returns
-        ----------
-        Tensor
-            Guided attention loss value.
+        Args:
+            att_ws(Tensor): Batch of multi head attention weights (B, H, T_max_out, T_max_in).
+            ilens(Tensor): Batch of input lengths (B,).
+            olens(Tensor): Batch of output lengths (B,).
+
+        Returns:
+            Tensor: Guided attention loss value.
        """
        if self.guided_attn_masks is None:
@@ -382,14 +353,11 @@ class Tacotron2Loss(nn.Layer):
                 use_weighted_masking=False,
                 bce_pos_weight=20.0):
        """Initialize Tactoron2 loss module.
-        Parameters
-        ----------
-        use_masking : bool
-            Whether to apply masking for padded part in loss calculation.
-        use_weighted_masking : bool
-            Whether to apply weighted masking in loss calculation.
-        bce_pos_weight : float
-            Weight of positive sample of stop token.
+
+        Args:
+            use_masking (bool): Whether to apply masking for padded part in loss calculation.
+            use_weighted_masking (bool): Whether to apply weighted masking in loss calculation.
+            bce_pos_weight (float): Weight of positive sample of stop token.
        """
        super().__init__()
        assert (use_masking != use_weighted_masking) or not use_masking
@@ -405,28 +373,19 @@ class Tacotron2Loss(nn.Layer):
    def forward(self, after_outs, before_outs, logits, ys, stop_labels, olens):
        """Calculate forward propagation.

-        Parameters
-        ----------
-        after_outs : Tensor
-            Batch of outputs after postnets (B, Lmax, odim).
-        before_outs : Tensor
-            Batch of outputs before postnets (B, Lmax, odim).
-        logits : Tensor
-            Batch of stop logits (B, Lmax).
-        ys : Tensor
-            Batch of padded target features (B, Lmax, odim).
-        stop_labels : Tensor(int64)
-            Batch of the sequences of stop token labels (B, Lmax).
-        olens : Tensor(int64)
-            Batch of the lengths of each target (B,).
-        Returns
-        ----------
-        Tensor
-            L1 loss value.
-        Tensor
-            Mean square error loss value.
-        Tensor
-            Binary cross entropy loss value.
+
+        Args:
+            after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim).
+            before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim).
+            logits(Tensor): Batch of stop logits (B, Lmax).
+            ys(Tensor): Batch of padded target features (B, Lmax, odim).
+            stop_labels(Tensor(int64)): Batch of the sequences of stop token labels (B, Lmax).
+            olens(Tensor(int64)): Batch of the lengths of each target (B,).
+
+        Returns:
+            Tensor: L1 loss value.
+            Tensor: Mean square error loss value.
+            Tensor: Binary cross entropy loss value.
        """
        # make mask and apply it
        if self.use_masking:
@@ -513,28 +472,20 @@ def stft(x,
         center=True,
         pad_mode='reflect'):
    """Perform STFT and convert to magnitude spectrogram.
-    Parameters
-    ----------
-    x : Tensor
-        Input signal tensor (B, T).
-    fft_size : int
-        FFT size.
-    hop_size : int
-        Hop size.
-    win_length : int
-    window : str, optional
-    window : str
-        Name of window function, see `scipy.signal.get_window` for more
-        details. Defaults to "hann".
-    center : bool, optional
-        center (bool, optional): Whether to pad `x` to make that the
-        :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`.
-    pad_mode : str, optional
-        Choose padding pattern when `center` is `True`.
-    Returns
-    ----------
-    Tensor:
-        Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
+
+    Args:
+        x(Tensor): Input signal tensor (B, T).
+        fft_size(int): FFT size.
+        hop_size(int): Hop size.
+        win_length(int, optional): Window length. (Default value = None)
+        window(str, optional): Name of window function, see `scipy.signal.get_window` for more
+            details. Defaults to "hann".
+        center(bool, optional): Whether to pad `x` to make that the
+            :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`.
+        pad_mode(str, optional): Choose padding pattern when `center` is `True`.
+            (Default value = 'reflect')
+
+    Returns:
+        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
    """
    # calculate window
    window = signal.get_window(window, win_length, fftbins=True)
@@ -564,16 +515,11 @@ class SpectralConvergenceLoss(nn.Layer):
    def forward(self, x_mag, y_mag):
        """Calculate forward propagation.

-        Parameters
-        ----------
-        x_mag : Tensor
-            Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
-        y_mag : Tensor)
-            Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
-        Returns
-        ----------
-        Tensor
-            Spectral convergence loss value.
+        Args:
+            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+            y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
+        Returns:
+            Tensor: Spectral convergence loss value.
        """
        return paddle.norm(
            y_mag - x_mag, p="fro") / paddle.clip(
@@ -590,16 +536,11 @@ class LogSTFTMagnitudeLoss(nn.Layer):
    def forward(self, x_mag, y_mag):
        """Calculate forward propagation.

-        Parameters
-        ----------
-        x_mag : Tensor
-            Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
-        y_mag : Tensor
-            Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
-        Returns
-        ----------
-        Tensor
-            Log STFT magnitude loss value.
+        Args:
+            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+            y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
+        Returns:
+            Tensor: Log STFT magnitude loss value.
        """
        return F.l1_loss(
            paddle.log(paddle.clip(y_mag, min=self.epsilon)),
@@ -625,18 +566,12 @@ class STFTLoss(nn.Layer):
    def forward(self, x, y):
        """Calculate forward propagation.

-        Parameters
-        ----------
-        x : Tensor
-            Predicted signal (B, T).
-        y : Tensor
-            Groundtruth signal (B, T).
-        Returns
-        ----------
-        Tensor
-            Spectral convergence loss value.
-        Tensor
-            Log STFT magnitude loss value.
+        Args:
+            x (Tensor): Predicted signal (B, T).
+ y (Tensor): Groundtruth signal (B, T). + Returns: + Tensor: Spectral convergence loss value. + Tensor: Log STFT magnitude loss value. """ x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) @@ -658,16 +593,11 @@ class MultiResolutionSTFTLoss(nn.Layer): win_lengths=[600, 1200, 240], window="hann", ): """Initialize Multi resolution STFT loss module. - Parameters - ---------- - fft_sizes : list - List of FFT sizes. - hop_sizes : list - List of hop sizes. - win_lengths : list - List of window lengths. - window : str - Window function type. + Args: + fft_sizes (list): List of FFT sizes. + hop_sizes (list): List of hop sizes. + win_lengths (list): List of window lengths. + window (str): Window function type. """ super().__init__() assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) @@ -677,18 +607,13 @@ class MultiResolutionSTFTLoss(nn.Layer): def forward(self, x, y): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Predicted signal (B, T) or (B, #subband, T). - y : Tensor - Groundtruth signal (B, T) or (B, #subband, T). - Returns - ---------- - Tensor - Multi resolution spectral convergence loss value. - Tensor - Multi resolution log STFT magnitude loss value. + + Args: + x (Tensor): Predicted signal (B, T) or (B, #subband, T). + y (Tensor): Groundtruth signal (B, T) or (B, #subband, T). + Returns: + Tensor: Multi resolution spectral convergence loss value. + Tensor: Multi resolution log STFT magnitude loss value. """ if len(x.shape) == 3: # (B, C, T) -> (B x C, T) @@ -725,14 +650,10 @@ class GeneratorAdversarialLoss(nn.Layer): def forward(self, outputs): """Calcualate generator adversarial loss. - Parameters - ---------- - outputs: Tensor or List - Discriminator outputs or list of discriminator outputs. - Returns - ---------- - Tensor - Generator adversarial loss value. + Args: + outputs (Tensor or List): Discriminator outputs or list of discriminator outputs. + Returns: + Tensor: Generator adversarial loss value. """ if isinstance(outputs, (tuple, list)): adv_loss = 0.0 @@ -772,20 +693,15 @@ class DiscriminatorAdversarialLoss(nn.Layer): def forward(self, outputs_hat, outputs): """Calcualate discriminator adversarial loss. - Parameters - ---------- - outputs_hat : Tensor or list - Discriminator outputs or list of - discriminator outputs calculated from generator outputs. - outputs : Tensor or list - Discriminator outputs or list of - discriminator outputs calculated from groundtruth. - Returns - ---------- - Tensor - Discriminator real loss value. - Tensor - Discriminator fake loss value. + + Args: + outputs_hat (Tensor or list): Discriminator outputs or list of + discriminator outputs calculated from generator outputs. + outputs (Tensor or list): Discriminator outputs or list of + discriminator outputs calculated from groundtruth. + Returns: + Tensor: Discriminator real loss value. + Tensor: Discriminator fake loss value. """ if isinstance(outputs, (tuple, list)): real_loss = 0.0 @@ -868,17 +784,13 @@ def ssim(img1, img2, window_size=11, size_average=True): def weighted_mean(input, weight): """Weighted mean. It can also be used as masked mean. - Parameters - ----------- - input : Tensor - The input tensor. - weight : Tensor - The weight tensor with broadcastable shape with the input. - - Returns - ---------- - Tensor [shape=(1,)] - Weighted mean tensor with the same dtype as input. + Args: + input(Tensor): The input tensor. + weight(Tensor): The weight tensor with broadcastable shape with the input. 
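An illustrative usage sketch of the multi-resolution STFT criterion documented above, assuming the class definitions in this file (a hedged example, not part of the patch):

    import paddle

    criterion = MultiResolutionSTFTLoss()      # default FFT/hop/window sizes
    y_hat = paddle.randn([4, 16000])           # predicted waveforms (B, T)
    y = paddle.randn([4, 16000])               # reference waveforms (B, T)
    sc_loss, mag_loss = criterion(y_hat, y)    # one term per resolution, averaged
    loss = sc_loss + mag_loss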
+
+        Returns:
+            Tensor: Weighted mean tensor with the same dtype as input. shape=(1,)
+
        """
        weight = paddle.cast(weight, input.dtype)
        # paddle.Tensor.size is different with torch.size() and has been overrided in s2t.__init__
@@ -889,20 +801,15 @@ def weighted_mean(input, weight):
 def masked_l1_loss(prediction, target, mask):
    """Compute maksed L1 loss.

-    Parameters
-    ----------
-    prediction : Tensor
-        The prediction.
-    target : Tensor
-        The target. The shape should be broadcastable to ``prediction``.
-    mask : Tensor
-        The mask. The shape should be broadcatable to the broadcasted shape of
-        ``prediction`` and ``target``.
-
-    Returns
-    -------
-    Tensor [shape=(1,)]
-        The masked L1 loss.
+    Args:
+        prediction(Tensor): The prediction.
+        target(Tensor): The target. The shape should be broadcastable to ``prediction``.
+        mask(Tensor): The mask. The shape should be broadcastable to the broadcasted shape of
+            ``prediction`` and ``target``.
+
+    Returns:
+        Tensor: The masked L1 loss. shape=(1,)
+
    """
    abs_error = F.l1_loss(prediction, target, reduction='none')
    loss = weighted_mean(abs_error, mask)
@@ -975,14 +882,11 @@ class MelSpectrogram(nn.Layer):
    def forward(self, x):
        """Calculate Mel-spectrogram.

-        Parameters
-        ----------
-        x : Tensor
-            Input waveform tensor (B, T) or (B, 1, T).
-        Returns
-        ----------
-        Tensor
-            Mel-spectrogram (B, #mels, #frames).
+        Args:
+            x (Tensor): Input waveform tensor (B, T) or (B, 1, T).
+        Returns:
+            Tensor: Mel-spectrogram (B, #mels, #frames).
        """
        if len(x.shape) == 3:
            # (B, C, T) -> (B*C, T)
@@ -1047,16 +951,12 @@ class MelSpectrogramLoss(nn.Layer):
    def forward(self, y_hat, y):
        """Calculate Mel-spectrogram loss.

-        Parameters
-        ----------
-        y_hat : Tensor
-            Generated single tensor (B, 1, T).
-        y : Tensor
-            Groundtruth single tensor (B, 1, T).
-        Returns
-        ----------
-        Tensor
-            Mel-spectrogram loss value.
+        Args:
+            y_hat(Tensor): Generated signal tensor (B, 1, T).
+            y(Tensor): Groundtruth signal tensor (B, 1, T).
+
+        Returns:
+            Tensor: Mel-spectrogram loss value.
        """
        mel_hat = self.mel_spectrogram(y_hat)
        mel = self.mel_spectrogram(y)
@@ -1081,18 +981,14 @@ class FeatureMatchLoss(nn.Layer):
    def forward(self, feats_hat, feats):
        """Calcualate feature matching loss.

-        Parameters
-        ----------
-        feats_hat : list
-            List of list of discriminator outputs
-            calcuated from generater outputs.
-        feats : list
-            List of list of discriminator outputs
-            calcuated from groundtruth.
-        Returns
-        ----------
-        Tensor
-            Feature matching loss value.
+
+        Args:
+            feats_hat(list): List of list of discriminator outputs
+                calculated from generator outputs.
+            feats(list): List of list of discriminator outputs
+                calculated from groundtruth.
+
+        Returns:
+            Tensor: Feature matching loss value.
        """
        feat_match_loss = 0.0
diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py
index 3822b33d..4207d316 100644
--- a/paddlespeech/t2s/modules/nets_utils.py
+++ b/paddlespeech/t2s/modules/nets_utils.py
@@ -20,27 +20,21 @@ from typeguard import check_argument_types

 def pad_list(xs, pad_value):
    """Perform padding for the list of tensors.

-    Parameters
-    ----------
-    xs : List[Tensor]
-        List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
-    pad_value : float)
-        Value for padding.
-
-    Returns
-    ----------
-    Tensor
-        Padded tensor (B, Tmax, `*`).
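A short sketch of the masked-mean pattern behind weighted_mean and masked_l1_loss above (illustrative values, assuming only the paddle API): the elementwise L1 error is averaged over unmasked positions only, so padding never contributes.

    import paddle
    import paddle.nn.functional as F

    pred = paddle.randn([2, 5])
    target = paddle.randn([2, 5])
    mask = paddle.to_tensor([[1., 1., 1., 0., 0.],
                             [1., 1., 1., 1., 1.]])
    abs_err = F.l1_loss(pred, target, reduction="none")
    loss = (abs_err * mask).sum() / mask.sum()   # scalar, padded cells excluded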
-
-    Examples
-    ----------
-    >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
-    >>> x
-    [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
-    >>> pad_list(x, 0)
-    tensor([[1., 1., 1., 1.],
-            [1., 1., 0., 0.],
-            [1., 0., 0., 0.]])
+    Args:
+        xs (List[Tensor]): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
+        pad_value (float): Value for padding.
+
+    Returns:
+        Tensor: Padded tensor (B, Tmax, `*`).
+
+    Examples:
+        >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
+        >>> x
+        [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
+        >>> pad_list(x, 0)
+        tensor([[1., 1., 1., 1.],
+                [1., 1., 0., 0.],
+                [1., 0., 0., 0.]])
    """
    n_batch = len(xs)
    max_len = max(x.shape[0] for x in xs)
@@ -55,25 +49,20 @@ def pad_list(xs, pad_value):
 def make_pad_mask(lengths, length_dim=-1):
    """Make mask tensor containing indices of padded part.

-    Parameters
-    ----------
-    lengths : LongTensor
-        Batch of lengths (B,).
-
-    Returns
-    ----------
-    Tensor(bool)
-        Mask tensor containing indices of padded part bool.
-
-    Examples
-    ----------
-    With only lengths.
-
-    >>> lengths = [5, 3, 2]
-    >>> make_non_pad_mask(lengths)
-    masks = [[0, 0, 0, 0 ,0],
-             [0, 0, 0, 1, 1],
-             [0, 0, 1, 1, 1]]
+    Args:
+        lengths (Tensor(int64)): Batch of lengths (B,).
+
+    Returns:
+        Tensor(bool): Mask tensor containing indices of padded part.
+
+    Examples:
+        With only lengths.
+
+        >>> lengths = [5, 3, 2]
+        >>> make_pad_mask(lengths)
+        masks = [[0, 0, 0, 0, 0],
+                 [0, 0, 0, 1, 1],
+                 [0, 0, 1, 1, 1]]
    """
    if length_dim == 0:
        raise ValueError("length_dim cannot be 0: {}".format(length_dim))
@@ -91,31 +80,24 @@ def make_pad_mask(lengths, length_dim=-1):
 def make_non_pad_mask(lengths, length_dim=-1):
    """Make mask tensor containing indices of non-padded part.

-    Parameters
-    ----------
-    lengths : LongTensor or List
-        Batch of lengths (B,).
-    xs : Tensor, optional
-        The reference tensor.
-        If set, masks will be the same shape as this tensor.
-    length_dim : int, optional
-        Dimension indicator of the above tensor.
-        See the example.
-
-    Returns
-    ----------
-    Tensor(bool)
-        mask tensor containing indices of padded part bool.
-
-    Examples
-    ----------
-    With only lengths.
-
-    >>> lengths = [5, 3, 2]
-    >>> make_non_pad_mask(lengths)
-    masks = [[1, 1, 1, 1 ,1],
-             [1, 1, 1, 0, 0],
-             [1, 1, 0, 0, 0]]
+    Args:
+        lengths (Tensor(int64) or List): Batch of lengths (B,).
+        xs (Tensor, optional): The reference tensor.
+            If set, masks will be the same shape as this tensor.
+        length_dim (int, optional): Dimension indicator of the above tensor.
+            See the example.
+
+    Returns:
+        Tensor(bool): Mask tensor containing indices of non-padded part.
+
+    Examples:
+        With only lengths.
+
+        >>> lengths = [5, 3, 2]
+        >>> make_non_pad_mask(lengths)
+        masks = [[1, 1, 1, 1, 1],
+                 [1, 1, 1, 0, 0],
+                 [1, 1, 0, 0, 0]]
    """
    return paddle.logical_not(make_pad_mask(lengths, length_dim))
@@ -127,12 +109,9 @@ def initialize(model: nn.Layer, init: str):

    Custom initialization routines can be implemented into submodules

-    Parameters
-    ----------
-    model : nn.Layer
-        Target.
-    init : str
-        Method of initialization.
+    Args:
+        model (nn.Layer): Target.
+        init (str): Method of initialization.
    """
    assert check_argument_types()
diff --git a/paddlespeech/t2s/modules/pqmf.py b/paddlespeech/t2s/modules/pqmf.py
index fb850a4d..9860da90 100644
--- a/paddlespeech/t2s/modules/pqmf.py
+++ b/paddlespeech/t2s/modules/pqmf.py
@@ -24,20 +24,16 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0):
    """Design prototype filter for PQMF.
    This method is based on `A Kaiser window approach for the design of prototype
    filters of cosine modulated filterbanks`_.
-    Parameters
-    ----------
-    taps : int
-        The number of filter taps.
-    cutoff_ratio : float
-        Cut-off frequency ratio.
-    beta : float
-        Beta coefficient for kaiser window.
-    Returns
-    ----------
-    ndarray
-        Impluse response of prototype filter (taps + 1,).
-    .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
-        https://ieeexplore.ieee.org/abstract/document/681427
+
+    Args:
+        taps (int): The number of filter taps.
+        cutoff_ratio (float): Cut-off frequency ratio.
+        beta (float): Beta coefficient for kaiser window.
+    Returns:
+        ndarray:
+            Impulse response of prototype filter (taps + 1,).
+    .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
+        https://ieeexplore.ieee.org/abstract/document/681427
    """
    # check the arguments are valid
    assert taps % 2 == 0, "The number of taps mush be even number."
@@ -68,16 +64,12 @@ class PQMF(nn.Layer):
        """Initilize PQMF module.
        The cutoff_ratio and beta parameters are optimized for #subbands = 4.
        See dicussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195.
-        Parameters
-        ----------
-        subbands : int
-            The number of subbands.
-        taps : int
-            The number of filter taps.
-        cutoff_ratio : float
-            Cut-off frequency ratio.
-        beta : float
-            Beta coefficient for kaiser window.
+
+        Args:
+            subbands (int): The number of subbands.
+            taps (int): The number of filter taps.
+            cutoff_ratio (float): Cut-off frequency ratio.
+            beta (float): Beta coefficient for kaiser window.
        """
        super().__init__()
@@ -110,28 +102,20 @@ class PQMF(nn.Layer):
    def analysis(self, x):
        """Analysis with PQMF.

-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, 1, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, subbands, T // subbands).
+        Args:
+            x (Tensor): Input tensor (B, 1, T).
+        Returns:
+            Tensor: Output tensor (B, subbands, T // subbands).
        """
        x = F.conv1d(self.pad_fn(x), self.analysis_filter)
        return F.conv1d(x, self.updown_filter, stride=self.subbands)

    def synthesis(self, x):
        """Synthesis with PQMF.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, subbands, T // subbands).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, 1, T).
+        Args:
+            x (Tensor): Input tensor (B, subbands, T // subbands).
+        Returns:
+            Tensor: Output tensor (B, 1, T).
        """
        x = F.conv1d_transpose(
            x, self.updown_filter * self.subbands, stride=self.subbands)
diff --git a/paddlespeech/t2s/modules/predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py
index 6b7c6a6b..33ed575b 100644
--- a/paddlespeech/t2s/modules/predictor/duration_predictor.py
+++ b/paddlespeech/t2s/modules/predictor/duration_predictor.py
@@ -49,20 +49,13 @@ class DurationPredictor(nn.Layer):
                 offset=1.0):
        """Initilize duration predictor module.

-        Parameters
-        ----------
-        idim : int
-            Input dimension.
-        n_layers : int, optional
-            Number of convolutional layers.
-        n_chans : int, optional
-            Number of channels of convolutional layers.
-        kernel_size : int, optional
-            Kernel size of convolutional layers.
-        dropout_rate : float, optional
-            Dropout rate.
-        offset : float, optional
-            Offset value to avoid nan in log domain.
+        Args:
+            idim (int): Input dimension.
+            n_layers (int, optional): Number of convolutional layers.
+            n_chans (int, optional): Number of channels of convolutional layers.
+            kernel_size (int, optional): Kernel size of convolutional layers.
+            dropout_rate (float, optional): Dropout rate.
+            offset (float, optional): Offset value to avoid nan in log domain.
        """
        super().__init__()
@@ -105,35 +98,23 @@ class DurationPredictor(nn.Layer):
    def forward(self, xs, x_masks=None):
        """Calculate forward propagation.

+        Args:
+            xs(Tensor): Batch of input sequences (B, Tmax, idim).
+            x_masks(ByteTensor, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)

-        Parameters
-        ----------
-        xs : Tensor
-            Batch of input sequences (B, Tmax, idim).
-        x_masks : ByteTensor, optional
-            Batch of masks indicating padded part (B, Tmax).
-
-        Returns
-        ----------
-        Tensor
-            Batch of predicted durations in log domain (B, Tmax).
+        Returns:
+            Tensor: Batch of predicted durations in log domain (B, Tmax).
        """
        return self._forward(xs, x_masks, False)

    def inference(self, xs, x_masks=None):
        """Inference duration.

+        Args:
+            xs(Tensor): Batch of input sequences (B, Tmax, idim).
+            x_masks(Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)

-        Parameters
-        ----------
-        xs : Tensor
-            Batch of input sequences (B, Tmax, idim).
-        x_masks : Tensor(bool), optional
-            Batch of masks indicating padded part (B, Tmax).
-
-        Returns
-        ----------
-        Tensor
-            Batch of predicted durations in linear domain int64 (B, Tmax).
+        Returns:
+            Tensor: Batch of predicted durations in linear domain int64 (B, Tmax).
        """
        return self._forward(xs, x_masks, True)
@@ -147,13 +128,9 @@ class DurationPredictorLoss(nn.Layer):
    def __init__(self, offset=1.0, reduction="mean"):
        """Initilize duration predictor loss module.
-
-        Parameters
-        ----------
-        offset : float, optional
-            Offset value to avoid nan in log domain.
-        reduction : str
-            Reduction type in loss calculation.
+        Args:
+            offset (float, optional): Offset value to avoid nan in log domain.
+            reduction (str): Reduction type in loss calculation.
        """
        super().__init__()
        self.criterion = nn.MSELoss(reduction=reduction)
@@ -162,21 +139,15 @@ class DurationPredictorLoss(nn.Layer):
    def forward(self, outputs, targets):
        """Calculate forward propagation.

-        Parameters
-        ----------
-        outputs : Tensor
-            Batch of prediction durations in log domain (B, T)
-        targets : Tensor
-            Batch of groundtruth durations in linear domain (B, T)
-
-        Returns
-        ----------
-        Tensor
-            Mean squared error loss value.
-
-        Note
-        ----------
-        `outputs` is in log domain but `targets` is in linear domain.
+        Args:
+            outputs(Tensor): Batch of prediction durations in log domain (B, T)
+            targets(Tensor): Batch of groundtruth durations in linear domain (B, T)
+
+        Returns:
+            Tensor: Mean squared error loss value.
+
+        Note:
+            `outputs` is in log domain but `targets` is in linear domain.
        """
        # NOTE: outputs is in log domain while targets in linear
        targets = paddle.log(targets.cast(dtype='float32') + self.offset)
diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py
index 9510dd88..62d707d2 100644
--- a/paddlespeech/t2s/modules/predictor/length_regulator.py
+++ b/paddlespeech/t2s/modules/predictor/length_regulator.py
@@ -35,10 +35,8 @@ class LengthRegulator(nn.Layer):
    def __init__(self, pad_value=0.0):
        """Initilize length regulator module.

-        Parameters
-        ----------
-        pad_value : float, optional
-            Value used for padding.
+        Args:
+            pad_value (float, optional): Value used for padding.
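The log-domain convention noted above is easy to get backwards, so here is a minimal sketch of the transform (illustrative values, assuming only the paddle API): predictions live in log space, ground-truth durations in linear space, and the loss logs the targets with the same offset before the MSE.

    import paddle

    offset = 1.0
    targets = paddle.to_tensor([3.0, 5.0, 2.0])          # linear-domain durations
    log_targets = paddle.log(targets + offset)           # what the MSE compares against
    durations = paddle.round(paddle.exp(log_targets) - offset)  # inference-time inverse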
""" super().__init__() @@ -90,19 +88,13 @@ class LengthRegulator(nn.Layer): def forward(self, xs, ds, alpha=1.0, is_inference=False): """Calculate forward propagation. - Parameters - ---------- - xs : Tensor - Batch of sequences of char or phoneme embeddings (B, Tmax, D). - ds : Tensor(int64) - Batch of durations of each frame (B, T). - alpha : float, optional - Alpha value to control speed of speech. + Args: + xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D). + ds (Tensor(int64)): Batch of durations of each frame (B, T). + alpha (float, optional): Alpha value to control speed of speech. - Returns - ---------- - Tensor - replicated input tensor based on durations (B, T*, D). + Returns: + Tensor: replicated input tensor based on durations (B, T*, D). """ if alpha != 1.0: diff --git a/paddlespeech/t2s/modules/predictor/variance_predictor.py b/paddlespeech/t2s/modules/predictor/variance_predictor.py index 417fca82..8afbf257 100644 --- a/paddlespeech/t2s/modules/predictor/variance_predictor.py +++ b/paddlespeech/t2s/modules/predictor/variance_predictor.py @@ -42,18 +42,12 @@ class VariancePredictor(nn.Layer): dropout_rate: float=0.5, ): """Initilize duration predictor module. - Parameters - ---------- - idim : int - Input dimension. - n_layers : int, optional - Number of convolutional layers. - n_chans : int, optional - Number of channels of convolutional layers. - kernel_size : int, optional - Kernel size of convolutional layers. - dropout_rate : float, optional - Dropout rate. + Args: + idim (int): Input dimension. + n_layers (int, optional): Number of convolutional layers. + n_chans (int, optional): Number of channels of convolutional layers. + kernel_size (int, optional): Kernel size of convolutional layers. + dropout_rate (float, optional): Dropout rate. """ assert check_argument_types() super().__init__() @@ -79,17 +73,12 @@ class VariancePredictor(nn.Layer): x_masks: paddle.Tensor=None) -> paddle.Tensor: """Calculate forward propagation. - Parameters - ---------- - xs : Tensor - Batch of input sequences (B, Tmax, idim). - x_masks : Tensor(bool), optional - Batch of masks indicating padded part (B, Tmax, 1). + Args: + xs (Tensor): Batch of input sequences (B, Tmax, idim). + x_masks (Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax, 1). - Returns - ---------- - Tensor - Batch of predicted sequences (B, Tmax, 1). + Returns: + Tensor: Batch of predicted sequences (B, Tmax, 1). """ # (B, idim, Tmax) xs = xs.transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/residual_block.py b/paddlespeech/t2s/modules/residual_block.py index a96a8946..efbfce27 100644 --- a/paddlespeech/t2s/modules/residual_block.py +++ b/paddlespeech/t2s/modules/residual_block.py @@ -28,26 +28,16 @@ class WaveNetResidualBlock(nn.Layer): unit and parametric redidual and skip connections. For more details, refer to `WaveNet: A Generative Model for Raw Audio `_. - Parameters - ---------- - kernel_size : int, optional - Kernel size of the 1D convolution, by default 3 - residual_channels : int, optional - Feature size of the resiaudl output(and also the input), by default 64 - gate_channels : int, optional - Output feature size of the 1D convolution, by default 128 - skip_channels : int, optional - Feature size of the skip output, by default 64 - aux_channels : int, optional - Feature size of the auxiliary input (e.g. spectrogram), by default 80 - dropout : float, optional - Probability of the dropout before the 1D convolution, by default 0. 
-    dilation : int, optional
-        Dilation of the 1D convolution, by default 1
-    bias : bool, optional
-        Whether to use bias in the 1D convolution, by default True
-    use_causal_conv : bool, optional
-        Whether to use causal padding for the 1D convolution, by default False
+    Args:
+        kernel_size (int, optional): Kernel size of the 1D convolution, by default 3
+        residual_channels (int, optional): Feature size of the residual output (and also the input), by default 64
+        gate_channels (int, optional): Output feature size of the 1D convolution, by default 128
+        skip_channels (int, optional): Feature size of the skip output, by default 64
+        aux_channels (int, optional): Feature size of the auxiliary input (e.g. spectrogram), by default 80
+        dropout (float, optional): Probability of the dropout before the 1D convolution, by default 0.
+        dilation (int, optional): Dilation of the 1D convolution, by default 1
+        bias (bool, optional): Whether to use bias in the 1D convolution, by default True
+        use_causal_conv (bool, optional): Whether to use causal padding for the 1D convolution, by default False
    """

    def __init__(self,
@@ -90,21 +80,15 @@ class WaveNetResidualBlock(nn.Layer):
    def forward(self, x, c):
        """
-        Parameters
-        ----------
-        x : Tensor
-            Shape (N, C_res, T), the input features.
-        c : Tensor
-            Shape (N, C_aux, T), the auxiliary input.
-
-        Returns
-        -------
-        res : Tensor
-            Shape (N, C_res, T), the residual output, which is used as the
-            input of the next ResidualBlock in a stack of ResidualBlocks.
-        skip : Tensor
-            Shape (N, C_skip, T), the skip output, which is collected among
-            each layer in a stack of ResidualBlocks.
+        Args:
+            x (Tensor): the input features. Shape (N, C_res, T)
+            c (Tensor): the auxiliary input. Shape (N, C_aux, T)
+
+        Returns:
+            res (Tensor): Shape (N, C_res, T), the residual output, which is used as the
+                input of the next ResidualBlock in a stack of ResidualBlocks.
+            skip (Tensor): Shape (N, C_skip, T), the skip output, which is collected among
+                each layer in a stack of ResidualBlocks.
        """
        x_input = x
        x = F.dropout(x, self.dropout, training=self.training)
@@ -136,22 +120,14 @@ class HiFiGANResidualBlock(nn.Layer):
            nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.1}, ):
        """Initialize HiFiGANResidualBlock module.
-        Parameters
-        ----------
-        kernel_size : int
-            Kernel size of dilation convolution layer.
-        channels : int
-            Number of channels for convolution layer.
-        dilations : List[int]
-            List of dilation factors.
-        use_additional_convs : bool
-            Whether to use additional convolution layers.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        nonlinear_activation : str
-            Activation function module name.
-        nonlinear_activation_params : dict
-            Hyperparameters for activation function.
+        Args:
+            kernel_size (int): Kernel size of dilation convolution layer.
+            channels (int): Number of channels for convolution layer.
+            dilations (List[int]): List of dilation factors.
+            use_additional_convs (bool): Whether to use additional convolution layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
        """
        super().__init__()
@@ -190,14 +166,10 @@ class HiFiGANResidualBlock(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, channels, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, channels, T).
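A short sketch of the gated activation unit inside the WaveNet residual block above (illustrative shapes, assuming only the paddle API): the dilated-conv output is split along channels into two halves and gated elementwise.

    import paddle
    import paddle.nn.functional as F

    conv_out = paddle.randn([2, 128, 100])       # (B, gate_channels, T)
    xa, xb = paddle.split(conv_out, 2, axis=1)   # two (B, 64, T) halves
    z = paddle.tanh(xa) * F.sigmoid(xb)          # gated output fed to the 1x1 convs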
+        Args:
+            x (Tensor): Input tensor (B, channels, T).
+        Returns:
+            Tensor: Output tensor (B, channels, T).
        """
        for idx in range(len(self.convs1)):
            xt = self.convs1[idx](x)
diff --git a/paddlespeech/t2s/modules/residual_stack.py b/paddlespeech/t2s/modules/residual_stack.py
index c885dfe9..0d949b56 100644
--- a/paddlespeech/t2s/modules/residual_stack.py
+++ b/paddlespeech/t2s/modules/residual_stack.py
@@ -37,26 +37,17 @@ class ResidualStack(nn.Layer):
            pad_params: Dict[str, Any]={"mode": "reflect"},
            use_causal_conv: bool=False, ):
        """Initialize ResidualStack module.
-        Parameters
-        ----------
-        kernel_size : int
-            Kernel size of dilation convolution layer.
-        channels : int
-            Number of channels of convolution layers.
-        dilation : int
-            Dilation factor.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        nonlinear_activation : str
-            Activation function module name.
-        nonlinear_activation_params : Dict[str,Any]
-            Hyperparameters for activation function.
-        pad : str
-            Padding function module name before dilated convolution layer.
-        pad_params : Dict[str, Any]
-            Hyperparameters for padding function.
-        use_causal_conv : bool
-            Whether to use causal convolution.
+
+        Args:
+            kernel_size (int): Kernel size of dilation convolution layer.
+            channels (int): Number of channels of convolution layers.
+            dilation (int): Dilation factor.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (Dict[str,Any]): Hyperparameters for activation function.
+            pad (str): Padding function module name before dilated convolution layer.
+            pad_params (Dict[str, Any]): Hyperparameters for padding function.
+            use_causal_conv (bool): Whether to use causal convolution.
        """
        super().__init__()
        # for compatibility
@@ -102,13 +93,10 @@ class ResidualStack(nn.Layer):
    def forward(self, c):
        """Calculate forward propagation.
-
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (B, channels, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, chennels, T).
+
+        Args:
+            c (Tensor): Input tensor (B, channels, T).
+        Returns:
+            Tensor: Output tensor (B, channels, T).
        """
        return self.stack(c) + self.skip_layer(c)
diff --git a/paddlespeech/t2s/modules/style_encoder.py b/paddlespeech/t2s/modules/style_encoder.py
index 9d4b83a2..49091eac 100644
--- a/paddlespeech/t2s/modules/style_encoder.py
+++ b/paddlespeech/t2s/modules/style_encoder.py
@@ -30,33 +30,21 @@ class StyleEncoder(nn.Layer):
    .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis`:
        https://arxiv.org/abs/1803.09017
-
-    Parameters
-    ----------
-    idim : int, optional
-        Dimension of the input mel-spectrogram.
-    gst_tokens : int, optional
-        The number of GST embeddings.
-    gst_token_dim : int, optional
-        Dimension of each GST embedding.
-    gst_heads : int, optional
-        The number of heads in GST multihead attention.
-    conv_layers : int, optional
-        The number of conv layers in the reference encoder.
-    conv_chans_list : Sequence[int], optional
-        List of the number of channels of conv layers in the referece encoder.
-    conv_kernel_size : int, optional
-        Kernal size of conv layers in the reference encoder.
-    conv_stride : int, optional
-        Stride size of conv layers in the reference encoder.
-    gru_layers : int, optional
-        The number of GRU layers in the reference encoder.
-    gru_units : int, optional
-        The number of GRU units in the reference encoder.
-
-    Todo
-    ----------
-    * Support manual weight specification in inference.
+
+    Args:
+        idim (int, optional): Dimension of the input mel-spectrogram.
+        gst_tokens (int, optional): The number of GST embeddings.
+        gst_token_dim (int, optional): Dimension of each GST embedding.
+        gst_heads (int, optional): The number of heads in GST multihead attention.
+        conv_layers (int, optional): The number of conv layers in the reference encoder.
+        conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in the reference encoder.
+        conv_kernel_size (int, optional): Kernel size of conv layers in the reference encoder.
+        conv_stride (int, optional): Stride size of conv layers in the reference encoder.
+        gru_layers (int, optional): The number of GRU layers in the reference encoder.
+        gru_units (int, optional): The number of GRU units in the reference encoder.
+
+    Todo:
+        * Support manual weight specification in inference.
    """

@@ -93,15 +81,11 @@ class StyleEncoder(nn.Layer):
    def forward(self, speech: paddle.Tensor) -> paddle.Tensor:
        """Calculate forward propagation.

-        Parameters
-        ----------
-        speech : Tensor
-            Batch of padded target features (B, Lmax, odim).
+        Args:
+            speech (Tensor): Batch of padded target features (B, Lmax, odim).

-        Returns
-        ----------
-        Tensor:
-            Style token embeddings (B, token_dim).
+        Returns:
+            Tensor: Style token embeddings (B, token_dim).
        """
        ref_embs = self.ref_enc(speech)
@@ -118,23 +102,15 @@ class ReferenceEncoder(nn.Layer):

    .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis`:
        https://arxiv.org/abs/1803.09017
-
-    Parameters
-    ----------
-    idim : int, optional
-        Dimension of the input mel-spectrogram.
-    conv_layers : int, optional
-        The number of conv layers in the reference encoder.
-    conv_chans_list: : Sequence[int], optional
-        List of the number of channels of conv layers in the referece encoder.
-    conv_kernel_size : int, optional
-        Kernal size of conv layers in the reference encoder.
-    conv_stride : int, optional
-        Stride size of conv layers in the reference encoder.
-    gru_layers : int, optional
-        The number of GRU layers in the reference encoder.
-    gru_units : int, optional
-        The number of GRU units in the reference encoder.
+
+    Args:
+        idim (int, optional): Dimension of the input mel-spectrogram.
+        conv_layers (int, optional): The number of conv layers in the reference encoder.
+        conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in the reference encoder.
+        conv_kernel_size (int, optional): Kernel size of conv layers in the reference encoder.
+        conv_stride (int, optional): Stride size of conv layers in the reference encoder.
+        gru_layers (int, optional): The number of GRU layers in the reference encoder.
+        gru_units (int, optional): The number of GRU units in the reference encoder.
    """

@@ -191,16 +167,11 @@ class ReferenceEncoder(nn.Layer):
    def forward(self, speech: paddle.Tensor) -> paddle.Tensor:
        """Calculate forward propagation.

+        Args:
+            speech (Tensor): Batch of padded target features (B, Lmax, idim).

-        Parameters
-        ----------
-        speech : Tensor
-            Batch of padded target features (B, Lmax, idim).
-
-        Return
-        ----------
-        Tensor
-            Reference embedding (B, gru_units)
+        Returns:
+            Tensor: Reference embedding (B, gru_units).
        """
        batch_size = speech.shape[0]
@@ -228,19 +199,12 @@ class StyleTokenLayer(nn.Layer):

    ..
    _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis`:
        https://arxiv.org/abs/1803.09017
-
-    Parameters
-    ----------
-    ref_embed_dim : int, optional
-        Dimension of the input reference embedding.
-    gst_tokens : int, optional
-        The number of GST embeddings.
-    gst_token_dim : int, optional
-        Dimension of each GST embedding.
-    gst_heads : int, optional
-        The number of heads in GST multihead attention.
-    dropout_rate : float, optional
-        Dropout rate in multi-head attention.
+    Args:
+        ref_embed_dim (int, optional): Dimension of the input reference embedding.
+        gst_tokens (int, optional): The number of GST embeddings.
+        gst_token_dim (int, optional): Dimension of each GST embedding.
+        gst_heads (int, optional): The number of heads in GST multihead attention.
+        dropout_rate (float, optional): Dropout rate in multi-head attention.
    """

@@ -271,15 +235,11 @@ class StyleTokenLayer(nn.Layer):
    def forward(self, ref_embs: paddle.Tensor) -> paddle.Tensor:
        """Calculate forward propagation.

-        Parameters
-        ----------
-        ref_embs : Tensor
-            Reference embeddings (B, ref_embed_dim).
+        Args:
+            ref_embs (Tensor): Reference embeddings (B, ref_embed_dim).

-        Returns
-        ----------
-        Tensor
-            Style token embeddings (B, gst_token_dim).
+        Returns:
+            Tensor: Style token embeddings (B, gst_token_dim).
        """
        batch_size = ref_embs.shape[0]
diff --git a/paddlespeech/t2s/modules/tacotron2/attentions.py b/paddlespeech/t2s/modules/tacotron2/attentions.py
index af7a94f3..a6fde742 100644
--- a/paddlespeech/t2s/modules/tacotron2/attentions.py
+++ b/paddlespeech/t2s/modules/tacotron2/attentions.py
@@ -30,21 +30,14 @@ def _apply_attention_constraint(e,
    introduced in `Deep Voice 3: Scaling Text-to-Speech with
    Convolutional Sequence Learning`_.

-    Parameters
-    ----------
-    e : Tensor
-        Attention energy before applying softmax (1, T).
-    last_attended_idx : int
-        The index of the inputs of the last attended [0, T].
-    backward_window : int, optional
-        Backward window size in attention constraint.
-    forward_window : int, optional
-        Forward window size in attetion constraint.
-
-    Returns
-    ----------
-    Tensor
-        Monotonic constrained attention energy (1, T).
+    Args:
+        e(Tensor): Attention energy before applying softmax (1, T).
+        last_attended_idx(int): The index of the inputs of the last attended [0, T].
+        backward_window(int, optional): Backward window size in attention constraint. (Default value = 1)
+        forward_window(int, optional): Forward window size in attention constraint. (Default value = 3)
+
+    Returns:
+        Tensor: Monotonic constrained attention energy (1, T).

    ..
    _`Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning`:
        https://arxiv.org/abs/1710.07654
@@ -67,20 +60,14 @@ class AttLoc(nn.Layer):
    Reference: Attention-Based Models for Speech Recognition
        (https://arxiv.org/pdf/1506.07503.pdf)

-    Parameters
-    ----------
-    eprojs : int
-        projection-units of encoder
-    dunits : int
-        units of decoder
-    att_dim : int
-        att_dim: attention dimension
-    aconv_chans : int
-        channels of attention convolution
-    aconv_filts : int
-        filter size of attention convolution
-    han_mode : bool
-        flag to swith on mode of hierarchical attention and not store pre_compute_enc_h
+
+    Args:
+        eprojs (int): projection-units of encoder
+        dunits (int): units of decoder
+        att_dim (int): attention dimension
+        aconv_chans (int): channels of attention convolution
+        aconv_filts (int): filter size of attention convolution
+        han_mode (bool): flag to switch on mode of hierarchical attention and not store pre_compute_enc_h
    """

    def __init__(self,
@@ -129,33 +116,19 @@ class AttLoc(nn.Layer):
                backward_window=1,
                forward_window=3, ):
        """Calculate AttLoc forward propagation.

-        Parameters
-        ----------
-        enc_hs_pad : paddle.Tensor
-            padded encoder hidden state (B, T_max, D_enc)
-        enc_hs_len : paddle.Tensor
-            padded encoder hidden state length (B)
-        dec_z : paddle.Tensor dec_z
-            decoder hidden state (B, D_dec)
-        att_prev : paddle.Tensor
-            previous attention weight (B, T_max)
-        scaling : float
-            scaling parameter before applying softmax
-        forward_window : paddle.Tensor
-            forward window size when constraining attention
-        last_attended_idx : int
-            index of the inputs of the last attended
-        backward_window : int
-            backward window size in attention constraint
-        forward_window : int
-            forward window size in attetion constraint
-
-        Returns
-        ----------
-        paddle.Tensor
-            attention weighted encoder state (B, D_enc)
-        paddle.Tensor
-            previous attention weights (B, T_max)
+        Args:
+            enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc)
+            enc_hs_len(Tensor): padded encoder hidden state length (B)
+            dec_z(Tensor): decoder hidden state (B, D_dec)
+            att_prev(Tensor): previous attention weight (B, T_max)
+            scaling(float, optional): scaling parameter before applying softmax (Default value = 2.0)
+            last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
+            backward_window(int, optional): backward window size in attention constraint (Default value = 1)
+            forward_window(int, optional): forward window size in attention constraint (Default value = 3)
+        Returns:
+            Tensor: attention weighted encoder state (B, D_enc)
+            Tensor: previous attention weights (B, T_max)
        """
        batch = paddle.shape(enc_hs_pad)[0]
        # pre-compute all h outside the decoder loop
@@ -217,19 +190,13 @@ class AttForward(nn.Layer):
    ----------
    Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
        (https://arxiv.org/pdf/1807.06736.pdf)
-
-    Parameters
-    ----------
-    eprojs : int
-        projection-units of encoder
-    dunits : int
-        units of decoder
-    att_dim : int
-        attention dimension
-    aconv_chans : int
-        channels of attention convolution
-    aconv_filts : int
-        filter size of attention convolution
+
+    Args:
+        eprojs (int): projection-units of encoder
+        dunits (int): units of decoder
+        att_dim (int): attention dimension
+        aconv_chans (int): channels of attention convolution
+        aconv_filts (int): filter size of attention convolution
    """

    def __init__(self, eprojs,
dunits, att_dim, aconv_chans, aconv_filts):
@@ -270,30 +237,20 @@ class AttForward(nn.Layer):
                backward_window=1,
                forward_window=3, ):
        """Calculate AttForward forward propagation.

-        Parameters
-        ----------
-        enc_hs_pad : paddle.Tensor
-            padded encoder hidden state (B, T_max, D_enc)
-        enc_hs_len : list
-            padded encoder hidden state length (B,)
-        dec_z : paddle.Tensor
-            decoder hidden state (B, D_dec)
-        att_prev : paddle.Tensor
-            attention weights of previous step (B, T_max)
-        scaling : float
-            scaling parameter before applying softmax
-        last_attended_idx : int
-            index of the inputs of the last attended
-        backward_window : int
-            backward window size in attention constraint
-        forward_window : int
-            forward window size in attetion constraint
-        Returns
-        ----------
-        paddle.Tensor
-            attention weighted encoder state (B, D_enc)
-        paddle.Tensor
-            previous attention weights (B, T_max)
+
+        Args:
+            enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc)
+            enc_hs_len(list): padded encoder hidden state length (B,)
+            dec_z(Tensor): decoder hidden state (B, D_dec)
+            att_prev(Tensor): attention weights of previous step (B, T_max)
+            scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0)
+            last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
+            backward_window(int, optional): backward window size in attention constraint (Default value = 1)
+            forward_window(int, optional): forward window size in attention constraint (Default value = 3)
+
+        Returns:
+            Tensor: attention weighted encoder state (B, D_enc)
+            Tensor: previous attention weights (B, T_max)
        """
        batch = len(enc_hs_pad)
        # pre-compute all h outside the decoder loop
@@ -359,24 +316,17 @@ class AttForward(nn.Layer):
 class AttForwardTA(nn.Layer):
    """Forward attention with transition agent module.
-    Reference
-    ----------
-    Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
-        (https://arxiv.org/pdf/1807.06736.pdf)
-    Parameters
-    ----------
-    eunits : int
-        units of encoder
-    dunits : int
-        units of decoder
-    att_dim : int
-        attention dimension
-    aconv_chans : int
-        channels of attention convolution
-    aconv_filts : int
-        filter size of attention convolution
-    odim : int
-        output dimension
+    Reference:
+        Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
+            (https://arxiv.org/pdf/1807.06736.pdf)
+
+    Args:
+        eunits (int): units of encoder
+        dunits (int): units of decoder
+        att_dim (int): attention dimension
+        aconv_chans (int): channels of attention convolution
+        aconv_filts (int): filter size of attention convolution
+        odim (int): output dimension
    """

    def __init__(self, eunits, dunits, att_dim, aconv_chans, aconv_filts, odim):
@@ -420,32 +370,21 @@ class AttForwardTA(nn.Layer):
                backward_window=1,
                forward_window=3, ):
        """Calculate AttForwardTA forward propagation.
-        Parameters
-        ----------
-        enc_hs_pad : paddle.Tensor
-            padded encoder hidden state (B, Tmax, eunits)
-        enc_hs_len : list paddle.Tensor
-            padded encoder hidden state length (B,)
-        dec_z : paddle.Tensor
-            decoder hidden state (B, dunits)
-        att_prev : paddle.Tensor
-            attention weights of previous step (B, T_max)
-        out_prev : paddle.Tensor
-            decoder outputs of previous step (B, odim)
-        scaling : float
-            scaling parameter before applying softmax
-        last_attended_idx : int
-            index of the inputs of the last attended
-        backward_window : int
-            backward window size in attention constraint
-        forward_window : int
-            forward window size in attetion constraint
-        Returns
-        ----------
-        paddle.Tensor
-            attention weighted encoder state (B, dunits)
-        paddle.Tensor
-            previous attention weights (B, Tmax)
+
+        Args:
+            enc_hs_pad(Tensor): padded encoder hidden state (B, Tmax, eunits)
+            enc_hs_len(list): padded encoder hidden state length (B,)
+            dec_z(Tensor): decoder hidden state (B, dunits)
+            att_prev(Tensor): attention weights of previous step (B, T_max)
+            out_prev(Tensor): decoder outputs of previous step (B, odim)
+            scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0)
+            last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
+            backward_window(int, optional): backward window size in attention constraint (Default value = 1)
+            forward_window(int, optional): forward window size in attention constraint (Default value = 3)
+
+        Returns:
+            Tensor: attention weighted encoder state (B, dunits)
+            Tensor: previous attention weights (B, Tmax)
        """
        batch = len(enc_hs_pad)
        # pre-compute all h outside the decoder loop
diff --git a/paddlespeech/t2s/modules/tacotron2/decoder.py b/paddlespeech/t2s/modules/tacotron2/decoder.py
index 0cfe0b84..ebdfa387 100644
--- a/paddlespeech/t2s/modules/tacotron2/decoder.py
+++ b/paddlespeech/t2s/modules/tacotron2/decoder.py
@@ -44,16 +44,11 @@ class Prenet(nn.Layer):
    def __init__(self, idim, n_layers=2, n_units=256, dropout_rate=0.5):
        """Initialize prenet module.

-        Parameters
-        ----------
-        idim : int
-            Dimension of the inputs.
-        odim : int
-            Dimension of the outputs.
-        n_layers : int, optional
-            The number of prenet layers.
-        n_units : int, optional
-            The number of prenet units.
+        Args:
+            idim (int): Dimension of the inputs.
+            n_layers (int, optional): The number of prenet layers.
+            n_units (int, optional): The number of prenet units.
+            dropout_rate (float, optional): Dropout rate.
        """
        super().__init__()
        self.dropout_rate = dropout_rate
@@ -66,15 +61,11 @@ class Prenet(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.

-        Parameters
-        ----------
-        x : Tensor
-            Batch of input tensors (B, ..., idim).
+        Args:
+            x (Tensor): Batch of input tensors (B, ..., idim).

-        Returns
-        ----------
-        Tensor
-            Batch of output tensors (B, ..., odim).
+        Returns:
+            Tensor: Batch of output tensors (B, ..., odim).
        """
        for i in range(len(self.prenet)):
@@ -109,22 +100,14 @@ class Postnet(nn.Layer):
            use_batch_norm=True, ):
        """Initialize postnet module.

-        Parameters
-        ----------
-        idim : int
-            Dimension of the inputs.
-        odim : int
-            Dimension of the outputs.
-        n_layers : int, optional
-            The number of layers.
-        n_filts : int, optional
-            The number of filter size.
-        n_units : int, optional
-            The number of filter channels.
-        use_batch_norm : bool, optional
-            Whether to use batch normalization..
-        dropout_rate : float, optional
-            Dropout rate..
+        Args:
+            idim (int): Dimension of the inputs.
+            odim (int): Dimension of the outputs.
+            n_layers (int, optional): The number of layers.
+            n_filts (int, optional): The filter size.
+            n_units (int, optional): The number of filter channels.
+            use_batch_norm (bool, optional): Whether to use batch normalization.
+            dropout_rate (float, optional): Dropout rate.
         """
         super().__init__()
         self.postnet = nn.LayerList()
@@ -184,16 +167,10 @@ class Postnet(nn.Layer):
     def forward(self, xs):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        xs : Tensor
-            Batch of the sequences of padded input tensors (B, idim, Tmax).
-
-        Returns
-        ----------
-        Tensor
-            Batch of padded output tensor. (B, odim, Tmax).
-
+        Args:
+            xs (Tensor): Batch of the sequences of padded input tensors (B, idim, Tmax).
+        Returns:
+            Tensor: Batch of padded output tensor. (B, odim, Tmax).
         """
         for i in range(len(self.postnet)):
             xs = self.postnet[i](xs)
@@ -217,13 +194,11 @@ class ZoneOutCell(nn.Layer):
     def __init__(self, cell, zoneout_rate=0.1):
         """Initialize zone out cell module.
-        Parameters
-        ----------
-        cell : nn.Layer:
-            Paddle recurrent cell module
-            e.g. `paddle.nn.LSTMCell`.
-        zoneout_rate : float, optional
-            Probability of zoneout from 0.0 to 1.0.
+
+        Args:
+            cell (nn.Layer): Paddle recurrent cell module
+                e.g. `paddle.nn.LSTMCell`.
+            zoneout_rate (float, optional): Probability of zoneout from 0.0 to 1.0.
         """
         super().__init__()
         self.cell = cell
@@ -235,20 +210,18 @@ class ZoneOutCell(nn.Layer):
     def forward(self, inputs, hidden):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        inputs : Tensor
-            Batch of input tensor (B, input_size).
-        hidden : tuple
-            - Tensor: Batch of initial hidden states (B, hidden_size).
-            - Tensor: Batch of initial cell states (B, hidden_size).
-        Returns
-        ----------
-        Tensor
-            Batch of next hidden states (B, hidden_size).
-        tuple:
-            - Tensor: Batch of next hidden states (B, hidden_size).
-            - Tensor: Batch of next cell states (B, hidden_size).
+
+        Args:
+            inputs (Tensor): Batch of input tensor (B, input_size).
+            hidden (tuple):
+                - Tensor: Batch of initial hidden states (B, hidden_size).
+                - Tensor: Batch of initial cell states (B, hidden_size).
+        Returns:
+            Tensor:
+                Batch of next hidden states (B, hidden_size).
+            tuple:
+                - Tensor: Batch of next hidden states (B, hidden_size).
+                - Tensor: Batch of next cell states (B, hidden_size).
         """
         # we only use the second output of LSTMCell in paddle
         _, next_hidden = self.cell(inputs, hidden)
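As a usage sketch of the zoneout mechanism documented here (shapes and the 0.1 rate are illustrative; only `paddle.nn.LSTMCell` from the standard API is assumed):

    import paddle
    import paddle.nn as nn

    cell = nn.LSTMCell(16, 32)
    x = paddle.randn([4, 16])
    h, c = paddle.zeros([4, 32]), paddle.zeros([4, 32])

    # paddle's LSTMCell returns (output, (h, c)); only the state tuple is used.
    _, (h_next, c_next) = cell(x, (h, c))

    # Zoneout at train time: each unit keeps its previous value with
    # probability zoneout_rate instead of being updated.
    zoneout_rate = 0.1
    keep = (paddle.rand(h_next.shape) < zoneout_rate).astype(h_next.dtype)
    h_zoned = keep * h + (1.0 - keep) * h_next
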
@@ -302,42 +275,29 @@ class Decoder(nn.Layer):
                  zoneout_rate=0.1,
                  reduction_factor=1, ):
         """Initialize Tacotron2 decoder module.
-        Parameters
-        ----------
-        idim : int
-            Dimension of the inputs.
-        odim : int
-            Dimension of the outputs.
-        att nn.Layer
-            Instance of attention class.
-        dlayers int, optional
-            The number of decoder lstm layers.
-        dunits : int, optional
-            The number of decoder lstm units.
-        prenet_layers : int, optional
-            The number of prenet layers.
-        prenet_units : int, optional
-            The number of prenet units.
-        postnet_layers : int, optional
-            The number of postnet layers.
-        postnet_filts : int, optional
-            The number of postnet filter size.
-        postnet_chans : int, optional
-            The number of postnet filter channels.
-        output_activation_fn : nn.Layer, optional
-            Activation function for outputs.
-        cumulate_att_w : bool, optional
-            Whether to cumulate previous attention weight.
-        use_batch_norm : bool, optional
-            Whether to use batch normalization.
-        use_concate : bool, optional
-            Whether to concatenate encoder embedding with decoder lstm outputs.
-        dropout_rate : float, optional
-            Dropout rate.
-        zoneout_rate : float, optional
-            Zoneout rate.
-        reduction_factor : int, optional
-            Reduction factor.
+
+        Args:
+            idim (int): Dimension of the inputs.
+            odim (int): Dimension of the outputs.
+            att (nn.Layer): Instance of attention class.
+            dlayers (int, optional): The number of decoder lstm layers.
+            dunits (int, optional): The number of decoder lstm units.
+            prenet_layers (int, optional): The number of prenet layers.
+            prenet_units (int, optional): The number of prenet units.
+            postnet_layers (int, optional): The number of postnet layers.
+            postnet_filts (int, optional): The postnet filter size.
+            postnet_chans (int, optional): The number of postnet filter channels.
+            output_activation_fn (nn.Layer, optional): Activation function for outputs.
+            cumulate_att_w (bool, optional): Whether to cumulate previous attention weight.
+            use_batch_norm (bool, optional): Whether to use batch normalization.
+            use_concate (bool, optional): Whether to concatenate encoder embedding with decoder lstm outputs.
+            dropout_rate (float, optional): Dropout rate.
+            zoneout_rate (float, optional): Zoneout rate.
+            reduction_factor (int, optional): Reduction factor.
         """
         super().__init__()
@@ -401,26 +361,19 @@ class Decoder(nn.Layer):
     def forward(self, hs, hlens, ys):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        hs : Tensor
-            Batch of the sequences of padded hidden states (B, Tmax, idim).
-        hlens : Tensor(int64) padded
-            Batch of lengths of each input batch (B,).
-        ys : Tensor
-            Batch of the sequences of padded target features (B, Lmax, odim).
-        Returns
-        ----------
-        Tensor
-            Batch of output tensors after postnet (B, Lmax, odim).
-        Tensor
-            Batch of output tensors before postnet (B, Lmax, odim).
-        Tensor
-            Batch of logits of stop prediction (B, Lmax).
-        Tensor
-            Batch of attention weights (B, Lmax, Tmax).
-        Note
-        ----------
+
+        Args:
+            hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
+            hlens (Tensor(int64)): Batch of lengths of each padded input batch (B,).
+            ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim).
+
+        Returns:
+            Tensor: Batch of output tensors after postnet (B, Lmax, odim).
+            Tensor: Batch of output tensors before postnet (B, Lmax, odim).
+            Tensor: Batch of logits of stop prediction (B, Lmax).
+            Tensor: Batch of attention weights (B, Lmax, Tmax).
+
+        Note:
             This computation is performed in teacher-forcing manner.
         """
         # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim)
@@ -517,37 +470,24 @@ class Decoder(nn.Layer):
                   backward_window=None,
                   forward_window=None, ):
         """Generate the sequence of features given the sequences of characters.
-        Parameters
-        ----------
-        h : Tensor
-            Input sequence of encoder hidden states (T, C).
-        threshold : float, optional
-            Threshold to stop generation.
-        minlenratio : float, optional
-            Minimum length ratio.
-            If set to 1.0 and the length of input is 10,
-            the minimum length of outputs will be 10 * 1 = 10.
-        minlenratio : float, optional
-            Minimum length ratio.
-            If set to 10 and the length of input is 10,
-            the maximum length of outputs will be 10 * 10 = 100.
-        use_att_constraint : bool
-            Whether to apply attention constraint introduced in `Deep Voice 3`_.
-        backward_window : int
-            Backward window size in attention constraint.
-        forward_window : int
-            Forward window size in attention constraint.
-        Returns
-        ----------
-        Tensor
-            Output sequence of features (L, odim).
-        Tensor
-            Output sequence of stop probabilities (L,).
-        Tensor
-            Attention weights (L, T).
-        Note
-        ----------
-        This computation is performed in auto-regressive manner.
+        Args:
+            h(Tensor): Input sequence of encoder hidden states (T, C).
+            threshold(float, optional): Threshold to stop generation. (Default value = 0.5)
+            minlenratio(float, optional): Minimum length ratio. If set to 1.0 and the length of input is 10,
+                the minimum length of outputs will be 10 * 1 = 10. (Default value = 0.0)
+            maxlenratio(float, optional): Maximum length ratio. If set to 10 and the length of input is 10,
+                the maximum length of outputs will be 10 * 10 = 100. (Default value = 0.0)
+            use_att_constraint(bool, optional): Whether to apply attention constraint introduced in `Deep Voice 3`_. (Default value = False)
+            backward_window(int, optional): Backward window size in attention constraint. (Default value = None)
+            forward_window(int, optional): Forward window size in attention constraint. (Default value = None)
+
+        Returns:
+            Tensor: Output sequence of features (L, odim).
+            Tensor: Output sequence of stop probabilities (L,).
+            Tensor: Attention weights (L, T).
+
+        Note:
+            This computation is performed in auto-regressive manner.
         .. _`Deep Voice 3`: https://arxiv.org/abs/1710.07654
         """
         # setup
@@ -683,21 +623,18 @@ class Decoder(nn.Layer):
     def calculate_all_attentions(self, hs, hlens, ys):
         """Calculate all of the attention weights.
-        Parameters
-        ----------
-        hs : Tensor
-            Batch of the sequences of padded hidden states (B, Tmax, idim).
-        hlens : Tensor(int64)
-            Batch of lengths of each input batch (B,).
-        ys : Tensor
-            Batch of the sequences of padded target features (B, Lmax, odim).
-        Returns
-        ----------
-        numpy.ndarray
-            Batch of attention weights (B, Lmax, Tmax).
-        Note
-        ----------
-        This computation is performed in teacher-forcing manner.
+
+        Args:
+            hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
+            hlens (Tensor(int64)): Batch of lengths of each input batch (B,).
+            ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim).
+
+        Returns:
+            numpy.ndarray:
+                Batch of attention weights (B, Lmax, Tmax).
+
+        Note:
+            This computation is performed in teacher-forcing manner.
         """
         # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim)
         if self.reduction_factor > 1:
diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py
index 80c213a1..db102a11 100644
--- a/paddlespeech/t2s/modules/tacotron2/encoder.py
+++ b/paddlespeech/t2s/modules/tacotron2/encoder.py
@@ -45,31 +45,18 @@ class Encoder(nn.Layer):
                  dropout_rate=0.5,
                  padding_idx=0, ):
         """Initialize Tacotron2 encoder module.
-
-        Parameters
-        ----------
-        idim : int
-            Dimension of the inputs.
-        input_layer : str
-            Input layer type.
-        embed_dim : int, optional
-            Dimension of character embedding.
-        elayers : int, optional
-            The number of encoder blstm layers.
-        eunits : int, optional
-            The number of encoder blstm units.
-        econv_layers : int, optional
-            The number of encoder conv layers.
-        econv_filts : int, optional
-            The number of encoder conv filter size.
-        econv_chans : int, optional
-            The number of encoder conv filter channels.
-        use_batch_norm : bool, optional
-            Whether to use batch normalization.
-        use_residual : bool, optional
-            Whether to use residual connection.
-        dropout_rate : float, optional
-            Dropout rate.
+        Args:
+            idim (int): Dimension of the inputs.
+            input_layer (str): Input layer type.
+            embed_dim (int, optional): Dimension of character embedding.
+            elayers (int, optional): The number of encoder blstm layers.
+            eunits (int, optional): The number of encoder blstm units.
+            econv_layers (int, optional): The number of encoder conv layers.
+            econv_filts (int, optional): The encoder conv filter size.
+            econv_chans (int, optional): The number of encoder conv filter channels.
+            use_batch_norm (bool, optional): Whether to use batch normalization.
+            use_residual (bool, optional): Whether to use residual connection.
+            dropout_rate (float, optional): Dropout rate.
         """
         super().__init__()
@@ -139,21 +126,15 @@ class Encoder(nn.Layer):
     def forward(self, xs, ilens=None):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        xs : Tensor
-            Batch of the padded sequence. Either character ids (B, Tmax)
-            or acoustic feature (B, Tmax, idim * encoder_reduction_factor).
-            Padded value should be 0.
-        ilens : Tensor(int64)
-            Batch of lengths of each input batch (B,).
-
-        Returns
-        ----------
-        Tensor
-            Batch of the sequences of encoder states(B, Tmax, eunits).
-        Tensor(int64)
-            Batch of lengths of each sequence (B,)
+        Args:
+            xs (Tensor): Batch of the padded sequence. Either character ids (B, Tmax)
+                or acoustic feature (B, Tmax, idim * encoder_reduction_factor).
+                Padded value should be 0.
+            ilens (Tensor(int64)): Batch of lengths of each input batch (B,).
+
+        Returns:
+            Tensor: Batch of the sequences of encoder states (B, Tmax, eunits).
+            Tensor(int64): Batch of lengths of each sequence (B,)
         """
         xs = self.embed(xs).transpose([0, 2, 1])
         if self.convs is not None:
@@ -179,16 +160,12 @@ class Encoder(nn.Layer):
     def inference(self, x):
         """Inference.
-        Parameters
-        ----------
-        x : Tensor
-            The sequeunce of character ids (T,)
-            or acoustic feature (T, idim * encoder_reduction_factor).
+        Args:
+            x (Tensor): The sequence of character ids (T,)
+                or acoustic feature (T, idim * encoder_reduction_factor).

-        Returns
-        ----------
-        Tensor
-            The sequences of encoder states(T, eunits).
+        Returns:
+            Tensor: The sequences of encoder states (T, eunits).
         """
         xs = x.unsqueeze(0)
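The context lines above keep the encoder's embed-then-convolve pattern visible; a minimal sketch of that frontend (all sizes are illustrative, and the single conv layer stands in for the full econv stack):

    import paddle
    import paddle.nn as nn
    import paddle.nn.functional as F

    embed = nn.Embedding(40, 512, padding_idx=0)
    conv = nn.Conv1D(512, 512, kernel_size=5, padding=2)

    ids = paddle.randint(0, 40, shape=[2, 13])   # (B, Tmax) character ids
    xs = embed(ids).transpose([0, 2, 1])         # (B, embed_dim, Tmax) for Conv1D
    xs = F.relu(conv(xs))                        # (B, econv_chans, Tmax)
    xs = xs.transpose([0, 2, 1])                 # (B, Tmax, C), ready for the BLSTM
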
diff --git a/paddlespeech/t2s/modules/tade_res_block.py b/paddlespeech/t2s/modules/tade_res_block.py
index 1ca4e6d8..b2275e23 100644
--- a/paddlespeech/t2s/modules/tade_res_block.py
+++ b/paddlespeech/t2s/modules/tade_res_block.py
@@ -59,18 +59,12 @@ class TADELayer(nn.Layer):
     def forward(self, x, c):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, in_channels, T).
-        c : Tensor
-            Auxiliary input tensor (B, aux_channels, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, in_channels, T * upsample_factor).
-        Tensor
-            Upsampled aux tensor (B, in_channels, T * upsample_factor).
+        Args:
+            x (Tensor): Input tensor (B, in_channels, T).
+            c (Tensor): Auxiliary input tensor (B, aux_channels, T).
+        Returns:
+            Tensor: Output tensor (B, in_channels, T * upsample_factor).
+            Tensor: Upsampled aux tensor (B, in_channels, T * upsample_factor).
         """
         x = self.norm(x)
@@ -142,18 +136,13 @@ class TADEResBlock(nn.Layer):
     def forward(self, x, c):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, in_channels, T).
-        c : Tensor
-            Auxiliary input tensor (B, aux_channels, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, in_channels, T * upsample_factor).
-        Tensor
-            Upsampled auxirialy tensor (B, in_channels, T * upsample_factor).
+        Args:
+            x (Tensor): Input tensor (B, in_channels, T).
+            c (Tensor): Auxiliary input tensor (B, aux_channels, T).
+        Returns:
+            Tensor: Output tensor (B, in_channels, T * upsample_factor).
+            Tensor: Upsampled auxiliary tensor (B, in_channels, T * upsample_factor).
         """
         residual = x
         x, c = self.tade1(x, c)
diff --git a/paddlespeech/t2s/modules/transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py
index 34386f2a..cdb95b21 100644
--- a/paddlespeech/t2s/modules/transformer/attention.py
+++ b/paddlespeech/t2s/modules/transformer/attention.py
@@ -24,15 +24,10 @@ from paddlespeech.t2s.modules.masked_fill import masked_fill
 class MultiHeadedAttention(nn.Layer):
     """Multi-Head Attention layer.
-
-    Parameters
-    ----------
-    n_head : int
-        The number of heads.
-    n_feat : int
-        The number of features.
-    dropout_rate : float
-        Dropout rate.
+    Args:
+        n_head (int): The number of heads.
+        n_feat (int): The number of features.
+        dropout_rate (float): Dropout rate.
     """

     def __init__(self, n_head, n_feat, dropout_rate):
@@ -52,23 +47,15 @@ class MultiHeadedAttention(nn.Layer):
     def forward_qkv(self, query, key, value):
         """Transform query, key and value.
-        Parameters
-        ----------
-        query : paddle.Tensor
-            query tensor (#batch, time1, size).
-        key : paddle.Tensor
-            Key tensor (#batch, time2, size).
-        value : paddle.Tensor
-            Value tensor (#batch, time2, size).
-
-        Returns
-        ----------
-        paddle.Tensor
-            Transformed query tensor (#batch, n_head, time1, d_k).
-        paddle.Tensor
-            Transformed key tensor (#batch, n_head, time2, d_k).
-        paddle.Tensor
-            Transformed value tensor (#batch, n_head, time2, d_k).
+        Args:
+            query(Tensor): Query tensor (#batch, time1, size).
+            key(Tensor): Key tensor (#batch, time2, size).
+            value(Tensor): Value tensor (#batch, time2, size).
+
+        Returns:
+            Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
+            Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
+            Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
         """
         n_batch = paddle.shape(query)[0]
@@ -89,20 +76,13 @@ class MultiHeadedAttention(nn.Layer):
     def forward_attention(self, value, scores, mask=None):
         """Compute attention context vector.
-        Parameters
-        ----------
-        value : paddle.Tensor
-            Transformed value (#batch, n_head, time2, d_k).
-        scores : paddle.Tensor
-            Attention score (#batch, n_head, time1, time2).
-        mask : paddle.Tensor
-            Mask (#batch, 1, time2) or (#batch, time1, time2).
-
-        Returns
-        ----------
-        paddle.Tensor:
-            Transformed value (#batch, time1, d_model)
-            weighted by the attention score (#batch, time1, time2).
+        Args:
+            value(Tensor): Transformed value (#batch, n_head, time2, d_k).
+            scores(Tensor): Attention score (#batch, n_head, time1, time2).
+            mask(Tensor, optional): Mask (#batch, 1, time2) or (#batch, time1, time2). (Default value = None)
+
+        Returns:
+            Tensor: Transformed value (#batch, time1, d_model) weighted by the attention score (#batch, time1, time2).
         """
         n_batch = paddle.shape(value)[0]
         softmax = paddle.nn.Softmax(axis=-1)
@@ -132,21 +112,14 @@ class MultiHeadedAttention(nn.Layer):
     def forward(self, query, key, value, mask=None):
         """Compute scaled dot product attention.
-        Parameters
-        ----------
-        query : paddle.Tensor
-            Query tensor (#batch, time1, size).
-        key : paddle.Tensor
-            Key tensor (#batch, time2, size).
-        value : paddle.Tensor
-            Value tensor (#batch, time2, size).
-        mask : paddle.Tensor
-            Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
-
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor (#batch, time1, d_model).
+        Args:
+            query(Tensor): Query tensor (#batch, time1, size).
+            key(Tensor): Key tensor (#batch, time2, size).
+            value(Tensor): Value tensor (#batch, time2, size).
+            mask(Tensor, optional): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). (Default value = None)
+
+        Returns:
+            Tensor: Output tensor (#batch, time1, d_model).
         """
         q, k, v = self.forward_qkv(query, key, value)
         scores = paddle.matmul(q, k.transpose(
@@ -159,16 +132,12 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
     """Multi-Head Attention layer with relative position encoding (new implementation).
     Details can be found in https://github.com/espnet/espnet/pull/2816.
     Paper: https://arxiv.org/abs/1901.02860
-    Parameters
-    ----------
-    n_head : int
-        The number of heads.
-    n_feat : int
-        The number of features.
-    dropout_rate : float
-        Dropout rate.
-    zero_triu : bool
-        Whether to zero the upper triangular part of attention matrix.
+
+    Args:
+        n_head (int): The number of heads.
+        n_feat (int): The number of features.
+        dropout_rate (float): Dropout rate.
+        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
     """

     def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
@@ -191,15 +160,11 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
     def rel_shift(self, x):
         """Compute relative positional encoding.
-        Parameters
-        ----------
-        x : paddle.Tensor
-            Input tensor (batch, head, time1, 2*time1-1).
-            time1 means the length of query vector.
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor.
+        Args:
+            x(Tensor): Input tensor (batch, head, time1, 2*time1-1).
+                time1 means the length of query vector.
+
+        Returns:
+            Tensor: Output tensor.
         """
         b, h, t1, t2 = paddle.shape(x)
         zero_pad = paddle.zeros((b, h, t1, 1))
@@ -216,24 +181,16 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
     def forward(self, query, key, value, pos_emb, mask):
         """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
-        Parameters
-        ----------
-        query : paddle.Tensor
-            Query tensor (#batch, time1, size).
-        key : paddle.Tensor
-            Key tensor (#batch, time2, size).
-        value : paddle.Tensor
-            Value tensor (#batch, time2, size).
-        pos_emb : paddle.Tensor
-            Positional embedding tensor
-            (#batch, 2*time1-1, size).
-        mask : paddle.Tensor
-            Mask tensor (#batch, 1, time2) or
-            (#batch, time1, time2).
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor (#batch, time1, d_model).
+
+        Args:
+            query(Tensor): Query tensor (#batch, time1, size).
+            key(Tensor): Key tensor (#batch, time2, size).
+            value(Tensor): Value tensor (#batch, time2, size).
+            pos_emb(Tensor): Positional embedding tensor (#batch, 2*time1-1, size).
+            mask(Tensor): Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
+
+        Returns:
+            Tensor: Output tensor (#batch, time1, d_model).
         """
         q, k, v = self.forward_qkv(query, key, value)
         # (batch, time1, head, d_k)
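A self-contained sketch of the scaled dot-product step that `forward_qkv` and `forward_attention` split between them (head counts and shapes are illustrative):

    import paddle
    import paddle.nn.functional as F

    # (batch, n_head, time, d_k) tensors as forward_qkv would produce them.
    q = paddle.randn([2, 4, 5, 16])
    k = paddle.randn([2, 4, 7, 16])
    v = paddle.randn([2, 4, 7, 16])

    scores = paddle.matmul(q, k.transpose([0, 1, 3, 2])) / (16 ** 0.5)
    att_w = F.softmax(scores, axis=-1)            # (2, 4, 5, 7)
    context = paddle.matmul(att_w, v)             # (2, 4, 5, 16)

    # Merge heads back to (batch, time1, d_model), with d_model = n_head * d_k.
    out = context.transpose([0, 2, 1, 3]).reshape([2, 5, 64])
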
diff --git a/paddlespeech/t2s/modules/transformer/decoder.py b/paddlespeech/t2s/modules/transformer/decoder.py
index fe2949f4..a8db7345 100644
--- a/paddlespeech/t2s/modules/transformer/decoder.py
+++ b/paddlespeech/t2s/modules/transformer/decoder.py
@@ -36,51 +36,32 @@ from paddlespeech.t2s.modules.transformer.repeat import repeat
 class Decoder(nn.Layer):
     """Transfomer decoder module.
-    Parameters
-    ----------
-    odim : int
-        Output diminsion.
-    self_attention_layer_type : str
-        Self-attention layer type.
-    attention_dim : int
-        Dimention of attention.
-    attention_heads : int
-        The number of heads of multi head attention.
-    conv_wshare : int
-        The number of kernel of convolution. Only used in
-        self_attention_layer_type == "lightconv*" or "dynamiconv*".
-    conv_kernel_length : Union[int, str])
-        Kernel size str of convolution
-        (e.g. 71_71_71_71_71_71).
-        Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*".
-    conv_usebias : bool
-        Whether to use bias in convolution. Only used in
-        self_attention_layer_type == "lightconv*" or "dynamiconv*".
-    linear_units : int
-        The number of units of position-wise feed forward.
-    num_blocks : int
-        The number of decoder blocks.
-    dropout_rate : float
-        Dropout rate.
-    positional_dropout_rate : float
-        Dropout rate after adding positional encoding.
-    self_attention_dropout_rate : float
-        Dropout rate in self-attention.
-    src_attention_dropout_rate : float
-        Dropout rate in source-attention.
-    input_layer : (Union[str, nn.Layer])
-        Input layer type.
-    use_output_layer : bool
-        Whether to use output layer.
-    pos_enc_class : nn.Layer
-        Positional encoding module class.
-        `PositionalEncoding `or `ScaledPositionalEncoding`
-    normalize_before : bool
-        Whether to use layer_norm before the first block.
-    concat_after : bool
-        Whether to concat attention layer's input and output.
-        if True, additional linear will be applied.
-        i.e. x -> x + linear(concat(x, att(x)))
-        if False, no additional linear will be applied. i.e. x -> x + att(x)
+    Args:
+        odim (int): Output dimension.
+        self_attention_layer_type (str): Self-attention layer type.
+        attention_dim (int): Dimension of attention.
+        attention_heads (int): The number of heads of multi head attention.
+        conv_wshare (int): The number of kernel of convolution. Only used in
+            self_attention_layer_type == "lightconv*" or "dynamiconv*".
+        conv_kernel_length (Union[int, str]): Kernel size str of convolution
+            (e.g. 71_71_71_71_71_71). Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*".
+        conv_usebias (bool): Whether to use bias in convolution. Only used in
+            self_attention_layer_type == "lightconv*" or "dynamiconv*".
+        linear_units (int): The number of units of position-wise feed forward.
+        num_blocks (int): The number of decoder blocks.
+        dropout_rate (float): Dropout rate.
+        positional_dropout_rate (float): Dropout rate after adding positional encoding.
+        self_attention_dropout_rate (float): Dropout rate in self-attention.
+        src_attention_dropout_rate (float): Dropout rate in source-attention.
+        input_layer (Union[str, nn.Layer]): Input layer type.
+        use_output_layer (bool): Whether to use output layer.
+        pos_enc_class (nn.Layer): Positional encoding module class.
+            `PositionalEncoding` or `ScaledPositionalEncoding`
+        normalize_before (bool): Whether to use layer_norm before the first block.
+        concat_after (bool): Whether to concat attention layer's input and output.
+            if True, additional linear will be applied.
+            i.e. x -> x + linear(concat(x, att(x)))
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
     """

@@ -161,27 +142,18 @@ class Decoder(nn.Layer):
     def forward(self, tgt, tgt_mask, memory, memory_mask):
         """Forward decoder.
-
-        Parameters
-        ----------
-        tgt : paddle.Tensor
-            Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed".
-            In the other case, input tensor (#batch, maxlen_out, odim).
-        tgt_mask : paddle.Tensor
-            Input token mask (#batch, maxlen_out).
-        memory : paddle.Tensor
-            Encoded memory, float32 (#batch, maxlen_in, feat).
-        memory_mask : paddle.Tensor
-            Encoded memory mask (#batch, maxlen_in).
-
-        Returns
-        ----------
-        paddle.Tensor
-            Decoded token score before softmax (#batch, maxlen_out, odim)
-            if use_output_layer is True. In the other case,final block outputs
-            (#batch, maxlen_out, attention_dim).
-        paddle.Tensor
-            Score mask before softmax (#batch, maxlen_out).
+        Args:
+            tgt(Tensor): Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed".
+                In the other case, input tensor (#batch, maxlen_out, odim).
+            tgt_mask(Tensor): Input token mask (#batch, maxlen_out).
+            memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat).
+            memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in).
+
+        Returns:
+            Tensor:
+                Decoded token score before softmax (#batch, maxlen_out, odim) if use_output_layer is True.
+                In the other case, final block outputs (#batch, maxlen_out, attention_dim).
+            Tensor: Score mask before softmax (#batch, maxlen_out).
         """
         x = self.embed(tgt)
@@ -196,23 +168,15 @@ class Decoder(nn.Layer):
     def forward_one_step(self, tgt, tgt_mask, memory, cache=None):
         """Forward one step.
-        Parameters
-        ----------
-        tgt : paddle.Tensor
-            Input token ids, int64 (#batch, maxlen_out).
-        tgt_mask : paddle.Tensor
-            Input token mask (#batch, maxlen_out).
-        memory : paddle.Tensor
-            Encoded memory, float32 (#batch, maxlen_in, feat).
-        cache : (List[paddle.Tensor])
-            List of cached tensors.
-            Each tensor shape should be (#batch, maxlen_out - 1, size).
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor (batch, maxlen_out, odim).
-        List[paddle.Tensor]
-            List of cache tensors of each decoder layer.
+        Args:
+            tgt(Tensor): Input token ids, int64 (#batch, maxlen_out).
+            tgt_mask(Tensor): Input token mask (#batch, maxlen_out).
+            memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat).
+            cache(List[Tensor], optional): List of cached tensors.
+                Each tensor shape should be (#batch, maxlen_out - 1, size). (Default value = None)
+
+        Returns:
+            Tensor: Output tensor (batch, maxlen_out, odim).
+            List[Tensor]: List of cache tensors of each decoder layer.
         """
         x = self.embed(tgt)
@@ -254,20 +218,14 @@ class Decoder(nn.Layer):
                   xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]:
         """Score new token batch (required).
-        Parameters
-        ----------
-        ys : paddle.Tensor
-            paddle.int64 prefix tokens (n_batch, ylen).
-        states : List[Any]
-            Scorer states for prefix tokens.
-        xs : paddle.Tensor
-            The encoder feature that generates ys (n_batch, xlen, n_feat).

-        Returns
-        ----------
-        tuple[paddle.Tensor, List[Any]]
-            Tuple ofbatchfied scores for next token with shape of `(n_batch, n_vocab)`
-            and next state list for ys.
+        Args:
+            ys(Tensor): paddle.int64 prefix tokens (n_batch, ylen).
+            states(List[Any]): Scorer states for prefix tokens.
+            xs(Tensor): The encoder feature that generates ys (n_batch, xlen, n_feat).

+        Returns:
+            tuple[Tensor, List[Any]]:
+                Tuple of batchified scores for next token with shape of `(n_batch, n_vocab)` and next state list for ys.
         """
         # merge states
diff --git a/paddlespeech/t2s/modules/transformer/decoder_layer.py b/paddlespeech/t2s/modules/transformer/decoder_layer.py
index 44978f1e..9a13cd79 100644
--- a/paddlespeech/t2s/modules/transformer/decoder_layer.py
+++ b/paddlespeech/t2s/modules/transformer/decoder_layer.py
@@ -22,28 +22,21 @@ from paddlespeech.t2s.modules.layer_norm import LayerNorm
 class DecoderLayer(nn.Layer):
     """Single decoder layer module.
-    Parameters
-    ----------
-    size : int
-        Input dimension.
-    self_attn : nn.Layer
-        Self-attention module instance.
-        `MultiHeadedAttention` instance can be used as the argument.
-    src_attn : nn.Layer
-        Self-attention module instance.
-        `MultiHeadedAttention` instance can be used as the argument.
-    feed_forward : nn.Layer
-        Feed-forward module instance.
-        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
-    dropout_rate : float
-        Dropout rate.
-    normalize_before : bool
-        Whether to use layer_norm before the first block.
-    concat_after : bool
-        Whether to concat attention layer's input and output.
-        if True, additional linear will be applied.
-        i.e. x -> x + linear(concat(x, att(x)))
-        if False, no additional linear will be applied. i.e. x -> x + att(x)
+
+    Args:
+        size (int): Input dimension.
+        self_attn (nn.Layer): Self-attention module instance.
+            `MultiHeadedAttention` instance can be used as the argument.
+        src_attn (nn.Layer): Source attention module instance.
+            `MultiHeadedAttention` instance can be used as the argument.
+        feed_forward (nn.Layer): Feed-forward module instance.
+            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
+        dropout_rate (float): Dropout rate.
+        normalize_before (bool): Whether to use layer_norm before the first block.
+        concat_after (bool): Whether to concat attention layer's input and output.
+            if True, additional linear will be applied.
+            i.e. x -> x + linear(concat(x, att(x)))
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
     """

@@ -75,30 +68,22 @@ class DecoderLayer(nn.Layer):
     def forward(self, tgt, tgt_mask, memory, memory_mask, cache=None):
         """Compute decoded features.
-        Parameters
-        ----------
-        tgt : paddle.Tensor
-            Input tensor (#batch, maxlen_out, size).
-        tgt_mask : paddle.Tensor
-            Mask for input tensor (#batch, maxlen_out).
-        memory : paddle.Tensor
-            Encoded memory, float32 (#batch, maxlen_in, size).
-        memory_mask : paddle.Tensor
-            Encoded memory mask (#batch, maxlen_in).
-        cache : List[paddle.Tensor]
-            List of cached tensors.
-            Each tensor shape should be (#batch, maxlen_out - 1, size).
-
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor(#batch, maxlen_out, size).
-        paddle.Tensor
-            Mask for output tensor (#batch, maxlen_out).
-        paddle.Tensor
-            Encoded memory (#batch, maxlen_in, size).
-        paddle.Tensor
-            Encoded memory mask (#batch, maxlen_in).
+        Args:
+            tgt(Tensor): Input tensor (#batch, maxlen_out, size).
+            tgt_mask(Tensor): Mask for input tensor (#batch, maxlen_out).
+            memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
+            memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in).
+            cache(List[Tensor], optional): List of cached tensors.
+                Each tensor shape should be (#batch, maxlen_out - 1, size). (Default value = None)
+        Returns:
+            Tensor: Output tensor (#batch, maxlen_out, size).
+            Tensor: Mask for output tensor (#batch, maxlen_out).
+            Tensor: Encoded memory (#batch, maxlen_in, size).
+            Tensor: Encoded memory mask (#batch, maxlen_in).
         """
         residual = tgt
diff --git a/paddlespeech/t2s/modules/transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py
index 40ab03ee..d9339d20 100644
--- a/paddlespeech/t2s/modules/transformer/embedding.py
+++ b/paddlespeech/t2s/modules/transformer/embedding.py
@@ -22,18 +22,12 @@ from paddle import nn
 class PositionalEncoding(nn.Layer):
     """Positional encoding.
-    Parameters
-    ----------
-    d_model : int
-        Embedding dimension.
-    dropout_rate : float
-        Dropout rate.
-    max_len : int
-        Maximum input length.
-    reverse : bool
-        Whether to reverse the input position.
-    type : str
-        dtype of param
+    Args:
+        d_model (int): Embedding dimension.
+        dropout_rate (float): Dropout rate.
+        max_len (int): Maximum input length.
+        reverse (bool): Whether to reverse the input position.
+        type (str): dtype of param
     """

     def __init__(self,
@@ -73,15 +67,11 @@ class PositionalEncoding(nn.Layer):
     def forward(self, x: paddle.Tensor):
         """Add positional encoding.
-        Parameters
-        ----------
-        x : paddle.Tensor
-            Input tensor (batch, time, `*`).
+        Args:
+            x (Tensor): Input tensor (batch, time, `*`).

-        Returns
-        ----------
-        paddle.Tensor
-            Encoded tensor (batch, time, `*`).
+        Returns:
+            Tensor: Encoded tensor (batch, time, `*`).
         """
         self.extend_pe(x)
         T = paddle.shape(x)[1]
@@ -91,19 +81,13 @@ class PositionalEncoding(nn.Layer):
 class ScaledPositionalEncoding(PositionalEncoding):
     """Scaled positional encoding module.
-    See Sec. 3.2 https://arxiv.org/abs/1809.08895
-    Parameters
-    ----------
-    d_model : int
-        Embedding dimension.
-    dropout_rate : float
-        Dropout rate.
-    max_len : int
-        Maximum input length.
-    dtype : str
-        dtype of param
+    See Sec. 3.2 https://arxiv.org/abs/1809.08895
+
+    Args:
+        d_model (int): Embedding dimension.
+        dropout_rate (float): Dropout rate.
+        max_len (int): Maximum input length.
+        dtype (str): dtype of param
     """

     def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
@@ -126,14 +110,10 @@ class ScaledPositionalEncoding(PositionalEncoding):
     def forward(self, x):
         """Add positional encoding.
-        Parameters
-        ----------
-        x : paddle.Tensor
-            Input tensor (batch, time, `*`).
-        Returns
-        ----------
-        paddle.Tensor
-            Encoded tensor (batch, time, `*`).
+        Args:
+            x (Tensor): Input tensor (batch, time, `*`).
+        Returns:
+            Tensor: Encoded tensor (batch, time, `*`).
         """
         self.extend_pe(x)
         T = paddle.shape(x)[1]
@@ -145,14 +125,11 @@ class RelPositionalEncoding(nn.Layer):
     """Relative positional encoding module (new implementation).
     Details can be found in https://github.com/espnet/espnet/pull/2816.
     See : Appendix B in https://arxiv.org/abs/1901.02860
-    Parameters
-    ----------
-    d_model : int
-        Embedding dimension.
-    dropout_rate : float
-        Dropout rate.
-    max_len : int
-        Maximum input length.
+
+    Args:
+        d_model (int): Embedding dimension.
+        dropout_rate (float): Dropout rate.
+        max_len (int): Maximum input length.
     """

     def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
@@ -197,14 +174,10 @@ class RelPositionalEncoding(nn.Layer):
     def forward(self, x: paddle.Tensor):
         """Add positional encoding.
-        Parameters
-        ----------
-        x : paddle.Tensor
-            Input tensor (batch, time, `*`).
-        Returns
-        ----------
-        paddle.Tensor
-            Encoded tensor (batch, time, `*`).
+        Args:
+            x (Tensor): Input tensor (batch, time, `*`).
+        Returns:
+            Tensor: Encoded tensor (batch, time, `*`).
         """
         self.extend_pe(x)
         x = x * self.xscale
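For orientation, a minimal sinusoidal table in the spirit of the `PositionalEncoding` classes above (d_model, max_len and batch shapes are illustrative; the sin/cos halves are concatenated rather than interleaved, a common simplification):

    import math
    import paddle

    d_model, max_len = 8, 100
    position = paddle.arange(max_len, dtype="float32").unsqueeze(1)   # (max_len, 1)
    div_term = paddle.exp(
        paddle.arange(0, d_model, 2, dtype="float32")
        * -(math.log(10000.0) / d_model))                             # (d_model/2,)
    pe = paddle.concat(
        [paddle.sin(position * div_term),
         paddle.cos(position * div_term)], axis=-1)                   # (max_len, d_model)

    x = paddle.randn([2, 10, d_model])                 # (batch, time, d_model)
    x = x * math.sqrt(d_model) + pe[:10].unsqueeze(0)  # scale, then add encoding
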
diff --git a/paddlespeech/t2s/modules/transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py
index 8bf71b41..2b3ee788 100644
--- a/paddlespeech/t2s/modules/transformer/encoder.py
+++ b/paddlespeech/t2s/modules/transformer/encoder.py
@@ -37,62 +37,37 @@ from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
 class BaseEncoder(nn.Layer):
     """Base Encoder module.
-    Parameters
-    ----------
-    idim : int
-        Input dimension.
-    attention_dim : int
-        Dimention of attention.
-    attention_heads : int
-        The number of heads of multi head attention.
-    linear_units : int
-        The number of units of position-wise feed forward.
-    num_blocks : int
-        The number of decoder blocks.
-    dropout_rate : float
-        Dropout rate.
-    positional_dropout_rate : float
-        Dropout rate after adding positional encoding.
-    attention_dropout_rate : float
-        Dropout rate in attention.
-    input_layer : Union[str, nn.Layer]
-        Input layer type.
-    normalize_before : bool
-        Whether to use layer_norm before the first block.
-    concat_after : bool
-        Whether to concat attention layer's input and output.
-        if True, additional linear will be applied.
-        i.e. x -> x + linear(concat(x, att(x)))
-        if False, no additional linear will be applied. i.e. x -> x + att(x)
-    positionwise_layer_type : str
-        "linear", "conv1d", or "conv1d-linear".
-    positionwise_conv_kernel_size : int
-        Kernel size of positionwise conv1d layer.
-    macaron_style : bool
-        Whether to use macaron style for positionwise layer.
-    pos_enc_layer_type : str
-        Encoder positional encoding layer type.
-    selfattention_layer_type : str
-        Encoder attention layer type.
-    activation_type : str
-        Encoder activation function type.
-    use_cnn_module : bool
-        Whether to use convolution module.
-    zero_triu : bool
-        Whether to zero the upper triangular part of attention matrix.
-    cnn_module_kernel : int
-        Kernerl size of convolution module.
-    padding_idx : int
-        Padding idx for input_layer=embed.
-    stochastic_depth_rate : float
-        Maximum probability to skip the encoder layer.
-    intermediate_layers : Union[List[int], None]
-        indices of intermediate CTC layer.
-        indices start from 1.
-        if not None, intermediate outputs are returned (which changes return type
-        signature.)
-    encoder_type: str
-        "transformer", or "conformer".
+
+    Args:
+        idim (int): Input dimension.
+        attention_dim (int): Dimension of attention.
+        attention_heads (int): The number of heads of multi head attention.
+        linear_units (int): The number of units of position-wise feed forward.
+        num_blocks (int): The number of decoder blocks.
+        dropout_rate (float): Dropout rate.
+        positional_dropout_rate (float): Dropout rate after adding positional encoding.
+        attention_dropout_rate (float): Dropout rate in attention.
+        input_layer (Union[str, nn.Layer]): Input layer type.
+        normalize_before (bool): Whether to use layer_norm before the first block.
+        concat_after (bool): Whether to concat attention layer's input and output.
+            if True, additional linear will be applied.
+            i.e. x -> x + linear(concat(x, att(x)))
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
+        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
+        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
+        macaron_style (bool): Whether to use macaron style for positionwise layer.
+        pos_enc_layer_type (str): Encoder positional encoding layer type.
+        selfattention_layer_type (str): Encoder attention layer type.
+        activation_type (str): Encoder activation function type.
+        use_cnn_module (bool): Whether to use convolution module.
+        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
+        cnn_module_kernel (int): Kernel size of convolution module.
+        padding_idx (int): Padding idx for input_layer=embed.
+        stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
+        intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer.
+            indices start from 1.
+            if not None, intermediate outputs are returned (which changes return type
+            signature.)
+        encoder_type (str): "transformer", or "conformer".
     """

     def __init__(self,
@@ -290,19 +265,13 @@ class BaseEncoder(nn.Layer):
     def forward(self, xs, masks):
         """Encode input sequence.
-        Parameters
-        ----------
-        xs : paddle.Tensor
-            Input tensor (#batch, time, idim).
-        masks : paddle.Tensor
-            Mask tensor (#batch, 1, time).
-
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor (#batch, time, attention_dim).
-        paddle.Tensor
-            Mask tensor (#batch, 1, time).
+        Args:
+            xs (Tensor): Input tensor (#batch, time, idim).
+            masks (Tensor): Mask tensor (#batch, 1, time).
+
+        Returns:
+            Tensor: Output tensor (#batch, time, attention_dim).
+            Tensor: Mask tensor (#batch, 1, time).
         """
         xs = self.embed(xs)
         xs, masks = self.encoders(xs, masks)
@@ -313,45 +282,28 @@ class BaseEncoder(nn.Layer):
 class TransformerEncoder(BaseEncoder):
     """Transformer encoder module.
-    Parameters
-    ----------
-    idim : int
-        Input dimension.
-    attention_dim : int
-        Dimention of attention.
-    attention_heads : int
-        The number of heads of multi head attention.
-    linear_units : int
-        The number of units of position-wise feed forward.
-    num_blocks : int
-        The number of decoder blocks.
-    dropout_rate : float
-        Dropout rate.
-    positional_dropout_rate : float
-        Dropout rate after adding positional encoding.
-    attention_dropout_rate : float
-        Dropout rate in attention.
-    input_layer : Union[str, paddle.nn.Layer]
-        Input layer type.
-    pos_enc_layer_type : str
-        Encoder positional encoding layer type.
-    normalize_before : bool
-        Whether to use layer_norm before the first block.
-    concat_after : bool
-        Whether to concat attention layer's input and output.
-        if True, additional linear will be applied.
-        i.e. x -> x + linear(concat(x, att(x)))
-        if False, no additional linear will be applied. i.e. x -> x + att(x)
-    positionwise_layer_type : str
-        "linear", "conv1d", or "conv1d-linear".
-    positionwise_conv_kernel_size : int
-        Kernel size of positionwise conv1d layer.
-    selfattention_layer_type : str
-        Encoder attention layer type.
-    activation_type : str
-        Encoder activation function type.
-    padding_idx : int
-        Padding idx for input_layer=embed.
+
+    Args:
+        idim (int): Input dimension.
+        attention_dim (int): Dimension of attention.
+        attention_heads (int): The number of heads of multi head attention.
+        linear_units (int): The number of units of position-wise feed forward.
+        num_blocks (int): The number of decoder blocks.
+        dropout_rate (float): Dropout rate.
+        positional_dropout_rate (float): Dropout rate after adding positional encoding.
+        attention_dropout_rate (float): Dropout rate in attention.
+        input_layer (Union[str, paddle.nn.Layer]): Input layer type.
+        pos_enc_layer_type (str): Encoder positional encoding layer type.
+        normalize_before (bool): Whether to use layer_norm before the first block.
+        concat_after (bool): Whether to concat attention layer's input and output.
+            if True, additional linear will be applied.
+            i.e. x -> x + linear(concat(x, att(x)))
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
+        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
+        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
+        selfattention_layer_type (str): Encoder attention layer type.
+        activation_type (str): Encoder activation function type.
+        padding_idx (int): Padding idx for input_layer=embed.
     """

     def __init__(
@@ -397,19 +349,13 @@ class TransformerEncoder(BaseEncoder):
     def forward(self, xs, masks):
         """Encode input sequence.
-        Parameters
-        ----------
-        xs : paddle.Tensor
-            Input tensor (#batch, time, idim).
-        masks : paddle.Tensor
-            Mask tensor (#batch, 1, time).
-
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor (#batch, time, attention_dim).
-        paddle.Tensor
-            Mask tensor (#batch, 1, time).
+        Args:
+            xs(Tensor): Input tensor (#batch, time, idim).
+            masks(Tensor): Mask tensor (#batch, 1, time).
+
+        Returns:
+            Tensor: Output tensor (#batch, time, attention_dim).
+            Tensor: Mask tensor (#batch, 1, time).
""" xs = self.embed(xs) xs, masks = self.encoders(xs, masks) @@ -420,23 +366,15 @@ class TransformerEncoder(BaseEncoder): def forward_one_step(self, xs, masks, cache=None): """Encode input frame. - Parameters - ---------- - xs : paddle.Tensor - Input tensor. - masks : paddle.Tensor - Mask tensor. - cache : List[paddle.Tensor] - List of cache tensors. - - Returns - ---------- - paddle.Tensor - Output tensor. - paddle.Tensor - Mask tensor. - List[paddle.Tensor] - List of new cache tensors. + Args: + xs (Tensor): Input tensor. + masks (Tensor): Mask tensor. + cache (List[Tensor]): List of cache tensors. + + Returns: + Tensor: Output tensor. + Tensor: Mask tensor. + List[Tensor]: List of new cache tensors. """ xs = self.embed(xs) @@ -453,60 +391,35 @@ class TransformerEncoder(BaseEncoder): class ConformerEncoder(BaseEncoder): """Conformer encoder module. - Parameters - ---------- - idim : int - Input dimension. - attention_dim : int - Dimention of attention. - attention_heads : int - The number of heads of multi head attention. - linear_units : int - The number of units of position-wise feed forward. - num_blocks : int - The number of decoder blocks. - dropout_rate : float - Dropout rate. - positional_dropout_rate : float - Dropout rate after adding positional encoding. - attention_dropout_rate : float - Dropout rate in attention. - input_layer : Union[str, nn.Layer] - Input layer type. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type : str - "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size : int - Kernel size of positionwise conv1d layer. - macaron_style : bool - Whether to use macaron style for positionwise layer. - pos_enc_layer_type : str - Encoder positional encoding layer type. - selfattention_layer_type : str - Encoder attention layer type. - activation_type : str - Encoder activation function type. - use_cnn_module : bool - Whether to use convolution module. - zero_triu : bool - Whether to zero the upper triangular part of attention matrix. - cnn_module_kernel : int - Kernerl size of convolution module. - padding_idx : int - Padding idx for input_layer=embed. - stochastic_depth_rate : float - Maximum probability to skip the encoder layer. - intermediate_layers : Union[List[int], None] - indices of intermediate CTC layer. - indices start from 1. - if not None, intermediate outputs are returned (which changes return type - signature.) + + Args: + idim (int): Input dimension. + attention_dim (int): Dimention of attention. + attention_heads (int): The number of heads of multi head attention. + linear_units (int): The number of units of position-wise feed forward. + num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. + positional_dropout_rate (float): Dropout rate after adding positional encoding. + attention_dropout_rate (float): Dropout rate in attention. + input_layer (Union[str, nn.Layer]): Input layer type. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool):Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. 
+        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
+        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
+        macaron_style (bool): Whether to use macaron style for positionwise layer.
+        pos_enc_layer_type (str): Encoder positional encoding layer type.
+        selfattention_layer_type (str): Encoder attention layer type.
+        activation_type (str): Encoder activation function type.
+        use_cnn_module (bool): Whether to use convolution module.
+        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
+        cnn_module_kernel (int): Kernel size of convolution module.
+        padding_idx (int): Padding idx for input_layer=embed.
+        stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
+        intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer. indices start from 1.
+            if not None, intermediate outputs are returned (which changes return type signature.)
     """

     def __init__(
@@ -563,18 +476,13 @@ class ConformerEncoder(BaseEncoder):
     def forward(self, xs, masks):
         """Encode input sequence.
-        Parameters
-        ----------
-        xs : paddle.Tensor
-            Input tensor (#batch, time, idim).
-        masks : paddle.Tensor
-            Mask tensor (#batch, 1, time).
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor (#batch, time, attention_dim).
-        paddle.Tensor
-            Mask tensor (#batch, 1, time).
+
+        Args:
+            xs (Tensor): Input tensor (#batch, time, idim).
+            masks (Tensor): Mask tensor (#batch, 1, time).
+        Returns:
+            Tensor: Output tensor (#batch, time, attention_dim).
+            Tensor: Mask tensor (#batch, 1, time).
         """
         if isinstance(self.embed, (Conv2dSubsampling)):
             xs, masks = self.embed(xs, masks)
diff --git a/paddlespeech/t2s/modules/transformer/encoder_layer.py b/paddlespeech/t2s/modules/transformer/encoder_layer.py
index f55ded3d..72372b69 100644
--- a/paddlespeech/t2s/modules/transformer/encoder_layer.py
+++ b/paddlespeech/t2s/modules/transformer/encoder_layer.py
@@ -20,25 +20,18 @@ from paddle import nn
 class EncoderLayer(nn.Layer):
     """Encoder layer module.
-    Parameters
-    ----------
-    size : int
-        Input dimension.
-    self_attn : nn.Layer
-        Self-attention module instance.
-        `MultiHeadedAttention` instance can be used as the argument.
-    feed_forward : nn.Layer
-        Feed-forward module instance.
-        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
-    dropout_rate : float
-        Dropout rate.
-    normalize_before : bool
-        Whether to use layer_norm before the first block.
-    concat_after : bool
-        Whether to concat attention layer's input and output.
-        if True, additional linear will be applied.
-        i.e. x -> x + linear(concat(x, att(x)))
-        if False, no additional linear will be applied. i.e. x -> x + att(x)
+    Args:
+        size (int): Input dimension.
+        self_attn (nn.Layer): Self-attention module instance.
+            `MultiHeadedAttention` instance can be used as the argument.
+        feed_forward (nn.Layer): Feed-forward module instance.
+            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
+        dropout_rate (float): Dropout rate.
+        normalize_before (bool): Whether to use layer_norm before the first block.
+        concat_after (bool): Whether to concat attention layer's input and output.
+            if True, additional linear will be applied.
+            i.e. x -> x + linear(concat(x, att(x)))
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
    """

    def __init__(
@@ -65,21 +58,14 @@ class EncoderLayer(nn.Layer):
     def forward(self, x, mask, cache=None):
         """Compute encoded features.
-        Parameters
-        ----------
-        x_input : paddle.Tensor
-            Input tensor (#batch, time, size).
-        mask : paddle.Tensor
-            Mask tensor for the input (#batch, time).
-        cache : paddle.Tensor
-            Cache tensor of the input (#batch, time - 1, size).
+        Args:
+            x(Tensor): Input tensor (#batch, time, size).
+            mask(Tensor): Mask tensor for the input (#batch, time).
+            cache(Tensor, optional): Cache tensor of the input (#batch, time - 1, size). (Default value = None)

-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor (#batch, time, size).
-        paddle.Tensor
-            Mask tensor (#batch, time).
+        Returns:
+            Tensor: Output tensor (#batch, time, size).
+            Tensor: Mask tensor (#batch, time).
         """
         residual = x
         if self.normalize_before:
diff --git a/paddlespeech/t2s/modules/transformer/lightconv.py b/paddlespeech/t2s/modules/transformer/lightconv.py
index ccf84c8a..9bcc1acf 100644
--- a/paddlespeech/t2s/modules/transformer/lightconv.py
+++ b/paddlespeech/t2s/modules/transformer/lightconv.py
@@ -30,20 +30,13 @@ class LightweightConvolution(nn.Layer):
     This implementation is based on
     https://github.com/pytorch/fairseq/tree/master/fairseq
-    Parameters
-    ----------
-    wshare : int
-        the number of kernel of convolution
-    n_feat : int
-        the number of features
-    dropout_rate : float
-        dropout_rate
-    kernel_size : int
-        kernel size (length)
-    use_kernel_mask : bool
-        Use causal mask or not for convolution kernel
-    use_bias : bool
-        Use bias term or not.
+    Args:
+        wshare (int): the number of kernel of convolution
+        n_feat (int): the number of features
+        dropout_rate (float): dropout rate
+        kernel_size (int): kernel size (length)
+        use_kernel_mask (bool): Use causal mask or not for convolution kernel
+        use_bias (bool): Use bias term or not.
     """

@@ -100,21 +93,14 @@ class LightweightConvolution(nn.Layer):
         This function takes query, key and value but uses only query.
         This is just for compatibility with self-attention layer (attention.py)
-        Parameters
-        ----------
-        query : paddle.Tensor
-            (batch, time1, d_model) input tensor
-        key : paddle.Tensor
-            (batch, time2, d_model) NOT USED
-        value : paddle.Tensor
-            (batch, time2, d_model) NOT USED
-        mask : paddle.Tensor
-            (batch, time1, time2) mask
-
-        Return
-        ----------
-        x : paddle.Tensor
-            (batch, time1, d_model) ouput
+        Args:
+            query (Tensor): input tensor. (batch, time1, d_model)
+            key (Tensor): NOT USED. (batch, time2, d_model)
+            value (Tensor): NOT USED. (batch, time2, d_model)
+            mask (Tensor): (batch, time1, time2) mask
+
+        Returns:
+            Tensor: output. (batch, time1, d_model)
         """

         # linear -> GLU -> lightconv -> linear
diff --git a/paddlespeech/t2s/modules/transformer/mask.py b/paddlespeech/t2s/modules/transformer/mask.py
index fd97b004..c10e6add 100644
--- a/paddlespeech/t2s/modules/transformer/mask.py
+++ b/paddlespeech/t2s/modules/transformer/mask.py
@@ -17,19 +17,16 @@ import paddle
 def subsequent_mask(size, dtype=paddle.bool):
     """Create mask for subsequent steps (size, size).
-    Parameters
-    ----------
-    size : int
-        size of mask
-    dtype : paddle.dtype
-        result dtype
-    Return
-    ----------
-    paddle.Tensor
-    >>> subsequent_mask(3)
-    [[1, 0, 0],
-     [1, 1, 0],
-     [1, 1, 1]]
+
+    Args:
+        size (int): size of mask
+        dtype (paddle.dtype): result dtype
+    Returns:
+        Tensor:
+            >>> subsequent_mask(3)
+            [[1, 0, 0],
+             [1, 1, 0],
+             [1, 1, 1]]
     """
     ret = paddle.ones([size, size], dtype=dtype)
     return paddle.tril(ret)
@@ -37,19 +34,13 @@ def subsequent_mask(size, dtype=paddle.bool):
 def target_mask(ys_in_pad, ignore_id, dtype=paddle.bool):
     """Create mask for decoder self-attention.
-    Parameters
-    ----------
-    ys_pad : paddle.Tensor
-        batch of padded target sequences (B, Lmax)
-    ignore_id : int
-        index of padding
-    dtype : torch.dtype
-        result dtype
-    Return
-    ----------
-    paddle.Tensor
-        (B, Lmax, Lmax)
+
+    Args:
+        ys_in_pad (Tensor): batch of padded target sequences (B, Lmax)
+        ignore_id (int): index of padding
+        dtype (paddle.dtype): result dtype
+    Returns:
+        Tensor: (B, Lmax, Lmax)
     """
     ys_mask = ys_in_pad != ignore_id
     m = subsequent_mask(ys_mask.shape[-1]).unsqueeze(0)
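A quick check of the two mask helpers just reformatted (the padding index 0 and the toy ids are illustrative):

    import paddle

    # subsequent_mask: lower-triangular causal mask, exactly as implemented above.
    size = 3
    causal = paddle.tril(paddle.ones([size, size], dtype=paddle.bool))
    # [[1, 0, 0], [1, 1, 0], [1, 1, 1]] after casting to int.

    # target_mask combines it with a padding mask, as the context lines show.
    ys_in_pad = paddle.to_tensor([[5, 7, 0], [2, 0, 0]])          # 0 used as padding here
    pad_mask = (ys_in_pad != 0).unsqueeze(-2)                     # (B, 1, Lmax)
    dec_mask = paddle.logical_and(pad_mask, causal.unsqueeze(0))  # (B, Lmax, Lmax)
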
""" x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py index 28ed1c31..92af6851 100644 --- a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py +++ b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py @@ -20,14 +20,10 @@ from paddle import nn class PositionwiseFeedForward(nn.Layer): """Positionwise feed forward layer. - Parameters - ---------- - idim : int - Input dimenstion. - hidden_units : int - The number of hidden units. - dropout_rate : float - Dropout rate. + Args: + idim (int): Input dimenstion. + hidden_units (int): The number of hidden units. + dropout_rate (float): Dropout rate. """ def __init__(self, diff --git a/paddlespeech/t2s/modules/transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py index f738b556..2073a78b 100644 --- a/paddlespeech/t2s/modules/transformer/repeat.py +++ b/paddlespeech/t2s/modules/transformer/repeat.py @@ -29,16 +29,11 @@ class MultiSequential(paddle.nn.Sequential): def repeat(N, fn): """Repeat module N times. - Parameters - ---------- - N : int - Number of repeat time. - fn : Callable - Function to generate module. + Args: + N (int): Number of repeat time. + fn (Callable): Function to generate module. - Returns - ---------- - MultiSequential - Repeated model instance. + Returns: + MultiSequential: Repeated model instance. """ return MultiSequential(*[fn(n) for n in range(N)]) diff --git a/paddlespeech/t2s/modules/transformer/subsampling.py b/paddlespeech/t2s/modules/transformer/subsampling.py index cf0fca8a..07439705 100644 --- a/paddlespeech/t2s/modules/transformer/subsampling.py +++ b/paddlespeech/t2s/modules/transformer/subsampling.py @@ -21,16 +21,12 @@ from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding class Conv2dSubsampling(nn.Layer): """Convolutional 2D subsampling (to 1/4 length). - Parameters - ---------- - idim : int - Input dimension. - odim : int - Output dimension. - dropout_rate : float - Dropout rate. - pos_enc : nn.Layer - Custom position encoding layer. + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + pos_enc (nn.Layer): Custom position encoding layer. """ def __init__(self, idim, odim, dropout_rate, pos_enc=None): @@ -48,20 +44,12 @@ class Conv2dSubsampling(nn.Layer): def forward(self, x, x_mask): """Subsample x. - Parameters - ---------- - x : paddle.Tensor - Input tensor (#batch, time, idim). - x_mask : paddle.Tensor - Input mask (#batch, 1, time). - Returns - ---------- - paddle.Tensor - Subsampled tensor (#batch, time', odim), - where time' = time // 4. - paddle.Tensor - Subsampled mask (#batch, 1, time'), - where time' = time // 4. + Args: + x (Tensor): Input tensor (#batch, time, idim). + x_mask (Tensor): Input mask (#batch, 1, time). + Returns: + Tensor: Subsampled tensor (#batch, time', odim), where time' = time // 4. + Tensor: Subsampled mask (#batch, 1, time'), where time' = time // 4. """ # (b, c, t, f) x = x.unsqueeze(1) diff --git a/paddlespeech/t2s/modules/upsample.py b/paddlespeech/t2s/modules/upsample.py index 82e30414..65e78a89 100644 --- a/paddlespeech/t2s/modules/upsample.py +++ b/paddlespeech/t2s/modules/upsample.py @@ -27,17 +27,12 @@ class Stretch2D(nn.Layer): def __init__(self, w_scale: int, h_scale: int, mode: str="nearest"): """Strech an image (or image-like object) with some interpolation. 
         """
         # (b, c, t, f)
         x = x.unsqueeze(1)
diff --git a/paddlespeech/t2s/modules/upsample.py b/paddlespeech/t2s/modules/upsample.py
index 82e30414..65e78a89 100644
--- a/paddlespeech/t2s/modules/upsample.py
+++ b/paddlespeech/t2s/modules/upsample.py
@@ -27,17 +27,12 @@ class Stretch2D(nn.Layer):
     def __init__(self, w_scale: int, h_scale: int, mode: str="nearest"):
         """Strech an image (or image-like object) with some interpolation.

-        Parameters
-        ----------
-        w_scale : int
-            Scalar of width.
-        h_scale : int
-            Scalar of the height.
-        mode : str, optional
-            Interpolation mode, modes suppored are "nearest", "bilinear",
-            "trilinear", "bicubic", "linear" and "area",by default "nearest"
-
-            For more details about interpolation, see
+        Args:
+            w_scale (int): Scale factor for the width.
+            h_scale (int): Scale factor for the height.
+            mode (str, optional): Interpolation mode; supported modes are "nearest", "bilinear",
+                "trilinear", "bicubic", "linear" and "area", by default "nearest".
+            For more details about interpolation, see
             `paddle.nn.functional.interpolate `_.
         """
         super().__init__()
@@ -47,16 +42,14 @@ class Stretch2D(nn.Layer):
     def forward(self, x):
         """
-        Parameters
-        ----------
-        x : Tensor
-            Shape (N, C, H, W)
-
-        Returns
-        -------
-        Tensor
-            Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``.
-            The stretched image.
+
+        Args:
+            x (Tensor): Shape (N, C, H, W).
+
+        Returns:
+            Tensor: The stretched image.
+                Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``.
+
         """
         out = F.interpolate(
             x, scale_factor=(self.h_scale, self.w_scale), mode=self.mode)
@@ -67,26 +60,16 @@ class UpsampleNet(nn.Layer):
     """A Layer to upsample spectrogram by applying consecutive stretch and
     convolutions.

-    Parameters
-    ----------
-    upsample_scales : List[int]
-        Upsampling factors for each strech.
-    nonlinear_activation : Optional[str], optional
-        Activation after each convolution, by default None
-    nonlinear_activation_params : Dict[str, Any], optional
-        Parameters passed to construct the activation, by default {}
-    interpolate_mode : str, optional
-        Interpolation mode of the strech, by default "nearest"
-    freq_axis_kernel_size : int, optional
-        Convolution kernel size along the frequency axis, by default 1
-    use_causal_conv : bool, optional
-        Whether to use causal padding before convolution, by default False
-
-        If True, Causal padding is used along the time axis, i.e. padding
-        amount is ``receptive field - 1`` and 0 for before and after,
-        respectively.
-
-        If False, "same" padding is used along the time axis.
+    Args:
+        upsample_scales (List[int]): Upsampling factors for each stretch.
+        nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None
+        nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {}
+        interpolate_mode (str, optional): Interpolation mode of the stretch, by default "nearest"
+        freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1
+        use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False
+            If True, causal padding is used along the time axis,
+            i.e. padding amount is ``receptive field - 1`` and 0 for before and after, respectively.
+            If False, "same" padding is used along the time axis.
     """

     def __init__(self,
@@ -122,16 +105,12 @@ class UpsampleNet(nn.Layer):
     def forward(self, c):
         """
-        Parameters
-        ----------
-        c : Tensor
-            Shape (N, F, T), spectrogram
-
-        Returns
-        -------
-        Tensor
-            Shape (N, F, T'), where ``T' = upsample_factor * T``, upsampled
-            spectrogram
+        Args:
+            c (Tensor): Spectrogram. Shape (N, F, T).
+
+        Returns:
+            Tensor: Upsampled spectrogram.
+                Shape (N, F, T'), where ``T' = upsample_factor * T``.
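+
+        Examples:
+            An illustrative configuration (the remaining constructor arguments
+            keep their defaults); the total upsampling factor here is 256:
+
+            >>> import paddle
+            >>> net = UpsampleNet(upsample_scales=[4, 4, 4, 4])
+            >>> net(paddle.randn([1, 80, 100])).shape   # (N, F, T'), T' = 256 * T
+            [1, 80, 25600]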
         """
         c = c.unsqueeze(1)
         for f in self.up_layers:
@@ -145,35 +124,22 @@ class UpsampleNet(nn.Layer):
 class ConvInUpsampleNet(nn.Layer):
     """A Layer to upsample spectrogram composed of a convolution and an
     UpsampleNet.

-    Parameters
-    ----------
-    upsample_scales : List[int]
-        Upsampling factors for each strech.
-    nonlinear_activation : Optional[str], optional
-        Activation after each convolution, by default None
-    nonlinear_activation_params : Dict[str, Any], optional
-        Parameters passed to construct the activation, by default {}
-    interpolate_mode : str, optional
-        Interpolation mode of the strech, by default "nearest"
-    freq_axis_kernel_size : int, optional
-        Convolution kernel size along the frequency axis, by default 1
-    aux_channels : int, optional
-        Feature size of the input, by default 80
-    aux_context_window : int, optional
-        Context window of the first 1D convolution applied to the input. It
-        related to the kernel size of the convolution, by default 0
-
-        If use causal convolution, the kernel size is ``window + 1``, else
-        the kernel size is ``2 * window + 1``.
-    use_causal_conv : bool, optional
-        Whether to use causal padding before convolution, by default False
-
-        If True, Causal padding is used along the time axis, i.e. padding
-        amount is ``receptive field - 1`` and 0 for before and after,
-        respectively.
-
-        If False, "same" padding is used along the time axis.
+
+    Args:
+        upsample_scales (List[int]): Upsampling factors for each stretch.
+        nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None
+        nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {}
+        interpolate_mode (str, optional): Interpolation mode of the stretch, by default "nearest"
+        freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1
+        aux_channels (int, optional): Feature size of the input, by default 80
+        aux_context_window (int, optional): Context window of the first 1D convolution applied to the input. It
+            is related to the kernel size of the convolution, by default 0
+            If causal convolution is used, the kernel size is ``window + 1``,
+            else the kernel size is ``2 * window + 1``.
+        use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False
+            If True, causal padding is used along the time axis, i.e. padding
+            amount is ``receptive field - 1`` and 0 for before and after, respectively.
+            If False, "same" padding is used along the time axis.
     """

     def __init__(self,
@@ -204,16 +170,11 @@ class ConvInUpsampleNet(nn.Layer):
     def forward(self, c):
         """
-        Parameters
-        ----------
-        c : Tensor
-            Shape (N, F, T), spectrogram
-
-        Returns
-        -------
-        Tensors
-            Shape (N, F, T'), where ``T' = upsample_factor * T``, upsampled
-            spectrogram
+        Args:
+            c (Tensor): Spectrogram. Shape (N, F, T).
+
+        Returns:
+            Tensor: Upsampled spectrogram. Shape (N, F, T'), where ``T' = upsample_factor * T``.
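+
+        Examples:
+            Analogous to ``UpsampleNet``; shapes are illustrative and assume
+            the defaults ``aux_channels=80`` and ``aux_context_window=0``:
+
+            >>> import paddle
+            >>> net = ConvInUpsampleNet(upsample_scales=[4, 4, 4, 4])
+            >>> net(paddle.randn([1, 80, 100])).shape   # T' = 256 * T
+            [1, 80, 25600]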
         """
         c_ = self.conv_in(c)
         c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_
diff --git a/paddlespeech/t2s/training/experiment.py b/paddlespeech/t2s/training/experiment.py
index de36db24..05a363ff 100644
--- a/paddlespeech/t2s/training/experiment.py
+++ b/paddlespeech/t2s/training/experiment.py
@@ -57,35 +57,30 @@ class ExperimentBase(object):
     Feel free to add/overwrite other methods and standalone functions if you
     need.

-    Parameters
-    ----------
-    config: yacs.config.CfgNode
-        The configuration used for the experiment.
-
-    args: argparse.Namespace
-        The parsed command line arguments.
-
-    Examples
-    --------
-    >>> def main_sp(config, args):
-    >>>     exp = Experiment(config, args)
-    >>>     exp.setup()
-    >>>     exe.resume_or_load()
-    >>>     exp.run()
-    >>>
-    >>> config = get_cfg_defaults()
-    >>> parser = default_argument_parser()
-    >>> args = parser.parse_args()
-    >>> if args.config:
-    >>>     config.merge_from_file(args.config)
-    >>> if args.opts:
-    >>>     config.merge_from_list(args.opts)
-    >>> config.freeze()
-    >>>
-    >>> if args.ngpu > 1:
-    >>>     dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
-    >>> else:
-    >>>     main_sp(config, args)
+    Args:
+        config (yacs.config.CfgNode): The configuration used for the experiment.
+        args (argparse.Namespace): The parsed command line arguments.
+
+    Examples:
+        >>> def main_sp(config, args):
+        >>>     exp = Experiment(config, args)
+        >>>     exp.setup()
+        >>>     exp.resume_or_load()
+        >>>     exp.run()
+        >>>
+        >>> config = get_cfg_defaults()
+        >>> parser = default_argument_parser()
+        >>> args = parser.parse_args()
+        >>> if args.config:
+        >>>     config.merge_from_file(args.config)
+        >>> if args.opts:
+        >>>     config.merge_from_list(args.opts)
+        >>> config.freeze()
+        >>>
+        >>> if args.ngpu > 1:
+        >>>     dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
+        >>> else:
+        >>>     main_sp(config, args)
     """

     def __init__(self, config, args):
diff --git a/paddlespeech/t2s/training/extensions/snapshot.py b/paddlespeech/t2s/training/extensions/snapshot.py
index 3a86556b..5f8d3c45 100644
--- a/paddlespeech/t2s/training/extensions/snapshot.py
+++ b/paddlespeech/t2s/training/extensions/snapshot.py
@@ -43,10 +43,8 @@ class Snapshot(extension.Extension):
     parameters and optimizer states. If the updater inside the trainer
     subclasses StandardUpdater, everything is good to go.

-    Parameters
-    ----------
-    checkpoint_dir : Union[str, Path]
-        The directory to save checkpoints into.
+    Args:
+        checkpoint_dir (Union[str, Path]): The directory to save checkpoints into.
     """

     trigger = (1, 'epoch')
diff --git a/paddlespeech/t2s/utils/error_rate.py b/paddlespeech/t2s/utils/error_rate.py
index 7a9fe5ad..41b13b75 100644
--- a/paddlespeech/t2s/utils/error_rate.py
+++ b/paddlespeech/t2s/utils/error_rate.py
@@ -70,21 +70,14 @@ def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
     """Compute the levenshtein distance between reference sequence and
     hypothesis sequence in word-level.

-    Parameters
-    ----------
-    reference : str
-        The reference sentence.
-    hypothesis : str
-        The hypothesis sentence.
-    ignore_case : bool
-        Whether case-sensitive or not.
-    delimiter : char(str)
-        Delimiter of input sentences.
-
-    Returns
-    ----------
-    list
-        Levenshtein distance and word number of reference sentence.
+    Args:
+        reference (str): The reference sentence.
+        hypothesis (str): The hypothesis sentence.
+        ignore_case (bool): Whether to ignore case in the comparison.
+        delimiter (str): Delimiter of input sentences.
+
+    Returns:
+        tuple: Levenshtein distance and word number of reference sentence.
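+
+    Examples:
+        An illustrative call (one substituted word out of two reference words):
+
+        >>> dist, ref_words = word_errors("hello world", "hello word")
+        >>> (dist, ref_words)
+        (1.0, 2)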
     """
     if ignore_case:
         reference = reference.lower()
@@ -101,21 +94,14 @@ def char_errors(reference, hypothesis, ignore_case=False, remove_space=False):
     """Compute the levenshtein distance between reference sequence and
     hypothesis sequence in char-level.

-    Parameters
-    ----------
-    reference: str
-        The reference sentence.
-    hypothesis: str
-        The hypothesis sentence.
-    ignore_case: bool
-        Whether case-sensitive or not.
-    remove_space: bool
-        Whether remove internal space characters
-
-    Returns
-    ----------
-    list
-        Levenshtein distance and length of reference sentence.
+    Args:
+        reference (str): The reference sentence.
+        hypothesis (str): The hypothesis sentence.
+        ignore_case (bool): Whether to ignore case in the comparison.
+        remove_space (bool): Whether to remove internal space characters.
+
+    Returns:
+        tuple: Levenshtein distance and length of reference sentence.
     """
     if ignore_case:
         reference = reference.lower()
@@ -146,27 +132,17 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
     We can use levenshtein distance to calculate WER. Please draw an attention
     that empty items will be removed when splitting sentences by delimiter.

-    Parameters
-    ----------
-    reference: str
-        The reference sentence.
-
-    hypothesis: str
-        The hypothesis sentence.
-    ignore_case: bool
-        Whether case-sensitive or not.
-    delimiter: char
-        Delimiter of input sentences.
-
-    Returns
-    ----------
-    float
-        Word error rate.
-
-    Raises
-    ----------
-    ValueError
-        If word number of reference is zero.
+    Args:
+        reference (str): The reference sentence.
+        hypothesis (str): The hypothesis sentence.
+        ignore_case (bool): Whether to ignore case in the comparison.
+        delimiter (str): Delimiter of input sentences.
+
+    Returns:
+        float: Word error rate.
+
+    Raises:
+        ValueError: If word number of reference is zero.
     """
     edit_distance, ref_len = word_errors(reference, hypothesis, ignore_case,
                                          delimiter)
@@ -194,26 +170,17 @@ def cer(reference, hypothesis, ignore_case=False, remove_space=False):
     space characters will be truncated and multiple consecutive space
     characters in a sentence will be replaced by one space character.

-    Parameters
-    ----------
-    reference: str
-        The reference sentence.
-    hypothesis: str
-        The hypothesis sentence.
-    ignore_case: bool
-        Whether case-sensitive or not.
-    remove_space: bool
-        Whether remove internal space characters
-
-    Returns
-    ----------
-    float
-        Character error rate.
-
-    Raises
-    ----------
-    ValueError
-        If the reference length is zero.
+    Args:
+        reference (str): The reference sentence.
+        hypothesis (str): The hypothesis sentence.
+        ignore_case (bool): Whether to ignore case in the comparison.
+        remove_space (bool): Whether to remove internal space characters.
+
+    Returns:
+        float: Character error rate.
+
+    Raises:
+        ValueError: If the reference length is zero.
     """
     edit_distance, ref_len = char_errors(reference, hypothesis, ignore_case,
                                          remove_space)
diff --git a/paddlespeech/t2s/utils/h5_utils.py b/paddlespeech/t2s/utils/h5_utils.py
index d0e277db..75c2e448 100644
--- a/paddlespeech/t2s/utils/h5_utils.py
+++ b/paddlespeech/t2s/utils/h5_utils.py
@@ -23,18 +23,12 @@ import numpy as np
 def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any:
     """Read a dataset from a HDF5 file.
+    Args:
+        filename (Union[Path, str]): Path of the HDF5 file.
+        dataset_name (str): Name of the dataset to read.

-    Parameters
-    ----------
-    filename : Union[Path, str]
-        Path of the HDF5 file.
-    dataset_name : str
-        Name of the dataset to read.
-
-    Returns
-    -------
-    Any
-        The retrieved dataset.
+    Returns:
+        Any: The retrieved dataset.
     """
     filename = Path(filename)
@@ -60,17 +54,11 @@ def write_hdf5(filename: Union[Path, str],
                write_data: np.ndarray,
                is_overwrite: bool=True) -> None:
     """Write dataset to HDF5 file.
-
-    Parameters
-    ----------
-    filename : Union[Path, str]
-        Path of the HDF5 file.
-    dataset_name : str
-        Name of the dataset to write to.
-    write_data : np.ndarrays
-        The data to write.
-    is_overwrite : bool, optional
-        Whether to overwrite, by default True
+    Args:
+        filename (Union[Path, str]): Path of the HDF5 file.
+        dataset_name (str): Name of the dataset to write to.
+        write_data (np.ndarray): The data to write.
+        is_overwrite (bool, optional): Whether to overwrite an existing dataset, by default True.
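+
+    Examples:
+        An illustrative round-trip; the path and dataset name are hypothetical:
+
+        >>> data = np.random.randn(10, 80).astype(np.float32)
+        >>> write_hdf5("feats.h5", "log_mel", data)
+        >>> read_hdf5("feats.h5", "log_mel").shape
+        (10, 80)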
     """
     # convert to numpy array
     filename = Path(filename)