[doc]update api docs (#2406)

* update api docs, test=doc
pull/2422/head
TianYuan 2 years ago committed by GitHub
parent e6cbcca3e2
commit 5e714ecb4a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -20,4 +20,7 @@ Subpackages
paddlespeech.audio.io
paddlespeech.audio.metric
paddlespeech.audio.sox_effects
paddlespeech.audio.streamdata
paddlespeech.audio.text
paddlespeech.audio.transform
paddlespeech.audio.utils

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.autodecode module
===============================================
.. automodule:: paddlespeech.audio.streamdata.autodecode
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.cache module
==========================================
.. automodule:: paddlespeech.audio.streamdata.cache
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.compat module
===========================================
.. automodule:: paddlespeech.audio.streamdata.compat
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.extradatasets module
==================================================
.. automodule:: paddlespeech.audio.streamdata.extradatasets
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.filters module
============================================
.. automodule:: paddlespeech.audio.streamdata.filters
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.gopen module
==========================================
.. automodule:: paddlespeech.audio.streamdata.gopen
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.handlers module
=============================================
.. automodule:: paddlespeech.audio.streamdata.handlers
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.mix module
========================================
.. automodule:: paddlespeech.audio.streamdata.mix
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.paddle\_utils module
==================================================
.. automodule:: paddlespeech.audio.streamdata.paddle_utils
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.pipeline module
=============================================
.. automodule:: paddlespeech.audio.streamdata.pipeline
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,28 @@
paddlespeech.audio.streamdata package
=====================================
.. automodule:: paddlespeech.audio.streamdata
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.audio.streamdata.autodecode
paddlespeech.audio.streamdata.cache
paddlespeech.audio.streamdata.compat
paddlespeech.audio.streamdata.extradatasets
paddlespeech.audio.streamdata.filters
paddlespeech.audio.streamdata.gopen
paddlespeech.audio.streamdata.handlers
paddlespeech.audio.streamdata.mix
paddlespeech.audio.streamdata.paddle_utils
paddlespeech.audio.streamdata.pipeline
paddlespeech.audio.streamdata.shardlists
paddlespeech.audio.streamdata.tariterators
paddlespeech.audio.streamdata.utils
paddlespeech.audio.streamdata.writer

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.shardlists module
===============================================
.. automodule:: paddlespeech.audio.streamdata.shardlists
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.tariterators module
=================================================
.. automodule:: paddlespeech.audio.streamdata.tariterators
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.utils module
==========================================
.. automodule:: paddlespeech.audio.streamdata.utils
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.writer module
===========================================
.. automodule:: paddlespeech.audio.streamdata.writer
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,16 @@
paddlespeech.audio.text package
===============================
.. automodule:: paddlespeech.audio.text
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.audio.text.text_featurizer
paddlespeech.audio.text.utility

@ -0,0 +1,7 @@
paddlespeech.audio.text.text\_featurizer module
===============================================
.. automodule:: paddlespeech.audio.text.text_featurizer
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.text.utility module
======================================
.. automodule:: paddlespeech.audio.text.utility
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.add\_deltas module
===============================================
.. automodule:: paddlespeech.audio.transform.add_deltas
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.channel\_selector module
=====================================================
.. automodule:: paddlespeech.audio.transform.channel_selector
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.cmvn module
========================================
.. automodule:: paddlespeech.audio.transform.cmvn
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.functional module
==============================================
.. automodule:: paddlespeech.audio.transform.functional
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.perturb module
===========================================
.. automodule:: paddlespeech.audio.transform.perturb
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,24 @@
paddlespeech.audio.transform package
====================================
.. automodule:: paddlespeech.audio.transform
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.audio.transform.add_deltas
paddlespeech.audio.transform.channel_selector
paddlespeech.audio.transform.cmvn
paddlespeech.audio.transform.functional
paddlespeech.audio.transform.perturb
paddlespeech.audio.transform.spec_augment
paddlespeech.audio.transform.spectrogram
paddlespeech.audio.transform.transform_interface
paddlespeech.audio.transform.transformation
paddlespeech.audio.transform.wpe

@ -0,0 +1,7 @@
paddlespeech.audio.transform.spec\_augment module
=================================================
.. automodule:: paddlespeech.audio.transform.spec_augment
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.spectrogram module
===============================================
.. automodule:: paddlespeech.audio.transform.spectrogram
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.transform\_interface module
========================================================
.. automodule:: paddlespeech.audio.transform.transform_interface
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.transformation module
==================================================
.. automodule:: paddlespeech.audio.transform.transformation
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.wpe module
=======================================
.. automodule:: paddlespeech.audio.transform.wpe
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.utils.check\_kwargs module
=============================================
.. automodule:: paddlespeech.audio.utils.check_kwargs
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.utils.dynamic\_import module
===============================================
.. automodule:: paddlespeech.audio.utils.dynamic_import
:members:
:undoc-members:
:show-inheritance:

@ -12,8 +12,11 @@ Submodules
.. toctree::
:maxdepth: 4
paddlespeech.audio.utils.check_kwargs
paddlespeech.audio.utils.download
paddlespeech.audio.utils.dynamic_import
paddlespeech.audio.utils.error
paddlespeech.audio.utils.log
paddlespeech.audio.utils.numeric
paddlespeech.audio.utils.tensor_utils
paddlespeech.audio.utils.time

@ -0,0 +1,7 @@
paddlespeech.audio.utils.tensor\_utils module
=============================================
.. automodule:: paddlespeech.audio.utils.tensor_utils
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.collate module
=========================================
.. automodule:: paddlespeech.kws.exps.mdtc.collate
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.compute\_det module
==============================================
.. automodule:: paddlespeech.kws.exps.mdtc.compute_det
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.plot\_det\_curve module
==================================================
.. automodule:: paddlespeech.kws.exps.mdtc.plot_det_curve
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,19 @@
paddlespeech.kws.exps.mdtc package
==================================
.. automodule:: paddlespeech.kws.exps.mdtc
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.kws.exps.mdtc.collate
paddlespeech.kws.exps.mdtc.compute_det
paddlespeech.kws.exps.mdtc.plot_det_curve
paddlespeech.kws.exps.mdtc.score
paddlespeech.kws.exps.mdtc.train

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.score module
=======================================
.. automodule:: paddlespeech.kws.exps.mdtc.score
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.train module
=======================================
.. automodule:: paddlespeech.kws.exps.mdtc.train
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,15 @@
paddlespeech.kws.exps package
=============================
.. automodule:: paddlespeech.kws.exps
:members:
:undoc-members:
:show-inheritance:
Subpackages
-----------
.. toctree::
:maxdepth: 4
paddlespeech.kws.exps.mdtc

@ -12,4 +12,5 @@ Subpackages
.. toctree::
:maxdepth: 4
paddlespeech.kws.exps
paddlespeech.kws.models

@ -0,0 +1,7 @@
paddlespeech.resource.model\_alias module
=========================================
.. automodule:: paddlespeech.resource.model_alias
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.resource.pretrained\_models module
===============================================
.. automodule:: paddlespeech.resource.pretrained_models
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.resource.resource module
=====================================
.. automodule:: paddlespeech.resource.resource
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,17 @@
paddlespeech.resource package
=============================
.. automodule:: paddlespeech.resource
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.resource.model_alias
paddlespeech.resource.pretrained_models
paddlespeech.resource.resource

@ -16,8 +16,10 @@ Subpackages
paddlespeech.cli
paddlespeech.cls
paddlespeech.kws
paddlespeech.resource
paddlespeech.s2t
paddlespeech.server
paddlespeech.t2s
paddlespeech.text
paddlespeech.utils
paddlespeech.vector

@ -19,5 +19,4 @@ Subpackages
paddlespeech.s2t.models
paddlespeech.s2t.modules
paddlespeech.s2t.training
paddlespeech.s2t.transform
paddlespeech.s2t.utils

@ -18,7 +18,6 @@ Submodules
paddlespeech.server.utils.config
paddlespeech.server.utils.errors
paddlespeech.server.utils.exception
paddlespeech.server.utils.log
paddlespeech.server.utils.onnx_infer
paddlespeech.server.utils.paddle_predictor
paddlespeech.server.utils.util

@ -19,4 +19,5 @@ Submodules
paddlespeech.t2s.datasets.get_feats
paddlespeech.t2s.datasets.ljspeech
paddlespeech.t2s.datasets.preprocess_utils
paddlespeech.t2s.datasets.sampler
paddlespeech.t2s.datasets.vocoder_batch_fn

@ -0,0 +1,7 @@
paddlespeech.t2s.datasets.sampler module
========================================
.. automodule:: paddlespeech.t2s.datasets.sampler
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.align module
=============================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.align
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.normalize module
=================================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.normalize
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.preprocess module
==================================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.preprocess
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,21 @@
paddlespeech.t2s.exps.ernie\_sat package
========================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.t2s.exps.ernie_sat.align
paddlespeech.t2s.exps.ernie_sat.normalize
paddlespeech.t2s.exps.ernie_sat.preprocess
paddlespeech.t2s.exps.ernie_sat.synthesize
paddlespeech.t2s.exps.ernie_sat.synthesize_e2e
paddlespeech.t2s.exps.ernie_sat.train
paddlespeech.t2s.exps.ernie_sat.utils

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.synthesize module
==================================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.synthesize\_e2e module
=======================================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize_e2e
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.train module
=============================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.train
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.utils module
=============================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.utils
:members:
:undoc-members:
:show-inheritance:

@ -16,3 +16,4 @@ Submodules
paddlespeech.t2s.exps.fastspeech2.normalize
paddlespeech.t2s.exps.fastspeech2.preprocess
paddlespeech.t2s.exps.fastspeech2.train
paddlespeech.t2s.exps.fastspeech2.vc2_infer

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.fastspeech2.vc2\_infer module
===================================================
.. automodule:: paddlespeech.t2s.exps.fastspeech2.vc2_infer
:members:
:undoc-members:
:show-inheritance:

@ -12,11 +12,13 @@ Subpackages
.. toctree::
:maxdepth: 4
paddlespeech.t2s.exps.ernie_sat
paddlespeech.t2s.exps.fastspeech2
paddlespeech.t2s.exps.gan_vocoder
paddlespeech.t2s.exps.speedyspeech
paddlespeech.t2s.exps.tacotron2
paddlespeech.t2s.exps.transformer_tts
paddlespeech.t2s.exps.vits
paddlespeech.t2s.exps.waveflow
paddlespeech.t2s.exps.wavernn
@ -31,6 +33,7 @@ Submodules
paddlespeech.t2s.exps.ort_predict
paddlespeech.t2s.exps.ort_predict_e2e
paddlespeech.t2s.exps.ort_predict_streaming
paddlespeech.t2s.exps.stream_play_tts
paddlespeech.t2s.exps.syn_utils
paddlespeech.t2s.exps.synthesize
paddlespeech.t2s.exps.synthesize_e2e

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.stream\_play\_tts module
==============================================
.. automodule:: paddlespeech.t2s.exps.stream_play_tts
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.normalize module
===========================================
.. automodule:: paddlespeech.t2s.exps.vits.normalize
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.preprocess module
============================================
.. automodule:: paddlespeech.t2s.exps.vits.preprocess
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,20 @@
paddlespeech.t2s.exps.vits package
==================================
.. automodule:: paddlespeech.t2s.exps.vits
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.t2s.exps.vits.normalize
paddlespeech.t2s.exps.vits.preprocess
paddlespeech.t2s.exps.vits.synthesize
paddlespeech.t2s.exps.vits.synthesize_e2e
paddlespeech.t2s.exps.vits.train
paddlespeech.t2s.exps.vits.voice_cloning

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.synthesize module
============================================
.. automodule:: paddlespeech.t2s.exps.vits.synthesize
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.synthesize\_e2e module
=================================================
.. automodule:: paddlespeech.t2s.exps.vits.synthesize_e2e
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.train module
=======================================
.. automodule:: paddlespeech.t2s.exps.vits.train
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.voice\_cloning module
================================================
.. automodule:: paddlespeech.t2s.exps.vits.voice_cloning
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.g2pw.dataset module
=============================================
.. automodule:: paddlespeech.t2s.frontend.g2pw.dataset
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.g2pw.onnx\_api module
===============================================
.. automodule:: paddlespeech.t2s.frontend.g2pw.onnx_api
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,17 @@
paddlespeech.t2s.frontend.g2pw package
======================================
.. automodule:: paddlespeech.t2s.frontend.g2pw
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.t2s.frontend.g2pw.dataset
paddlespeech.t2s.frontend.g2pw.onnx_api
paddlespeech.t2s.frontend.g2pw.utils

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.g2pw.utils module
===========================================
.. automodule:: paddlespeech.t2s.frontend.g2pw.utils
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.mix\_frontend module
==============================================
.. automodule:: paddlespeech.t2s.frontend.mix_frontend
:members:
:undoc-members:
:show-inheritance:

@ -12,6 +12,7 @@ Subpackages
.. toctree::
:maxdepth: 4
paddlespeech.t2s.frontend.g2pw
paddlespeech.t2s.frontend.normalizer
paddlespeech.t2s.frontend.zh_normalization
@ -23,6 +24,7 @@ Submodules
paddlespeech.t2s.frontend.arpabet
paddlespeech.t2s.frontend.generate_lexicon
paddlespeech.t2s.frontend.mix_frontend
paddlespeech.t2s.frontend.phonectic
paddlespeech.t2s.frontend.punctuation
paddlespeech.t2s.frontend.tone_sandhi

@ -0,0 +1,7 @@
paddlespeech.t2s.models.ernie\_sat.ernie\_sat module
====================================================
.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.models.ernie\_sat.ernie\_sat\_updater module
=============================================================
.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat_updater
:members:
:undoc-members:
:show-inheritance:

@ -12,4 +12,5 @@ Submodules
.. toctree::
:maxdepth: 4
paddlespeech.t2s.models.ernie_sat.mlm
paddlespeech.t2s.models.ernie_sat.ernie_sat
paddlespeech.t2s.models.ernie_sat.ernie_sat_updater

@ -0,0 +1,7 @@
paddlespeech.t2s.models.vits.monotonic\_align.core module
=========================================================
.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.core
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,16 @@
paddlespeech.t2s.models.vits.monotonic\_align package
=====================================================
.. automodule:: paddlespeech.t2s.models.vits.monotonic_align
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.t2s.models.vits.monotonic_align.core
paddlespeech.t2s.models.vits.monotonic_align.setup

@ -0,0 +1,7 @@
paddlespeech.t2s.models.vits.monotonic\_align.setup module
==========================================================
.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.setup
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.utils.dynamic\_import module
=========================================
.. automodule:: paddlespeech.utils.dynamic_import
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.utils.env module
=============================
.. automodule:: paddlespeech.utils.env
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,16 @@
paddlespeech.utils package
==========================
.. automodule:: paddlespeech.utils
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.utils.dynamic_import
paddlespeech.utils.env

@ -74,8 +74,10 @@ Contents
paddlespeech.cli <api/paddlespeech.cli>
paddlespeech.cls <api/paddlespeech.cls>
paddlespeech.kws <api/paddlespeech.kws>
paddlespeech.resource <api/paddlespeech.resource>
paddlespeech.s2t <api/paddlespeech.s2t>
paddlespeech.server <api/paddlespeech.server>
paddlespeech.t2s <api/paddlespeech.t2s>
paddlespeech.text <api/paddlespeech.text>
paddlespeech.utils <api/paddlespeech.utils>
paddlespeech.vector <api/paddlespeech.vector>

@ -71,31 +71,53 @@ class MLMEncoder(nn.Layer):
"""Conformer encoder module.
Args:
idim (int): Input dimension.
attention_dim (int): Dimension of attention.
attention_heads (int): The number of heads of multi head attention.
linear_units (int): The number of units of position-wise feed forward.
num_blocks (int): The number of decoder blocks.
dropout_rate (float): Dropout rate.
positional_dropout_rate (float): Dropout rate after adding positional encoding.
attention_dropout_rate (float): Dropout rate in attention.
input_layer (Union[str, paddle.nn.Layer]): Input layer type.
normalize_before (bool): Whether to use layer_norm before the first block.
concat_after (bool): Whether to concat attention layer's input and output.
idim (int):
Input dimension.
attention_dim (int):
Dimension of attention.
attention_heads (int):
The number of heads of multi head attention.
linear_units (int):
The number of units of position-wise feed forward.
num_blocks (int):
The number of encoder blocks.
dropout_rate (float):
Dropout rate.
positional_dropout_rate (float):
Dropout rate after adding positional encoding.
attention_dropout_rate (float):
Dropout rate in attention.
input_layer (Union[str, paddle.nn.Layer]):
Input layer type.
normalize_before (bool):
Whether to use layer_norm before the first block.
concat_after (bool):
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
macaron_style (bool): Whether to use macaron style for positionwise layer.
pos_enc_layer_type (str): Encoder positional encoding layer type.
selfattention_layer_type (str): Encoder attention layer type.
activation_type (str): Encoder activation function type.
use_cnn_module (bool): Whether to use convolution module.
zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel (int): Kernel size of convolution module.
padding_idx (int): Padding idx for input_layer=embed.
stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
positionwise_layer_type (str):
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size (int):
Kernel size of positionwise conv1d layer.
macaron_style (bool):
Whether to use macaron style for positionwise layer.
pos_enc_layer_type (str):
Encoder positional encoding layer type.
selfattention_layer_type (str):
Encoder attention layer type.
activation_type (str):
Encoder activation function type.
use_cnn_module (bool):
Whether to use convolution module.
zero_triu (bool):
Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel (int):
Kernel size of convolution module.
padding_idx (int):
Padding idx for input_layer=embed.
stochastic_depth_rate (float):
Maximum probability to skip the encoder layer.
"""
@ -320,12 +342,16 @@ class MLMDecoder(MLMEncoder):
"""Encode input sequence.
Args:
xs (paddle.Tensor): Input tensor (#batch, time, idim).
masks (paddle.Tensor): Mask tensor (#batch, time).
xs (paddle.Tensor):
Input tensor (#batch, time, idim).
masks (paddle.Tensor):
Mask tensor (#batch, time).
Returns:
paddle.Tensor: Output tensor (#batch, time, attention_dim).
paddle.Tensor: Mask tensor (#batch, time).
paddle.Tensor:
Output tensor (#batch, time, attention_dim).
paddle.Tensor:
Mask tensor (#batch, time).
"""
xs = self.embed(xs)
@ -392,19 +418,27 @@ class MLM(nn.Layer):
use_teacher_forcing: bool=True, ) -> List[paddle.Tensor]:
'''
Args:
speech (paddle.Tensor): input speech (1, Tmax, D).
text (paddle.Tensor): input text (1, Tmax2).
masked_pos (paddle.Tensor): masked position of input speech (1, Tmax)
speech_mask (paddle.Tensor): mask of speech (1, 1, Tmax).
text_mask (paddle.Tensor): mask of text (1, 1, Tmax2).
speech_seg_pos (paddle.Tensor): n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax).
text_seg_pos (paddle.Tensor): n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2).
span_bdy (List[int]): masked mel boundary of input speech (2,)
use_teacher_forcing (bool): whether to use teacher forcing
speech (paddle.Tensor):
input speech (1, Tmax, D).
text (paddle.Tensor):
input text (1, Tmax2).
masked_pos (paddle.Tensor):
masked position of input speech (1, Tmax)
speech_mask (paddle.Tensor):
mask of speech (1, 1, Tmax).
text_mask (paddle.Tensor):
mask of text (1, 1, Tmax2).
speech_seg_pos (paddle.Tensor):
n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax).
text_seg_pos (paddle.Tensor):
n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2).
span_bdy (List[int]):
masked mel boundary of input speech (2,)
use_teacher_forcing (bool):
whether to use teacher forcing
Returns:
List[Tensor]:
eg:
[Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])]
eg: [Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])]
'''
z_cache = None

@ -48,12 +48,18 @@ class StochasticDurationPredictor(nn.Layer):
global_channels: int=-1, ):
"""Initialize StochasticDurationPredictor module.
Args:
channels (int): Number of channels.
kernel_size (int): Kernel size.
dropout_rate (float): Dropout rate.
flows (int): Number of flows.
dds_conv_layers (int): Number of conv layers in DDS conv.
global_channels (int): Number of global conditioning channels.
channels (int):
Number of channels.
kernel_size (int):
Kernel size.
dropout_rate (float):
Dropout rate.
flows (int):
Number of flows.
dds_conv_layers (int):
Number of conv layers in DDS conv.
global_channels (int):
Number of global conditioning channels.
"""
super().__init__()
@ -108,14 +114,21 @@ class StochasticDurationPredictor(nn.Layer):
noise_scale: float=1.0, ) -> paddle.Tensor:
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, channels, T_text).
x_mask (Tensor): Mask tensor (B, 1, T_text).
w (Optional[Tensor]): Duration tensor (B, 1, T_text).
g (Optional[Tensor]): Global conditioning tensor (B, channels, 1)
inverse (bool): Whether to inverse the flow.
noise_scale (float): Noise scale value.
x (Tensor):
Input tensor (B, channels, T_text).
x_mask (Tensor):
Mask tensor (B, 1, T_text).
w (Optional[Tensor]):
Duration tensor (B, 1, T_text).
g (Optional[Tensor]):
Global conditioning tensor (B, channels, 1)
inverse (bool):
Whether to inverse the flow.
noise_scale (float):
Noise scale value.
Returns:
Tensor: If not inverse, negative log-likelihood (NLL) tensor (B,).
Tensor:
If not inverse, negative log-likelihood (NLL) tensor (B,).
If inverse, log-duration tensor (B, 1, T_text).
"""
# stop gradient

@ -34,11 +34,15 @@ class FlipFlow(nn.Layer):
) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, channels, T).
inverse (bool): Whether to inverse the flow.
x (Tensor):
Input tensor (B, channels, T).
inverse (bool):
Whether to inverse the flow.
Returns:
Tensor: Flipped tensor (B, channels, T).
Tensor: Log-determinant tensor for NLL (B,) if not inverse.
Tensor:
Flipped tensor (B, channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
"""
x = paddle.flip(x, [1])
if not inverse:
@ -60,13 +64,19 @@ class LogFlow(nn.Layer):
) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, channels, T).
x_mask (Tensor): Mask tensor (B, 1, T).
inverse (bool): Whether to inverse the flow.
eps (float): Epsilon for log.
x (Tensor):
Input tensor (B, channels, T).
x_mask (Tensor):
Mask tensor (B, 1, T).
inverse (bool):
Whether to inverse the flow.
eps (float):
Epsilon for log.
Returns:
Tensor: Output tensor (B, channels, T).
Tensor: Log-determinant tensor for NLL (B,) if not inverse.
Tensor:
Output tensor (B, channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
"""
if not inverse:
y = paddle.log(paddle.clip(x, min=eps)) * x_mask
@ -83,7 +93,8 @@ class ElementwiseAffineFlow(nn.Layer):
def __init__(self, channels: int):
"""Initialize ElementwiseAffineFlow module.
Args:
channels (int): Number of channels.
channels (int):
Number of channels.
"""
super().__init__()
self.channels = channels
@ -107,12 +118,17 @@ class ElementwiseAffineFlow(nn.Layer):
) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, channels, T).
x_mask (Tensor): Mask tensor (B, 1, T).
inverse (bool): Whether to inverse the flow.
x (Tensor):
Input tensor (B, channels, T).
x_mask (Tensor):
Mask tensor (B, 1, T).
inverse (bool):
Whether to inverse the flow.
Returns:
Tensor: Output tensor (B, channels, T).
Tensor: Log-determinant tensor for NLL (B,) if not inverse.
Tensor:
Output tensor (B, channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
"""
if not inverse:
y = self.m + paddle.exp(self.logs) * x
@ -157,11 +173,16 @@ class DilatedDepthSeparableConv(nn.Layer):
eps: float=1e-5, ):
"""Initialize DilatedDepthSeparableConv module.
Args:
channels (int): Number of channels.
kernel_size (int): Kernel size.
layers (int): Number of layers.
dropout_rate (float): Dropout rate.
eps (float): Epsilon for layer norm.
channels (int):
Number of channels.
kernel_size (int):
Kernel size.
layers (int):
Number of layers.
dropout_rate (float):
Dropout rate.
eps (float):
Epsilon for layer norm.
"""
super().__init__()
@ -198,11 +219,15 @@ class DilatedDepthSeparableConv(nn.Layer):
g: Optional[paddle.Tensor]=None) -> paddle.Tensor:
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, in_channels, T).
x_mask (Tensor): Mask tensor (B, 1, T).
g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1).
x (Tensor):
Input tensor (B, in_channels, T).
x_mask (Tensor):
Mask tensor (B, 1, T).
g (Optional[Tensor]):
Global conditioning tensor (B, global_channels, 1).
Returns:
Tensor: Output tensor (B, channels, T).
Tensor:
Output tensor (B, channels, T).
"""
if g is not None:
x = x + g
@ -225,12 +250,18 @@ class ConvFlow(nn.Layer):
tail_bound: float=5.0, ):
"""Initialize ConvFlow module.
Args:
in_channels (int): Number of input channels.
hidden_channels (int): Number of hidden channels.
kernel_size (int): Kernel size.
layers (int): Number of layers.
bins (int): Number of bins.
tail_bound (float): Tail bound value.
in_channels (int):
Number of input channels.
hidden_channels (int):
Number of hidden channels.
kernel_size (int):
Kernel size.
layers (int):
Number of layers.
bins (int):
Number of bins.
tail_bound (float):
Tail bound value.
"""
super().__init__()
self.half_channels = in_channels // 2
@ -275,13 +306,19 @@ class ConvFlow(nn.Layer):
) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, channels, T).
x_mask (Tensor): Mask tensor (B, 1, T).
g (Optional[Tensor]): Global conditioning tensor (B, channels, 1).
inverse (bool): Whether to inverse the flow.
x (Tensor):
Input tensor (B, channels, T).
x_mask (Tensor):
Mask tensor (B, 1, T).
g (Optional[Tensor]):
Global conditioning tensor (B, channels, 1).
inverse (bool):
Whether to inverse the flow.
Returns:
Tensor: Output tensor (B, channels, T).
Tensor: Log-determinant tensor for NLL (B,) if not inverse.
Tensor:
Output tensor (B, channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
"""
xa, xb = x.split(2, 1)
h = self.input_conv(xa)

@ -97,81 +97,104 @@ class VITSGenerator(nn.Layer):
stochastic_duration_predictor_dds_conv_layers: int=3, ):
"""Initialize VITS generator module.
Args:
vocabs (int): Input vocabulary size.
aux_channels (int): Number of acoustic feature channels.
hidden_channels (int): Number of hidden channels.
spks (Optional[int]): Number of speakers. If set to > 1, assume that the
vocabs (int):
Input vocabulary size.
aux_channels (int):
Number of acoustic feature channels.
hidden_channels (int):
Number of hidden channels.
spks (Optional[int]):
Number of speakers. If set to > 1, assume that the
sids will be provided as the input and use sid embedding layer.
langs (Optional[int]): Number of languages. If set to > 1, assume that the
langs (Optional[int]):
Number of languages. If set to > 1, assume that the
lids will be provided as the input and use lid embedding layer.
spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
spk_embed_dim (Optional[int]):
Speaker embedding dimension. If set to > 0,
assume that spembs will be provided as the input.
global_channels (int): Number of global conditioning channels.
segment_size (int): Segment size for decoder.
text_encoder_attention_heads (int): Number of heads in conformer block
of text encoder.
text_encoder_ffn_expand (int): Expansion ratio of FFN in conformer block
of text encoder.
text_encoder_blocks (int): Number of conformer blocks in text encoder.
text_encoder_positionwise_layer_type (str): Position-wise layer type in
conformer block of text encoder.
text_encoder_positionwise_conv_kernel_size (int): Position-wise convolution
kernel size in conformer block of text encoder. Only used when the
above layer type is conv1d or conv1d-linear.
text_encoder_positional_encoding_layer_type (str): Positional encoding layer
type in conformer block of text encoder.
text_encoder_self_attention_layer_type (str): Self-attention layer type in
conformer block of text encoder.
text_encoder_activation_type (str): Activation function type in conformer
block of text encoder.
text_encoder_normalize_before (bool): Whether to apply layer norm before
self-attention in conformer block of text encoder.
text_encoder_dropout_rate (float): Dropout rate in conformer block of
text encoder.
text_encoder_positional_dropout_rate (float): Dropout rate for positional
encoding in conformer block of text encoder.
text_encoder_attention_dropout_rate (float): Dropout rate for attention in
conformer block of text encoder.
text_encoder_conformer_kernel_size (int): Conformer conv kernel size. It
will be used when only use_conformer_conv_in_text_encoder = True.
use_macaron_style_in_text_encoder (bool): Whether to use macaron style FFN
in conformer block of text encoder.
use_conformer_conv_in_text_encoder (bool): Whether to use covolution in
conformer block of text encoder.
decoder_kernel_size (int): Decoder kernel size.
decoder_channels (int): Number of decoder initial channels.
decoder_upsample_scales (List[int]): List of upsampling scales in decoder.
decoder_upsample_kernel_sizes (List[int]): List of kernel size for
upsampling layers in decoder.
decoder_resblock_kernel_sizes (List[int]): List of kernel size for resblocks
in decoder.
decoder_resblock_dilations (List[List[int]]): List of list of dilations for
resblocks in decoder.
use_weight_norm_in_decoder (bool): Whether to apply weight normalization in
decoder.
posterior_encoder_kernel_size (int): Posterior encoder kernel size.
posterior_encoder_layers (int): Number of layers of posterior encoder.
posterior_encoder_stacks (int): Number of stacks of posterior encoder.
posterior_encoder_base_dilation (int): Base dilation of posterior encoder.
posterior_encoder_dropout_rate (float): Dropout rate for posterior encoder.
use_weight_norm_in_posterior_encoder (bool): Whether to apply weight
normalization in posterior encoder.
flow_flows (int): Number of flows in flow.
flow_kernel_size (int): Kernel size in flow.
flow_base_dilation (int): Base dilation in flow.
flow_layers (int): Number of layers in flow.
flow_dropout_rate (float): Dropout rate in flow
use_weight_norm_in_flow (bool): Whether to apply weight normalization in
flow.
use_only_mean_in_flow (bool): Whether to use only mean in flow.
stochastic_duration_predictor_kernel_size (int): Kernel size in stochastic
duration predictor.
stochastic_duration_predictor_dropout_rate (float): Dropout rate in
stochastic duration predictor.
stochastic_duration_predictor_flows (int): Number of flows in stochastic
duration predictor.
stochastic_duration_predictor_dds_conv_layers (int): Number of DDS conv
layers in stochastic duration predictor.
global_channels (int):
Number of global conditioning channels.
segment_size (int):
Segment size for decoder.
text_encoder_attention_heads (int):
Number of heads in conformer block of text encoder.
text_encoder_ffn_expand (int):
Expansion ratio of FFN in conformer block of text encoder.
text_encoder_blocks (int):
Number of conformer blocks in text encoder.
text_encoder_positionwise_layer_type (str):
Position-wise layer type in conformer block of text encoder.
text_encoder_positionwise_conv_kernel_size (int):
Position-wise convolution kernel size in conformer block of text encoder.
Only used when the above layer type is conv1d or conv1d-linear.
text_encoder_positional_encoding_layer_type (str):
Positional encoding layer type in conformer block of text encoder.
text_encoder_self_attention_layer_type (str):
Self-attention layer type in conformer block of text encoder.
text_encoder_activation_type (str):
Activation function type in conformer block of text encoder.
text_encoder_normalize_before (bool):
Whether to apply layer norm before self-attention in conformer block of text encoder.
text_encoder_dropout_rate (float):
Dropout rate in conformer block of text encoder.
text_encoder_positional_dropout_rate (float):
Dropout rate for positional encoding in conformer block of text encoder.
text_encoder_attention_dropout_rate (float):
Dropout rate for attention in conformer block of text encoder.
text_encoder_conformer_kernel_size (int):
Conformer conv kernel size. It will be used only when use_conformer_conv_in_text_encoder = True.
use_macaron_style_in_text_encoder (bool):
Whether to use macaron style FFN in conformer block of text encoder.
use_conformer_conv_in_text_encoder (bool):
Whether to use convolution in conformer block of text encoder.
decoder_kernel_size (int):
Decoder kernel size.
decoder_channels (int):
Number of decoder initial channels.
decoder_upsample_scales (List[int]):
List of upsampling scales in decoder.
decoder_upsample_kernel_sizes (List[int]):
List of kernel size for upsampling layers in decoder.
decoder_resblock_kernel_sizes (List[int]):
List of kernel size for resblocks in decoder.
decoder_resblock_dilations (List[List[int]]):
List of list of dilations for resblocks in decoder.
use_weight_norm_in_decoder (bool):
Whether to apply weight normalization in decoder.
posterior_encoder_kernel_size (int):
Posterior encoder kernel size.
posterior_encoder_layers (int):
Number of layers of posterior encoder.
posterior_encoder_stacks (int):
Number of stacks of posterior encoder.
posterior_encoder_base_dilation (int):
Base dilation of posterior encoder.
posterior_encoder_dropout_rate (float):
Dropout rate for posterior encoder.
use_weight_norm_in_posterior_encoder (bool):
Whether to apply weight normalization in posterior encoder.
flow_flows (int):
Number of flows in flow.
flow_kernel_size (int):
Kernel size in flow.
flow_base_dilation (int):
Base dilation in flow.
flow_layers (int):
Number of layers in flow.
flow_dropout_rate (float):
Dropout rate in flow.
use_weight_norm_in_flow (bool):
Whether to apply weight normalization in flow.
use_only_mean_in_flow (bool):
Whether to use only mean in flow.
stochastic_duration_predictor_kernel_size (int):
Kernel size in stochastic duration predictor.
stochastic_duration_predictor_dropout_rate (float):
Dropout rate in stochastic duration predictor.
stochastic_duration_predictor_flows (int):
Number of flows in stochastic duration predictor.
stochastic_duration_predictor_dds_conv_layers (int):
Number of DDS conv layers in stochastic duration predictor.
"""
super().__init__()
self.segment_size = segment_size
@ -272,20 +295,33 @@ class VITSGenerator(nn.Layer):
paddle.Tensor, paddle.Tensor, ], ]:
"""Calculate forward propagation.
Args:
text (Tensor): Text index tensor (B, T_text).
text_lengths (Tensor): Text length tensor (B,).
feats (Tensor): Feature tensor (B, aux_channels, T_feats).
feats_lengths (Tensor): Feature length tensor (B,).
sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
text (Tensor):
Text index tensor (B, T_text).
text_lengths (Tensor):
Text length tensor (B,).
feats (Tensor):
Feature tensor (B, aux_channels, T_feats).
feats_lengths (Tensor):
Feature length tensor (B,).
sids (Optional[Tensor]):
Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]):
Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
Returns:
Tensor: Waveform tensor (B, 1, segment_size * upsample_factor).
Tensor: Duration negative log-likelihood (NLL) tensor (B,).
Tensor: Monotonic attention weight tensor (B, 1, T_feats, T_text).
Tensor: Segments start index tensor (B,).
Tensor: Text mask tensor (B, 1, T_text).
Tensor: Feature mask tensor (B, 1, T_feats).
Tensor:
Waveform tensor (B, 1, segment_size * upsample_factor).
Tensor:
Duration negative log-likelihood (NLL) tensor (B,).
Tensor:
Monotonic attention weight tensor (B, 1, T_feats, T_text).
Tensor:
Segments start index tensor (B,).
Tensor:
Text mask tensor (B, 1, T_text).
Tensor:
Feature mask tensor (B, 1, T_feats).
tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
- Tensor: Posterior encoder hidden representation (B, H, T_feats).
- Tensor: Flow hidden representation (B, H, T_feats).
@ -402,24 +438,40 @@ class VITSGenerator(nn.Layer):
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Run inference.
Args:
text (Tensor): Input text index tensor (B, T_text,).
text_lengths (Tensor): Text length tensor (B,).
feats (Tensor): Feature tensor (B, aux_channels, T_feats,).
feats_lengths (Tensor): Feature length tensor (B,).
sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
dur (Optional[Tensor]): Ground-truth duration (B, T_text,). If provided,
text (Tensor):
Input text index tensor (B, T_text,).
text_lengths (Tensor):
Text length tensor (B,).
feats (Tensor):
Feature tensor (B, aux_channels, T_feats,).
feats_lengths (Tensor):
Feature length tensor (B,).
sids (Optional[Tensor]):
Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]):
Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
dur (Optional[Tensor]):
Ground-truth duration (B, T_text,). If provided,
skip the prediction of durations (i.e., teacher forcing).
noise_scale (float): Noise scale parameter for flow.
noise_scale_dur (float): Noise scale parameter for duration predictor.
alpha (float): Alpha parameter to control the speed of generated speech.
max_len (Optional[int]): Maximum length of acoustic feature sequence.
use_teacher_forcing (bool): Whether to use teacher forcing.
noise_scale (float):
Noise scale parameter for flow.
noise_scale_dur (float):
Noise scale parameter for duration predictor.
alpha (float):
Alpha parameter to control the speed of generated speech.
max_len (Optional[int]):
Maximum length of acoustic feature sequence.
use_teacher_forcing (bool):
Whether to use teacher forcing.
Returns:
Tensor: Generated waveform tensor (B, T_wav).
Tensor: Monotonic attention weight tensor (B, T_feats, T_text).
Tensor: Duration tensor (B, T_text).
Tensor:
Generated waveform tensor (B, T_wav).
Tensor:
Monotonic attention weight tensor (B, T_feats, T_text).
Tensor:
Duration tensor (B, T_text).
"""
# encoder
x, m_p, logs_p, x_mask = self.text_encoder(text, text_lengths)
@ -533,15 +585,23 @@ class VITSGenerator(nn.Layer):
lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor:
"""Run voice conversion.
Args:
feats (Tensor): Feature tensor (B, aux_channels, T_feats,).
feats_lengths (Tensor): Feature length tensor (B,).
sids_src (Optional[Tensor]): Speaker index tensor of source feature (B,) or (B, 1).
sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (B,) or (B, 1).
spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (B, spk_embed_dim).
spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (B, spk_embed_dim).
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
feats (Tensor):
Feature tensor (B, aux_channels, T_feats,).
feats_lengths (Tensor):
Feature length tensor (B,).
sids_src (Optional[Tensor]):
Speaker index tensor of source feature (B,) or (B, 1).
sids_tgt (Optional[Tensor]):
Speaker index tensor of target feature (B,) or (B, 1).
spembs_src (Optional[Tensor]):
Speaker embedding tensor of source feature (B, spk_embed_dim).
spembs_tgt (Optional[Tensor]):
Speaker embedding tensor of target feature (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
Returns:
Tensor: Generated waveform tensor (B, T_wav).
Tensor:
Generated waveform tensor (B, T_wav).
"""
# encoder
g_src = None
@ -602,10 +662,13 @@ class VITSGenerator(nn.Layer):
mask: paddle.Tensor) -> paddle.Tensor:
"""Generate path a.k.a. monotonic attention.
Args:
dur (Tensor): Duration tensor (B, 1, T_text).
mask (Tensor): Attention mask tensor (B, 1, T_feats, T_text).
dur (Tensor):
Duration tensor (B, 1, T_text).
mask (Tensor):
Attention mask tensor (B, 1, T_feats, T_text).
Returns:
Tensor: Path tensor (B, 1, T_feats, T_text).
Tensor:
Path tensor (B, 1, T_feats, T_text).
"""
b, _, t_y, t_x = paddle.shape(mask)
cum_dur = paddle.cumsum(dur, -1)

@ -52,17 +52,28 @@ class PosteriorEncoder(nn.Layer):
"""Initialize PosteriorEncoder module.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
hidden_channels (int): Number of hidden channels.
kernel_size (int): Kernel size in WaveNet.
layers (int): Number of layers of WaveNet.
stacks (int): Number of repeat stacking of WaveNet.
base_dilation (int): Base dilation factor.
global_channels (int): Number of global conditioning channels.
dropout_rate (float): Dropout rate.
bias (bool): Whether to use bias parameters in conv.
use_weight_norm (bool): Whether to apply weight norm.
in_channels (int):
Number of input channels.
out_channels (int):
Number of output channels.
hidden_channels (int):
Number of hidden channels.
kernel_size (int):
Kernel size in WaveNet.
layers (int):
Number of layers of WaveNet.
stacks (int):
Number of repeat stacking of WaveNet.
base_dilation (int):
Base dilation factor.
global_channels (int):
Number of global conditioning channels.
dropout_rate (float):
Dropout rate.
bias (bool):
Whether to use bias parameters in conv.
use_weight_norm (bool):
Whether to apply weight norm.
"""
super().__init__()
@ -99,15 +110,22 @@ class PosteriorEncoder(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, in_channels, T_feats).
x_lengths (Tensor): Length tensor (B,).
g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1).
x (Tensor):
Input tensor (B, in_channels, T_feats).
x_lengths (Tensor):
Length tensor (B,).
g (Optional[Tensor]):
Global conditioning tensor (B, global_channels, 1).
Returns:
Tensor: Encoded hidden representation tensor (B, out_channels, T_feats).
Tensor: Projected mean tensor (B, out_channels, T_feats).
Tensor: Projected scale tensor (B, out_channels, T_feats).
Tensor: Mask tensor for input tensor (B, 1, T_feats).
Tensor:
Encoded hidden representation tensor (B, out_channels, T_feats).
Tensor:
Projected mean tensor (B, out_channels, T_feats).
Tensor:
Projected scale tensor (B, out_channels, T_feats).
Tensor:
Mask tensor for input tensor (B, 1, T_feats).
"""
x_mask = make_non_pad_mask(x_lengths).unsqueeze(1)

@ -55,18 +55,30 @@ class ResidualAffineCouplingBlock(nn.Layer):
"""Initialize ResidualAffineCouplingBlock module.
Args:
in_channels (int): Number of input channels.
hidden_channels (int): Number of hidden channels.
flows (int): Number of flows.
kernel_size (int): Kernel size for WaveNet.
base_dilation (int): Base dilation factor for WaveNet.
layers (int): Number of layers of WaveNet.
stacks (int): Number of stacks of WaveNet.
global_channels (int): Number of global channels.
dropout_rate (float): Dropout rate.
use_weight_norm (bool): Whether to use weight normalization in WaveNet.
bias (bool): Whether to use bias paramters in WaveNet.
use_only_mean (bool): Whether to estimate only mean.
in_channels (int):
Number of input channels.
hidden_channels (int):
Number of hidden channels.
flows (int):
Number of flows.
kernel_size (int):
Kernel size for WaveNet.
base_dilation (int):
Base dilation factor for WaveNet.
layers (int):
Number of layers of WaveNet.
stacks (int):
Number of stacks of WaveNet.
global_channels (int):
Number of global channels.
dropout_rate (float):
Dropout rate.
use_weight_norm (bool):
Whether to use weight normalization in WaveNet.
bias (bool):
Whether to use bias parameters in WaveNet.
use_only_mean (bool):
Whether to estimate only mean.
"""
super().__init__()
@ -97,10 +109,14 @@ class ResidualAffineCouplingBlock(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, in_channels, T).
x_mask (Tensor): Length tensor (B, 1, T).
g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1).
inverse (bool): Whether to inverse the flow.
x (Tensor):
Input tensor (B, in_channels, T).
x_mask (Tensor):
Length tensor (B, 1, T).
g (Optional[Tensor]):
Global conditioning tensor (B, global_channels, 1).
inverse (bool):
Whether to inverse the flow.
Returns:
Tensor: Output tensor (B, in_channels, T).
@ -134,17 +150,28 @@ class ResidualAffineCouplingLayer(nn.Layer):
"""Initialize ResidualAffineCouplingLayer module.
Args:
in_channels (int): Number of input channels.
hidden_channels (int): Number of hidden channels.
kernel_size (int): Kernel size for WaveNet.
base_dilation (int): Base dilation factor for WaveNet.
layers (int): Number of layers of WaveNet.
stacks (int): Number of stacks of WaveNet.
global_channels (int): Number of global channels.
dropout_rate (float): Dropout rate.
use_weight_norm (bool): Whether to use weight normalization in WaveNet.
bias (bool): Whether to use bias paramters in WaveNet.
use_only_mean (bool): Whether to estimate only mean.
in_channels (int):
Number of input channels.
hidden_channels (int):
Number of hidden channels.
kernel_size (int):
Kernel size for WaveNet.
base_dilation (int):
Base dilation factor for WaveNet.
layers (int):
Number of layers of WaveNet.
stacks (int):
Number of stacks of WaveNet.
global_channels (int):
Number of global channels.
dropout_rate (float):
Dropout rate.
use_weight_norm (bool):
Whether to use weight normalization in WaveNet.
bias (bool):
Whether to use bias parameters in WaveNet.
use_only_mean (bool):
Whether to estimate only mean.
"""
assert in_channels % 2 == 0, "in_channels should be divisible by 2"
@ -211,14 +238,20 @@ class ResidualAffineCouplingLayer(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, in_channels, T).
x_lengths (Tensor): Length tensor (B,).
g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1).
inverse (bool): Whether to inverse the flow.
x (Tensor):
Input tensor (B, in_channels, T).
x_lengths (Tensor):
Length tensor (B,).
g (Optional[Tensor]):
Global conditioning tensor (B, global_channels, 1).
inverse (bool):
Whether to inverse the flow.
Returns:
Tensor: Output tensor (B, in_channels, T).
Tensor: Log-determinant tensor for NLL (B,) if not inverse.
Tensor:
Output tensor (B, in_channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
"""
xa, xb = paddle.split(x, 2, axis=1)

@ -62,23 +62,40 @@ class TextEncoder(nn.Layer):
"""Initialize TextEncoder module.
Args:
vocabs (int): Vocabulary size.
attention_dim (int): Attention dimension.
attention_heads (int): Number of attention heads.
linear_units (int): Number of linear units of positionwise layers.
blocks (int): Number of encoder blocks.
positionwise_layer_type (str): Positionwise layer type.
positionwise_conv_kernel_size (int): Positionwise layer's kernel size.
positional_encoding_layer_type (str): Positional encoding layer type.
self_attention_layer_type (str): Self-attention layer type.
activation_type (str): Activation function type.
normalize_before (bool): Whether to apply LayerNorm before attention.
use_macaron_style (bool): Whether to use macaron style components.
use_conformer_conv (bool): Whether to use conformer conv layers.
conformer_kernel_size (int): Conformer's conv kernel size.
dropout_rate (float): Dropout rate.
positional_dropout_rate (float): Dropout rate for positional encoding.
attention_dropout_rate (float): Dropout rate for attention.
vocabs (int):
Vocabulary size.
attention_dim (int):
Attention dimension.
attention_heads (int):
Number of attention heads.
linear_units (int):
Number of linear units of positionwise layers.
blocks (int):
Number of encoder blocks.
positionwise_layer_type (str):
Positionwise layer type.
positionwise_conv_kernel_size (int):
Positionwise layer's kernel size.
positional_encoding_layer_type (str):
Positional encoding layer type.
self_attention_layer_type (str):
Self-attention layer type.
activation_type (str):
Activation function type.
normalize_before (bool):
Whether to apply LayerNorm before attention.
use_macaron_style (bool):
Whether to use macaron style components.
use_conformer_conv (bool):
Whether to use conformer conv layers.
conformer_kernel_size (int):
Conformer's conv kernel size.
dropout_rate (float):
Dropout rate.
positional_dropout_rate (float):
Dropout rate for positional encoding.
attention_dropout_rate (float):
Dropout rate for attention.
"""
super().__init__()
@ -121,14 +138,20 @@ class TextEncoder(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor): Input index tensor (B, T_text).
x_lengths (Tensor): Length tensor (B,).
x (Tensor):
Input index tensor (B, T_text).
x_lengths (Tensor):
Length tensor (B,).
Returns:
Tensor: Encoded hidden representation (B, attention_dim, T_text).
Tensor: Projected mean tensor (B, attention_dim, T_text).
Tensor: Projected scale tensor (B, attention_dim, T_text).
Tensor: Mask tensor for input tensor (B, 1, T_text).
Tensor:
Encoded hidden representation (B, attention_dim, T_text).
Tensor:
Projected mean tensor (B, attention_dim, T_text).
Tensor:
Projected scale tensor (B, attention_dim, T_text).
Tensor:
Mask tensor for input tensor (B, 1, T_text).
"""
x = self.emb(x) * math.sqrt(self.attention_dim)

@ -156,17 +156,25 @@ class VITS(nn.Layer):
init_type: str="xavier_uniform", ):
"""Initialize VITS module.
Args:
idim (int): Input vocabrary size.
odim (int): Acoustic feature dimension. The actual output channels will
idim (int):
Input vocabulary size.
odim (int):
Acoustic feature dimension. The actual output channels will
be 1 since VITS is the end-to-end text-to-wave model but for the
compatibility odim is used to indicate the acoustic feature dimension.
sampling_rate (int): Sampling rate, not used for the training but it will
sampling_rate (int):
Sampling rate, not used for the training but it will
be referred in saving waveform during the inference.
generator_type (str): Generator type.
generator_params (Dict[str, Any]): Parameter dict for generator.
discriminator_type (str): Discriminator type.
discriminator_params (Dict[str, Any]): Parameter dict for discriminator.
cache_generator_outputs (bool): Whether to cache generator outputs.
generator_type (str):
Generator type.
generator_params (Dict[str, Any]):
Parameter dict for generator.
discriminator_type (str):
Discriminator type.
discriminator_params (Dict[str, Any]):
Parameter dict for discriminator.
cache_generator_outputs (bool):
Whether to cache generator outputs.
"""
assert check_argument_types()
super().__init__()
@ -218,14 +226,22 @@ class VITS(nn.Layer):
forward_generator: bool=True, ) -> Dict[str, Any]:
"""Perform generator forward.
Args:
text (Tensor): Text index tensor (B, T_text).
text_lengths (Tensor): Text length tensor (B,).
feats (Tensor): Feature tensor (B, T_feats, aux_channels).
feats_lengths (Tensor): Feature length tensor (B,).
sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
forward_generator (bool): Whether to forward generator.
text (Tensor):
Text index tensor (B, T_text).
text_lengths (Tensor):
Text length tensor (B,).
feats (Tensor):
Feature tensor (B, T_feats, aux_channels).
feats_lengths (Tensor):
Feature length tensor (B,).
sids (Optional[Tensor]):
Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]):
Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
forward_generator (bool):
Whether to forward generator.
Returns:
"""
@ -259,13 +275,20 @@ class VITS(nn.Layer):
lids: Optional[paddle.Tensor]=None, ) -> Dict[str, Any]:
"""Perform generator forward.
Args:
text (Tensor): Text index tensor (B, T_text).
text_lengths (Tensor): Text length tensor (B,).
feats (Tensor): Feature tensor (B, T_feats, aux_channels).
feats_lengths (Tensor): Feature length tensor (B,).
sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
text (Tensor):
Text index tensor (B, T_text).
text_lengths (Tensor):
Text length tensor (B,).
feats (Tensor):
Feature tensor (B, T_feats, aux_channels).
feats_lengths (Tensor):
Feature length tensor (B,).
sids (Optional[Tensor]):
Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]):
Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
Returns:
"""
@ -304,13 +327,20 @@ class VITS(nn.Layer):
lids: Optional[paddle.Tensor]=None, ) -> Dict[str, Any]:
"""Perform discriminator forward.
Args:
text (Tensor): Text index tensor (B, T_text).
text_lengths (Tensor): Text length tensor (B,).
feats (Tensor): Feature tensor (B, T_feats, aux_channels).
feats_lengths (Tensor): Feature length tensor (B,).
sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
text (Tensor):
Text index tensor (B, T_text).
text_lengths (Tensor):
Text length tensor (B,).
feats (Tensor):
Feature tensor (B, T_feats, aux_channels).
feats_lengths (Tensor):
Feature length tensor (B,).
sids (Optional[Tensor]):
Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]):
Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
Returns:
"""
@ -353,22 +383,36 @@ class VITS(nn.Layer):
use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
"""Run inference.
Args:
text (Tensor): Input text index tensor (T_text,).
feats (Tensor): Feature tensor (T_feats, aux_channels).
sids (Tensor): Speaker index tensor (1,).
spembs (Optional[Tensor]): Speaker embedding tensor (spk_embed_dim,).
lids (Tensor): Language index tensor (1,).
durations (Tensor): Ground-truth duration tensor (T_text,).
noise_scale (float): Noise scale value for flow.
noise_scale_dur (float): Noise scale value for duration predictor.
alpha (float): Alpha parameter to control the speed of generated speech.
max_len (Optional[int]): Maximum length.
use_teacher_forcing (bool): Whether to use teacher forcing.
text (Tensor):
Input text index tensor (T_text,).
feats (Tensor):
Feature tensor (T_feats, aux_channels).
sids (Tensor):
Speaker index tensor (1,).
spembs (Optional[Tensor]):
Speaker embedding tensor (spk_embed_dim,).
lids (Tensor):
Language index tensor (1,).
durations (Tensor):
Ground-truth duration tensor (T_text,).
noise_scale (float):
Noise scale value for flow.
noise_scale_dur (float):
Noise scale value for duration predictor.
alpha (float):
Alpha parameter to control the speed of generated speech.
max_len (Optional[int]):
Maximum length.
use_teacher_forcing (bool):
Whether to use teacher forcing.
Returns:
Dict[str, Tensor]:
* wav (Tensor): Generated waveform tensor (T_wav,).
* att_w (Tensor): Monotonic attention weight tensor (T_feats, T_text).
* duration (Tensor): Predicted duration tensor (T_text,).
* wav (Tensor):
Generated waveform tensor (T_wav,).
* att_w (Tensor):
Monotonic attention weight tensor (T_feats, T_text).
* duration (Tensor):
Predicted duration tensor (T_text,).
"""
# setup
text = text[None]
@ -417,15 +461,22 @@ class VITS(nn.Layer):
lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor:
"""Run voice conversion.
Args:
feats (Tensor): Feature tensor (T_feats, aux_channels).
sids_src (Optional[Tensor]): Speaker index tensor of source feature (1,).
sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (1,).
spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (spk_embed_dim,).
spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (spk_embed_dim,).
lids (Optional[Tensor]): Language index tensor (1,).
feats (Tensor):
Feature tensor (T_feats, aux_channels).
sids_src (Optional[Tensor]):
Speaker index tensor of source feature (1,).
sids_tgt (Optional[Tensor]):
Speaker index tensor of target feature (1,).
spembs_src (Optional[Tensor]):
Speaker embedding tensor of source feature (spk_embed_dim,).
spembs_tgt (Optional[Tensor]):
Speaker embedding tensor of target feature (spk_embed_dim,).
lids (Optional[Tensor]):
Language index tensor (1,).
Returns:
Dict[str, Tensor]:
* wav (Tensor): Generated waveform tensor (T_wav,).
* wav (Tensor):
Generated waveform tensor (T_wav,).
"""
assert feats is not None
feats = feats[None].transpose([0, 2, 1])

@ -39,14 +39,22 @@ class ResidualBlock(nn.Layer):
"""Initialize ResidualBlock module.
Args:
kernel_size (int): Kernel size of dilation convolution layer.
residual_channels (int): Number of channels for residual connection.
skip_channels (int): Number of channels for skip connection.
aux_channels (int): Number of local conditioning channels.
dropout (float): Dropout probability.
dilation (int): Dilation factor.
bias (bool): Whether to add bias parameter in convolution layers.
scale_residual (bool): Whether to scale the residual outputs.
kernel_size (int):
Kernel size of dilation convolution layer.
residual_channels (int):
Number of channels for residual connection.
skip_channels (int):
Number of channels for skip connection.
aux_channels (int):
Number of local conditioning channels.
dropout (float):
Dropout probability.
dilation (int):
Dilation factor.
bias (bool):
Whether to add bias parameter in convolution layers.
scale_residual (bool):
Whether to scale the residual outputs.
"""
super().__init__()

@ -47,25 +47,42 @@ class WaveNet(nn.Layer):
"""Initialize WaveNet module.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
kernel_size (int): Kernel size of dilated convolution.
layers (int): Number of residual block layers.
stacks (int): Number of stacks i.e., dilation cycles.
base_dilation (int): Base dilation factor.
residual_channels (int): Number of channels in residual conv.
gate_channels (int): Number of channels in gated conv.
skip_channels (int): Number of channels in skip conv.
aux_channels (int): Number of channels for local conditioning feature.
global_channels (int): Number of channels for global conditioning feature.
dropout_rate (float): Dropout rate. 0.0 means no dropout applied.
bias (bool): Whether to use bias parameter in conv layer.
use_weight_norm (bool): Whether to use weight norm. If set to true, it will
be applied to all of the conv layers.
use_first_conv (bool): Whether to use the first conv layers.
use_last_conv (bool): Whether to use the last conv layers.
scale_residual (bool): Whether to scale the residual outputs.
scale_skip_connect (bool): Whether to scale the skip connection outputs.
in_channels (int):
Number of input channels.
out_channels (int):
Number of output channels.
kernel_size (int):
Kernel size of dilated convolution.
layers (int):
Number of residual block layers.
stacks (int):
Number of stacks, i.e., dilation cycles.
base_dilation (int):
Base dilation factor.
residual_channels (int):
Number of channels in residual conv.
gate_channels (int):
Number of channels in gated conv.
skip_channels (int):
Number of channels in skip conv.
aux_channels (int):
Number of channels for local conditioning feature.
global_channels (int):
Number of channels for global conditioning feature.
dropout_rate (float):
Dropout rate. 0.0 means no dropout applied.
bias (bool):
Whether to use bias parameter in conv layer.
use_weight_norm (bool):
Whether to use weight norm. If set to true, it will be applied to all of the conv layers.
use_first_conv (bool):
Whether to use the first conv layers.
use_last_conv (bool):
Whether to use the last conv layers.
scale_residual (bool):
Whether to scale the residual outputs.
scale_skip_connect (bool):
Whether to scale the skip connection outputs.
"""
super().__init__()
@ -128,15 +145,18 @@ class WaveNet(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor): Input noise signal (B, 1, T) if use_first_conv else
(B, residual_channels, T).
x_mask (Optional[Tensor]): Mask tensor (B, 1, T).
c (Optional[Tensor]): Local conditioning features (B, aux_channels, T).
g (Optional[Tensor]): Global conditioning features (B, global_channels, 1).
x (Tensor):
Input noise signal (B, 1, T) if use_first_conv else (B, residual_channels, T).
x_mask (Optional[Tensor]):
Mask tensor (B, 1, T).
c (Optional[Tensor]):
Local conditioning features (B, aux_channels, T).
g (Optional[Tensor]):
Global conditioning features (B, global_channels, 1).
Returns:
Tensor: Output tensor (B, out_channels, T) if use_last_conv else
(B, residual_channels, T).
Tensor:
Output tensor (B, out_channels, T) if use_last_conv else (B, residual_channels, T).
"""
# encode to hidden representation

@ -69,9 +69,11 @@ class MelResNet(nn.Layer):
def forward(self, x):
'''
Args:
x (Tensor): Input tensor (B, in_dims, T).
x (Tensor):
Input tensor (B, in_dims, T).
Returns:
Tensor: Output tensor (B, res_out_dims, T).
Tensor:
Output tensor (B, res_out_dims, T).
'''
x = self.conv_in(x)
@ -119,10 +121,13 @@ class UpsampleNetwork(nn.Layer):
def forward(self, m):
'''
Args:
c (Tensor): Input tensor (B, C_aux, T).
m (Tensor):
Input tensor (B, C_aux, T).
Returns:
Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), C_aux).
Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), res_out_dims).
Tensor:
Output tensor (B, (T - 2 * pad) * prod(upsample_scales), C_aux).
Tensor:
Output tensor (B, (T - 2 * pad) * prod(upsample_scales), res_out_dims).
'''
# aux: [B, C_aux, T]
# -> [B, res_out_dims, T - 2 * aux_context_window]
@ -302,7 +307,8 @@ class WaveRNN(nn.Layer):
number of samples for crossfading between batches
mu_law(bool)
Returns:
wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out).
wav sequence:
Output (T' * prod(upsample_scales), out_channels, C_out).
"""
self.eval()

Loading…
Cancel
Save