Merge branch 'PaddlePaddle:develop' into develop

pull/2418/head
liangym 3 years ago committed by GitHub
commit 5c197e7016
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -20,4 +20,7 @@ Subpackages
paddlespeech.audio.io paddlespeech.audio.io
paddlespeech.audio.metric paddlespeech.audio.metric
paddlespeech.audio.sox_effects paddlespeech.audio.sox_effects
paddlespeech.audio.streamdata
paddlespeech.audio.text
paddlespeech.audio.transform
paddlespeech.audio.utils paddlespeech.audio.utils

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.autodecode module
===============================================
.. automodule:: paddlespeech.audio.streamdata.autodecode
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.cache module
==========================================
.. automodule:: paddlespeech.audio.streamdata.cache
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.compat module
===========================================
.. automodule:: paddlespeech.audio.streamdata.compat
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.extradatasets module
==================================================
.. automodule:: paddlespeech.audio.streamdata.extradatasets
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.filters module
============================================
.. automodule:: paddlespeech.audio.streamdata.filters
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.gopen module
==========================================
.. automodule:: paddlespeech.audio.streamdata.gopen
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.handlers module
=============================================
.. automodule:: paddlespeech.audio.streamdata.handlers
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.mix module
========================================
.. automodule:: paddlespeech.audio.streamdata.mix
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.paddle\_utils module
==================================================
.. automodule:: paddlespeech.audio.streamdata.paddle_utils
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.pipeline module
=============================================
.. automodule:: paddlespeech.audio.streamdata.pipeline
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,28 @@
paddlespeech.audio.streamdata package
=====================================
.. automodule:: paddlespeech.audio.streamdata
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.audio.streamdata.autodecode
paddlespeech.audio.streamdata.cache
paddlespeech.audio.streamdata.compat
paddlespeech.audio.streamdata.extradatasets
paddlespeech.audio.streamdata.filters
paddlespeech.audio.streamdata.gopen
paddlespeech.audio.streamdata.handlers
paddlespeech.audio.streamdata.mix
paddlespeech.audio.streamdata.paddle_utils
paddlespeech.audio.streamdata.pipeline
paddlespeech.audio.streamdata.shardlists
paddlespeech.audio.streamdata.tariterators
paddlespeech.audio.streamdata.utils
paddlespeech.audio.streamdata.writer

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.shardlists module
===============================================
.. automodule:: paddlespeech.audio.streamdata.shardlists
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.tariterators module
=================================================
.. automodule:: paddlespeech.audio.streamdata.tariterators
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.utils module
==========================================
.. automodule:: paddlespeech.audio.streamdata.utils
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.writer module
===========================================
.. automodule:: paddlespeech.audio.streamdata.writer
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,16 @@
paddlespeech.audio.text package
===============================
.. automodule:: paddlespeech.audio.text
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.audio.text.text_featurizer
paddlespeech.audio.text.utility

@ -0,0 +1,7 @@
paddlespeech.audio.text.text\_featurizer module
===============================================
.. automodule:: paddlespeech.audio.text.text_featurizer
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.text.utility module
======================================
.. automodule:: paddlespeech.audio.text.utility
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.add\_deltas module
===============================================
.. automodule:: paddlespeech.audio.transform.add_deltas
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.channel\_selector module
=====================================================
.. automodule:: paddlespeech.audio.transform.channel_selector
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.cmvn module
========================================
.. automodule:: paddlespeech.audio.transform.cmvn
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.functional module
==============================================
.. automodule:: paddlespeech.audio.transform.functional
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.perturb module
===========================================
.. automodule:: paddlespeech.audio.transform.perturb
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,24 @@
paddlespeech.audio.transform package
====================================
.. automodule:: paddlespeech.audio.transform
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.audio.transform.add_deltas
paddlespeech.audio.transform.channel_selector
paddlespeech.audio.transform.cmvn
paddlespeech.audio.transform.functional
paddlespeech.audio.transform.perturb
paddlespeech.audio.transform.spec_augment
paddlespeech.audio.transform.spectrogram
paddlespeech.audio.transform.transform_interface
paddlespeech.audio.transform.transformation
paddlespeech.audio.transform.wpe

@ -0,0 +1,7 @@
paddlespeech.audio.transform.spec\_augment module
=================================================
.. automodule:: paddlespeech.audio.transform.spec_augment
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.spectrogram module
===============================================
.. automodule:: paddlespeech.audio.transform.spectrogram
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.transform\_interface module
========================================================
.. automodule:: paddlespeech.audio.transform.transform_interface
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.transformation module
==================================================
.. automodule:: paddlespeech.audio.transform.transformation
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.wpe module
=======================================
.. automodule:: paddlespeech.audio.transform.wpe
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.utils.check\_kwargs module
=============================================
.. automodule:: paddlespeech.audio.utils.check_kwargs
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.utils.dynamic\_import module
===============================================
.. automodule:: paddlespeech.audio.utils.dynamic_import
:members:
:undoc-members:
:show-inheritance:

@ -12,8 +12,11 @@ Submodules
.. toctree:: .. toctree::
:maxdepth: 4 :maxdepth: 4
paddlespeech.audio.utils.check_kwargs
paddlespeech.audio.utils.download paddlespeech.audio.utils.download
paddlespeech.audio.utils.dynamic_import
paddlespeech.audio.utils.error paddlespeech.audio.utils.error
paddlespeech.audio.utils.log paddlespeech.audio.utils.log
paddlespeech.audio.utils.numeric paddlespeech.audio.utils.numeric
paddlespeech.audio.utils.tensor_utils
paddlespeech.audio.utils.time paddlespeech.audio.utils.time

@ -0,0 +1,7 @@
paddlespeech.audio.utils.tensor\_utils module
=============================================
.. automodule:: paddlespeech.audio.utils.tensor_utils
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.collate module
=========================================
.. automodule:: paddlespeech.kws.exps.mdtc.collate
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.compute\_det module
==============================================
.. automodule:: paddlespeech.kws.exps.mdtc.compute_det
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.plot\_det\_curve module
==================================================
.. automodule:: paddlespeech.kws.exps.mdtc.plot_det_curve
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,19 @@
paddlespeech.kws.exps.mdtc package
==================================
.. automodule:: paddlespeech.kws.exps.mdtc
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.kws.exps.mdtc.collate
paddlespeech.kws.exps.mdtc.compute_det
paddlespeech.kws.exps.mdtc.plot_det_curve
paddlespeech.kws.exps.mdtc.score
paddlespeech.kws.exps.mdtc.train

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.score module
=======================================
.. automodule:: paddlespeech.kws.exps.mdtc.score
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.train module
=======================================
.. automodule:: paddlespeech.kws.exps.mdtc.train
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,15 @@
paddlespeech.kws.exps package
=============================
.. automodule:: paddlespeech.kws.exps
:members:
:undoc-members:
:show-inheritance:
Subpackages
-----------
.. toctree::
:maxdepth: 4
paddlespeech.kws.exps.mdtc

@ -12,4 +12,5 @@ Subpackages
.. toctree:: .. toctree::
:maxdepth: 4 :maxdepth: 4
paddlespeech.kws.exps
paddlespeech.kws.models paddlespeech.kws.models

@ -0,0 +1,7 @@
paddlespeech.resource.model\_alias module
=========================================
.. automodule:: paddlespeech.resource.model_alias
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.resource.pretrained\_models module
===============================================
.. automodule:: paddlespeech.resource.pretrained_models
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.resource.resource module
=====================================
.. automodule:: paddlespeech.resource.resource
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,17 @@
paddlespeech.resource package
=============================
.. automodule:: paddlespeech.resource
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.resource.model_alias
paddlespeech.resource.pretrained_models
paddlespeech.resource.resource

@ -16,8 +16,10 @@ Subpackages
paddlespeech.cli paddlespeech.cli
paddlespeech.cls paddlespeech.cls
paddlespeech.kws paddlespeech.kws
paddlespeech.resource
paddlespeech.s2t paddlespeech.s2t
paddlespeech.server paddlespeech.server
paddlespeech.t2s paddlespeech.t2s
paddlespeech.text paddlespeech.text
paddlespeech.utils
paddlespeech.vector paddlespeech.vector

@ -19,5 +19,4 @@ Subpackages
paddlespeech.s2t.models paddlespeech.s2t.models
paddlespeech.s2t.modules paddlespeech.s2t.modules
paddlespeech.s2t.training paddlespeech.s2t.training
paddlespeech.s2t.transform
paddlespeech.s2t.utils paddlespeech.s2t.utils

@ -18,7 +18,6 @@ Submodules
paddlespeech.server.utils.config paddlespeech.server.utils.config
paddlespeech.server.utils.errors paddlespeech.server.utils.errors
paddlespeech.server.utils.exception paddlespeech.server.utils.exception
paddlespeech.server.utils.log
paddlespeech.server.utils.onnx_infer paddlespeech.server.utils.onnx_infer
paddlespeech.server.utils.paddle_predictor paddlespeech.server.utils.paddle_predictor
paddlespeech.server.utils.util paddlespeech.server.utils.util

@ -19,4 +19,5 @@ Submodules
paddlespeech.t2s.datasets.get_feats paddlespeech.t2s.datasets.get_feats
paddlespeech.t2s.datasets.ljspeech paddlespeech.t2s.datasets.ljspeech
paddlespeech.t2s.datasets.preprocess_utils paddlespeech.t2s.datasets.preprocess_utils
paddlespeech.t2s.datasets.sampler
paddlespeech.t2s.datasets.vocoder_batch_fn paddlespeech.t2s.datasets.vocoder_batch_fn

@ -0,0 +1,7 @@
paddlespeech.t2s.datasets.sampler module
========================================
.. automodule:: paddlespeech.t2s.datasets.sampler
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.align module
=============================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.align
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.normalize module
=================================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.normalize
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.preprocess module
==================================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.preprocess
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,21 @@
paddlespeech.t2s.exps.ernie\_sat package
========================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.t2s.exps.ernie_sat.align
paddlespeech.t2s.exps.ernie_sat.normalize
paddlespeech.t2s.exps.ernie_sat.preprocess
paddlespeech.t2s.exps.ernie_sat.synthesize
paddlespeech.t2s.exps.ernie_sat.synthesize_e2e
paddlespeech.t2s.exps.ernie_sat.train
paddlespeech.t2s.exps.ernie_sat.utils

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.synthesize module
==================================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.synthesize\_e2e module
=======================================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize_e2e
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.train module
=============================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.train
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.utils module
=============================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.utils
:members:
:undoc-members:
:show-inheritance:

@ -16,3 +16,4 @@ Submodules
paddlespeech.t2s.exps.fastspeech2.normalize paddlespeech.t2s.exps.fastspeech2.normalize
paddlespeech.t2s.exps.fastspeech2.preprocess paddlespeech.t2s.exps.fastspeech2.preprocess
paddlespeech.t2s.exps.fastspeech2.train paddlespeech.t2s.exps.fastspeech2.train
paddlespeech.t2s.exps.fastspeech2.vc2_infer

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.fastspeech2.vc2\_infer module
===================================================
.. automodule:: paddlespeech.t2s.exps.fastspeech2.vc2_infer
:members:
:undoc-members:
:show-inheritance:

@ -12,11 +12,13 @@ Subpackages
.. toctree:: .. toctree::
:maxdepth: 4 :maxdepth: 4
paddlespeech.t2s.exps.ernie_sat
paddlespeech.t2s.exps.fastspeech2 paddlespeech.t2s.exps.fastspeech2
paddlespeech.t2s.exps.gan_vocoder paddlespeech.t2s.exps.gan_vocoder
paddlespeech.t2s.exps.speedyspeech paddlespeech.t2s.exps.speedyspeech
paddlespeech.t2s.exps.tacotron2 paddlespeech.t2s.exps.tacotron2
paddlespeech.t2s.exps.transformer_tts paddlespeech.t2s.exps.transformer_tts
paddlespeech.t2s.exps.vits
paddlespeech.t2s.exps.waveflow paddlespeech.t2s.exps.waveflow
paddlespeech.t2s.exps.wavernn paddlespeech.t2s.exps.wavernn
@ -31,6 +33,7 @@ Submodules
paddlespeech.t2s.exps.ort_predict paddlespeech.t2s.exps.ort_predict
paddlespeech.t2s.exps.ort_predict_e2e paddlespeech.t2s.exps.ort_predict_e2e
paddlespeech.t2s.exps.ort_predict_streaming paddlespeech.t2s.exps.ort_predict_streaming
paddlespeech.t2s.exps.stream_play_tts
paddlespeech.t2s.exps.syn_utils paddlespeech.t2s.exps.syn_utils
paddlespeech.t2s.exps.synthesize paddlespeech.t2s.exps.synthesize
paddlespeech.t2s.exps.synthesize_e2e paddlespeech.t2s.exps.synthesize_e2e

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.stream\_play\_tts module
==============================================
.. automodule:: paddlespeech.t2s.exps.stream_play_tts
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.normalize module
===========================================
.. automodule:: paddlespeech.t2s.exps.vits.normalize
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.preprocess module
============================================
.. automodule:: paddlespeech.t2s.exps.vits.preprocess
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,20 @@
paddlespeech.t2s.exps.vits package
==================================
.. automodule:: paddlespeech.t2s.exps.vits
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.t2s.exps.vits.normalize
paddlespeech.t2s.exps.vits.preprocess
paddlespeech.t2s.exps.vits.synthesize
paddlespeech.t2s.exps.vits.synthesize_e2e
paddlespeech.t2s.exps.vits.train
paddlespeech.t2s.exps.vits.voice_cloning

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.synthesize module
============================================
.. automodule:: paddlespeech.t2s.exps.vits.synthesize
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.synthesize\_e2e module
=================================================
.. automodule:: paddlespeech.t2s.exps.vits.synthesize_e2e
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.train module
=======================================
.. automodule:: paddlespeech.t2s.exps.vits.train
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.voice\_cloning module
================================================
.. automodule:: paddlespeech.t2s.exps.vits.voice_cloning
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.g2pw.dataset module
=============================================
.. automodule:: paddlespeech.t2s.frontend.g2pw.dataset
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.g2pw.onnx\_api module
===============================================
.. automodule:: paddlespeech.t2s.frontend.g2pw.onnx_api
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,17 @@
paddlespeech.t2s.frontend.g2pw package
======================================
.. automodule:: paddlespeech.t2s.frontend.g2pw
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.t2s.frontend.g2pw.dataset
paddlespeech.t2s.frontend.g2pw.onnx_api
paddlespeech.t2s.frontend.g2pw.utils

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.g2pw.utils module
===========================================
.. automodule:: paddlespeech.t2s.frontend.g2pw.utils
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.mix\_frontend module
==============================================
.. automodule:: paddlespeech.t2s.frontend.mix_frontend
:members:
:undoc-members:
:show-inheritance:

@ -12,6 +12,7 @@ Subpackages
.. toctree:: .. toctree::
:maxdepth: 4 :maxdepth: 4
paddlespeech.t2s.frontend.g2pw
paddlespeech.t2s.frontend.normalizer paddlespeech.t2s.frontend.normalizer
paddlespeech.t2s.frontend.zh_normalization paddlespeech.t2s.frontend.zh_normalization
@ -23,6 +24,7 @@ Submodules
paddlespeech.t2s.frontend.arpabet paddlespeech.t2s.frontend.arpabet
paddlespeech.t2s.frontend.generate_lexicon paddlespeech.t2s.frontend.generate_lexicon
paddlespeech.t2s.frontend.mix_frontend
paddlespeech.t2s.frontend.phonectic paddlespeech.t2s.frontend.phonectic
paddlespeech.t2s.frontend.punctuation paddlespeech.t2s.frontend.punctuation
paddlespeech.t2s.frontend.tone_sandhi paddlespeech.t2s.frontend.tone_sandhi

@ -0,0 +1,7 @@
paddlespeech.t2s.models.ernie\_sat.ernie\_sat module
====================================================
.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.models.ernie\_sat.ernie\_sat\_updater module
=============================================================
.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat_updater
:members:
:undoc-members:
:show-inheritance:

@ -12,4 +12,5 @@ Submodules
.. toctree:: .. toctree::
:maxdepth: 4 :maxdepth: 4
paddlespeech.t2s.models.ernie_sat.mlm paddlespeech.t2s.models.ernie_sat.ernie_sat
paddlespeech.t2s.models.ernie_sat.ernie_sat_updater

@ -0,0 +1,7 @@
paddlespeech.t2s.models.vits.monotonic\_align.core module
=========================================================
.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.core
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,16 @@
paddlespeech.t2s.models.vits.monotonic\_align package
=====================================================
.. automodule:: paddlespeech.t2s.models.vits.monotonic_align
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.t2s.models.vits.monotonic_align.core
paddlespeech.t2s.models.vits.monotonic_align.setup

@ -0,0 +1,7 @@
paddlespeech.t2s.models.vits.monotonic\_align.setup module
==========================================================
.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.setup
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.utils.dynamic\_import module
=========================================
.. automodule:: paddlespeech.utils.dynamic_import
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.utils.env module
=============================
.. automodule:: paddlespeech.utils.env
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,16 @@
paddlespeech.utils package
==========================
.. automodule:: paddlespeech.utils
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.utils.dynamic_import
paddlespeech.utils.env

@ -74,8 +74,10 @@ Contents
paddlespeech.cli <api/paddlespeech.cli> paddlespeech.cli <api/paddlespeech.cli>
paddlespeech.cls <api/paddlespeech.cls> paddlespeech.cls <api/paddlespeech.cls>
paddlespeech.kws <api/paddlespeech.kws> paddlespeech.kws <api/paddlespeech.kws>
paddlespeech.resource <api/paddlespeech.resource>
paddlespeech.s2t <api/paddlespeech.s2t> paddlespeech.s2t <api/paddlespeech.s2t>
paddlespeech.server <api/paddlespeech.server> paddlespeech.server <api/paddlespeech.server>
paddlespeech.t2s <api/paddlespeech.t2s> paddlespeech.t2s <api/paddlespeech.t2s>
paddlespeech.text <api/paddlespeech.text> paddlespeech.text <api/paddlespeech.text>
paddlespeech.utils <api/paddlespeech.utils>
paddlespeech.vector <api/paddlespeech.vector> paddlespeech.vector <api/paddlespeech.vector>

@ -1,11 +1,10 @@
# ERNIE-SAT with AISHELL3 dataset # ERNIE-SAT with AISHELL3 dataset
ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型,其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景,该项目供研究使用。 ## Model Framework
In ERNIE-SAT, we propose two innovations:
## 模型框架 - In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
ERNIE-SAT 中我们提出了两项创新: - The joint mask learning of speech and text is used to realize the alignment of speech and text
- 在预训练过程中将中英双语对应的音素作为输入,实现了跨语言、个性化的软音素映射
- 采用语言和语音的联合掩码学习实现了语言和语音的对齐
<p align="center"> <p align="center">
<img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" /> <img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />

@ -1,11 +1,10 @@
# ERNIE-SAT with AISHELL3 and VCTK dataset # ERNIE-SAT with AISHELL3 and VCTK dataset
ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型,其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景,该项目供研究使用。 ## Model Framework
In ERNIE-SAT, we propose two innovations:
## 模型框架 - In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
ERNIE-SAT 中我们提出了两项创新: - The joint mask learning of speech and text is used to realize the alignment of speech and text
- 在预训练过程中将中英双语对应的音素作为输入,实现了跨语言、个性化的软音素映射
- 采用语言和语音的联合掩码学习实现了语言和语音的对齐
<p align="center"> <p align="center">
<img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" /> <img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />

@ -1,11 +1,10 @@
# ERNIE-SAT with VCTK dataset # ERNIE-SAT with VCTK dataset
ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型,其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景,该项目供研究使用。 ## Model Framework
In ERNIE-SAT, we propose two innovations:
## 模型框架 - In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
ERNIE-SAT 中我们提出了两项创新: - The joint mask learning of speech and text is used to realize the alignment of speech and text
- 在预训练过程中将中英双语对应的音素作为输入,实现了跨语言、个性化的软音素映射
- 采用语言和语音的联合掩码学习实现了语言和语音的对齐
<p align="center"> <p align="center">
<img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" /> <img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />

@ -46,3 +46,10 @@ Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | 16 | 0.078918 | | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | 16 | 0.078918 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | 16 | 0.079080 | | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | 16 | 0.079080 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | 16 | 0.054401 | | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | 16 | 0.054401 |
| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | -1 | 0.050767 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | -1 | 0.061884 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | -1 | 0.062056 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | -1 | 0.052110 |

@ -1 +1 @@
from paddlespeech.t2s.frontend.g2pw.onnx_api import G2PWOnnxConverter from .onnx_api import G2PWOnnxConverter

@ -15,6 +15,10 @@
Credits Credits
This code is modified from https://github.com/GitYCC/g2pW This code is modified from https://github.com/GitYCC/g2pW
""" """
from typing import Dict
from typing import List
from typing import Tuple
import numpy as np import numpy as np
from paddlespeech.t2s.frontend.g2pw.utils import tokenize_and_map from paddlespeech.t2s.frontend.g2pw.utils import tokenize_and_map
@ -23,22 +27,17 @@ ANCHOR_CHAR = '▁'
def prepare_onnx_input(tokenizer, def prepare_onnx_input(tokenizer,
labels, labels: List[str],
char2phonemes, char2phonemes: Dict[str, List[int]],
chars, chars: List[str],
texts, texts: List[str],
query_ids, query_ids: List[int],
phonemes=None, use_mask: bool=False,
pos_tags=None, window_size: int=None,
use_mask=False, max_len: int=512) -> Dict[str, np.array]:
use_char_phoneme=False,
use_pos=False,
window_size=None,
max_len=512):
if window_size is not None: if window_size is not None:
truncated_texts, truncated_query_ids = _truncate_texts(window_size, truncated_texts, truncated_query_ids = _truncate_texts(
texts, query_ids) window_size=window_size, texts=texts, query_ids=query_ids)
input_ids = [] input_ids = []
token_type_ids = [] token_type_ids = []
attention_masks = [] attention_masks = []
@ -51,13 +50,19 @@ def prepare_onnx_input(tokenizer,
query_id = (truncated_query_ids if window_size else query_ids)[idx] query_id = (truncated_query_ids if window_size else query_ids)[idx]
try: try:
tokens, text2token, token2text = tokenize_and_map(tokenizer, text) tokens, text2token, token2text = tokenize_and_map(
tokenizer=tokenizer, text=text)
except Exception: except Exception:
print(f'warning: text "{text}" is invalid') print(f'warning: text "{text}" is invalid')
return {} return {}
text, query_id, tokens, text2token, token2text = _truncate( text, query_id, tokens, text2token, token2text = _truncate(
max_len, text, query_id, tokens, text2token, token2text) max_len=max_len,
text=text,
query_id=query_id,
tokens=tokens,
text2token=text2token,
token2text=token2text)
processed_tokens = ['[CLS]'] + tokens + ['[SEP]'] processed_tokens = ['[CLS]'] + tokens + ['[SEP]']
@ -91,7 +96,8 @@ def prepare_onnx_input(tokenizer,
return outputs return outputs
def _truncate_texts(window_size, texts, query_ids): def _truncate_texts(window_size: int, texts: List[str],
query_ids: List[int]) -> Tuple[List[str], List[int]]:
truncated_texts = [] truncated_texts = []
truncated_query_ids = [] truncated_query_ids = []
for text, query_id in zip(texts, query_ids): for text, query_id in zip(texts, query_ids):
@ -105,7 +111,12 @@ def _truncate_texts(window_size, texts, query_ids):
return truncated_texts, truncated_query_ids return truncated_texts, truncated_query_ids
def _truncate(max_len, text, query_id, tokens, text2token, token2text): def _truncate(max_len: int,
text: str,
query_id: int,
tokens: List[str],
text2token: List[int],
token2text: List[Tuple[int]]):
truncate_len = max_len - 2 truncate_len = max_len - 2
if len(tokens) <= truncate_len: if len(tokens) <= truncate_len:
return (text, query_id, tokens, text2token, token2text) return (text, query_id, tokens, text2token, token2text)
@ -132,18 +143,8 @@ def _truncate(max_len, text, query_id, tokens, text2token, token2text):
], [(s - start, e - start) for s, e in token2text[token_start:token_end]]) ], [(s - start, e - start) for s, e in token2text[token_start:token_end]])
def prepare_data(sent_path, lb_path=None): def get_phoneme_labels(polyphonic_chars: List[List[str]]
raw_texts = open(sent_path).read().rstrip().split('\n') ) -> Tuple[List[str], Dict[str, List[int]]]:
query_ids = [raw.index(ANCHOR_CHAR) for raw in raw_texts]
texts = [raw.replace(ANCHOR_CHAR, '') for raw in raw_texts]
if lb_path is None:
return texts, query_ids
else:
phonemes = open(lb_path).read().rstrip().split('\n')
return texts, query_ids, phonemes
def get_phoneme_labels(polyphonic_chars):
labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars]))) labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars])))
char2phonemes = {} char2phonemes = {}
for char, phoneme in polyphonic_chars: for char, phoneme in polyphonic_chars:
@ -153,7 +154,8 @@ def get_phoneme_labels(polyphonic_chars):
return labels, char2phonemes return labels, char2phonemes
def get_char_phoneme_labels(polyphonic_chars): def get_char_phoneme_labels(polyphonic_chars: List[List[str]]
) -> Tuple[List[str], Dict[str, List[int]]]:
labels = sorted( labels = sorted(
list(set([f'{char} {phoneme}' for char, phoneme in polyphonic_chars]))) list(set([f'{char} {phoneme}' for char, phoneme in polyphonic_chars])))
char2phonemes = {} char2phonemes = {}

@ -17,6 +17,10 @@ Credits
""" """
import json import json
import os import os
from typing import Any
from typing import Dict
from typing import List
from typing import Tuple
import numpy as np import numpy as np
import onnxruntime import onnxruntime
@ -37,7 +41,8 @@ from paddlespeech.utils.env import MODEL_HOME
model_version = '1.1' model_version = '1.1'
def predict(session, onnx_input, labels): def predict(session, onnx_input: Dict[str, Any],
labels: List[str]) -> Tuple[List[str], List[float]]:
all_preds = [] all_preds = []
all_confidences = [] all_confidences = []
probs = session.run([], { probs = session.run([], {
@ -61,10 +66,10 @@ def predict(session, onnx_input, labels):
class G2PWOnnxConverter: class G2PWOnnxConverter:
def __init__(self, def __init__(self,
model_dir=MODEL_HOME, model_dir: os.PathLike=MODEL_HOME,
style='bopomofo', style: str='bopomofo',
model_source=None, model_source: str=None,
enable_non_tradional_chinese=False): enable_non_tradional_chinese: bool=False):
uncompress_path = download_and_decompress( uncompress_path = download_and_decompress(
g2pw_onnx_models['G2PWModel'][model_version], model_dir) g2pw_onnx_models['G2PWModel'][model_version], model_dir)
@ -76,7 +81,8 @@ class G2PWOnnxConverter:
os.path.join(uncompress_path, 'g2pW.onnx'), os.path.join(uncompress_path, 'g2pW.onnx'),
sess_options=sess_options) sess_options=sess_options)
self.config = load_config( self.config = load_config(
os.path.join(uncompress_path, 'config.py'), use_default=True) config_path=os.path.join(uncompress_path, 'config.py'),
use_default=True)
self.model_source = model_source if model_source else self.config.model_source self.model_source = model_source if model_source else self.config.model_source
self.enable_opencc = enable_non_tradional_chinese self.enable_opencc = enable_non_tradional_chinese
@ -103,9 +109,9 @@ class G2PWOnnxConverter:
.strip().split('\n') .strip().split('\n')
] ]
self.labels, self.char2phonemes = get_char_phoneme_labels( self.labels, self.char2phonemes = get_char_phoneme_labels(
self.polyphonic_chars polyphonic_chars=self.polyphonic_chars
) if self.config.use_char_phoneme else get_phoneme_labels( ) if self.config.use_char_phoneme else get_phoneme_labels(
self.polyphonic_chars) polyphonic_chars=self.polyphonic_chars)
self.chars = sorted(list(self.char2phonemes.keys())) self.chars = sorted(list(self.char2phonemes.keys()))
@ -146,7 +152,7 @@ class G2PWOnnxConverter:
if self.enable_opencc: if self.enable_opencc:
self.cc = OpenCC('s2tw') self.cc = OpenCC('s2tw')
def _convert_bopomofo_to_pinyin(self, bopomofo): def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str:
tone = bopomofo[-1] tone = bopomofo[-1]
assert tone in '12345' assert tone in '12345'
component = self.bopomofo_convert_dict.get(bopomofo[:-1]) component = self.bopomofo_convert_dict.get(bopomofo[:-1])
@ -156,7 +162,7 @@ class G2PWOnnxConverter:
print(f'Warning: "{bopomofo}" cannot convert to pinyin') print(f'Warning: "{bopomofo}" cannot convert to pinyin')
return None return None
def __call__(self, sentences): def __call__(self, sentences: List[str]) -> List[List[str]]:
if isinstance(sentences, str): if isinstance(sentences, str):
sentences = [sentences] sentences = [sentences]
@ -169,23 +175,25 @@ class G2PWOnnxConverter:
sentences = translated_sentences sentences = translated_sentences
texts, query_ids, sent_ids, partial_results = self._prepare_data( texts, query_ids, sent_ids, partial_results = self._prepare_data(
sentences) sentences=sentences)
if len(texts) == 0: if len(texts) == 0:
# sentences no polyphonic words # sentences no polyphonic words
return partial_results return partial_results
onnx_input = prepare_onnx_input( onnx_input = prepare_onnx_input(
self.tokenizer, tokenizer=self.tokenizer,
self.labels, labels=self.labels,
self.char2phonemes, char2phonemes=self.char2phonemes,
self.chars, chars=self.chars,
texts, texts=texts,
query_ids, query_ids=query_ids,
use_mask=self.config.use_mask, use_mask=self.config.use_mask,
use_char_phoneme=self.config.use_char_phoneme,
window_size=None) window_size=None)
preds, confidences = predict(self.session_g2pW, onnx_input, self.labels) preds, confidences = predict(
session=self.session_g2pW,
onnx_input=onnx_input,
labels=self.labels)
if self.config.use_char_phoneme: if self.config.use_char_phoneme:
preds = [pred.split(' ')[1] for pred in preds] preds = [pred.split(' ')[1] for pred in preds]
@ -195,7 +203,9 @@ class G2PWOnnxConverter:
return results return results
def _prepare_data(self, sentences): def _prepare_data(
self, sentences: List[str]
) -> Tuple[List[str], List[int], List[int], List[List[str]]]:
texts, query_ids, sent_ids, partial_results = [], [], [], [] texts, query_ids, sent_ids, partial_results = [], [], [], []
for sent_id, sent in enumerate(sentences): for sent_id, sent in enumerate(sentences):
# pypinyin works well for Simplified Chinese than Traditional Chinese # pypinyin works well for Simplified Chinese than Traditional Chinese

@ -15,10 +15,11 @@
Credits Credits
This code is modified from https://github.com/GitYCC/g2pW This code is modified from https://github.com/GitYCC/g2pW
""" """
import os
import re import re
def wordize_and_map(text): def wordize_and_map(text: str):
words = [] words = []
index_map_from_text_to_word = [] index_map_from_text_to_word = []
index_map_from_word_to_text = [] index_map_from_word_to_text = []
@ -54,8 +55,8 @@ def wordize_and_map(text):
return words, index_map_from_text_to_word, index_map_from_word_to_text return words, index_map_from_text_to_word, index_map_from_word_to_text
def tokenize_and_map(tokenizer, text): def tokenize_and_map(tokenizer, text: str):
words, text2word, word2text = wordize_and_map(text) words, text2word, word2text = wordize_and_map(text=text)
tokens = [] tokens = []
index_map_from_token_to_text = [] index_map_from_token_to_text = []
@ -82,7 +83,7 @@ def tokenize_and_map(tokenizer, text):
return tokens, index_map_from_text_to_token, index_map_from_token_to_text return tokens, index_map_from_text_to_token, index_map_from_token_to_text
def _load_config(config_path): def _load_config(config_path: os.PathLike):
import importlib.util import importlib.util
spec = importlib.util.spec_from_file_location('__init__', config_path) spec = importlib.util.spec_from_file_location('__init__', config_path)
config = importlib.util.module_from_spec(spec) config = importlib.util.module_from_spec(spec)
@ -130,7 +131,7 @@ default_config_dict = {
} }
def load_config(config_path, use_default=False): def load_config(config_path: os.PathLike, use_default: bool=False):
config = _load_config(config_path) config = _load_config(config_path)
if use_default: if use_default:
for attr, val in default_config_dict.items(): for attr, val in default_config_dict.items():

@ -71,31 +71,53 @@ class MLMEncoder(nn.Layer):
"""Conformer encoder module. """Conformer encoder module.
Args: Args:
idim (int): Input dimension. idim (int):
attention_dim (int): Dimension of attention. Input dimension.
attention_heads (int): The number of heads of multi head attention. attention_dim (int):
linear_units (int): The number of units of position-wise feed forward. Dimension of attention.
num_blocks (int): The number of decoder blocks. attention_heads (int):
dropout_rate (float): Dropout rate. The number of heads of multi head attention.
positional_dropout_rate (float): Dropout rate after adding positional encoding. linear_units (int):
attention_dropout_rate (float): Dropout rate in attention. The number of units of position-wise feed forward.
input_layer (Union[str, paddle.nn.Layer]): Input layer type. num_blocks (int):
normalize_before (bool): Whether to use layer_norm before the first block. The number of decoder blocks.
concat_after (bool): Whether to concat attention layer's input and output. dropout_rate (float):
Dropout rate.
positional_dropout_rate (float):
Dropout rate after adding positional encoding.
attention_dropout_rate (float):
Dropout rate in attention.
input_layer (Union[str, paddle.nn.Layer]):
Input layer type.
normalize_before (bool):
Whether to use layer_norm before the first block.
concat_after (bool):
Whether to concat attention layer's input and output.
if True, additional linear will be applied. if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x))) i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x) if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". positionwise_layer_type (str):
positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. "linear", "conv1d", or "conv1d-linear".
macaron_style (bool): Whether to use macaron style for positionwise layer. positionwise_conv_kernel_size (int):
pos_enc_layer_type (str): Encoder positional encoding layer type. Kernel size of positionwise conv1d layer.
selfattention_layer_type (str): Encoder attention layer type. macaron_style (bool):
activation_type (str): Encoder activation function type. Whether to use macaron style for positionwise layer.
use_cnn_module (bool): Whether to use convolution module. pos_enc_layer_type (str):
zero_triu (bool): Whether to zero the upper triangular part of attention matrix. Encoder positional encoding layer type.
cnn_module_kernel (int): Kernerl size of convolution module. selfattention_layer_type (str):
padding_idx (int): Padding idx for input_layer=embed. Encoder attention layer type.
stochastic_depth_rate (float): Maximum probability to skip the encoder layer. activation_type (str):
Encoder activation function type.
use_cnn_module (bool):
Whether to use convolution module.
zero_triu (bool):
Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel (int):
Kernerl size of convolution module.
padding_idx (int):
Padding idx for input_layer=embed.
stochastic_depth_rate (float):
Maximum probability to skip the encoder layer.
""" """
@ -320,12 +342,16 @@ class MLMDecoder(MLMEncoder):
"""Encode input sequence. """Encode input sequence.
Args: Args:
xs (paddle.Tensor): Input tensor (#batch, time, idim). xs (paddle.Tensor):
masks (paddle.Tensor): Mask tensor (#batch, time). Input tensor (#batch, time, idim).
masks (paddle.Tensor):
Mask tensor (#batch, time).
Returns: Returns:
paddle.Tensor: Output tensor (#batch, time, attention_dim). paddle.Tensor:
paddle.Tensor: Mask tensor (#batch, time). Output tensor (#batch, time, attention_dim).
paddle.Tensor:
Mask tensor (#batch, time).
""" """
xs = self.embed(xs) xs = self.embed(xs)
@ -392,19 +418,27 @@ class MLM(nn.Layer):
use_teacher_forcing: bool=True, ) -> List[paddle.Tensor]: use_teacher_forcing: bool=True, ) -> List[paddle.Tensor]:
''' '''
Args: Args:
speech (paddle.Tensor): input speech (1, Tmax, D). speech (paddle.Tensor):
text (paddle.Tensor): input text (1, Tmax2). input speech (1, Tmax, D).
masked_pos (paddle.Tensor): masked position of input speech (1, Tmax) text (paddle.Tensor):
speech_mask (paddle.Tensor): mask of speech (1, 1, Tmax). input text (1, Tmax2).
text_mask (paddle.Tensor): mask of text (1, 1, Tmax2). masked_pos (paddle.Tensor):
speech_seg_pos (paddle.Tensor): n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax). masked position of input speech (1, Tmax)
text_seg_pos (paddle.Tensor): n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2). speech_mask (paddle.Tensor):
span_bdy (List[int]): masked mel boundary of input speech (2,) mask of speech (1, 1, Tmax).
use_teacher_forcing (bool): whether to use teacher forcing text_mask (paddle.Tensor):
mask of text (1, 1, Tmax2).
speech_seg_pos (paddle.Tensor):
n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax).
text_seg_pos (paddle.Tensor):
n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2).
span_bdy (List[int]):
masked mel boundary of input speech (2,)
use_teacher_forcing (bool):
whether to use teacher forcing
Returns: Returns:
List[Tensor]: List[Tensor]:
eg: eg: [Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])]
[Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])]
''' '''
z_cache = None z_cache = None

@ -48,12 +48,18 @@ class StochasticDurationPredictor(nn.Layer):
global_channels: int=-1, ): global_channels: int=-1, ):
"""Initialize StochasticDurationPredictor module. """Initialize StochasticDurationPredictor module.
Args: Args:
channels (int): Number of channels. channels (int):
kernel_size (int): Kernel size. Number of channels.
dropout_rate (float): Dropout rate. kernel_size (int):
flows (int): Number of flows. Kernel size.
dds_conv_layers (int): Number of conv layers in DDS conv. dropout_rate (float):
global_channels (int): Number of global conditioning channels. Dropout rate.
flows (int):
Number of flows.
dds_conv_layers (int):
Number of conv layers in DDS conv.
global_channels (int):
Number of global conditioning channels.
""" """
super().__init__() super().__init__()
@ -108,14 +114,21 @@ class StochasticDurationPredictor(nn.Layer):
noise_scale: float=1.0, ) -> paddle.Tensor: noise_scale: float=1.0, ) -> paddle.Tensor:
"""Calculate forward propagation. """Calculate forward propagation.
Args: Args:
x (Tensor): Input tensor (B, channels, T_text). x (Tensor):
x_mask (Tensor): Mask tensor (B, 1, T_text). Input tensor (B, channels, T_text).
w (Optional[Tensor]): Duration tensor (B, 1, T_text). x_mask (Tensor):
g (Optional[Tensor]): Global conditioning tensor (B, channels, 1) Mask tensor (B, 1, T_text).
inverse (bool): Whether to inverse the flow. w (Optional[Tensor]):
noise_scale (float): Noise scale value. Duration tensor (B, 1, T_text).
g (Optional[Tensor]):
Global conditioning tensor (B, channels, 1)
inverse (bool):
Whether to inverse the flow.
noise_scale (float):
Noise scale value.
Returns: Returns:
Tensor: If not inverse, negative log-likelihood (NLL) tensor (B,). Tensor:
If not inverse, negative log-likelihood (NLL) tensor (B,).
If inverse, log-duration tensor (B, 1, T_text). If inverse, log-duration tensor (B, 1, T_text).
""" """
# stop gradient # stop gradient

@ -34,11 +34,15 @@ class FlipFlow(nn.Layer):
) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: ) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
"""Calculate forward propagation. """Calculate forward propagation.
Args: Args:
x (Tensor): Input tensor (B, channels, T). x (Tensor):
inverse (bool): Whether to inverse the flow. Input tensor (B, channels, T).
inverse (bool):
Whether to inverse the flow.
Returns: Returns:
Tensor: Flipped tensor (B, channels, T). Tensor:
Tensor: Log-determinant tensor for NLL (B,) if not inverse. Flipped tensor (B, channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
""" """
x = paddle.flip(x, [1]) x = paddle.flip(x, [1])
if not inverse: if not inverse:
@ -60,13 +64,19 @@ class LogFlow(nn.Layer):
) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: ) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
"""Calculate forward propagation. """Calculate forward propagation.
Args: Args:
x (Tensor): Input tensor (B, channels, T). x (Tensor):
x_mask (Tensor): Mask tensor (B, 1, T). Input tensor (B, channels, T).
inverse (bool): Whether to inverse the flow. x_mask (Tensor):
eps (float): Epsilon for log. Mask tensor (B, 1, T).
inverse (bool):
Whether to inverse the flow.
eps (float):
Epsilon for log.
Returns: Returns:
Tensor: Output tensor (B, channels, T). Tensor:
Tensor: Log-determinant tensor for NLL (B,) if not inverse. Output tensor (B, channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
""" """
if not inverse: if not inverse:
y = paddle.log(paddle.clip(x, min=eps)) * x_mask y = paddle.log(paddle.clip(x, min=eps)) * x_mask
@ -83,7 +93,8 @@ class ElementwiseAffineFlow(nn.Layer):
def __init__(self, channels: int): def __init__(self, channels: int):
"""Initialize ElementwiseAffineFlow module. """Initialize ElementwiseAffineFlow module.
Args: Args:
channels (int): Number of channels. channels (int):
Number of channels.
""" """
super().__init__() super().__init__()
self.channels = channels self.channels = channels
@ -107,12 +118,17 @@ class ElementwiseAffineFlow(nn.Layer):
) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: ) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
"""Calculate forward propagation. """Calculate forward propagation.
Args: Args:
x (Tensor): Input tensor (B, channels, T). x (Tensor):
x_mask (Tensor): Mask tensor (B, 1, T). Input tensor (B, channels, T).
inverse (bool): Whether to inverse the flow. x_mask (Tensor):
Mask tensor (B, 1, T).
inverse (bool):
Whether to inverse the flow.
Returns: Returns:
Tensor: Output tensor (B, channels, T). Tensor:
Tensor: Log-determinant tensor for NLL (B,) if not inverse. Output tensor (B, channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
""" """
if not inverse: if not inverse:
y = self.m + paddle.exp(self.logs) * x y = self.m + paddle.exp(self.logs) * x
@ -157,11 +173,16 @@ class DilatedDepthSeparableConv(nn.Layer):
eps: float=1e-5, ): eps: float=1e-5, ):
"""Initialize DilatedDepthSeparableConv module. """Initialize DilatedDepthSeparableConv module.
Args: Args:
channels (int): Number of channels. channels (int):
kernel_size (int): Kernel size. Number of channels.
layers (int): Number of layers. kernel_size (int):
dropout_rate (float): Dropout rate. Kernel size.
eps (float): Epsilon for layer norm. layers (int):
Number of layers.
dropout_rate (float):
Dropout rate.
eps (float):
Epsilon for layer norm.
""" """
super().__init__() super().__init__()
@ -198,11 +219,15 @@ class DilatedDepthSeparableConv(nn.Layer):
g: Optional[paddle.Tensor]=None) -> paddle.Tensor: g: Optional[paddle.Tensor]=None) -> paddle.Tensor:
"""Calculate forward propagation. """Calculate forward propagation.
Args: Args:
x (Tensor): Input tensor (B, in_channels, T). x (Tensor):
x_mask (Tensor): Mask tensor (B, 1, T). Input tensor (B, in_channels, T).
g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1). x_mask (Tensor):
Mask tensor (B, 1, T).
g (Optional[Tensor]):
Global conditioning tensor (B, global_channels, 1).
Returns: Returns:
Tensor: Output tensor (B, channels, T). Tensor:
Output tensor (B, channels, T).
""" """
if g is not None: if g is not None:
x = x + g x = x + g
@ -225,12 +250,18 @@ class ConvFlow(nn.Layer):
tail_bound: float=5.0, ): tail_bound: float=5.0, ):
"""Initialize ConvFlow module. """Initialize ConvFlow module.
Args: Args:
in_channels (int): Number of input channels. in_channels (int):
hidden_channels (int): Number of hidden channels. Number of input channels.
kernel_size (int): Kernel size. hidden_channels (int):
layers (int): Number of layers. Number of hidden channels.
bins (int): Number of bins. kernel_size (int):
tail_bound (float): Tail bound value. Kernel size.
layers (int):
Number of layers.
bins (int):
Number of bins.
tail_bound (float):
Tail bound value.
""" """
super().__init__() super().__init__()
self.half_channels = in_channels // 2 self.half_channels = in_channels // 2
@ -275,13 +306,19 @@ class ConvFlow(nn.Layer):
) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: ) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
"""Calculate forward propagation. """Calculate forward propagation.
Args: Args:
x (Tensor): Input tensor (B, channels, T). x (Tensor):
x_mask (Tensor): Mask tensor (B, 1, T). Input tensor (B, channels, T).
g (Optional[Tensor]): Global conditioning tensor (B, channels, 1). x_mask (Tensor):
inverse (bool): Whether to inverse the flow. Mask tensor (B, 1, T).
g (Optional[Tensor]):
Global conditioning tensor (B, channels, 1).
inverse (bool):
Whether to inverse the flow.
Returns: Returns:
Tensor: Output tensor (B, channels, T). Tensor:
Tensor: Log-determinant tensor for NLL (B,) if not inverse. Output tensor (B, channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
""" """
xa, xb = x.split(2, 1) xa, xb = x.split(2, 1)
h = self.input_conv(xa) h = self.input_conv(xa)

@ -97,81 +97,104 @@ class VITSGenerator(nn.Layer):
stochastic_duration_predictor_dds_conv_layers: int=3, ): stochastic_duration_predictor_dds_conv_layers: int=3, ):
"""Initialize VITS generator module. """Initialize VITS generator module.
Args: Args:
vocabs (int): Input vocabulary size. vocabs (int):
aux_channels (int): Number of acoustic feature channels. Input vocabulary size.
hidden_channels (int): Number of hidden channels. aux_channels (int):
spks (Optional[int]): Number of speakers. If set to > 1, assume that the Number of acoustic feature channels.
hidden_channels (int):
Number of hidden channels.
spks (Optional[int]):
Number of speakers. If set to > 1, assume that the
sids will be provided as the input and use sid embedding layer. sids will be provided as the input and use sid embedding layer.
langs (Optional[int]): Number of languages. If set to > 1, assume that the langs (Optional[int]):
Number of languages. If set to > 1, assume that the
lids will be provided as the input and use sid embedding layer. lids will be provided as the input and use sid embedding layer.
spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0, spk_embed_dim (Optional[int]):
Speaker embedding dimension. If set to > 0,
assume that spembs will be provided as the input. assume that spembs will be provided as the input.
global_channels (int): Number of global conditioning channels. global_channels (int):
segment_size (int): Segment size for decoder. Number of global conditioning channels.
text_encoder_attention_heads (int): Number of heads in conformer block segment_size (int):
of text encoder. Segment size for decoder.
text_encoder_ffn_expand (int): Expansion ratio of FFN in conformer block text_encoder_attention_heads (int):
of text encoder. Number of heads in conformer block of text encoder.
text_encoder_blocks (int): Number of conformer blocks in text encoder. text_encoder_ffn_expand (int):
text_encoder_positionwise_layer_type (str): Position-wise layer type in Expansion ratio of FFN in conformer block of text encoder.
conformer block of text encoder. text_encoder_blocks (int):
text_encoder_positionwise_conv_kernel_size (int): Position-wise convolution Number of conformer blocks in text encoder.
kernel size in conformer block of text encoder. Only used when the text_encoder_positionwise_layer_type (str):
above layer type is conv1d or conv1d-linear. Position-wise layer type in conformer block of text encoder.
text_encoder_positional_encoding_layer_type (str): Positional encoding layer text_encoder_positionwise_conv_kernel_size (int):
type in conformer block of text encoder. Position-wise convolution kernel size in conformer block of text encoder.
text_encoder_self_attention_layer_type (str): Self-attention layer type in Only used when the above layer type is conv1d or conv1d-linear.
conformer block of text encoder. text_encoder_positional_encoding_layer_type (str):
text_encoder_activation_type (str): Activation function type in conformer Positional encoding layer type in conformer block of text encoder.
block of text encoder. text_encoder_self_attention_layer_type (str):
text_encoder_normalize_before (bool): Whether to apply layer norm before Self-attention layer type in conformer block of text encoder.
self-attention in conformer block of text encoder. text_encoder_activation_type (str):
text_encoder_dropout_rate (float): Dropout rate in conformer block of Activation function type in conformer block of text encoder.
text encoder. text_encoder_normalize_before (bool):
text_encoder_positional_dropout_rate (float): Dropout rate for positional Whether to apply layer norm before self-attention in conformer block of text encoder.
encoding in conformer block of text encoder. text_encoder_dropout_rate (float):
text_encoder_attention_dropout_rate (float): Dropout rate for attention in Dropout rate in conformer block of text encoder.
conformer block of text encoder. text_encoder_positional_dropout_rate (float):
text_encoder_conformer_kernel_size (int): Conformer conv kernel size. It Dropout rate for positional encoding in conformer block of text encoder.
will be used when only use_conformer_conv_in_text_encoder = True. text_encoder_attention_dropout_rate (float):
use_macaron_style_in_text_encoder (bool): Whether to use macaron style FFN Dropout rate for attention in conformer block of text encoder.
in conformer block of text encoder. text_encoder_conformer_kernel_size (int):
use_conformer_conv_in_text_encoder (bool): Whether to use covolution in Conformer conv kernel size. It will be used when only use_conformer_conv_in_text_encoder = True.
conformer block of text encoder. use_macaron_style_in_text_encoder (bool):
decoder_kernel_size (int): Decoder kernel size. Whether to use macaron style FFN in conformer block of text encoder.
decoder_channels (int): Number of decoder initial channels. use_conformer_conv_in_text_encoder (bool):
decoder_upsample_scales (List[int]): List of upsampling scales in decoder. Whether to use covolution in conformer block of text encoder.
decoder_upsample_kernel_sizes (List[int]): List of kernel size for decoder_kernel_size (int):
upsampling layers in decoder. Decoder kernel size.
decoder_resblock_kernel_sizes (List[int]): List of kernel size for resblocks decoder_channels (int):
in decoder. Number of decoder initial channels.
decoder_resblock_dilations (List[List[int]]): List of list of dilations for decoder_upsample_scales (List[int]):
resblocks in decoder. List of upsampling scales in decoder.
use_weight_norm_in_decoder (bool): Whether to apply weight normalization in decoder_upsample_kernel_sizes (List[int]):
decoder. List of kernel size for upsampling layers in decoder.
posterior_encoder_kernel_size (int): Posterior encoder kernel size. decoder_resblock_kernel_sizes (List[int]):
posterior_encoder_layers (int): Number of layers of posterior encoder. List of kernel size for resblocks in decoder.
posterior_encoder_stacks (int): Number of stacks of posterior encoder. decoder_resblock_dilations (List[List[int]]):
posterior_encoder_base_dilation (int): Base dilation of posterior encoder. List of list of dilations for resblocks in decoder.
posterior_encoder_dropout_rate (float): Dropout rate for posterior encoder. use_weight_norm_in_decoder (bool):
use_weight_norm_in_posterior_encoder (bool): Whether to apply weight Whether to apply weight normalization in decoder.
normalization in posterior encoder. posterior_encoder_kernel_size (int):
flow_flows (int): Number of flows in flow. Posterior encoder kernel size.
flow_kernel_size (int): Kernel size in flow. posterior_encoder_layers (int):
flow_base_dilation (int): Base dilation in flow. Number of layers of posterior encoder.
flow_layers (int): Number of layers in flow. posterior_encoder_stacks (int):
flow_dropout_rate (float): Dropout rate in flow Number of stacks of posterior encoder.
use_weight_norm_in_flow (bool): Whether to apply weight normalization in posterior_encoder_base_dilation (int):
flow. Base dilation of posterior encoder.
use_only_mean_in_flow (bool): Whether to use only mean in flow. posterior_encoder_dropout_rate (float):
stochastic_duration_predictor_kernel_size (int): Kernel size in stochastic Dropout rate for posterior encoder.
duration predictor. use_weight_norm_in_posterior_encoder (bool):
stochastic_duration_predictor_dropout_rate (float): Dropout rate in Whether to apply weight normalization in posterior encoder.
stochastic duration predictor. flow_flows (int):
stochastic_duration_predictor_flows (int): Number of flows in stochastic Number of flows in flow.
duration predictor. flow_kernel_size (int):
stochastic_duration_predictor_dds_conv_layers (int): Number of DDS conv Kernel size in flow.
layers in stochastic duration predictor. flow_base_dilation (int):
Base dilation in flow.
flow_layers (int):
Number of layers in flow.
flow_dropout_rate (float):
Dropout rate in flow
use_weight_norm_in_flow (bool):
Whether to apply weight normalization in flow.
use_only_mean_in_flow (bool):
Whether to use only mean in flow.
stochastic_duration_predictor_kernel_size (int):
Kernel size in stochastic duration predictor.
stochastic_duration_predictor_dropout_rate (float):
Dropout rate in stochastic duration predictor.
stochastic_duration_predictor_flows (int):
Number of flows in stochastic duration predictor.
stochastic_duration_predictor_dds_conv_layers (int):
Number of DDS conv layers in stochastic duration predictor.
""" """
super().__init__() super().__init__()
self.segment_size = segment_size self.segment_size = segment_size
@ -272,20 +295,33 @@ class VITSGenerator(nn.Layer):
paddle.Tensor, paddle.Tensor, ], ]: paddle.Tensor, paddle.Tensor, ], ]:
"""Calculate forward propagation. """Calculate forward propagation.
Args: Args:
text (Tensor): Text index tensor (B, T_text). text (Tensor):
text_lengths (Tensor): Text length tensor (B,). Text index tensor (B, T_text).
feats (Tensor): Feature tensor (B, aux_channels, T_feats). text_lengths (Tensor):
feats_lengths (Tensor): Feature length tensor (B,). Text length tensor (B,).
sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1). feats (Tensor):
spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim). Feature tensor (B, aux_channels, T_feats).
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). feats_lengths (Tensor):
Feature length tensor (B,).
sids (Optional[Tensor]):
Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]):
Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
Returns: Returns:
Tensor: Waveform tensor (B, 1, segment_size * upsample_factor). Tensor:
Tensor: Duration negative log-likelihood (NLL) tensor (B,). Waveform tensor (B, 1, segment_size * upsample_factor).
Tensor: Monotonic attention weight tensor (B, 1, T_feats, T_text). Tensor:
Tensor: Segments start index tensor (B,). Duration negative log-likelihood (NLL) tensor (B,).
Tensor: Text mask tensor (B, 1, T_text). Tensor:
Tensor: Feature mask tensor (B, 1, T_feats). Monotonic attention weight tensor (B, 1, T_feats, T_text).
Tensor:
Segments start index tensor (B,).
Tensor:
Text mask tensor (B, 1, T_text).
Tensor:
Feature mask tensor (B, 1, T_feats).
tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
- Tensor: Posterior encoder hidden representation (B, H, T_feats). - Tensor: Posterior encoder hidden representation (B, H, T_feats).
- Tensor: Flow hidden representation (B, H, T_feats). - Tensor: Flow hidden representation (B, H, T_feats).
@ -402,24 +438,40 @@ class VITSGenerator(nn.Layer):
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Run inference. """Run inference.
Args: Args:
text (Tensor): Input text index tensor (B, T_text,). text (Tensor):
text_lengths (Tensor): Text length tensor (B,). Input text index tensor (B, T_text,).
feats (Tensor): Feature tensor (B, aux_channels, T_feats,). text_lengths (Tensor):
feats_lengths (Tensor): Feature length tensor (B,). Text length tensor (B,).
sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1). feats (Tensor):
spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim). Feature tensor (B, aux_channels, T_feats,).
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). feats_lengths (Tensor):
dur (Optional[Tensor]): Ground-truth duration (B, T_text,). If provided, Feature length tensor (B,).
sids (Optional[Tensor]):
Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]):
Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
dur (Optional[Tensor]):
Ground-truth duration (B, T_text,). If provided,
skip the prediction of durations (i.e., teacher forcing). skip the prediction of durations (i.e., teacher forcing).
noise_scale (float): Noise scale parameter for flow. noise_scale (float):
noise_scale_dur (float): Noise scale parameter for duration predictor. Noise scale parameter for flow.
alpha (float): Alpha parameter to control the speed of generated speech. noise_scale_dur (float):
max_len (Optional[int]): Maximum length of acoustic feature sequence. Noise scale parameter for duration predictor.
use_teacher_forcing (bool): Whether to use teacher forcing. alpha (float):
Alpha parameter to control the speed of generated speech.
max_len (Optional[int]):
Maximum length of acoustic feature sequence.
use_teacher_forcing (bool):
Whether to use teacher forcing.
Returns: Returns:
Tensor: Generated waveform tensor (B, T_wav). Tensor:
Tensor: Monotonic attention weight tensor (B, T_feats, T_text). Generated waveform tensor (B, T_wav).
Tensor: Duration tensor (B, T_text). Tensor:
Monotonic attention weight tensor (B, T_feats, T_text).
Tensor:
Duration tensor (B, T_text).
""" """
# encoder # encoder
x, m_p, logs_p, x_mask = self.text_encoder(text, text_lengths) x, m_p, logs_p, x_mask = self.text_encoder(text, text_lengths)
@ -533,15 +585,23 @@ class VITSGenerator(nn.Layer):
lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor: lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor:
"""Run voice conversion. """Run voice conversion.
Args: Args:
feats (Tensor): Feature tensor (B, aux_channels, T_feats,). feats (Tensor):
feats_lengths (Tensor): Feature length tensor (B,). Feature tensor (B, aux_channels, T_feats,).
sids_src (Optional[Tensor]): Speaker index tensor of source feature (B,) or (B, 1). feats_lengths (Tensor):
sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (B,) or (B, 1). Feature length tensor (B,).
spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (B, spk_embed_dim). sids_src (Optional[Tensor]):
spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (B, spk_embed_dim). Speaker index tensor of source feature (B,) or (B, 1).
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). sids_tgt (Optional[Tensor]):
Speaker index tensor of target feature (B,) or (B, 1).
spembs_src (Optional[Tensor]):
Speaker embedding tensor of source feature (B, spk_embed_dim).
spembs_tgt (Optional[Tensor]):
Speaker embedding tensor of target feature (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
Returns: Returns:
Tensor: Generated waveform tensor (B, T_wav). Tensor:
Generated waveform tensor (B, T_wav).
""" """
# encoder # encoder
g_src = None g_src = None
@ -602,10 +662,13 @@ class VITSGenerator(nn.Layer):
mask: paddle.Tensor) -> paddle.Tensor: mask: paddle.Tensor) -> paddle.Tensor:
"""Generate path a.k.a. monotonic attention. """Generate path a.k.a. monotonic attention.
Args: Args:
dur (Tensor): Duration tensor (B, 1, T_text). dur (Tensor):
mask (Tensor): Attention mask tensor (B, 1, T_feats, T_text). Duration tensor (B, 1, T_text).
mask (Tensor):
Attention mask tensor (B, 1, T_feats, T_text).
Returns: Returns:
Tensor: Path tensor (B, 1, T_feats, T_text). Tensor:
Path tensor (B, 1, T_feats, T_text).
""" """
b, _, t_y, t_x = paddle.shape(mask) b, _, t_y, t_x = paddle.shape(mask)
cum_dur = paddle.cumsum(dur, -1) cum_dur = paddle.cumsum(dur, -1)

@ -52,17 +52,28 @@ class PosteriorEncoder(nn.Layer):
"""Initilialize PosteriorEncoder module. """Initilialize PosteriorEncoder module.
Args: Args:
in_channels (int): Number of input channels. in_channels (int):
out_channels (int): Number of output channels. Number of input channels.
hidden_channels (int): Number of hidden channels. out_channels (int):
kernel_size (int): Kernel size in WaveNet. Number of output channels.
layers (int): Number of layers of WaveNet. hidden_channels (int):
stacks (int): Number of repeat stacking of WaveNet. Number of hidden channels.
base_dilation (int): Base dilation factor. kernel_size (int):
global_channels (int): Number of global conditioning channels. Kernel size in WaveNet.
dropout_rate (float): Dropout rate. layers (int):
bias (bool): Whether to use bias parameters in conv. Number of layers of WaveNet.
use_weight_norm (bool): Whether to apply weight norm. stacks (int):
Number of repeat stacking of WaveNet.
base_dilation (int):
Base dilation factor.
global_channels (int):
Number of global conditioning channels.
dropout_rate (float):
Dropout rate.
bias (bool):
Whether to use bias parameters in conv.
use_weight_norm (bool):
Whether to apply weight norm.
""" """
super().__init__() super().__init__()
@ -99,15 +110,22 @@ class PosteriorEncoder(nn.Layer):
"""Calculate forward propagation. """Calculate forward propagation.
Args: Args:
x (Tensor): Input tensor (B, in_channels, T_feats). x (Tensor):
x_lengths (Tensor): Length tensor (B,). Input tensor (B, in_channels, T_feats).
g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1). x_lengths (Tensor):
Length tensor (B,).
g (Optional[Tensor]):
Global conditioning tensor (B, global_channels, 1).
Returns: Returns:
Tensor: Encoded hidden representation tensor (B, out_channels, T_feats). Tensor:
Tensor: Projected mean tensor (B, out_channels, T_feats). Encoded hidden representation tensor (B, out_channels, T_feats).
Tensor: Projected scale tensor (B, out_channels, T_feats). Tensor:
Tensor: Mask tensor for input tensor (B, 1, T_feats). Projected mean tensor (B, out_channels, T_feats).
Tensor:
Projected scale tensor (B, out_channels, T_feats).
Tensor:
Mask tensor for input tensor (B, 1, T_feats).
""" """
x_mask = make_non_pad_mask(x_lengths).unsqueeze(1) x_mask = make_non_pad_mask(x_lengths).unsqueeze(1)

@ -55,18 +55,30 @@ class ResidualAffineCouplingBlock(nn.Layer):
"""Initilize ResidualAffineCouplingBlock module. """Initilize ResidualAffineCouplingBlock module.
Args: Args:
in_channels (int): Number of input channels. in_channels (int):
hidden_channels (int): Number of hidden channels. Number of input channels.
flows (int): Number of flows. hidden_channels (int):
kernel_size (int): Kernel size for WaveNet. Number of hidden channels.
base_dilation (int): Base dilation factor for WaveNet. flows (int):
layers (int): Number of layers of WaveNet. Number of flows.
stacks (int): Number of stacks of WaveNet. kernel_size (int):
global_channels (int): Number of global channels. Kernel size for WaveNet.
dropout_rate (float): Dropout rate. base_dilation (int):
use_weight_norm (bool): Whether to use weight normalization in WaveNet. Base dilation factor for WaveNet.
bias (bool): Whether to use bias paramters in WaveNet. layers (int):
use_only_mean (bool): Whether to estimate only mean. Number of layers of WaveNet.
stacks (int):
Number of stacks of WaveNet.
global_channels (int):
Number of global channels.
dropout_rate (float):
Dropout rate.
use_weight_norm (bool):
Whether to use weight normalization in WaveNet.
bias (bool):
Whether to use bias paramters in WaveNet.
use_only_mean (bool):
Whether to estimate only mean.
""" """
super().__init__() super().__init__()
@ -97,10 +109,14 @@ class ResidualAffineCouplingBlock(nn.Layer):
"""Calculate forward propagation. """Calculate forward propagation.
Args: Args:
x (Tensor): Input tensor (B, in_channels, T). x (Tensor):
x_mask (Tensor): Length tensor (B, 1, T). Input tensor (B, in_channels, T).
g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1). x_mask (Tensor):
inverse (bool): Whether to inverse the flow. Length tensor (B, 1, T).
g (Optional[Tensor]):
Global conditioning tensor (B, global_channels, 1).
inverse (bool):
Whether to inverse the flow.
Returns: Returns:
Tensor: Output tensor (B, in_channels, T). Tensor: Output tensor (B, in_channels, T).
@ -134,17 +150,28 @@ class ResidualAffineCouplingLayer(nn.Layer):
"""Initialzie ResidualAffineCouplingLayer module. """Initialzie ResidualAffineCouplingLayer module.
Args: Args:
in_channels (int): Number of input channels. in_channels (int):
hidden_channels (int): Number of hidden channels. Number of input channels.
kernel_size (int): Kernel size for WaveNet. hidden_channels (int):
base_dilation (int): Base dilation factor for WaveNet. Number of hidden channels.
layers (int): Number of layers of WaveNet. kernel_size (int):
stacks (int): Number of stacks of WaveNet. Kernel size for WaveNet.
global_channels (int): Number of global channels. base_dilation (int):
dropout_rate (float): Dropout rate. Base dilation factor for WaveNet.
use_weight_norm (bool): Whether to use weight normalization in WaveNet. layers (int):
bias (bool): Whether to use bias paramters in WaveNet. Number of layers of WaveNet.
use_only_mean (bool): Whether to estimate only mean. stacks (int):
Number of stacks of WaveNet.
global_channels (int):
Number of global channels.
dropout_rate (float):
Dropout rate.
use_weight_norm (bool):
Whether to use weight normalization in WaveNet.
bias (bool):
Whether to use bias paramters in WaveNet.
use_only_mean (bool):
Whether to estimate only mean.
""" """
assert in_channels % 2 == 0, "in_channels should be divisible by 2" assert in_channels % 2 == 0, "in_channels should be divisible by 2"
@ -211,14 +238,20 @@ class ResidualAffineCouplingLayer(nn.Layer):
"""Calculate forward propagation. """Calculate forward propagation.
Args: Args:
x (Tensor): Input tensor (B, in_channels, T). x (Tensor):
x_lengths (Tensor): Length tensor (B,). Input tensor (B, in_channels, T).
g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1). x_lengths (Tensor):
inverse (bool): Whether to inverse the flow. Length tensor (B,).
g (Optional[Tensor]):
Global conditioning tensor (B, global_channels, 1).
inverse (bool):
Whether to inverse the flow.
Returns: Returns:
Tensor: Output tensor (B, in_channels, T). Tensor:
Tensor: Log-determinant tensor for NLL (B,) if not inverse. Output tensor (B, in_channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
""" """
xa, xb = paddle.split(x, 2, axis=1) xa, xb = paddle.split(x, 2, axis=1)

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save