Merge branch 'PaddlePaddle:develop' into develop

pull/2418/head
liangym 3 years ago committed by GitHub
commit 5c197e7016
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -20,4 +20,7 @@ Subpackages
paddlespeech.audio.io
paddlespeech.audio.metric
paddlespeech.audio.sox_effects
paddlespeech.audio.streamdata
paddlespeech.audio.text
paddlespeech.audio.transform
paddlespeech.audio.utils

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.autodecode module
===============================================
.. automodule:: paddlespeech.audio.streamdata.autodecode
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.cache module
==========================================
.. automodule:: paddlespeech.audio.streamdata.cache
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.compat module
===========================================
.. automodule:: paddlespeech.audio.streamdata.compat
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.extradatasets module
==================================================
.. automodule:: paddlespeech.audio.streamdata.extradatasets
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.filters module
============================================
.. automodule:: paddlespeech.audio.streamdata.filters
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.gopen module
==========================================
.. automodule:: paddlespeech.audio.streamdata.gopen
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.handlers module
=============================================
.. automodule:: paddlespeech.audio.streamdata.handlers
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.mix module
========================================
.. automodule:: paddlespeech.audio.streamdata.mix
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.paddle\_utils module
==================================================
.. automodule:: paddlespeech.audio.streamdata.paddle_utils
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.pipeline module
=============================================
.. automodule:: paddlespeech.audio.streamdata.pipeline
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,28 @@
paddlespeech.audio.streamdata package
=====================================
.. automodule:: paddlespeech.audio.streamdata
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.audio.streamdata.autodecode
paddlespeech.audio.streamdata.cache
paddlespeech.audio.streamdata.compat
paddlespeech.audio.streamdata.extradatasets
paddlespeech.audio.streamdata.filters
paddlespeech.audio.streamdata.gopen
paddlespeech.audio.streamdata.handlers
paddlespeech.audio.streamdata.mix
paddlespeech.audio.streamdata.paddle_utils
paddlespeech.audio.streamdata.pipeline
paddlespeech.audio.streamdata.shardlists
paddlespeech.audio.streamdata.tariterators
paddlespeech.audio.streamdata.utils
paddlespeech.audio.streamdata.writer

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.shardlists module
===============================================
.. automodule:: paddlespeech.audio.streamdata.shardlists
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.tariterators module
=================================================
.. automodule:: paddlespeech.audio.streamdata.tariterators
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.utils module
==========================================
.. automodule:: paddlespeech.audio.streamdata.utils
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.writer module
===========================================
.. automodule:: paddlespeech.audio.streamdata.writer
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,16 @@
paddlespeech.audio.text package
===============================
.. automodule:: paddlespeech.audio.text
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.audio.text.text_featurizer
paddlespeech.audio.text.utility

@ -0,0 +1,7 @@
paddlespeech.audio.text.text\_featurizer module
===============================================
.. automodule:: paddlespeech.audio.text.text_featurizer
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.text.utility module
======================================
.. automodule:: paddlespeech.audio.text.utility
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.add\_deltas module
===============================================
.. automodule:: paddlespeech.audio.transform.add_deltas
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.channel\_selector module
=====================================================
.. automodule:: paddlespeech.audio.transform.channel_selector
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.cmvn module
========================================
.. automodule:: paddlespeech.audio.transform.cmvn
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.functional module
==============================================
.. automodule:: paddlespeech.audio.transform.functional
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.perturb module
===========================================
.. automodule:: paddlespeech.audio.transform.perturb
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,24 @@
paddlespeech.audio.transform package
====================================
.. automodule:: paddlespeech.audio.transform
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.audio.transform.add_deltas
paddlespeech.audio.transform.channel_selector
paddlespeech.audio.transform.cmvn
paddlespeech.audio.transform.functional
paddlespeech.audio.transform.perturb
paddlespeech.audio.transform.spec_augment
paddlespeech.audio.transform.spectrogram
paddlespeech.audio.transform.transform_interface
paddlespeech.audio.transform.transformation
paddlespeech.audio.transform.wpe

@ -0,0 +1,7 @@
paddlespeech.audio.transform.spec\_augment module
=================================================
.. automodule:: paddlespeech.audio.transform.spec_augment
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.spectrogram module
===============================================
.. automodule:: paddlespeech.audio.transform.spectrogram
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.transform\_interface module
========================================================
.. automodule:: paddlespeech.audio.transform.transform_interface
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.transformation module
==================================================
.. automodule:: paddlespeech.audio.transform.transformation
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.wpe module
=======================================
.. automodule:: paddlespeech.audio.transform.wpe
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.utils.check\_kwargs module
=============================================
.. automodule:: paddlespeech.audio.utils.check_kwargs
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.utils.dynamic\_import module
===============================================
.. automodule:: paddlespeech.audio.utils.dynamic_import
:members:
:undoc-members:
:show-inheritance:

@ -12,8 +12,11 @@ Submodules
.. toctree::
:maxdepth: 4
paddlespeech.audio.utils.check_kwargs
paddlespeech.audio.utils.download
paddlespeech.audio.utils.dynamic_import
paddlespeech.audio.utils.error
paddlespeech.audio.utils.log
paddlespeech.audio.utils.numeric
paddlespeech.audio.utils.tensor_utils
paddlespeech.audio.utils.time

@ -0,0 +1,7 @@
paddlespeech.audio.utils.tensor\_utils module
=============================================
.. automodule:: paddlespeech.audio.utils.tensor_utils
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.collate module
=========================================
.. automodule:: paddlespeech.kws.exps.mdtc.collate
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.compute\_det module
==============================================
.. automodule:: paddlespeech.kws.exps.mdtc.compute_det
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.plot\_det\_curve module
==================================================
.. automodule:: paddlespeech.kws.exps.mdtc.plot_det_curve
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,19 @@
paddlespeech.kws.exps.mdtc package
==================================
.. automodule:: paddlespeech.kws.exps.mdtc
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.kws.exps.mdtc.collate
paddlespeech.kws.exps.mdtc.compute_det
paddlespeech.kws.exps.mdtc.plot_det_curve
paddlespeech.kws.exps.mdtc.score
paddlespeech.kws.exps.mdtc.train

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.score module
=======================================
.. automodule:: paddlespeech.kws.exps.mdtc.score
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.train module
=======================================
.. automodule:: paddlespeech.kws.exps.mdtc.train
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,15 @@
paddlespeech.kws.exps package
=============================
.. automodule:: paddlespeech.kws.exps
:members:
:undoc-members:
:show-inheritance:
Subpackages
-----------
.. toctree::
:maxdepth: 4
paddlespeech.kws.exps.mdtc

@ -12,4 +12,5 @@ Subpackages
.. toctree::
:maxdepth: 4
paddlespeech.kws.exps
paddlespeech.kws.models

@ -0,0 +1,7 @@
paddlespeech.resource.model\_alias module
=========================================
.. automodule:: paddlespeech.resource.model_alias
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.resource.pretrained\_models module
===============================================
.. automodule:: paddlespeech.resource.pretrained_models
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.resource.resource module
=====================================
.. automodule:: paddlespeech.resource.resource
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,17 @@
paddlespeech.resource package
=============================
.. automodule:: paddlespeech.resource
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.resource.model_alias
paddlespeech.resource.pretrained_models
paddlespeech.resource.resource

@ -16,8 +16,10 @@ Subpackages
paddlespeech.cli
paddlespeech.cls
paddlespeech.kws
paddlespeech.resource
paddlespeech.s2t
paddlespeech.server
paddlespeech.t2s
paddlespeech.text
paddlespeech.utils
paddlespeech.vector

@ -19,5 +19,4 @@ Subpackages
paddlespeech.s2t.models
paddlespeech.s2t.modules
paddlespeech.s2t.training
paddlespeech.s2t.transform
paddlespeech.s2t.utils

@ -18,7 +18,6 @@ Submodules
paddlespeech.server.utils.config
paddlespeech.server.utils.errors
paddlespeech.server.utils.exception
paddlespeech.server.utils.log
paddlespeech.server.utils.onnx_infer
paddlespeech.server.utils.paddle_predictor
paddlespeech.server.utils.util

@ -19,4 +19,5 @@ Submodules
paddlespeech.t2s.datasets.get_feats
paddlespeech.t2s.datasets.ljspeech
paddlespeech.t2s.datasets.preprocess_utils
paddlespeech.t2s.datasets.sampler
paddlespeech.t2s.datasets.vocoder_batch_fn

@ -0,0 +1,7 @@
paddlespeech.t2s.datasets.sampler module
========================================
.. automodule:: paddlespeech.t2s.datasets.sampler
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.align module
=============================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.align
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.normalize module
=================================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.normalize
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.preprocess module
==================================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.preprocess
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,21 @@
paddlespeech.t2s.exps.ernie\_sat package
========================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.t2s.exps.ernie_sat.align
paddlespeech.t2s.exps.ernie_sat.normalize
paddlespeech.t2s.exps.ernie_sat.preprocess
paddlespeech.t2s.exps.ernie_sat.synthesize
paddlespeech.t2s.exps.ernie_sat.synthesize_e2e
paddlespeech.t2s.exps.ernie_sat.train
paddlespeech.t2s.exps.ernie_sat.utils

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.synthesize module
==================================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.synthesize\_e2e module
=======================================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize_e2e
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.train module
=============================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.train
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.utils module
=============================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.utils
:members:
:undoc-members:
:show-inheritance:

@ -16,3 +16,4 @@ Submodules
paddlespeech.t2s.exps.fastspeech2.normalize
paddlespeech.t2s.exps.fastspeech2.preprocess
paddlespeech.t2s.exps.fastspeech2.train
paddlespeech.t2s.exps.fastspeech2.vc2_infer

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.fastspeech2.vc2\_infer module
===================================================
.. automodule:: paddlespeech.t2s.exps.fastspeech2.vc2_infer
:members:
:undoc-members:
:show-inheritance:

@ -12,11 +12,13 @@ Subpackages
.. toctree::
:maxdepth: 4
paddlespeech.t2s.exps.ernie_sat
paddlespeech.t2s.exps.fastspeech2
paddlespeech.t2s.exps.gan_vocoder
paddlespeech.t2s.exps.speedyspeech
paddlespeech.t2s.exps.tacotron2
paddlespeech.t2s.exps.transformer_tts
paddlespeech.t2s.exps.vits
paddlespeech.t2s.exps.waveflow
paddlespeech.t2s.exps.wavernn
@ -31,6 +33,7 @@ Submodules
paddlespeech.t2s.exps.ort_predict
paddlespeech.t2s.exps.ort_predict_e2e
paddlespeech.t2s.exps.ort_predict_streaming
paddlespeech.t2s.exps.stream_play_tts
paddlespeech.t2s.exps.syn_utils
paddlespeech.t2s.exps.synthesize
paddlespeech.t2s.exps.synthesize_e2e

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.stream\_play\_tts module
==============================================
.. automodule:: paddlespeech.t2s.exps.stream_play_tts
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.normalize module
===========================================
.. automodule:: paddlespeech.t2s.exps.vits.normalize
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.preprocess module
============================================
.. automodule:: paddlespeech.t2s.exps.vits.preprocess
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,20 @@
paddlespeech.t2s.exps.vits package
==================================
.. automodule:: paddlespeech.t2s.exps.vits
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.t2s.exps.vits.normalize
paddlespeech.t2s.exps.vits.preprocess
paddlespeech.t2s.exps.vits.synthesize
paddlespeech.t2s.exps.vits.synthesize_e2e
paddlespeech.t2s.exps.vits.train
paddlespeech.t2s.exps.vits.voice_cloning

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.synthesize module
============================================
.. automodule:: paddlespeech.t2s.exps.vits.synthesize
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.synthesize\_e2e module
=================================================
.. automodule:: paddlespeech.t2s.exps.vits.synthesize_e2e
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.train module
=======================================
.. automodule:: paddlespeech.t2s.exps.vits.train
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.voice\_cloning module
================================================
.. automodule:: paddlespeech.t2s.exps.vits.voice_cloning
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.g2pw.dataset module
=============================================
.. automodule:: paddlespeech.t2s.frontend.g2pw.dataset
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.g2pw.onnx\_api module
===============================================
.. automodule:: paddlespeech.t2s.frontend.g2pw.onnx_api
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,17 @@
paddlespeech.t2s.frontend.g2pw package
======================================
.. automodule:: paddlespeech.t2s.frontend.g2pw
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.t2s.frontend.g2pw.dataset
paddlespeech.t2s.frontend.g2pw.onnx_api
paddlespeech.t2s.frontend.g2pw.utils

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.g2pw.utils module
===========================================
.. automodule:: paddlespeech.t2s.frontend.g2pw.utils
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.mix\_frontend module
==============================================
.. automodule:: paddlespeech.t2s.frontend.mix_frontend
:members:
:undoc-members:
:show-inheritance:

@ -12,6 +12,7 @@ Subpackages
.. toctree::
:maxdepth: 4
paddlespeech.t2s.frontend.g2pw
paddlespeech.t2s.frontend.normalizer
paddlespeech.t2s.frontend.zh_normalization
@ -23,6 +24,7 @@ Submodules
paddlespeech.t2s.frontend.arpabet
paddlespeech.t2s.frontend.generate_lexicon
paddlespeech.t2s.frontend.mix_frontend
paddlespeech.t2s.frontend.phonectic
paddlespeech.t2s.frontend.punctuation
paddlespeech.t2s.frontend.tone_sandhi

@ -0,0 +1,7 @@
paddlespeech.t2s.models.ernie\_sat.ernie\_sat module
====================================================
.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.models.ernie\_sat.ernie\_sat\_updater module
=============================================================
.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat_updater
:members:
:undoc-members:
:show-inheritance:

@ -12,4 +12,5 @@ Submodules
.. toctree::
:maxdepth: 4
paddlespeech.t2s.models.ernie_sat.mlm
paddlespeech.t2s.models.ernie_sat.ernie_sat
paddlespeech.t2s.models.ernie_sat.ernie_sat_updater

@ -0,0 +1,7 @@
paddlespeech.t2s.models.vits.monotonic\_align.core module
=========================================================
.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.core
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,16 @@
paddlespeech.t2s.models.vits.monotonic\_align package
=====================================================
.. automodule:: paddlespeech.t2s.models.vits.monotonic_align
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.t2s.models.vits.monotonic_align.core
paddlespeech.t2s.models.vits.monotonic_align.setup

@ -0,0 +1,7 @@
paddlespeech.t2s.models.vits.monotonic\_align.setup module
==========================================================
.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.setup
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.utils.dynamic\_import module
=========================================
.. automodule:: paddlespeech.utils.dynamic_import
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.utils.env module
=============================
.. automodule:: paddlespeech.utils.env
:members:
:undoc-members:
:show-inheritance:

@ -0,0 +1,16 @@
paddlespeech.utils package
==========================
.. automodule:: paddlespeech.utils
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.utils.dynamic_import
paddlespeech.utils.env

@ -74,8 +74,10 @@ Contents
paddlespeech.cli <api/paddlespeech.cli>
paddlespeech.cls <api/paddlespeech.cls>
paddlespeech.kws <api/paddlespeech.kws>
paddlespeech.resource <api/paddlespeech.resource>
paddlespeech.s2t <api/paddlespeech.s2t>
paddlespeech.server <api/paddlespeech.server>
paddlespeech.t2s <api/paddlespeech.t2s>
paddlespeech.text <api/paddlespeech.text>
paddlespeech.utils <api/paddlespeech.utils>
paddlespeech.vector <api/paddlespeech.vector>

@ -1,11 +1,10 @@
# ERNIE-SAT with AISHELL3 dataset
# ERNIE-SAT with VCTK dataset
ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型,其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景,该项目供研究使用。
## 模型框架
ERNIE-SAT 中我们提出了两项创新:
- 在预训练过程中将中英双语对应的音素作为输入,实现了跨语言、个性化的软音素映射
- 采用语言和语音的联合掩码学习实现了语言和语音的对齐
## Model Framework
In ERNIE-SAT, we propose two innovations:
- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
- The joint mask learning of speech and text is used to realize the alignment of speech and text
<p align="center">
<img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />

@ -1,11 +1,10 @@
# ERNIE-SAT with AISHELL3 and VCTK dataset
# ERNIE-SAT with VCTK dataset
ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型,其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景,该项目供研究使用。
## 模型框架
ERNIE-SAT 中我们提出了两项创新:
- 在预训练过程中将中英双语对应的音素作为输入,实现了跨语言、个性化的软音素映射
- 采用语言和语音的联合掩码学习实现了语言和语音的对齐
## Model Framework
In ERNIE-SAT, we propose two innovations:
- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
- The joint mask learning of speech and text is used to realize the alignment of speech and text
<p align="center">
<img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />

@ -1,11 +1,10 @@
# ERNIE-SAT with VCTK dataset
ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型,其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景,该项目供研究使用。
## 模型框架
ERNIE-SAT 中我们提出了两项创新:
- 在预训练过程中将中英双语对应的音素作为输入,实现了跨语言、个性化的软音素映射
- 采用语言和语音的联合掩码学习实现了语言和语音的对齐
## Model Framework
In ERNIE-SAT, we propose two innovations:
- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
- The joint mask learning of speech and text is used to realize the alignment of speech and text
<p align="center">
<img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />

@ -46,3 +46,10 @@ Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | 16 | 0.078918 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | 16 | 0.079080 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | 16 | 0.054401 |
| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | -1 | 0.050767 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | -1 | 0.061884 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | -1 | 0.062056 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | -1 | 0.052110 |

@ -1 +1 @@
from paddlespeech.t2s.frontend.g2pw.onnx_api import G2PWOnnxConverter
from .onnx_api import G2PWOnnxConverter

@ -15,6 +15,10 @@
Credits
This code is modified from https://github.com/GitYCC/g2pW
"""
from typing import Dict
from typing import List
from typing import Tuple
import numpy as np
from paddlespeech.t2s.frontend.g2pw.utils import tokenize_and_map
@ -23,22 +27,17 @@ ANCHOR_CHAR = '▁'
def prepare_onnx_input(tokenizer,
labels,
char2phonemes,
chars,
texts,
query_ids,
phonemes=None,
pos_tags=None,
use_mask=False,
use_char_phoneme=False,
use_pos=False,
window_size=None,
max_len=512):
labels: List[str],
char2phonemes: Dict[str, List[int]],
chars: List[str],
texts: List[str],
query_ids: List[int],
use_mask: bool=False,
window_size: int=None,
max_len: int=512) -> Dict[str, np.array]:
if window_size is not None:
truncated_texts, truncated_query_ids = _truncate_texts(window_size,
texts, query_ids)
truncated_texts, truncated_query_ids = _truncate_texts(
window_size=window_size, texts=texts, query_ids=query_ids)
input_ids = []
token_type_ids = []
attention_masks = []
@ -51,13 +50,19 @@ def prepare_onnx_input(tokenizer,
query_id = (truncated_query_ids if window_size else query_ids)[idx]
try:
tokens, text2token, token2text = tokenize_and_map(tokenizer, text)
tokens, text2token, token2text = tokenize_and_map(
tokenizer=tokenizer, text=text)
except Exception:
print(f'warning: text "{text}" is invalid')
return {}
text, query_id, tokens, text2token, token2text = _truncate(
max_len, text, query_id, tokens, text2token, token2text)
max_len=max_len,
text=text,
query_id=query_id,
tokens=tokens,
text2token=text2token,
token2text=token2text)
processed_tokens = ['[CLS]'] + tokens + ['[SEP]']
@ -91,7 +96,8 @@ def prepare_onnx_input(tokenizer,
return outputs
def _truncate_texts(window_size, texts, query_ids):
def _truncate_texts(window_size: int, texts: List[str],
query_ids: List[int]) -> Tuple[List[str], List[int]]:
truncated_texts = []
truncated_query_ids = []
for text, query_id in zip(texts, query_ids):
@ -105,7 +111,12 @@ def _truncate_texts(window_size, texts, query_ids):
return truncated_texts, truncated_query_ids
def _truncate(max_len, text, query_id, tokens, text2token, token2text):
def _truncate(max_len: int,
text: str,
query_id: int,
tokens: List[str],
text2token: List[int],
token2text: List[Tuple[int]]):
truncate_len = max_len - 2
if len(tokens) <= truncate_len:
return (text, query_id, tokens, text2token, token2text)
@ -132,18 +143,8 @@ def _truncate(max_len, text, query_id, tokens, text2token, token2text):
], [(s - start, e - start) for s, e in token2text[token_start:token_end]])
def prepare_data(sent_path, lb_path=None):
raw_texts = open(sent_path).read().rstrip().split('\n')
query_ids = [raw.index(ANCHOR_CHAR) for raw in raw_texts]
texts = [raw.replace(ANCHOR_CHAR, '') for raw in raw_texts]
if lb_path is None:
return texts, query_ids
else:
phonemes = open(lb_path).read().rstrip().split('\n')
return texts, query_ids, phonemes
def get_phoneme_labels(polyphonic_chars):
def get_phoneme_labels(polyphonic_chars: List[List[str]]
) -> Tuple[List[str], Dict[str, List[int]]]:
labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars])))
char2phonemes = {}
for char, phoneme in polyphonic_chars:
@ -153,7 +154,8 @@ def get_phoneme_labels(polyphonic_chars):
return labels, char2phonemes
def get_char_phoneme_labels(polyphonic_chars):
def get_char_phoneme_labels(polyphonic_chars: List[List[str]]
) -> Tuple[List[str], Dict[str, List[int]]]:
labels = sorted(
list(set([f'{char} {phoneme}' for char, phoneme in polyphonic_chars])))
char2phonemes = {}

@ -17,6 +17,10 @@ Credits
"""
import json
import os
from typing import Any
from typing import Dict
from typing import List
from typing import Tuple
import numpy as np
import onnxruntime
@ -37,7 +41,8 @@ from paddlespeech.utils.env import MODEL_HOME
model_version = '1.1'
def predict(session, onnx_input, labels):
def predict(session, onnx_input: Dict[str, Any],
labels: List[str]) -> Tuple[List[str], List[float]]:
all_preds = []
all_confidences = []
probs = session.run([], {
@ -61,10 +66,10 @@ def predict(session, onnx_input, labels):
class G2PWOnnxConverter:
def __init__(self,
model_dir=MODEL_HOME,
style='bopomofo',
model_source=None,
enable_non_tradional_chinese=False):
model_dir: os.PathLike=MODEL_HOME,
style: str='bopomofo',
model_source: str=None,
enable_non_tradional_chinese: bool=False):
uncompress_path = download_and_decompress(
g2pw_onnx_models['G2PWModel'][model_version], model_dir)
@ -76,7 +81,8 @@ class G2PWOnnxConverter:
os.path.join(uncompress_path, 'g2pW.onnx'),
sess_options=sess_options)
self.config = load_config(
os.path.join(uncompress_path, 'config.py'), use_default=True)
config_path=os.path.join(uncompress_path, 'config.py'),
use_default=True)
self.model_source = model_source if model_source else self.config.model_source
self.enable_opencc = enable_non_tradional_chinese
@ -103,9 +109,9 @@ class G2PWOnnxConverter:
.strip().split('\n')
]
self.labels, self.char2phonemes = get_char_phoneme_labels(
self.polyphonic_chars
polyphonic_chars=self.polyphonic_chars
) if self.config.use_char_phoneme else get_phoneme_labels(
self.polyphonic_chars)
polyphonic_chars=self.polyphonic_chars)
self.chars = sorted(list(self.char2phonemes.keys()))
@ -146,7 +152,7 @@ class G2PWOnnxConverter:
if self.enable_opencc:
self.cc = OpenCC('s2tw')
def _convert_bopomofo_to_pinyin(self, bopomofo):
def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str:
tone = bopomofo[-1]
assert tone in '12345'
component = self.bopomofo_convert_dict.get(bopomofo[:-1])
@ -156,7 +162,7 @@ class G2PWOnnxConverter:
print(f'Warning: "{bopomofo}" cannot convert to pinyin')
return None
def __call__(self, sentences):
def __call__(self, sentences: List[str]) -> List[List[str]]:
if isinstance(sentences, str):
sentences = [sentences]
@ -169,23 +175,25 @@ class G2PWOnnxConverter:
sentences = translated_sentences
texts, query_ids, sent_ids, partial_results = self._prepare_data(
sentences)
sentences=sentences)
if len(texts) == 0:
# sentences contain no polyphonic words
return partial_results
onnx_input = prepare_onnx_input(
self.tokenizer,
self.labels,
self.char2phonemes,
self.chars,
texts,
query_ids,
tokenizer=self.tokenizer,
labels=self.labels,
char2phonemes=self.char2phonemes,
chars=self.chars,
texts=texts,
query_ids=query_ids,
use_mask=self.config.use_mask,
use_char_phoneme=self.config.use_char_phoneme,
window_size=None)
preds, confidences = predict(self.session_g2pW, onnx_input, self.labels)
preds, confidences = predict(
session=self.session_g2pW,
onnx_input=onnx_input,
labels=self.labels)
if self.config.use_char_phoneme:
preds = [pred.split(' ')[1] for pred in preds]
@ -195,7 +203,9 @@ class G2PWOnnxConverter:
return results
def _prepare_data(self, sentences):
def _prepare_data(
self, sentences: List[str]
) -> Tuple[List[str], List[int], List[int], List[List[str]]]:
texts, query_ids, sent_ids, partial_results = [], [], [], []
for sent_id, sent in enumerate(sentences):
# pypinyin works well for Simplified Chinese than Traditional Chinese

@ -15,10 +15,11 @@
Credits
This code is modified from https://github.com/GitYCC/g2pW
"""
import os
import re
def wordize_and_map(text):
def wordize_and_map(text: str):
words = []
index_map_from_text_to_word = []
index_map_from_word_to_text = []
@ -54,8 +55,8 @@ def wordize_and_map(text):
return words, index_map_from_text_to_word, index_map_from_word_to_text
def tokenize_and_map(tokenizer, text):
words, text2word, word2text = wordize_and_map(text)
def tokenize_and_map(tokenizer, text: str):
words, text2word, word2text = wordize_and_map(text=text)
tokens = []
index_map_from_token_to_text = []
@ -82,7 +83,7 @@ def tokenize_and_map(tokenizer, text):
return tokens, index_map_from_text_to_token, index_map_from_token_to_text
def _load_config(config_path):
def _load_config(config_path: os.PathLike):
import importlib.util
spec = importlib.util.spec_from_file_location('__init__', config_path)
config = importlib.util.module_from_spec(spec)
@ -130,7 +131,7 @@ default_config_dict = {
}
def load_config(config_path, use_default=False):
def load_config(config_path: os.PathLike, use_default: bool=False):
config = _load_config(config_path)
if use_default:
for attr, val in default_config_dict.items():

@ -71,31 +71,53 @@ class MLMEncoder(nn.Layer):
"""Conformer encoder module.
Args:
idim (int): Input dimension.
attention_dim (int): Dimension of attention.
attention_heads (int): The number of heads of multi head attention.
linear_units (int): The number of units of position-wise feed forward.
num_blocks (int): The number of decoder blocks.
dropout_rate (float): Dropout rate.
positional_dropout_rate (float): Dropout rate after adding positional encoding.
attention_dropout_rate (float): Dropout rate in attention.
input_layer (Union[str, paddle.nn.Layer]): Input layer type.
normalize_before (bool): Whether to use layer_norm before the first block.
concat_after (bool): Whether to concat attention layer's input and output.
idim (int):
Input dimension.
attention_dim (int):
Dimension of attention.
attention_heads (int):
The number of heads of multi head attention.
linear_units (int):
The number of units of position-wise feed forward.
num_blocks (int):
The number of decoder blocks.
dropout_rate (float):
Dropout rate.
positional_dropout_rate (float):
Dropout rate after adding positional encoding.
attention_dropout_rate (float):
Dropout rate in attention.
input_layer (Union[str, paddle.nn.Layer]):
Input layer type.
normalize_before (bool):
Whether to use layer_norm before the first block.
concat_after (bool):
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
macaron_style (bool): Whether to use macaron style for positionwise layer.
pos_enc_layer_type (str): Encoder positional encoding layer type.
selfattention_layer_type (str): Encoder attention layer type.
activation_type (str): Encoder activation function type.
use_cnn_module (bool): Whether to use convolution module.
zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel (int): Kernerl size of convolution module.
padding_idx (int): Padding idx for input_layer=embed.
stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
positionwise_layer_type (str):
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size (int):
Kernel size of positionwise conv1d layer.
macaron_style (bool):
Whether to use macaron style for positionwise layer.
pos_enc_layer_type (str):
Encoder positional encoding layer type.
selfattention_layer_type (str):
Encoder attention layer type.
activation_type (str):
Encoder activation function type.
use_cnn_module (bool):
Whether to use convolution module.
zero_triu (bool):
Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel (int):
Kernerl size of convolution module.
padding_idx (int):
Padding idx for input_layer=embed.
stochastic_depth_rate (float):
Maximum probability to skip the encoder layer.
"""
@ -320,12 +342,16 @@ class MLMDecoder(MLMEncoder):
"""Encode input sequence.
Args:
xs (paddle.Tensor): Input tensor (#batch, time, idim).
masks (paddle.Tensor): Mask tensor (#batch, time).
xs (paddle.Tensor):
Input tensor (#batch, time, idim).
masks (paddle.Tensor):
Mask tensor (#batch, time).
Returns:
paddle.Tensor: Output tensor (#batch, time, attention_dim).
paddle.Tensor: Mask tensor (#batch, time).
paddle.Tensor:
Output tensor (#batch, time, attention_dim).
paddle.Tensor:
Mask tensor (#batch, time).
"""
xs = self.embed(xs)
@ -392,19 +418,27 @@ class MLM(nn.Layer):
use_teacher_forcing: bool=True, ) -> List[paddle.Tensor]:
'''
Args:
speech (paddle.Tensor): input speech (1, Tmax, D).
text (paddle.Tensor): input text (1, Tmax2).
masked_pos (paddle.Tensor): masked position of input speech (1, Tmax)
speech_mask (paddle.Tensor): mask of speech (1, 1, Tmax).
text_mask (paddle.Tensor): mask of text (1, 1, Tmax2).
speech_seg_pos (paddle.Tensor): n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax).
text_seg_pos (paddle.Tensor): n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2).
span_bdy (List[int]): masked mel boundary of input speech (2,)
use_teacher_forcing (bool): whether to use teacher forcing
speech (paddle.Tensor):
input speech (1, Tmax, D).
text (paddle.Tensor):
input text (1, Tmax2).
masked_pos (paddle.Tensor):
masked position of input speech (1, Tmax)
speech_mask (paddle.Tensor):
mask of speech (1, 1, Tmax).
text_mask (paddle.Tensor):
mask of text (1, 1, Tmax2).
speech_seg_pos (paddle.Tensor):
n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax).
text_seg_pos (paddle.Tensor):
n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2).
span_bdy (List[int]):
masked mel boundary of input speech (2,)
use_teacher_forcing (bool):
whether to use teacher forcing
Returns:
List[Tensor]:
eg:
[Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])]
eg: [Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])]
'''
z_cache = None

@ -48,12 +48,18 @@ class StochasticDurationPredictor(nn.Layer):
global_channels: int=-1, ):
"""Initialize StochasticDurationPredictor module.
Args:
channels (int): Number of channels.
kernel_size (int): Kernel size.
dropout_rate (float): Dropout rate.
flows (int): Number of flows.
dds_conv_layers (int): Number of conv layers in DDS conv.
global_channels (int): Number of global conditioning channels.
channels (int):
Number of channels.
kernel_size (int):
Kernel size.
dropout_rate (float):
Dropout rate.
flows (int):
Number of flows.
dds_conv_layers (int):
Number of conv layers in DDS conv.
global_channels (int):
Number of global conditioning channels.
"""
super().__init__()
@ -108,14 +114,21 @@ class StochasticDurationPredictor(nn.Layer):
noise_scale: float=1.0, ) -> paddle.Tensor:
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, channels, T_text).
x_mask (Tensor): Mask tensor (B, 1, T_text).
w (Optional[Tensor]): Duration tensor (B, 1, T_text).
g (Optional[Tensor]): Global conditioning tensor (B, channels, 1)
inverse (bool): Whether to inverse the flow.
noise_scale (float): Noise scale value.
x (Tensor):
Input tensor (B, channels, T_text).
x_mask (Tensor):
Mask tensor (B, 1, T_text).
w (Optional[Tensor]):
Duration tensor (B, 1, T_text).
g (Optional[Tensor]):
Global conditioning tensor (B, channels, 1)
inverse (bool):
Whether to inverse the flow.
noise_scale (float):
Noise scale value.
Returns:
Tensor: If not inverse, negative log-likelihood (NLL) tensor (B,).
Tensor:
If not inverse, negative log-likelihood (NLL) tensor (B,).
If inverse, log-duration tensor (B, 1, T_text).
"""
# stop gradient

@ -34,11 +34,15 @@ class FlipFlow(nn.Layer):
) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, channels, T).
inverse (bool): Whether to inverse the flow.
x (Tensor):
Input tensor (B, channels, T).
inverse (bool):
Whether to inverse the flow.
Returns:
Tensor: Flipped tensor (B, channels, T).
Tensor: Log-determinant tensor for NLL (B,) if not inverse.
Tensor:
Flipped tensor (B, channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
"""
x = paddle.flip(x, [1])
if not inverse:
@ -60,13 +64,19 @@ class LogFlow(nn.Layer):
) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, channels, T).
x_mask (Tensor): Mask tensor (B, 1, T).
inverse (bool): Whether to inverse the flow.
eps (float): Epsilon for log.
x (Tensor):
Input tensor (B, channels, T).
x_mask (Tensor):
Mask tensor (B, 1, T).
inverse (bool):
Whether to inverse the flow.
eps (float):
Epsilon for log.
Returns:
Tensor: Output tensor (B, channels, T).
Tensor: Log-determinant tensor for NLL (B,) if not inverse.
Tensor:
Output tensor (B, channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
"""
if not inverse:
y = paddle.log(paddle.clip(x, min=eps)) * x_mask
@ -83,7 +93,8 @@ class ElementwiseAffineFlow(nn.Layer):
def __init__(self, channels: int):
"""Initialize ElementwiseAffineFlow module.
Args:
channels (int): Number of channels.
channels (int):
Number of channels.
"""
super().__init__()
self.channels = channels
@ -107,12 +118,17 @@ class ElementwiseAffineFlow(nn.Layer):
) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, channels, T).
x_mask (Tensor): Mask tensor (B, 1, T).
inverse (bool): Whether to inverse the flow.
x (Tensor):
Input tensor (B, channels, T).
x_mask (Tensor):
Mask tensor (B, 1, T).
inverse (bool):
Whether to inverse the flow.
Returns:
Tensor: Output tensor (B, channels, T).
Tensor: Log-determinant tensor for NLL (B,) if not inverse.
Tensor:
Output tensor (B, channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
"""
if not inverse:
y = self.m + paddle.exp(self.logs) * x
@ -157,11 +173,16 @@ class DilatedDepthSeparableConv(nn.Layer):
eps: float=1e-5, ):
"""Initialize DilatedDepthSeparableConv module.
Args:
channels (int): Number of channels.
kernel_size (int): Kernel size.
layers (int): Number of layers.
dropout_rate (float): Dropout rate.
eps (float): Epsilon for layer norm.
channels (int):
Number of channels.
kernel_size (int):
Kernel size.
layers (int):
Number of layers.
dropout_rate (float):
Dropout rate.
eps (float):
Epsilon for layer norm.
"""
super().__init__()
@ -198,11 +219,15 @@ class DilatedDepthSeparableConv(nn.Layer):
g: Optional[paddle.Tensor]=None) -> paddle.Tensor:
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, in_channels, T).
x_mask (Tensor): Mask tensor (B, 1, T).
g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1).
x (Tensor):
Input tensor (B, in_channels, T).
x_mask (Tensor):
Mask tensor (B, 1, T).
g (Optional[Tensor]):
Global conditioning tensor (B, global_channels, 1).
Returns:
Tensor: Output tensor (B, channels, T).
Tensor:
Output tensor (B, channels, T).
"""
if g is not None:
x = x + g
@ -225,12 +250,18 @@ class ConvFlow(nn.Layer):
tail_bound: float=5.0, ):
"""Initialize ConvFlow module.
Args:
in_channels (int): Number of input channels.
hidden_channels (int): Number of hidden channels.
kernel_size (int): Kernel size.
layers (int): Number of layers.
bins (int): Number of bins.
tail_bound (float): Tail bound value.
in_channels (int):
Number of input channels.
hidden_channels (int):
Number of hidden channels.
kernel_size (int):
Kernel size.
layers (int):
Number of layers.
bins (int):
Number of bins.
tail_bound (float):
Tail bound value.
"""
super().__init__()
self.half_channels = in_channels // 2
@ -275,13 +306,19 @@ class ConvFlow(nn.Layer):
) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, channels, T).
x_mask (Tensor): Mask tensor (B, 1, T).
g (Optional[Tensor]): Global conditioning tensor (B, channels, 1).
inverse (bool): Whether to inverse the flow.
x (Tensor):
Input tensor (B, channels, T).
x_mask (Tensor):
Mask tensor (B, 1, T).
g (Optional[Tensor]):
Global conditioning tensor (B, channels, 1).
inverse (bool):
Whether to inverse the flow.
Returns:
Tensor: Output tensor (B, channels, T).
Tensor: Log-determinant tensor for NLL (B,) if not inverse.
Tensor:
Output tensor (B, channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
"""
xa, xb = x.split(2, 1)
h = self.input_conv(xa)

@ -97,81 +97,104 @@ class VITSGenerator(nn.Layer):
stochastic_duration_predictor_dds_conv_layers: int=3, ):
"""Initialize VITS generator module.
Args:
vocabs (int): Input vocabulary size.
aux_channels (int): Number of acoustic feature channels.
hidden_channels (int): Number of hidden channels.
spks (Optional[int]): Number of speakers. If set to > 1, assume that the
vocabs (int):
Input vocabulary size.
aux_channels (int):
Number of acoustic feature channels.
hidden_channels (int):
Number of hidden channels.
spks (Optional[int]):
Number of speakers. If set to > 1, assume that the
sids will be provided as the input and use sid embedding layer.
langs (Optional[int]): Number of languages. If set to > 1, assume that the
langs (Optional[int]):
Number of languages. If set to > 1, assume that the
lids will be provided as the input and use sid embedding layer.
spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
spk_embed_dim (Optional[int]):
Speaker embedding dimension. If set to > 0,
assume that spembs will be provided as the input.
global_channels (int): Number of global conditioning channels.
segment_size (int): Segment size for decoder.
text_encoder_attention_heads (int): Number of heads in conformer block
of text encoder.
text_encoder_ffn_expand (int): Expansion ratio of FFN in conformer block
of text encoder.
text_encoder_blocks (int): Number of conformer blocks in text encoder.
text_encoder_positionwise_layer_type (str): Position-wise layer type in
conformer block of text encoder.
text_encoder_positionwise_conv_kernel_size (int): Position-wise convolution
kernel size in conformer block of text encoder. Only used when the
above layer type is conv1d or conv1d-linear.
text_encoder_positional_encoding_layer_type (str): Positional encoding layer
type in conformer block of text encoder.
text_encoder_self_attention_layer_type (str): Self-attention layer type in
conformer block of text encoder.
text_encoder_activation_type (str): Activation function type in conformer
block of text encoder.
text_encoder_normalize_before (bool): Whether to apply layer norm before
self-attention in conformer block of text encoder.
text_encoder_dropout_rate (float): Dropout rate in conformer block of
text encoder.
text_encoder_positional_dropout_rate (float): Dropout rate for positional
encoding in conformer block of text encoder.
text_encoder_attention_dropout_rate (float): Dropout rate for attention in
conformer block of text encoder.
text_encoder_conformer_kernel_size (int): Conformer conv kernel size. It
will be used when only use_conformer_conv_in_text_encoder = True.
use_macaron_style_in_text_encoder (bool): Whether to use macaron style FFN
in conformer block of text encoder.
use_conformer_conv_in_text_encoder (bool): Whether to use covolution in
conformer block of text encoder.
decoder_kernel_size (int): Decoder kernel size.
decoder_channels (int): Number of decoder initial channels.
decoder_upsample_scales (List[int]): List of upsampling scales in decoder.
decoder_upsample_kernel_sizes (List[int]): List of kernel size for
upsampling layers in decoder.
decoder_resblock_kernel_sizes (List[int]): List of kernel size for resblocks
in decoder.
decoder_resblock_dilations (List[List[int]]): List of list of dilations for
resblocks in decoder.
use_weight_norm_in_decoder (bool): Whether to apply weight normalization in
decoder.
posterior_encoder_kernel_size (int): Posterior encoder kernel size.
posterior_encoder_layers (int): Number of layers of posterior encoder.
posterior_encoder_stacks (int): Number of stacks of posterior encoder.
posterior_encoder_base_dilation (int): Base dilation of posterior encoder.
posterior_encoder_dropout_rate (float): Dropout rate for posterior encoder.
use_weight_norm_in_posterior_encoder (bool): Whether to apply weight
normalization in posterior encoder.
flow_flows (int): Number of flows in flow.
flow_kernel_size (int): Kernel size in flow.
flow_base_dilation (int): Base dilation in flow.
flow_layers (int): Number of layers in flow.
flow_dropout_rate (float): Dropout rate in flow
use_weight_norm_in_flow (bool): Whether to apply weight normalization in
flow.
use_only_mean_in_flow (bool): Whether to use only mean in flow.
stochastic_duration_predictor_kernel_size (int): Kernel size in stochastic
duration predictor.
stochastic_duration_predictor_dropout_rate (float): Dropout rate in
stochastic duration predictor.
stochastic_duration_predictor_flows (int): Number of flows in stochastic
duration predictor.
stochastic_duration_predictor_dds_conv_layers (int): Number of DDS conv
layers in stochastic duration predictor.
global_channels (int):
Number of global conditioning channels.
segment_size (int):
Segment size for decoder.
text_encoder_attention_heads (int):
Number of heads in conformer block of text encoder.
text_encoder_ffn_expand (int):
Expansion ratio of FFN in conformer block of text encoder.
text_encoder_blocks (int):
Number of conformer blocks in text encoder.
text_encoder_positionwise_layer_type (str):
Position-wise layer type in conformer block of text encoder.
text_encoder_positionwise_conv_kernel_size (int):
Position-wise convolution kernel size in conformer block of text encoder.
Only used when the above layer type is conv1d or conv1d-linear.
text_encoder_positional_encoding_layer_type (str):
Positional encoding layer type in conformer block of text encoder.
text_encoder_self_attention_layer_type (str):
Self-attention layer type in conformer block of text encoder.
text_encoder_activation_type (str):
Activation function type in conformer block of text encoder.
text_encoder_normalize_before (bool):
Whether to apply layer norm before self-attention in conformer block of text encoder.
text_encoder_dropout_rate (float):
Dropout rate in conformer block of text encoder.
text_encoder_positional_dropout_rate (float):
Dropout rate for positional encoding in conformer block of text encoder.
text_encoder_attention_dropout_rate (float):
Dropout rate for attention in conformer block of text encoder.
text_encoder_conformer_kernel_size (int):
Conformer conv kernel size. It will be used when only use_conformer_conv_in_text_encoder = True.
use_macaron_style_in_text_encoder (bool):
Whether to use macaron style FFN in conformer block of text encoder.
use_conformer_conv_in_text_encoder (bool):
Whether to use covolution in conformer block of text encoder.
decoder_kernel_size (int):
Decoder kernel size.
decoder_channels (int):
Number of decoder initial channels.
decoder_upsample_scales (List[int]):
List of upsampling scales in decoder.
decoder_upsample_kernel_sizes (List[int]):
List of kernel size for upsampling layers in decoder.
decoder_resblock_kernel_sizes (List[int]):
List of kernel size for resblocks in decoder.
decoder_resblock_dilations (List[List[int]]):
List of list of dilations for resblocks in decoder.
use_weight_norm_in_decoder (bool):
Whether to apply weight normalization in decoder.
posterior_encoder_kernel_size (int):
Posterior encoder kernel size.
posterior_encoder_layers (int):
Number of layers of posterior encoder.
posterior_encoder_stacks (int):
Number of stacks of posterior encoder.
posterior_encoder_base_dilation (int):
Base dilation of posterior encoder.
posterior_encoder_dropout_rate (float):
Dropout rate for posterior encoder.
use_weight_norm_in_posterior_encoder (bool):
Whether to apply weight normalization in posterior encoder.
flow_flows (int):
Number of flows in flow.
flow_kernel_size (int):
Kernel size in flow.
flow_base_dilation (int):
Base dilation in flow.
flow_layers (int):
Number of layers in flow.
flow_dropout_rate (float):
Dropout rate in flow
use_weight_norm_in_flow (bool):
Whether to apply weight normalization in flow.
use_only_mean_in_flow (bool):
Whether to use only mean in flow.
stochastic_duration_predictor_kernel_size (int):
Kernel size in stochastic duration predictor.
stochastic_duration_predictor_dropout_rate (float):
Dropout rate in stochastic duration predictor.
stochastic_duration_predictor_flows (int):
Number of flows in stochastic duration predictor.
stochastic_duration_predictor_dds_conv_layers (int):
Number of DDS conv layers in stochastic duration predictor.
"""
super().__init__()
self.segment_size = segment_size
@ -272,20 +295,33 @@ class VITSGenerator(nn.Layer):
paddle.Tensor, paddle.Tensor, ], ]:
"""Calculate forward propagation.
Args:
text (Tensor): Text index tensor (B, T_text).
text_lengths (Tensor): Text length tensor (B,).
feats (Tensor): Feature tensor (B, aux_channels, T_feats).
feats_lengths (Tensor): Feature length tensor (B,).
sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
text (Tensor):
Text index tensor (B, T_text).
text_lengths (Tensor):
Text length tensor (B,).
feats (Tensor):
Feature tensor (B, aux_channels, T_feats).
feats_lengths (Tensor):
Feature length tensor (B,).
sids (Optional[Tensor]):
Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]):
Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
Returns:
Tensor: Waveform tensor (B, 1, segment_size * upsample_factor).
Tensor: Duration negative log-likelihood (NLL) tensor (B,).
Tensor: Monotonic attention weight tensor (B, 1, T_feats, T_text).
Tensor: Segments start index tensor (B,).
Tensor: Text mask tensor (B, 1, T_text).
Tensor: Feature mask tensor (B, 1, T_feats).
Tensor:
Waveform tensor (B, 1, segment_size * upsample_factor).
Tensor:
Duration negative log-likelihood (NLL) tensor (B,).
Tensor:
Monotonic attention weight tensor (B, 1, T_feats, T_text).
Tensor:
Segments start index tensor (B,).
Tensor:
Text mask tensor (B, 1, T_text).
Tensor:
Feature mask tensor (B, 1, T_feats).
tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
- Tensor: Posterior encoder hidden representation (B, H, T_feats).
- Tensor: Flow hidden representation (B, H, T_feats).
@ -402,24 +438,40 @@ class VITSGenerator(nn.Layer):
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Run inference.
Args:
text (Tensor): Input text index tensor (B, T_text,).
text_lengths (Tensor): Text length tensor (B,).
feats (Tensor): Feature tensor (B, aux_channels, T_feats,).
feats_lengths (Tensor): Feature length tensor (B,).
sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
dur (Optional[Tensor]): Ground-truth duration (B, T_text,). If provided,
text (Tensor):
Input text index tensor (B, T_text,).
text_lengths (Tensor):
Text length tensor (B,).
feats (Tensor):
Feature tensor (B, aux_channels, T_feats,).
feats_lengths (Tensor):
Feature length tensor (B,).
sids (Optional[Tensor]):
Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]):
Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
dur (Optional[Tensor]):
Ground-truth duration (B, T_text,). If provided,
skip the prediction of durations (i.e., teacher forcing).
noise_scale (float): Noise scale parameter for flow.
noise_scale_dur (float): Noise scale parameter for duration predictor.
alpha (float): Alpha parameter to control the speed of generated speech.
max_len (Optional[int]): Maximum length of acoustic feature sequence.
use_teacher_forcing (bool): Whether to use teacher forcing.
noise_scale (float):
Noise scale parameter for flow.
noise_scale_dur (float):
Noise scale parameter for duration predictor.
alpha (float):
Alpha parameter to control the speed of generated speech.
max_len (Optional[int]):
Maximum length of acoustic feature sequence.
use_teacher_forcing (bool):
Whether to use teacher forcing.
Returns:
Tensor: Generated waveform tensor (B, T_wav).
Tensor: Monotonic attention weight tensor (B, T_feats, T_text).
Tensor: Duration tensor (B, T_text).
Tensor:
Generated waveform tensor (B, T_wav).
Tensor:
Monotonic attention weight tensor (B, T_feats, T_text).
Tensor:
Duration tensor (B, T_text).
"""
# encoder
x, m_p, logs_p, x_mask = self.text_encoder(text, text_lengths)
@ -533,15 +585,23 @@ class VITSGenerator(nn.Layer):
lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor:
"""Run voice conversion.
Args:
feats (Tensor): Feature tensor (B, aux_channels, T_feats,).
feats_lengths (Tensor): Feature length tensor (B,).
sids_src (Optional[Tensor]): Speaker index tensor of source feature (B,) or (B, 1).
sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (B,) or (B, 1).
spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (B, spk_embed_dim).
spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (B, spk_embed_dim).
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
feats (Tensor):
Feature tensor (B, aux_channels, T_feats,).
feats_lengths (Tensor):
Feature length tensor (B,).
sids_src (Optional[Tensor]):
Speaker index tensor of source feature (B,) or (B, 1).
sids_tgt (Optional[Tensor]):
Speaker index tensor of target feature (B,) or (B, 1).
spembs_src (Optional[Tensor]):
Speaker embedding tensor of source feature (B, spk_embed_dim).
spembs_tgt (Optional[Tensor]):
Speaker embedding tensor of target feature (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
Returns:
Tensor: Generated waveform tensor (B, T_wav).
Tensor:
Generated waveform tensor (B, T_wav).
"""
# encoder
g_src = None
@ -602,10 +662,13 @@ class VITSGenerator(nn.Layer):
mask: paddle.Tensor) -> paddle.Tensor:
"""Generate path a.k.a. monotonic attention.
Args:
dur (Tensor): Duration tensor (B, 1, T_text).
mask (Tensor): Attention mask tensor (B, 1, T_feats, T_text).
dur (Tensor):
Duration tensor (B, 1, T_text).
mask (Tensor):
Attention mask tensor (B, 1, T_feats, T_text).
Returns:
Tensor: Path tensor (B, 1, T_feats, T_text).
Tensor:
Path tensor (B, 1, T_feats, T_text).
"""
b, _, t_y, t_x = paddle.shape(mask)
cum_dur = paddle.cumsum(dur, -1)

@ -52,17 +52,28 @@ class PosteriorEncoder(nn.Layer):
"""Initilialize PosteriorEncoder module.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
hidden_channels (int): Number of hidden channels.
kernel_size (int): Kernel size in WaveNet.
layers (int): Number of layers of WaveNet.
stacks (int): Number of repeat stacking of WaveNet.
base_dilation (int): Base dilation factor.
global_channels (int): Number of global conditioning channels.
dropout_rate (float): Dropout rate.
bias (bool): Whether to use bias parameters in conv.
use_weight_norm (bool): Whether to apply weight norm.
in_channels (int):
Number of input channels.
out_channels (int):
Number of output channels.
hidden_channels (int):
Number of hidden channels.
kernel_size (int):
Kernel size in WaveNet.
layers (int):
Number of layers of WaveNet.
stacks (int):
Number of repeat stacking of WaveNet.
base_dilation (int):
Base dilation factor.
global_channels (int):
Number of global conditioning channels.
dropout_rate (float):
Dropout rate.
bias (bool):
Whether to use bias parameters in conv.
use_weight_norm (bool):
Whether to apply weight norm.
"""
super().__init__()
@ -99,15 +110,22 @@ class PosteriorEncoder(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, in_channels, T_feats).
x_lengths (Tensor): Length tensor (B,).
g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1).
x (Tensor):
Input tensor (B, in_channels, T_feats).
x_lengths (Tensor):
Length tensor (B,).
g (Optional[Tensor]):
Global conditioning tensor (B, global_channels, 1).
Returns:
Tensor: Encoded hidden representation tensor (B, out_channels, T_feats).
Tensor: Projected mean tensor (B, out_channels, T_feats).
Tensor: Projected scale tensor (B, out_channels, T_feats).
Tensor: Mask tensor for input tensor (B, 1, T_feats).
Tensor:
Encoded hidden representation tensor (B, out_channels, T_feats).
Tensor:
Projected mean tensor (B, out_channels, T_feats).
Tensor:
Projected scale tensor (B, out_channels, T_feats).
Tensor:
Mask tensor for input tensor (B, 1, T_feats).
"""
x_mask = make_non_pad_mask(x_lengths).unsqueeze(1)

@ -55,18 +55,30 @@ class ResidualAffineCouplingBlock(nn.Layer):
"""Initilize ResidualAffineCouplingBlock module.
Args:
in_channels (int): Number of input channels.
hidden_channels (int): Number of hidden channels.
flows (int): Number of flows.
kernel_size (int): Kernel size for WaveNet.
base_dilation (int): Base dilation factor for WaveNet.
layers (int): Number of layers of WaveNet.
stacks (int): Number of stacks of WaveNet.
global_channels (int): Number of global channels.
dropout_rate (float): Dropout rate.
use_weight_norm (bool): Whether to use weight normalization in WaveNet.
bias (bool): Whether to use bias paramters in WaveNet.
use_only_mean (bool): Whether to estimate only mean.
in_channels (int):
Number of input channels.
hidden_channels (int):
Number of hidden channels.
flows (int):
Number of flows.
kernel_size (int):
Kernel size for WaveNet.
base_dilation (int):
Base dilation factor for WaveNet.
layers (int):
Number of layers of WaveNet.
stacks (int):
Number of stacks of WaveNet.
global_channels (int):
Number of global channels.
dropout_rate (float):
Dropout rate.
use_weight_norm (bool):
Whether to use weight normalization in WaveNet.
bias (bool):
Whether to use bias paramters in WaveNet.
use_only_mean (bool):
Whether to estimate only mean.
"""
super().__init__()
@ -97,10 +109,14 @@ class ResidualAffineCouplingBlock(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, in_channels, T).
x_mask (Tensor): Length tensor (B, 1, T).
g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1).
inverse (bool): Whether to inverse the flow.
x (Tensor):
Input tensor (B, in_channels, T).
x_mask (Tensor):
Length tensor (B, 1, T).
g (Optional[Tensor]):
Global conditioning tensor (B, global_channels, 1).
inverse (bool):
Whether to inverse the flow.
Returns:
Tensor: Output tensor (B, in_channels, T).
@ -134,17 +150,28 @@ class ResidualAffineCouplingLayer(nn.Layer):
"""Initialzie ResidualAffineCouplingLayer module.
Args:
in_channels (int): Number of input channels.
hidden_channels (int): Number of hidden channels.
kernel_size (int): Kernel size for WaveNet.
base_dilation (int): Base dilation factor for WaveNet.
layers (int): Number of layers of WaveNet.
stacks (int): Number of stacks of WaveNet.
global_channels (int): Number of global channels.
dropout_rate (float): Dropout rate.
use_weight_norm (bool): Whether to use weight normalization in WaveNet.
bias (bool): Whether to use bias paramters in WaveNet.
use_only_mean (bool): Whether to estimate only mean.
in_channels (int):
Number of input channels.
hidden_channels (int):
Number of hidden channels.
kernel_size (int):
Kernel size for WaveNet.
base_dilation (int):
Base dilation factor for WaveNet.
layers (int):
Number of layers of WaveNet.
stacks (int):
Number of stacks of WaveNet.
global_channels (int):
Number of global channels.
dropout_rate (float):
Dropout rate.
use_weight_norm (bool):
Whether to use weight normalization in WaveNet.
bias (bool):
Whether to use bias paramters in WaveNet.
use_only_mean (bool):
Whether to estimate only mean.
"""
assert in_channels % 2 == 0, "in_channels should be divisible by 2"
@ -211,14 +238,20 @@ class ResidualAffineCouplingLayer(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, in_channels, T).
x_lengths (Tensor): Length tensor (B,).
g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1).
inverse (bool): Whether to inverse the flow.
x (Tensor):
Input tensor (B, in_channels, T).
x_lengths (Tensor):
Length tensor (B,).
g (Optional[Tensor]):
Global conditioning tensor (B, global_channels, 1).
inverse (bool):
Whether to inverse the flow.
Returns:
Tensor: Output tensor (B, in_channels, T).
Tensor: Log-determinant tensor for NLL (B,) if not inverse.
Tensor:
Output tensor (B, in_channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
"""
xa, xb = paddle.split(x, 2, axis=1)

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save