Merge branch 'PaddlePaddle:develop' into develop

pull/2416/head
WongLaw 3 years ago committed by GitHub
commit 68bd85a6b8

@ -61,7 +61,7 @@ tts_python:
    phones_dict:
    tones_dict:
    speaker_dict:
    spk_id: 0
    # voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
    #                        'pwgan_vctk', 'mb_melgan_csmsc', 'style_melgan_csmsc',
@ -87,7 +87,7 @@ tts_inference:
    phones_dict:
    tones_dict:
    speaker_dict:
    spk_id: 0
    am_predictor_conf:
        device: # set 'gpu:id' or 'cpu'

@ -29,7 +29,7 @@ tts_online:
    phones_dict:
    tones_dict:
    speaker_dict:
    spk_id: 0
    # voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc']
    # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference
@ -70,7 +70,6 @@ tts_online-onnx:
    phones_dict:
    tones_dict:
    speaker_dict:
    spk_id: 0
    am_sample_rate: 24000
    am_sess_conf:
        device: "cpu" # set 'gpu:id' or 'cpu'

@ -29,7 +29,7 @@ tts_online:
    phones_dict:
    tones_dict:
    speaker_dict:
    spk_id: 0
    # voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc']
    # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference
@ -70,7 +70,6 @@ tts_online-onnx:
    phones_dict:
    tones_dict:
    speaker_dict:
    spk_id: 0
    am_sample_rate: 24000
    am_sess_conf:
        device: "cpu" # set 'gpu:id' or 'cpu'

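These `spk_id` entries select which speaker a multi-speaker acoustic model synthesizes. As a minimal sketch of switching speakers by editing the server config (assuming the file is saved as `application.yaml`, as in the PaddleSpeech server demos; the file name is an assumption, the section names come from the hunks above):

```python
import yaml

# Load the streaming TTS server config, pick another speaker id, save it back.
with open("application.yaml") as f:
    conf = yaml.safe_load(f)

conf["tts_online"]["spk_id"] = 10  # any id listed in speaker_dict
with open("application.yaml", "w") as f:
    yaml.safe_dump(conf, f)
```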
@ -20,4 +20,7 @@ Subpackages
   paddlespeech.audio.io
   paddlespeech.audio.metric
   paddlespeech.audio.sox_effects
   paddlespeech.audio.streamdata
   paddlespeech.audio.text
   paddlespeech.audio.transform
   paddlespeech.audio.utils

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.autodecode module
===============================================

.. automodule:: paddlespeech.audio.streamdata.autodecode
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.cache module
==========================================

.. automodule:: paddlespeech.audio.streamdata.cache
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.compat module
===========================================

.. automodule:: paddlespeech.audio.streamdata.compat
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.extradatasets module
==================================================

.. automodule:: paddlespeech.audio.streamdata.extradatasets
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.filters module
============================================

.. automodule:: paddlespeech.audio.streamdata.filters
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.gopen module
==========================================

.. automodule:: paddlespeech.audio.streamdata.gopen
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.handlers module
=============================================

.. automodule:: paddlespeech.audio.streamdata.handlers
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.mix module
========================================

.. automodule:: paddlespeech.audio.streamdata.mix
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.paddle\_utils module
==================================================

.. automodule:: paddlespeech.audio.streamdata.paddle_utils
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.pipeline module
=============================================

.. automodule:: paddlespeech.audio.streamdata.pipeline
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,28 @@
paddlespeech.audio.streamdata package
=====================================

.. automodule:: paddlespeech.audio.streamdata
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.audio.streamdata.autodecode
   paddlespeech.audio.streamdata.cache
   paddlespeech.audio.streamdata.compat
   paddlespeech.audio.streamdata.extradatasets
   paddlespeech.audio.streamdata.filters
   paddlespeech.audio.streamdata.gopen
   paddlespeech.audio.streamdata.handlers
   paddlespeech.audio.streamdata.mix
   paddlespeech.audio.streamdata.paddle_utils
   paddlespeech.audio.streamdata.pipeline
   paddlespeech.audio.streamdata.shardlists
   paddlespeech.audio.streamdata.tariterators
   paddlespeech.audio.streamdata.utils
   paddlespeech.audio.streamdata.writer

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.shardlists module
===============================================

.. automodule:: paddlespeech.audio.streamdata.shardlists
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.tariterators module
=================================================

.. automodule:: paddlespeech.audio.streamdata.tariterators
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.utils module
==========================================

.. automodule:: paddlespeech.audio.streamdata.utils
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.writer module
===========================================

.. automodule:: paddlespeech.audio.streamdata.writer
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,16 @@
paddlespeech.audio.text package
===============================

.. automodule:: paddlespeech.audio.text
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.audio.text.text_featurizer
   paddlespeech.audio.text.utility

@ -0,0 +1,7 @@
paddlespeech.audio.text.text\_featurizer module
===============================================

.. automodule:: paddlespeech.audio.text.text_featurizer
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.text.utility module
======================================

.. automodule:: paddlespeech.audio.text.utility
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.add\_deltas module
===============================================

.. automodule:: paddlespeech.audio.transform.add_deltas
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.channel\_selector module
=====================================================

.. automodule:: paddlespeech.audio.transform.channel_selector
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.cmvn module
========================================

.. automodule:: paddlespeech.audio.transform.cmvn
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.functional module
==============================================

.. automodule:: paddlespeech.audio.transform.functional
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.perturb module
===========================================

.. automodule:: paddlespeech.audio.transform.perturb
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,24 @@
paddlespeech.audio.transform package
====================================

.. automodule:: paddlespeech.audio.transform
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.audio.transform.add_deltas
   paddlespeech.audio.transform.channel_selector
   paddlespeech.audio.transform.cmvn
   paddlespeech.audio.transform.functional
   paddlespeech.audio.transform.perturb
   paddlespeech.audio.transform.spec_augment
   paddlespeech.audio.transform.spectrogram
   paddlespeech.audio.transform.transform_interface
   paddlespeech.audio.transform.transformation
   paddlespeech.audio.transform.wpe

@ -0,0 +1,7 @@
paddlespeech.audio.transform.spec\_augment module
=================================================

.. automodule:: paddlespeech.audio.transform.spec_augment
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.spectrogram module
===============================================

.. automodule:: paddlespeech.audio.transform.spectrogram
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.transform\_interface module
========================================================

.. automodule:: paddlespeech.audio.transform.transform_interface
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.transformation module
==================================================

.. automodule:: paddlespeech.audio.transform.transformation
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.wpe module
=======================================

.. automodule:: paddlespeech.audio.transform.wpe
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.utils.check\_kwargs module
=============================================

.. automodule:: paddlespeech.audio.utils.check_kwargs
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.utils.dynamic\_import module
===============================================

.. automodule:: paddlespeech.audio.utils.dynamic_import
   :members:
   :undoc-members:
   :show-inheritance:

@ -12,8 +12,11 @@ Submodules
.. toctree::
   :maxdepth: 4

   paddlespeech.audio.utils.check_kwargs
   paddlespeech.audio.utils.download
   paddlespeech.audio.utils.dynamic_import
   paddlespeech.audio.utils.error
   paddlespeech.audio.utils.log
   paddlespeech.audio.utils.numeric
   paddlespeech.audio.utils.tensor_utils
   paddlespeech.audio.utils.time

@ -0,0 +1,7 @@
paddlespeech.audio.utils.tensor\_utils module
=============================================

.. automodule:: paddlespeech.audio.utils.tensor_utils
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.collate module
=========================================

.. automodule:: paddlespeech.kws.exps.mdtc.collate
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.compute\_det module
==============================================

.. automodule:: paddlespeech.kws.exps.mdtc.compute_det
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.plot\_det\_curve module
==================================================

.. automodule:: paddlespeech.kws.exps.mdtc.plot_det_curve
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,19 @@
paddlespeech.kws.exps.mdtc package
==================================

.. automodule:: paddlespeech.kws.exps.mdtc
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.kws.exps.mdtc.collate
   paddlespeech.kws.exps.mdtc.compute_det
   paddlespeech.kws.exps.mdtc.plot_det_curve
   paddlespeech.kws.exps.mdtc.score
   paddlespeech.kws.exps.mdtc.train

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.score module
=======================================

.. automodule:: paddlespeech.kws.exps.mdtc.score
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.train module
=======================================

.. automodule:: paddlespeech.kws.exps.mdtc.train
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,15 @@
paddlespeech.kws.exps package
=============================

.. automodule:: paddlespeech.kws.exps
   :members:
   :undoc-members:
   :show-inheritance:

Subpackages
-----------

.. toctree::
   :maxdepth: 4

   paddlespeech.kws.exps.mdtc

@ -12,4 +12,5 @@ Subpackages
.. toctree::
   :maxdepth: 4

   paddlespeech.kws.exps
   paddlespeech.kws.models

@ -0,0 +1,7 @@
paddlespeech.resource.model\_alias module
=========================================

.. automodule:: paddlespeech.resource.model_alias
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.resource.pretrained\_models module
===============================================

.. automodule:: paddlespeech.resource.pretrained_models
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.resource.resource module
=====================================

.. automodule:: paddlespeech.resource.resource
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,17 @@
paddlespeech.resource package
=============================

.. automodule:: paddlespeech.resource
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.resource.model_alias
   paddlespeech.resource.pretrained_models
   paddlespeech.resource.resource

@ -16,8 +16,10 @@ Subpackages
   paddlespeech.cli
   paddlespeech.cls
   paddlespeech.kws
   paddlespeech.resource
   paddlespeech.s2t
   paddlespeech.server
   paddlespeech.t2s
   paddlespeech.text
   paddlespeech.utils
   paddlespeech.vector

@ -19,5 +19,4 @@ Subpackages
   paddlespeech.s2t.models
   paddlespeech.s2t.modules
   paddlespeech.s2t.training
-   paddlespeech.s2t.transform
   paddlespeech.s2t.utils

@ -18,7 +18,6 @@ Submodules
   paddlespeech.server.utils.config
   paddlespeech.server.utils.errors
   paddlespeech.server.utils.exception
-   paddlespeech.server.utils.log
   paddlespeech.server.utils.onnx_infer
   paddlespeech.server.utils.paddle_predictor
   paddlespeech.server.utils.util

@ -19,4 +19,5 @@ Submodules
   paddlespeech.t2s.datasets.get_feats
   paddlespeech.t2s.datasets.ljspeech
   paddlespeech.t2s.datasets.preprocess_utils
   paddlespeech.t2s.datasets.sampler
   paddlespeech.t2s.datasets.vocoder_batch_fn

@ -0,0 +1,7 @@
paddlespeech.t2s.datasets.sampler module
========================================

.. automodule:: paddlespeech.t2s.datasets.sampler
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.align module
=============================================

.. automodule:: paddlespeech.t2s.exps.ernie_sat.align
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.normalize module
=================================================

.. automodule:: paddlespeech.t2s.exps.ernie_sat.normalize
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.preprocess module
==================================================

.. automodule:: paddlespeech.t2s.exps.ernie_sat.preprocess
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,21 @@
paddlespeech.t2s.exps.ernie\_sat package
========================================

.. automodule:: paddlespeech.t2s.exps.ernie_sat
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.t2s.exps.ernie_sat.align
   paddlespeech.t2s.exps.ernie_sat.normalize
   paddlespeech.t2s.exps.ernie_sat.preprocess
   paddlespeech.t2s.exps.ernie_sat.synthesize
   paddlespeech.t2s.exps.ernie_sat.synthesize_e2e
   paddlespeech.t2s.exps.ernie_sat.train
   paddlespeech.t2s.exps.ernie_sat.utils

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.synthesize module
==================================================

.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.synthesize\_e2e module
=======================================================

.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize_e2e
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.train module
=============================================

.. automodule:: paddlespeech.t2s.exps.ernie_sat.train
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.utils module
=============================================

.. automodule:: paddlespeech.t2s.exps.ernie_sat.utils
   :members:
   :undoc-members:
   :show-inheritance:

@ -16,3 +16,4 @@ Submodules
   paddlespeech.t2s.exps.fastspeech2.normalize
   paddlespeech.t2s.exps.fastspeech2.preprocess
   paddlespeech.t2s.exps.fastspeech2.train
   paddlespeech.t2s.exps.fastspeech2.vc2_infer

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.fastspeech2.vc2\_infer module
===================================================

.. automodule:: paddlespeech.t2s.exps.fastspeech2.vc2_infer
   :members:
   :undoc-members:
   :show-inheritance:

@ -12,11 +12,13 @@ Subpackages
.. toctree::
   :maxdepth: 4

   paddlespeech.t2s.exps.ernie_sat
   paddlespeech.t2s.exps.fastspeech2
   paddlespeech.t2s.exps.gan_vocoder
   paddlespeech.t2s.exps.speedyspeech
   paddlespeech.t2s.exps.tacotron2
   paddlespeech.t2s.exps.transformer_tts
   paddlespeech.t2s.exps.vits
   paddlespeech.t2s.exps.waveflow
   paddlespeech.t2s.exps.wavernn
@ -31,6 +33,7 @@ Submodules
   paddlespeech.t2s.exps.ort_predict
   paddlespeech.t2s.exps.ort_predict_e2e
   paddlespeech.t2s.exps.ort_predict_streaming
   paddlespeech.t2s.exps.stream_play_tts
   paddlespeech.t2s.exps.syn_utils
   paddlespeech.t2s.exps.synthesize
   paddlespeech.t2s.exps.synthesize_e2e

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.stream\_play\_tts module
==============================================

.. automodule:: paddlespeech.t2s.exps.stream_play_tts
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.normalize module
===========================================

.. automodule:: paddlespeech.t2s.exps.vits.normalize
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.preprocess module
============================================

.. automodule:: paddlespeech.t2s.exps.vits.preprocess
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,20 @@
paddlespeech.t2s.exps.vits package
==================================

.. automodule:: paddlespeech.t2s.exps.vits
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.t2s.exps.vits.normalize
   paddlespeech.t2s.exps.vits.preprocess
   paddlespeech.t2s.exps.vits.synthesize
   paddlespeech.t2s.exps.vits.synthesize_e2e
   paddlespeech.t2s.exps.vits.train
   paddlespeech.t2s.exps.vits.voice_cloning

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.synthesize module
============================================

.. automodule:: paddlespeech.t2s.exps.vits.synthesize
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.synthesize\_e2e module
=================================================

.. automodule:: paddlespeech.t2s.exps.vits.synthesize_e2e
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.train module
=======================================

.. automodule:: paddlespeech.t2s.exps.vits.train
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.voice\_cloning module
================================================

.. automodule:: paddlespeech.t2s.exps.vits.voice_cloning
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.g2pw.dataset module
=============================================

.. automodule:: paddlespeech.t2s.frontend.g2pw.dataset
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.g2pw.onnx\_api module
===============================================

.. automodule:: paddlespeech.t2s.frontend.g2pw.onnx_api
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,17 @@
paddlespeech.t2s.frontend.g2pw package
======================================

.. automodule:: paddlespeech.t2s.frontend.g2pw
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.t2s.frontend.g2pw.dataset
   paddlespeech.t2s.frontend.g2pw.onnx_api
   paddlespeech.t2s.frontend.g2pw.utils

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.g2pw.utils module
===========================================

.. automodule:: paddlespeech.t2s.frontend.g2pw.utils
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.mix\_frontend module
==============================================

.. automodule:: paddlespeech.t2s.frontend.mix_frontend
   :members:
   :undoc-members:
   :show-inheritance:

@ -12,6 +12,7 @@ Subpackages
.. toctree::
   :maxdepth: 4

   paddlespeech.t2s.frontend.g2pw
   paddlespeech.t2s.frontend.normalizer
   paddlespeech.t2s.frontend.zh_normalization
@ -23,6 +24,7 @@ Submodules
   paddlespeech.t2s.frontend.arpabet
   paddlespeech.t2s.frontend.generate_lexicon
   paddlespeech.t2s.frontend.mix_frontend
   paddlespeech.t2s.frontend.phonectic
   paddlespeech.t2s.frontend.punctuation
   paddlespeech.t2s.frontend.tone_sandhi

@ -0,0 +1,7 @@
paddlespeech.t2s.models.ernie\_sat.ernie\_sat module
====================================================

.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.models.ernie\_sat.ernie\_sat\_updater module
=============================================================

.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat_updater
   :members:
   :undoc-members:
   :show-inheritance:

@ -12,4 +12,5 @@ Submodules
.. toctree::
   :maxdepth: 4

-   paddlespeech.t2s.models.ernie_sat.mlm
+   paddlespeech.t2s.models.ernie_sat.ernie_sat
+   paddlespeech.t2s.models.ernie_sat.ernie_sat_updater

@ -0,0 +1,7 @@
paddlespeech.t2s.models.vits.monotonic\_align.core module
=========================================================

.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.core
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,16 @@
paddlespeech.t2s.models.vits.monotonic\_align package
=====================================================

.. automodule:: paddlespeech.t2s.models.vits.monotonic_align
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.t2s.models.vits.monotonic_align.core
   paddlespeech.t2s.models.vits.monotonic_align.setup

@ -0,0 +1,7 @@
paddlespeech.t2s.models.vits.monotonic\_align.setup module
==========================================================

.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.setup
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.utils.dynamic\_import module
=========================================

.. automodule:: paddlespeech.utils.dynamic_import
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.utils.env module
=============================

.. automodule:: paddlespeech.utils.env
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,16 @@
paddlespeech.utils package
==========================

.. automodule:: paddlespeech.utils
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.utils.dynamic_import
   paddlespeech.utils.env

@ -74,8 +74,10 @@ Contents
   paddlespeech.cli <api/paddlespeech.cli>
   paddlespeech.cls <api/paddlespeech.cls>
   paddlespeech.kws <api/paddlespeech.kws>
   paddlespeech.resource <api/paddlespeech.resource>
   paddlespeech.s2t <api/paddlespeech.s2t>
   paddlespeech.server <api/paddlespeech.server>
   paddlespeech.t2s <api/paddlespeech.t2s>
   paddlespeech.text <api/paddlespeech.text>
   paddlespeech.utils <api/paddlespeech.utils>
   paddlespeech.vector <api/paddlespeech.vector>

@ -5,6 +5,7 @@
- [Disambiguation of Chinese Polyphones in an End-to-End Framework with Semantic Features Extracted by Pre-trained BERT](https://www1.se.cuhk.edu.hk/~hccl/publications/pub/201909_INTERSPEECH_DongyangDAI.pdf)
- [Polyphone Disambiguation in Mandarin Chinese with Semi-Supervised Learning](https://www.isca-speech.org/archive/pdfs/interspeech_2021/shi21d_interspeech.pdf)
  * github: https://github.com/PaperMechanica/SemiPPL
- [WikipediaHomographData](https://github.com/google-research-datasets/WikipediaHomographData)

### Text Normalization
#### English
- [applenob/text_normalization](https://github.com/applenob/text_normalization)

@ -1,11 +1,10 @@
-# ERNIE-SAT with AISHELL3 dataset
+# ERNIE-SAT with VCTK dataset
ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as speech editing, personalized speech synthesis, and voice cloning.
## Model Framework
In ERNIE-SAT, we propose two innovations:
- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
- The joint mask learning of speech and text is used to realize the alignment of speech and text
<p align="center">
<img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />

@ -1,11 +1,10 @@
-# ERNIE-SAT with AISHELL3 and VCTK dataset
+# ERNIE-SAT with VCTK dataset
ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as speech editing, personalized speech synthesis, and voice cloning.
## Model Framework
In ERNIE-SAT, we propose two innovations:
- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
- The joint mask learning of speech and text is used to realize the alignment of speech and text
<p align="center">
<img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />

@ -75,6 +75,15 @@ When "Prepare" done. The structure of the current directory is listed below.
```
### Set finetune.yaml
`finetune.yaml` contains some configurations for fine-tuning. You can tune these options to get a better result.
Arguments:
- `batch_size`: fine-tuning batch size. Default: -1, meaning 64, the same as the pretrained model.
- `learning_rate`: learning rate. Default: 0.0001.
- `num_snapshots`: number of saved checkpoints. Default: -1, meaning 5, the same as the pretrained model.
- `frozen_layers`: layers to freeze; must be a list. If you don't want to freeze any layer, set it to [].

## Get Started
Run the command below to

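For illustration, a minimal sketch of how these options are merged into the pretrained model's configuration; it mirrors the override logic in `finetune.py` shown later in this diff (`config` is assumed to be the pretrained model's `CfgNode`):

```python
import yaml
from yacs.config import CfgNode

with open("./finetune.yaml") as f:
    finetune_config = CfgNode(yaml.safe_load(f))

# A non-positive value keeps the pretrained model's setting.
if finetune_config.batch_size > 0:
    config.batch_size = finetune_config.batch_size
if finetune_config.learning_rate > 0:
    config.optimizer.learning_rate = finetune_config.learning_rate
if finetune_config.num_snapshots > 0:
    config.num_snapshots = finetune_config.num_snapshots

frozen_layers = finetune_config.frozen_layers
assert isinstance(frozen_layers, list), "frozen_layers must be a list"
```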
@ -14,6 +14,7 @@
import argparse
import os
from pathlib import Path
from typing import List
from typing import Union

import yaml
@ -21,10 +22,10 @@ from local.check_oov import get_check_result
from local.extract import extract_feature
from local.label_process import get_single_label
from local.prepare_env import generate_finetune_env
from local.train import train_sp
from paddle import distributed as dist
from yacs.config import CfgNode
-from paddlespeech.t2s.exps.fastspeech2.train import train_sp
from utils.gen_duration_from_textgrid import gen_duration_from_textgrid

DICT_EN = 'tools/aligner/cmudict-0.7b'
@ -38,15 +39,24 @@ os.environ['PATH'] = MFA_PATH + '/:' + os.environ['PATH']
class TrainArgs():
-    def __init__(self, ngpu, config_file, dump_dir: Path, output_dir: Path):
+    def __init__(self,
+                 ngpu,
+                 config_file,
+                 dump_dir: Path,
+                 output_dir: Path,
+                 frozen_layers: List[str]):
        # config: fastspeech2 config file.
        self.config = str(config_file)
        self.train_metadata = str(dump_dir / "train/norm/metadata.jsonl")
        self.dev_metadata = str(dump_dir / "dev/norm/metadata.jsonl")
        # model output dir.
        self.output_dir = str(output_dir)
        self.ngpu = ngpu
        self.phones_dict = str(dump_dir / "phone_id_map.txt")
        self.speaker_dict = str(dump_dir / "speaker_id_map.txt")
        self.voice_cloning = False
        # frozen layers
        self.frozen_layers = frozen_layers
def get_mfa_result( def get_mfa_result(
@ -122,12 +132,11 @@ if __name__ == '__main__':
"--ngpu", type=int, default=2, help="if ngpu=0, use cpu.") "--ngpu", type=int, default=2, help="if ngpu=0, use cpu.")
parser.add_argument("--epoch", type=int, default=100, help="finetune epoch") parser.add_argument("--epoch", type=int, default=100, help="finetune epoch")
parser.add_argument( parser.add_argument(
"--batch_size", "--finetune_config",
type=int, type=str,
default=-1, default="./finetune.yaml",
help="batch size, default -1 means same as pretrained model") help="Path to finetune config file")
args = parser.parse_args() args = parser.parse_args()
@ -147,8 +156,14 @@ if __name__ == '__main__':
    with open(config_file) as f:
        config = CfgNode(yaml.safe_load(f))
    config.max_epoch = config.max_epoch + args.epoch
-    if args.batch_size > 0:
-        config.batch_size = args.batch_size
+    with open(args.finetune_config) as f2:
+        finetune_config = CfgNode(yaml.safe_load(f2))
+    config.batch_size = finetune_config.batch_size if finetune_config.batch_size > 0 else config.batch_size
+    config.optimizer.learning_rate = finetune_config.learning_rate if finetune_config.learning_rate > 0 else config.optimizer.learning_rate
+    config.num_snapshots = finetune_config.num_snapshots if finetune_config.num_snapshots > 0 else config.num_snapshots
+    frozen_layers = finetune_config.frozen_layers
+    assert type(frozen_layers) == list, "frozen_layers should be set a list."

    if args.lang == 'en':
        lexicon_file = DICT_EN
@ -158,6 +173,13 @@ if __name__ == '__main__':
        mfa_phone_file = MFA_PHONE_ZH
    else:
        print('please input right lang!!')

    print(f"finetune max_epoch: {config.max_epoch}")
    print(f"finetune batch_size: {config.batch_size}")
    print(f"finetune learning_rate: {config.optimizer.learning_rate}")
    print(f"finetune num_snapshots: {config.num_snapshots}")
    print(f"finetune frozen_layers: {frozen_layers}")

    am_phone_file = pretrained_model_dir / "phone_id_map.txt"
    label_file = input_dir / "labels.txt"
@ -181,7 +203,8 @@ if __name__ == '__main__':
    generate_finetune_env(output_dir, pretrained_model_dir)

    # create a new args for training
-    train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir)
+    train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir,
+                           frozen_layers)

    # finetune models
    # dispatch

@ -0,0 +1,12 @@
###########################################################
# PARAS SETTING #
###########################################################
# Set to -1 to indicate that the parameter is the same as the pretrained model configuration
batch_size: -1
learning_rate: 0.0001 # learning rate
num_snapshots: -1
# frozen_layers should be a list
# if you don't need to freeze, set frozen_layers to []
frozen_layers: ["encoder", "duration_predictor"]

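A quick sanity check (a sketch, assuming the file above is saved as `finetune.yaml`) that it parses with the same loader `finetune.py` uses:

```python
import yaml
from yacs.config import CfgNode

with open("finetune.yaml") as f:
    cfg = CfgNode(yaml.safe_load(f))

assert isinstance(cfg.frozen_layers, list)
print(cfg.batch_size, cfg.learning_rate, cfg.num_snapshots, cfg.frozen_layers)
```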
@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
-import math
import os
from operator import itemgetter
from pathlib import Path
@ -211,9 +210,9 @@ def extract_feature(duration_file: str,
    mel_extractor, pitch_extractor, energy_extractor = get_extractor(config)

    wav_files = sorted(list((input_dir).rglob("*.wav")))
-    # split data into 3 sections, train: 80%, dev: 10%, test: 10%
-    num_train = math.ceil(len(wav_files) * 0.8)
-    num_dev = math.ceil(len(wav_files) * 0.1)
+    # split data into 3 sections, train: len(wav_files) - 2, dev: 1, test: 1
+    num_train = len(wav_files) - 2
+    num_dev = 1
    print(num_train, num_dev)
    train_wav_files = wav_files[:num_train]

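To make the new split rule concrete, a small worked sketch of both rules (the dev/test slicing below is an assumption based on the surrounding code, which only shows `train_wav_files`):

```python
import math

wav_files = [f"{i:04d}.wav" for i in range(50)]

# old rule: 80% / 10% / 10%
old_num_train = math.ceil(len(wav_files) * 0.8)  # 40
old_num_dev = math.ceil(len(wav_files) * 0.1)    # 5

# new rule: all but two files go to train, one to dev, one to test
num_train = len(wav_files) - 2                   # 48
num_dev = 1

train_wav_files = wav_files[:num_train]
dev_wav_files = wav_files[num_train:num_train + num_dev]
test_wav_files = wav_files[num_train + num_dev:]
assert list(map(len, (train_wav_files, dev_wav_files, test_wav_files))) == [48, 1, 1]
```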
@ -0,0 +1,178 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import shutil
from pathlib import Path
from typing import List
import jsonlines
import numpy as np
import paddle
from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_multi_spk_batch_fn
from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Evaluator
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Updater
from paddlespeech.t2s.training.extensions.snapshot import Snapshot
from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.optimizer import build_optimizers
from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer
def freeze_layer(model, layers: List[str]):
    """freeze layers

    Args:
        model (nn.Layer): model whose sub-layers will be frozen
        layers (List[str]): names of the sub-layers to freeze
    """
    for layer in layers:
        # e.g. layer == "encoder" resolves to model.encoder.parameters()
        for param in eval("model." + layer + ".parameters()"):
            param.trainable = False
def train_sp(args, config):
    # decides device type and whether to run in parallel
    # setup running environment correctly
    if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
        paddle.set_device("cpu")
    else:
        paddle.set_device("gpu")
        world_size = paddle.distributed.get_world_size()
        if world_size > 1:
            paddle.distributed.init_parallel_env()

    # set the random seed, it is a must for multiprocess training
    seed_everything(config.seed)

    print(
        f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
    )

    fields = [
        "text", "text_lengths", "speech", "speech_lengths", "durations",
        "pitch", "energy"
    ]
    converters = {"speech": np.load, "pitch": np.load, "energy": np.load}
    spk_num = None
    if args.speaker_dict is not None:
        print("multiple speaker fastspeech2!")
        collate_fn = fastspeech2_multi_spk_batch_fn
        with open(args.speaker_dict, 'rt') as f:
            spk_id = [line.strip().split() for line in f.readlines()]
        spk_num = len(spk_id)
        fields += ["spk_id"]
    elif args.voice_cloning:
        print("Training voice cloning!")
        collate_fn = fastspeech2_multi_spk_batch_fn
        fields += ["spk_emb"]
        converters["spk_emb"] = np.load
    else:
        print("single speaker fastspeech2!")
        collate_fn = fastspeech2_single_spk_batch_fn
    print("spk_num:", spk_num)

    # dataloader has been too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for training and validation
    with jsonlines.open(args.train_metadata, 'r') as reader:
        train_metadata = list(reader)
    train_dataset = DataTable(
        data=train_metadata,
        fields=fields,
        converters=converters, )
    with jsonlines.open(args.dev_metadata, 'r') as reader:
        dev_metadata = list(reader)
    dev_dataset = DataTable(
        data=dev_metadata,
        fields=fields,
        converters=converters, )

    # collate function and dataloader
    train_sampler = DistributedBatchSampler(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        drop_last=True)

    print("samplers done!")

    train_dataloader = DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        collate_fn=collate_fn,
        num_workers=config.num_workers)

    dev_dataloader = DataLoader(
        dev_dataset,
        shuffle=False,
        drop_last=False,
        batch_size=config.batch_size,
        collate_fn=collate_fn,
        num_workers=config.num_workers)
    print("dataloaders done!")

    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)

    odim = config.n_mels
    model = FastSpeech2(
        idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"])

    # freeze layer
    if args.frozen_layers != []:
        freeze_layer(model, args.frozen_layers)

    if world_size > 1:
        model = DataParallel(model)
    print("model done!")

    optimizer = build_optimizers(model, **config["optimizer"])
    print("optimizer done!")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    if dist.get_rank() == 0:
        config_name = args.config.split("/")[-1]
        # copy conf to output_dir
        shutil.copyfile(args.config, output_dir / config_name)

    updater = FastSpeech2Updater(
        model=model,
        optimizer=optimizer,
        dataloader=train_dataloader,
        output_dir=output_dir,
        **config["updater"])

    trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)

    evaluator = FastSpeech2Evaluator(
        model, dev_dataloader, output_dir=output_dir, **config["updater"])

    if dist.get_rank() == 0:
        trainer.extend(evaluator, trigger=(1, "epoch"))
        trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
        trainer.extend(
            Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
    trainer.run()

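As a toy illustration of `freeze_layer` (a sketch: `TinyModel` is hypothetical, but `param.trainable` is exactly the flag the function above flips):

```python
import paddle
from paddle import nn


class TinyModel(nn.Layer):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(4, 4)
        self.decoder = nn.Linear(4, 4)


model = TinyModel()
freeze_layer(model, ["encoder"])  # freeze_layer as defined in train.py above

assert all(not p.trainable for p in model.encoder.parameters())
assert all(p.trainable for p in model.decoder.parameters())
```

Frozen parameters are skipped during optimization, so only the remaining layers are updated while fine-tuning.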
@ -10,11 +10,12 @@ mfa_dir=./mfa_result
dump_dir=./dump
output_dir=./exp/default
lang=zh
-ngpu=2
+ngpu=1
finetune_config=./finetune.yaml
-ckpt=snapshot_iter_96600
+ckpt=snapshot_iter_96699
-gpus=0,1
+gpus=1

CUDA_VISIBLE_DEVICES=${gpus}
stage=0
stop_stage=100
@ -35,7 +36,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --output_dir=${output_dir} \
        --lang=${lang} \
        --ngpu=${ngpu} \
-        --epoch=100
+        --epoch=100 \
+        --finetune_config=${finetune_config}
fi
@ -54,7 +56,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../sentences.txt \
-        --output_dir=./test_e2e \
+        --output_dir=./test_e2e/ \
        --phones_dict=${dump_dir}/phone_id_map.txt \
        --speaker_dict=${dump_dir}/speaker_id_map.txt \
        --spk_id=0

@ -1,11 +1,10 @@
# ERNIE-SAT with VCTK dataset
ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as speech editing, personalized speech synthesis, and voice cloning.
## Model Framework
In ERNIE-SAT, we propose two innovations:
- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
- The joint mask learning of speech and text is used to realize the alignment of speech and text
<p align="center">
<img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />

@ -46,3 +46,10 @@ Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | 16 | 0.078918 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | 16 | 0.079080 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | 16 | 0.054401 |

| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | -1 | 0.050767 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | -1 | 0.061884 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | -1 | 0.062056 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | -1 | 0.052110 |

Some files were not shown because too many files have changed in this diff.