Merge branch 'PaddlePaddle:develop' into develop

pull/2416/head
WongLaw 3 years ago committed by GitHub
commit 68bd85a6b8

@ -61,7 +61,7 @@ tts_python:
    phones_dict:
    tones_dict:
    speaker_dict:
    spk_id: 0
    # voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
    #                        'pwgan_vctk', 'mb_melgan_csmsc', 'style_melgan_csmsc',
@ -87,7 +87,7 @@ tts_inference:
    phones_dict:
    tones_dict:
    speaker_dict:
    spk_id: 0
    am_predictor_conf:
        device: # set 'gpu:id' or 'cpu'

@ -29,7 +29,7 @@ tts_online:
    phones_dict:
    tones_dict:
    speaker_dict:
    spk_id: 0
    # voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc']
    # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference
@ -70,7 +70,6 @@ tts_online-onnx:
    phones_dict:
    tones_dict:
    speaker_dict:
    spk_id: 0
    am_sample_rate: 24000
    am_sess_conf:
        device: "cpu" # set 'gpu:id' or 'cpu'

@ -29,7 +29,7 @@ tts_online:
    phones_dict:
    tones_dict:
    speaker_dict:
    spk_id: 0
    # voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc']
    # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference
@ -70,7 +70,6 @@ tts_online-onnx:
    phones_dict:
    tones_dict:
    speaker_dict:
    spk_id: 0
    am_sample_rate: 24000
    am_sess_conf:
        device: "cpu" # set 'gpu:id' or 'cpu'

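These `spk_id` entries select which speaker a multi-speaker acoustic model synthesizes. As a minimal sketch of switching speakers by editing the server config (assuming the file is saved as `application.yaml`, as in the PaddleSpeech server demos; the file name is an assumption, the section names come from the hunks above):

```python
import yaml

# Load the streaming TTS server config, pick another speaker id, save it back.
with open("application.yaml") as f:
    conf = yaml.safe_load(f)

conf["tts_online"]["spk_id"] = 10  # any id listed in speaker_dict
with open("application.yaml", "w") as f:
    yaml.safe_dump(conf, f)
```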
@ -20,4 +20,7 @@ Subpackages
   paddlespeech.audio.io
   paddlespeech.audio.metric
   paddlespeech.audio.sox_effects
   paddlespeech.audio.streamdata
   paddlespeech.audio.text
   paddlespeech.audio.transform
   paddlespeech.audio.utils

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.autodecode module
===============================================

.. automodule:: paddlespeech.audio.streamdata.autodecode
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.cache module
==========================================

.. automodule:: paddlespeech.audio.streamdata.cache
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.compat module
===========================================

.. automodule:: paddlespeech.audio.streamdata.compat
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.extradatasets module
==================================================

.. automodule:: paddlespeech.audio.streamdata.extradatasets
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.filters module
============================================

.. automodule:: paddlespeech.audio.streamdata.filters
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.gopen module
==========================================

.. automodule:: paddlespeech.audio.streamdata.gopen
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.handlers module
=============================================

.. automodule:: paddlespeech.audio.streamdata.handlers
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.mix module
========================================

.. automodule:: paddlespeech.audio.streamdata.mix
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.paddle\_utils module
==================================================

.. automodule:: paddlespeech.audio.streamdata.paddle_utils
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.pipeline module
=============================================

.. automodule:: paddlespeech.audio.streamdata.pipeline
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,28 @@
paddlespeech.audio.streamdata package
=====================================

.. automodule:: paddlespeech.audio.streamdata
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.audio.streamdata.autodecode
   paddlespeech.audio.streamdata.cache
   paddlespeech.audio.streamdata.compat
   paddlespeech.audio.streamdata.extradatasets
   paddlespeech.audio.streamdata.filters
   paddlespeech.audio.streamdata.gopen
   paddlespeech.audio.streamdata.handlers
   paddlespeech.audio.streamdata.mix
   paddlespeech.audio.streamdata.paddle_utils
   paddlespeech.audio.streamdata.pipeline
   paddlespeech.audio.streamdata.shardlists
   paddlespeech.audio.streamdata.tariterators
   paddlespeech.audio.streamdata.utils
   paddlespeech.audio.streamdata.writer

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.shardlists module
===============================================

.. automodule:: paddlespeech.audio.streamdata.shardlists
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.tariterators module
=================================================

.. automodule:: paddlespeech.audio.streamdata.tariterators
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.utils module
==========================================

.. automodule:: paddlespeech.audio.streamdata.utils
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.streamdata.writer module
===========================================

.. automodule:: paddlespeech.audio.streamdata.writer
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,16 @@
paddlespeech.audio.text package
===============================

.. automodule:: paddlespeech.audio.text
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.audio.text.text_featurizer
   paddlespeech.audio.text.utility

@ -0,0 +1,7 @@
paddlespeech.audio.text.text\_featurizer module
===============================================

.. automodule:: paddlespeech.audio.text.text_featurizer
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.text.utility module
======================================

.. automodule:: paddlespeech.audio.text.utility
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.add\_deltas module
===============================================

.. automodule:: paddlespeech.audio.transform.add_deltas
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.channel\_selector module
=====================================================

.. automodule:: paddlespeech.audio.transform.channel_selector
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.cmvn module
========================================

.. automodule:: paddlespeech.audio.transform.cmvn
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.functional module
==============================================

.. automodule:: paddlespeech.audio.transform.functional
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.perturb module
===========================================

.. automodule:: paddlespeech.audio.transform.perturb
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,24 @@
paddlespeech.audio.transform package
====================================

.. automodule:: paddlespeech.audio.transform
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.audio.transform.add_deltas
   paddlespeech.audio.transform.channel_selector
   paddlespeech.audio.transform.cmvn
   paddlespeech.audio.transform.functional
   paddlespeech.audio.transform.perturb
   paddlespeech.audio.transform.spec_augment
   paddlespeech.audio.transform.spectrogram
   paddlespeech.audio.transform.transform_interface
   paddlespeech.audio.transform.transformation
   paddlespeech.audio.transform.wpe

@ -0,0 +1,7 @@
paddlespeech.audio.transform.spec\_augment module
=================================================

.. automodule:: paddlespeech.audio.transform.spec_augment
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.spectrogram module
===============================================

.. automodule:: paddlespeech.audio.transform.spectrogram
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.transform\_interface module
========================================================

.. automodule:: paddlespeech.audio.transform.transform_interface
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.transformation module
==================================================

.. automodule:: paddlespeech.audio.transform.transformation
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.transform.wpe module
=======================================

.. automodule:: paddlespeech.audio.transform.wpe
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.utils.check\_kwargs module
=============================================

.. automodule:: paddlespeech.audio.utils.check_kwargs
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.audio.utils.dynamic\_import module
===============================================

.. automodule:: paddlespeech.audio.utils.dynamic_import
   :members:
   :undoc-members:
   :show-inheritance:

@ -12,8 +12,11 @@ Submodules
.. toctree::
   :maxdepth: 4

   paddlespeech.audio.utils.check_kwargs
   paddlespeech.audio.utils.download
   paddlespeech.audio.utils.dynamic_import
   paddlespeech.audio.utils.error
   paddlespeech.audio.utils.log
   paddlespeech.audio.utils.numeric
   paddlespeech.audio.utils.tensor_utils
   paddlespeech.audio.utils.time

@ -0,0 +1,7 @@
paddlespeech.audio.utils.tensor\_utils module
=============================================

.. automodule:: paddlespeech.audio.utils.tensor_utils
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.collate module
=========================================

.. automodule:: paddlespeech.kws.exps.mdtc.collate
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.compute\_det module
==============================================

.. automodule:: paddlespeech.kws.exps.mdtc.compute_det
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.plot\_det\_curve module
==================================================

.. automodule:: paddlespeech.kws.exps.mdtc.plot_det_curve
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,19 @@
paddlespeech.kws.exps.mdtc package
==================================

.. automodule:: paddlespeech.kws.exps.mdtc
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.kws.exps.mdtc.collate
   paddlespeech.kws.exps.mdtc.compute_det
   paddlespeech.kws.exps.mdtc.plot_det_curve
   paddlespeech.kws.exps.mdtc.score
   paddlespeech.kws.exps.mdtc.train

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.score module
=======================================

.. automodule:: paddlespeech.kws.exps.mdtc.score
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.kws.exps.mdtc.train module
=======================================

.. automodule:: paddlespeech.kws.exps.mdtc.train
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,15 @@
paddlespeech.kws.exps package
=============================

.. automodule:: paddlespeech.kws.exps
   :members:
   :undoc-members:
   :show-inheritance:

Subpackages
-----------

.. toctree::
   :maxdepth: 4

   paddlespeech.kws.exps.mdtc

@ -12,4 +12,5 @@ Subpackages
.. toctree::
   :maxdepth: 4

   paddlespeech.kws.exps
   paddlespeech.kws.models

@ -0,0 +1,7 @@
paddlespeech.resource.model\_alias module
=========================================

.. automodule:: paddlespeech.resource.model_alias
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.resource.pretrained\_models module
===============================================

.. automodule:: paddlespeech.resource.pretrained_models
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.resource.resource module
=====================================

.. automodule:: paddlespeech.resource.resource
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,17 @@
paddlespeech.resource package
=============================

.. automodule:: paddlespeech.resource
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.resource.model_alias
   paddlespeech.resource.pretrained_models
   paddlespeech.resource.resource

@ -16,8 +16,10 @@ Subpackages
   paddlespeech.cli
   paddlespeech.cls
   paddlespeech.kws
   paddlespeech.resource
   paddlespeech.s2t
   paddlespeech.server
   paddlespeech.t2s
   paddlespeech.text
   paddlespeech.utils
   paddlespeech.vector

@ -19,5 +19,4 @@ Subpackages
   paddlespeech.s2t.models
   paddlespeech.s2t.modules
   paddlespeech.s2t.training
-   paddlespeech.s2t.transform
   paddlespeech.s2t.utils

@ -18,7 +18,6 @@ Submodules
   paddlespeech.server.utils.config
   paddlespeech.server.utils.errors
   paddlespeech.server.utils.exception
-   paddlespeech.server.utils.log
   paddlespeech.server.utils.onnx_infer
   paddlespeech.server.utils.paddle_predictor
   paddlespeech.server.utils.util

@ -19,4 +19,5 @@ Submodules
   paddlespeech.t2s.datasets.get_feats
   paddlespeech.t2s.datasets.ljspeech
   paddlespeech.t2s.datasets.preprocess_utils
   paddlespeech.t2s.datasets.sampler
   paddlespeech.t2s.datasets.vocoder_batch_fn

@ -0,0 +1,7 @@
paddlespeech.t2s.datasets.sampler module
========================================

.. automodule:: paddlespeech.t2s.datasets.sampler
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.align module
=============================================

.. automodule:: paddlespeech.t2s.exps.ernie_sat.align
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.normalize module
=================================================

.. automodule:: paddlespeech.t2s.exps.ernie_sat.normalize
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.preprocess module
==================================================

.. automodule:: paddlespeech.t2s.exps.ernie_sat.preprocess
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,21 @@
paddlespeech.t2s.exps.ernie\_sat package
========================================

.. automodule:: paddlespeech.t2s.exps.ernie_sat
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.t2s.exps.ernie_sat.align
   paddlespeech.t2s.exps.ernie_sat.normalize
   paddlespeech.t2s.exps.ernie_sat.preprocess
   paddlespeech.t2s.exps.ernie_sat.synthesize
   paddlespeech.t2s.exps.ernie_sat.synthesize_e2e
   paddlespeech.t2s.exps.ernie_sat.train
   paddlespeech.t2s.exps.ernie_sat.utils

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.synthesize module
==================================================

.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.synthesize\_e2e module
=======================================================

.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize_e2e
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.train module
=============================================

.. automodule:: paddlespeech.t2s.exps.ernie_sat.train
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.ernie\_sat.utils module
=============================================

.. automodule:: paddlespeech.t2s.exps.ernie_sat.utils
   :members:
   :undoc-members:
   :show-inheritance:

@ -16,3 +16,4 @@ Submodules
   paddlespeech.t2s.exps.fastspeech2.normalize
   paddlespeech.t2s.exps.fastspeech2.preprocess
   paddlespeech.t2s.exps.fastspeech2.train
   paddlespeech.t2s.exps.fastspeech2.vc2_infer

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.fastspeech2.vc2\_infer module
===================================================

.. automodule:: paddlespeech.t2s.exps.fastspeech2.vc2_infer
   :members:
   :undoc-members:
   :show-inheritance:

@ -12,11 +12,13 @@ Subpackages
.. toctree::
   :maxdepth: 4

   paddlespeech.t2s.exps.ernie_sat
   paddlespeech.t2s.exps.fastspeech2
   paddlespeech.t2s.exps.gan_vocoder
   paddlespeech.t2s.exps.speedyspeech
   paddlespeech.t2s.exps.tacotron2
   paddlespeech.t2s.exps.transformer_tts
   paddlespeech.t2s.exps.vits
   paddlespeech.t2s.exps.waveflow
   paddlespeech.t2s.exps.wavernn
@ -31,6 +33,7 @@ Submodules
   paddlespeech.t2s.exps.ort_predict
   paddlespeech.t2s.exps.ort_predict_e2e
   paddlespeech.t2s.exps.ort_predict_streaming
   paddlespeech.t2s.exps.stream_play_tts
   paddlespeech.t2s.exps.syn_utils
   paddlespeech.t2s.exps.synthesize
   paddlespeech.t2s.exps.synthesize_e2e

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.stream\_play\_tts module
==============================================

.. automodule:: paddlespeech.t2s.exps.stream_play_tts
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.normalize module
===========================================

.. automodule:: paddlespeech.t2s.exps.vits.normalize
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.preprocess module
============================================

.. automodule:: paddlespeech.t2s.exps.vits.preprocess
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,20 @@
paddlespeech.t2s.exps.vits package
==================================

.. automodule:: paddlespeech.t2s.exps.vits
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.t2s.exps.vits.normalize
   paddlespeech.t2s.exps.vits.preprocess
   paddlespeech.t2s.exps.vits.synthesize
   paddlespeech.t2s.exps.vits.synthesize_e2e
   paddlespeech.t2s.exps.vits.train
   paddlespeech.t2s.exps.vits.voice_cloning

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.synthesize module
============================================

.. automodule:: paddlespeech.t2s.exps.vits.synthesize
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.synthesize\_e2e module
=================================================

.. automodule:: paddlespeech.t2s.exps.vits.synthesize_e2e
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.train module
=======================================

.. automodule:: paddlespeech.t2s.exps.vits.train
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.exps.vits.voice\_cloning module
================================================

.. automodule:: paddlespeech.t2s.exps.vits.voice_cloning
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.g2pw.dataset module
=============================================

.. automodule:: paddlespeech.t2s.frontend.g2pw.dataset
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.g2pw.onnx\_api module
===============================================

.. automodule:: paddlespeech.t2s.frontend.g2pw.onnx_api
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,17 @@
paddlespeech.t2s.frontend.g2pw package
======================================

.. automodule:: paddlespeech.t2s.frontend.g2pw
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.t2s.frontend.g2pw.dataset
   paddlespeech.t2s.frontend.g2pw.onnx_api
   paddlespeech.t2s.frontend.g2pw.utils

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.g2pw.utils module
===========================================

.. automodule:: paddlespeech.t2s.frontend.g2pw.utils
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.frontend.mix\_frontend module
==============================================

.. automodule:: paddlespeech.t2s.frontend.mix_frontend
   :members:
   :undoc-members:
   :show-inheritance:

@ -12,6 +12,7 @@ Subpackages
.. toctree::
   :maxdepth: 4

   paddlespeech.t2s.frontend.g2pw
   paddlespeech.t2s.frontend.normalizer
   paddlespeech.t2s.frontend.zh_normalization
@ -23,6 +24,7 @@ Submodules
   paddlespeech.t2s.frontend.arpabet
   paddlespeech.t2s.frontend.generate_lexicon
   paddlespeech.t2s.frontend.mix_frontend
   paddlespeech.t2s.frontend.phonectic
   paddlespeech.t2s.frontend.punctuation
   paddlespeech.t2s.frontend.tone_sandhi

@ -0,0 +1,7 @@
paddlespeech.t2s.models.ernie\_sat.ernie\_sat module
====================================================

.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.t2s.models.ernie\_sat.ernie\_sat\_updater module
=============================================================

.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat_updater
   :members:
   :undoc-members:
   :show-inheritance:

@ -12,4 +12,5 @@ Submodules
.. toctree::
   :maxdepth: 4

-   paddlespeech.t2s.models.ernie_sat.mlm
+   paddlespeech.t2s.models.ernie_sat.ernie_sat
+   paddlespeech.t2s.models.ernie_sat.ernie_sat_updater

@ -0,0 +1,7 @@
paddlespeech.t2s.models.vits.monotonic\_align.core module
=========================================================

.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.core
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,16 @@
paddlespeech.t2s.models.vits.monotonic\_align package
=====================================================

.. automodule:: paddlespeech.t2s.models.vits.monotonic_align
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.t2s.models.vits.monotonic_align.core
   paddlespeech.t2s.models.vits.monotonic_align.setup

@ -0,0 +1,7 @@
paddlespeech.t2s.models.vits.monotonic\_align.setup module
==========================================================

.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.setup
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.utils.dynamic\_import module
=========================================

.. automodule:: paddlespeech.utils.dynamic_import
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,7 @@
paddlespeech.utils.env module
=============================

.. automodule:: paddlespeech.utils.env
   :members:
   :undoc-members:
   :show-inheritance:

@ -0,0 +1,16 @@
paddlespeech.utils package
==========================

.. automodule:: paddlespeech.utils
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

.. toctree::
   :maxdepth: 4

   paddlespeech.utils.dynamic_import
   paddlespeech.utils.env

@ -74,8 +74,10 @@ Contents
   paddlespeech.cli <api/paddlespeech.cli>
   paddlespeech.cls <api/paddlespeech.cls>
   paddlespeech.kws <api/paddlespeech.kws>
   paddlespeech.resource <api/paddlespeech.resource>
   paddlespeech.s2t <api/paddlespeech.s2t>
   paddlespeech.server <api/paddlespeech.server>
   paddlespeech.t2s <api/paddlespeech.t2s>
   paddlespeech.text <api/paddlespeech.text>
   paddlespeech.utils <api/paddlespeech.utils>
   paddlespeech.vector <api/paddlespeech.vector>

@ -5,6 +5,7 @@
- [Disambiguation of Chinese Polyphones in an End-to-End Framework with Semantic Features Extracted by Pre-trained BERT](https://www1.se.cuhk.edu.hk/~hccl/publications/pub/201909_INTERSPEECH_DongyangDAI.pdf)
- [Polyphone Disambiguation in Mandarin Chinese with Semi-Supervised Learning](https://www.isca-speech.org/archive/pdfs/interspeech_2021/shi21d_interspeech.pdf)
  * github: https://github.com/PaperMechanica/SemiPPL
- [WikipediaHomographData](https://github.com/google-research-datasets/WikipediaHomographData)

### Text Normalization
#### English
- [applenob/text_normalization](https://github.com/applenob/text_normalization)

@ -1,11 +1,10 @@
-# ERNIE-SAT with AISHELL3 dataset
+# ERNIE-SAT with VCTK dataset
ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as speech editing, personalized speech synthesis, and voice cloning.
## Model Framework
In ERNIE-SAT, we propose two innovations:
- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
- The joint mask learning of speech and text is used to realize the alignment of speech and text
<p align="center">
<img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />

@ -1,11 +1,10 @@
-# ERNIE-SAT with AISHELL3 and VCTK dataset
+# ERNIE-SAT with VCTK dataset
ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as speech editing, personalized speech synthesis, and voice cloning.
## Model Framework
In ERNIE-SAT, we propose two innovations:
- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
- The joint mask learning of speech and text is used to realize the alignment of speech and text
<p align="center">
<img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />

@ -75,6 +75,15 @@ When "Prepare" done. The structure of the current directory is listed below.
```
### Set finetune.yaml
`finetune.yaml` contains some configurations for fine-tuning. You can tune these options to get a better result.
Arguments:
- `batch_size`: fine-tuning batch size. Default: -1, meaning 64, the same as the pretrained model.
- `learning_rate`: learning rate. Default: 0.0001.
- `num_snapshots`: number of saved checkpoints. Default: -1, meaning 5, the same as the pretrained model.
- `frozen_layers`: layers to freeze; must be a list. If you don't want to freeze any layer, set it to [].

## Get Started
Run the command below to

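For illustration, a minimal sketch of how these options are merged into the pretrained model's configuration; it mirrors the override logic in `finetune.py` shown later in this diff (`config` is assumed to be the pretrained model's `CfgNode`):

```python
import yaml
from yacs.config import CfgNode

with open("./finetune.yaml") as f:
    finetune_config = CfgNode(yaml.safe_load(f))

# A non-positive value keeps the pretrained model's setting.
if finetune_config.batch_size > 0:
    config.batch_size = finetune_config.batch_size
if finetune_config.learning_rate > 0:
    config.optimizer.learning_rate = finetune_config.learning_rate
if finetune_config.num_snapshots > 0:
    config.num_snapshots = finetune_config.num_snapshots

frozen_layers = finetune_config.frozen_layers
assert isinstance(frozen_layers, list), "frozen_layers must be a list"
```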
@ -14,6 +14,7 @@
import argparse
import os
from pathlib import Path
from typing import List
from typing import Union

import yaml
@ -21,10 +22,10 @@ from local.check_oov import get_check_result
from local.extract import extract_feature
from local.label_process import get_single_label
from local.prepare_env import generate_finetune_env
from local.train import train_sp
from paddle import distributed as dist
from yacs.config import CfgNode
-from paddlespeech.t2s.exps.fastspeech2.train import train_sp
from utils.gen_duration_from_textgrid import gen_duration_from_textgrid

DICT_EN = 'tools/aligner/cmudict-0.7b'
@ -38,15 +39,24 @@ os.environ['PATH'] = MFA_PATH + '/:' + os.environ['PATH']
class TrainArgs():
-    def __init__(self, ngpu, config_file, dump_dir: Path, output_dir: Path):
+    def __init__(self,
+                 ngpu,
+                 config_file,
+                 dump_dir: Path,
+                 output_dir: Path,
+                 frozen_layers: List[str]):
        # config: fastspeech2 config file.
        self.config = str(config_file)
        self.train_metadata = str(dump_dir / "train/norm/metadata.jsonl")
        self.dev_metadata = str(dump_dir / "dev/norm/metadata.jsonl")
        # model output dir.
        self.output_dir = str(output_dir)
        self.ngpu = ngpu
        self.phones_dict = str(dump_dir / "phone_id_map.txt")
        self.speaker_dict = str(dump_dir / "speaker_id_map.txt")
        self.voice_cloning = False
        # frozen layers
        self.frozen_layers = frozen_layers
def get_mfa_result( def get_mfa_result(
@ -122,12 +132,11 @@ if __name__ == '__main__':
"--ngpu", type=int, default=2, help="if ngpu=0, use cpu.") "--ngpu", type=int, default=2, help="if ngpu=0, use cpu.")
parser.add_argument("--epoch", type=int, default=100, help="finetune epoch") parser.add_argument("--epoch", type=int, default=100, help="finetune epoch")
parser.add_argument( parser.add_argument(
"--batch_size", "--finetune_config",
type=int, type=str,
default=-1, default="./finetune.yaml",
help="batch size, default -1 means same as pretrained model") help="Path to finetune config file")
args = parser.parse_args() args = parser.parse_args()
@ -147,8 +156,14 @@ if __name__ == '__main__':
    with open(config_file) as f:
        config = CfgNode(yaml.safe_load(f))
    config.max_epoch = config.max_epoch + args.epoch
-    if args.batch_size > 0:
-        config.batch_size = args.batch_size
+    with open(args.finetune_config) as f2:
+        finetune_config = CfgNode(yaml.safe_load(f2))
+    config.batch_size = finetune_config.batch_size if finetune_config.batch_size > 0 else config.batch_size
+    config.optimizer.learning_rate = finetune_config.learning_rate if finetune_config.learning_rate > 0 else config.optimizer.learning_rate
+    config.num_snapshots = finetune_config.num_snapshots if finetune_config.num_snapshots > 0 else config.num_snapshots
+    frozen_layers = finetune_config.frozen_layers
+    assert type(frozen_layers) == list, "frozen_layers should be set a list."

    if args.lang == 'en':
        lexicon_file = DICT_EN
@ -158,6 +173,13 @@ if __name__ == '__main__':
        mfa_phone_file = MFA_PHONE_ZH
    else:
        print('please input right lang!!')

    print(f"finetune max_epoch: {config.max_epoch}")
    print(f"finetune batch_size: {config.batch_size}")
    print(f"finetune learning_rate: {config.optimizer.learning_rate}")
    print(f"finetune num_snapshots: {config.num_snapshots}")
    print(f"finetune frozen_layers: {frozen_layers}")

    am_phone_file = pretrained_model_dir / "phone_id_map.txt"
    label_file = input_dir / "labels.txt"
@ -181,7 +203,8 @@ if __name__ == '__main__':
    generate_finetune_env(output_dir, pretrained_model_dir)

    # create a new args for training
-    train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir)
+    train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir,
+                           frozen_layers)

    # finetune models
    # dispatch

@ -0,0 +1,12 @@
###########################################################
# PARAS SETTING #
###########################################################
# Set to -1 to indicate that the parameter is the same as the pretrained model configuration
batch_size: -1
learning_rate: 0.0001 # learning rate
num_snapshots: -1
# frozen_layers should be a list
# if you don't need to freeze, set frozen_layers to []
frozen_layers: ["encoder", "duration_predictor"]

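A quick sanity check (a sketch, assuming the file above is saved as `finetune.yaml`) that it parses with the same loader `finetune.py` uses:

```python
import yaml
from yacs.config import CfgNode

with open("finetune.yaml") as f:
    cfg = CfgNode(yaml.safe_load(f))

assert isinstance(cfg.frozen_layers, list)
print(cfg.batch_size, cfg.learning_rate, cfg.num_snapshots, cfg.frozen_layers)
```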
@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
-import math
import os
from operator import itemgetter
from pathlib import Path
@ -211,9 +210,9 @@ def extract_feature(duration_file: str,
    mel_extractor, pitch_extractor, energy_extractor = get_extractor(config)

    wav_files = sorted(list((input_dir).rglob("*.wav")))
-    # split data into 3 sections, train: 80%, dev: 10%, test: 10%
-    num_train = math.ceil(len(wav_files) * 0.8)
-    num_dev = math.ceil(len(wav_files) * 0.1)
+    # split data into 3 sections, train: len(wav_files) - 2, dev: 1, test: 1
+    num_train = len(wav_files) - 2
+    num_dev = 1
    print(num_train, num_dev)
    train_wav_files = wav_files[:num_train]

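To make the new split rule concrete, a small worked sketch of both rules (the dev/test slicing below is an assumption based on the surrounding code, which only shows `train_wav_files`):

```python
import math

wav_files = [f"{i:04d}.wav" for i in range(50)]

# old rule: 80% / 10% / 10%
old_num_train = math.ceil(len(wav_files) * 0.8)  # 40
old_num_dev = math.ceil(len(wav_files) * 0.1)    # 5

# new rule: all but two files go to train, one to dev, one to test
num_train = len(wav_files) - 2                   # 48
num_dev = 1

train_wav_files = wav_files[:num_train]
dev_wav_files = wav_files[num_train:num_train + num_dev]
test_wav_files = wav_files[num_train + num_dev:]
assert list(map(len, (train_wav_files, dev_wav_files, test_wav_files))) == [48, 1, 1]
```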
@ -0,0 +1,178 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import shutil
from pathlib import Path
from typing import List
import jsonlines
import numpy as np
import paddle
from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_multi_spk_batch_fn
from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Evaluator
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Updater
from paddlespeech.t2s.training.extensions.snapshot import Snapshot
from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.optimizer import build_optimizers
from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer
def freeze_layer(model, layers: List[str]):
    """freeze layers

    Args:
        model (nn.Layer): model whose sub-layers will be frozen
        layers (List[str]): names of the sub-layers to freeze
    """
    for layer in layers:
        # e.g. layer == "encoder" resolves to model.encoder.parameters()
        for param in eval("model." + layer + ".parameters()"):
            param.trainable = False
def train_sp(args, config):
    # decides device type and whether to run in parallel
    # setup running environment correctly
    if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
        paddle.set_device("cpu")
    else:
        paddle.set_device("gpu")
        world_size = paddle.distributed.get_world_size()
        if world_size > 1:
            paddle.distributed.init_parallel_env()

    # set the random seed, it is a must for multiprocess training
    seed_everything(config.seed)

    print(
        f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
    )

    fields = [
        "text", "text_lengths", "speech", "speech_lengths", "durations",
        "pitch", "energy"
    ]
    converters = {"speech": np.load, "pitch": np.load, "energy": np.load}
    spk_num = None
    if args.speaker_dict is not None:
        print("multiple speaker fastspeech2!")
        collate_fn = fastspeech2_multi_spk_batch_fn
        with open(args.speaker_dict, 'rt') as f:
            spk_id = [line.strip().split() for line in f.readlines()]
        spk_num = len(spk_id)
        fields += ["spk_id"]
    elif args.voice_cloning:
        print("Training voice cloning!")
        collate_fn = fastspeech2_multi_spk_batch_fn
        fields += ["spk_emb"]
        converters["spk_emb"] = np.load
    else:
        print("single speaker fastspeech2!")
        collate_fn = fastspeech2_single_spk_batch_fn
    print("spk_num:", spk_num)

    # dataloader has been too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for training and validation
    with jsonlines.open(args.train_metadata, 'r') as reader:
        train_metadata = list(reader)
    train_dataset = DataTable(
        data=train_metadata,
        fields=fields,
        converters=converters, )
    with jsonlines.open(args.dev_metadata, 'r') as reader:
        dev_metadata = list(reader)
    dev_dataset = DataTable(
        data=dev_metadata,
        fields=fields,
        converters=converters, )

    # collate function and dataloader
    train_sampler = DistributedBatchSampler(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        drop_last=True)

    print("samplers done!")

    train_dataloader = DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        collate_fn=collate_fn,
        num_workers=config.num_workers)

    dev_dataloader = DataLoader(
        dev_dataset,
        shuffle=False,
        drop_last=False,
        batch_size=config.batch_size,
        collate_fn=collate_fn,
        num_workers=config.num_workers)
    print("dataloaders done!")

    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)

    odim = config.n_mels
    model = FastSpeech2(
        idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"])

    # freeze layer
    if args.frozen_layers != []:
        freeze_layer(model, args.frozen_layers)

    if world_size > 1:
        model = DataParallel(model)
    print("model done!")

    optimizer = build_optimizers(model, **config["optimizer"])
    print("optimizer done!")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    if dist.get_rank() == 0:
        config_name = args.config.split("/")[-1]
        # copy conf to output_dir
        shutil.copyfile(args.config, output_dir / config_name)

    updater = FastSpeech2Updater(
        model=model,
        optimizer=optimizer,
        dataloader=train_dataloader,
        output_dir=output_dir,
        **config["updater"])

    trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)

    evaluator = FastSpeech2Evaluator(
        model, dev_dataloader, output_dir=output_dir, **config["updater"])

    if dist.get_rank() == 0:
        trainer.extend(evaluator, trigger=(1, "epoch"))
        trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
        trainer.extend(
            Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
    trainer.run()

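As a toy illustration of `freeze_layer` (a sketch: `TinyModel` is hypothetical, but `param.trainable` is exactly the flag the function above flips):

```python
import paddle
from paddle import nn


class TinyModel(nn.Layer):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(4, 4)
        self.decoder = nn.Linear(4, 4)


model = TinyModel()
freeze_layer(model, ["encoder"])  # freeze_layer as defined in train.py above

assert all(not p.trainable for p in model.encoder.parameters())
assert all(p.trainable for p in model.decoder.parameters())
```

Frozen parameters are skipped during optimization, so only the remaining layers are updated while fine-tuning.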
@ -10,11 +10,12 @@ mfa_dir=./mfa_result
dump_dir=./dump
output_dir=./exp/default
lang=zh
-ngpu=2
+ngpu=1
finetune_config=./finetune.yaml
-ckpt=snapshot_iter_96600
+ckpt=snapshot_iter_96699
-gpus=0,1
+gpus=1

CUDA_VISIBLE_DEVICES=${gpus}
stage=0
stop_stage=100
@ -35,7 +36,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --output_dir=${output_dir} \
        --lang=${lang} \
        --ngpu=${ngpu} \
-        --epoch=100
+        --epoch=100 \
+        --finetune_config=${finetune_config}
fi
@ -54,7 +56,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../sentences.txt \
-        --output_dir=./test_e2e \
+        --output_dir=./test_e2e/ \
        --phones_dict=${dump_dir}/phone_id_map.txt \
        --speaker_dict=${dump_dir}/speaker_id_map.txt \
        --spk_id=0

@ -1,11 +1,10 @@
# ERNIE-SAT with VCTK dataset
ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as speech editing, personalized speech synthesis, and voice cloning.
## Model Framework
In ERNIE-SAT, we propose two innovations:
- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
- The joint mask learning of speech and text is used to realize the alignment of speech and text
<p align="center">
<img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />

@ -46,3 +46,10 @@ Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | 16 | 0.078918 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | 16 | 0.079080 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | 16 | 0.054401 |

| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | -1 | 0.050767 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | -1 | 0.061884 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | -1 | 0.062056 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | -1 | 0.052110 |

Some files were not shown because too many files have changed in this diff.