# Finetune your own AM based on FastSpeech2 with a multi-speaker dataset
This example shows how to finetune your own AM based on FastSpeech2 with a multi-speaker dataset. For finetuning with Chinese data, we use part of csmsc's data (the first 200 utterances) and a FastSpeech2 model pretrained on AISHELL-3. For finetuning with English data, we use part of ljspeech's data (the first 200 utterances) and a FastSpeech2 model pretrained on VCTK. The example is implemented according to this [discussion](https://github.com/PaddlePaddle/PaddleSpeech/discussions/1842). Thanks to the developer for the idea.
For more information on training FastSpeech2 with AISHELL-3, you can refer to [examples/aishell3/tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3). For more information on training FastSpeech2 with VCTK, you can refer to [examples/vctk/tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3).
## Prepare
### Download Pretrained Models
Assume the path to the model is `./pretrained_models`.</br>
If you want to finetune with Chinese data, download the FastSpeech2 model pretrained on AISHELL-3, [fastspeech2_aishell3_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_1.1.0.zip), for finetuning, and the HiFiGAN model pretrained on AISHELL-3, [hifigan_aishell3_ckpt_0.2.0](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip), for synthesis.
```bash
mkdir -p pretrained_models && cd pretrained_models
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_1.1.0.zip
unzip fastspeech2_aishell3_ckpt_1.1.0.zip
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip
unzip hifigan_aishell3_ckpt_0.2.0.zip
cd ../
```
If you want to finetune with English data, download the FastSpeech2 model pretrained on VCTK, [fastspeech2_vctk_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_ckpt_1.2.0.zip), for finetuning, and the HiFiGAN model pretrained on VCTK, [hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip), for synthesis.
```bash
mkdir -p pretrained_models && cd pretrained_models
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_ckpt_1.2.0.zip
unzip fastspeech2_vctk_ckpt_1.2.0.zip
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip
unzip hifigan_vctk_ckpt_0.2.0.zip
cd ../
```
### Download MFA tools and pretrained models
Assume the path to the MFA tool is `./tools`. Download [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz).</br>
If you want to finetune with Chinese data, download the MFA models pretrained on AISHELL-3, [aishell3_model.zip](https://paddlespeech.bj.bcebos.com/MFA/ernie_sat/aishell3_model.zip), and unzip it.</br>
If you want to finetune with English data, download the MFA models pretrained on VCTK, [vctk_model.zip](https://paddlespeech.bj.bcebos.com/MFA/ernie_sat/vctk_model.zip), and unzip it.
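The downloads above can be fetched and unpacked, for example, as follows (a sketch assuming `wget`, `tar`, and `unzip` are available; the resulting layout matches the directory tree shown later):
```bash
mkdir -p tools && cd tools
# MFA tool
wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz
tar xvf montreal-forced-aligner_linux.tar.gz
# pretrained aligner models: aishell3_model.zip for Chinese, vctk_model.zip for English
mkdir -p aligner && cd aligner
wget https://paddlespeech.bj.bcebos.com/MFA/ernie_sat/aishell3_model.zip
unzip aishell3_model.zip
wget https://paddlespeech.bj.bcebos.com/MFA/ernie_sat/vctk_model.zip
unzip vctk_model.zip
cd ../../
```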
### Prepare your data
Assume the path to the dataset is `./input`, which contains a speaker folder. The speaker folder contains audio files (*.wav) and a label file (labels.txt). The audio files are in wav format, and each line of the label file has the format: utt_id|pronunciation.</br>
If you want to finetune with Chinese data, a Chinese label line looks like: 000001|ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1</br>
Here is an example of the first 200 data of csmsc.
```bash
mkdir -p input && cd input
# download and unzip the csmsc_mini data (wav files plus labels.txt) into this folder, then:
cd ../
```
When "Prepare" is done, the structure of the current directory is listed below.
```text
├── input
│ ├── csmsc_mini
│ │ ├── 000001.wav
│ │ ├── 000002.wav
│ │ ├── 000003.wav
│ │ ├── ...
│ │ ├── labels.txt
│ └── csmsc_mini.zip
├── pretrained_models
│ ├── fastspeech2_aishell3_ckpt_1.1.0
│ │ ├── default.yaml
│ │ ├── energy_stats.npy
│ │ ├── phone_id_map.txt
│ │ ├── pitch_stats.npy
│ │ ├── snapshot_iter_96400.pdz
│ │ ├── speaker_id_map.txt
│ │ └── speech_stats.npy
│ ├── fastspeech2_aishell3_ckpt_1.1.0.zip
│ ├── hifigan_aishell3_ckpt_0.2.0
│ │ ├── default.yaml
│ │ ├── feats_stats.npy
│ │ └── snapshot_iter_2500000.pdz
│ └── hifigan_aishell3_ckpt_0.2.0.zip
└── tools
├── aligner
│ ├── aishell3_model
│ └── aishell3_model.zip
├── montreal-forced-aligner
│ ├── bin
│ ├── lib
│ └── pretrained_models
└── montreal-forced-aligner_linux.tar.gz
...
```
If you want to finetune with English data, an English label line looks like: LJ001-0001|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition</br>
Here is an example of the first 200 data of ljspeech.
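A minimal layout sketch for the input folder follows; the download source of `ljspeech_mini.zip` is not given in this section, so only the placement is shown:
```bash
mkdir -p input && cd input
# place and unzip ljspeech_mini.zip here (wav files plus labels.txt), then:
cd ../
```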
When "Prepare" done. The structure of the current directory is listed below.
```text
├── input
│ ├── ljspeech_mini
│ │ ├── LJ001-0001.wav
│ │ ├── LJ001-0002.wav
│ │ ├── LJ001-0003.wav
│ │ ├── ...
│ │ ├── LJ002-0014.wav
│ │ ├── labels.txt
│ └── ljspeech_mini.zip
├── pretrained_models
│ ├── fastspeech2_vctk_ckpt_1.2.0
│ │ ├── default.yaml
│ │ ├── energy_stats.npy
│ │ ├── phone_id_map.txt
│ │ ├── pitch_stats.npy
│ │ ├── snapshot_iter_66200.pdz
│ │ ├── speaker_id_map.txt
│ │ └── speech_stats.npy
│ ├── fastspeech2_vctk_ckpt_1.2.0.zip
│ ├── hifigan_vctk_ckpt_0.2.0
│ │ ├── default.yaml
│ │ ├── feats_stats.npy
│ │ └── snapshot_iter_2500000.pdz
│ └── hifigan_vctk_ckpt_0.2.0.zip
└── tools
├── aligner
│ ├── vctk_model
│ ├── vctk_model.zip
│ └── cmudict-0.7b
├── montreal-forced-aligner
│ ├── bin
│ ├── lib
│ └── pretrained_models
└── montreal-forced-aligner_linux.tar.gz
...
```
### Set finetune.yaml
`conf/finetune.yaml` contains some configurations for fine-tuning. You can try various options to get better results. The value of `frozen_layers` can be changed according to `conf/fastspeech2_layers.txt`, which lists the model layers of fastspeech2.
Arguments:
- `batch_size`: finetune batch size, which should be less than or equal to the number of training samples. Default: -1, which means 64, the same as the pretrained model.
- `learning_rate`: learning rate. Default: 0.0001
- `num_snapshots`: number of saved models. Default: -1, which means 5, the same as the pretrained model.
- `frozen_layers`: frozen layers, which must be a list. If you don't want to freeze any layer, set it to [].
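Putting these together, `conf/finetune.yaml` might look like the sketch below; the `frozen_layers` value here is only an illustration, pick layer names from `conf/fastspeech2_layers.txt`:
```yaml
# sketch of conf/finetune.yaml; numeric values follow the defaults described above
batch_size: -1        # -1 -> use 64, the same as the pretrained model
learning_rate: 0.0001
num_snapshots: -1     # -1 -> keep 5 checkpoints, the same as the pretrained model
frozen_layers: ["encoder", "duration_predictor"]   # [] -> do not freeze any layer
```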
## Get Started
To finetune with Chinese data, execute `./run.sh`; to finetune with English data, execute `./run_en.sh`.</br>
Run the command below to
1. **source path**.
2. finetune the model.
3. synthesize wavs.
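Both entry scripts run all stages in one command, e.g. for Chinese data:
```bash
./run.sh          # use ./run_en.sh for English data instead
```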
You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to run only one stage.
Finetune a FastSpeech2 model.
```bash
./run.sh --stage 0 --stop-stage 5
```
`stage 5` of `run.sh` calls `local/finetune.py`. Its main arguments are listed below.
1. `--pretrained_model_dir` is the directory including the pretrained fastspeech2 model (fastspeech2_aishell3 or fastspeech2_vctk).
2. `--dump_dir` is the directory including audio features and metadata.
3. `--output_dir` is the directory to save the finetuned model.
4. `--ngpu` is the number of gpus to use; if ngpu == 0, use cpu.
5. `--epoch` is the number of finetuning epochs.
6. `--finetune_config` is the path to the finetune config file.
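For reference, a direct invocation assembled from these arguments might look roughly like the sketch below; the dump/output paths and the epoch value are illustrative, not necessarily the defaults used inside `run.sh`:
```bash
python3 local/finetune.py \
    --pretrained_model_dir=./pretrained_models/fastspeech2_aishell3_ckpt_1.1.0 \
    --dump_dir=./dump \
    --output_dir=./exp/default \
    --ngpu=1 \
    --epoch=100 \
    --finetune_config=./conf/finetune.yaml
```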
### Synthesizing
To synthesize Chinese audio, we use [HiFiGAN with aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc5) as the neural vocoder. Assume the path to the hifigan model is `./pretrained_models`; download the pretrained HiFiGAN model [hifigan_aishell3_ckpt_0.2.0](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip) and unzip it, as described in "Prepare".</br>
To synthesize English audio, we use [HiFiGAN with vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc5) as the neural vocoder. Assume the path to the hifigan model is `./pretrained_models`; download the pretrained HiFiGAN model [hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip) and unzip it, as described in "Prepare".</br>
The synthesis stage uses the following arguments:
1. `--am` is acoustic model type with the format {model_name}_{dataset}
2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict`, `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the hifigan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--text` is the text file, which contains sentences to synthesize.
7. `--output_dir` is the directory to save synthesized audio files.
8. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
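For reference, a Chinese synthesis command assembled from these arguments might look roughly like the sketch below. The entry script (`synthesize_e2e.py`, as used in the tts3 examples linked above, with `BIN_DIR` pointing to PaddleSpeech's fastspeech2 recipe directory), the extra `--spk_id` flag, the finetuned checkpoint name, the text file, and the output directory are assumptions rather than values taken from `run.sh`:
```bash
python3 ${BIN_DIR}/../synthesize_e2e.py \
    --am=fastspeech2_aishell3 \
    --am_config=pretrained_models/fastspeech2_aishell3_ckpt_1.1.0/default.yaml \
    --am_ckpt=exp/default/checkpoints/snapshot_iter_xxx.pdz \
    --am_stat=pretrained_models/fastspeech2_aishell3_ckpt_1.1.0/speech_stats.npy \
    --phones_dict=pretrained_models/fastspeech2_aishell3_ckpt_1.1.0/phone_id_map.txt \
    --speaker_dict=pretrained_models/fastspeech2_aishell3_ckpt_1.1.0/speaker_id_map.txt \
    --spk_id=0 \
    --voc=hifigan_aishell3 \
    --voc_config=pretrained_models/hifigan_aishell3_ckpt_0.2.0/default.yaml \
    --voc_ckpt=pretrained_models/hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
    --voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
    --lang=zh \
    --text=sentences.txt \
    --output_dir=./test_e2e \
    --ngpu=1
```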
### Tips
If you want to get better audio quality, you can use more audio data for finetuning or change the configuration parameters in `conf/finetune.yaml`.</br>
More finetune results can be found on [finetune-fastspeech2-for-csmsc](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html#finetune-fastspeech2-for-csmsc).</br>
The results on csmsc_mini show: Freeze encoder > Non Frozen > Freeze encoder && duration_predictor.