From c1d45510553105d3eef3652cf30d69385ecb9e8b Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 24 Aug 2022 18:34:11 +0800 Subject: [PATCH] add ernie sat synthesize_e2e, test=tts (#2287) --- examples/aishell3/README.md | 1 + examples/aishell3/ernie_sat/README.md | 152 ++++++- examples/aishell3/ernie_sat/conf/default.yaml | 9 +- .../aishell3/ernie_sat/local/synthesize.sh | 23 +- .../ernie_sat/local/synthesize_e2e.sh | 52 +++ examples/aishell3/ernie_sat/local/train.sh | 4 +- examples/aishell3/ernie_sat/run.sh | 6 +- examples/aishell3_vctk/README.md | 1 + examples/aishell3_vctk/ernie_sat/README.md | 162 +++++++- .../aishell3_vctk/ernie_sat/conf/default.yaml | 9 +- .../ernie_sat/local/synthesize.sh | 23 +- .../ernie_sat/local/synthesize_e2e.sh | 53 +++ .../aishell3_vctk/ernie_sat/local/train.sh | 2 +- examples/aishell3_vctk/ernie_sat/run.sh | 6 +- examples/vctk/README.md | 1 + examples/vctk/ernie_sat/README.md | 153 ++++++- examples/vctk/ernie_sat/conf/default.yaml | 7 +- examples/vctk/ernie_sat/local/synthesize.sh | 26 +- .../vctk/ernie_sat/local/synthesize_e2e.sh | 52 +++ examples/vctk/ernie_sat/local/train.sh | 2 +- examples/vctk/ernie_sat/run.sh | 6 +- paddlespeech/t2s/exps/ernie_sat/align.py | 15 +- .../t2s/exps/ernie_sat/synthesize_e2e.py | 379 ++++++++++++------ paddlespeech/t2s/exps/syn_utils.py | 4 + .../t2s/models/ernie_sat/ernie_sat.py | 4 +- 25 files changed, 937 insertions(+), 215 deletions(-) create mode 100755 examples/aishell3/ernie_sat/local/synthesize_e2e.sh create mode 100755 examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh create mode 100755 examples/vctk/ernie_sat/local/synthesize_e2e.sh diff --git a/examples/aishell3/README.md b/examples/aishell3/README.md index 191974dec..e022cef42 100644 --- a/examples/aishell3/README.md +++ b/examples/aishell3/README.md @@ -10,3 +10,4 @@ * voc3 - MultiBand MelGAN * vc0 - Tacotron2 Voice Cloning with GE2E * vc1 - FastSpeech2 Voice Cloning with GE2E +* ernie_sat - ERNIE-SAT diff --git a/examples/aishell3/ernie_sat/README.md b/examples/aishell3/ernie_sat/README.md index 8086d007c..707ee1381 100644 --- a/examples/aishell3/ernie_sat/README.md +++ b/examples/aishell3/ernie_sat/README.md @@ -1 +1,151 @@ -# ERNIE SAT with AISHELL3 dataset +# ERNIE-SAT with AISHELL3 dataset + +ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型,其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景,该项目供研究使用。 + +## 模型框架 +ERNIE-SAT 中我们提出了两项创新: +- 在预训练过程中将中英双语对应的音素作为输入,实现了跨语言、个性化的软音素映射 +- 采用语言和语音的联合掩码学习实现了语言和语音的对齐 + +

+ +## Dataset +### Download and Extract +Download AISHELL-3 from it's [Official Website](http://www.aishelltech.com/aishell_3) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/data_aishell3`. + +### Get MFA Result and Extract +We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. +You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. + +## Get Started +Assume the path to the dataset is `~/datasets/data_aishell3`. +Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from text file. + +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│ ├── norm +│ └── raw +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│ ├── norm +│ └── raw +└── train + ├── norm + ├── raw + └── speech_stats.npy +``` +The dataset is split into 3 parts, namely `train`, `dev`, and` test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, speaker, and id of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. + +### Synthesizing +We use [HiFiGAN](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc5) as the neural vocoder. + +Download pretrained HiFiGAN model from [hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip) and unzip it. +```bash +unzip hifigan_aishell3_ckpt_0.2.0.zip +``` +HiFiGAN checkpoint contains files listed below. +```text +hifigan_aishell3_ckpt_0.2.0 +├── default.yaml # default config used to train HiFiGAN +├── feats_stats.npy # statistics used to normalize spectrogram when training HiFiGAN +└── snapshot_iter_2500000.pdz # generator parameters of HiFiGAN +``` +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. 
+```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +## Speech Synthesis and Speech Editing +### Prepare +**prepare aligner** +```bash +mkdir -p tools/aligner +cd tools +# download MFA +wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz +# extract MFA +tar xvf montreal-forced-aligner_linux.tar.gz +# fix .so of MFA +cd montreal-forced-aligner/lib +ln -snf libpython3.6m.so.1.0 libpython3.6m.so +cd - +# download align models and dicts +cd aligner +wget https://paddlespeech.bj.bcebos.com/MFA/ernie_sat/aishell3_model.zip +wget https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/simple.lexicon +wget https://paddlespeech.bj.bcebos.com/MFA/ernie_sat/vctk_model.zip +wget https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/cmudict-0.7b +cd ../../ +``` +**prepare pretrained FastSpeech2 models** + +ERNIE-SAT use FastSpeech2 as phoneme duration predictor: +```bash +mkdir download +cd download +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip +unzip fastspeech2_conformer_baker_ckpt_0.5.zip +unzip fastspeech2_nosil_ljspeech_ckpt_0.5.zip +cd ../ +``` +**prepare source data** +```bash +mkdir source +cd source +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/SSB03540307.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/SSB03540428.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/LJ050-0278.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/p243_313.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/p299_096.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/this_was_not_the_show_for_me.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/README.md +cd ../ +``` + +You can check the text of downloaded wavs in `source/README.md`. +### Speech Synthesis and Speech Editing +```bash +./run.sh --stage 3 --stop-stage 3 --gpus 0 +``` +`stage 3` of `run.sh` calls `local/synthesize_e2e.sh`, `stage 0` of it is **Speech Synthesis** and `stage 1` of it is **Speech Editing**. + +You can modify `--wav_path`、`--old_str` and `--new_str` yourself, `--old_str` should be the text corresponding to the audio of `--wav_path`, `--new_str` should be designed according to `--task_name`, both `--source_lang` and `--target_lang` should be `zh` for model trained with AISHELL3 dataset. +## Pretrained Model +Pretrained ErnieSAT model: +- [erniesat_aishell3_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_aishell3_ckpt_1.2.0.zip) + +Model | Step | eval/mlm_loss | eval/loss +:-------------:| :------------:| :-----: | :-----: +default| 8(gpu) x 289500|51.723782|51.723782 diff --git a/examples/aishell3/ernie_sat/conf/default.yaml b/examples/aishell3/ernie_sat/conf/default.yaml index fdc767fb0..dbd5c467e 100644 --- a/examples/aishell3/ernie_sat/conf/default.yaml +++ b/examples/aishell3/ernie_sat/conf/default.yaml @@ -1,3 +1,6 @@ +# This configuration tested on 8 GPUs (A100) with 80GB GPU memory. 
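+# On fewer or smaller GPUs, the previous defaults (batch_size: 20, num_workers: 2)
+# may be a safer starting point.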
+# It takes around 3 days to finish the training,You can adjust +# batch_size、num_workers here and ngpu in local/train.sh for your machine ########################################################### # FEATURE EXTRACTION SETTING # ########################################################### @@ -21,8 +24,8 @@ mlm_prob: 0.8 ########################################################### # DATA SETTING # ########################################################### -batch_size: 20 -num_workers: 2 +batch_size: 40 +num_workers: 8 ########################################################### # MODEL SETTING # @@ -280,4 +283,4 @@ token_list: - o3 - iang5 - ei5 -- \ No newline at end of file +- diff --git a/examples/aishell3/ernie_sat/local/synthesize.sh b/examples/aishell3/ernie_sat/local/synthesize.sh index 3e907427c..8b4178f13 100755 --- a/examples/aishell3/ernie_sat/local/synthesize.sh +++ b/examples/aishell3/ernie_sat/local/synthesize.sh @@ -4,28 +4,11 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -stage=1 -stop_stage=1 - -# pwgan -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - FLAGS_allocator_strategy=naive_best_fit \ - FLAGS_fraction_of_gpu_memory_to_use=0.01 \ - python3 ${BIN_DIR}/synthesize.py \ - --erniesat_config=${config_path} \ - --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --erniesat_stat=dump/train/speech_stats.npy \ - --voc=pwgan_aishell3 \ - --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \ - --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ - --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt -fi +stage=0 +stop_stage=0 # hifigan -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ python3 ${BIN_DIR}/synthesize.py \ diff --git a/examples/aishell3/ernie_sat/local/synthesize_e2e.sh b/examples/aishell3/ernie_sat/local/synthesize_e2e.sh new file mode 100755 index 000000000..b33e8ca09 --- /dev/null +++ b/examples/aishell3/ernie_sat/local/synthesize_e2e.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + echo 'speech synthesize !' + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/synthesize_e2e.py \ + --task_name=synthesize \ + --wav_path=source/SSB03540307.wav\ + --old_str='请播放歌曲小苹果。' \ + --new_str='歌曲真好听。' \ + --source_lang=zh \ + --target_lang=zh \ + --erniesat_config=${config_path} \ + --phones_dict=dump/phone_id_map.txt \ + --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --erniesat_stat=dump/train/speech_stats.npy \ + --voc=hifigan_aishell3 \ + --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \ + --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \ + --output_name=exp/pred_gen.wav +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo 'speech edit !' 
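+    # Speech editing: only the span where new_str differs from old_str is
+    # re-generated; the rest of the original recording is kept unchanged.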
+ FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/synthesize_e2e.py \ + --task_name=edit \ + --wav_path=source/SSB03540428.wav \ + --old_str='今天天气很好' \ + --new_str='今天心情很好' \ + --source_lang=zh \ + --target_lang=zh \ + --erniesat_config=${config_path} \ + --phones_dict=dump/phone_id_map.txt \ + --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --erniesat_stat=dump/train/speech_stats.npy \ + --voc=hifigan_aishell3 \ + --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \ + --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \ + --output_name=exp/pred_edit.wav +fi diff --git a/examples/aishell3/ernie_sat/local/train.sh b/examples/aishell3/ernie_sat/local/train.sh index 30720e8f5..829310832 100755 --- a/examples/aishell3/ernie_sat/local/train.sh +++ b/examples/aishell3/ernie_sat/local/train.sh @@ -8,5 +8,5 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file + --ngpu=8 \ + --phones-dict=dump/phone_id_map.txt diff --git a/examples/aishell3/ernie_sat/run.sh b/examples/aishell3/ernie_sat/run.sh index d75a19f23..cb354de41 100755 --- a/examples/aishell3/ernie_sat/run.sh +++ b/examples/aishell3/ernie_sat/run.sh @@ -9,7 +9,7 @@ stop_stage=100 conf_path=conf/default.yaml train_output_path=exp/default -ckpt_name=snapshot_iter_153.pdz +ckpt_name=snapshot_iter_289500.pdz # with the following command, you can choose the stage range you want to run # such as `./run.sh --stage 0 --stop-stage 0` @@ -30,3 +30,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # synthesize, vocoder is pwgan CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/aishell3_vctk/README.md b/examples/aishell3_vctk/README.md index 330b25934..12213da2a 100644 --- a/examples/aishell3_vctk/README.md +++ b/examples/aishell3_vctk/README.md @@ -1 +1,2 @@ # Mixed Chinese and English TTS with AISHELL3 and VCTK datasets +* ernie_sat - ERNIE-SAT diff --git a/examples/aishell3_vctk/ernie_sat/README.md b/examples/aishell3_vctk/ernie_sat/README.md index 1c6bbe230..777bea326 100644 --- a/examples/aishell3_vctk/ernie_sat/README.md +++ b/examples/aishell3_vctk/ernie_sat/README.md @@ -1 +1,161 @@ -# ERNIE SAT with AISHELL3 and VCTK dataset +# ERNIE-SAT with AISHELL3 and VCTK dataset + +ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型,其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景,该项目供研究使用。 + +## 模型框架 +ERNIE-SAT 中我们提出了两项创新: +- 在预训练过程中将中英双语对应的音素作为输入,实现了跨语言、个性化的软音素映射 +- 采用语言和语音的联合掩码学习实现了语言和语音的对齐 + +

+ +## Dataset +### Download and Extract +Download all datasets and extract it to `~/datasets`: +- The aishell3 dataset is in the directory `~/datasets/data_aishell3` +- The vctk dataset is in the directory `~/datasets/VCTK-Corpus-0.92` + +### Get MFA Result and Extract +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for the fastspeech2 training. +You can download from here: +- [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz) +- [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz) + +Or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. + +## Get Started +Assume the paths to the datasets are: +- `~/datasets/data_aishell3` +- `~/datasets/VCTK-Corpus-0.92` +Assume the path to the MFA results of the datasets are: +- `./aishell3_alignment_tone` +- `./vctk_alignment` +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from text file. + +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│ ├── norm +│ └── raw +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│ ├── norm +│ └── raw +└── train + ├── norm + ├── raw + └── speech_stats.npy +``` +The dataset is split into 3 parts, namely `train`, `dev`, and` test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, speaker, and id of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. + +### Synthesizing +We use [HiFiGAN](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc5) as the neural vocoder. + +Download pretrained HiFiGAN model from [hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip) and unzip it. +```bash +unzip hifigan_aishell3_ckpt_0.2.0.zip +``` +HiFiGAN checkpoint contains files listed below. +```text +hifigan_aishell3_ckpt_0.2.0 +├── default.yaml # default config used to train HiFiGAN +├── feats_stats.npy # statistics used to normalize spectrogram when training HiFiGAN +└── snapshot_iter_2500000.pdz # generator parameters of HiFiGAN +``` +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. 
+```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +## Speech Synthesis and Speech Editing +### Prepare + +**prepare aligner** +```bash +mkdir -p tools/aligner +cd tools +# download MFA +wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz +# extract MFA +tar xvf montreal-forced-aligner_linux.tar.gz +# fix .so of MFA +cd montreal-forced-aligner/lib +ln -snf libpython3.6m.so.1.0 libpython3.6m.so +cd - +# download align models and dicts +cd aligner +wget https://paddlespeech.bj.bcebos.com/MFA/ernie_sat/aishell3_model.zip +wget https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/simple.lexicon +wget https://paddlespeech.bj.bcebos.com/MFA/ernie_sat/vctk_model.zip +wget https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/cmudict-0.7b +cd ../../ +``` +**prepare pretrained FastSpeech2 models** + +ERNIE-SAT use FastSpeech2 as phoneme duration predictor: +```bash +mkdir download +cd download +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip +unzip fastspeech2_conformer_baker_ckpt_0.5.zip +unzip fastspeech2_nosil_ljspeech_ckpt_0.5.zip +cd ../ +``` +**prepare source data** +```bash +mkdir source +cd source +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/SSB03540307.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/SSB03540428.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/LJ050-0278.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/p243_313.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/p299_096.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/this_was_not_the_show_for_me.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/README.md +cd ../ +``` +You can check the text of downloaded wavs in `source/README.md`. +### Cross Language Voice Cloning +```bash +./run.sh --stage 3 --stop-stage 3 --gpus 0 +``` +`stage 3` of `run.sh` calls `local/synthesize_e2e.sh`. + +You can modify `--wav_path`、`--old_str` and `--new_str` yourself, `--old_str` should be the text corresponding to the audio of `--wav_path`, `--new_str` should be designed according to `--task_name`, `--source_lang` and `--target_lang` should be different in this example. +## Pretrained Model +Pretrained ErnieSAT model: +- [erniesat_aishell3_vctk_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_aishell3_vctk_ckpt_1.2.0.zip) + +Model | Step | eval/text_mlm_loss | eval/mlm_loss | eval/loss +:-------------:| :------------:| :-----: | :-----:| :-----: +default| 8(gpu) x 489000|0.000001|52.477642 |52.477642 diff --git a/examples/aishell3_vctk/ernie_sat/conf/default.yaml b/examples/aishell3_vctk/ernie_sat/conf/default.yaml index abb69fcc0..efbdd456d 100644 --- a/examples/aishell3_vctk/ernie_sat/conf/default.yaml +++ b/examples/aishell3_vctk/ernie_sat/conf/default.yaml @@ -1,3 +1,6 @@ +# This configuration tested on 8 GPUs (A100) with 80GB GPU memory. 
+# It takes around 4 days to finish the training,You can adjust +# batch_size、num_workers here and ngpu in local/train.sh for your machine ########################################################### # FEATURE EXTRACTION SETTING # ########################################################### @@ -21,8 +24,8 @@ mlm_prob: 0.8 ########################################################### # DATA SETTING # ########################################################### -batch_size: 20 -num_workers: 2 +batch_size: 40 +num_workers: 8 ########################################################### # MODEL SETTING # @@ -79,7 +82,7 @@ grad_clip: 1.0 ########################################################### # TRAINING SETTING # ########################################################### -max_epoch: 700 +max_epoch: 1500 num_snapshots: 50 ########################################################### diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh index 3e907427c..8b4178f13 100755 --- a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh +++ b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh @@ -4,28 +4,11 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -stage=1 -stop_stage=1 - -# pwgan -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - FLAGS_allocator_strategy=naive_best_fit \ - FLAGS_fraction_of_gpu_memory_to_use=0.01 \ - python3 ${BIN_DIR}/synthesize.py \ - --erniesat_config=${config_path} \ - --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --erniesat_stat=dump/train/speech_stats.npy \ - --voc=pwgan_aishell3 \ - --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \ - --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ - --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt -fi +stage=0 +stop_stage=0 # hifigan -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ python3 ${BIN_DIR}/synthesize.py \ diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh new file mode 100755 index 000000000..c30af6e85 --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh @@ -0,0 +1,53 @@ +# not ready yet +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + echo 'speech cross language from en to zh !' + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/synthesize_e2e.py \ + --task_name=synthesize \ + --wav_path=source/p243_313.wav \ + --old_str='For that reason cover should not be given.' \ + --new_str='今天天气很好' \ + --source_lang=en \ + --target_lang=zh \ + --erniesat_config=${config_path} \ + --phones_dict=dump/phone_id_map.txt \ + --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --erniesat_stat=dump/train/speech_stats.npy \ + --voc=hifigan_aishell3 \ + --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \ + --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \ + --output_name=exp/pred_clone_en_zh.wav +fi +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo 'speech cross language from zh to en !' 
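+    # Cross-lingual voice cloning: the Chinese prompt wav provides the speaker
+    # timbre while new_str is English, so source_lang (zh) and target_lang (en) differ.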
+ FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/synthesize_e2e.py \ + --task_name=synthesize \ + --wav_path=source/SSB03540307.wav \ + --old_str='请播放歌曲小苹果。' \ + --new_str="Thank you!" \ + --source_lang=zh \ + --target_lang=en \ + --erniesat_config=${config_path} \ + --phones_dict=dump/phone_id_map.txt \ + --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --erniesat_stat=dump/train/speech_stats.npy \ + --voc=hifigan_aishell3 \ + --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \ + --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \ + --output_name=exp/pred_clone_zh_en.wav +fi + diff --git a/examples/aishell3_vctk/ernie_sat/local/train.sh b/examples/aishell3_vctk/ernie_sat/local/train.sh index 30720e8f5..526aac435 100755 --- a/examples/aishell3_vctk/ernie_sat/local/train.sh +++ b/examples/aishell3_vctk/ernie_sat/local/train.sh @@ -8,5 +8,5 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --ngpu=2 \ + --ngpu=8 \ --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/run.sh b/examples/aishell3_vctk/ernie_sat/run.sh index d75a19f23..5509fc4ad 100755 --- a/examples/aishell3_vctk/ernie_sat/run.sh +++ b/examples/aishell3_vctk/ernie_sat/run.sh @@ -9,7 +9,7 @@ stop_stage=100 conf_path=conf/default.yaml train_output_path=exp/default -ckpt_name=snapshot_iter_153.pdz +ckpt_name=snapshot_iter_489000.pdz # with the following command, you can choose the stage range you want to run # such as `./run.sh --stage 0 --stop-stage 0` @@ -30,3 +30,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # synthesize, vocoder is pwgan CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/vctk/README.md b/examples/vctk/README.md index ac5fd24f8..41163dbe7 100644 --- a/examples/vctk/README.md +++ b/examples/vctk/README.md @@ -9,3 +9,4 @@ * voc1 - Parallel WaveGAN * voc2 - MelGAN * voc3 - MultiBand MelGAN +* ernie_sat - ERNIE-SAT diff --git a/examples/vctk/ernie_sat/README.md b/examples/vctk/ernie_sat/README.md index 055e7903d..0a2f9359e 100644 --- a/examples/vctk/ernie_sat/README.md +++ b/examples/vctk/ernie_sat/README.md @@ -1 +1,152 @@ -# ERNIE SAT with VCTK dataset +# ERNIE-SAT with VCTK dataset + +ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型,其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景,该项目供研究使用。 + +## 模型框架 +ERNIE-SAT 中我们提出了两项创新: +- 在预训练过程中将中英双语对应的音素作为输入,实现了跨语言、个性化的软音素映射 +- 采用语言和语音的联合掩码学习实现了语言和语音的对齐 + +

+ +## Dataset +### Download and Extract the dataset +Download VCTK-0.92 from it's [Official Website](https://datashare.ed.ac.uk/handle/10283/3443) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/VCTK-Corpus-0.92`. + +### Get MFA Result and Extract +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. +You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. +ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/mfa/local/reorganize_vctk.py)): +1. `p315`, because of no text for it. +2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them. + +## Get Started +Assume the path to the dataset is `~/datasets/VCTK-Corpus-0.92`. +Assume the path to the MFA result of VCTK is `./vctk_alignment`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from text file. +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│ ├── norm +│ └── raw +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│ ├── norm +│ └── raw +└── train + ├── norm + ├── raw + └── speech_stats.npy +``` +The dataset is split into 3 parts, namely `train`, `dev`, and` test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, speaker, and id of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. + +### Synthesizing +We use [HiFiGAN](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc5) as the neural vocoder. + +Download pretrained HiFiGAN model from [hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip) and unzip it. +```bash +unzip hifigan_vctk_ckpt_0.2.0.zip +``` +HiFiGAN checkpoint contains files listed below. +```text +hifigan_vctk_ckpt_0.2.0 +├── default.yaml # default config used to train HiFiGAN +├── feats_stats.npy # statistics used to normalize spectrogram when training HiFiGAN +└── snapshot_iter_2500000.pdz # generator parameters of HiFiGAN +``` +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. 
+```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +## Speech Synthesis and Speech Editing +### Prepare +**prepare aligner** +```bash +mkdir -p tools/aligner +cd tools +# download MFA +wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz +# extract MFA +tar xvf montreal-forced-aligner_linux.tar.gz +# fix .so of MFA +cd montreal-forced-aligner/lib +ln -snf libpython3.6m.so.1.0 libpython3.6m.so +cd - +# download align models and dicts +cd aligner +wget https://paddlespeech.bj.bcebos.com/MFA/ernie_sat/aishell3_model.zip +wget https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/simple.lexicon +wget https://paddlespeech.bj.bcebos.com/MFA/ernie_sat/vctk_model.zip +wget https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/cmudict-0.7b +cd ../../ +``` +**prepare pretrained FastSpeech2 models** + +ERNIE-SAT use FastSpeech2 as phoneme duration predictor: +```bash +mkdir download +cd download +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip +unzip fastspeech2_conformer_baker_ckpt_0.5.zip +unzip fastspeech2_nosil_ljspeech_ckpt_0.5.zip +cd ../ +``` +**prepare source data** +```bash +mkdir source +cd source +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/SSB03540307.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/SSB03540428.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/LJ050-0278.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/p243_313.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/p299_096.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/this_was_not_the_show_for_me.wav +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/source/README.md +cd ../ +``` +You can check the text of downloaded wavs in `source/README.md`. +### Speech Synthesis and Speech Editing +```bash +./run.sh --stage 3 --stop-stage 3 --gpus 0 +``` +`stage 3` of `run.sh` calls `local/synthesize_e2e.sh`, `stage 0` of it is **Speech Synthesis** and `stage 1` of it is **Speech Editing**. + +You can modify `--wav_path`、`--old_str` and `--new_str` yourself, `--old_str` should be the text corresponding to the audio of `--wav_path`, `--new_str` should be designed according to `--task_name`, both `--source_lang` and `--target_lang` should be `en` for model trained with VCTK dataset. +## Pretrained Model +Pretrained ErnieSAT model: +- [erniesat_vctk_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_vctk_ckpt_1.2.0.zip) + +Model | Step | eval/mlm_loss | eval/loss +:-------------:| :------------:| :-----: | :-----: +default| 8(gpu) x 199500|57.622215|57.622215 diff --git a/examples/vctk/ernie_sat/conf/default.yaml b/examples/vctk/ernie_sat/conf/default.yaml index 672f937ef..88b3d376d 100644 --- a/examples/vctk/ernie_sat/conf/default.yaml +++ b/examples/vctk/ernie_sat/conf/default.yaml @@ -1,3 +1,6 @@ +# This configuration tested on 8 GPUs (A100) with 80GB GPU memory. 
+# It takes around 2 days to finish the training,You can adjust +# batch_size、num_workers here and ngpu in local/train.sh for your machine ########################################################### # FEATURE EXTRACTION SETTING # ########################################################### @@ -21,8 +24,8 @@ mlm_prob: 0.8 ########################################################### # DATA SETTING # ########################################################### -batch_size: 20 -num_workers: 2 +batch_size: 40 +num_workers: 8 ########################################################### # MODEL SETTING # diff --git a/examples/vctk/ernie_sat/local/synthesize.sh b/examples/vctk/ernie_sat/local/synthesize.sh index b24db018a..5667f30f8 100755 --- a/examples/vctk/ernie_sat/local/synthesize.sh +++ b/examples/vctk/ernie_sat/local/synthesize.sh @@ -4,31 +4,11 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -stage=1 -stop_stage=1 - -# use am to predict duration here -# 增加 am_phones_dict am_tones_dict 等,也可以用新的方式构造 am, 不需要这么多参数了就 - -# pwgan -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - FLAGS_allocator_strategy=naive_best_fit \ - FLAGS_fraction_of_gpu_memory_to_use=0.01 \ - python3 ${BIN_DIR}/synthesize.py \ - --erniesat_config=${config_path} \ - --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --erniesat_stat=dump/train/speech_stats.npy \ - --voc=pwgan_vctk \ - --voc_config=pwg_vctk_ckpt_0.1.1/default.yaml \ - --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \ - --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt -fi +stage=0 +stop_stage=0 # hifigan -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ python3 ${BIN_DIR}/synthesize.py \ diff --git a/examples/vctk/ernie_sat/local/synthesize_e2e.sh b/examples/vctk/ernie_sat/local/synthesize_e2e.sh new file mode 100755 index 000000000..fee540169 --- /dev/null +++ b/examples/vctk/ernie_sat/local/synthesize_e2e.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + echo 'speech synthesize !' + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/synthesize_e2e.py \ + --task_name=synthesize \ + --wav_path=source/p243_313.wav \ + --old_str='For that reason cover should not be given.' \ + --new_str='I love you very much do you love me' \ + --source_lang=en \ + --target_lang=en \ + --erniesat_config=${config_path} \ + --phones_dict=dump/phone_id_map.txt \ + --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --erniesat_stat=dump/train/speech_stats.npy \ + --voc=hifigan_vctk \ + --voc_config=hifigan_vctk_ckpt_0.2.0/default.yaml \ + --voc_ckpt=hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_vctk_ckpt_0.2.0/feats_stats.npy \ + --output_name=exp/pred_gen.wav +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo 'speech edit !' + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/synthesize_e2e.py \ + --task_name=edit \ + --wav_path=source/p243_313.wav \ + --old_str='For that reason cover should not be given.' \ + --new_str='For that reason cover is not impossible to be given.' 
\ + --source_lang=en \ + --target_lang=en \ + --erniesat_config=${config_path} \ + --phones_dict=dump/phone_id_map.txt \ + --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --erniesat_stat=dump/train/speech_stats.npy \ + --voc=hifigan_vctk \ + --voc_config=hifigan_vctk_ckpt_0.2.0/default.yaml \ + --voc_ckpt=hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_vctk_ckpt_0.2.0/feats_stats.npy \ + --output_name=exp/pred_edit.wav +fi diff --git a/examples/vctk/ernie_sat/local/train.sh b/examples/vctk/ernie_sat/local/train.sh index 30720e8f5..526aac435 100755 --- a/examples/vctk/ernie_sat/local/train.sh +++ b/examples/vctk/ernie_sat/local/train.sh @@ -8,5 +8,5 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --ngpu=2 \ + --ngpu=8 \ --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/vctk/ernie_sat/run.sh b/examples/vctk/ernie_sat/run.sh index d75a19f23..94d130d41 100755 --- a/examples/vctk/ernie_sat/run.sh +++ b/examples/vctk/ernie_sat/run.sh @@ -9,7 +9,7 @@ stop_stage=100 conf_path=conf/default.yaml train_output_path=exp/default -ckpt_name=snapshot_iter_153.pdz +ckpt_name=snapshot_iter_199500.pdz # with the following command, you can choose the stage range you want to run # such as `./run.sh --stage 0 --stop-stage 0` @@ -30,3 +30,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # synthesize, vocoder is pwgan CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/paddlespeech/t2s/exps/ernie_sat/align.py b/paddlespeech/t2s/exps/ernie_sat/align.py index 529a8221c..464f51a3b 100755 --- a/paddlespeech/t2s/exps/ernie_sat/align.py +++ b/paddlespeech/t2s/exps/ernie_sat/align.py @@ -19,9 +19,9 @@ import librosa import numpy as np import pypinyin from praatio import textgrid -from paddlespeech.t2s.exps.ernie_sat.utils import get_tmp_name -from paddlespeech.t2s.exps.ernie_sat.utils import get_dict +from paddlespeech.t2s.exps.ernie_sat.utils import get_dict +from paddlespeech.t2s.exps.ernie_sat.utils import get_tmp_name DICT_EN = 'tools/aligner/cmudict-0.7b' DICT_ZH = 'tools/aligner/simple.lexicon' @@ -30,6 +30,7 @@ MODEL_DIR_ZH = 'tools/aligner/aishell3_model.zip' MFA_PATH = 'tools/montreal-forced-aligner/bin' os.environ['PATH'] = MFA_PATH + '/:' + os.environ['PATH'] + def _get_max_idx(dic): return sorted([int(key.split('_')[0]) for key in dic.keys()])[-1] @@ -106,11 +107,11 @@ def alignment(wav_path: str, wav_name = os.path.basename(wav_path) utt = wav_name.split('.')[0] # prepare data for MFA - tmp_name = get_tmp_name(text=text) + tmp_name = get_tmp_name(text=text) tmpbase = './tmp_dir/' + tmp_name tmpbase = Path(tmpbase) tmpbase.mkdir(parents=True, exist_ok=True) - print("tmp_name in alignment:",tmp_name) + print("tmp_name in alignment:", tmp_name) shutil.copyfile(wav_path, tmpbase / wav_name) txt_name = utt + '.txt' @@ -340,7 +341,7 @@ def get_phns_spans(wav_path: str, if __name__ == '__main__': text = "For that reason cover should not be given." 
- phn, dur, word2phns = alignment("exp/p243_313.wav", text, lang='en') + phn, dur, word2phns = alignment("source/p243_313.wav", text, lang='en') print(phn, dur) print(word2phns) print("---------------------------------") @@ -352,7 +353,7 @@ if __name__ == '__main__': style=pypinyin.Style.TONE3, tone_sandhi=True) text_zh = " ".join(text_zh) - phn, dur, word2phns = alignment("exp/000001.wav", text_zh, lang='zh') + phn, dur, word2phns = alignment("source/000001.wav", text_zh, lang='zh') print(phn, dur) print(word2phns) print("---------------------------------") @@ -367,7 +368,7 @@ if __name__ == '__main__': print("---------------------------------") outs = get_phns_spans( - wav_path="exp/p243_313.wav", + wav_path="source/p243_313.wav", old_str="For that reason cover should not be given.", new_str="for that reason cover is impossible to be given.") diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py index 95b07367c..21c9ae044 100644 --- a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py @@ -11,35 +11,41 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import argparse +import os +from pathlib import Path +from typing import List + import librosa import numpy as np +import paddle +import pypinyin import soundfile as sf +import yaml +from pypinyin_dict.phrase_pinyin_data import large_pinyin +from yacs.config import CfgNode +from paddlespeech.t2s.datasets.am_batch_fn import build_erniesat_collate_fn +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.exps.ernie_sat.align import get_phns_spans from paddlespeech.t2s.exps.ernie_sat.utils import eval_durs from paddlespeech.t2s.exps.ernie_sat.utils import get_dur_adj_factor from paddlespeech.t2s.exps.ernie_sat.utils import get_span_bdy -from paddlespeech.t2s.datasets.am_batch_fn import build_erniesat_collate_fn -from paddlespeech.t2s.exps.syn_utils import get_frontend -from paddlespeech.t2s.datasets.get_feats import LogMelFBank -from paddlespeech.t2s.exps.syn_utils import norm from paddlespeech.t2s.exps.ernie_sat.utils import get_tmp_name +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_voc_inference +from paddlespeech.t2s.exps.syn_utils import norm +from paddlespeech.t2s.utils import str2bool +large_pinyin.load() - - - - -def _p2id(self, phonemes: List[str]) -> np.ndarray: +def _p2id(phonemes: List[str]) -> np.ndarray: # replace unk phone with sp - phonemes = [ - phn if phn in vocab_phones else "sp" for phn in phonemes - ] + phonemes = [phn if phn in vocab_phones else "sp" for phn in phonemes] phone_ids = [vocab_phones[item] for item in phonemes] return np.array(phone_ids, np.int64) - def prep_feats_with_dur(wav_path: str, old_str: str='', new_str: str='', @@ -67,12 +73,12 @@ def prep_feats_with_dur(wav_path: str, fs=fs, n_shift=n_shift) - mfa_start = phns_spans_outs["mfa_start"] - mfa_end = phns_spans_outs["mfa_end"] - old_phns = phns_spans_outs["old_phns"] - new_phns = phns_spans_outs["new_phns"] - span_to_repl = phns_spans_outs["span_to_repl"] - span_to_add = phns_spans_outs["span_to_add"] + mfa_start = phns_spans_outs['mfa_start'] + mfa_end = phns_spans_outs['mfa_end'] + old_phns = phns_spans_outs['old_phns'] + new_phns = phns_spans_outs['new_phns'] + span_to_repl = phns_spans_outs['span_to_repl'] + 
span_to_add = phns_spans_outs['span_to_add'] # 中文的 phns 不一定都在 fastspeech2 的字典里, 用 sp 代替 if target_lang in {'en', 'zh'}: @@ -132,7 +138,7 @@ def prep_feats_with_dur(wav_path: str, [wav_org[:wav_left_idx], blank_wav, wav_org[wav_right_idx:]]) # 音频是正常遮住了 - sf.write(str("new_wav.wav"), new_wav, samplerate=fs) + sf.write(str("mask_wav.wav"), new_wav, samplerate=fs) # 4. get old and new mel span to be mask old_span_bdy = get_span_bdy( @@ -152,8 +158,6 @@ def prep_feats_with_dur(wav_path: str, return outs - - def prep_feats(wav_path: str, old_str: str='', new_str: str='', @@ -163,7 +167,7 @@ def prep_feats(wav_path: str, fs: int=24000, n_shift: int=300): - outs = prep_feats_with_dur( + with_dur_outs = prep_feats_with_dur( wav_path=wav_path, old_str=old_str, new_str=new_str, @@ -176,138 +180,240 @@ def prep_feats(wav_path: str, wav_name = os.path.basename(wav_path) utt_id = wav_name.split('.')[0] - wav = outs['new_wav'] - phns = outs['new_phns'] - mfa_start = outs['new_mfa_start'] - mfa_end = outs['new_mfa_end'] - old_span_bdy = outs['old_span_bdy'] - new_span_bdy = outs['new_span_bdy'] + wav = with_dur_outs['new_wav'] + phns = with_dur_outs['new_phns'] + mfa_start = with_dur_outs['new_mfa_start'] + mfa_end = with_dur_outs['new_mfa_end'] + old_span_bdy = with_dur_outs['old_span_bdy'] + new_span_bdy = with_dur_outs['new_span_bdy'] span_bdy = np.array(new_span_bdy) - text = _p2id(phns) mel = mel_extractor.get_log_mel_fbank(wav) erniesat_mean, erniesat_std = np.load(erniesat_stat) normed_mel = norm(mel, erniesat_mean, erniesat_std) - tmp_name = get_tmp_name(text=old_str) + tmp_name = get_tmp_name(text=old_str) tmpbase = './tmp_dir/' + tmp_name tmpbase = Path(tmpbase) tmpbase.mkdir(parents=True, exist_ok=True) - print("tmp_name in synthesize_e2e:",tmp_name) mel_path = tmpbase / 'mel.npy' - print("mel_path:",mel_path) - np.save(mel_path, logmel) + np.save(mel_path, normed_mel) durations = [e - s for e, s in zip(mfa_end, mfa_start)] + text = _p2id(phns) - datum={ - "utt_id": utt_id, - "spk_id": 0, - "text": text, - "text_lengths": len(text), - "speech_lengths": 115, - "durations": durations, - "speech": mel_path, - "align_start": mfa_start, + datum = { + "utt_id": utt_id, + "spk_id": 0, + "text": text, + "text_lengths": len(text), + "speech_lengths": len(normed_mel), + "durations": durations, + "speech": np.load(mel_path), + "align_start": mfa_start, "align_end": mfa_end, "span_bdy": span_bdy } batch = collate_fn([datum]) - print("batch:",batch) - - return batch, old_span_bdy, new_span_bdy - - -def decode_with_model(mlm_model: nn.Layer, - collate_fn, - wav_path: str, - old_str: str='', - new_str: str='', - source_lang: str='en', - target_lang: str='en', - use_teacher_forcing: bool=False, - duration_adjust: bool=True, - fs: int=24000, - n_shift: int=300, - token_list: List[str]=[]): - batch, old_span_bdy, new_span_bdy = prep_feats( - source_lang=source_lang, - target_lang=target_lang, + outs = dict() + outs['batch'] = batch + outs['old_span_bdy'] = old_span_bdy + outs['new_span_bdy'] = new_span_bdy + return outs + + +def get_mlm_output(wav_path: str, + old_str: str='', + new_str: str='', + source_lang: str='en', + target_lang: str='en', + duration_adjust: bool=True, + fs: int=24000, + n_shift: int=300): + + prep_feats_outs = prep_feats( wav_path=wav_path, old_str=old_str, new_str=new_str, + source_lang=source_lang, + target_lang=target_lang, duration_adjust=duration_adjust, fs=fs, - n_shift=n_shift, - token_list=token_list) - - - - feats = collate_fn(batch)[1] + n_shift=n_shift) - if 'text_masked_pos' in 
feats.keys(): - feats.pop('text_masked_pos') + batch = prep_feats_outs['batch'] + new_span_bdy = prep_feats_outs['new_span_bdy'] + old_span_bdy = prep_feats_outs['old_span_bdy'] - output = mlm_model.inference( - text=feats['text'], - speech=feats['speech'], - masked_pos=feats['masked_pos'], - speech_mask=feats['speech_mask'], - text_mask=feats['text_mask'], - speech_seg_pos=feats['speech_seg_pos'], - text_seg_pos=feats['text_seg_pos'], - span_bdy=new_span_bdy, - use_teacher_forcing=use_teacher_forcing) + out_mels = erniesat_inference( + speech=batch['speech'], + text=batch['text'], + masked_pos=batch['masked_pos'], + speech_mask=batch['speech_mask'], + text_mask=batch['text_mask'], + speech_seg_pos=batch['speech_seg_pos'], + text_seg_pos=batch['text_seg_pos'], + span_bdy=new_span_bdy) # 拼接音频 - output_feat = paddle.concat(x=output, axis=0) + output_feat = paddle.concat(x=out_mels, axis=0) wav_org, _ = librosa.load(wav_path, sr=fs) - return wav_org, output_feat, old_span_bdy, new_span_bdy, fs, hop_length + outs = dict() + outs['wav_org'] = wav_org + outs['output_feat'] = output_feat + outs['old_span_bdy'] = old_span_bdy + outs['new_span_bdy'] = new_span_bdy + return outs -if __name__ == '__main__': - fs = 24000 - n_shift = 300 - wav_path = "exp/p243_313.wav" - old_str = "For that reason cover should not be given." - # for edit - # new_str = "for that reason cover is impossible to be given." - # for synthesize - append_str = "do you love me i love you so much" - new_str = old_str + append_str - ''' - outs = prep_feats_with_dur( +def get_wav(wav_path: str, + source_lang: str='en', + target_lang: str='en', + old_str: str='', + new_str: str='', + duration_adjust: bool=True, + fs: int=24000, + n_shift: int=300): + + outs = get_mlm_output( wav_path=wav_path, old_str=old_str, new_str=new_str, + source_lang=source_lang, + target_lang=target_lang, + duration_adjust=duration_adjust, fs=fs, n_shift=n_shift) - new_wav = outs['new_wav'] - new_phns = outs['new_phns'] - new_mfa_start = outs['new_mfa_start'] - new_mfa_end = outs['new_mfa_end'] + wav_org = outs['wav_org'] + output_feat = outs['output_feat'] old_span_bdy = outs['old_span_bdy'] new_span_bdy = outs['new_span_bdy'] - print("---------------------------------") + masked_feat = output_feat[new_span_bdy[0]:new_span_bdy[1]] + + with paddle.no_grad(): + alt_wav = voc_inference(masked_feat) + alt_wav = np.squeeze(alt_wav) + + old_time_bdy = [n_shift * x for x in old_span_bdy] + wav_replaced = np.concatenate( + [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]]) + + wav_dict = {"origin": wav_org, "output": wav_replaced} + return wav_dict + + +def parse_args(): + # parse args and config + parser = argparse.ArgumentParser( + description="Synthesize with acoustic model & vocoder") + # ernie sat + + parser.add_argument( + '--erniesat_config', + type=str, + default=None, + help='Config of acoustic model.') + parser.add_argument( + '--erniesat_ckpt', + type=str, + default=None, + help='Checkpoint file of acoustic model.') + parser.add_argument( + "--erniesat_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training acoustic model." 
+ ) + parser.add_argument( + "--phones_dict", type=str, default=None, help="phone vocabulary file.") + # vocoder + parser.add_argument( + '--voc', + type=str, + default='pwgan_csmsc', + choices=[ + 'pwgan_aishell3', + 'pwgan_vctk', + 'hifigan_aishell3', + 'hifigan_vctk', + ], + help='Choose vocoder type of tts task.') + parser.add_argument( + '--voc_config', type=str, default=None, help='Config of voc.') + parser.add_argument( + '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.') + parser.add_argument( + "--voc_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training voc." + ) + # other + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + # ernie sat related + parser.add_argument("--task_name", type=str, help="task name") + parser.add_argument("--wav_path", type=str, help="path of old wav") + parser.add_argument("--old_str", type=str, help="old string") + parser.add_argument("--new_str", type=str, help="new string") + parser.add_argument( + "--source_lang", type=str, default="en", help="source language") + parser.add_argument( + "--target_lang", type=str, default="en", help="target language") + parser.add_argument( + "--duration_adjust", + type=str2bool, + default=True, + help="whether to adjust duration.") + parser.add_argument("--output_name", type=str, default="output.wav") + + args = parser.parse_args() + return args - print("new_wav:", new_wav) - print("new_phns:", new_phns) - print("new_mfa_start:", new_mfa_start) - print("new_mfa_end:", new_mfa_end) - print("old_span_bdy:", old_span_bdy) - print("new_span_bdy:", new_span_bdy) - print("---------------------------------") - ''' - erniesat_config = "/home/yuantian01/PaddleSpeech_ERNIE_SAT/PaddleSpeech/examples/vctk/ernie_sat/local/default.yaml" +if __name__ == '__main__': + args = parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") - with open(erniesat_config) as f: + # evaluate(args) + with open(args.erniesat_config) as f: erniesat_config = CfgNode(yaml.safe_load(f)) - - erniesat_stat = "/home/yuantian01/PaddleSpeech_ERNIE_SAT/PaddleSpeech/examples/vctk/ernie_sat/dump/train/speech_stats.npy" + old_str = args.old_str + new_str = args.new_str + + # convert Chinese characters to pinyin + if args.source_lang == 'zh': + old_str = pypinyin.lazy_pinyin( + old_str, + neutral_tone_with_five=True, + style=pypinyin.Style.TONE3, + tone_sandhi=True) + old_str = ' '.join(old_str) + if args.target_lang == 'zh': + new_str = pypinyin.lazy_pinyin( + new_str, + neutral_tone_with_five=True, + style=pypinyin.Style.TONE3, + tone_sandhi=True) + new_str = ' '.join(new_str) + + if args.task_name == 'edit': + new_str = new_str + elif args.task_name == 'synthesize': + new_str = old_str + new_str + else: + new_str = old_str + new_str + print("new_str:", new_str) # Extractor mel_extractor = LogMelFBank( @@ -319,28 +425,51 @@ if __name__ == '__main__': n_mels=erniesat_config.n_mels, fmin=erniesat_config.fmin, fmax=erniesat_config.fmax) - - collate_fn = build_erniesat_collate_fn( mlm_prob=erniesat_config.mlm_prob, mean_phn_span=erniesat_config.mean_phn_span, seg_emb=erniesat_config.model['enc_input_layer'] == 'sega_mlm', text_masking=False) - - phones_dict='/home/yuantian01/PaddleSpeech_ERNIE_SAT/PaddleSpeech/examples/vctk/ernie_sat/dump/phone_id_map.txt' + vocab_phones = {} - with open(phones_dict, 'rt') as f: + with open(args.phones_dict, 'rt') as f: 
phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: vocab_phones[phn] = int(id) - prep_feats(wav_path=wav_path, - old_str=old_str, - new_str=new_str, - fs=fs, - n_shift=n_shift) - - - + # ernie sat model + erniesat_inference = get_am_inference( + am='erniesat_dataset', + am_config=erniesat_config, + am_ckpt=args.erniesat_ckpt, + am_stat=args.erniesat_stat, + phones_dict=args.phones_dict) + + with open(args.voc_config) as f: + voc_config = CfgNode(yaml.safe_load(f)) + + # vocoder + voc_inference = get_voc_inference( + voc=args.voc, + voc_config=voc_config, + voc_ckpt=args.voc_ckpt, + voc_stat=args.voc_stat) + + erniesat_stat = args.erniesat_stat + + wav_dict = get_wav( + wav_path=args.wav_path, + source_lang=args.source_lang, + target_lang=args.target_lang, + old_str=old_str, + new_str=new_str, + duration_adjust=args.duration_adjust, + fs=erniesat_config.fs, + n_shift=erniesat_config.n_shift) + + sf.write( + args.output_name, wav_dict['output'], samplerate=erniesat_config.fs) + print( + f"\033[1;32;m Generated audio saved into {args.output_name} ! \033[0m") diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 127e1a3ba..c8eb1c64a 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -82,6 +82,10 @@ def denorm(data, mean, std): return data * std + mean +def norm(data, mean, std): + return (data - mean) / std + + def get_chunks(data, block_size: int, pad_size: int): data_len = data.shape[1] chunks = [] diff --git a/paddlespeech/t2s/models/ernie_sat/ernie_sat.py b/paddlespeech/t2s/models/ernie_sat/ernie_sat.py index 54f5d542d..08c43dc5f 100644 --- a/paddlespeech/t2s/models/ernie_sat/ernie_sat.py +++ b/paddlespeech/t2s/models/ernie_sat/ernie_sat.py @@ -389,7 +389,7 @@ class MLM(nn.Layer): speech_seg_pos: paddle.Tensor, text_seg_pos: paddle.Tensor, span_bdy: List[int], - use_teacher_forcing: bool=False, ) -> List[paddle.Tensor]: + use_teacher_forcing: bool=True, ) -> List[paddle.Tensor]: ''' Args: speech (paddle.Tensor): input speech (1, Tmax, D). @@ -657,7 +657,7 @@ class ErnieSAT(nn.Layer): speech_seg_pos: paddle.Tensor, text_seg_pos: paddle.Tensor, span_bdy: List[int], - use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]: + use_teacher_forcing: bool=True, ) -> Dict[str, paddle.Tensor]: return self.model.inference( speech=speech, text=text,