diff --git a/CHANGELOG.md b/CHANGELOG.md index 3178434c..6e8315e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,14 +1,29 @@ # Changelog -Date: 2022-1-19, Author: yt605155624. -Add features to: T2S: - - Add csmsc Tacotron2. +Date: 2022-1-29, Author: yt605155624. +Add features to: T2S: + - Update aishell3 vc0 with new Tacotron2. + - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1419 + +Date: 2022-1-29, Author: yt605155624. +Add features to: T2S: + - Add ljspeech Tacotron2. + - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1416 + +Date: 2022-1-24, Author: yt605155624. +Add features to: T2S: + - Add csmsc WaveRNN. + - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1379 + +Date: 2022-1-19, Author: yt605155624. +Add features to: T2S: + - Add csmsc Tacotron2. - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1314 Date: 2022-1-10, Author: Jackwaterveg. -Add features to: CLI: - - Support English (librispeech/asr1/transformer). +Add features to: CLI: + - Support English (librispeech/asr1/transformer). - Support choosing `decode_method` for conformer and transformer models. - Refactor the config, using the unified config. - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1297 @@ -16,8 +31,8 @@ Add features to: CLI: *** Date: 2022-1-17, Author: Jackwaterveg. -Add features to: CLI: - - Support deepspeech2 online/offline model(aishell). +Add features to: CLI: + - Support deepspeech2 online/offline model(aishell). - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1356 *** diff --git a/README.md b/README.md index 23124231..7dd568b0 100644 --- a/README.md +++ b/README.md @@ -317,14 +317,15 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r Acoustic Model - Tacotron2 - LJSpeech + Tacotron2 + LJSpeech / CSMSC - tacotron2-ljspeech + tacotron2-ljspeech / tacotron2-csmsc Transformer TTS + LJSpeech transformer-ljspeech @@ -344,7 +345,7 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r - Vocoder + Vocoder WaveFlow LJSpeech @@ -378,7 +379,14 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r HiFiGAN-csmsc - + + + WaveRNN + CSMSC + + WaveRNN-csmsc + + Voice Cloning GE2E @@ -416,7 +424,6 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r - Audio Classification ESC-50 @@ -440,7 +447,6 @@ PaddleSpeech supports a series of most popular models. 
They are summarized in [r - Punctuation Restoration IWLST2012_zh diff --git a/README_cn.md b/README_cn.md index 4ce4ade9..e7cbec7c 100644 --- a/README_cn.md +++ b/README_cn.md @@ -315,14 +315,15 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 声学模型 - Tacotron2 - LJSpeech + Tacotron2 + LJSpeech / CSMSC - tacotron2-ljspeech + tacotron2-ljspeech / tacotron2-csmsc Transformer TTS + LJSpeech transformer-ljspeech @@ -342,7 +343,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - 声码器 + 声码器 WaveFlow LJSpeech @@ -376,7 +377,14 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 HiFiGAN-csmsc - + + + WaveRNN + CSMSC + + WaveRNN-csmsc + + 声音克隆 GE2E @@ -415,8 +423,6 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - - 声音分类 ESC-50 @@ -440,7 +446,6 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - 标点恢复 IWLST2012_zh diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 23309d8e..3b37e169 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -1,3 +1,4 @@ + # Released Models ## Speech-to-Text Models @@ -32,7 +33,8 @@ Language Model | Training Data | Token-based | Size | Descriptions ### Acoustic Models Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size (static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: -Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)||| +Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip)||| +Tacotron2|CSMSC|[tacotron2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0)|[tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)|[tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip)|103MB| TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)||| SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)|12MB| FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB| @@ -52,6 +54,8 @@ Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeec |Multi Band MelGAN | CSMSC |[MB 
MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip)
[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) |8.2MB| Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | | HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB| +WaveRNN | CSMSC |[WaveRNN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc6)|[wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip)|[wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip)|18MB| + ### Voice Cloning Model Type | Dataset| Example Link | Pretrained Models diff --git a/docs/source/tts/quick_start_cn.md b/docs/source/tts/quick_start_cn.md index 39bf3d0a..37246e84 100644 --- a/docs/source/tts/quick_start_cn.md +++ b/docs/source/tts/quick_start_cn.md @@ -202,4 +202,4 @@ sf.write( audio_path, wav.numpy(), samplerate=fastspeech2_config.fs) -``` \ No newline at end of file +``` diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md index 21cd0aa2..29585eb4 100644 --- a/examples/aishell3/vc0/README.md +++ b/examples/aishell3/vc0/README.md @@ -1,4 +1,3 @@ - # Tacotron2 + AISHELL-3 Voice Cloning This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows: 1. Speaker Encoder: We use Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in `Tacotron2` because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e). @@ -17,7 +16,7 @@ mkdir data_aishell3 tar zxvf data_aishell3.tgz -C data_aishell3 ``` ### Get MFA Result and Extract -We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. +We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for Tacotron2, the durations of MFA are not needed here. You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. 
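If you would rather fetch and unpack the alignment archive from Python instead of the shell, a minimal sketch is shown below (the archive URL is the one linked above; the extraction path is just a placeholder).
```python
# Minimal sketch: download and unpack the AISHELL-3 MFA alignments linked above.
# The extraction directory is an assumption; put it wherever your run.sh expects.
import tarfile
import urllib.request

url = ("https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/"
       "aishell3_alignment_tone.tar.gz")
archive = "aishell3_alignment_tone.tar.gz"

urllib.request.urlretrieve(url, archive)   # download the alignment archive
with tarfile.open(archive, "r:gz") as tar:
    tar.extractall(".")                    # unpacks the aishell3_alignment_tone folder
```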
## Pretrained GE2E Model diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh index 9cdbe256..a37cd21e 100755 --- a/examples/aishell3/vc0/path.sh +++ b/examples/aishell3/vc0/path.sh @@ -9,5 +9,5 @@ export PYTHONDONTWRITEBYTECODE=1 export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} -MODEL=new_tacotron2 +MODEL=tacotron2 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md index 8a566089..04b83a5f 100644 --- a/examples/aishell3/vc1/README.md +++ b/examples/aishell3/vc1/README.md @@ -1,4 +1,3 @@ - # FastSpeech2 + AISHELL-3 Voice Cloning This example contains code used to train a [FastSpeech2](https://arxiv.org/abs/2006.04558) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows: 1. Speaker Encoder: We use Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in `FastSpeech2` because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e). diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md index b030a51c..0129329a 100644 --- a/examples/csmsc/tts0/README.md +++ b/examples/csmsc/tts0/README.md @@ -212,6 +212,8 @@ optional arguments: Pretrained Tacotron2 model with no silence in the edge of audios: - [tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip) +The static model can be downloaded here [tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip). 
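To sanity-check the downloaded static model, you can load it with the Paddle inference API. The sketch below is only an illustration: the `.pdmodel` / `.pdiparams` file names are assumptions about the unzipped `tacotron2_csmsc_static_0.2.0` folder, so adjust them to the actual contents of the archive.
```python
# Minimal sketch: load the exported static graph and list its inputs/outputs.
# Assumption: the archive unzips to tacotron2_csmsc_static_0.2.0/ containing
# tacotron2_csmsc.pdmodel and tacotron2_csmsc.pdiparams (verify after unzipping).
from paddle.inference import Config, create_predictor

config = Config("tacotron2_csmsc_static_0.2.0/tacotron2_csmsc.pdmodel",
                "tacotron2_csmsc_static_0.2.0/tacotron2_csmsc.pdiparams")
config.disable_gpu()                             # CPU is enough for a quick check
predictor = create_predictor(config)

print("inputs:", predictor.get_input_names())    # expected: phone id sequence
print("outputs:", predictor.get_output_names())  # expected: mel spectrogram
```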
+ Model | Step | eval/loss | eval/l1_loss | eval/mse_loss | eval/bce_loss| eval/attn_loss :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: diff --git a/examples/csmsc/tts0/local/synthesize_e2e.sh b/examples/csmsc/tts0/local/synthesize_e2e.sh index c957df87..79bb9f83 100755 --- a/examples/csmsc/tts0/local/synthesize_e2e.sh +++ b/examples/csmsc/tts0/local/synthesize_e2e.sh @@ -7,6 +7,7 @@ ckpt_name=$3 stage=0 stop_stage=0 +# TODO: tacotron2 动转静的结果没有静态图的响亮, 可能还是 decode 的时候某个函数动静不对齐 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ @@ -33,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ python3 ${BIN_DIR}/../synthesize_e2e.py \ - --am=fastspeech2_csmsc \ + --am=tacotron2_csmsc \ --am_config=${config_path} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ @@ -55,7 +56,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ python3 ${BIN_DIR}/../synthesize_e2e.py \ - --am=fastspeech2_csmsc \ + --am=tacotron2_csmsc \ --am_config=${config_path} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ @@ -76,7 +77,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ python3 ${BIN_DIR}/../synthesize_e2e.py \ - --am=fastspeech2_csmsc \ + --am=tacotron2_csmsc \ --am_config=${config_path} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ @@ -90,3 +91,24 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --inference_dir=${train_output_path}/inference \ --phones_dict=dump/phone_id_map.txt fi + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference +fi \ No newline at end of file diff --git a/examples/csmsc/tts0/path.sh b/examples/csmsc/tts0/path.sh index 9cdbe256..a37cd21e 100755 --- a/examples/csmsc/tts0/path.sh +++ b/examples/csmsc/tts0/path.sh @@ -9,5 +9,5 @@ export PYTHONDONTWRITEBYTECODE=1 export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} -MODEL=new_tacotron2 +MODEL=tacotron2 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/csmsc/tts0/run.sh b/examples/csmsc/tts0/run.sh index 86800920..8f06e933 100755 --- a/examples/csmsc/tts0/run.sh +++ b/examples/csmsc/tts0/run.sh @@ -35,3 +35,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # synthesize_e2e, vocoder is pwgan CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi 
+ +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model + CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1 +fi diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh index 0a4cf69b..35fcf251 100755 --- a/examples/csmsc/tts2/local/synthesize_e2e.sh +++ b/examples/csmsc/tts2/local/synthesize_e2e.sh @@ -92,3 +92,26 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones_dict=dump/phone_id_map.txt \ --tones_dict=dump/tone_id_map.txt fi + + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference +fi diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh index d1fadf77..44356e4b 100755 --- a/examples/csmsc/tts3/local/synthesize_e2e.sh +++ b/examples/csmsc/tts3/local/synthesize_e2e.sh @@ -102,9 +102,9 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ --voc=wavernn_csmsc \ - --voc_config=wavernn_test/default.yaml \ - --voc_ckpt=wavernn_test/snapshot_iter_5000.pdz \ - --voc_stat=wavernn_test/feats_stats.npy \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh index 5c394c9f..e1a149b6 100755 --- a/examples/csmsc/tts3/run.sh +++ b/examples/csmsc/tts3/run.sh @@ -36,3 +36,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model + CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1 +fi + diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md new file mode 100644 index 00000000..ba7ad619 --- /dev/null +++ b/examples/ljspeech/tts0/README.md @@ -0,0 +1,247 @@ +# Tacotron2 with LJSpeech-1.1 +This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/) + +## Dataset +### Download and Extract +Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/). + +### Get MFA Result and Extract +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for Tacotron2, the durations of MFA are not needed here. 
+You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. + +## Get Started +Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. +Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from a text file. + +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│ ├── norm +│ └── raw +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│ ├── norm +│ └── raw +└── train + ├── norm + ├── raw + └── speech_stats.npy +``` +The dataset is split into 3 parts, namely `train`, `dev`, and` test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, speaker, and the id of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + +Train a Tacotron2 model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG tacotron2 config file. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. + --phones-dict PHONES_DICT + phone vocabulary file. +``` +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. + +### Synthesizing +We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1) as the neural vocoder. +Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) and unzip it. 
+```bash +unzip pwg_ljspeech_ckpt_0.5.zip +``` +Parallel WaveGAN checkpoint contains files listed below. +```text +pwg_ljspeech_ckpt_0.5 +├── pwg_default.yaml # default config used to train parallel wavegan +├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan +└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +``` +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] + [--voice-cloning VOICE_CLONING] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--ngpu NGPU] + [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + Choose acoustic model type of tts task. + --am_config AM_CONFIG + Config of acoustic model. Use deault config when it is + None. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --tones_dict TONES_DICT + tone vocabulary file. + --speaker_dict SPEAKER_DICT + speaker id map file. + --voice-cloning VOICE_CLONING + whether training voice cloning model. + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + Choose vocoder type of tts task. + --voc_config VOC_CONFIG + Config of voc. Use deault config when it is None. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. + --ngpu NGPU if ngpu == 0, use cpu. + --test_metadata TEST_METADATA + test metadata. + --output_dir OUTPUT_DIR + output dir. +``` +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file. 
+```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize_e2e.py [-h] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--tones_dict TONES_DICT] + [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--lang LANG] + [--inference_dir INFERENCE_DIR] [--ngpu NGPU] + [--text TEXT] [--output_dir OUTPUT_DIR] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + Choose acoustic model type of tts task. + --am_config AM_CONFIG + Config of acoustic model. Use deault config when it is + None. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --tones_dict TONES_DICT + tone vocabulary file. + --speaker_dict SPEAKER_DICT + speaker id map file. + --spk_id SPK_ID spk id for multi speaker acoustic model + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc} + Choose vocoder type of tts task. + --voc_config VOC_CONFIG + Config of voc. Use deault config when it is None. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. + --lang LANG Choose model language. zh or en + --inference_dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. + --text TEXT text to synthesize, a 'utt_id sentence' pair per line. + --output_dir OUTPUT_DIR + output dir. +``` +1. `--am` is acoustic model type with the format {model_name}_{dataset} +2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. +3. `--voc` is vocoder type with the format {model_name}_{dataset} +4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +5. `--lang` is the model language, which can be `zh` or `en`. +6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. +7. `--text` is the text file, which contains sentences to synthesize. +8. `--output_dir` is the directory to save synthesized audio files. +9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + + +## Pretrained Model +Pretrained Tacotron2 model with no silence in the edge of audios: +- [tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip) + + +Model | Step | eval/loss | eval/l1_loss | eval/mse_loss | eval/bce_loss| eval/attn_loss +:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: +default| 1(gpu) x 60300|0.554092|0.394260|0.141046|0.018747|3.8e-05| + +Tacotron2 checkpoint contains files listed below. 
+```text +tacotron2_ljspeech_ckpt_0.2.0 +├── default.yaml # default config used to train Tacotron2 +├── phone_id_map.txt # phone vocabulary file when training Tacotron2 +├── snapshot_iter_60300.pdz # model parameters and optimizer states +└── speech_stats.npy # statistics used to normalize spectrogram when training Tacotron2 +``` +You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained Tacotron2 and parallel wavegan models. +```bash +source path.sh + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=tacotron2_ljspeech \ + --am_config=tacotron2_ljspeech_ckpt_0.2.0/default.yaml \ + --am_ckpt=tacotron2_ljspeech_ckpt_0.2.0/snapshot_iter_60300.pdz \ + --am_stat=tacotron2_ljspeech_ckpt_0.2.0/speech_stats.npy \ + --voc=pwgan_ljspeech\ + --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \ + --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ + --lang=en \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output_dir=exp/default/test_e2e \ + --phones_dict=tacotron2_ljspeech_ckpt_0.2.0/phone_id_map.txt +``` diff --git a/examples/ljspeech/tts0/local/synthesize_e2e.sh b/examples/ljspeech/tts0/local/synthesize_e2e.sh new file mode 100755 index 00000000..73dfff60 --- /dev/null +++ b/examples/ljspeech/tts0/local/synthesize_e2e.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +# TODO: dygraph to static graph is not good for tacotron2_ljspeech now +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=tacotron2_ljspeech \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_ljspeech \ + --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \ + --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ + --lang=en \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + # --inference_dir=${train_output_path}/inference \ No newline at end of file diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh index 9cdbe256..a37cd21e 100755 --- a/examples/ljspeech/tts0/path.sh +++ b/examples/ljspeech/tts0/path.sh @@ -9,5 +9,5 @@ export PYTHONDONTWRITEBYTECODE=1 export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} -MODEL=new_tacotron2 +MODEL=tacotron2 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index f3602c34..f5e919c0 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -1,4 +1,4 @@ -# FastSpeech2 with the LJSpeech-1.1 +# FastSpeech2 with LJSpeech-1.1 This example contains code used to train a [Fastspeech2](https://arxiv.org/abs/2006.04558) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/). 
## Dataset diff --git a/paddleaudio/features/core.py b/paddleaudio/features/core.py index d3c2e290..01925ec6 100644 --- a/paddleaudio/features/core.py +++ b/paddleaudio/features/core.py @@ -415,11 +415,11 @@ def mfcc(x, **kwargs) # librosa mfcc: - spect = librosa.feature.melspectrogram(x,sr=16000,n_fft=512, + spect = librosa.feature.melspectrogram(y=x,sr=16000,n_fft=512, win_length=512, hop_length=320, n_mels=64, fmin=50) - b = librosa.feature.mfcc(x, + b = librosa.feature.mfcc(y=x, sr=16000, S=spect, n_mfcc=20, diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index 988fd627..889cd349 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True): x = np.stack( [ librosa.istft( - y=x[:, ch].T, # [Time, Freq] -> [Freq, Time] + stft_matrix=x[:, ch].T, # [Time, Freq] -> [Freq, Time] hop_length=n_shift, win_length=win_length, window=window, diff --git a/paddlespeech/t2s/audio/audio.py b/paddlespeech/t2s/audio/audio.py index ab9a45d3..59ea8c87 100644 --- a/paddlespeech/t2s/audio/audio.py +++ b/paddlespeech/t2s/audio/audio.py @@ -53,8 +53,8 @@ class AudioProcessor(object): def _create_mel_filter(self): mel_filter = librosa.filters.mel( - self.sample_rate, - self.n_fft, + sr=self.sample_rate, + n_fft=self.n_fft, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax) diff --git a/paddlespeech/t2s/datasets/common.py b/paddlespeech/t2s/datasets/common.py index d6fa3a84..122a35ae 100644 --- a/paddlespeech/t2s/datasets/common.py +++ b/paddlespeech/t2s/datasets/common.py @@ -38,7 +38,7 @@ class AudioSegmentDataset(Dataset): def __getitem__(self, i): fpath = self.file_paths[i] - y, sr = librosa.load(fpath, self.sr) + y, sr = librosa.load(fpath, sr=self.sr) y, _ = librosa.effects.trim(y, top_db=self.top_db) y = librosa.util.normalize(y) y = y.astype(np.float32) @@ -70,7 +70,7 @@ class AudioDataset(Dataset): def __getitem__(self, i): fpath = self.file_paths[i] - y, sr = librosa.load(fpath, self.sr) + y, sr = librosa.load(fpath, sr=self.sr) y, _ = librosa.effects.trim(y, top_db=self.top_db) y = librosa.util.normalize(y) y = y.astype(np.float32) diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index d6dd7af1..1c42a87c 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -38,9 +38,9 @@ model_alias = { "fastspeech2_inference": "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", "tacotron2": - "paddlespeech.t2s.models.new_tacotron2:Tacotron2", + "paddlespeech.t2s.models.tacotron2:Tacotron2", "tacotron2_inference": - "paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference", + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", # voc "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 0b95a883..75c631b8 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -39,9 +39,9 @@ model_alias = { "fastspeech2_inference": "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", "tacotron2": - "paddlespeech.t2s.models.new_tacotron2:Tacotron2", + "paddlespeech.t2s.models.tacotron2:Tacotron2", "tacotron2_inference": - "paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference", + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", # voc "pwgan": 
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", @@ -229,6 +229,11 @@ def evaluate(args): output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) merge_sentences = False + # Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph + # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022) + if am_name == 'tacotron2': + merge_sentences = True + for utt_id, sentence in sentences: get_tone_ids = False if am_name == 'speedyspeech': diff --git a/paddlespeech/t2s/exps/new_tacotron2/__init__.py b/paddlespeech/t2s/exps/tacotron2/__init__.py similarity index 100% rename from paddlespeech/t2s/exps/new_tacotron2/__init__.py rename to paddlespeech/t2s/exps/tacotron2/__init__.py diff --git a/paddlespeech/t2s/exps/new_tacotron2/normalize.py b/paddlespeech/t2s/exps/tacotron2/normalize.py similarity index 100% rename from paddlespeech/t2s/exps/new_tacotron2/normalize.py rename to paddlespeech/t2s/exps/tacotron2/normalize.py diff --git a/paddlespeech/t2s/exps/new_tacotron2/preprocess.py b/paddlespeech/t2s/exps/tacotron2/preprocess.py similarity index 100% rename from paddlespeech/t2s/exps/new_tacotron2/preprocess.py rename to paddlespeech/t2s/exps/tacotron2/preprocess.py diff --git a/paddlespeech/t2s/exps/new_tacotron2/train.py b/paddlespeech/t2s/exps/tacotron2/train.py similarity index 97% rename from paddlespeech/t2s/exps/new_tacotron2/train.py rename to paddlespeech/t2s/exps/tacotron2/train.py index a77331e7..bf4c4e01 100644 --- a/paddlespeech/t2s/exps/new_tacotron2/train.py +++ b/paddlespeech/t2s/exps/tacotron2/train.py @@ -30,9 +30,9 @@ from yacs.config import CfgNode from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_multi_spk_batch_fn from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_single_spk_batch_fn from paddlespeech.t2s.datasets.data_table import DataTable -from paddlespeech.t2s.models.new_tacotron2 import Tacotron2 -from paddlespeech.t2s.models.new_tacotron2 import Tacotron2Evaluator -from paddlespeech.t2s.models.new_tacotron2 import Tacotron2Updater +from paddlespeech.t2s.models.tacotron2 import Tacotron2 +from paddlespeech.t2s.models.tacotron2 import Tacotron2Evaluator +from paddlespeech.t2s.models.tacotron2 import Tacotron2Updater from paddlespeech.t2s.training.extensions.snapshot import Snapshot from paddlespeech.t2s.training.extensions.visualizer import VisualDL from paddlespeech.t2s.training.optimizer import build_optimizers diff --git a/paddlespeech/t2s/exps/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning.py index d6733a94..3de30774 100644 --- a/paddlespeech/t2s/exps/voice_cloning.py +++ b/paddlespeech/t2s/exps/voice_cloning.py @@ -34,9 +34,9 @@ model_alias = { "fastspeech2_inference": "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", "tacotron2": - "paddlespeech.t2s.models.new_tacotron2:Tacotron2", + "paddlespeech.t2s.models.tacotron2:Tacotron2", "tacotron2_inference": - "paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference", + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", # voc "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", diff --git a/paddlespeech/t2s/exps/wavernn/synthesize.py b/paddlespeech/t2s/exps/wavernn/synthesize.py index 61723e03..4357b282 100644 --- a/paddlespeech/t2s/exps/wavernn/synthesize.py +++ b/paddlespeech/t2s/exps/wavernn/synthesize.py @@ -31,7 +31,7 @@ from paddlespeech.t2s.models.wavernn import WaveRNN def main(): parser = argparse.ArgumentParser(description="Synthesize with WaveRNN.") - 
parser.add_argument("--config", type=str, help="GANVocoder config file.") + parser.add_argument("--config", type=str, help="Vocoder config file.") parser.add_argument("--checkpoint", type=str, help="snapshot to load.") parser.add_argument("--test-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/wavernn/train.py b/paddlespeech/t2s/exps/wavernn/train.py index aec745f7..c9a6ba01 100644 --- a/paddlespeech/t2s/exps/wavernn/train.py +++ b/paddlespeech/t2s/exps/wavernn/train.py @@ -179,7 +179,7 @@ def train_sp(args, config): def main(): # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser(description="Train a HiFiGAN model.") + parser = argparse.ArgumentParser(description="Train a WaveRNN model.") parser.add_argument( "--config", type=str, help="config file to overwrite default config.") parser.add_argument("--train-metadata", type=str, help="training data.") diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py index 25413871..a488a6fc 100644 --- a/paddlespeech/t2s/frontend/phonectic.py +++ b/paddlespeech/t2s/frontend/phonectic.py @@ -83,11 +83,6 @@ class English(Phonetics): return phonemes def _p2id(self, phonemes: List[str]) -> np.array: - # replace unk phone with sp - phonemes = [ - phn if (phn in self.vocab_phones and phn not in self.punc) else "sp" - for phn in phonemes - ] phone_ids = [self.vocab_phones[item] for item in phonemes] return np.array(phone_ids, np.int64) @@ -102,6 +97,12 @@ class English(Phonetics): # remove start_symbol and end_symbol phones = phones[1:-1] phones = [phn for phn in phones if not phn.isspace()] + # replace unk phone with sp + phones = [ + phn + if (phn in self.vocab_phones and phn not in self.punc) else "sp" + for phn in phones + ] phones_list.append(phones) if merge_sentences: diff --git a/paddlespeech/t2s/models/__init__.py b/paddlespeech/t2s/models/__init__.py index 3b90a414..41be7c1d 100644 --- a/paddlespeech/t2s/models/__init__.py +++ b/paddlespeech/t2s/models/__init__.py @@ -14,9 +14,9 @@ from .fastspeech2 import * from .hifigan import * from .melgan import * -from .new_tacotron2 import * from .parallel_wavegan import * from .speedyspeech import * +from .tacotron2 import * from .transformer_tts import * from .waveflow import * from .wavernn import * diff --git a/paddlespeech/t2s/models/new_tacotron2/__init__.py b/paddlespeech/t2s/models/tacotron2/__init__.py similarity index 100% rename from paddlespeech/t2s/models/new_tacotron2/__init__.py rename to paddlespeech/t2s/models/tacotron2/__init__.py diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py b/paddlespeech/t2s/models/tacotron2/tacotron2.py similarity index 100% rename from paddlespeech/t2s/models/new_tacotron2/tacotron2.py rename to paddlespeech/t2s/models/tacotron2/tacotron2.py diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py similarity index 100% rename from paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py rename to paddlespeech/t2s/models/tacotron2/tacotron2_updater.py diff --git a/paddlespeech/t2s/modules/tacotron2/decoder.py b/paddlespeech/t2s/modules/tacotron2/decoder.py index 3622fd7a..0cfe0b84 100644 --- a/paddlespeech/t2s/modules/tacotron2/decoder.py +++ b/paddlespeech/t2s/modules/tacotron2/decoder.py @@ -395,9 +395,6 @@ class Decoder(nn.Layer): iunits, odim * reduction_factor, bias_attr=False) self.prob_out = nn.Linear(iunits, 
                                  reduction_factor)
 
-        # initialize
-        # self.apply(decoder_init)
-
     def _zero_state(self, hs):
         init_hs = paddle.zeros([paddle.shape(hs)[0], self.lstm[0].hidden_size])
         return init_hs
@@ -558,8 +555,11 @@ class Decoder(nn.Layer):
         assert len(paddle.shape(h)) == 2
         hs = h.unsqueeze(0)
         ilens = paddle.shape(h)[0]
-        maxlen = int(paddle.shape(h)[0] * maxlenratio)
-        minlen = int(paddle.shape(h)[0] * minlenratio)
+        # maxlen and minlen were originally wrapped in int(); removed here to avoid dygraph-to-static issues
+        maxlen = paddle.shape(h)[0] * maxlenratio
+        minlen = paddle.shape(h)[0] * minlenratio
+        # threshold was originally used as a plain Python scalar; converted to a tensor here to avoid dygraph-to-static issues
+        threshold = paddle.ones([1]) * threshold
 
         # initialize hidden states of decoder
         c_list = [self._zero_state(hs)]
@@ -645,11 +645,27 @@ class Decoder(nn.Layer):
             if use_att_constraint:
                 last_attended_idx = int(att_w.argmax())
 
+            # the tacotron2 ljspeech dygraph-to-static issue is probably caused by prob >= threshold not being evaluated correctly here
             if prob >= threshold or idx >= maxlen:
                 # check mininum length
                 if idx < minlen:
                     continue
                 break
+            """
+            With only the block at lines 665~667 uncommented, dygraph-to-static hangs, while the dynamic graph still generates audio correctly, which shows the model itself is fine.
+            With both the 665~667 and 668~670 blocks uncommented, dygraph-to-static no longer hangs, but the generated audio ends with extra noise.
+            This shows the converted graph never takes the prob >= threshold branch, while the static graph can take prob >= threshold and exit the loop.
+            After conversion the loop only exits via idx >= maxlen (so without that logic it loops forever, i.e. hangs);
+            it stops when the maximum length is exceeded rather than when the model decides to stop, so the synthesized audio ends with a long stretch of extra predicted noise.
+            After conversion the loop can exit on the condition prob <= threshold (the result is wrong, of course), which shows the type of the condition operands is not the problem; prob itself may be the issue.
+            """
+            # if prob >= threshold:
+            #     print("prob >= threshold")
+            #     break
+            # elif idx >= maxlen:
+            #     print("idx >= maxlen")
+            #     break
+
         # (1, odim, L)
         outs = paddle.concat(outs, axis=2)
         if self.postnet is not None:
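To make the intent of the notes above concrete, here is a minimal, self-contained sketch (not the PaddleSpeech `Decoder` itself) of the pattern the patch uses: keeping `maxlen` and the stop `threshold` as tensors so the stop check is built from tensor ops that dygraph-to-static can trace. As the notes say, the exported graph still does not stop correctly, so this illustrates the pattern rather than a complete fix.
```python
# Minimal sketch of the tensorized stop condition described in the comments above.
# All values are fake; only the shapes and dtypes matter.
import paddle

h = paddle.randn([37, 80])          # fake encoder output; only its length matters
maxlenratio = 10.0

# no int() around maxlen: it stays a tensor, as in the patched Decoder.inference
maxlen = paddle.cast(paddle.shape(h)[0], 'float32') * maxlenratio
# lift the Python float threshold into a 1-element tensor
threshold = paddle.ones([1]) * 0.5

idx = paddle.zeros([1])             # decoding step counter, also kept as a tensor
prob = paddle.to_tensor([0.7])      # pretend stop-token probability from prob_out

should_stop = paddle.logical_or(prob >= threshold, idx >= maxlen)
print(bool(should_stop))            # True: prob already exceeds the threshold
```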