From 3ce5dff460ec7561c092ee729731d11690ab9c82 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 26 Oct 2021 04:35:15 +0000 Subject: [PATCH] refactor parakeet examples --- deepspeech/decoders/scores/ngram.py | 5 +- deepspeech/exps/u2_kaldi/model.py | 5 +- .../frontend/featurizer/text_featurizer.py | 2 +- examples/aishell3/README.md | 11 +- examples/aishell3/tts0/run.sh | 0 .../aishell3 => aishell3/tts3}/README.md | 87 ++-- .../tts3}/conf/default.yaml | 0 .../tts3/local}/preprocess.sh | 14 +- .../tts3/local}/synthesize.sh | 13 +- .../tts3/local}/synthesize_e2e.sh | 15 +- .../run.sh => aishell3/tts3/local/train.sh} | 9 +- examples/aishell3/tts3/path.sh | 13 + examples/aishell3/tts3/run.sh | 33 ++ examples/aishell3/vc0/README.md | 89 ++++ .../images/alignment-step2000.png | Bin .../{local/tacotron2 => }/images/train.png | Bin .../{local/tacotron2 => }/images/valid.png | Bin examples/aishell3/vc0/local/preprocess.sh | 43 ++ .../aishell3/vc0/local/tacotron2/README_cn.md | 112 ----- .../vc0/local/tacotron2/voice_cloning.ipynb | 383 ------------------ examples/aishell3/vc0/local/train.sh | 9 + examples/aishell3/vc0/local/voice_cloning.sh | 14 + examples/aishell3/vc0/path.sh | 13 + examples/aishell3/vc0/run.sh | 40 ++ examples/csmsc/README.md | 11 + .../csmsc/speedyspeech/baker/inference.sh | 8 - .../{speedyspeech/baker => tts2}/README.md | 77 ++-- .../baker => tts2}/conf/default.yaml | 0 examples/csmsc/tts2/local/inference.sh | 10 + .../baker => tts2/local}/preprocess.sh | 15 +- .../baker => tts2/local}/synthesize.sh | 14 +- .../baker => tts2/local}/synthesize_e2e.sh | 17 +- .../baker/run.sh => tts2/local/train.sh} | 9 +- examples/csmsc/tts2/path.sh | 13 + examples/csmsc/tts2/run.sh | 37 ++ .../baker => csmsc/tts3}/README.md | 61 +-- .../baker => csmsc/tts3}/conf/default.yaml | 0 .../baker => csmsc/tts3/local}/preprocess.sh | 14 +- .../baker => csmsc/tts3/local}/simple.lexicon | 0 .../baker => csmsc/tts3/local}/synthesize.sh | 13 +- .../tts3/local}/synthesize_e2e.sh | 15 
+- examples/csmsc/tts3/local/train.sh | 12 + examples/csmsc/tts3/path.sh | 13 + examples/csmsc/tts3/run.sh | 32 ++ .../baker => csmsc/voc1}/README.md | 59 +-- .../baker => csmsc/voc1}/conf/default.yaml | 0 .../baker => csmsc/voc1/local}/preprocess.sh | 15 +- examples/csmsc/voc1/local/synthesize.sh | 13 + .../run.sh => csmsc/voc1/local/train.sh} | 9 +- examples/csmsc/voc1/path.sh | 13 + examples/csmsc/voc1/run.sh | 27 ++ examples/ljspeech/README.md | 9 +- examples/ljspeech/tts0/README.md | 87 ++++ examples/ljspeech/tts0/local/preprocess.sh | 8 + examples/ljspeech/tts0/local/synthesize.sh | 11 + .../ljspeech/tts0/local/tacotron2/README.md | 92 ----- examples/ljspeech/tts0/local/train.sh | 9 + examples/ljspeech/tts0/path.sh | 13 + examples/ljspeech/tts0/run.sh | 28 ++ .../transformer_tts/ljspeech => }/README.md | 59 +-- .../ljspeech => }/conf/default.yaml | 0 .../ljspeech => }/preprocess.sh | 10 +- .../ljspeech => }/synthesize.sh | 13 +- .../ljspeech => }/synthesize_e2e.sh | 15 +- .../ljspeech/run.sh => train.sh} | 9 +- .../tts1/local/transformer_tts/sentences.txt | 9 - examples/ljspeech/tts1/path.sh | 13 + examples/ljspeech/tts1/run.sh | 32 ++ .../ljspeech => ljspeech/tts3}/README.md | 77 ++-- .../tts3}/conf/default.yaml | 0 .../tts3/local}/preprocess.sh | 14 +- .../tts3/local}/synthesize.sh | 14 +- .../tts3/local}/synthesize_e2e.sh | 16 +- .../run.sh => ljspeech/tts3/local/train.sh} | 9 +- examples/ljspeech/tts3/path.sh | 13 + examples/ljspeech/tts3/run.sh | 32 ++ examples/ljspeech/voc0/README.md | 52 +++ examples/ljspeech/voc0/local/preprocess.sh | 7 + examples/ljspeech/voc0/local/synthesize.sh | 12 + examples/ljspeech/voc0/local/train.sh | 10 + .../ljspeech/voc0/local/waveflow/README.md | 52 --- examples/ljspeech/voc0/path.sh | 13 + examples/ljspeech/voc0/run.sh | 27 ++ .../ljspeech => ljspeech/voc1}/README.md | 62 +-- .../voc1}/conf/default.yaml | 0 .../voc1/local}/preprocess.sh | 14 +- examples/ljspeech/voc1/local/synthesize.sh | 13 + .../run.sh => 
ljspeech/voc1/local/train.sh} | 9 +- examples/ljspeech/voc1/path.sh | 13 + examples/ljspeech/voc1/run.sh | 27 ++ .../spk0/local => other}/ge2e/README.md | 131 +++--- examples/other/ge2e/local/inference.sh | 14 + examples/other/ge2e/local/preprocess.sh | 9 + examples/other/ge2e/local/train.sh | 10 + examples/other/ge2e/path.sh | 13 + examples/other/ge2e/run.sh | 30 ++ examples/other/text_frontend/get_g2p_data.py | 1 - .../other/text_frontend/get_textnorm_data.py | 1 - examples/other/text_frontend/test_g2p.py | 1 - examples/other/text_frontend/test_textnorm.py | 1 - examples/other/use_mfa/local/detect_oov.py | 3 +- .../other/use_mfa/local/generate_lexicon.py | 3 +- .../parallelwave_gan/baker/synthesize.sh | 8 - .../parallelwave_gan/ljspeech/synthesize.sh | 8 - .../parallelwave_gan/vctk/synthesize.sh | 7 - examples/vctk/README.md | 11 + .../vctk/fastspeech2/aishell3/synthesize.sh | 15 - .../fastspeech2/aishell3/synthesize_e2e.sh | 15 - examples/vctk/fastspeech2/baker/run.sh | 9 - examples/vctk/fastspeech2/sentences.txt | 16 - .../vctk/{fastspeech2/vctk => tts3}/README.md | 96 +++-- .../vctk => tts3}/conf/default.yaml | 0 .../vctk => tts3/local}/preprocess.sh | 16 +- examples/vctk/tts3/local/synthesize.sh | 20 + examples/vctk/tts3/local/synthesize_e2e.sh | 20 + .../vctk/run.sh => tts3/local/train.sh} | 9 +- examples/vctk/tts3/path.sh | 13 + examples/vctk/tts3/run.sh | 32 ++ .../parallelwave_gan/vctk => voc1}/README.md | 47 ++- .../vctk => voc1}/conf/default.yaml | 0 .../vctk => voc1/local}/preprocess.sh | 15 +- examples/vctk/voc1/local/synthesize.sh | 13 + .../baker/run.sh => voc1/local/train.sh} | 9 +- examples/vctk/voc1/path.sh | 13 + examples/vctk/voc1/run.sh | 27 ++ examples/voxceleb/README.md | 3 - .../voxceleb/spk0/local/ge2e/README_cn.md | 124 ------ examples/voxceleb/spk0/run.sh | 0 parakeet/__init__.py | 3 +- parakeet/audio/__init__.py | 1 - parakeet/data/__init__.py | 3 +- parakeet/datasets/__init__.py | 1 - parakeet/datasets/preprocess_utils.py | 1 - 
parakeet/datasets/vocoder_batch_fn.py | 1 - parakeet/exps/__init__.py | 13 + parakeet/exps/fastspeech2/__init__.py | 13 + .../fastspeech2/multi_spk_synthesize_e2e.py | 4 +- .../multi_spk_synthesize_e2e_en.py | 4 +- .../exps}/fastspeech2/normalize.py | 4 +- .../exps}/fastspeech2/preprocess.py | 4 +- .../exps}/fastspeech2/synthesize.py | 2 +- .../exps/fastspeech2}/synthesize_e2e.py | 4 +- .../exps/fastspeech2/synthesize_e2e_en.py | 2 +- .../exps}/fastspeech2/train.py | 12 +- .../exps/gan_vocoder}/README.md | 0 parakeet/exps/gan_vocoder/__init__.py | 13 + .../exps/gan_vocoder}/normalize.py | 4 +- .../gan_vocoder/parallelwave_gan/__init__.py | 13 + .../parallelwave_gan/synthesize.py | 6 +- .../parallelwave_gan}/synthesize_from_wav.py | 22 +- .../gan_vocoder}/parallelwave_gan/train.py | 12 +- .../exps/gan_vocoder}/preprocess.py | 8 +- parakeet/exps/gan_vocoder/pwgan/__init__.py | 0 parakeet/exps/ge2e/__init__.py | 13 + .../exps}/ge2e/audio_processor.py | 9 +- .../local => parakeet/exps}/ge2e/config.py | 1 - .../exps}/ge2e/dataset_processors.py | 7 +- .../local => parakeet/exps}/ge2e/inference.py | 10 +- .../exps}/ge2e/preprocess.py | 13 +- .../exps}/ge2e/random_cycle.py | 1 - .../ge2e/speaker_verification_dataset.py | 6 +- .../local => parakeet/exps}/ge2e/train.py | 16 +- .../exps}/sentences.txt | 0 .../exps}/sentences_en.txt | 0 parakeet/exps/speedyspeech/__init__.py | 13 + .../exps/speedyspeech}/inference.py | 2 +- .../exps}/speedyspeech/normalize.py | 4 +- .../exps}/speedyspeech/preprocess.py | 14 +- .../exps}/speedyspeech/synthesize.py | 10 +- .../exps/speedyspeech}/synthesize_e2e.py | 10 +- .../exps}/speedyspeech/train.py | 12 +- parakeet/exps/tacotron2/__init__.py | 13 + .../exps}/tacotron2/config.py | 1 - .../exps}/tacotron2/ljspeech.py | 6 +- .../exps}/tacotron2/preprocess.py | 11 +- .../exps}/tacotron2/synthesize.ipynb | 0 .../exps}/tacotron2/synthesize.py | 13 +- .../exps}/tacotron2/train.py | 16 +- parakeet/exps/tacotron2_ge2e/__init__.py | 13 + 
.../exps/tacotron2_ge2e}/aishell3.py | 9 +- .../exps/tacotron2_ge2e}/chinese_g2p.py | 9 +- .../exps/tacotron2_ge2e}/config.py | 1 - .../exps/tacotron2_ge2e}/extract_mel.py | 11 +- .../exps/tacotron2_ge2e}/lexicon.txt | 0 .../preprocess_transcription.py | 7 +- .../exps/tacotron2_ge2e}/process_wav.py | 9 +- .../exps/tacotron2_ge2e}/train.py | 25 +- parakeet/exps/tacotron2_ge2e/voice_cloning.py | 160 ++++++++ parakeet/exps/transformer_tts/__init__.py | 13 + .../exps}/transformer_tts/normalize.py | 4 +- .../exps}/transformer_tts/preprocess.py | 4 +- .../exps}/transformer_tts/synthesize.py | 2 +- .../exps/transformer_tts}/synthesize_e2e.py | 2 +- .../exps}/transformer_tts/train.py | 12 +- parakeet/exps/waveflow/__init__.py | 13 + .../exps}/waveflow/config.py | 1 - .../exps}/waveflow/ljspeech.py | 4 +- .../exps}/waveflow/preprocess.py | 12 +- .../exps}/waveflow/synthesize.py | 8 +- .../local => parakeet/exps}/waveflow/train.py | 16 +- parakeet/frontend/__init__.py | 3 +- parakeet/frontend/normalizer/__init__.py | 1 - parakeet/frontend/phonectic.py | 2 +- parakeet/frontend/zh_frontend.py | 2 +- .../frontend/zh_normalization/__init__.py | 1 - parakeet/models/__init__.py | 1 - parakeet/models/fastspeech2/__init__.py | 1 - parakeet/models/fastspeech2/fastspeech2.py | 2 +- .../models/fastspeech2/fastspeech2_updater.py | 1 + parakeet/models/lstm_speaker_encoder.py | 4 +- parakeet/models/parallel_wavegan/__init__.py | 1 - .../parallel_wavegan_updater.py | 4 +- parakeet/models/speedyspeech/__init__.py | 1 - .../speedyspeech/speedyspeech_updater.py | 1 + parakeet/models/transformer_tts/__init__.py | 1 - .../models/transformer_tts/transformer_tts.py | 9 +- .../transformer_tts_updater.py | 1 + parakeet/modules/__init__.py | 1 - .../fastspeech2_transformer/decoder.py | 1 - .../fastspeech2_transformer/decoder_layer.py | 2 +- .../fastspeech2_transformer/lightconv.py | 3 +- .../modules/fastspeech2_transformer/mask.py | 1 - parakeet/modules/style_encoder.py | 4 +- 
parakeet/modules/tacotron2/decoder.py | 3 +- parakeet/modules/tacotron2/encoder.py | 4 +- parakeet/modules/transformer.py | 3 +- parakeet/training/__init__.py | 1 - parakeet/training/optimizer.py | 1 - parakeet/utils/__init__.py | 1 - parakeet/utils/profiler.py | 2 +- requirements.txt | 40 +- setup.py | 26 +- utils/json2trn.py | 1 - 233 files changed, 2278 insertions(+), 1621 deletions(-) delete mode 100644 examples/aishell3/tts0/run.sh rename examples/{vctk/fastspeech2/aishell3 => aishell3/tts3}/README.md (84%) rename examples/{vctk/fastspeech2/aishell3 => aishell3/tts3}/conf/default.yaml (100%) rename examples/{vctk/fastspeech2/aishell3 => aishell3/tts3/local}/preprocess.sh (90%) rename examples/{vctk/fastspeech2/vctk => aishell3/tts3/local}/synthesize.sh (65%) rename examples/{vctk/fastspeech2/vctk => aishell3/tts3/local}/synthesize_e2e.sh (58%) rename examples/{vctk/fastspeech2/aishell3/run.sh => aishell3/tts3/local/train.sh} (61%) create mode 100755 examples/aishell3/tts3/path.sh create mode 100755 examples/aishell3/tts3/run.sh create mode 100644 examples/aishell3/vc0/README.md rename examples/aishell3/vc0/{local/tacotron2 => }/images/alignment-step2000.png (100%) rename examples/aishell3/vc0/{local/tacotron2 => }/images/train.png (100%) rename examples/aishell3/vc0/{local/tacotron2 => }/images/valid.png (100%) create mode 100755 examples/aishell3/vc0/local/preprocess.sh delete mode 100644 examples/aishell3/vc0/local/tacotron2/README_cn.md delete mode 100644 examples/aishell3/vc0/local/tacotron2/voice_cloning.ipynb create mode 100755 examples/aishell3/vc0/local/train.sh create mode 100755 examples/aishell3/vc0/local/voice_cloning.sh create mode 100755 examples/aishell3/vc0/path.sh mode change 100644 => 100755 examples/aishell3/vc0/run.sh create mode 100644 examples/csmsc/README.md delete mode 100755 examples/csmsc/speedyspeech/baker/inference.sh rename examples/csmsc/{speedyspeech/baker => tts2}/README.md (86%) rename examples/csmsc/{speedyspeech/baker => 
tts2}/conf/default.yaml (100%) create mode 100755 examples/csmsc/tts2/local/inference.sh rename examples/csmsc/{speedyspeech/baker => tts2/local}/preprocess.sh (88%) rename examples/csmsc/{speedyspeech/baker => tts2/local}/synthesize.sh (61%) rename examples/csmsc/{speedyspeech/baker => tts2/local}/synthesize_e2e.sh (54%) rename examples/csmsc/{speedyspeech/baker/run.sh => tts2/local/train.sh} (64%) create mode 100755 examples/csmsc/tts2/path.sh create mode 100755 examples/csmsc/tts2/run.sh rename examples/{vctk/fastspeech2/baker => csmsc/tts3}/README.md (90%) rename examples/{vctk/fastspeech2/baker => csmsc/tts3}/conf/default.yaml (100%) rename examples/{vctk/fastspeech2/baker => csmsc/tts3/local}/preprocess.sh (90%) rename examples/{vctk/fastspeech2/baker => csmsc/tts3/local}/simple.lexicon (100%) rename examples/{vctk/fastspeech2/baker => csmsc/tts3/local}/synthesize.sh (63%) rename examples/{vctk/fastspeech2/baker => csmsc/tts3/local}/synthesize_e2e.sh (56%) create mode 100755 examples/csmsc/tts3/local/train.sh create mode 100755 examples/csmsc/tts3/path.sh create mode 100755 examples/csmsc/tts3/run.sh rename examples/{vctk/GANVocoder/parallelwave_gan/baker => csmsc/voc1}/README.md (88%) rename examples/{vctk/GANVocoder/parallelwave_gan/baker => csmsc/voc1}/conf/default.yaml (100%) rename examples/{vctk/GANVocoder/parallelwave_gan/baker => csmsc/voc1/local}/preprocess.sh (83%) create mode 100755 examples/csmsc/voc1/local/synthesize.sh rename examples/{vctk/GANVocoder/parallelwave_gan/ljspeech/run.sh => csmsc/voc1/local/train.sh} (60%) create mode 100755 examples/csmsc/voc1/path.sh create mode 100755 examples/csmsc/voc1/run.sh create mode 100644 examples/ljspeech/tts0/README.md create mode 100755 examples/ljspeech/tts0/local/preprocess.sh create mode 100755 examples/ljspeech/tts0/local/synthesize.sh delete mode 100644 examples/ljspeech/tts0/local/tacotron2/README.md create mode 100755 examples/ljspeech/tts0/local/train.sh create mode 100755 
examples/ljspeech/tts0/path.sh mode change 100644 => 100755 examples/ljspeech/tts0/run.sh rename examples/ljspeech/tts1/{local/transformer_tts/ljspeech => }/README.md (89%) rename examples/ljspeech/tts1/{local/transformer_tts/ljspeech => }/conf/default.yaml (100%) rename examples/ljspeech/tts1/local/{transformer_tts/ljspeech => }/preprocess.sh (89%) rename examples/ljspeech/tts1/local/{transformer_tts/ljspeech => }/synthesize.sh (61%) rename examples/ljspeech/tts1/local/{transformer_tts/ljspeech => }/synthesize_e2e.sh (53%) rename examples/ljspeech/tts1/local/{transformer_tts/ljspeech/run.sh => train.sh} (55%) delete mode 100644 examples/ljspeech/tts1/local/transformer_tts/sentences.txt create mode 100755 examples/ljspeech/tts1/path.sh mode change 100644 => 100755 examples/ljspeech/tts1/run.sh rename examples/{vctk/fastspeech2/ljspeech => ljspeech/tts3}/README.md (85%) rename examples/{vctk/fastspeech2/ljspeech => ljspeech/tts3}/conf/default.yaml (100%) rename examples/{vctk/fastspeech2/ljspeech => ljspeech/tts3/local}/preprocess.sh (90%) rename examples/{vctk/fastspeech2/ljspeech => ljspeech/tts3/local}/synthesize.sh (64%) rename examples/{vctk/fastspeech2/ljspeech => ljspeech/tts3/local}/synthesize_e2e.sh (56%) rename examples/{vctk/fastspeech2/ljspeech/run.sh => ljspeech/tts3/local/train.sh} (55%) create mode 100755 examples/ljspeech/tts3/path.sh create mode 100755 examples/ljspeech/tts3/run.sh create mode 100644 examples/ljspeech/voc0/README.md create mode 100755 examples/ljspeech/voc0/local/preprocess.sh create mode 100755 examples/ljspeech/voc0/local/synthesize.sh create mode 100755 examples/ljspeech/voc0/local/train.sh delete mode 100644 examples/ljspeech/voc0/local/waveflow/README.md create mode 100755 examples/ljspeech/voc0/path.sh mode change 100644 => 100755 examples/ljspeech/voc0/run.sh rename examples/{vctk/GANVocoder/parallelwave_gan/ljspeech => ljspeech/voc1}/README.md (87%) rename examples/{vctk/GANVocoder/parallelwave_gan/ljspeech => 
ljspeech/voc1}/conf/default.yaml (100%) rename examples/{vctk/GANVocoder/parallelwave_gan/ljspeech => ljspeech/voc1/local}/preprocess.sh (84%) create mode 100755 examples/ljspeech/voc1/local/synthesize.sh rename examples/{vctk/GANVocoder/parallelwave_gan/vctk/run.sh => ljspeech/voc1/local/train.sh} (60%) create mode 100755 examples/ljspeech/voc1/path.sh create mode 100755 examples/ljspeech/voc1/run.sh rename examples/{voxceleb/spk0/local => other}/ge2e/README.md (55%) create mode 100755 examples/other/ge2e/local/inference.sh create mode 100755 examples/other/ge2e/local/preprocess.sh create mode 100755 examples/other/ge2e/local/train.sh create mode 100755 examples/other/ge2e/path.sh create mode 100755 examples/other/ge2e/run.sh delete mode 100755 examples/vctk/GANVocoder/parallelwave_gan/baker/synthesize.sh delete mode 100755 examples/vctk/GANVocoder/parallelwave_gan/ljspeech/synthesize.sh delete mode 100755 examples/vctk/GANVocoder/parallelwave_gan/vctk/synthesize.sh create mode 100644 examples/vctk/README.md delete mode 100755 examples/vctk/fastspeech2/aishell3/synthesize.sh delete mode 100755 examples/vctk/fastspeech2/aishell3/synthesize_e2e.sh delete mode 100755 examples/vctk/fastspeech2/baker/run.sh delete mode 100644 examples/vctk/fastspeech2/sentences.txt rename examples/vctk/{fastspeech2/vctk => tts3}/README.md (65%) rename examples/vctk/{fastspeech2/vctk => tts3}/conf/default.yaml (100%) rename examples/vctk/{fastspeech2/vctk => tts3/local}/preprocess.sh (90%) create mode 100755 examples/vctk/tts3/local/synthesize.sh create mode 100755 examples/vctk/tts3/local/synthesize_e2e.sh rename examples/vctk/{fastspeech2/vctk/run.sh => tts3/local/train.sh} (61%) create mode 100755 examples/vctk/tts3/path.sh create mode 100755 examples/vctk/tts3/run.sh rename examples/vctk/{GANVocoder/parallelwave_gan/vctk => voc1}/README.md (83%) rename examples/vctk/{GANVocoder/parallelwave_gan/vctk => voc1}/conf/default.yaml (100%) rename 
examples/vctk/{GANVocoder/parallelwave_gan/vctk => voc1/local}/preprocess.sh (83%) create mode 100755 examples/vctk/voc1/local/synthesize.sh rename examples/vctk/{GANVocoder/parallelwave_gan/baker/run.sh => voc1/local/train.sh} (60%) create mode 100755 examples/vctk/voc1/path.sh create mode 100755 examples/vctk/voc1/run.sh delete mode 100644 examples/voxceleb/README.md delete mode 100644 examples/voxceleb/spk0/local/ge2e/README_cn.md delete mode 100644 examples/voxceleb/spk0/run.sh rename examples/vctk/fastspeech2/aishell3/synthesize_e2e.py => parakeet/exps/fastspeech2/multi_spk_synthesize_e2e.py (100%) rename examples/vctk/fastspeech2/vctk/synthesize_e2e.py => parakeet/exps/fastspeech2/multi_spk_synthesize_e2e_en.py (100%) rename {examples/vctk => parakeet/exps}/fastspeech2/normalize.py (100%) rename {examples/vctk => parakeet/exps}/fastspeech2/preprocess.py (100%) rename {examples/vctk => parakeet/exps}/fastspeech2/synthesize.py (100%) rename {examples/vctk/fastspeech2/baker => parakeet/exps/fastspeech2}/synthesize_e2e.py (100%) rename examples/vctk/fastspeech2/ljspeech/synthesize_e2e.py => parakeet/exps/fastspeech2/synthesize_e2e_en.py (100%) rename {examples/vctk => parakeet/exps}/fastspeech2/train.py (100%) rename {examples/vctk/GANVocoder => parakeet/exps/gan_vocoder}/README.md (100%) rename {examples/vctk/GANVocoder => parakeet/exps/gan_vocoder}/normalize.py (100%) create mode 100644 parakeet/exps/gan_vocoder/parallelwave_gan/__init__.py rename {examples/vctk/GANVocoder => parakeet/exps/gan_vocoder}/parallelwave_gan/synthesize.py (100%) rename {examples/vctk/GANVocoder/parallelwave_gan/baker => parakeet/exps/gan_vocoder/parallelwave_gan}/synthesize_from_wav.py (87%) rename {examples/vctk/GANVocoder => parakeet/exps/gan_vocoder}/parallelwave_gan/train.py (100%) rename {examples/vctk/GANVocoder => parakeet/exps/gan_vocoder}/preprocess.py (100%) delete mode 100644 parakeet/exps/gan_vocoder/pwgan/__init__.py create mode 100644 parakeet/exps/ge2e/__init__.py 
rename {examples/voxceleb/spk0/local => parakeet/exps}/ge2e/audio_processor.py (99%) rename {examples/voxceleb/spk0/local => parakeet/exps}/ge2e/config.py (99%) rename {examples/voxceleb/spk0/local => parakeet/exps}/ge2e/dataset_processors.py (98%) rename {examples/voxceleb/spk0/local => parakeet/exps}/ge2e/inference.py (97%) rename {examples/voxceleb/spk0/local => parakeet/exps}/ge2e/preprocess.py (90%) rename {examples/voxceleb/spk0/local => parakeet/exps}/ge2e/random_cycle.py (99%) rename {examples/voxceleb/spk0/local => parakeet/exps}/ge2e/speaker_verification_dataset.py (97%) rename {examples/voxceleb/spk0/local => parakeet/exps}/ge2e/train.py (93%) rename {examples/csmsc/speedyspeech => parakeet/exps}/sentences.txt (100%) rename {examples/vctk/fastspeech2 => parakeet/exps}/sentences_en.txt (100%) create mode 100644 parakeet/exps/speedyspeech/__init__.py rename {examples/csmsc/speedyspeech/baker => parakeet/exps/speedyspeech}/inference.py (100%) rename {examples/csmsc => parakeet/exps}/speedyspeech/normalize.py (100%) rename {examples/csmsc => parakeet/exps}/speedyspeech/preprocess.py (100%) rename {examples/csmsc => parakeet/exps}/speedyspeech/synthesize.py (100%) rename {examples/csmsc/speedyspeech/baker => parakeet/exps/speedyspeech}/synthesize_e2e.py (100%) rename {examples/csmsc => parakeet/exps}/speedyspeech/train.py (100%) create mode 100644 parakeet/exps/tacotron2/__init__.py rename {examples/ljspeech/tts0/local => parakeet/exps}/tacotron2/config.py (99%) rename {examples/ljspeech/tts0/local => parakeet/exps}/tacotron2/ljspeech.py (97%) rename {examples/ljspeech/tts0/local => parakeet/exps}/tacotron2/preprocess.py (95%) rename {examples/ljspeech/tts0/local => parakeet/exps}/tacotron2/synthesize.ipynb (100%) rename {examples/ljspeech/tts0/local => parakeet/exps}/tacotron2/synthesize.py (91%) rename {examples/ljspeech/tts0/local => parakeet/exps}/tacotron2/train.py (95%) create mode 100644 parakeet/exps/tacotron2_ge2e/__init__.py rename 
{examples/aishell3/vc0/local/tacotron2 => parakeet/exps/tacotron2_ge2e}/aishell3.py (92%) rename {examples/aishell3/vc0/local/tacotron2 => parakeet/exps/tacotron2_ge2e}/chinese_g2p.py (86%) rename {examples/aishell3/vc0/local/tacotron2 => parakeet/exps/tacotron2_ge2e}/config.py (99%) rename {examples/aishell3/vc0/local/tacotron2 => parakeet/exps/tacotron2_ge2e}/extract_mel.py (92%) rename {examples/aishell3/vc0/local/tacotron2 => parakeet/exps/tacotron2_ge2e}/lexicon.txt (100%) rename {examples/aishell3/vc0/local/tacotron2 => parakeet/exps/tacotron2_ge2e}/preprocess_transcription.py (99%) rename {examples/aishell3/vc0/local/tacotron2 => parakeet/exps/tacotron2_ge2e}/process_wav.py (99%) rename {examples/aishell3/vc0/local/tacotron2 => parakeet/exps/tacotron2_ge2e}/train.py (94%) create mode 100644 parakeet/exps/tacotron2_ge2e/voice_cloning.py create mode 100644 parakeet/exps/transformer_tts/__init__.py rename {examples/ljspeech/tts1/local => parakeet/exps}/transformer_tts/normalize.py (100%) rename {examples/ljspeech/tts1/local => parakeet/exps}/transformer_tts/preprocess.py (100%) rename {examples/ljspeech/tts1/local => parakeet/exps}/transformer_tts/synthesize.py (100%) rename {examples/ljspeech/tts1/local/transformer_tts/ljspeech => parakeet/exps/transformer_tts}/synthesize_e2e.py (100%) rename {examples/ljspeech/tts1/local => parakeet/exps}/transformer_tts/train.py (100%) create mode 100644 parakeet/exps/waveflow/__init__.py rename {examples/ljspeech/voc0/local => parakeet/exps}/waveflow/config.py (99%) rename {examples/ljspeech/voc0/local => parakeet/exps}/waveflow/ljspeech.py (97%) rename {examples/ljspeech/voc0/local => parakeet/exps}/waveflow/preprocess.py (98%) rename {examples/ljspeech/voc0/local => parakeet/exps}/waveflow/synthesize.py (97%) rename {examples/ljspeech/voc0/local => parakeet/exps}/waveflow/train.py (92%) diff --git a/deepspeech/decoders/scores/ngram.py b/deepspeech/decoders/scores/ngram.py index 050a8c81f..a34d82483 100644 --- 
a/deepspeech/decoders/scores/ngram.py +++ b/deepspeech/decoders/scores/ngram.py @@ -85,8 +85,9 @@ class NgramFullScorer(Ngrambase, BatchScorerInterface): and next state list for ys. """ - return self.score_partial_( - y, paddle.to_tensor(range(self.charlen)), state, x) + return self.score_partial_(y, + paddle.to_tensor(range(self.charlen)), state, + x) class NgramPartScorer(Ngrambase, PartialScorerInterface): diff --git a/deepspeech/exps/u2_kaldi/model.py b/deepspeech/exps/u2_kaldi/model.py index 18e29b28f..a80a6dd9e 100644 --- a/deepspeech/exps/u2_kaldi/model.py +++ b/deepspeech/exps/u2_kaldi/model.py @@ -436,8 +436,9 @@ class U2Tester(U2Trainer): simulate_streaming=cfg.simulate_streaming) decode_time = time.time() - start_time - for i, (utt, target, result, rec_tids) in enumerate(zip( - utts, target_transcripts, result_transcripts, result_tokenids)): + for i, (utt, target, result, rec_tids) in enumerate( + zip(utts, target_transcripts, result_transcripts, + result_tokenids)): errors, len_ref = errors_func(target, result) errors_sum += errors len_refs += len_ref diff --git a/deepspeech/frontend/featurizer/text_featurizer.py b/deepspeech/frontend/featurizer/text_featurizer.py index 34220432b..2194d4928 100644 --- a/deepspeech/frontend/featurizer/text_featurizer.py +++ b/deepspeech/frontend/featurizer/text_featurizer.py @@ -140,7 +140,7 @@ class TextFeaturizer(): Returns: str: text string. 
""" - tokens = [t.replace(SPACE, " ") for t in tokens ] + tokens = [t.replace(SPACE, " ") for t in tokens] return "".join(tokens) def word_tokenize(self, text): diff --git a/examples/aishell3/README.md b/examples/aishell3/README.md index 2111901df..b52950c47 100644 --- a/examples/aishell3/README.md +++ b/examples/aishell3/README.md @@ -1,4 +1,11 @@ # Aishell3 -* tts0 - fastspeech2 -* vc0 - tactron2 voice clone +* tts0 - Tacotron2 +* tts1 - TransformerTTS +* tts2 - SpeedySpeech +* tts3 - FastSpeech2 +* voc0 - WaveFlow +* voc1 - Parallel WaveGAN +* voc2 - MelGAN +* voc3 - MultiBand MelGAN +* vc0 - Tacotron2 Voice Clone with GE2E diff --git a/examples/aishell3/tts0/run.sh b/examples/aishell3/tts0/run.sh deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/vctk/fastspeech2/aishell3/README.md b/examples/aishell3/tts3/README.md similarity index 84% rename from examples/vctk/fastspeech2/aishell3/README.md rename to examples/aishell3/tts3/README.md index c56242856..0feeced9b 100644 --- a/examples/vctk/fastspeech2/aishell3/README.md +++ b/examples/aishell3/tts3/README.md @@ -18,12 +18,23 @@ tar zxvf data_aishell3.tgz -C data_aishell3 ### Get MFA result of AISHELL-3 and Extract it We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) (use MFA1.x now) of our repo. -### Preprocess the dataset + +## Get Started Assume the path to the dataset is `~/datasets/data_aishell3`. Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`. -Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. 
+ - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from text file. ```bash -./preprocess.sh +./run.sh +``` +### Preprocess the dataset +```bash +./local/preprocess.sh ${conf_path} ``` When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. ```text @@ -47,10 +58,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance. -## Train the model -`./run.sh` calls `../train.py`. +### Train the model +`./local/train.sh` calls `${BIN_DIR}/train.py`. ```bash -./run.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} ``` Here's the complete help message. ```text @@ -85,20 +96,8 @@ optional arguments: 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. 6. `--phones-dict` is the path of the phone vocabulary file. 7. `--speaker-dict`is the path of the speaker id map file when training a multi-speaker FastSpeech2. -## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip) -FastSpeech2 checkpoint contains files listed below. 
- -```text -fastspeech2_nosil_aishell3_ckpt_0.4 -├── default.yaml # default config used to train fastspeech2 -├── phone_id_map.txt # phone vocabulary file when training fastspeech2 -├── snapshot_iter_96400.pdz # model parameters and optimizer states -├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2 -└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 -``` -## Synthesize +### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder. Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it. ```bash @@ -111,9 +110,9 @@ pwg_baker_ckpt_0.4 ├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan └── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` -`synthesize.sh` calls `synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -./synthesize.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] @@ -153,22 +152,22 @@ optional arguments: --device DEVICE device type to use. --verbose VERBOSE verbose. ``` - -`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file. - +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e.py`, which can synthesize waveform from text file. 
```bash -./synthesize_e2e.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text -usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] - [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT] - [--fastspeech2-stat FASTSPEECH2_STAT] - [--pwg-config PWG_CONFIG] - [--pwg-checkpoint PWG_CHECKPOINT] - [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] - [--speaker-dict SPEAKER_DICT] [--text TEXT] - [--output-dir OUTPUT_DIR] [--device DEVICE] - [--verbose VERBOSE] +usage: multi_spk_synthesize_e2e.py [-h] + [--fastspeech2-config FASTSPEECH2_CONFIG] + [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT] + [--fastspeech2-stat FASTSPEECH2_STAT] + [--pwg-config PWG_CONFIG] + [--pwg-checkpoint PWG_CHECKPOINT] + [--pwg-stat PWG_STAT] + [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] [--text TEXT] + [--output-dir OUTPUT_DIR] [--device DEVICE] + [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -204,24 +203,36 @@ optional arguments: 5. `--output-dir` is the directory to save synthesized audio files. 6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. -You can use the following scripts to synthesize for `../sentences.txt` using pretrained fastspeech2 and parallel wavegan models. +## Pretrained Model +Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip) + +FastSpeech2 checkpoint contains files listed below. 
+ +```text +fastspeech2_nosil_aishell3_ckpt_0.4 +├── default.yaml # default config used to train fastspeech2 +├── phone_id_map.txt # phone vocabulary file when training fastspeech2 +├── snapshot_iter_96400.pdz # model parameters and optimizer states +├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2 +└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 +``` +You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models. ```bash FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ +python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \ --fastspeech2-config=fastspeech2_nosil_aishell3_ckpt_0.4/default.yaml \ --fastspeech2-checkpoint=fastspeech2_nosil_aishell3_ckpt_0.4/snapshot_iter_96400.pdz \ --fastspeech2-stat=fastspeech2_nosil_aishell3_ckpt_0.4/speech_stats.npy \ --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --text=../sentences.txt \ + --text=${BIN_DIR}/../sentences.txt \ --output-dir=exp/default/test_e2e \ --device="gpu" \ --phones-dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \ --speaker-dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt ``` - ## Future work A multi-speaker vocoder is needed. 
diff --git a/examples/vctk/fastspeech2/aishell3/conf/default.yaml b/examples/aishell3/tts3/conf/default.yaml similarity index 100% rename from examples/vctk/fastspeech2/aishell3/conf/default.yaml rename to examples/aishell3/tts3/conf/default.yaml diff --git a/examples/vctk/fastspeech2/aishell3/preprocess.sh b/examples/aishell3/tts3/local/preprocess.sh similarity index 90% rename from examples/vctk/fastspeech2/aishell3/preprocess.sh rename to examples/aishell3/tts3/local/preprocess.sh index 281abee0c..a40ee96d7 100755 --- a/examples/vctk/fastspeech2/aishell3/preprocess.sh +++ b/examples/aishell3/tts3/local/preprocess.sh @@ -3,7 +3,7 @@ stage=0 stop_stage=100 -export MAIN_ROOT=`realpath ${PWD}/../../../` +config_path=$1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # get durations from MFA's result @@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ --inputdir=./aishell3_alignment_tone \ --output durations.txt \ - --config=conf/default.yaml + --config=${config_path} fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # extract features echo "Extract features ..." - python3 ../preprocess.py \ + python3 ${BIN_DIR}/preprocess.py \ --dataset=aishell3 \ --rootdir=~/datasets/data_aishell3/ \ --dumpdir=dump \ --dur-file=durations.txt \ - --config=conf/default.yaml \ + --config=${config_path} \ --num-cpu=20 \ --cut-sil=True fi @@ -46,7 +46,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # normalize and covert phone/speaker to id, dev and test should use train's stats echo "Normalize ..." 
- python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --speech-stats=dump/train/speech_stats.npy \ @@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --speech-stats=dump/train/speech_stats.npy \ @@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --speech-stats=dump/train/speech_stats.npy \ diff --git a/examples/vctk/fastspeech2/vctk/synthesize.sh b/examples/aishell3/tts3/local/synthesize.sh similarity index 65% rename from examples/vctk/fastspeech2/vctk/synthesize.sh rename to examples/aishell3/tts3/local/synthesize.sh index 329fc9e1e..64361983d 100755 --- a/examples/vctk/fastspeech2/vctk/synthesize.sh +++ b/examples/aishell3/tts3/local/synthesize.sh @@ -1,15 +1,20 @@ #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ../synthesize.py \ - --fastspeech2-config=conf/default.yaml \ - --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_32769.pdz_bak\ +python3 ${BIN_DIR}/synthesize.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --fastspeech2-stat=dump/train/speech_stats.npy \ --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test \ + 
--output-dir=${train_output_path}/test \ --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/fastspeech2/vctk/synthesize_e2e.sh b/examples/aishell3/tts3/local/synthesize_e2e.sh similarity index 58% rename from examples/vctk/fastspeech2/vctk/synthesize_e2e.sh rename to examples/aishell3/tts3/local/synthesize_e2e.sh index 446e3363c..8a979844b 100755 --- a/examples/vctk/fastspeech2/vctk/synthesize_e2e.sh +++ b/examples/aishell3/tts3/local/synthesize_e2e.sh @@ -1,15 +1,20 @@ #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ - --fastspeech2-config=conf/default.yaml \ - --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_32769.pdz_bak \ +python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --fastspeech2-stat=dump/train/speech_stats.npy \ --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --text=../sentences_en.txt \ - --output-dir=exp/default/test_e2e \ + --text=${BIN_DIR}/../sentences.txt \ + --output-dir=${train_output_path}/test_e2e \ --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/fastspeech2/aishell3/run.sh b/examples/aishell3/tts3/local/train.sh similarity index 61% rename from examples/vctk/fastspeech2/aishell3/run.sh rename to examples/aishell3/tts3/local/train.sh index d4f06da91..be6051c97 100755 --- a/examples/vctk/fastspeech2/aishell3/run.sh +++ b/examples/aishell3/tts3/local/train.sh @@ -1,10 +1,13 @@ #!/bin/bash -python3 ../train.py \ +config_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ --train-metadata=dump/train/norm/metadata.jsonl \ 
--dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=conf/default.yaml \ - --output-dir=exp/default \ + --config=${config_path} \ + --output-dir=${train_output_path} \ --nprocs=2 \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh new file mode 100755 index 000000000..561d01632 --- /dev/null +++ b/examples/aishell3/tts3/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=fastspeech2 +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL} diff --git a/examples/aishell3/tts3/run.sh b/examples/aishell3/tts3/run.sh new file mode 100755 index 000000000..a58fec5d3 --- /dev/null +++ b/examples/aishell3/tts3/run.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_482.pdz + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git 
a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md new file mode 100644 index 000000000..7235eba2d --- /dev/null +++ b/examples/aishell3/vc0/README.md @@ -0,0 +1,89 @@ +# Tacotron2 + AISHELL-3 Voice Cloning +This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in the Voice Cloning task. We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows: +1. Speaker Encoder: We use a Speaker Verification task to train a speaker encoder. The datasets used in this task are different from those used in Tacotron2: because the transcriptions are not needed, we use more datasets; refer to [ge2e](../../other/ge2e). +2. Synthesizer: Then, we use the trained speaker encoder to generate an utterance embedding for each sentence in AISHELL-3. This embedding is an extra input of Tacotron2 which will be concatenated with the encoder outputs. +3. Vocoder: We use WaveFlow as the neural vocoder; refer to [waveflow](../../ljspeech/voc0). + +## Get Started +Assume the path to the dataset is `~/datasets/data_aishell3`. +Assume the path to the MFA result of AISHELL-3 is `./alignment`. +Assume the path to the pretrained ge2e model is `ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000` +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. start a voice cloning inference. +```bash +./run.sh +``` +### Preprocess the dataset +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path} +``` +#### generate utterance embedding + Use pretrained GE2E (speaker encoder) to generate utterance embedding for each sentence in AISHELL-3, which has the same file structure as the wav files and is stored in `.npy` format. 
+ +```bash +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../ge2e/inference.py \ + --input=${input} \ + --output=${preprocess_path}/embed \ + --device="gpu" \ + --checkpoint_path=${ge2e_ckpt_path} +fi +``` + +Computing the utterance embeddings can take several hours. +#### process wav +There is silence at the edges of AISHELL-3's wavs, and the audio amplitude is very small, so we need to remove the silence and normalize the audio. You can use a silence removal method based on volume or energy, but the effect is not very good. We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get the alignment of text and speech, then utilize the alignment results to remove the silence. + +We use Montreal Forced Aligner 1.0. The labels in aishell3 include pinyin, so the lexicon we provided to MFA is pinyin rather than Chinese characters. The prosody marks (`$` and `%`) need to be removed. You should preprocess the dataset into the format which MFA needs: the texts have the same name as the wavs and have the suffix `.lab`. + +We use [lexicon.txt](./lexicon.txt) as the lexicon. + +You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) (use MFA1.x now) of our repo. + +```bash +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo "Process wav ..." + python3 ${BIN_DIR}/process_wav.py \ + --input=${input}/wav \ + --output=${preprocess_path}/normalized_wav \ + --alignment=${alignment} +fi +``` + +#### preprocess transcription +We convert the transcription into `phones` and `tones`. It is worth noting that our processing here is different from that used for MFA: we separate the tones. This is one processing method; of course, you can also segment only initials and vowels. 
+ +```bash +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + python3 ${BIN_DIR}/preprocess_transcription.py \ + --input=${input} \ + --output=${preprocess_path} +fi +``` +The default input is `~/datasets/data_aishell3/train`, which contains `label_train-set.txt`. The processed results are `metadata.yaml` and `metadata.pickle`. The former is a text format for easy viewing, and the latter is a binary format for direct reading. +#### extract mel +```bash +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + python3 ${BIN_DIR}/extract_mel.py \ + --input=${preprocess_path}/normalized_wav \ + --output=${preprocess_path}/mel +fi +``` + +### Train the model +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} +``` + +Our model removes the stop token prediction in Tacotron2, because stop token prediction suffers from an extremely unbalanced proportion of positive and negative samples, and it is very sensitive to the clipping of audio silence. Instead, we use the highest point of attention reaching the last symbol on the encoder side as the termination condition. + +In addition, in order to accelerate the convergence of the model, we add `guided attention loss` to induce the alignment between encoder and decoder to show diagonal lines faster. +### Inference +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output} +``` +## Pretrained Model +[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip). 
diff --git a/examples/aishell3/vc0/local/tacotron2/images/alignment-step2000.png b/examples/aishell3/vc0/images/alignment-step2000.png similarity index 100% rename from examples/aishell3/vc0/local/tacotron2/images/alignment-step2000.png rename to examples/aishell3/vc0/images/alignment-step2000.png diff --git a/examples/aishell3/vc0/local/tacotron2/images/train.png b/examples/aishell3/vc0/images/train.png similarity index 100% rename from examples/aishell3/vc0/local/tacotron2/images/train.png rename to examples/aishell3/vc0/images/train.png diff --git a/examples/aishell3/vc0/local/tacotron2/images/valid.png b/examples/aishell3/vc0/images/valid.png similarity index 100% rename from examples/aishell3/vc0/local/tacotron2/images/valid.png rename to examples/aishell3/vc0/images/valid.png diff --git a/examples/aishell3/vc0/local/preprocess.sh b/examples/aishell3/vc0/local/preprocess.sh new file mode 100755 index 000000000..3776b7f81 --- /dev/null +++ b/examples/aishell3/vc0/local/preprocess.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +input=$1 +preprocess_path=$2 +alignment=$3 +ge2e_ckpt_path=$4 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../ge2e/inference.py \ + --input=${input} \ + --output=${preprocess_path}/embed \ + --device="gpu" \ + --checkpoint_path=${ge2e_ckpt_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo "Process wav ..." 
+ python3 ${BIN_DIR}/process_wav.py \ + --input=${input}/wav \ + --output=${preprocess_path}/normalized_wav \ + --alignment=${alignment} +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + python3 ${BIN_DIR}/preprocess_transcription.py \ + --input=${input} \ + --output=${preprocess_path} +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + python3 ${BIN_DIR}/extract_mel.py \ + --input=${preprocess_path}/normalized_wav \ + --output=${preprocess_path}/mel +fi + + + + + + diff --git a/examples/aishell3/vc0/local/tacotron2/README_cn.md b/examples/aishell3/vc0/local/tacotron2/README_cn.md deleted file mode 100644 index a364994a6..000000000 --- a/examples/aishell3/vc0/local/tacotron2/README_cn.md +++ /dev/null @@ -1,112 +0,0 @@ -## Tacotron2 + AISHELL-3 数据集训练语音克隆模型 - -本实验的内容是利用 AISHELL-3 数据集和 Tacotron 2 模型进行语音克隆任务,使用的模型大体结构和论文 [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) 相同。大致步骤如下: - -1. Speaker Encoder: 我们使用了一个 Speaker Verification 任务训练一个 speaker encoder。这部分任务所用的数据集和训练 Tacotron 2 的数据集不同,因为不需要 transcription 的缘故,我们使用了较多的训练数据,可以参考实现 [ge2e](../ge2e)。 -2. Synthesizer: 然后使用训练好的 speaker encoder 为 AISHELL-3 数据集中的每个句子生成对应的 utterance embedding. 这个 Embedding 作为 Tacotron 模型中的一个额外输入和 encoder outputs 拼接在一起。 -3. Vocoder: 我们使用的声码器是 WaveFlow,参考实验 [waveflow](../waveflow). - -## 数据处理 - -### utterance embedding 的生成 - -使用训练好的 speaker encoder 为 AISHELL-3 数据集中的每个句子生成对应的 utterance embedding. 以和音频文件夹同构的方式存储。存储格式是 `.npy` 文件。 - -首先 cd 到 [ge2e](../ge2e) 文件夹。下载训练好的 [模型](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip),然后运行脚本生成每个句子的 utterance embedding. - -```bash -python inference.py --input= --output= --device="gpu" --checkpoint_path= -``` - -其中 input 是只包含音频文件夹的文件。这里可以用 `~/datasets/aishell3/train/wav`,然后 output 是用于存储 utterance embed 的文件夹,这里可以用 `~/datasets/aishell3/train/embed`。Utterance embedding 会以和音频文件夹相同的文件结构存储,格式为 `.npy`. 
- -utterance embedding 的计算可能会用几个小时的时间,请耐心等待。 - -### 音频处理 - -因为 AISHELL-3 数据集前后有一些空白,静音片段,而且语音幅值很小,所以我们需要进行空白移除和音量规范化。空白移除可以简单的使用基于音量或者能量的方法,但是效果不是很好,对于不同的句子很难取到一个一致的阈值。我们使用的是先利用 Force Aligner 进行文本和语音的对齐。然后根据对齐结果截除空白。 - -我们使用的工具是 Montreal Force Aligner 1.0. 因为 aishell 的标注包含拼音标注,所以我们提供给 Montreal Force Aligner 的是拼音 transcription 而不是汉字 transcription. 而且需要把其中的韵律标记(`$` 和 `%`)去除,并且处理成 Montreal Force Alinger 所需要的文件形式。和音频同名的文本文件,扩展名为 `.lab`. - -此外还需要准备词典文件。其中包含把拼音序列转换为 phone 序列的映射关系。在这里我们只做声母和韵母的切分,而声调则归为韵母的一部分。我们使用的[词典文件](./lexicon.txt)可以下载。 - -准备好之后运行训练和对齐。首先下载 [Montreal Force Aligner 1.0](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/tag/v1.0.1).下载之后解压即可运行。cd 到其中的 bin 文件夹运行命令,即可进行训练和对齐。前三个命令行参数分别是音频文件夹的路径,词典路径和对齐文件输出路径。可以通过`-o` 传入训练得到的模型保存路径。 - -```bash -./mfa_train_and_align \ - ~/datasets/aishell3/train/wav \ - lexicon.txt \ - ~/datasets/aishell3/train/alignment \ - -o aishell3_model \ - -v -``` - -因为训练和对齐的时间比较长。我们提供了对齐后的 [alignment 文件](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz),其中每个句子对应的文件为 `.TextGrid` 格式的文本。 - -得到了对齐文件之后,可以运行 `process_wav.py` 脚本来处理音频。 - -```bash -python process_wav.py --input= --output= --alignment= -``` - -默认 input, output, alignment 分别是 `~/datasets/aishell3/train/wav`, `~/datasets/aishell3/train/normalized_wav`, `~/datasets/aishell3/train/alignment`. - -处理结束后,会将处理好的音频保存在 `` 文件夹中。 - -### 转录文本处理 - -把文本转换成为 phone 和 tone 的形式,并存储起来。值得注意的是,这里我们的处理和用于 montreal force aligner 的不一样。我们把声调分了出来。这是一个处理方式,当然也可以只做声母和韵母的切分。 - -运行脚本处理转录文本。 - -```bash -python preprocess_transcription.py --input= --output= -``` - -默认的 input 是 `~/datasets/aishell3/train`,其中会包含 `label_train-set.txt` 文件,处理后的结果会 `metadata.yaml` 和 `metadata.pickle`. 
前者是文本格式,方便查看,后者是二进制格式,方便直接读取。 - -### mel 频谱提取 - -对处理后的音频进行 mel 频谱的提取,并且以和音频文件夹同构的方式存储,存储格式是 `.npy` 文件。 - -```python -python extract_mel.py --input= --output= -``` - -input 是处理后的音频所在的文件夹,output 是输出频谱的文件夹。 - -## 训练 - -运行脚本训练。 - -```python -python train.py --data= --output= --device="gpu" -``` - -我们的模型去掉了 tacotron2 模型中的 stop token prediction。因为实践中由于 stop token prediction 是一个正负样例比例极不平衡的问题,每个句子可能有几百帧对应负样例,只有一帧正样例,而且这个 stop token prediction 对音频静音的裁切十分敏感。我们转用 attention 的最高点到达 encoder 侧的最后一个符号为终止条件。 - -另外,为了加速模型的收敛,我们加上了 guided attention loss, 诱导 encoder-decoder 之间的 alignment 更快地呈现对角线。 - -可以使用 visualdl 查看训练过程的 log。 - -```bash -visualdl --logdir= --host=$HOSTNAME -``` - -示例 training loss / validation loss 曲线如下。 - -![train](./images/train.png) - -![valid](./images/valid.png) - -alignment-step2000 - -大约从训练 2000 步左右就从 validation 过程中产出的 alignement 中可以观察到模糊的对角线。随着训练步数增加,对角线会更加清晰。但因为 validation 也是以 teacher forcing 的方式进行的,所以要在真正的 auto regressive 合成中产出的 alignment 中观察到对角线,需要更长的时间。 - -## 预训练模型 - -预训练模型下载链接。[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip). 
- -## 使用 - -本实验包含了一个简单的使用示例,用户可以替换作为参考的声音以及文本,用训练好的模型来合成语音。使用方式参考 [notebook](./voice_cloning.ipynb) 上的使用说明。 diff --git a/examples/aishell3/vc0/local/tacotron2/voice_cloning.ipynb b/examples/aishell3/vc0/local/tacotron2/voice_cloning.ipynb deleted file mode 100644 index fc4705fc6..000000000 --- a/examples/aishell3/vc0/local/tacotron2/voice_cloning.ipynb +++ /dev/null @@ -1,383 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import paddle\n", - "from matplotlib import pyplot as plt\n", - "from IPython import display as ipd\n", - "import soundfile as sf\n", - "import librosa.display\n", - "from parakeet.utils import display\n", - "paddle.set_device(\"gpu:0\")\n", - "import sys\n", - "sys.path.append(\"../../\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 加载模型" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "vocab_phones:\n", - " Vocab(size: 68,\n", - "stoi:\n", - "OrderedDict([('', 0), ('', 1), ('', 2), ('', 3), ('$', 4), ('%', 5), ('&r', 6), ('a', 7), ('ai', 8), ('an', 9), ('ang', 10), ('ao', 11), ('b', 12), ('c', 13), ('ch', 14), ('d', 15), ('e', 16), ('ea', 17), ('ei', 18), ('en', 19), ('eng', 20), ('er', 21), ('f', 22), ('g', 23), ('h', 24), ('i', 25), ('ia', 26), ('iai', 27), ('ian', 28), ('iang', 29), ('iao', 30), ('ie', 31), ('ien', 32), ('ieng', 33), ('ii', 34), ('iii', 35), ('io', 36), ('iou', 37), ('j', 38), ('k', 39), ('l', 40), ('m', 41), ('n', 42), ('o', 43), ('ou', 44), ('p', 45), ('q', 46), ('r', 47), ('s', 48), ('sh', 49), ('t', 50), ('u', 51), ('ua', 52), ('uai', 53), ('uan', 54), ('uang', 55), ('uei', 56), ('uen', 57), ('ueng', 58), ('uo', 59), ('v', 60), ('van', 61), ('ve', 
62), ('ven', 63), ('veng', 64), ('x', 65), ('z', 66), ('zh', 67)]))\n", - "vocab_tones:\n", - " Vocab(size: 10,\n", - "stoi:\n", - "OrderedDict([('', 0), ('', 1), ('', 2), ('', 3), ('0', 4), ('1', 5), ('2', 6), ('3', 7), ('4', 8), ('5', 9)]))\n" - ] - } - ], - "source": [ - "from examples.ge2e.audio_processor import SpeakerVerificationPreprocessor\n", - "from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder\n", - "\n", - "# speaker encoder\n", - "p = SpeakerVerificationPreprocessor(\n", - " sampling_rate=16000, \n", - " audio_norm_target_dBFS=-30, \n", - " vad_window_length=30, \n", - " vad_moving_average_width=8, \n", - " vad_max_silence_length=6, \n", - " mel_window_length=25, \n", - " mel_window_step=10, \n", - " n_mels=40, \n", - " partial_n_frames=160, \n", - " min_pad_coverage=0.75, \n", - " partial_overlap_ratio=0.5)\n", - "speaker_encoder = LSTMSpeakerEncoder(n_mels=40, num_layers=3, hidden_size=256, output_size=256)\n", - "speaker_encoder_params_path = \"../../pretrained/ge2e/ge2e_ckpt_0.3/step-3000000.pdparams\"\n", - "speaker_encoder.set_state_dict(paddle.load(speaker_encoder_params_path))\n", - "speaker_encoder.eval()\n", - "\n", - "# synthesizer\n", - "from parakeet.models.tacotron2 import Tacotron2\n", - "from examples.tacotron2_aishell3.chinese_g2p import convert_sentence\n", - "from examples.tacotron2_aishell3.aishell3 import voc_phones, voc_tones\n", - "\n", - "synthesizer = Tacotron2(\n", - " vocab_size=68,\n", - " n_tones=10,\n", - " d_mels= 80,\n", - " d_encoder= 512,\n", - " encoder_conv_layers = 3,\n", - " encoder_kernel_size= 5,\n", - " d_prenet= 256,\n", - " d_attention_rnn= 1024,\n", - " d_decoder_rnn = 1024,\n", - " attention_filters = 32,\n", - " attention_kernel_size = 31,\n", - " d_attention= 128,\n", - " d_postnet = 512,\n", - " postnet_kernel_size = 5,\n", - " postnet_conv_layers = 5,\n", - " reduction_factor = 1,\n", - " p_encoder_dropout = 0.5,\n", - " p_prenet_dropout= 0.5,\n", - " p_attention_dropout= 0.1,\n", - " 
p_decoder_dropout= 0.1,\n", - " p_postnet_dropout= 0.5,\n", - " d_global_condition=256,\n", - " use_stop_token=False,\n", - ")\n", - "params_path = \"../../pretrained/tacotron2_aishell3/tacotron2_aishell3_ckpt_0.3/step-450000.pdparams\"\n", - "synthesizer.set_state_dict(paddle.load(params_path))\n", - "synthesizer.eval()\n", - "\n", - "# vocoder\n", - "from parakeet.models import ConditionalWaveFlow\n", - "vocoder = ConditionalWaveFlow(upsample_factors=[16, 16], n_flows=8, n_layers=8, n_group=16, channels=128, n_mels=80, kernel_size=[3, 3])\n", - "params_path = \"../../pretrained/waveflow/waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams\"\n", - "vocoder.set_state_dict(paddle.load(params_path))\n", - "vocoder.eval()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 生成 speaker encoding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "首先在当前文件夹下新建文件夹 `ref_audio`,把要作为参考的音频存在在这个文件夹中。格式要求是 wav 格式,采样率会被重采样至 16kHz." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ref_name = \"女声2.wav\"\n", - "ref_audio_path = f\"./ref_audio/{ref_name}\"\n", - "ipd.Audio(ref_audio_path, normalize=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mel_sequences: (2, 160, 40)\n", - "embed shape: [256]\n" - ] - } - ], - "source": [ - "mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path))\n", - "print(\"mel_sequences: \", mel_sequences.shape)\n", - "with paddle.no_grad():\n", - " embed = speaker_encoder.embed_utterance(paddle.to_tensor(mel_sequences))\n", - "print(\"embed shape: \", embed.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 
合成频谱" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "因为 AISHELL-3 数据集中使用 `%` 和 `$` 表示韵律词和韵律短语的边界,它们大约对应着较短和较长的停顿,在文本中可以使用 `%` 和 `$` 来调节韵律。\n", - "\n", - "值得的注意的是,句子的有效字符集仅包含汉字和 `%`, `$`, 因此输入的句子只能包含这些字符。" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['m', 'ei', 'd', 'ang', 'n', 'i', 'j', 've', 'd', 'e', '%', 'x', 'iang', 'iao', 'p', 'i', 'p', 'ieng', 'sh', 'en', 'm', 'e', 'r', 'en', 'd', 'e', 'sh', 'iii', 'h', 'ou', '$', 'n', 'i', 'q', 'ie', 'iao', 'j', 'i', 'zh', 'e', '%', 'zh', 'e', 'g', 'e', 'sh', 'iii', 'j', 'ie', 'sh', 'ang', 'd', 'e', 'r', 'en', '%', 'b', 'ieng', 'f', 'ei', 'd', 'ou', 'j', 'v', 'b', 'ei', 'n', 'i', 'b', 'ieng', 'iou', 'd', 'e', 't', 'iao', 'j', 'ian', '$']\n", - "['0', '3', '0', '1', '0', '3', '0', '2', '0', '5', '0', '0', '3', '4', '0', '1', '0', '2', '0', '2', '0', '5', '0', '2', '0', '5', '0', '2', '0', '4', '0', '0', '3', '0', '4', '4', '0', '4', '0', '5', '0', '0', '4', '0', '4', '0', '4', '0', '4', '0', '4', '0', '5', '0', '2', '0', '0', '4', '0', '1', '0', '1', '0', '4', '0', '4', '0', '3', '0', '3', '3', '0', '5', '0', '2', '0', '4', '0']\n" - ] - } - ], - "source": [ - "sentence = \"每当你觉得%想要批评什么人的时候$你切要记着%这个世界上的人%并非都具备你禀有的条件$\"\n", - "phones, tones = convert_sentence(sentence)\n", - "print(phones)\n", - "print(tones)\n", - "\n", - "phones = np.array([voc_phones.lookup(item) for item in phones], dtype=np.int64)\n", - "tones = np.array([voc_tones.lookup(item) for item in tones], dtype=np.int64)\n", - "\n", - "phones = paddle.to_tensor(phones).unsqueeze(0)\n", - "tones = paddle.to_tensor(tones).unsqueeze(0)\n", - "utterance_embeds = paddle.unsqueeze(embed, 0)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 73%|███████▎ | 733/1000 [00:02<00:01, 255.71it/s]\n" - ] - }, - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "content exhausted!\n" - ] - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAEYCAYAAAB2qXBEAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3deZgdZZn+8e99Tm/pzr6QBAIkSIiAQoAMu4iyowPq8HPAZSIyZhwVdXAB1HHUmXHihrigGAGJjooMikRFAUFUZEvYdwgBTJAkkIXsvZzz/P6o6uSk6eUk3adPdff9ua66uqpOLU+nkzz9vvW+TykiMDMz21m5agdgZmYDmxOJmZn1ihOJmZn1ihOJmZn1ihOJmZn1Sk21AyhHneqjgaZqh2FmQ5WA9gGuw4eR260VgD3q1lCrHEIE20bAPre0jZdWF9QXtz7pDU2xanWh7OPvebD5hog4uS/uXa4BkUgaaOIwHVftMMxsKMrlUU5EWxsAMXMmDV9cTo0KfHPqtUzON5JXjkIUt55y+MnL+uz2q1YXuPuGPco+Pj/5qfF9dvMyDYhEYmY2VAVQpNjjcdXkRGJmVkK1dURrC6qtg5zIjR6FJL585y/Zu7aGuS+1cON/HwMRzLnhVArr10MEaFtP1lPFNX0YUWzX2skiJxIzswxLWiTZrkDiRGJmlmFB0BrlP2yvBicSMxu6SrqjcsOHs/SDr+WYf7iXr06+m1YKFCN4ri1PCzk+cdjpFFasBInhWghRpFBaq7CCdQvdIjEzs50WQMGJxMzMesMtEjOzaku7sHLDhxNbmlFDPRSLsPce6K/LufbhG1lRaOb1N83gifP35623tkIUO3RXrYRcPlkt9t8zi4Dtu9AyyInEzCzjsj3414nEzCzTgvAzEjOz/qb6elRTk5Q1KRTIjRuLcjmor+N9N93CpPzL7FqzmZ+tO5ATmh7ltCmHg3LMyD8IUSS6mgDYj11aWwUUsp1HnEjMzLIsmZCYbU4kZmaZJgr0SSHhinEiMbOBL63Qq7o6KBbJjRtLtLTCli1QKMDYUUShyD/96ha+f/wbKK5eS3HDBlRTy+/bDk9GZ0X6JKIa3VfdCKDori0zM+sNt0jMzGynBdAa2X6ZrROJmQ1Iqq0j1zQM8nkYM4porGfL5OGs2r+Ot87+I8cMf5xd8+sZkSvy/TWHUYgcP3j1VJR/Yes1olBA+fzWl1ZlrVsL2kukuEViZmY7KRAFst0iqVh0kmZIur9kWSfpo5LGSrpJ0lPp1zGVisHMbDAohspeqqFiLZKIeAKYCSApDzwPXAtcANwcEXMlXZBun1+pOMxsAEtrZKmmFtXVUty0CZRjy6mH8NLZG/nygT9nr5rVPNG6C8tbR3Hb2un8dck0Fr11Onc+27CtXlY6qoto29aNBclIrYxP0hgIXVv91V46Dng6Ip4DTgfmp/vnA2/ppxjMzAYgUYhc2Us19NczkjOBn6brEyOi/WnXcmBiZydImgPMAWigseIBmpllUTKzPdvPSCqeSCTVAacBF3b8LCJCUqdTbSJiHjAPYKTGZnw6jpn1pY4jsqivo2WXJuruf4Yv3vs7vvrCCDbM35dLPnAMhZUvlpz5MntzP20dy64Xs9+F1Z2sd231R4vkFODeiFiRbq+QNDkiXpA0GVjZDzGYmQ1IEapal1W5+iO6s9jWrQWwAJidrs8GruuHGMzMBqwiKnuphoq2SCQ1AScA/1Kyey5wtaRzgOeAt1cyBjMbIHJ5ck2NxIw9eeb0EdRuEM0HbuKCg37Hlqhl0bqpfHzSjXzi706jsGo1E/L3UGhr7fAWw8EnGbWV7RZJRRNJRGwExnXYt4pkFJeZmfUgEK2R7bnj2
Y7OzMwoVGmiYbmcSMysOiRUU0tuWANqaiTGjmLjXqP469sKDHtaNI8LRt46jJ+/fx+KmzYRxU2cx9EQL4FyxBDo1oKBUSLFicTMLOOKGR+15URiZpZhQ/5hu5lZO9XXkx8zmsKqNUShwF8/exjnn3kNo/ObmJBfx1Mtk7jqb3/Hrt+dwuh7/kas30hh1WoK7fWypG1dWZG9cu+VEsjPSMzMrHeGfIkUMzPbeRFkfma7E4mZVUyuoYEoFJO6WZN3SdYLRZ59/3S2TGrj6rcdi17eQNsLK5KS7/yN4fE8bRIot/0bC4fACK3OVW/GermcSMzMMixwi8TMzHrJo7bMbMhoL/9eeHkdyud5+sp9+Ooh1zC99iX+vHlvXmodwUkjHuKMmz/AjPffT6HQYfSVckA6SmsIjczqTlC9V+iWy4nEzCzDAlxry8zMekN+sZWZDULtkwPT0VXK51E+R27XSaw7cCJ/vuR7NEcrh3/xUL53wfEUlv2NKBQggj9yKPuwiCidYNjO3VmvELhEipmZ9VLWWyTZTnNmZkNchChGruylJ5JOlvSEpMWSLujk8z0k/UHSfZIelHRqT9d0i8TMepbLo3yeXNMwihs2EsVg6acO42Pv+gUnNC2mUaI1gps2TWXes8dw0q4zAdgldxdtpbWySg3ZCYY7rq/mkUjKA5eQvLl2GbBQ0oKIeLTksM8AV0fEdyXtB1wPTO3uum6RmJllWNCn72w/FFgcEUsiogW4Cji9k1uOTNdHAX/r6aJukZiZZZp2tEUyXtKiku15ETEvXd8NWFry2TLgsA7nfw64UdK5QBNwfE83dCIxsy7lx41NurWGNxL5HNTWkHtxNY3Xis2PtXLNmW/gmiXTKW7clNTKUo6meGbbBdq7tcBdWTspGbW1Qw/bX4qIWb245VnAlRHxNUlHAD+S9JqIKHZ1ghOJmVnG9WGJlOeB3Uu2p6T7Sp0DnAwQEXdIagDGAyu7umhFn5FIGi3pGkmPS3pM0hGSxkq6SdJT6dcxlYzBzGwgay+RUu7Sg4XAdEnTJNUBZwILOhzzV+A4AEn7Ag3Ai91dtNItkm8Av4uIM9KgG4FPATdHxNx06NkFwPkVjsPMepLLkxvWgBrqoRgU16/nxfnj+dyrF7CxWM+mYj0vto3ghKZHOfdjH2bGr+6j2Na6rctKQjmBaonWlmSfu7N6LQJaI99H14o2SR8CbgDywBUR8YikLwCLImIB8DHg+5L+jaRn7T0R3f8gK5ZIJI0CjgHek34DLUCLpNOBY9PD5gO34kRiZtalvizaGBHXkwzpLd332ZL1R4GjduSalWyRTCNpDv1A0oHAPcBHgIkR8UJ6zHJgYmcnS5oDzAFooLGCYZqZZVfStZXtmRqVTCQ1wMHAuRFxl6RvkHRjbRURIanTJlM6XG0ewEiNdfvYrFIkcsOGoalTaJ48gr+eUMdV//gN9q2FE86bxSU3H0lxw0YkEW1t/L7tMBp1N+Tz23ddRRBtbVX7NgazoVwiZRmwLCLuSrevIUksKyRNBki/djkSwMxsqGsf/ttHD9sromKJJCKWA0slzUh3HQc8SjJCYHa6bzZwXaViMDMb+Pq21lYlVHrU1rnAj9MRW0uAs0mS19WSzgGeA95e4RjMrKNcHooFVn7wSC77+MUA3L5pOgvX7cnzN76Gz+x/LMVNmxhRs4hCWxtIhLuxqqaM0idVVdFEEhH3A53NsDyukvc1MxssIqDgV+2amVlvDOVRW2Zm1kvtM9uzzInEbLAreR1ubtQIaGll/fH7csu3vsMBt7+Wzx7x98SWLcSWZorN69mTOyimz0OikL761jPUq2pIPyMxM7Pe2Ynqv/3OicTMLMtCtBX7ptZWpTiRmA02Esrnk24p5VBO5MaNpbjrBJYfPpL8m1axdl0zb54yiz30KG3FtPsql9/aBeaii9nR/obELHMiMTPLOHdtmZnZTvMzEjOrPAnV1JIb3oQah7F5/11ZcUgd+576JN/a8
5fUSjzcMoLbN07n8geOZMJVY5l+3xoKEWzXY5J2cUV7V5dlhhOJmZntNM8jMTOzXvPDdjPrc/mRI5P3gaQTBtU4jJePnsryw3IUxreQX1XkpbnTOPvGNxJtrclIrGKwNw9CFNnaeeVurOwLd22ZmVkv+GG7mZn1mhOJmfUJ1dSQa2xETY1snLk7dS+3sOKTLcw/8EpWFRv59BNvpe72XZj8W2h47FmKa9YSUdz+Iu7KGnD8sN3MzHqt4DLyZma2s2KwPGyXNAk4lOS5z8L0fexmVmG5hgaiUIScyA1rIPaYzKbdRrB+jxoOOPtx1n1rfy74f68j2loZnX+WUcVnUE4USEvAt5eD92txB7TIeCLpsb0k6Z+Bu4G3AWcAd0p6b6UDMzMzIH1GUu5SDeW0SD4BHBQRqwAkjQNuB66oZGBmZpbIeouknESyClhfsr0+3WdmfSjX1ITq6iisWQPA8n87kuPfdSfn7/JnahEvFoNr183k9ytfTcsNu/PC65oZHYuIYoByyddigY4DtWxgGyzzSBYDd0m6juR7Oh14UNJ5ABFxUVcnSnqWJPEUgLaImCVpLPAzYCrwLPD2iFjTi+/BzGzwiuy/FqacMWVPA78kSSIA1wHPACPSpSdviIiZETEr3b4AuDkipgM3p9tmZtaFIip7qYYeWyQR8XkASY0RsakP7nk6cGy6Ph+4FTi/D65rNjBIya+YuTz5cWNRXS3RNIw1syaw8pRmnnzjzeSVY8afD+ChDx/AP92V227UVY6l7MpSQgLlPMlwkAuy/4yknFFbR0h6FHg83T5Q0nfKvH4AN0q6R9KcdN/EiHghXV8OTOzivnMkLZK0qJXmMm9nZjbYDI5RWxcDJwELACLiAUnHlHn9oyPieUm7ADdJerz0w4gISZ32/kXEPGAewEiNzXgPoZlZ5WT9GUlZExIjYqm0XaYrqy0dEc+nX1dKupZkUuMKSZMj4gVJk4GVOxiz2cCSdmWppiYZWZUOq8rvtQfLvtrAp/b9HVuilkuXHMPI63fh1HcfDMBUPZSOxurin1sEhLu1BrsIKBazXSKlnOiWSjoSCEm1kj4OPNbTSZKaJI1oXwdOBB4madnMTg+bTfLw3szMujAYurbeD3wD2A14HrgR+EAZ500Erk1bMjXATyLid5IWAldLOgd4Dnj7zgRuZjZUDIaurRkR8c7SHZKOAv7S3UkRsQQ4sJP9q4DjdiRIs4FKtXXkJ+1Cce3LAGw8YT/a6sWtX/0Wv964hK/9+zv44e8PIjZuYtSWxaCnt7+AR2QZg2DUFvCtMveZmVkfC0RE+Us1dNkikXQEcCQwoX0We2okkK90YGZmlsh4z1a3XVt1wPD0mNIZ7OtIqgCbWQnV1CSjrFpb0h1iy4kHMu3fH+ffJt3ElsjzheeG01rIc9ruhwMwMreIQiHtvsrlt+/KynrHuPWP6NuuLUknkzz3zgOXRcTcTo55O/C55O48EBHv6O6aXSaSiPgj8EdJV0bEc+nFc8DwiFi309+FmZntmD76nUJSHrgEOAFYBiyUtCAiHi05ZjpwIXBURKxJ5wF2q5xnJP8jaWQ6hPdh4FFJn9ip78LMzHZYHz4jORRYHBFLIqIFuIqkbFWp9wGXtBfTjYge5/qVM2prv4hYJ+mdwG9JiizeA3yljHPNBiXV1kFORGsbyufJjRzO5ll78fzZLTz2uivJK0chirz72dEsn7Mb5z95LBFBtCzf1mUlEcXctm1PLrQu7GAv53hJi0q256WVQiCZxrG05LNlwGEdzt8HQNJfSLq/PhcRv+vuhuUkklpJtcBbgG9HRGtXZU3MzKxv7UTRxpdKqq3vjBpgOklx3SnAnyS9NiLWdnVCOV1b3yN5b0hTesE9SR64m5lZpQUQKn/p3vPA7iXbU9J9pZYBCyKiNSKeAZ4kSSxdKqeM/DeBb5bsek7SG3o6z2ywUU0N5PPkhjcRm7fQdvA+nPn933H4sGfYEnm+9reTWHPVvpx65
sHbTsqtQ/mNRKGAcmnZ9/YuLNfKsjL14VsvFwLTJU0jSSBnAh1HZP0SOAv4gaTxJF1dS7q7aDll5CdKulzSb9Pt/dhWK8vMzCqq7yYkRkQb8CHgBpKaiVdHxCOSviDptPSwG4BV6etD/gB8Iq1I0qVynpFcCfwA+HS6/STJq3IvL+NcMzPrrT58Kh0R1wPXd9j32ZL1AM5Ll7KUk0jGR8TVki5Mb9Imye1xGzLyY8bA5AnQ2gY1edbvO5YP/s/V3LdpEz8/4/X8fOk+FJubobiRiYW7tj+5WEj+DygW+rJ7woaSPp6QWAnlJJKNksaR5kRJhwMvVzQqMzPbJuPjZMtJJOeRvEPkVem44gm4RIqZWT8a4C2SiLhX0uuBGSTfzRMR0VrxyMwyID9jbw7+2RNMqXuShlwrhchx2bNH8aNjD6Nt+QrQ4u2H1HQ2c8yl4K23BnqLJK3NciowNT3+RElExEUVjs3MzGDgJxLgV8AW4CHAjwvNzPpT+4TEDCsnkUyJiAMqHolZNSWvhCbX2Ehu9Cie/4ep3H/Bd7hq/XP88K0nsOjp56BQIAoFRupZ2qLoCYXWb7L+RoFySqT8VtKJFY/EzMw6FzuwVEE5LZI7gWvTd5G0kjxwj4gYWdHIzMwMABUHftfWRcARwEPpjEezwSOXhyiimlrICdXWsPLEPWkeDSftOjP5nKeTkVnpX3/l28u/u1vL+kEVWxrlKqdraynw8M4mEUl5SfdJ+nW6PU3SXZIWS/qZpLqdua6Z2dCwA5V/q/RQvpwWyRLg1rRoY3P7zh0Y/vsRkuJg7V1hXwK+HhFXSboUOAf4bvkhm5kNMRlvkZSTSJ5Jl7p0KZukKcCbgP8GzpMk4I1sK1s8n+QF804k1j9yeVRbg2pqKG7cyOIfHsDlR8xnv7r11CKeaK3nXXdOY9r30t/sigVUX0+0tgFJ91a0tVX1W7AhaKAnkoj4fC+ufzHwSWBEuj0OWJuWMobkBSq79eL6ZmaD30BNJJIujoiPSvoVnXwbEXFaJ6eVnv9mYGVE3CPp2B0NTNIcYA5AA407erqZ2eAwwCck/ij9+tWdvPZRwGmSTgUaSJ6RfAMYLakmbZV09ppHANKX1c8DGKmxGc/HljnpBEMiUG0dqqslN3YMMayewtgmVv/7Zq577Q84+aIj+cqn30Tb0r8lNbEkXqWHttXPyuWJ5uau72PWD5Tx/wG7HLUVEfekqzMj4o+lCzCzpwtHxIURMSUippK8zvGWiHgnyRu32qsHzwau69V3YGY22GV8QmI5w387e63ue3pxz/NJHrwvJnlm4jctmpkNYN09IzmLZHTVNEkLSj4aAazekZtExK3Aren6EuDQHQ3UbDvSKwsQte/L5VE+T7S2APDkxQfxsTdcz6vqFrG20MjDm6fw+68exXv/4Xgmt91NW+korAigCEonHLoEvGVA1ru2untGcjvwAjAe+FrJ/vXAg5UMyszMSgzUh+0R8RzwHEl5FDMzq4a0kZxl5UxINMsM1dah2hqQKG7egnIiCkn3k/J5VF+PGhspTJvEr6+9klrl2fd7R/KbM44glvyVYkvycs9RcReRz3d+E5eHt4wZyF1bZmaWBRlPJN2O2koLLv64v4IxM7NOZHz4b7ctkogoSNpTUl1EtPRXUGbtck1NUCigpkbI5SlOnUTLiDpyrUVe+vhmrj/oMsbm68mRozUKbIhWlrbVsmDdQbx5t0MA2LP+HgotLUmXVcloL9fMsoFAMTi6tpYAf0mHAG9s37kD1X/NzKw3BuqorRJPp0uObcUXzcysvwz0Fkl79V9JjRGxqfIh2ZDVYZLhyg8cyYnvu505Y29jVbGeJrXx6/UHsGTzePIKVl92EO/9x+MpbtnyiusonweSrqvtamX5JZ82AGW9a6vHEimSjpD0KPB4un2gpO9UPDIzM0tk/GF7ObW2LgZOAlYBRMQDwDGVDMrMzFKx7YF7OUs1lDWPJCKWSts97PFsLes9iVx9P
Wqoh/p6Wl+9Gx+7/Cec3NhMIYrMvHs/7v+X1/KhhxuhWKS4tYsq6coaH3d0PuHXbzG0wSbjXVvlJJKlko4EQlIt297BbmZm/UAZL5FSTtfW+4EPkrwS93mSd5F8sJJBmZnZwFHOqK2XgHf2Qyw2RKi2jvwu41l65lROedftnDhyIbUqMH/l0Xzz5FP5+uJnANit5kkAiu6msqFuoHZtSfoW3YQfER+uSERmZrbNAJjZ3l3X1iLgHpL3rR8MPJUuM4G6yodmZmZA5of/dvc+kvkAkv4VODoi2tLtS4E/9094NihIoByqrSE/aRee/MAUirtuodjSwv3/egAPPT4qKQmfL1Lc9MzW0zzyyiyV8RZJOaO2xgAj2fZ63eHpPjMzqzAxsLu22s0F7pN0paT5wL3AFysblpmZbdWHXVuSTpb0hKTFki7o5rh/kBSSZvV0zXJGbf1A0m+Bw9Iwz4+I5T2Ha0NSe72s9npXyqG6WlRXh0YOZ+P+kxi+VIz8cy2NtzxMcfPmZHarclDM+K9dZtXQhw/bJeWBS4ATgGXAQkkLIuLRDseNIJkzeFc51y2nRQJwKPA6ktIof1du0GZm1gf6rkVyKLA4Ipak75i6Cji9k+P+E/gS7WUkelBO0ca5JJnp0XT5sKQeu7YkNUi6W9IDkh6R1F5FeJqku9Jm1c8keQSYmVl3+i6R7AYsLdlelu7bStLBwO4R8ZtywyvnYfupwMyIKKY3mQ/cB3yqh/OagTdGxIa0tMptaRfZecDXI+KqdATYOcB3yw3YMkhCdXXkhjXAhHFowya27Lsby95Qx0mnLOK9424hp+DZ1rF88alTmfSpOnLrNlEoFJIuLYBigSi6hJtZZ3awa2u8pEUl2/MiYl5Z95FywEXAe3bkhmUVbQRGs23U1qhyToiIADakm7XpEsAbgXek++cDn8OJxMyscwGdVyft0ksR0dUD8ueB3Uu2p6T72o0AXgPcmhbqnQQskHRaRJQmp+2Uk0j+h2TU1h9IRqIdA3T5pL9U+mDnHmBvkgc8TwNr2+ek0EmzquTcOcAcgAYay7mdmdmg1IfDfxcC0yVNI0kgZ7LtF3si4mVg/Nb7SrcCH+8uiUB5o7Z+ml6s/SF72aO2IqIAzJQ0GrgWeHU556XnzgPmAYzUWA/nyRqJ/PjxFPeciFraWHHUGF4+agunzHiUE0c/xC3r9uOZOw/h0U+8lgvvrqO4eQtEkVF6hmIUk1+w/LZCs/L00T+ViGiT9CHgBiAPXBERj0j6ArAoIhbszHV7TCSS3grc0n4DSaMlvSUifrkDwa9NWzRHAKMl1aStko7NKjMz66AvJyRGxPXA9R32fbaLY48t55rlDP/9j7S5037htcB/9HSSpAlpSwRJw0jGLT8G/AE4Iz1sNnBdOYGamQ1ZA7XWVonOkk05500G5qfPSXLA1RHx6/T971dJ+i+S0V+Xlx2t9b9cHuVEFAqv6Ipacu7evPr1Szhi7BIuf+QIRt42nCVfmMKly+spbtrE9OIiiCLF9vMkiKK7tMx2RBUTRLnKSQiLJF1E8rAckpda3dPTSRHxIHBQJ/uXkEyKMTOzHihdsqycrq1zgRbgZ+nSjN+QaGbWfwZ611ZEbKTM4b428OVHjtw6wgrlyDUNI1rb2PDzidx2wC+2HtcaBfb78eG0vG84ty6bwF6tjxNtrUndrO26skr+ZrtLy2ynZL36bzmjtvYBPg5MLT0+It5YubDMzGyrgZ5IgP8DLgUuA1zDwsysvw2CRNIWES5hMlhJ5IYNQ/X1EEWitY0nLz6IP/3916iTeLGQI0cw+3Mf46STZ2536qtqFlJoL/3e2Wgsd2WZ9V6AdqxESr8rJ5H8StIHSGamN7fvjIjVXZ9iZmZ9ZcA/IyGZNAjwiZJ9AezV9+GYmdkrDPREEhHT+iMQ6z+qryc3rIHY0gz5PNpjV1bNGs/opzbyyR//mA/NP5D37XM8xZbWreeMiTtfcZ1oa3vFPjPre1lvk
XQ5j0TSJ0vW/1+Hz/zOdjOz/rAjc0iqlHC6m5B4Zsn6hR0+O7kCsZiZWWcynki669pSF+udbdsAoPp6ciOG0zZjd/52WCP/8f7/ZXrdShasm8mPHjuUGXs8y1f3/zv2bLubYqFkpLdyHoFlViUi+11b3SWS6GK9s20zM6uUjP+P210iOVDSOpKEOCxdJ91uqHhkZmYGgDLeI9BlIomIfH8GYn1HNTXkRo2kuG4D5IQk1FBP88F7s+LQemqPWM365+u44rhjKKx8EUnsxZOsjCBa05FY29XIckEDs6oZJGXkzcysigbyMxIzM8uAwVAixQYCiVxjIxreRNtek1lxSBMbj97A74/8Dg0S64vBl1ecwDMLD2Cfz+WZ9OQjFJqbiWJALh2EpxwU3Y1lljlukZiZ2U4Ld22ZmVlvOZFYpeXHj0O1tbROm8jqfRvZNEm0jCnSdMdw5sw+jmhpSQ5UC9NZlPydzGlrrazIeP+r2VA20CckmplZFmR8Hkl3tbZ6RdLukv4g6VFJj0j6SLp/rKSbJD2Vfh1TqRjMzAYDRflLNVSyRdIGfCwi7pU0ArhH0k3Ae4CbI2KupAuAC4DzKxjH4CGBcqikW2rZhUdyyT9fSoNa+fnaWVz3xAHU39PEjO8sp+2Z54guJha6O8tsgBgAExIr1iKJiBci4t50fT3wGLAbcDowPz1sPvCWSsVgZjYYqFj+Ug398oxE0lTgIOAuYGJEvJB+tByY2MU5c4A5AA00Vj5IM7OsyniLpOKJRNJw4OfARyNinbStAn1EhNR5r15EzAPmAYzU2Iz/MVZObsQIJBERaNeJrD1oPK3vWs2dB11FXjn2uvEQvjTzKIobNkAE0/QgAG0ZfzhnZuXL+qitinVtAUiqJUkiP46IX6S7V0ianH4+GVhZyRjMzAa0IBm1Ve5SBZUctSXgcuCxiLio5KMFwOx0fTZwXaViMDMbDIbyM5KjgHcDD0m6P933KWAucLWkc4DngLdXMAYzswFtSE9IjIjb6PqVvMdV6r4DmkSuvh6AYnMzAIs/8xredPxC3jZmEU81T+Lqv81i1U9259Q3HwzA9Nz9FKOYNGmlzE9cMrMdVMUuq3J5ZruZWcYN2RaJmZn1EScS60yuoQFqawFQXS2qrQWJTQdMoTAsx/9+8yIm54dxwLwjePz9+zJ3yUiKGzZCy/OM4/ltF2rv1oLMN3/NbOe4RWJmZjsvgGK2M4kTiZlZ1p13t04AAA9LSURBVGU7jziRVEUuD0Bx/XryY8bw5Lf24L2vuYMZDS/w0xWH0lKs4V/2P4Xi+vVMbbiXYksrhdIqi9sVYsz43zAz67W+7NqSdDLwDSAPXBYRczt8fh7wzySFd18E3hsRz3V3zYrObDczsz7QRzPbJeWBS4BTgP2AsyTt1+Gw+4BZEXEAcA3w5Z7CcyIxM8u4PnwfyaHA4ohYEhEtwFUkFdm3iog/RMSmdPNOYEpPF3XXVl9LJxWqqZHiy+uIYmx7+YdyKJ8nN3Y0e/9mDZ+Z+Beeah3GuV/+ILd9/rXctnJXonkTxS3Nr3xhiLuwzIYkBajvHrbvBiwt2V4GHNbN8ecAv+3pok4kZmZZt2M1tMZLWlSyPS+tpr5DJL0LmAW8vqdjnUjMzDJOO9Yj8VJEzOris+eB3Uu2p6T7tr+fdDzwaeD1EdHc0w2dSHpBNTXkRo1Malzl86imhhjZROu4Jop1ef7+WzfzthEPMzZXQ71qaY0CrRR4uKWeL5w5m3ff3QoSE3Q3hWKh03sUt2zp5+/KzDKlb1+1uxCYLmkaSQI5E3hH6QGSDgK+B5wcEWW95sOJxMws0/quaGNEtEn6EHADyfDfKyLiEUlfABZFxALgK8Bw4P/SFxH+NSJO6+66TiRmZhnXl/NIIuJ64PoO+z5bsn78jl7TiWQn5Robee6He3HVrMuopUgRsTFqeLZ1PAs37MVzm8byu9lHc8MTu1PcuGn7kVs5IR5LWqsREJ13a5mZAZkft
elEYmaWZVG9Nx+Wy4nEzCzr3CIZBCRq9pjCihOmcMfnv02t8qwsbOSU//o4nzzrGKKtNTmu/S2FAFoLxVWvHGwRhVfMNTQz61a284gTiZlZ1u3gPJJ+50RiZpZ1TiQDm2qSP6JY8zITf1vgzZfP2tqFNV53Ex0nEm59W6FHYplZ7ykCFbKdSCpW/VfSFZJWSnq4ZN9YSTdJeir9OqZS9zczGzT6qIx8pVSyjPyVwMkd9l0A3BwR04Gb020zM+vOUE0kEfEnYHWH3acD89P1+cBbKnX/nSaRHzeWmkkTqdlzd576wWt50wMvcuRtK3nsfyaVdF0FqnXPoJlVWJBU/y13qYL+/p9wYkS8kK4vByZ2daCkOcAcgAYa+yE0M7Ns8qitLkRESF1XkEnr588DGKmx2f5TNDOrpIwnkv5+1e4KSZMB0q9llSjuF7k8+dGjqJmyG0yawObXTOHcm29k+D3DuP4dR3L76yezz3sf3O6UaO6xTL+ZWS/twPORwfaMpAsLgNnp+mzgun6+v5nZwBIM3UQi6afAHcAMScsknQPMBU6Q9BRwfLptZmbdGaoP2yPirC4+Oq5S99whufzW0u75cWN57MvT+K+jrmW3mjXctelV3LF6Ly7efya76l6KLe21tFwky8z6nx+2m5lZ7ziRmJnZTouAQrZ7Q4ZGIpHINTYmEwjr61FtLYUJo9k8pYnvf/vrvFgYxvsv/RA/+fyRtP11WZr9VwCZr95sZkOBWyRmZtYrTiRmZrbTAig6kVSNauvIjRyORo2kdeIonjlX/Och1zGp5mXu37IHi16eykcP/ntiSzNT4n7aNm/OfOY3s6EmMj9idFAnEjOzQSHjv+A6kZiZZZm7tqpASku815HfdSLNe03gufcVOWn6Y7x0xcH86LxDKaxZC4UCUVwPHd9waGaWNW6RmJlZrziRmJnZzqteMcZyDZ5Eksvz1MWzeMcxt3P2mDuoF9zbvAtXrTyMjZfM4OlfiAnNd9KW8R+Imdl2Aih61JaZmfVGxn8BdiIxM8s019qqjFyeXFMj0dJCfvJECuNHcs0vL+OQ+Ydy10dmce/DE4iNm5Ly77GWkXFntcr0m5n1TkB4QqKZmfWK55GYmVmv+BlJ3ym+7iBeeu0w2k5Yy5Uzr2T3mlYeahnJ4827csbUo9mr5j6Kzc0UMv6HbmZWtgiP2jIzs17K+C/HTiRmZhkXbpH0nnI5CscezObz16KWTdTcOI5/P/sEihs3QxSJYkCxjWhrq3aoZmZ9LPsz23PVuKmkkyU9IWmxpAuqEYOZ2YDQXv233KUK+r1FIikPXAKcACwDFkpaEBGP9ncsZmYDgueRvMKhwOKIWAIg6SrgdKDLRDLtNet43cV3cuc/HcjwBx4DwMXfzWwoCEi67/uIpJOBbwB54LKImNvh83rgh8AhwCrgHyPi2e6uWY2urd2ApSXby9J925E0R9IiSYtWrcp2NjYzq5gIolAoe+lOSY/QKcB+wFmS9utw2DnAmojYG/g68KWeQqzKM5JyRMS8iJgVEbPGjctsmGZmlRfF8pfube0RiogWoL1HqNTpwPx0/RrgOEnq7qLV6Np6Hti9ZHtKuq9LDzzYuuGBAxY8AQsqGlgvjQdeqnYQPXCMfcMx9o3BHOOefRXAetbc8Pu4ZvwOnNIgaVHJ9ryImJeud9YjdFiH87ceExFtkl4GxtHNn0M1EslCYLqkaSQJ5EzgHT2c80REzKp4ZL0gaZFj7D3H2DccY9/IQowRcXI171+Ofu8ziog24EPADcBjwNUR8Uh/x2FmNgSV0yO09RhJNcAokofuXarKhMSIuB64vhr3NjMbwsrpEVoAzAbuAM4AbonofkbkgJjZDszr+ZCqc4x9wzH2DcfYNwZCjGVLn3m09wjlgSsi4hFJXwAWRcQC4HLgR5IWA6tJkk231EOiMTMz65bH1ZqZWa84kZiZWa9kPpFkpcCjpCskrZT0cMm+sZJukvRU+nVMul+SvpnG/
KCkg/shvt0l/UHSo5IekfSRrMWY3rdB0t2SHkjj/Hy6f5qku9J4fiapLt1fn24vTj+f2k9x5iXdJ+nXWYwvvfezkh6SdH/7vIEM/rxHS7pG0uOSHpN0RJZilDQj/fNrX9ZJ+miWYhwQIiKzC8nDoKeBvYA64AFgvyrFcgxwMPBwyb4vAxek6xcAX0rXTwV+Cwg4HLirH+KbDBycro8AniQpgZCZGNP7ChiertcCd6X3vxo4M91/KfCv6foHgEvT9TOBn/VTnOcBPwF+nW5nKr70fs8C4zvsy9rPez7wz+l6HTA6azGWxJoHlpNMJsxkjFldqh5ADz/YI4AbSrYvBC6sYjxTOySSJ4DJ6fpkkomTAN8DzursuH6M9TqSCstZjrERuJdkZu1LQE3HnzvJ6JIj0vWa9DhVOK4pwM3AG4Ffp/9pZCa+kjg7SySZ+XmTzD94puOfR5Zi7BDXicBfshxjVpesd22VVeCxiiZGxAvp+nJgYrpe1bjT7pWDSH7bz1yMabfR/cBK4CaSVufaSCardoxlu3INQHu5hkq6GPgk0F64aFzG4msXwI2S7pE0J92XpZ/3NOBF4AdpN+FlkpoyFmOpM4GfputZjTGTsp5IBoxIfj2p+lhqScOBnwMfjYh1pZ9lJcaIKETETJLf/A8FXl3lkLaS9GZgZUTcU+1YynB0RBxMUsn1g5KOKf0wAz/vGpLu4O9GxEHARpJuoq0yECMA6TOv04D/6/hZVmLMsqwnkh0u8NjPVkiaDJB+XZnur0rckmpJksiPI+IXWYyxVESsBf5A0lU0Wkk5ho6x7HC5hl46CjhN0rMklVHfSPLuhqzEt1VEPJ9+XQlcS5KUs/TzXgYsi4i70u1rSBJLlmJsdwpwb0SsSLezGGNmZT2RbJ3On/7GcCbZKgHcXkqA9Ot1Jfv/KR3hcTjwckkzuSIkiWRG6mMRcVEWY0zjnCBpdLo+jOQ5zmMkCeWMLuJsj7+scg29EREXRsSUiJhK8vftloh4Z1biayepSdKI9nWS/v2HydDPOyKWA0slzUh3HUfyArvMxFjiLLZ1a7XHkrUYs6vaD2l6WkhGSTxJ0o/+6SrG8VPgBaCV5Detc0j6wm8GngJ+D4xNjxXJy2OeBh4CZvVDfEeTNL8fBO5Pl1OzFGN63wOA+9I4HwY+m+7fC7gbWEzSvVCf7m9Itxenn+/Vjz/zY9k2aitT8aXxPJAuj7T/28jgz3smsCj9ef8SGJPBGJtIWpGjSvZlKsasLy6RYmZmvZL1ri0zM8s4JxIzM+sVJxIzM+sVJxIzM+sVJxIzM+sVJxLrU5IKaRXVR5RU+P2YpIr9PZM0VSUVmXfyGp/qsH1776Lq9l5TJXV8tanZgOZEYn1tc0TMjIj9SSYbngL8R5Vj2qpkdnqp7RJJRBxZwRCm8sp3ZJsNaE4kVjGRlO6YA3wonQmcl/QVSQvTdzn8S/uxks5X8m6NByTNTffNlHRneuy1Je+EOCQ97gHggyXX6PT6ko6V9GdJC0hmVlNyzlxgWNqK+nG6b0PJeX+UdJ2kJZLmSnqnkvepPCTpVelxEyT9PL3vQklHpftfr23vubgvnYk+F3hduu/feoj5T5J+o+R9PJdWsmVn1ivVnhHpZXAtwIZO9q0lqZ46B/hMuq+eZMbzNJJWy+1AY/pZ+yziB4HXp+tfAC4u2X9Muv4V0tL+3Vz/WJKCgdPKibl9Oz1vLUkZ8XqSmkqfTz/7SEk8PyEpoAiwB0mZGoBfAUel68NJihgeSzpbvoyYt5DMYM+TVEk+o9o/Xy9eOls6a+abVcqJwAGS2mtWjQKmA8cDP4iITQARsVrSKGB0RPwxPXY+8H9pna7REfGndP+PSBJRd9dvAe6OiGd2IuaFkdZSkvQ0cGO6/yHgDen68cB+SbkzAEYqqcL8F+CitKXzi4hYVnJMT38m7TEvSe/9U5IyONfsxPdgVlFOJFZRkvYCCiTVUwWcGxE3dDjmpL66XRfXP5akR
bIzmkvWiyXbRbb9+8kBh0fElg7nzpX0G5KaZ3/p4vvsLuaO9Ytcz8gyyX2uVjGSJpC8lvbbEREkbxP8VyXl7pG0T1q59ibgbEmN6f6xEfEysEbS69LLvRv4YySl59dKOjrd/86SW3Z1/Z60tp+zk24Ezm3fkDQz/fqqiHgoIr5EUsn61cB6klchlxPzoUoqX+eAfwRu60WMZhXjFon1tWFK3n5YC7SRdD21l7W/jGTU0r1K+nheBN4SEb9L//NdJKkFuJ5kJNVs4NI0wSwBzk6vczZwhaRgW1dTl9cvI+Z5wIOS7o2kZPyO+jBwiaQHSf5N/Ql4P/BRSW8gab08QvKu7yJQSAcKXEnyrpOuYl4IfBvYm6SM/bU7EZtZxbn6r1kGpV1bH4+IN1c7FrOeuGvLzMx6xS0SMzPrFbdIzMysV5xIzMysV5xIzMysV5xIzMysV5xIzMysV/4/Qbezy38fbnwAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "outputs = synthesizer.infer(phones, tones=tones, global_condition=utterance_embeds)\n", - "mel_input = paddle.transpose(outputs[\"mel_outputs_postnet\"], [0, 2, 1])\n", - "fig = display.plot_alignment(outputs[\"alignments\"][0].numpy().T)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 合成语音" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "合成的语音会保存在 `syn_audio` 目录下,使用和 reference 相同的文件名。" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "time: 19.793312788009644s\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAEGCAYAAABmXi5tAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO2dd5QVVbbGv90BkJxzaDICAkKLGJCoA7QDKhgY8zx1zOm9x6DOGEcFdUwzijJiHBV9mBiJoiAgQYIgWQGbnBGQTHef90ff6q6urlynUtf+rcXihrpVu++t+uqcfXYgIQQYhmGYsk9a2AYwDMMwwcCCzzAMkxBY8BmGYRICCz7DMExCYMFnGIZJCBlhG2BE7dq1RVZWVthmMAzDxIolS5bsFULU0XsvsoKflZWFxYsXh20GwzBMrCCiTUbvsUuHYRgmIbDgMwzDJAQWfIZhmITAgs8wDJMQWPAZhmESAgs+wzBMQmDBZxiGSQgs+AzDMAmBBT9gVm8/hKFj5iEvvyBsUxiGSRgs+AEzddVOLNn0K976LjdsUxiGSRgs+AGTX1A4sn9y8pqQLWEYJmmw4AfMT7sOh20CwzAJhQU/QIQQOJHHvnuGYcKBBT9AJq3Ygdk/7QnbDIZhEgoLfoBs2X8sbBMYhkkwLPgB8s683LBNYBgmwbDgB8jOQ8fDNoFhmATDgs8wDJMQWPAZhmESAgs+wzBMQmDBZxiGSQhSBJ+I3iSi3US00uB9IqKXiWg9Ef1IRF1lHJdhGIaxj6wR/tsABpi8PxBA69S/WwCMkXRchmEYxiZSBF8IMRvAfpNNhgB4VxSyAEB1Imog49gMwzCMPYLy4TcCsEX1fGvqtRIQ0S1EtJiIFu/ZwyUIGIZhZBKpRVshxFghRLYQIrtOnTphm8MwDFOmCErwtwFoonreOPUawzAMExBBCf5EANelonV6ADgohNgR0LEZhmEYABkydkJEHwLoDaA2EW0F8AiATAAQQrwGYDKAQQDWAzgK4EYZx2UYhmHsI0XwhRDDLd4XAO6QcSyGYdyxduch/Lj1IK
7IbmK9MVMmidSiLcMw/vHs1HUYMeHHsM1gQoQFn2ESgkj9n5fPbTaTCgs+wySEb9buBgAMHTMvZEuYsGDBZ5iEsXzrwbBNYEKCBT8gCtetGYZhwoMFPyBO5LHflGGYcGHBZxiGSQgs+CGy9dejYZvAMEyCYMEPkaMn88M2IbG0eWgKlm05ELYZDBMoLPghQmEbkGBO5hdg1XaOVmGSBQs+k1iIb7lMwmDBDxFivQmVv09fF7YJDBMoLPghMmsdd/UKk31HToZtAsMECgt+QOjlXXHGI8MwQcKCHxDbDx4r9dp/lm8PwRKGYZIKC35ALMn9NWwTGIZJOCz4TOK4d/wPYZvAMKHAgh8QAlw8LSp8voxdaYeOnwrbBCYEWPAD4okv14RtApNg9h4+UeL5kk3sYkwiLPgBcfhEXtgmMAlm58HjYZvARAAW/IRyKr/Ac43+k3kFOBLzG1lBAbvaosyq7QeRNXISvvyR3XAyYMEPmWMhFVBr/dAUPDPNfabp+ws3oc1fpqDzY9MlWhU817/1fdgmhEJcbnQ5L88FANz5AS+0y4AFP2Q27Dkc2rHnbdjn+rMPfbYSAJAXE+EwYs7Pe8M2IRRenPFz2CYwIcCCHzIZ6eEV1HF76B8284Jf3FmxjbO8kwgLviTy8gtcuWfSQqygtv2Au4W8GWt2SbZEPjsOHsPt7y8J2wyGiRRlSvA/WrQ5tGbhf/1iJU5/eKrjz+0+dMJ6I5/Yecid4Id5k7LLsDHzMXnFzrDNsOSjRZtD86cfOBrt4nEyruWnJq/B0ZPxDiyQSZkS/D9/sgIn88NpFv7h91tcfe6t736RbIk1SbgAth0orF20duehkC0xZufB4/jzJyuwfKv/nbfmri+9VnEqP9rrL+t2/eZ5H2Nnb8Tq7YfQ69mZaPngZAx8aY4Ey+JLmRJ8xh67PM4q/vHNekmW+M+AF6NzgXd4eCryVAOSLamexpe+Ok/K/s06eP0kQTyDZOrKnfhxi7x1hk37jiK/QGDNjugOAIKgzAn+6Cnxamrx9drdYZvAeGTtzkMlhNyIIyfz8Y3q9/506daix/ke3TpLN/+KnJfnYsbq6K+v2OHWfy/BiE9+9LQPZU1Nb3aTVMqc4L8Zgoskbhw/JTf2Py4x3bJ5+eufMWrKWgx4cQ4++2Gb6bbfpUTnb5P0S2zs99iM5dmphQOd+z9epr9BAn8ixXW5cluyR/VqYif4r8xcjzGzNgR6TCFEmRk5AcBr38r9/kZ+6m0kFjZuR9fPf/VT0Xe5+zdzN9nVbywEAGzef7ToNVItfsvqjXDouP76zAmdGcipkNa7gmJHqpxE1BengyR2gv/stHUYPXVtqVGqjBX9nJfnYMmm/SVee29+Lpo/MBk3vbvY8/6jwr7DxRfA3sMnMHzsAk/7+2Sp+ejWLrN/2mPqh5bBybzSIrfvsPdIqWdNspa1hcsUpq8qjiJa7dG3PH+jeRLdpB93lHrNalYSd2atK3SfxT05UCZSBJ+IBhDROiJaT0Qjdd6/gYj2ENGy1L+b3BxHXQBKKwxeskaBwjj6VdsPYeiY+SWyX0dNWetpv1FE7dMc+ckKzN+4DwePuS+X69X/rHDdm98XpdJ7QRv59PLXxVmlBSGE7V75+nzd1/eqbrwTlmzV3cZP8iIapaN3U3aDMoNatsX/KKigWbX9IP75jfNsac+CT0TpAF4BMBBAewDDiai9zqYfCSG6pP694eZYD322oujx0DElLyKvq+9rdxZHMfT7+7fIGjkJeyym6Qrf/7LfeqOIoiRRLc4N92+QuQ7w2H9Wl3j+/Fc/mW7vt+xt2HPE5yPIYfuBY5i6cmdouSwKSvSSlhN54dSdiiIvzfgZz003P6/1yJBw7O4A1gshNgIAEY0HMATAatNPucDsNPTqVvh6TelombU7D+GIKnv2l71H0Lx2pVLbyfaJh0HI1zjyJRngxl979lNfI3dUjpTja1lo4WoJE23+3DXjFmKj6ubk13ei5t
VZ67F531Gc16o2BnSsj8z0NMMZo9NTxCg/8ERePspnpDu0NFqscZlfIsOl0wiAOutoa+o1LUOJ6EcimkBETSQcVyp6J4d2hP/vBZt0P/uNRWhl2CMmOxyVHLnjlHFzi90wXlxEVjkCQc/GrvS4PhIkB46WdOtt2uf/zOSZqeswftEW3PXhD1hkMct0LPjQV/zDBgvbcWLL/mOuPhfUou1/AGQJIToB+ArAO3obEdEtRLSYiBbv2bOn1PtmwurVpZOmc26ofb9ASVEqazw6cVWox//X7I1Fj3/Z615orH6j//m/5a73Xdb4h8YHrA0N/crnyDRtL4X1u80rx749L1f39X2HT+Cvn68s9brRCH9HzJvBeHFtyRD8bQDUI/bGqdeKEELsE0Iow+U3AHTT25EQYqwQIlsIkV2nTh0JptmHdM6O6I/L5WHXFbJ5n75/1Sv7VGLT//lvpUTO6GEVPpkkrEorPDnZ37acHR6ZVuL5w1+YDzpGT9UPoHh6ylq8ZzD71kOJ3okje347gSMnwhX8RQBaE1FzIioH4CoAE9UbEFED1dPBABLV4DUMj44QAqu2H8SW/UdtdaWy60XZ9Zv+6GjHQXdTTCN+PSq3ybY23Jaxxs/zVuYivRLhlP23Gba21xvcxYWznpyBERPc5714FnwhRB6AOwFMQ6GQfyyEWEVEjxPR4NRmdxPRKiJaDuBuADc4PU7WyEleTTXlE52wODcn/PKIhICt2/Ubcl6ei57PzCwR3eSVLw0ShLT+X6/0f/5bqfsbN/cX03PIKHSSKcZthrZeyO99RhnBHjDKdygrrEz1MFCXJz903Nl1J8WHL4SYLIRoI4RoKYR4MvXaw0KIianHDwghOgghOgsh+gghAg9ut1oQ2qjjN1ZnRSpYjUz0PhMGp/KK7fQSY6/lnfn6U2d1fLvT0dtug1mDTKxKJS/0YTFXXSvHDlt/PWoqqkIIPG3DzeImjv2gwQ1bCThYv/sw2v3VfvnvE3n5EELg+Kl8dH5sOu76sGSLwi+WGWcWy5pZGO3nu4jW1rn6jQWmLtNfddyuTpvTxy7T1g2vf7sBl782H7keFgMVWjw42fT9KPr9g0g0/PeCzQAKR4FW35GWS/75nR8m+cr47zdbbuO0feL5o2cW5QzcO/4HfLO25KLp8VMFeH32RstGO+obzcFjp2zNjo0ynJUcgoPHnIW7tv3LVDR/YHLRQOA/y7fjcEQa3ntN0vSL79bvw/cmA1O9333ZZmcehVgIvtewxqdT2bI/bPG/NZ+erU6nXTKYt6FYbL79qXTEk2zW7y5MXHOTybo94lETJ/MKcO/4kiPUkZ9au8ncnLfK4vnny7aXyi1RGux8vqzk69ow1k9VnztkMbs7fCIPu387bjgoUG5sM9e6O4cWbiwWML8W4tX4FVQQBfQSCL/92dnvEgvBN6p2ZxSvvcXArTJ6yroiP5gs7FzUst0Fi3L3W5bjfdpDSYj9R04ia+QktHIwUld+CuVvjcpoTga7Dh3H5yYuCIVnDKJI3KJX/wYoTFZS88vekuGMZqNELb2fnYnuT35tmN2amxLQySv1bdFD3fNY3dzlS4O/R4vwME9+YpL0fM9A+NVGtdTfdPIH8h2Wx4iF4BstxqhHsWpueOt73dd3HjqOi//hvVaL2qYLX5hd4vWNOmn06RKjAmat243LX5vvyJ/qlJdmFI4k8gqE7VGqMrK/8a1FACDFfeYUv1LvB6W6JFldlK9qqrjK9KSpK1tqW0zqNSS/4jV7i9BKPR/jm1XhX6Gc13bOh3dV6zwvznBe72WKg9aU2lmDOh9nwhJ3Xei0LNn0q2mOwDdrdxWVYl6z4xA27jHPJ9BD+X31gkcUlC5uapxGx8VC8I3u+EaZdFa1S8560l74lhFfrd5VFAKmPRFe+rr0CZ6ul9XlkhtSgmpWAdDsomzxgLE/V7mJfbS4+EIZZXPUqj2k3YgJJ7MIK/yYzs9YvQu/pWYrSzb57xI0Qp31qh
X8+z4qnUzmZJQP2L85Ld9qPUM2qsL5rzkbdV/X4iTxTttfQB0wYKUDWSMnGQ4a1QwdM880auyPby9Gl8e+QtbISRj40hz0/bv7CDOrqqda7PweamIh+EYccLiQpGC3KJoRTsO/nPrwp6/aiQ8WWi8KZo2cpJuMstRkIcdsAXeTjmC+/q29ixQA3lRluSo3JivMblxZIyfhRF4+5v6811aYph/h1eqy2O/Mz3X0WaNm72bng5FI9X++eCbpJRPZKau3l3Sn2unsZYTd0N2JJr0BtMfX3lzslkJWsoqvHfc9Pvthq+sGNIr7OKxe2k6JheAXGHyX/wypt6rTWi+jHfrTb3lvCR78bIVlKCkAjJm1AVkjJ5UY2Rgt1Fm5Wd6dn6v7eq9nZ1raIQA8/qW5/7TL49Mdj5IX5/6Ka8YtxPrdhy2F7sHPSqfXO0HbEETb7N1p1I3RSDd37xHD5upOw+zMcBIea1QqWbugPuy1+fhimX919K2uLatzwK7wdn3iq6Lj3ffRcsM6WXrfYdbISUUDsp7PWF8bdtinueHIKhGtJRaC/8Zc/VFmWI0NnEainHRZd/zy1+ZjxITltjJl1dUmtTVSAOCNORvR+7lZpvtQYqO1rjK9kb8WOwlnB46ewlKHgv+hKvzxEYt6P14Lo2nFdt56/8L3jEaUMmcpc9bvtR3TbrbIrg2CuGe8/KSpIjssCptZ/TluEwCNSmgv3ax/vj742QrTnInJK+wvcgMl13/+POFHtPnLFEeft0ssBN9IcMxGMHoLHGq8FFuzqvmhZvjYBZ4yAD9evBXXjlvo6DN6NVKMeqlq2bL/KI75WDnT6dRX3bxits/hpdrRpZlrwQtCGC/k60Vi6GFnveLYyXws+MX7TStXp2rmI1+sRM9nvpHeJtFqMBV0mRLt4dRCbhZ1dPv7Sx0dR93iUr2GJptYCL7RoEfJjv1B5y583qhvTPdp1RRDFk4XYfSq/pn55BVaP1Q8IvAySoxaO7itv8qt0eME3wQfQJrBQv7BY6dsRcKs3/Ob5TavfbvBU90VhROnSov6O/M3Ycv+Y3hB8nVkJfhBdyzbdajkrE/t1otj5dV4CL6Fgl366jzH+/Sz5s2PW93v20nVPyO8FIeys25gF73mH3ZHsGEQlJTk5RfoiqjCBzayeD//wfpmJKu1370fGbtwtKGoXrEab9gRfJm5Nn9XdZU6eOxUqXWduBELwfeCUVyrn2VytXX0g8aLG/jrNfJqoL+uqnGvdAXz2h1MO+KKI/+asxFjTUIUt1u4I4MmyCS6PKMIjRR2Zl1uc230XMTqa6nzY9NNawDpkV8gInWTKPOCr5eU4jcxaHBlyLRV+oLvpkyA8pnPf9gmrRm8nQVkt6j/Rtm+aTXTVu3CMRMRWL6l+Jw1EtuFEnzzfuC1DMpEC0Hd6qDTU/kMZ/KmW/rb4yL6C1/9hPYPT3OVFPibDyVZYiH4ezwsevp54Rolbfih904uJDd9Xa1o+9eptqKF1MxcV7jI+uy0ddLs0HPF3freEtyjqXXjhhOqUDi3FUbthuyaud3mqqo5dnxkmm4o665D0SkFrI5WOWJR2M0KqzUkdRKmVRG5IV0aOjq2XoZvuXRvEvnPmYWh4z/vcp59m/OyvZmKk7LVsRB8LzGp75skMHkdjRitxGtjamVw6Ji12G5N1UPJ9WEUfDKvwPXNUxsx5eV7f312aZfQ1FU7HU+19Xj7u1wAhQv+Rs00rNwbtvvAOvgK9gRQPtoL7xmUzHaDdX/o4sdWN+WPFzsrT62XR7J2p/XiuBFqV46bQavdUutOAi1iIfhumWuRKLMhVfPC7dTJKOZXGYXaSdu2i51QSbcNKuxiVMrCKV7us3sPn8RN79jL4nXK8dS02yyk98DRk6bhwJk2R4RO+iaY9el9wiLZLQjUf4ud9oFmNYmsEvOiXllVzXHVwvxjJjkkXgee9zrIiyjTgn+NRfz6sZOFP4gfkYgFBQJ/+Jez+HkzptqoVv
j0ZH/7ynR+fLqU/SiRFm5zIWasKRYVbc14vxHCfESleGqsImR2Olh8XpRrLIJWTduDQB1ZZjUTFUJ4KkPgd0e55VsOGFbbdYravWc2616w0Vtk3AwHgRaxF3wvd8e3vvPvYpmy0n7FvzdtXLRGcdtqvraYDkcF5ReTEY3yx7cXW29kE7vzl+MmC3CKb/6SV+LX1EUGVus8i3J/lTRPBMbMMi6t4rT8icKQV77DEEm/nTbiyGhwcuxUcFE8sRd8L6PzU6kPe51S6XHHB/Yz7axq0ADWjSyiil7Ti/mpjkNR6yX9+bLtlsXBXp21HlNM0uY/shFDXxZRBPanXeY+72On8j1HvigYtdsErEs0mOG2kJoW7drjH99erHsjmv1TcC0XYy/4XmqgV0iFbcUhjPK56fYyGv2MSnLDUZ1ICiV7doJJ7e+weMYioujD77folq5QWLXdfcmOOKMUVOvRopbpdkIIaWtBfuI0Is0uz00vfX69PS/Xl2PpEXvBV/tznfJzqpZ90OnafvLqTLmZj155wKQVoFVj8TCwU/foLzrlLxTi4laTzf0fF5YZsJq1vTd/k5TkOatZ+bRV3s4to+qhTtAzURvF5bXgn1MiL/hWP+zdH7qPv1YW1nbEZOXfTrnbCUv9K7zkBnVMucKDn61w1RVIjVsfrRVGNewZexiVhFbYf/Qkpjhol2iE1RrZiE+81RBastkfIdbK2XMWM0ohBPLyC2w1ordD5AXfbPosC6+jgaCwU4dmi4NMxDDx0hUIAI6czLPVBzQM/LoZxQEr//eW/cekuHRe91iiwwoZwQCfLLXhsrT4Ko6ezPeUC6Al8oK/1aC5skzGzrbf1SlMth7w/7uIC8PHLvBl0VdGOrsfQQB6PK/jDw4TO+tHew+fwGoPpcmBwnwTp639wkAJTlCjPWetgu+I3NcG0iPygv+5hAxKKy7qUN/3Y8igesVyYZsQGVZtP+RL/oRRLSEnBDXAf9lmxzeJLZVNmbdhH07LTLfcziqb1gqzhuJRQs89OHnFTkdu6I0WfXkV7A4yoi/4Fj5BGfzHp7rnsikoEKiQGfmfLDCiutgeNbuCugFNWbEDV5zVxHK7tvWqeDqOUStOLVVPy/B0HL9QV/zU9gzWYrUmomC3T3fk1aNh9QphmxAZ8gqEYaekJCIjksIPJpl0QirLjF+0BRk2phOdm1TzdBy7NXK6Na3h6TgrPLqNvrfRW8Iq6zgj3d71brdXdOQFv3tWTV/373f9GZmczCvw1NykrNHj6a/DNkGXn2PicvADIaxFymtFTbt4LSLoR3liLQ9f3MH0/de/tbe+eJvNloqRF/zW9aqgUnlrv6BbZFb685sRE5YH2oyCcYfHirqx5t0FmyxnXkHNgH7Za7NyaYgoCWtBEflT89GJq3DkhH8jgicn22vu7TdCCORanKBxiExggB9s9CAuq/hd3CxI3vexTMaW/UcxfdVOLJSYeLV+t3X4ZuQFP6dTg8CiDMJk6eYD6P3crLDNYCSQ5Dj8soSfM5GhY+bhlveWOO7KZcbN7y6x3Cbygl+vajIWbYeOcd6InYkmMkdtTNlE6al9wkNzJy12XFiRF/y8fBFYWBnDMEyc+U6nlIkaKYJPRAOIaB0RrSeikTrvlyeij1LvLySiLLv7jlr1R4ZhmKhy9RvmTZc8Cz4RpQN4BcBAAO0BDCei9prN/gvAr0KIVgBeADDa7v4PHItmvRSGYZgoUq5+q25G78kY4XcHsF4IsVEIcRLAeABDNNsMAfBO6vEEAP3IRkB5QYEIvHwowzBMWUVG7nEjAOqavFsBnG20jRAij4gOAqgFoITDiYhuAXALANSs1xAtHpwswTyGYRgGiNiirRBirBAiWwiR3axRAzwzrFPYJjEMw5QZZAj+NgDqikmNU6/pbkNEGQCqAShdO1RtGAFXZDfB8O5NJZjIMAzDyBD8RQBaE1FzIioH4CoAEzXbTARwferxMADfCJv1PKtXzJRgIsMwTD
I4uXO9YQaWZ8EXQuQBuBPANABrAHwshFhFRI8T0eDUZuMA1CKi9QDuB1AqdNMImZloDMMwZZlnhpq7waUUjBZCTAYwWfPaw6rHxwFc7mbfGWmENAqupjfDMExcueKsJrjS5P3ID5+PnyqQ0gMz6tx/YZuwTWAkUbMSdyZj7CHTg2FnX5EX/DfmbkR+xDoI+cFdfVvh3T92D9sMRgLt6nvr6MREg+7N/evFMeHWc3B775ZSa+l8cHMPy20iL/hPDOloq0+mW/7Uq4Vv+3YCEeGCNnXCNoORQPsGVcM2ITRqlKEgi9t6t/Rt39lZNTFiQDvUqVJe2j7PbFLdcpvIC/6Bo6dwzMeuVLf3auXbvmUz5Z6evjaDYeRQwccBStR5KKe9pWuhV0ADm3pVvYlp+QA62Tw06HRp+0qzUUc+8oI/x6L6m1eqxWhEUiEzHSj73i3bzBnRJ2wTdKl2WnzOKdmkkXXJX69CbBevM63MACIER09da/r+1Wfby0MaddkZtraLvOD/sje5/UG1ZKQR672KqI6kr+nRLGwTQuG8VrVsbbftwDFPx7mofb1AjpPdzFsT9M42XCx7UnXxjahYzt453qutvVlT5AW/d5u6vh+j/+n2TqCwSUsjHA2oAXQcSI9oK7So2RWUPTed3wITl2+33O679aZJ9pbccG6Wre1+2uVtsGijvqMpRq6tMxpVK3qcnWV+U+nZ2p6Q161ir1FU5AX/iuwm1ht5ZO76Pb4fQwZHuIF5EZXKpfvS+vKcFvZGqWYEpfddbIwggeBaLnbLqoG9h81HrADQsZE3V0snm3932OgVE7iwfT18evu5qm3M92F1Q1Cwe1OPvOC3qV/Z92PEpV5P9QT7hrXM+O9evghZoxqned6H15GhXT6/47xAjmOXqhUyLQWsftUKnhdtK5fPQO3K0c91aFG7tHZlpBEyHSwGCwG8dFUXaTZFXvAz0/w38UKbPsGwqVU5mMWuIHjrxrM8fb5ulQqRTXCKmEcnUKxCqGtXKSclkXLEgHae92HGi1d6F1m9kG/tWMBqyHJaZjoGdmzg2RaFyAu+VajRf3vIUFUEo229eCTK2BGSm85v7r8hDqhSoXT1jicv7Yg+bb2tzaSnkS8jaRk5fkGN8KPIZV0bm75fISMd57b07ja79MxGpu97za+RETqaZuM8GN7d3GWdlkYol5GG3FE5nu0BYiD4Vlx3Tpbrz/Y/vVB04nKB2rHz3oiVaNBzOygjvCiGL2amW3/Hj/5e28GzmK5N4+Ffls0DAwtH3JV1bvBqbuvdEq0lDLCs3CJe82vSbZwHVuhdrlUrlDznLz3T/AYpm9gLvpdEJCXiJQ5T8Ft72cv6q1xeSj08aejZowj9C1d2DtocSx7KMU+EaVW3MsqbuC06NU6m4CsDr/kbzCNwMtPTIGIQXKwVZlk8rDNYCDIzO/aC7yXkrCA1f/ejONtfLIRDzTU9rBeNG1SzF3YVNepVLW33RR0K10yiViJpSJeGqGJxob91w1mm7qgbz8uSbFU8OC0VL251nsqcTPdsXdvSnjDJ0MxCbu3VEhXLlR4A/c/vgpuVx17wvbhjrlfcQT6M8J0k3zw+uKOUY8qsy+Enytcto7lNGFVGa1Qytlu5yJ9NaHvO2haBBee0qCUtW3y0Se33ci6zZJ+69Ax8cLO2Jbc7ymkEf+RA/YXmIF3KsRd8M6zSjf2M8qiQmW74A2uxUwPDzkLXBzfJOVGNWPKX/lL2o5zg3Zq5q0ao9v3f3a+1FJvsQgRkmESOFaRCRS+XmD9iti5wto8VHe0y6Iz6RY8zLHzfGelpts53I9SDGjuLok75w9lNcW5L45mDE9R/ZrNaFQ23u8BmcpURTWsa71tLmRb8qyzi65vVqgQAqGqx0GSE0aJjq7qF8bd2/e52sJNJp51CykbWBeZlL+XS07D8kYuk2KHFjl2Vy2eYuhHt5gY4Kb07rJvxzeOjP51jez9+0Vm1bjGgY32TLQupWdF4oG
XlEurW1Fu5gyBRzzJuM9ECr5nQ6kQuK8q04APAFdnGq6HK398AABgNSURBVODKD+J2SvVng1hgJ3dcu9hZnM5KjSL8mrm4jVzQum68jPCuP7e0q6xF7UpSFr6uS6Xs547Kwbf/21t3m+omYgXor1no4sCtkVVb/vkkE3WIpJ1Bgdnvf+055q5Q9eSqosU1oZ552GHEgLalXvPS20C9HtRRVU7BLlqXkBGVdNYFjIiF4Deq7j770c8CW38wqGTnh0fOzuhduXH5ccPZ+NQgx5ELfVIFnW48V15uwEUdSl/EX93fC1/edb7nfaszmW0Ltwa7i4UFJivW6hnnjPsvQI/m3uPW/aSu6rvy2rvC6oahDrCwOh+XbT7g6Nj92pVOwDyV761ByZAuDQGYu3SMmGUw6NDiZIE6FoLvBS83C7fEJKxfF6NKhG5G5coN6J7+rdGiTiVPdinU0VkUTE8jT7MGBfVMz8+BwlkW9VHUo9xWdavo/m2/79xQul0y8OqeUHJjjKjqIHdj+8Hjjo6tuGLVeF1fHnVZJ8z9cx/L6C89GvqgXWVe8G/uGXxHK5kLdm7wcpKeKdFHOlglSt/8d28A9kJQzciqLefGESZ392ttur5TuXz0EtKCotpp5i4zO/Xhn7vcXX6H3s1KPRGb/0Bf9G3nLEP8tHLpaFwjOi65MiH4b96Qbfie0cjPz5LIv9NxO9ilm8ca3AA8BbjbWXSzyyU66e8NqgU/47JLUBOzKhUyTRvv/JeN8hh2Uv9lzW7/MfxMw/cUl4UsrCYIdtYIhnWTl72qDq+uX7WCa1dfVIiF4OuVGQWAhqkV/b46vrf/u9U8esFuJ5mgmaBjt53KgD8/ObDocUDVcAOhsYTqlW6RUSpZjzQqDt/Ue89ODHmXJtaLgA8OOr2o5IEX9PIllEXyx4fIySFRsBL0AGoplqBT4+LvmYjQtl6x28dul6koEQvBN/LbmS1WnJVlHvbWx+HUTI2TLNrcUTmWyShq9CKGJt7pbEFSL3W9TT17ZaazalVEhUz/TosMhz5eteDb/RvcohUbP7NmjW7Kdv22LetYfxeZ6eRptqmgl9A3+Z6eyB2VI70ektVajB9Z8WZox5rXq5qv9DPxEtzjMD9E7f70EhlkRSwE3+jLM0uA8ROnC3pGMxQrnru8M3JH5dgSAbVY3XR+6XWL6ff1sixJ3LN1bdchquqRkBmtHYr2bb2Li2D9/XJ5dcH10AqbzPUMLV5vqnZ+p95t60rpdtWufsmQVzsuJ7dYRflY/Tk1XGZvG2Vsa5u1KN/7wI71TTPb7+3vTPAvV4WPT767J74b2dfR5+0SC8E3muIahUX6jdOL6B6HP/6Dg9phYMf6tnyRvdvWQe6onBI21TdIXrEqSXxXX307Nz41yNKONCLLUq+/PD1I1/1mRpfG1fHQoMIZ1RkWN5Up9/R0tG8t2hmj1pVmVrtFD6MSvg2qnWbYrcpNNIcRTsoL2C1z8fGfzsFfLzauFuoVK5utFkDtNhf5j2rWPLx7E+R00q85r1f7JndUDsZc0w2AsevY6cBJXaI9LY18iy6MheAbfXeyQv2c4nR01sThKv0tF7QsOqHMWPe3AXj7xu6lXu/qcmTasHrpG8VZWTVshzw+fVkn1ePS/k2nF0HuqBxUq5iJmy9oYaseuFN3kRVEVCIZ52IDUTDCaGZXp0p5w+/iDIP2f+o8gyY1g1vXCKPfs1lopvamrC05Yncwpgwenh3WCU9f1smWi0wPK9dx1IiH4Bv47dwGo6x+/HcerAEGd25UYoSg5nKdUblZko1T1OsH5TP0p79moyT14q4WZfSkjgAZf4u91H2tfp1hM7Nw+cPyyiQYzWy8oE6J79XG2bqPzMVz9eAmXfNljx7qffHQbtkMO1FkZxrU/rmvv71Cd01r2h/IPXFJyUVjteDrNd9Rs/aJAbZCqP92SUeMu944EvDaHs2w4IF+yB2Vg5eu6mIaNWiF05G9k/VBICaCX9Fgcd
bo5Jv5P70N9/XkpR11p2lOSE+johHC2icGlHjvfJ1pf57EK/+mni0weugZWPP4AOuNdbAz5VVP2W03R04JhlIl0u7io1l4olNkukMUiKiozILVDUXr8pGZgKc+Z/M1A4g2Og1FnHZIus6ipIFyw7FzPqj94ep+0VaF1RSchFW20ORlqMMmb+ttXsvK7lrcNT2amS7QPnFJx6JzY0iXRo7dlkDx7Pc+k+qvejewbIdh3LEQfKNpUyWDZh/NDEoLPPr79rj6bPtli+1g56RxU0fDjCvPamqZTu00QURN4xoVsfGpQY5uKooOKL7QqPabdUOdKuVRxUZjmff+q2S1Uq96bxRrP6xryVGpdjalzuS1uukseKAfpt7bE9kGlUsV374TYTm/VfGNT+1ysVsV0skamdY19tjgDsXvBRzR4wU7pc1r6VxTTtcTYyH4TtPmtdsro4yeEvpUWqHnm20QQrKGevrsJp48LY0c1eioVanwhHVTUTPqpSgqlsvAisdKugH/dIF1BrebiCfl+8uqVbFUSKXiitRmK2vrLKlvFFZRL/WrVUC7+lUNo1+UTPUrz7KfPa7+u9UDj6Yu6sk4RfbgKkroBVVYBTJoiVY/PJ948pKOyEwnNK/lfZF36V8vNH0/itrlZ1y9wu19WqaOlY4FD/Rz9NmJd5yP3/9zrh9m+cbIge3w+uyNptvouVrM+OS2c9Ey5TqZ9b99Sr1fPiMdzWtXQi0Lv+11qljxWpXLY9Vjv0OHR6aZfqaFwaKlUqnTaL3IiO9G9kXdKuVLlIuOSg/jsII97GAWg1+3aunf3elM3pMSEFFNIvqKiH5O/a877yOifCJalvo30csx3ZCWRnh8SEfTmYKeKDbU8dlauSr0pl1lCaPiauqcCKeLp05HKW6wahTidGpsZ/R+ay9ndZy6NathWn45PY1M16cUtFUkjVyfaox+M0XoOzaqhlk2jq3QqPppyExPQ4XMdHx4cw/88nTJ0N7uJtEtsmZ8RvsZ2jXYxuF2yR2VYzpD0XMfO3Wdeh36jQTwtRCiNYCvU8/1OCaE6JL6N9jNgZY9bD6y9ope5yQ3DUXObSWnW45X2jesilf+0BWf3n4uRktst3ezgSvDSzN5Pb644zyp+7ujTyvTfIINNnINnBJk67ogcFu47pyWtUp9F2//0TwJ0E/cJkKGjbKOou6AFnSUzhAA76QevwPgEo/7M8Sq8YRX9M4Bv0cafpKeRsjp1ABdm9aw1S3Lro1GNcibSXCXqZFdpOr8VrWllFBm5OA1Uk6NkuXtNDIpbhARpt7bEy+bFLOzwqvg1xNC7Eg93gnAKB6pAhEtJqIFROTbTcELenf9JMmD3cXWtj7V+VC7wl66qov0mHoWe+fIKLxmhrarmFXJBqPudf8c3tWyNaIat4mJUaBd/aqOR/VqLAWfiGYQ0Uqdf0PU24lCxTSaKzUTQmQD+AOAF4lIN0CWiG5J3RgW79mzp9T7F5hE2ZzuscWdXqj8cE1P3Mu66qfKlwX+doncqodOUXeyGnSGs4xWNVYNNF75Q1fX+y5r3N23VYnn2kzlHj5VC1XQzgr7Wfx2D/++g+7rTWtVxHydQAEjz00bH4uTBYGX5jyWgi+E6C+E6Kjz7wsAu4ioAQCk/t9tsI9tqf83ApgFQHdOIoQYK4TIFkJk16lTWtxlp86XPHbp19o3tFc0ymrVPw6+XL1OUkHy6ODiZC+79VD0eMyiXK+VqMhmpM+jZC9o16gaaEprdDao9yOTK7OboEuT6hjWrTE6NDRfvHd6/etVjQWMEznjhNs8F68unYkArk89vh7AF9oNiKgGEZVPPa4N4DwAq90czOzn7t3WW4y9Xtu5ZjUroZLq5DA6IUf8LroXtV3CvifJqnzqpuiUX5UJAZh2tgob7SDnhSu6YFi3xlj28IVSS16YMXpYJ3x+x3l47vLORWGbbnI59DAa4ctcPwgLt/kGXq+yUQAuJKKfAfRPPQcRZRPRG6ltTgewmI
iWA5gJYJQQwpXg36magr6tKfVr1SfUim6qz795QzZ+eXqQ7UQRmV2igkap0qedzQSNjDK+Ctf2KJlNbVQJUSHTZ/9+VOLPrcjOqonnLu+M6hXLSS154RS9In6Av32G48aN52Whn4tsek+CL4TYJ4ToJ4RonXL97E+9vlgIcVPq8TwhxBlCiM6p/8e5PZ66Prm2VnlPm2nbRijxxs9f0Rl929UrcsNceVY0O2N5ob1qveOVqwt92l5aD8rSywcGtsOYq7372NXp9UBJv30YM5nxt/Sw3Ob8iITzRgFZI3AlEENb074s0KdtXYy7wXloayxKK6i59MxGGNq1calRkxe/r0LuqBxcpknK+EvO6Vjx6EWWxaXihLq8bqu6lT2Hs8la3PtTr5YY6GHBVsEsIkcvY1TGaNbMdWMUUKAOCvBS+wiwLp+hNxq0mvnEHSUaR4Y2lBVi9028cGUX/P0Kd13p3ZCWRqhSIVN6784wUeqjyGLc9eEl0cjAadkAhcGdGxaNzBtZ9N59fEjhrKNSiQXDYiez1zUoBaN7nZ5bKS6uJrcoIcQ1fM7hiRPxX73QYFSLmynGqhaLU5wUWStLKAkwM1bvwrmtzEfY152ThYe/WIVHBuuHFlp1crLi6h5NMX/jvlLurCKiHygmnfIpn39Tg+q5SSR2I3wrPrzZ2l8aJdwsvDDRon/7erb9zkO6FDerVrecdNKOUI+cMxrg4Yvb49pzsjztJyrccG4WblAVgXND5VQNobLuunJCmRP8sMML40BNj1Pcm3xsYi2bCQY9R8Mgd1ROCfeR4ts366ZkFyLCH01+F9mlKvzm0cEdpBU5Y0kopky5dPq2q4tMSfHcTrmwfT18tXqX488NVo34gsLrImUcXDgNq1XA9oPHI51G36RmRXRuXA3nBRChM6hjA4yZtaHEa1EfHHWQECrcpl5lNK9dCSsevQjlM9I9z6TiTpkS/DddhCnJ4vkrOuPQ8TzHn/NaEiIM4lBs8IUru+DKsQsiX0PnC4PeyLLRE3cvNVmCQMZvN/2+XhIsKTuUKcEPkyoVMl31VJXZ4NwpXV0ucMdhYfzsFrXKfPVEhnFKsuc3ESAvPzzBd3tos4bOTDyoqtMQmyn7sOCHTGOL+G0/ae6hx+iN52XJMyREqodYQiBMRg+V1xSHiQ98mw8Zvxu7GDFnRB9PNecf+X0H3NuvDX47cUqiVcHz/YP9wzYhFCrEYOEdAJ4Z1gkjJvyIe/uX7kjHOIcFP6E0kZCMUq1iZqhFtmSQ9KiNqHNFdhP0aVsXdapEe4E5LvDZzjAJoIbL+ulRgMVeHiz4AfHUpWeEbQKTYLR9AtrHMByY8Q4LfkD42a2LcUabepXDNiF04pZ5y8iBBZ9JHJyMwyQVFvyAaBvzxskMw8QfFvyAaFOvtOD35wQmhmEChAU/IPRqmbAvmWGYIGHBD5FLz2wUtgkMwyQIFnwmsTx9GYfKMsmCBT8g4lBSOGmEWamUYcKABT9EWG7CpYnHPrIMEze4lk5A6C3acjJWeGx4ahDS+ftnEgaP8EOkRR2O0gkLFnsmibDgMwzDJAQW/IAol85fNcMw4cIqFBBRb6bNMEzZhwWfYRIG15dPLiz4DJMQLmhTBwAwg6uFJhYWfIZJCJkpt2Lc21Iy7uE4fIZJCNf0aIYG1bnxSZJhwWeYhNCnXV30aVc3bDOYEPHk0iGiy4loFREVEFG2yXYDiGgdEa0nopFejskwDMO4w6sPfyWAywDMNtqAiNIBvAJgIID2AIYTUXuPx2UYhmEc4smlI4RYAwCkVyimmO4A1gshNqa2HQ9gCIDVXo7NMAzDOCOIKJ1GALaonm9NvVYKIrqFiBYT0eI9e/YEYBrDMExysBzhE9EMAPV13npICPGFTGOEEGMBjAWA7Oxsrh7MMAwjEUvBF0L093iMbQCaqJ43Tr3GMAzDBEgQLp1FAFoTUXMiKgfgKgATAzguwzAMo8JrWOalRLQVwDkAJhHRtNTrDY
loMgAIIfIA3AlgGoA1AD4WQqzyZjbDMAzjFK9ROp8B+Ezn9e0ABqmeTwYw2cuxGIZhGG9wLR2GYZiEwILPMAyTEFjwGYZhEgILPsMwTEJgwQ+QPw9oF7YJDMMkGBb8AKl6GlejZhgmPFjwA+Sqs5qiRZ1KYZvBMExCYcEPkPQ0Qss6lcM2g2GYhMKCHzDNalYM2wSGYRIKC37AVCxf6Me/q2+rkC1hGCZpsOAHTPesmgCAW3u1DNkShmGSBoeNBMz5rWsjd1RO2GYwDJNAeITPMAyTEFjwGYZhEgILPsMwTEJgwWcYhkkILPgMwzAJgQWfYRgmIbDgMwzDJAQWfIZhmIRAQoiwbdCFiH4DsC5sO0yoDWBv2EYYEGXbgGjbx7a5J8r2Jcm2ZkKIOnpvRDnTdp0QIjtsI4wgosVRtS/KtgHRto9tc0+U7WPbCmGXDsMwTEJgwWcYhkkIURb8sWEbYEGU7YuybUC07WPb3BNl+9g2RHjRlmEYhpFLlEf4DMMwjERY8BmGYRJCJAWfiAYQ0ToiWk9EI8O2Rw0RvUlEu4loZdi2aCGiJkQ0k4hWE9EqIronbJsUiKgCEX1PRMtTtj0Wtk1aiCidiH4goi/DtkULEeUS0QoiWkZEi8O2Rw0RVSeiCUS0lojWENE5YdukQERtU9+Z8u8QEd0btl0KRHRf6npYSUQfElEFX48XNR8+EaUD+AnAhQC2AlgEYLgQYnWohqUgogsAHAbwrhCiY9j2qCGiBgAaCCGWElEVAEsAXBKF746ICEAlIcRhIsoEMBfAPUKIBSGbVgQR3Q8gG0BVIcTFYdujhohyAWQLISKXPERE7wCYI4R4g4jKAagohDgQtl1aUtqyDcDZQohNEbCnEQqvg/ZCiGNE9DGAyUKIt/06ZhRH+N0BrBdCbBRCnAQwHsCQkG0qQggxG8D+sO3QQwixQwixNPX4NwBrADQK16pCRCGHU08zU/8iM9ogosYAcgC8EbYtcYKIqgG4AMA4ABBCnIyi2KfoB2BDFMReRQaA04goA0BFANv9PFgUBb8RgC2q51sREdGKE0SUBeBMAAvDtaSYlMtkGYDdAL4SQkTGNgAvAhgBoCBsQwwQAKYT0RIiuiVsY1Q0B7AHwFspd9gbRFQpbKMMuArAh2EboSCE2AbgOQCbAewAcFAIMd3PY0ZR8BmPEFFlAJ8AuFcIcShsexSEEPlCiC4AGgPoTkSRcIkR0cUAdgshloRtiwnnCyG6AhgI4I6UazEKZADoCmCMEOJMAEcARGrdDQBSrqbBAP4vbFsUiKgGCr0XzQE0BFCJiK7x85hRFPxtAJqonjdOvcbYIOUf/wTA+0KIT8O2R4/UlH8mgAFh25LiPACDU37y8QD6EtG/wzWpJKnRIIQQuwF8hkLXZxTYCmCrarY2AYU3gKgxEMBSIcSusA1R0R/AL0KIPUKIUwA+BXCunweMouAvAtCaiJqn7spXAZgYsk2xILUwOg7AGiHE82Hbo4aI6hBR9dTj01C4KL82XKsKEUI8IIRoLITIQuH59o0QwteRlhOIqFJqER4pd8lFACIRJSaE2AlgCxG1Tb3UD0DoQQI6DEeE3DkpNgPoQUQVU9duPxSuu/lG5KplCiHyiOhOANMApAN4UwixKmSziiCiDwH0BlCbiLYCeEQIMS5cq4o4D8C1AFakfOUA8KAQYnKINik0APBOKlIiDcDHQojIhT9GlHoAPivUBGQA+EAIMTVck0pwF4D3UwO0jQBuDNmeEqRukhcC+FPYtqgRQiwkogkAlgLIA/ADfC6zELmwTIZhGMYfoujSYRiGYXyABZ9hGCYhsOAzDMMkBBZ8hmGYhMCCzzAMkxBY8BkGABHVUlVU3ElE21KPDxPRq2HbxzAy4LBMhtFARI8COCyEeC5sWxhGJjzCZxgTiKi3Uh+fiB4loneIaA4RbSKiy4jomVSd+qmpshYgom5E9G2q0Nm0VNlqhgkdFnyGcUZLAH1RWIjr3wBmCi
HOAHAMQE5K9P8BYJgQohuANwE8GZaxDKMmcqUVGCbiTBFCnCKiFSgs/aGUOFgBIAtAWwAdAXyVKoWQjsLStwwTOiz4DOOMEwAghCggolOieBGsAIXXEwFYJYSITJs/hlFglw7DyGUdgDpKX1ciyiSiDiHbxDAAWPAZRiqptpzDAIwmouUAlsHnGucMYxcOy2QYhkkIPMJnGIZJCCz4DMMwCYEFn2EYJiGw4DMMwyQEFnyGYZiEwILPMAyTEFjwGYZhEsL/A/dtCXm6BAJfAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "with paddle.no_grad():\n", - " wav = vocoder.infer(mel_input)\n", - "wav = wav.numpy()[0]\n", - "sf.write(f\"syn_audio/{ref_name}\", wav, samplerate=22050)\n", - "librosa.display.waveplot(wav)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ipd.Audio(wav, rate=22050)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/examples/aishell3/vc0/local/train.sh b/examples/aishell3/vc0/local/train.sh new file mode 100755 index 000000000..eb968b5fc --- /dev/null +++ b/examples/aishell3/vc0/local/train.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +preprocess_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ + --data=${preprocess_path} \ + --output=${train_output_path} \ + --device="gpu" \ No newline at end of file diff --git a/examples/aishell3/vc0/local/voice_cloning.sh b/examples/aishell3/vc0/local/voice_cloning.sh new file mode 100755 index 000000000..ee96b9e0d --- /dev/null +++ b/examples/aishell3/vc0/local/voice_cloning.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +ge2e_params_path=$1 +tacotron2_params_path=$2 +waveflow_params_path=$3 +vc_input=$4 +vc_output=$5 + +python3 ${BIN_DIR}/voice_cloning.py \ + 
--ge2e_params_path=${ge2e_params_path} \ + --tacotron2_params_path=${tacotron2_params_path} \ + --waveflow_params_path=${waveflow_params_path} \ + --input-dir=${vc_input} \ + --output-dir=${vc_output} \ No newline at end of file diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh new file mode 100755 index 000000000..df2af8035 --- /dev/null +++ b/examples/aishell3/vc0/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=tacotron2_ge2e +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL} diff --git a/examples/aishell3/vc0/run.sh b/examples/aishell3/vc0/run.sh old mode 100644 new mode 100755 index e69de29bb..dab9a5ceb --- a/examples/aishell3/vc0/run.sh +++ b/examples/aishell3/vc0/run.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +input=~/datasets/data_aishell3/train +preprocess_path=dump +alignment=./alignment + +# not include ".pdparams" here +ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000 +train_output_path=output +# include ".pdparams" here +ge2e_params_path=${ge2e_ckpt_path}.pdparams +tacotron2_params_path=${train_output_path}/checkpoints/step-1000.pdparams +# pretrained model +# tacotron2_params_path=./tacotron2_aishell3_ckpt_0.3/step-450000.pdparams +waveflow_params_path=./waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams +vc_input=ref_audio +vc_output=syn_audio + + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || 
exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output} || exit -1 +fi + + diff --git a/examples/csmsc/README.md b/examples/csmsc/README.md new file mode 100644 index 000000000..08a513491 --- /dev/null +++ b/examples/csmsc/README.md @@ -0,0 +1,11 @@ + +# CSMSC + +* tts0 - Tacotron2 +* tts1 - TransformerTTS +* tts2 - SpeedySpeech +* tts3 - FastSpeech2 +* voc0 - WaveFlow +* voc1 - Parallel WaveGAN +* voc2 - MelGAN +* voc3 - MultiBand MelGAN diff --git a/examples/csmsc/speedyspeech/baker/inference.sh b/examples/csmsc/speedyspeech/baker/inference.sh deleted file mode 100755 index 880a1fd53..000000000 --- a/examples/csmsc/speedyspeech/baker/inference.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -python3 inference.py \ - --inference-dir=exp/default/inference \ - --text=../sentences.txt \ - --output-dir=exp/default/pd_infer_out \ - --phones-dict=dump/phone_id_map.txt \ - --tones-dict=dump/tone_id_map.txt diff --git a/examples/csmsc/speedyspeech/baker/README.md b/examples/csmsc/tts2/README.md similarity index 86% rename from examples/csmsc/speedyspeech/baker/README.md rename to examples/csmsc/tts2/README.md index 0484d4846..f5d8a720a 100644 --- a/examples/csmsc/speedyspeech/baker/README.md +++ b/examples/csmsc/tts2/README.md @@ -1,5 +1,4 @@ # Speedyspeech with CSMSC - This example contains code used to train a [Speedyspeech](http://arxiv.org/abs/2008.03802) model with [Chinese Standard Mandarin Speech Copus](https://www.data-baker.com/open_source.html). NOTE that we only implement the student part of the Speedyspeech model. The ground truth alignment used to train the model is extracted from the dataset using [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner). 
## Dataset @@ -10,12 +9,23 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo. -## Preprocess the dataset +## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. -Run the command below to preprocess the dataset. +Run the command below to +1. **source path**. +2. preprocess the dataset, +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from text file. +5. inference using static model. ```bash -./preprocess.sh +./run.sh +``` +### Preprocess the dataset +```bash +./local/preprocess.sh ${conf_path} ``` When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. @@ -37,13 +47,12 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, tones, durations, path of spectrogram, and id of each utterance. -## Train the model -`./run.sh` calls `../train.py`. +### Train the model +`./local/train.sh` calls `${BIN_DIR}/train.py`. ```bash -./run.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 ``` Here's the complete help message. - ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] @@ -81,20 +90,7 @@ optional arguments: 6. `--phones-dict` is the path of the phone vocabulary file. 7. 
`--tones-dict` is the path of the tone vocabulary file. -## Pretrained Model -Pretrained SpeedySpeech model with no silence in the edge of audios. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip) - -SpeedySpeech checkpoint contains files listed below. -```text -speedyspeech_nosil_baker_ckpt_0.5 -├── default.yaml # default config used to train speedyspeech -├── feats_stats.npy # statistics used to normalize spectrogram when training speedyspeech -├── phone_id_map.txt # phone vocabulary file when training speedyspeech -├── snapshot_iter_11400.pdz # model parameters and optimizer states -└── tone_id_map.txt # tone vocabulary file when training speedyspeech -``` - -## Synthesize +### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder. Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it. ```bash @@ -107,9 +103,9 @@ pwg_baker_ckpt_0.4 ├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan └── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` -`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -./synthesize.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text usage: synthesize.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG] @@ -152,9 +148,9 @@ optional arguments: --device DEVICE device type to use --verbose VERBOSE verbose ``` -`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file. 
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file. ```bash -./synthesize_e2e.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text usage: synthesize_e2e.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG] @@ -203,21 +199,40 @@ optional arguments: 4. `--output-dir` is the directory to save synthesized audio files. 5. `--inference-dir` is the directory to save exported model, which can be used with paddle infernece. 6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. -6. `--phones-dict` is the path of the phone vocabulary file. -7. `--tones-dict` is the path of the tone vocabulary file. +7. `--phones-dict` is the path of the phone vocabulary file. +8. `--tones-dict` is the path of the tone vocabulary file. + +### Inference +After Synthesize, we will get static models of speedyspeech and pwgan in `${train_output_path}/inference`. +`./local/inference.sh` calls `${BIN_DIR}/inference.py`, which provides a paddle static model inference example for speedyspeech + pwgan synthesize. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} +``` + +## Pretrained Model +Pretrained SpeedySpeech model with no silence in the edge of audios. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip) -You can use the following scripts to synthesize for `../sentences.txt` using pretrained speedyspeech and parallel wavegan models. +SpeedySpeech checkpoint contains files listed below. 
+```text +speedyspeech_nosil_baker_ckpt_0.5 +├── default.yaml # default config used to train speedyspeech +├── feats_stats.npy # statistics used to normalize spectrogram when training speedyspeech +├── phone_id_map.txt # phone vocabulary file when training speedyspeech +├── snapshot_iter_11400.pdz # model parameters and optimizer states +└── tone_id_map.txt # tone vocabulary file when training speedyspeech +``` +You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained speedyspeech and parallel wavegan models. ```bash FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ +python3 ${BIN_DIR}/synthesize_e2e.py \ --speedyspeech-config=speedyspeech_nosil_baker_ckpt_0.5/default.yaml \ --speedyspeech-checkpoint=speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz \ --speedyspeech-stat=speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy \ --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --text=../sentences.txt \ + --text=${BIN_DIR}/../sentences.txt \ --output-dir=exp/default/test_e2e \ --inference-dir=exp/default/inference \ --device="gpu" \ diff --git a/examples/csmsc/speedyspeech/baker/conf/default.yaml b/examples/csmsc/tts2/conf/default.yaml similarity index 100% rename from examples/csmsc/speedyspeech/baker/conf/default.yaml rename to examples/csmsc/tts2/conf/default.yaml diff --git a/examples/csmsc/tts2/local/inference.sh b/examples/csmsc/tts2/local/inference.sh new file mode 100755 index 000000000..37e2e55c7 --- /dev/null +++ b/examples/csmsc/tts2/local/inference.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +train_output_path=$1 + +python3 ${BIN_DIR}/inference.py \ + --inference-dir=${train_output_path}/inference \ + --text=${BIN_DIR}/../sentences.txt \ + --output-dir=${train_output_path}/pd_infer_out \ + --phones-dict=dump/phone_id_map.txt \ + 
--tones-dict=dump/tone_id_map.txt diff --git a/examples/csmsc/speedyspeech/baker/preprocess.sh b/examples/csmsc/tts2/local/preprocess.sh similarity index 88% rename from examples/csmsc/speedyspeech/baker/preprocess.sh rename to examples/csmsc/tts2/local/preprocess.sh index 422caa310..f7f5ea74c 100755 --- a/examples/csmsc/speedyspeech/baker/preprocess.sh +++ b/examples/csmsc/tts2/local/preprocess.sh @@ -1,9 +1,10 @@ #!/bin/bash + stage=0 stop_stage=100 -export MAIN_ROOT=`realpath ${PWD}/../../../` +config_path=$1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # get durations from MFA's result @@ -11,17 +12,17 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ --inputdir=./baker_alignment_tone \ --output=durations.txt \ - --config=conf/default.yaml + --config=${config_path} fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then echo "Extract features ..." - python3 ../preprocess.py \ + python3 ${BIN_DIR}/preprocess.py \ --dataset=baker \ --rootdir=~/datasets/BZNSYP/ \ --dumpdir=dump \ --dur-file=durations.txt \ - --config=conf/default.yaml \ + --config=${config_path} \ --num-cpu=20 \ --cut-sil=True \ --use-relative-path=True @@ -38,7 +39,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # normalize and covert phone/tone to id, dev and test should use train's stats echo "Normalize ..." 
- python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --stats=dump/train/feats_stats.npy \ @@ -46,7 +47,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --tones-dict=dump/tone_id_map.txt \ --use-relative-path=True - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --stats=dump/train/feats_stats.npy \ @@ -54,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --tones-dict=dump/tone_id_map.txt \ --use-relative-path=True - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --stats=dump/train/feats_stats.npy \ diff --git a/examples/csmsc/speedyspeech/baker/synthesize.sh b/examples/csmsc/tts2/local/synthesize.sh similarity index 61% rename from examples/csmsc/speedyspeech/baker/synthesize.sh rename to examples/csmsc/tts2/local/synthesize.sh index 7c37c5bd7..418ee02e6 100755 --- a/examples/csmsc/speedyspeech/baker/synthesize.sh +++ b/examples/csmsc/tts2/local/synthesize.sh @@ -1,16 +1,20 @@ #!/bin/bash +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ../synthesize.py \ - --speedyspeech-config=conf/default.yaml \ - --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_11400.pdz \ +python3 ${BIN_DIR}/synthesize.py \ + --speedyspeech-config=${config_path} \ + --speedyspeech-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --speedyspeech-stat=dump/train/feats_stats.npy \ --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test \ - --inference-dir=exp/default/inference \ + --output-dir=${train_output_path}/test \ + 
--inference-dir=${train_output_path}/inference \ --phones-dict=dump/phone_id_map.txt \ --tones-dict=dump/tone_id_map.txt \ --device="gpu" diff --git a/examples/csmsc/speedyspeech/baker/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh similarity index 54% rename from examples/csmsc/speedyspeech/baker/synthesize_e2e.sh rename to examples/csmsc/tts2/local/synthesize_e2e.sh index 4800a0f71..c50fa7765 100755 --- a/examples/csmsc/speedyspeech/baker/synthesize_e2e.sh +++ b/examples/csmsc/tts2/local/synthesize_e2e.sh @@ -1,16 +1,21 @@ #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python synthesize_e2e.py \ - --speedyspeech-config=conf/default.yaml \ - --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_11400.pdz \ +python3 ${BIN_DIR}/synthesize_e2e.py \ + --speedyspeech-config=${config_path} \ + --speedyspeech-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --speedyspeech-stat=dump/train/feats_stats.npy \ --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --text=../sentences.txt \ - --output-dir=exp/default/test_e2e \ - --inference-dir=exp/default/inference \ + --text=${BIN_DIR}/../sentences.txt \ + --output-dir=${train_output_path}/test_e2e \ + --inference-dir=${train_output_path}/inference \ --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --tones-dict=dump/tone_id_map.txt diff --git a/examples/csmsc/speedyspeech/baker/run.sh b/examples/csmsc/tts2/local/train.sh similarity index 64% rename from examples/csmsc/speedyspeech/baker/run.sh rename to examples/csmsc/tts2/local/train.sh index 64936ef34..e44c7da5b 100755 --- a/examples/csmsc/speedyspeech/baker/run.sh +++ b/examples/csmsc/tts2/local/train.sh @@ -1,11 +1,14 @@ #!/bin/bash -python ../train.py \ +config_path=$1 +train_output_path=$2 + +python 
${BIN_DIR}/train.py \ --train-metadata=dump/train/norm/metadata.jsonl \ --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=conf/default.yaml \ - --output-dir=exp/default \ + --config=${config_path} \ + --output-dir=${train_output_path} \ --nprocs=2 \ --phones-dict=dump/phone_id_map.txt \ --tones-dict=dump/tone_id_map.txt \ diff --git a/examples/csmsc/tts2/path.sh b/examples/csmsc/tts2/path.sh new file mode 100755 index 000000000..1a9519f37 --- /dev/null +++ b/examples/csmsc/tts2/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=speedyspeech +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL} \ No newline at end of file diff --git a/examples/csmsc/tts2/run.sh b/examples/csmsc/tts2/run.sh new file mode 100755 index 000000000..5d00a0700 --- /dev/null +++ b/examples/csmsc/tts2/run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_76.pdz + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh 
${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model + CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1 +fi diff --git a/examples/vctk/fastspeech2/baker/README.md b/examples/csmsc/tts3/README.md similarity index 90% rename from examples/vctk/fastspeech2/baker/README.md rename to examples/csmsc/tts3/README.md index a9f0fc8b2..3e28bed0b 100644 --- a/examples/vctk/fastspeech2/baker/README.md +++ b/examples/csmsc/tts3/README.md @@ -9,13 +9,22 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo. -### Preprocess the dataset +## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. -Run the command below to preprocess the dataset. - +Run the command below to +1. **source path**. +2. preprocess the dataset, +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from text file. +```bash +./run.sh +``` +### Preprocess the dataset ```bash -./preprocess.sh +./local/preprocess.sh ${conf_path} ``` When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. @@ -40,11 +49,11 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi Also there is a `metadata.jsonl` in each subfolder. 
It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance. -## Train the model -`./run.sh` calls `../train.py`. +### Train the model ```bash -./run.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} ``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] @@ -78,18 +87,7 @@ optional arguments: 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. 6. `--phones-dict` is the path of the phone vocabulary file. -## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip) - -FastSpeech2 checkpoint contains files listed below. -```text -fastspeech2_nosil_baker_ckpt_0.4 -├── default.yaml # default config used to train fastspeech2 -├── phone_id_map.txt # phone vocabulary file when training fastspeech2 -├── snapshot_iter_76000.pdz # model parameters and optimizer states -└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 -``` -## Synthesize +### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder. Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it. 
```bash @@ -102,9 +100,9 @@ pwg_baker_ckpt_0.4 ├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan └── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` -`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -./synthesize.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] @@ -144,9 +142,9 @@ optional arguments: --device DEVICE device type to use. --verbose VERBOSE verbose. ``` -`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file. +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file. ```bash -./synthesize_e2e.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] @@ -191,18 +189,29 @@ optional arguments: 5. `--output-dir` is the directory to save synthesized audio files. 6. `--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. -You can use the following scripts to synthesize for `../sentences.txt` using pretrained fastspeech2 and parallel wavegan models. +## Pretrained Model +Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip) + +FastSpeech2 checkpoint contains files listed below. 
+```text +fastspeech2_nosil_baker_ckpt_0.4 +├── default.yaml # default config used to train fastspeech2 +├── phone_id_map.txt # phone vocabulary file when training fastspeech2 +├── snapshot_iter_76000.pdz # model parameters and optimizer states +└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 +``` +You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models. ```bash FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ +python3 ${BIN_DIR}/synthesize_e2e.py \ --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --text=../sentences.txt \ + --text=${BIN_DIR}/../sentences.txt \ --output-dir=exp/default/test_e2e \ --device="gpu" \ --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt diff --git a/examples/vctk/fastspeech2/baker/conf/default.yaml b/examples/csmsc/tts3/conf/default.yaml similarity index 100% rename from examples/vctk/fastspeech2/baker/conf/default.yaml rename to examples/csmsc/tts3/conf/default.yaml diff --git a/examples/vctk/fastspeech2/baker/preprocess.sh b/examples/csmsc/tts3/local/preprocess.sh similarity index 90% rename from examples/vctk/fastspeech2/baker/preprocess.sh rename to examples/csmsc/tts3/local/preprocess.sh index dff3e349d..c83d9a9b6 100755 --- a/examples/vctk/fastspeech2/baker/preprocess.sh +++ b/examples/csmsc/tts3/local/preprocess.sh @@ -3,7 +3,7 @@ stage=0 stop_stage=100 -export MAIN_ROOT=`realpath ${PWD}/../../../` +config_path=$1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # get durations from MFA's 
result @@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ --inputdir=./baker_alignment_tone \ --output=durations.txt \ - --config=conf/default.yaml + --config=${config_path} fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # extract features echo "Extract features ..." - python3 ../preprocess.py \ + python3 ${BIN_DIR}/preprocess.py \ --dataset=baker \ --rootdir=~/datasets/BZNSYP/ \ --dumpdir=dump \ --dur-file=durations.txt \ - --config=conf/default.yaml \ + --config=${config_path} \ --num-cpu=20 \ --cut-sil=True fi @@ -46,7 +46,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # normalize and covert phone/speaker to id, dev and test should use train's stats echo "Normalize ..." - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --speech-stats=dump/train/speech_stats.npy \ @@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --speech-stats=dump/train/speech_stats.npy \ @@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --speech-stats=dump/train/speech_stats.npy \ diff --git a/examples/vctk/fastspeech2/baker/simple.lexicon b/examples/csmsc/tts3/local/simple.lexicon similarity index 100% rename from examples/vctk/fastspeech2/baker/simple.lexicon rename to examples/csmsc/tts3/local/simple.lexicon diff --git a/examples/vctk/fastspeech2/baker/synthesize.sh b/examples/csmsc/tts3/local/synthesize.sh similarity index 63% rename from 
examples/vctk/fastspeech2/baker/synthesize.sh rename to examples/csmsc/tts3/local/synthesize.sh index 535ebdba4..724afb04a 100755 --- a/examples/vctk/fastspeech2/baker/synthesize.sh +++ b/examples/csmsc/tts3/local/synthesize.sh @@ -1,14 +1,19 @@ #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ../synthesize.py \ - --fastspeech2-config=conf/default.yaml \ - --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_76000.pdz \ +python3 ${BIN_DIR}/synthesize.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --fastspeech2-stat=dump/train/speech_stats.npy \ --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test \ + --output-dir=${train_output_path}/test \ --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/vctk/fastspeech2/baker/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh similarity index 56% rename from examples/vctk/fastspeech2/baker/synthesize_e2e.sh rename to examples/csmsc/tts3/local/synthesize_e2e.sh index a2deec145..8c9755dd0 100755 --- a/examples/vctk/fastspeech2/baker/synthesize_e2e.sh +++ b/examples/csmsc/tts3/local/synthesize_e2e.sh @@ -1,14 +1,19 @@ #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ - --fastspeech2-config=conf/default.yaml \ - --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_153.pdz \ +python3 ${BIN_DIR}/synthesize_e2e.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --fastspeech2-stat=dump/train/speech_stats.npy \ 
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --text=../sentences.txt \ - --output-dir=exp/default/test_e2e \ + --text=${BIN_DIR}/../sentences.txt \ + --output-dir=${train_output_path}/test_e2e \ --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/csmsc/tts3/local/train.sh b/examples/csmsc/tts3/local/train.sh new file mode 100755 index 000000000..fbbc9a9de --- /dev/null +++ b/examples/csmsc/tts3/local/train.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --nprocs=1 \ + --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/csmsc/tts3/path.sh b/examples/csmsc/tts3/path.sh new file mode 100755 index 000000000..561d01632 --- /dev/null +++ b/examples/csmsc/tts3/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=fastspeech2 +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL} diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh new file mode 100755 index 000000000..24e439924 --- /dev/null +++ b/examples/csmsc/tts3/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_153.pdz + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + bash ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ 
${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/vctk/GANVocoder/parallelwave_gan/baker/README.md b/examples/csmsc/voc1/README.md similarity index 88% rename from examples/vctk/GANVocoder/parallelwave_gan/baker/README.md rename to examples/csmsc/voc1/README.md index a58fd9229..2a7b3185b 100644 --- a/examples/vctk/GANVocoder/parallelwave_gan/baker/README.md +++ b/examples/csmsc/voc1/README.md @@ -1,6 +1,6 @@ # Parallel WaveGAN with CSMSC This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [Chinese Standard Mandarin Speech Copus](https://www.data-baker.com/open_source.html). -## Preprocess the dataset +## Dataset ### Download and Extract the datasaet Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`. @@ -8,12 +8,21 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo. 
-### Preprocess the dataset +## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. -Run the command below to preprocess the dataset. +Run the command below to +1. **source path**. +2. preprocess the dataset, +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. +```bash +./run.sh +``` +### Preprocess the dataset ```bash -./preprocess.sh +./local/preprocess.sh ${conf_path} ``` When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. @@ -30,17 +39,15 @@ dump ├── raw └── feats_stats.npy ``` - The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains log magnitude of mel spectrogram of each utterances, while the norm folder contains normalized spectrogram. The statistics used to normalize the spectrogram is computed from the training set, which is located in `dump/train/feats_stats.npy`. Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains id and paths to spectrogam of each utterance. -## Train the model - -`./run.sh` calls `../train.py`. +### Train the model ```bash -./run.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} ``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. Here's the complete help message. ```text @@ -86,25 +93,10 @@ benchmark: 4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -## Pretrained Models - -Pretrained models can be downloaded here: -1. Parallel WaveGAN checkpoint. [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip), which is used as a vocoder in the end-to-end inference script. 
- -Parallel WaveGAN checkpoint contains files listed below. - -```text -pwg_baker_ckpt_0.4 -├── pwg_default.yaml # default config used to train parallel wavegan -├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan -└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan -``` - -## Synthesize - -`synthesize.sh` calls `../synthesize.py `, which can synthesize waveform from `metadata.jsonl`. +### Synthesize +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -./synthesize.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] @@ -127,10 +119,21 @@ optional arguments: ``` 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained. -2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. If you use the pretrained model, use the `pwg_snapshot_iter_400000.pdz`. +2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 4. `--output-dir` is the directory to save the synthesized audio files. 5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. +## Pretrained Models +Pretrained models can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip). + +Parallel WaveGAN checkpoint contains files listed below. 
+ +```text +pwg_baker_ckpt_0.4 +├── pwg_default.yaml # default config used to train parallel wavegan +├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan +└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +``` ## Acknowledgement We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/vctk/GANVocoder/parallelwave_gan/baker/conf/default.yaml b/examples/csmsc/voc1/conf/default.yaml similarity index 100% rename from examples/vctk/GANVocoder/parallelwave_gan/baker/conf/default.yaml rename to examples/csmsc/voc1/conf/default.yaml diff --git a/examples/vctk/GANVocoder/parallelwave_gan/baker/preprocess.sh b/examples/csmsc/voc1/local/preprocess.sh similarity index 83% rename from examples/vctk/GANVocoder/parallelwave_gan/baker/preprocess.sh rename to examples/csmsc/voc1/local/preprocess.sh index df5b7d22e..61d6d62be 100755 --- a/examples/vctk/GANVocoder/parallelwave_gan/baker/preprocess.sh +++ b/examples/csmsc/voc1/local/preprocess.sh @@ -3,7 +3,7 @@ stage=0 stop_stage=100 -export MAIN_ROOT=`realpath ${PWD}/../../../../` +config_path=$1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # get durations from MFA's result @@ -11,17 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ --inputdir=./baker_alignment_tone \ --output=durations.txt \ - --config=conf/default.yaml + --config=${config_path} fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features echo "Extract features ..." - python3 ../../preprocess.py \ + python3 ${BIN_DIR}/../preprocess.py \ --rootdir=~/datasets/BZNSYP/ \ --dataset=baker \ --dumpdir=dump \ --dur-file=durations.txt \ - --config=conf/default.yaml \ + --config=${config_path} \ --cut-sil=True \ --num-cpu=20 fi @@ -38,16 +39,16 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # normalize, dev and test should use train's stats echo "Normalize ..." 
- python3 ../../normalize.py \ + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --stats=dump/train/feats_stats.npy - python3 ../../normalize.py \ + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --stats=dump/train/feats_stats.npy - python3 ../../normalize.py \ + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --stats=dump/train/feats_stats.npy diff --git a/examples/csmsc/voc1/local/synthesize.sh b/examples/csmsc/voc1/local/synthesize.sh new file mode 100755 index 000000000..9f904ac0c --- /dev/null +++ b/examples/csmsc/voc1/local/synthesize.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/synthesize.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test diff --git a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/run.sh b/examples/csmsc/voc1/local/train.sh similarity index 60% rename from examples/vctk/GANVocoder/parallelwave_gan/ljspeech/run.sh rename to examples/csmsc/voc1/local/train.sh index df8cefd88..1ef860c36 100755 --- a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/run.sh +++ b/examples/csmsc/voc1/local/train.sh @@ -1,10 +1,13 @@ #!/bin/bash +config_path=$1 +train_output_path=$2 + FLAGS_cudnn_exhaustive_search=true \ FLAGS_conv_workspace_size_limit=4000 \ -python ../train.py \ +python ${BIN_DIR}/train.py \ --train-metadata=dump/train/norm/metadata.jsonl \ --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=conf/default.yaml \ - --output-dir=exp/default \ + --config=${config_path} \ + --output-dir=${train_output_path} \ --nprocs=1 diff --git a/examples/csmsc/voc1/path.sh 
b/examples/csmsc/voc1/path.sh new file mode 100755 index 000000000..28d39ae00 --- /dev/null +++ b/examples/csmsc/voc1/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=parallelwave_gan +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/csmsc/voc1/run.sh b/examples/csmsc/voc1/run.sh new file mode 100755 index 000000000..666a15120 --- /dev/null +++ b/examples/csmsc/voc1/run.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_5000.pdz + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/ljspeech/README.md b/examples/ljspeech/README.md index db87b149a..67b1bf473 100644 --- a/examples/ljspeech/README.md +++ b/examples/ljspeech/README.md @@ -2,5 +2,10 @@ # LJSpeech * tts0 - Tactron2 -* tts1 - TransformerTTS -* voc0 - WaveFlow +* tts1 - TransformerTTS +* tts2 - SpeedySpeech +* tts3 - FastSpeech2 +* voc0 - WaveFlow +* voc1 - Parallel WaveGAN +* voc2 - MelGAN +* voc3 - MultiBand MelGAN diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md new file mode 100644 index 
000000000..e95f6614d --- /dev/null +++ b/examples/ljspeech/tts0/README.md @@ -0,0 +1,87 @@ +# Tacotron2 with LJSpeech +PaddlePaddle dynamic graph implementation of Tacotron2, a neural network architecture for speech synthesis directly from text. The implementation is based on [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884). + +## Dataset +We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/). + +```bash +wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 +tar xjvf LJSpeech-1.1.tar.bz2 +``` +## Get Started +Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize mels. +```bash +./run.sh +``` +### Preprocess the dataset +```bash +./local/preprocess.sh ${preprocess_path} +``` +### Train the model +`./local/train.sh` calls `${BIN_DIR}/train.py`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} +``` +Here's the complete help message. +```text +usage: train.py [-h] [--config FILE] [--data DATA_DIR] [--output OUTPUT_DIR] + [--checkpoint_path CHECKPOINT_PATH] [--device {cpu,gpu}] + [--nprocs NPROCS] [--opts ...] + +optional arguments: + -h, --help show this help message and exit + --config FILE path of the config file to overwrite to default config + with. + --data DATA_DIR path to the datatset. + --output OUTPUT_DIR path to save checkpoint and logs. + --checkpoint_path CHECKPOINT_PATH + path of the checkpoint to load + --device {cpu,gpu} device type to use, cpu and gpu are supported. + --nprocs NPROCS number of parallel processes to use. + --opts ... options to overwrite --config file and the default + config, passing in KEY VALUE pairs +``` + +If you want to train on CPU, just set ``--device=cpu``. +If you want to train on multiple GPUs, just set ``--nprocs`` to the number of GPUs.
+By default, training will be resumed from the latest checkpoint in ``--output``. If you want to start a new training, please use a new ``${OUTPUTPATH}`` with no checkpoint. +And if you want to resume from another existing model, you should set ``checkpoint_path`` to be the checkpoint path you want to load. +**Note: The checkpoint path cannot contain the file extension.** + +### Synthesize +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which synthesizes **mels** from the sentences in a text file. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] [--config FILE] [--checkpoint_path CHECKPOINT_PATH] + [--input INPUT] [--output OUTPUT] [--device DEVICE] + [--opts ...] [-v] + +generate mel spectrogram with TransformerTTS. + +optional arguments: + -h, --help show this help message and exit + --config FILE extra config to overwrite the default config + --checkpoint_path CHECKPOINT_PATH + path of the checkpoint to load. + --input INPUT path of the text sentences + --output OUTPUT path to save outputs + --device DEVICE device type to use. + --opts ... options to overwrite --config file and the default + config, passing in KEY VALUE pairs + -v, --verbose print msg +``` +**Ps.** You can use [waveflow](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/waveflow) as the neural vocoder to synthesize mels to wavs. (Please refer to `synthesize.sh` in our LJSpeech waveflow example) + +## Pretrained Models +Pretrained Models can be downloaded from the links below. We provide 2 models with different configurations. + +1. This model uses a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip) + +2. This model does not have a stop token predictor. It uses the attention peak position to decide whether all the contents have been uttered. Also guided attention loss is used to speed up training.
This model is trained with `configs/alternative.yaml`.[tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip) diff --git a/examples/ljspeech/tts0/local/preprocess.sh b/examples/ljspeech/tts0/local/preprocess.sh new file mode 100755 index 000000000..c39a3172d --- /dev/null +++ b/examples/ljspeech/tts0/local/preprocess.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +preprocess_path=$1 + +python3 ${BIN_DIR}/preprocess.py \ + --input=~/datasets/LJSpeech-1.1 \ + --output=${preprocess_path} \ + -v \ No newline at end of file diff --git a/examples/ljspeech/tts0/local/synthesize.sh b/examples/ljspeech/tts0/local/synthesize.sh new file mode 100755 index 000000000..91c89dd49 --- /dev/null +++ b/examples/ljspeech/tts0/local/synthesize.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +train_output_path=$1 +ckpt_name=$2 + +python3 ${BIN_DIR}/synthesize.py \ + --config=${train_output_path}/config.yaml \ + --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \ + --input=${BIN_DIR}/../sentences_en.txt \ + --output=${train_output_path}/test \ + --device=gpu \ No newline at end of file diff --git a/examples/ljspeech/tts0/local/tacotron2/README.md b/examples/ljspeech/tts0/local/tacotron2/README.md deleted file mode 100644 index e5f159df9..000000000 --- a/examples/ljspeech/tts0/local/tacotron2/README.md +++ /dev/null @@ -1,92 +0,0 @@ -# Tacotron2 - -PaddlePaddle dynamic graph implementation of Tacotron2, a neural network architecture for speech synthesis directly from text. The implementation is based on [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884).
- -## Project Structure - -```text -├── config.py # default configuration file -├── ljspeech.py # dataset and dataloader settings for LJSpeech -├── preprocess.py # script to preprocess LJSpeech dataset -├── synthesize.py # script to synthesize spectrogram from text -├── train.py # script for tacotron2 model training -├── synthesize.ipynb # notebook example for end-to-end TTS -``` - -## Dataset - -We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/). - -```bash -wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 -tar xjvf LJSpeech-1.1.tar.bz2 -``` - -Then you need to preprocess the data by running ``preprocess.py``, the preprocessed data will be placed in ``--output`` directory. - -```bash -python preprocess.py \ ---input=${DATAPATH} \ ---output=${PREPROCESSEDDATAPATH} \ --v \ -``` - -For more help on arguments - -``python preprocess.py --help``. - -## Train the model - -Tacotron2 model can be trained by running ``train.py``. - -```bash -python train.py \ ---data=${PREPROCESSEDDATAPATH} \ ---output=${OUTPUTPATH} \ ---device=gpu \ -``` - -If you want to train on CPU, just set ``--device=cpu``. -If you want to train on multiple GPUs, just set ``--nprocs`` as num of GPU. -By default, training will be resumed from the latest checkpoint in ``--output``, if you want to start a new training, please use a new ``${OUTPUTPATH}`` with no checkpoint. And if you want to resume from an other existing model, you should set ``checkpoint_path`` to be the checkpoint path you want to load. - -**Note: The checkpoint path cannot contain the file extension.** - -For more help on arguments - -``python train_transformer.py --help``. - -## Synthesize - -After training the Tacotron2, spectrogram can be synthesized by running ``synthesize.py``. 
- -```bash -python synthesize.py \ ---config=${CONFIGPATH} \ ---checkpoint_path=${CHECKPOINTPATH} \ ---input=${TEXTPATH} \ ---output=${OUTPUTPATH} ---device=gpu -``` - -The ``${CONFIGPATH}`` needs to be matched with ``${CHECKPOINTPATH}``. - -For more help on arguments - -``python synthesize.py --help``. - -Then you can find the spectrogram files in ``${OUTPUTPATH}``, and then they can be the input of vocoder like [waveflow](../waveflow/README.md#Synthesis) to get audio files. - - -## Pretrained Models - -Pretrained Models can be downloaded from links below. We provide 2 models with different configurations. - -1. This model use a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip) - -2. This model does not have a stop token predictor. It uses the attention peak position to decided whether all the contents have been uttered. Also guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`.[tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip) - - -## Notebook: End-to-end TTS - -See [synthesize.ipynb](./synthesize.ipynb) for details about end-to-end TTS with tacotron2 and waveflow. 
diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh new file mode 100755 index 000000000..b8bcf5cb9 --- /dev/null +++ b/examples/ljspeech/tts0/local/train.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +preprocess_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ + --data=${preprocess_path} \ + --output=${train_output_path} \ + --device=gpu \ No newline at end of file diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh new file mode 100755 index 000000000..590e7a9c9 --- /dev/null +++ b/examples/ljspeech/tts0/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=tacotron2 +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL} diff --git a/examples/ljspeech/tts0/run.sh b/examples/ljspeech/tts0/run.sh old mode 100644 new mode 100755 index e69de29bb..9907b97fd --- a/examples/ljspeech/tts0/run.sh +++ b/examples/ljspeech/tts0/run.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +preprocess_path=preprocessed_ljspeech +train_output_path=output +ckpt_name=step-35000 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${preprocess_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize mels from text with checkpoint `ckpt_name` under `train_output_path/checkpoints/` + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_name} || exit -1 +fi + diff --git
a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/README.md b/examples/ljspeech/tts1/README.md similarity index 89% rename from examples/ljspeech/tts1/local/transformer_tts/ljspeech/README.md rename to examples/ljspeech/tts1/README.md index b6b2ac9a5..0e47a236a 100644 --- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/README.md +++ b/examples/ljspeech/tts1/README.md @@ -8,12 +8,21 @@ wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 ```bash tar xjvf LJSpeech-1.1.tar.bz2 ``` -### Preprocess the dataset +## Get Started Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. -Run the command below to preprocess the dataset. - +Run the command below to +1. **source path**. +2. preprocess the dataset, +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from text file. +```bash +./run.sh +``` +### Preprocess the dataset ```bash -./preprocess.sh. +./local/preprocess.sh ${conf_path} ``` When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. ```text @@ -35,10 +44,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, path of speech features, speaker and id of each utterance. -## Train the model -`./run.sh` calls `../train.py`. +### Train the model +`./local/train.sh` calls `${BIN_DIR}/train.py`. ```bash -./run.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} ``` Here's the complete help message. ```text @@ -71,17 +80,6 @@ optional arguments: 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. 6. `--phones-dict` is the path of the phone vocabulary file. -## Pretrained Model -Pretrained Model can be downloaded here. 
[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip) - -TransformerTTS checkpoint contains files listed below. -```text -transformer_tts_ljspeech_ckpt_0.4 -├── default.yaml # default config used to train transformer_tts -├── phone_id_map.txt # phone vocabulary file when training transformer_tts -├── snapshot_iter_201500.pdz # model parameters and optimizer states -└── speech_stats.npy # statistics used to normalize spectrogram when training transformer_tts -``` ## Synthesize We use [waveflow](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/waveflow) as the neural vocoder. Download Pretrained WaveFlow Model with residual channel equals 128 from [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip) and unzip it. @@ -94,9 +92,9 @@ waveflow_ljspeech_ckpt_0.3 ├── config.yaml # default config used to train waveflow └── step-2000000.pdparams # model parameters of waveflow ``` -`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -./synthesize.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text usage: synthesize.py [-h] [--transformer-tts-config TRANSFORMER_TTS_CONFIG] @@ -132,9 +130,9 @@ optional arguments: --device DEVICE device type to use. --verbose VERBOSE verbose. ``` -`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file. +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file. ```bash -./synthesize_e2e.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text usage: synthesize_e2e.py [-h] @@ -177,17 +175,28 @@ optional arguments: 5. 
`--output-dir` is the directory to save synthesized audio files. 6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. -You can use the following scripts to synthesize for `../sentences.txt` using pretrained transformer_tts and waveflow models. +## Pretrained Model +Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip) + +TransformerTTS checkpoint contains files listed below. +```text +transformer_tts_ljspeech_ckpt_0.4 +├── default.yaml # default config used to train transformer_tts +├── phone_id_map.txt # phone vocabulary file when training transformer_tts +├── snapshot_iter_201500.pdz # model parameters and optimizer states +└── speech_stats.npy # statistics used to normalize spectrogram when training transformer_tts +``` +You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained transformer_tts and waveflow models. 
```bash FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ +python3 ${BIN_DIR}/synthesize_e2e.py \ --transformer-tts-config=transformer_tts_ljspeech_ckpt_0.4/default.yaml \ --transformer-tts-checkpoint=transformer_tts_ljspeech_ckpt_0.4/snapshot_iter_201500.pdz \ --transformer-tts-stat=transformer_tts_ljspeech_ckpt_0.4/speech_stats.npy \ --waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \ --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \ - --text=../sentences.txt \ + --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=exp/default/test_e2e \ --device="gpu" \ --phones-dict=transformer_tts_ljspeech_ckpt_0.4/phone_id_map.txt diff --git a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/conf/default.yaml b/examples/ljspeech/tts1/conf/default.yaml similarity index 100% rename from examples/ljspeech/tts1/local/transformer_tts/ljspeech/conf/default.yaml rename to examples/ljspeech/tts1/conf/default.yaml diff --git a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/preprocess.sh b/examples/ljspeech/tts1/local/preprocess.sh similarity index 89% rename from examples/ljspeech/tts1/local/transformer_tts/ljspeech/preprocess.sh rename to examples/ljspeech/tts1/local/preprocess.sh index 7fc5247bd..e1acc8e83 100755 --- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/preprocess.sh +++ b/examples/ljspeech/tts1/local/preprocess.sh @@ -3,12 +3,12 @@ stage=1 stop_stage=100 -export MAIN_ROOT=`realpath ${PWD}/../../../` +config_path=$1 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # extract features echo "Extract features ..." - python3 ../preprocess.py \ + python3 ${BIN_DIR}/preprocess.py \ --dataset=ljspeech \ --rootdir=~/datasets/LJSpeech-1.1/ \ --dumpdir=dump \ @@ -27,21 +27,21 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # normalize and covert phone to id, dev and test should use train's stats echo "Normalize ..." 
- python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --speech-stats=dump/train/speech_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --speech-stats=dump/train/speech_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --speech-stats=dump/train/speech_stats.npy \ diff --git a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize.sh b/examples/ljspeech/tts1/local/synthesize.sh similarity index 61% rename from examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize.sh rename to examples/ljspeech/tts1/local/synthesize.sh index 164e5ba23..5d1c9534a 100755 --- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize.sh +++ b/examples/ljspeech/tts1/local/synthesize.sh @@ -1,13 +1,18 @@ #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ../synthesize.py \ - --transformer-tts-config=conf/default.yaml \ - --transformer-tts-checkpoint=exp/default/checkpoints/snapshot_iter_201500.pdz \ +python3 ${BIN_DIR}/synthesize.py \ + --transformer-tts-config=${config_path} \ + --transformer-tts-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --transformer-tts-stat=dump/train/speech_stats.npy \ --waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \ --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \ --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test \ + --output-dir=${train_output_path}/test \ --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git 
a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize_e2e.sh b/examples/ljspeech/tts1/local/synthesize_e2e.sh similarity index 53% rename from examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize_e2e.sh rename to examples/ljspeech/tts1/local/synthesize_e2e.sh index 4fb692384..333a5cd6b 100755 --- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize_e2e.sh +++ b/examples/ljspeech/tts1/local/synthesize_e2e.sh @@ -1,13 +1,18 @@ #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ - --transformer-tts-config=conf/default.yaml \ - --transformer-tts-checkpoint=exp/default/checkpoints/snapshot_iter_201500.pdz \ +python3 ${BIN_DIR}/synthesize_e2e.py \ + --transformer-tts-config=${config_path} \ + --transformer-tts-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --transformer-tts-stat=dump/train/speech_stats.npy \ --waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \ --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \ - --text=../sentences.txt \ - --output-dir=exp/default/test_e2e \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output-dir=${train_output_path}/test_e2e \ --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/run.sh b/examples/ljspeech/tts1/local/train.sh similarity index 55% rename from examples/ljspeech/tts1/local/transformer_tts/ljspeech/run.sh rename to examples/ljspeech/tts1/local/train.sh index f448bdfc1..8527f57f3 100755 --- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/run.sh +++ b/examples/ljspeech/tts1/local/train.sh @@ -1,9 +1,12 @@ #!/bin/bash -python3 ../train.py \ +config_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ --train-metadata=dump/train/norm/metadata.jsonl \ --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=conf/default.yaml \ - 
--output-dir=exp/default \ + --config=${config_path} \ + --output-dir=${train_output_path} \ --nprocs=2 \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts1/local/transformer_tts/sentences.txt b/examples/ljspeech/tts1/local/transformer_tts/sentences.txt deleted file mode 100644 index 36b73a528..000000000 --- a/examples/ljspeech/tts1/local/transformer_tts/sentences.txt +++ /dev/null @@ -1,9 +0,0 @@ -001 Life was like a box of chocolates, you never know what you're gonna get. -002 With great power there must come great responsibility. -003 To be or not to be, that’s a question. -004 A man can be destroyed but not defeated -005 Do not, for one repulse, give up the purpose that you resolved to effort. -006 Death is just a part of life, something we're all destined to do. -007 I think it's hard winning a war with words. -008 Don’t argue with the people of strong determination, because they may change the fact! -009 Love you three thousand times. \ No newline at end of file diff --git a/examples/ljspeech/tts1/path.sh b/examples/ljspeech/tts1/path.sh new file mode 100755 index 000000000..201261b16 --- /dev/null +++ b/examples/ljspeech/tts1/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=transformer_tts +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL} diff --git a/examples/ljspeech/tts1/run.sh b/examples/ljspeech/tts1/run.sh old mode 100644 new mode 100755 index e69de29bb..7d6599061 --- a/examples/ljspeech/tts1/run.sh +++ b/examples/ljspeech/tts1/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_403.pdz + +if [ ${stage} -le 0 
] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is waveflow + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is waveflow + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/vctk/fastspeech2/ljspeech/README.md b/examples/ljspeech/tts3/README.md similarity index 85% rename from examples/vctk/fastspeech2/ljspeech/README.md rename to examples/ljspeech/tts3/README.md index ed905bea3..e99852cb7 100644 --- a/examples/vctk/fastspeech2/ljspeech/README.md +++ b/examples/ljspeech/tts3/README.md @@ -9,13 +9,22 @@ Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo. -### Preprocess the dataset +## Get Started Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`. -Run the command below to preprocess the dataset. - +Run the command below to +1. **source path**. +2. preprocess the dataset, +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. 
+ - synthesize waveform from text file. +```bash +./run.sh +``` +### Preprocess the dataset ```bash -./preprocess.sh +./local/preprocess.sh ${conf_path} ``` When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. @@ -40,10 +49,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance. -## Train the model -`./run.sh` calls `../train.py`. +### Train the model +`./local/train.sh` calls `${BIN_DIR}/train.py`. ```bash -./run.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} ``` Here's the complete help message. ```text @@ -78,18 +87,7 @@ optional arguments: 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. 6. `--phones-dict` is the path of the phone vocabulary file. -## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) - -FastSpeech2 checkpoint contains files listed below. -```text -fastspeech2_nosil_ljspeech_ckpt_0.5 -├── default.yaml # default config used to train fastspeech2 -├── phone_id_map.txt # phone vocabulary file when training fastspeech2 -├── snapshot_iter_100000.pdz # model parameters and optimizer states -└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 -``` -## Synthesize +### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/ljspeech/) as the neural vocoder. 
Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) and unzip it. ```bash @@ -102,9 +100,9 @@ pwg_ljspeech_ckpt_0.5 ├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan └── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` -`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -./synthesize.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] @@ -144,19 +142,19 @@ optional arguments: --device DEVICE device type to use. --verbose VERBOSE verbose. ``` -`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file. +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e_en.py`, which can synthesize waveform from text file. 
```bash -./synthesize_e2e.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text -usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] - [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT] - [--fastspeech2-stat FASTSPEECH2_STAT] - [--pwg-config PWG_CONFIG] - [--pwg-checkpoint PWG_CHECKPOINT] - [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] - [--text TEXT] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] +usage: synthesize_e2e_en.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] + [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT] + [--fastspeech2-stat FASTSPEECH2_STAT] + [--pwg-config PWG_CONFIG] + [--pwg-checkpoint PWG_CHECKPOINT] + [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] + [--text TEXT] [--output-dir OUTPUT_DIR] + [--device DEVICE] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -191,18 +189,29 @@ optional arguments: 5. `--output-dir` is the directory to save synthesized audio files. 6. `--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. -You can use the following scripts to synthesize for `../sentences_en.txt` using pretrained fastspeech2 and parallel wavegan models. +## Pretrained Model +Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) + +FastSpeech2 checkpoint contains files listed below. 
+```text +fastspeech2_nosil_ljspeech_ckpt_0.5 +├── default.yaml # default config used to train fastspeech2 +├── phone_id_map.txt # phone vocabulary file when training fastspeech2 +├── snapshot_iter_100000.pdz # model parameters and optimizer states +└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 +``` +You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained fastspeech2 and parallel wavegan models. ```bash FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ +python3 ${BIN_DIR}/synthesize_e2e_en.py \ --fastspeech2-config=fastspeech2_nosil_ljspeech_ckpt_0.5/default.yaml \ --fastspeech2-checkpoint=fastspeech2_nosil_ljspeech_ckpt_0.5/snapshot_iter_100000.pdz \ --fastspeech2-stat=fastspeech2_nosil_ljspeech_ckpt_0.5/speech_stats.npy \ --pwg-config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \ --pwg-checkpoint=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ - --text=../sentences_en.txt \ + --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=exp/default/test_e2e \ --device="gpu" \ --phones-dict=fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt diff --git a/examples/vctk/fastspeech2/ljspeech/conf/default.yaml b/examples/ljspeech/tts3/conf/default.yaml similarity index 100% rename from examples/vctk/fastspeech2/ljspeech/conf/default.yaml rename to examples/ljspeech/tts3/conf/default.yaml diff --git a/examples/vctk/fastspeech2/ljspeech/preprocess.sh b/examples/ljspeech/tts3/local/preprocess.sh similarity index 90% rename from examples/vctk/fastspeech2/ljspeech/preprocess.sh rename to examples/ljspeech/tts3/local/preprocess.sh index ff2e765d7..749a9884c 100755 --- a/examples/vctk/fastspeech2/ljspeech/preprocess.sh +++ b/examples/ljspeech/tts3/local/preprocess.sh @@ -3,7 +3,7 @@ stage=0 stop_stage=100 -export MAIN_ROOT=`realpath ${PWD}/../../../` +config_path=$1 if [ ${stage} -le 
0 ] && [ ${stop_stage} -ge 0 ]; then # get durations from MFA's result @@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ --inputdir=./ljspeech_alignment \ --output=durations.txt \ - --config=conf/default.yaml + --config=${config_path} fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # extract features echo "Extract features ..." - python3 ../preprocess.py \ + python3 ${BIN_DIR}/preprocess.py \ --dataset=ljspeech \ --rootdir=~/datasets/LJSpeech-1.1/ \ --dumpdir=dump \ --dur-file=durations.txt \ - --config=conf/default.yaml \ + --config=${config_path} \ --num-cpu=8 \ --cut-sil=True fi @@ -46,7 +46,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # normalize and covert phone/speaker to id, dev and test should use train's stats echo "Normalize ..." - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --speech-stats=dump/train/speech_stats.npy \ @@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --speech-stats=dump/train/speech_stats.npy \ @@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --speech-stats=dump/train/speech_stats.npy \ diff --git a/examples/vctk/fastspeech2/ljspeech/synthesize.sh b/examples/ljspeech/tts3/local/synthesize.sh similarity index 64% rename from examples/vctk/fastspeech2/ljspeech/synthesize.sh rename to examples/ljspeech/tts3/local/synthesize.sh index 0f8225e44..32dcde586 100755 --- 
a/examples/vctk/fastspeech2/ljspeech/synthesize.sh +++ b/examples/ljspeech/tts3/local/synthesize.sh @@ -1,15 +1,19 @@ - #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ../synthesize.py \ - --fastspeech2-config=conf/default.yaml \ - --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_100000.pdz \ +python3 ${BIN_DIR}/synthesize.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --fastspeech2-stat=dump/train/speech_stats.npy \ --pwg-config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \ --pwg-checkpoint=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test \ + --output-dir=${train_output_path}/test \ --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/vctk/fastspeech2/ljspeech/synthesize_e2e.sh b/examples/ljspeech/tts3/local/synthesize_e2e.sh similarity index 56% rename from examples/vctk/fastspeech2/ljspeech/synthesize_e2e.sh rename to examples/ljspeech/tts3/local/synthesize_e2e.sh index 158d4483c..28ea3a8fa 100755 --- a/examples/vctk/fastspeech2/ljspeech/synthesize_e2e.sh +++ b/examples/ljspeech/tts3/local/synthesize_e2e.sh @@ -1,15 +1,19 @@ - #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ - --fastspeech2-config=conf/default.yaml \ - --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_100000.pdz \ +python3 ${BIN_DIR}/synthesize_e2e_en.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --fastspeech2-stat=dump/train/speech_stats.npy \ --pwg-config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \ 
--pwg-checkpoint=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ - --text=../sentences_en.txt \ - --output-dir=exp/default/test_e2e \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output-dir=${train_output_path}/test_e2e \ --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/vctk/fastspeech2/ljspeech/run.sh b/examples/ljspeech/tts3/local/train.sh similarity index 55% rename from examples/vctk/fastspeech2/ljspeech/run.sh rename to examples/ljspeech/tts3/local/train.sh index fd5e2c689..847a44e3c 100755 --- a/examples/vctk/fastspeech2/ljspeech/run.sh +++ b/examples/ljspeech/tts3/local/train.sh @@ -1,9 +1,12 @@ #!/bin/bash -python3 ../train.py \ +config_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ --train-metadata=dump/train/norm/metadata.jsonl \ --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=conf/default.yaml \ - --output-dir=exp/default \ + --config=${config_path} \ + --output-dir=${train_output_path} \ --nprocs=1 \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh new file mode 100755 index 000000000..561d01632 --- /dev/null +++ b/examples/ljspeech/tts3/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=fastspeech2 +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL} diff --git a/examples/ljspeech/tts3/run.sh b/examples/ljspeech/tts3/run.sh new file mode 100755 index 000000000..329ba124d --- /dev/null +++ b/examples/ljspeech/tts3/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default 
+ckpt_name=snapshot_iter_201.pdz + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/ljspeech/voc0/README.md b/examples/ljspeech/voc0/README.md new file mode 100644 index 000000000..6163ae42f --- /dev/null +++ b/examples/ljspeech/voc0/README.md @@ -0,0 +1,52 @@ +# WaveFlow with LJSpeech +## Dataset +### Download the dataset. +```bash +wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 +``` +### Extract the dataset. +```bash +tar xjvf LJSpeech-1.1.tar.bz2 +``` +## Get Started +Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. +Assume the path to the Tacotron2 generated mels is `../tts0/output/test`. +Run the command below to +1. **source path**. +2. preprocess the dataset, +3. train the model. +4. synthesize wavs from mels. +```bash +./run.sh +``` +### Preprocess the dataset. +```bash +./local/preprocess.sh ${preprocess_path} +``` +### Train the model +`./local/train.sh` calls `${BIN_DIR}/train.py`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} +``` +The training script requires 4 command line arguments. +1. `--data` is the path of the training dataset. +2. `--output` is the path of the output directory. +3. `--device` should be "cpu" or "gpu" +4. 
`--nprocs` is the number of processes to train the model in parallel. + +If you want distributed training, set a larger `--nprocs` (e.g. 4). Note that distributed training with cpu is not supported yet. + +### Synthesize +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from mels. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${input_mel_path} ${train_output_path} ${ckpt_name} +``` + +Synthesize waveform. +1. We assume the `--input` is a directory containing several mel spectrograms(log magnitude) in `.npy` format. +2. The output would be saved in `--output` directory, containing several `.wav` files, each with the same name as the mel spectrogram does. +3. `--checkpoint_path` should be the path of the parameter file (`.pdparams`) to load. Note that the extension name `.pdparams` is not included here. +4. `--device` specifies the device to run synthesis on. + +## Pretrained Model +Pretrained Model with residual channel equals 128 can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip). 
diff --git a/examples/ljspeech/voc0/local/preprocess.sh b/examples/ljspeech/voc0/local/preprocess.sh new file mode 100755 index 000000000..4a45793e6 --- /dev/null +++ b/examples/ljspeech/voc0/local/preprocess.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +preprocess_path=$1 + +python3 ${BIN_DIR}/preprocess.py \ + --input=~/datasets/LJSpeech-1.1 \ + --output=${preprocess_path} \ No newline at end of file diff --git a/examples/ljspeech/voc0/local/synthesize.sh b/examples/ljspeech/voc0/local/synthesize.sh new file mode 100755 index 000000000..055542cf9 --- /dev/null +++ b/examples/ljspeech/voc0/local/synthesize.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +input_mel_path=$1 +train_output_path=$2 +ckpt_name=$3 + +python ${BIN_DIR}/synthesize.py \ + --input=${input_mel_path} \ + --output=${train_output_path}/wavs/ \ + --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \ + --device="gpu" \ + --verbose \ No newline at end of file diff --git a/examples/ljspeech/voc0/local/train.sh b/examples/ljspeech/voc0/local/train.sh new file mode 100755 index 000000000..5c4defd9b --- /dev/null +++ b/examples/ljspeech/voc0/local/train.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +preprocess_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ + --data=${preprocess_path} \ + --output=${train_output_path} \ + --device="gpu" \ + --nprocs=1 \ No newline at end of file diff --git a/examples/ljspeech/voc0/local/waveflow/README.md b/examples/ljspeech/voc0/local/waveflow/README.md deleted file mode 100644 index b3be1e4a3..000000000 --- a/examples/ljspeech/voc0/local/waveflow/README.md +++ /dev/null @@ -1,52 +0,0 @@ -# WaveFlow with LJSpeech - -## Dataset - -### Download the datasaet. - -```bash -wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 -``` - -### Extract the dataset. - -```bash -tar xjvf LJSpeech-1.1.tar.bz2 -``` - -### Preprocess the dataset. - -Assume the path to save the preprocessed dataset is `ljspeech_waveflow`. Run the command below to preprocess the dataset. 
- -```bash -python preprocess.py --input=LJSpeech-1.1/ --output=ljspeech_waveflow -``` - -## Train the model - -The training script requires 4 command line arguments. -`--data` is the path of the training dataset, `--output` is the path of the output directory (we recommend to use a subdirectory in `runs` to manage different experiments.) - -`--device` should be "cpu" or "gpu", `--nprocs` is the number of processes to train the model in parallel. - -```bash -python train.py --data=ljspeech_waveflow/ --output=runs/test --device="gpu" --nprocs=1 -``` - -If you want distributed training, set a larger `--nprocs` (e.g. 4). Note that distributed training with cpu is not supported yet. - -## Synthesize - -Synthesize waveform. We assume the `--input` is a directory containing several mel spectrograms(log magnitude) in `.npy` format. The output would be saved in `--output` directory, containing several `.wav` files, each with the same name as the mel spectrogram does. - -`--checkpoint_path` should be the path of the parameter file (`.pdparams`) to load. Note that the extention name `.pdparmas` is not included here. - -`--device` specifies to device to run synthesis on. - -```bash -python synthesize.py --input=mels/ --output=wavs/ --checkpoint_path='step-2000000' --device="gpu" --verbose -``` - -## Pretrained Model - -Pretrained Model with residual channel equals 128 can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip). 
diff --git a/examples/ljspeech/voc0/path.sh b/examples/ljspeech/voc0/path.sh new file mode 100755 index 000000000..b9fe83ecc --- /dev/null +++ b/examples/ljspeech/voc0/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=waveflow +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL} \ No newline at end of file diff --git a/examples/ljspeech/voc0/run.sh b/examples/ljspeech/voc0/run.sh old mode 100644 new mode 100755 index e69de29bb..aeb1c8d1a --- a/examples/ljspeech/voc0/run.sh +++ b/examples/ljspeech/voc0/run.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +preprocess_path=preprocessed_ljspeech +train_output_path=output +# mel generated by Tacotron2 +input_mel_path=../tts0/output/test +ckpt_name=step-10000 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${preprocess_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${input_mel_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/README.md b/examples/ljspeech/voc1/README.md similarity index 87% rename from examples/vctk/GANVocoder/parallelwave_gan/ljspeech/README.md rename to examples/ljspeech/voc1/README.md index 5b54ef5ad..995b4c7c6 100644 --- a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/README.md +++ b/examples/ljspeech/voc1/README.md @@ -1,22 +1,28 @@ -# Parallel WaveGAN with the LJSpeech-1.1 dataset - +# Parallel 
WaveGAN with the LJSpeech-1.1 This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/). - -## Preprocess the dataset - +## Dataset ### Download and Extract the datasaet Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/). - ### Get MFA results for silence trim We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo. -### Preprocess the dataset +## Get Started Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`. -Run the command below to preprocess the dataset. +Run the command below to +1. **source path**. +2. preprocess the dataset, +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. +```bash +./run.sh +``` + +### Preprocess the dataset ```bash -./preprocess.sh +./local/preprocess.sh ${conf_path} ``` When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. @@ -38,10 +44,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains id and paths to spectrogam of each utterance. -## Train the model -`./run.sh` calls `../train.py`. +### Train the model +`./local/train.sh` calls `${BIN_DIR}/train.py`. ```bash -./run.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} ``` Here's the complete help message. @@ -88,23 +94,10 @@ benchmark: 4. 
`--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -## Pretrained Models -Pretrained models can be downloaded here: -1. Parallel WaveGAN checkpoint. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip), which is used as a vocoder in the end-to-end inference script. - -Parallel WaveGAN checkpoint contains files listed below. - -```text -pwg_ljspeech_ckpt_0.5 -├── pwg_default.yaml # default config used to train parallel wavegan -├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan -└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan -``` - -## Synthesize -`synthesize.sh` calls `../synthesize.py `, which can synthesize waveform from `metadata.jsonl`. +### Synthesize +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -./synthesize.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] @@ -127,10 +120,21 @@ optional arguments: ``` 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained. -2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. If you use the pretrained model, use the `pwg_snapshot_iter_400000.pdz`. +2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 4. `--output-dir` is the directory to save the synthesized audio files. 5. 
`--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. +## Pretrained Models +Pretrained models can be downloaded here. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) + +Parallel WaveGAN checkpoint contains files listed below. + +```text +pwg_ljspeech_ckpt_0.5 +├── pwg_default.yaml # default config used to train parallel wavegan +├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan +└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +``` ## Acknowledgement We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/conf/default.yaml b/examples/ljspeech/voc1/conf/default.yaml similarity index 100% rename from examples/vctk/GANVocoder/parallelwave_gan/ljspeech/conf/default.yaml rename to examples/ljspeech/voc1/conf/default.yaml diff --git a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/preprocess.sh b/examples/ljspeech/voc1/local/preprocess.sh similarity index 84% rename from examples/vctk/GANVocoder/parallelwave_gan/ljspeech/preprocess.sh rename to examples/ljspeech/voc1/local/preprocess.sh index d88d2989c..d1af60dad 100755 --- a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/preprocess.sh +++ b/examples/ljspeech/voc1/local/preprocess.sh @@ -3,7 +3,7 @@ stage=0 stop_stage=100 -export MAIN_ROOT=`realpath ${PWD}/../../../../` +config_path=$1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # get durations from MFA's result @@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ --inputdir=./ljspeech_alignment \ --output=durations.txt \ - --config=conf/default.yaml + --config=${config_path} fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # extract features echo "Extract features ..." 
- python3 ../../preprocess.py \ + python3 ${BIN_DIR}/../preprocess.py \ --rootdir=~/datasets/LJSpeech-1.1/ \ --dataset=ljspeech \ --dumpdir=dump \ --dur-file=durations.txt \ - --config=conf/default.yaml \ + --config=${config_path} \ --cut-sil=True \ --num-cpu=20 fi @@ -39,16 +39,16 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # normalize, dev and test should use train's stats echo "Normalize ..." - python3 ../../normalize.py \ + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --stats=dump/train/feats_stats.npy - python3 ../../normalize.py \ + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --stats=dump/train/feats_stats.npy - python3 ../../normalize.py \ + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --stats=dump/train/feats_stats.npy diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh new file mode 100755 index 000000000..9f904ac0c --- /dev/null +++ b/examples/ljspeech/voc1/local/synthesize.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/synthesize.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test diff --git a/examples/vctk/GANVocoder/parallelwave_gan/vctk/run.sh b/examples/ljspeech/voc1/local/train.sh similarity index 60% rename from examples/vctk/GANVocoder/parallelwave_gan/vctk/run.sh rename to examples/ljspeech/voc1/local/train.sh index df8cefd88..1ef860c36 100755 --- a/examples/vctk/GANVocoder/parallelwave_gan/vctk/run.sh +++ b/examples/ljspeech/voc1/local/train.sh @@ -1,10 +1,13 @@ #!/bin/bash +config_path=$1 +train_output_path=$2 + 
FLAGS_cudnn_exhaustive_search=true \ FLAGS_conv_workspace_size_limit=4000 \ -python ../train.py \ +python ${BIN_DIR}/train.py \ --train-metadata=dump/train/norm/metadata.jsonl \ --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=conf/default.yaml \ - --output-dir=exp/default \ + --config=${config_path} \ + --output-dir=${train_output_path} \ --nprocs=1 diff --git a/examples/ljspeech/voc1/path.sh b/examples/ljspeech/voc1/path.sh new file mode 100755 index 000000000..28d39ae00 --- /dev/null +++ b/examples/ljspeech/voc1/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=parallelwave_gan +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/ljspeech/voc1/run.sh b/examples/ljspeech/voc1/run.sh new file mode 100755 index 000000000..666a15120 --- /dev/null +++ b/examples/ljspeech/voc1/run.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_5000.pdz + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/voxceleb/spk0/local/ge2e/README.md b/examples/other/ge2e/README.md similarity index 55% 
rename from examples/voxceleb/spk0/local/ge2e/README.md rename to examples/other/ge2e/README.md index b05786a16..89365d635 100644 --- a/examples/voxceleb/spk0/local/ge2e/README.md +++ b/examples/other/ge2e/README.md @@ -1,97 +1,78 @@ # Speaker Encoder - This experiment trains a speaker encoder with speaker verification as its task. It is done as a part of the experiment of transfer learning from speaker verification to multispeaker text-to-speech synthesis, which can be found at [tacotron2_aishell3](../tacotron2_shell3). The trained speaker encoder is used to extract utterance embeddings from utterances. - ## Model - The model used in this experiment is the speaker encoder with text independent speaker verification task in [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf). GE2E-softmax loss is used. -## File Structure - -```text -ge2e -├── README.md -├── README_cn.md -├── audio_processor.py -├── config.py -├── dataset_processors.py -├── inference.py -├── preprocess.py -├── random_cycle.py -├── speaker_verification_dataset.py -└── train.py -``` - ## Download Datasets - Currently supported datasets are Librispeech-other-500, VoxCeleb, VoxCeleb2,ai-datatang-200zh, magicdata, which can be downloaded from corresponding webpage. 1. Librispeech/train-other-500 - An English multispeaker dataset,[URL](https://www.openslr.org/resources/12/train-other-500.tar.gz),only the `train-other-500` subset is used. - 2. VoxCeleb1 - An English multispeaker dataset,[URL](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html) , Audio Files from Dev A to Dev D should be downloaded, combined and extracted. - 3. VoxCeleb2 - An English multispeaker dataset,[URL](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html) , Audio Files from Dev A to Dev H should be downloaded, combined and extracted. - 4. Aidatatang-200zh - A Mandarin Chinese multispeaker dataset ,[URL](https://www.openslr.org/62/) . - 5. 
magicdata - A Mandarin Chinese multispeaker dataset ,[URL](https://www.openslr.org/68/) . If you want to use other datasets, you can also download and preprocess it as long as it meets the requirements described below. -## Preprocess Datasets +## Get Started +```bash +./run.sh +``` + +### Preprocess Datasets +`./local/preprocess.sh` calls `${BIN_DIR}/preprocess.py`. +```bash +./local/preprocess.sh ${datasets_root} ${preprocess_path} ${dataset_names} +``` +Assume datasets_root is `~/datasets/GE2E`, and it has the follow structure(We only use `train-other-500` for simplicity): +```Text +GE2E +├── LibriSpeech +└── (other datasets) +``` Multispeaker datasets are used as training data, though the transcriptions are not used. To enlarge the amount of data used for training, several multispeaker datasets are combined. The preporcessed datasets are organized in a file structure described below. The mel spectrogram of each utterance is save in `.npy` format. The dataset is 2-stratified (speaker-utterance). Since multiple datasets are combined, to avoid conflict in speaker id, dataset name is prepended to the speake ids. ```text dataset_root ├── dataset01_speaker01/ -│   ├── utterance01.npy -│   ├── utterance02.npy -│   └── utterance03.npy +│ ├── utterance01.npy +│ ├── utterance02.npy +│ └── utterance03.npy ├── dataset01_speaker02/ -│   ├── utterance01.npy -│   ├── utterance02.npy -│   └── utterance03.npy +│ ├── utterance01.npy +│ ├── utterance02.npy +│ └── utterance03.npy ├── dataset02_speaker01/ -│   ├── utterance01.npy -│   ├── utterance02.npy -│   └── utterance03.npy +│ ├── utterance01.npy +│ ├── utterance02.npy +│ └── utterance03.npy └── dataset02_speaker02/ -    ├── utterance01.npy -    ├── utterance02.npy -    └── utterance03.npy + ├── utterance01.npy + ├── utterance02.npy + └── utterance03.npy ``` +In `${BIN_DIR}/preprocess.py`: +1. `--datasets_root` is the directory that contains several extracted dataset +2. 
`--output_dir` is the directory to save the preprocessed dataset +3. `--dataset_names` is the dataset to preprocess. If there are multiple datasets in `--datasets_root` to preprocess, the names can be joined with comma. Currently supported dataset names are librispeech_other, voxceleb1, voxceleb2, aidatatang_200zh and magicdata. -Run the command to preprocess datasets. - +### Train the model +`./local/train.sh` calls `${BIN_DIR}/train.py`. ```bash -python preprocess.py --datasets_root= --output_dir= --dataset_names= +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} ``` - -Here `--datasets_root` is the directory that contains several extracted dataset; `--output_dir` is the directory to save the preprocessed dataset; `--dataset_names` is the dataset to preprocess. If there are multiple datasets in `--datasets_root` to preprocess, the names can be joined with comma. Currently supported dataset names are librispeech_other, voxceleb1, voxceleb2, aidatatang_200zh and magicdata. - -## Training - -When preprocessing is done, run the command below to train the mdoel. - -```bash -python train.py --data= --output= --device="gpu" --nprocs=1 -``` - -- `--data` is the path to the preprocessed dataset. -- `--output` is the directory to save results,usually a subdirectory of `runs`.It contains visualdl log files, text log files, config file and a `checkpoints` directory, which contains parameter file and optimizer state file. If `--output` already has some training results in it, the most recent parameter file and optimizer state file is loaded before training. -- `--device` is the device type to run the training, 'cpu' and 'gpu' are supported. -- `--nprocs` is the number of replicas to run in multiprocessing based parallel training。Currently multiprocessing based parallel training is only enabled when using 'gpu' as the devicde. `CUDA_VISIBLE_DEVICES` can be used to specify visible devices with cuda. +In `${BIN_DIR}/train.py`: +1. 
`--data` is the path to the preprocessed dataset. +2. `--output` is the directory to save results, usually a subdirectory of `runs`. It contains visualdl log files, text log files, config file and a `checkpoints` directory, which contains parameter file and optimizer state file. If `--output` already has some training results in it, the most recent parameter file and optimizer state file is loaded before training. +3. `--device` is the device type to run the training, 'cpu' and 'gpu' are supported. +4. `--nprocs` is the number of replicas to run in multiprocessing based parallel training. Currently multiprocessing based parallel training is only enabled when using 'gpu' as the device. +5. `CUDA_VISIBLE_DEVICES` can be used to specify visible devices with cuda. Other options are described below. @@ -99,29 +80,23 @@ Other options are described below. - `--opts` is command line options to further override config files. It should be the last comman line options passed with multiple key-value pairs separated by spaces. - `--checkpoint_path` specifies the checkpoiont to load before training, extension is not included. A parameter file ( `.pdparams`) and an optimizer state file ( `.pdopt`) with the same name is used. This option has a higher priority than auto-resuming from the `--output` directory. -## Pretrained Model - -The pretrained model is first trained to 1560k steps at Librispeech-other-500 and voxceleb1. Then trained at aidatatang_200h and magic_data to 3000k steps. - -Download URL [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip). - -## Inference - +### Inference When training is done, run the command below to generate utterance embedding for each utterance in a dataset. - +`./local/inference.sh` calls `${BIN_DIR}/inference.py`.
```bash -python inference.py --input= --output= --checkpoint_path= --device="gpu" +CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${infer_input} ${infer_output} ${train_output_path} ${ckpt_name} ``` +In `${BIN_DIR}/inference.py`: +1. `--input` is the path of the dataset used for inference. +2. `--output` is the directory to save the processed results. It has the same file structure as the input dataset. Each utterance in the dataset has a corresponding utterance embedding file in `*.npy` format. +3. `--checkpoint_path` is the path of the checkpoint to use, extension not included. +4. `--pattern` is the wildcard pattern to filter audio files for inference, defaults to `*.wav`. +5. `--device` and `--opts` have the same meaning as in the training script. -`--input` is the path of the dataset used for inference. - -`--output` is the directory to save the processed results. It has the same file structure as the input dataset. Each utterance in the dataset has a corrsponding utterance embedding file in `*.npy` format. - -`--checkpoint_path` is the path of the checkpoint to use, extension not included. - -`--pattern` is the wildcard pattern to filter audio files for inference, defaults to `*.wav`. +## Pretrained Model +The pretrained model is first trained to 1560k steps at Librispeech-other-500 and voxceleb1. Then trained at aidatatang_200zh and magic_data to 3000k steps. -`--device` and `--opts` have the same meaning as in the training script. +Download URL [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip). ## References diff --git a/examples/other/ge2e/local/inference.sh b/examples/other/ge2e/local/inference.sh new file mode 100755 index 000000000..1beebdfaa --- /dev/null +++ b/examples/other/ge2e/local/inference.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +#generate utterance embedding for each utterance in a dataset.
+infer_input=$1 +infer_output=$2 +train_output_path=$3 +ckpt_name=$4 + +python3 ${BIN_DIR}/inference.py \ + --input=${infer_input} \ + --output=${infer_output} \ + --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \ + --device="gpu" + diff --git a/examples/other/ge2e/local/preprocess.sh b/examples/other/ge2e/local/preprocess.sh new file mode 100755 index 000000000..9851596b5 --- /dev/null +++ b/examples/other/ge2e/local/preprocess.sh @@ -0,0 +1,9 @@ +#!/bin/bash +datasets_root=$1 +preprocess_path=$2 +dataset_names=$3 + +python3 ${BIN_DIR}/preprocess.py \ + --datasets_root=${datasets_root} \ + --output_dir=${preprocess_path} \ + --dataset_names=${dataset_names} \ No newline at end of file diff --git a/examples/other/ge2e/local/train.sh b/examples/other/ge2e/local/train.sh new file mode 100755 index 000000000..5c4defd9b --- /dev/null +++ b/examples/other/ge2e/local/train.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +preprocess_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ + --data=${preprocess_path} \ + --output=${train_output_path} \ + --device="gpu" \ + --nprocs=1 \ No newline at end of file diff --git a/examples/other/ge2e/path.sh b/examples/other/ge2e/path.sh new file mode 100755 index 000000000..4333199cb --- /dev/null +++ b/examples/other/ge2e/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=ge2e +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL} diff --git a/examples/other/ge2e/run.sh b/examples/other/ge2e/run.sh new file mode 100755 index 000000000..2a2db3eeb --- /dev/null +++ b/examples/other/ge2e/run.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +datasets_root=~/datasets/GE2E +preprocess_path=dump 
+dataset_names=librispeech_other +train_output_path=output +infer_input=infer_input +infer_output=infer_output +ckpt_name=step-10000 + + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${datasets_root} ${preprocess_path} ${dataset_names} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${infer_input} ${infer_output} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/other/text_frontend/get_g2p_data.py b/examples/other/text_frontend/get_g2p_data.py index 78535b66a..61ef3d098 100644 --- a/examples/other/text_frontend/get_g2p_data.py +++ b/examples/other/text_frontend/get_g2p_data.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse from collections import defaultdict from pathlib import Path diff --git a/examples/other/text_frontend/get_textnorm_data.py b/examples/other/text_frontend/get_textnorm_data.py index 8058e0584..3928e67c5 100644 --- a/examples/other/text_frontend/get_textnorm_data.py +++ b/examples/other/text_frontend/get_textnorm_data.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse from pathlib import Path diff --git a/examples/other/text_frontend/test_g2p.py b/examples/other/text_frontend/test_g2p.py index 0515e9940..15005a003 100644 --- a/examples/other/text_frontend/test_g2p.py +++ b/examples/other/text_frontend/test_g2p.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - import argparse import re from pathlib import Path diff --git a/examples/other/text_frontend/test_textnorm.py b/examples/other/text_frontend/test_textnorm.py index 99eed290a..22f90f874 100644 --- a/examples/other/text_frontend/test_textnorm.py +++ b/examples/other/text_frontend/test_textnorm.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse import re from pathlib import Path diff --git a/examples/other/use_mfa/local/detect_oov.py b/examples/other/use_mfa/local/detect_oov.py index f5ae728f7..4928e4534 100644 --- a/examples/other/use_mfa/local/detect_oov.py +++ b/examples/other/use_mfa/local/detect_oov.py @@ -11,11 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse +import logging from collections import OrderedDict from pathlib import Path -import logging def detect_oov(corpus_dir, lexicon_path, transcription_pattern="*.lab"): diff --git a/examples/other/use_mfa/local/generate_lexicon.py b/examples/other/use_mfa/local/generate_lexicon.py index b6e594ab8..e9445665b 100644 --- a/examples/other/use_mfa/local/generate_lexicon.py +++ b/examples/other/use_mfa/local/generate_lexicon.py @@ -20,9 +20,8 @@ than words are used in transcriptions produced by `reorganize_baker.py`. We make this choice to better leverage other software for chinese text to pinyin tools like pypinyin. This is the convention for G2P in Chinese. 
""" - -import re import argparse +import re from collections import OrderedDict INITIALS = [ diff --git a/examples/vctk/GANVocoder/parallelwave_gan/baker/synthesize.sh b/examples/vctk/GANVocoder/parallelwave_gan/baker/synthesize.sh deleted file mode 100755 index e95b0da8f..000000000 --- a/examples/vctk/GANVocoder/parallelwave_gan/baker/synthesize.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ../synthesize.py \ - --config=conf/default.yaml \ - --checkpoint=exp/default/checkpoints/snapshot_iter_400000.pdz\ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test diff --git a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/synthesize.sh b/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/synthesize.sh deleted file mode 100755 index e95b0da8f..000000000 --- a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/synthesize.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ../synthesize.py \ - --config=conf/default.yaml \ - --checkpoint=exp/default/checkpoints/snapshot_iter_400000.pdz\ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test diff --git a/examples/vctk/GANVocoder/parallelwave_gan/vctk/synthesize.sh b/examples/vctk/GANVocoder/parallelwave_gan/vctk/synthesize.sh deleted file mode 100755 index 42213058f..000000000 --- a/examples/vctk/GANVocoder/parallelwave_gan/vctk/synthesize.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -python3 ../synthesize.py \ - --config=conf/default.yaml \ - --checkpoint=exp/default/checkpoints/snapshot_iter_35000.pdz_bak\ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test diff --git a/examples/vctk/README.md b/examples/vctk/README.md new file mode 100644 index 000000000..4007c0319 --- /dev/null +++ b/examples/vctk/README.md @@ -0,0 +1,11 @@ + +# VCTK + +* tts0 - 
Tacotron2 +* tts1 - TransformerTTS +* tts2 - SpeedySpeech +* tts3 - FastSpeech2 +* voc0 - WaveFlow +* voc1 - Parallel WaveGAN +* voc2 - MelGAN +* voc3 - MultiBand MelGAN diff --git a/examples/vctk/fastspeech2/aishell3/synthesize.sh b/examples/vctk/fastspeech2/aishell3/synthesize.sh deleted file mode 100755 index 950b2077f..000000000 --- a/examples/vctk/fastspeech2/aishell3/synthesize.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ../synthesize.py \ - --fastspeech2-config=conf/default.yaml \ - --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_96400.pdz \ - --fastspeech2-stat=dump/train/speech_stats.npy \ - --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test \ - --device="gpu" \ - --phones-dict=dump/phone_id_map.txt \ - --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/fastspeech2/aishell3/synthesize_e2e.sh b/examples/vctk/fastspeech2/aishell3/synthesize_e2e.sh deleted file mode 100755 index 315337143..000000000 --- a/examples/vctk/fastspeech2/aishell3/synthesize_e2e.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ - --fastspeech2-config=conf/default.yaml \ - --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_96400.pdz \ - --fastspeech2-stat=dump/train/speech_stats.npy \ - --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --text=../sentences.txt \ - --output-dir=exp/default/test_e2e \ - --device="gpu" \ - --phones-dict=dump/phone_id_map.txt \ - --speaker-dict=dump/speaker_id_map.txt diff --git
a/examples/vctk/fastspeech2/baker/run.sh b/examples/vctk/fastspeech2/baker/run.sh deleted file mode 100755 index 3e9a5e222..000000000 --- a/examples/vctk/fastspeech2/baker/run.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -python3 ../train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=conf/default.yaml \ - --output-dir=exp/default \ - --nprocs=1 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/vctk/fastspeech2/sentences.txt b/examples/vctk/fastspeech2/sentences.txt deleted file mode 100644 index 3aa5376b4..000000000 --- a/examples/vctk/fastspeech2/sentences.txt +++ /dev/null @@ -1,16 +0,0 @@ -001 凯莫瑞安联合体的经济崩溃,迫在眉睫。 -002 对于所有想要离开那片废土,去寻找更美好生活的人来说。 -003 克哈,是你们所有人安全的港湾。 -004 为了保护尤摩扬人民不受异虫的残害,我所做的,比他们自己的领导委员会都多。 -005 无论他们如何诽谤我,我将继续为所有泰伦人的最大利益,而努力奋斗。 -006 身为你们的元首,我带领泰伦人实现了人类统治领地和经济的扩张。 -007 我们将继续成长,用行动回击那些只会说风凉话,不愿意和我们相向而行的害群之马。 -008 帝国武装力量,无数的优秀儿女,正时刻守卫着我们的家园大门,但是他们孤木难支。 -009 凡是今天应征入伍者,所获的所有刑罚罪责,减半。 -010 激进分子和异见者希望你们一听见枪声,就背弃多年的和平与繁荣。 -011 他们没有勇气和能力,带领人类穿越一个充满危险的星系。 -012 法治是我们的命脉,然而它却受到前所未有的挑战。 -013 我将恢复我们帝国的荣光,绝不会向任何外星势力低头。 -014 我已经驯服了异虫,荡平了星灵。如今它们的创造者,想要夺走我们拥有的一切。 -015 永远记住,谁才是最能保护你们的人。 -016 不要听信别人的谗言,我不是什么克隆人。 \ No newline at end of file diff --git a/examples/vctk/fastspeech2/vctk/README.md b/examples/vctk/tts3/README.md similarity index 65% rename from examples/vctk/fastspeech2/vctk/README.md rename to examples/vctk/tts3/README.md index 8dc939d85..f85db7d12 100644 --- a/examples/vctk/fastspeech2/vctk/README.md +++ b/examples/vctk/tts3/README.md @@ -12,13 +12,22 @@ ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://gith 1. `p315`, because no txt for it. 2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them. -### Preprocess the dataset +## Get Started Assume the path to the dataset is `~/datasets/VCTK-Corpus-0.92`. Assume the path to the MFA result of VCTK is `./vctk_alignment`. 
-Run the command below to preprocess the dataset. - +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from text file. ```bash -./preprocess.sh +./run.sh +``` +### Preprocess the dataset +```bash +./local/preprocess.sh ${conf_path} ``` When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. @@ -43,11 +52,11 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance. -## Train the model -`./run.sh` calls `../train.py`. +### Train the model ```bash -./run.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} ``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] @@ -81,14 +90,23 @@ optional arguments: 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. 6. `--phones-dict` is the path of the phone vocabulary file. -## Pretrained Model - -## Synthesize +### Synthesize +We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder. +Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip) and unzip it. +```bash +unzip pwg_vctk_ckpt_0.5.zip +``` +Parallel WaveGAN checkpoint contains files listed below.
+```text +pwg_vctk_ckpt_0.5 +├── pwg_default.yaml # default config used to train parallel wavegan +├── pwg_snapshot_iter_1000000.pdz # generator parameters of parallel wavegan +└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` -`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -./synthesize.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] @@ -128,19 +146,22 @@ optional arguments: --device DEVICE device type to use. --verbose VERBOSE verbose. ``` -`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file. +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e_en.py`, which can synthesize waveform from text file. ```bash -./synthesize_e2e.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text -usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] - [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT] - [--fastspeech2-stat FASTSPEECH2_STAT] - [--pwg-config PWG_CONFIG] - [--pwg-checkpoint PWG_CHECKPOINT] - [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] - [--text TEXT] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] +usage: multi_spk_synthesize_e2e_en.py [-h] + [--fastspeech2-config FASTSPEECH2_CONFIG] + [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT] + [--fastspeech2-stat FASTSPEECH2_STAT] + [--pwg-config PWG_CONFIG] + [--pwg-checkpoint PWG_CHECKPOINT] + [--pwg-stat PWG_STAT] + [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] + [--text TEXT] [--output-dir OUTPUT_DIR] + [--device DEVICE] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. 
@@ -161,6 +182,8 @@ optional arguments: spectrogram when training parallel wavegan. --phones-dict PHONES_DICT phone vocabulary file. + --speaker-dict SPEAKER_DICT + speaker id map file. --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output-dir OUTPUT_DIR output dir. @@ -175,7 +198,32 @@ optional arguments: 5. `--output-dir` is the directory to save synthesized audio files. 6. `--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. -You can use the following scripts to synthesize for `../sentences_en.txt` using pretrained fastspeech2 and parallel wavegan models. -```bash +## Pretrained Model +Pretrained FastSpeech2 model trained on audio with the silence at both edges removed: [fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip) +FastSpeech2 checkpoint contains files listed below. +```text +fastspeech2_nosil_vctk_ckpt_0.5 +├── default.yaml # default config used to train fastspeech2 +├── phone_id_map.txt # phone vocabulary file when training fastspeech2 +├── snapshot_iter_66200.pdz # model parameters and optimizer states +├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2 +└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 +``` +You can use the following script to synthesize the sentences in `${BIN_DIR}/../sentences_en.txt` using the pretrained fastspeech2 and parallel wavegan models. 
+```bash +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/multi_spk_synthesize_e2e_en.py \ + --fastspeech2-config=fastspeech2_nosil_vctk_ckpt_0.5/default.yaml \ + --fastspeech2-checkpoint=fastspeech2_nosil_vctk_ckpt_0.5/snapshot_iter_66200.pdz \ + --fastspeech2-stat=fastspeech2_nosil_vctk_ckpt_0.5/speech_stats.npy \ + --pwg-config=pwg_vctk_ckpt_0.5/pwg_default.yaml \ + --pwg-checkpoint=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \ + --pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output-dir=exp/default/test_e2e \ + --device="gpu" \ + --phones-dict=fastspeech2_nosil_vctk_ckpt_0.5/phone_id_map.txt \ + --speaker-dict=fastspeech2_nosil_vctk_ckpt_0.5/speaker_id_map.txt ``` diff --git a/examples/vctk/fastspeech2/vctk/conf/default.yaml b/examples/vctk/tts3/conf/default.yaml similarity index 100% rename from examples/vctk/fastspeech2/vctk/conf/default.yaml rename to examples/vctk/tts3/conf/default.yaml diff --git a/examples/vctk/fastspeech2/vctk/preprocess.sh b/examples/vctk/tts3/local/preprocess.sh similarity index 90% rename from examples/vctk/fastspeech2/vctk/preprocess.sh rename to examples/vctk/tts3/local/preprocess.sh index df4b634a3..4d589d666 100755 --- a/examples/vctk/fastspeech2/vctk/preprocess.sh +++ b/examples/vctk/tts3/local/preprocess.sh @@ -1,9 +1,9 @@ #!/bin/bash -stage=1 +stage=0 stop_stage=100 -export MAIN_ROOT=`realpath ${PWD}/../../../` +config_path=$1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # get durations from MFA's result @@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ --inputdir=./vctk_alignment \ --output durations.txt \ - --config=conf/default.yaml + --config=${config_path} fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # extract features echo "Extract features ..." 
- python3 ../preprocess.py \ + python3 ${BIN_DIR}/preprocess.py \ --dataset=vctk \ --rootdir=~/datasets/VCTK-Corpus-0.92/ \ --dumpdir=dump \ --dur-file=durations.txt \ - --config=conf/default.yaml \ + --config=${config_path} \ --num-cpu=20 \ --cut-sil=True fi @@ -46,7 +46,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # normalize and covert phone/speaker to id, dev and test should use train's stats echo "Normalize ..." - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --speech-stats=dump/train/speech_stats.npy \ @@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --speech-stats=dump/train/speech_stats.npy \ @@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --speech-stats=dump/train/speech_stats.npy \ diff --git a/examples/vctk/tts3/local/synthesize.sh b/examples/vctk/tts3/local/synthesize.sh new file mode 100755 index 000000000..ca1129691 --- /dev/null +++ b/examples/vctk/tts3/local/synthesize.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/synthesize.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --fastspeech2-stat=dump/train/speech_stats.npy \ + --pwg-config=pwg_vctk_ckpt_0.5/pwg_default.yaml \ + --pwg-checkpoint=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \ + 
--pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test \ + --device="gpu" \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/tts3/local/synthesize_e2e.sh b/examples/vctk/tts3/local/synthesize_e2e.sh new file mode 100755 index 000000000..d919bb08e --- /dev/null +++ b/examples/vctk/tts3/local/synthesize_e2e.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/multi_spk_synthesize_e2e_en.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --fastspeech2-stat=dump/train/speech_stats.npy \ + --pwg-config=pwg_vctk_ckpt_0.5/pwg_default.yaml \ + --pwg-checkpoint=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \ + --pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output-dir=${train_output_path}/test_e2e \ + --device="gpu" \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/fastspeech2/vctk/run.sh b/examples/vctk/tts3/local/train.sh similarity index 61% rename from examples/vctk/fastspeech2/vctk/run.sh rename to examples/vctk/tts3/local/train.sh index d4f06da91..be6051c97 100755 --- a/examples/vctk/fastspeech2/vctk/run.sh +++ b/examples/vctk/tts3/local/train.sh @@ -1,10 +1,13 @@ #!/bin/bash -python3 ../train.py \ +config_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ --train-metadata=dump/train/norm/metadata.jsonl \ --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=conf/default.yaml \ - --output-dir=exp/default \ + --config=${config_path} \ + --output-dir=${train_output_path} \ --nprocs=2 \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/tts3/path.sh 
b/examples/vctk/tts3/path.sh new file mode 100755 index 000000000..561d01632 --- /dev/null +++ b/examples/vctk/tts3/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=fastspeech2 +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL} diff --git a/examples/vctk/tts3/run.sh b/examples/vctk/tts3/run.sh new file mode 100755 index 000000000..474d8e49a --- /dev/null +++ b/examples/vctk/tts3/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_331.pdz + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/vctk/GANVocoder/parallelwave_gan/vctk/README.md b/examples/vctk/voc1/README.md similarity index 83% rename from examples/vctk/GANVocoder/parallelwave_gan/vctk/README.md rename to examples/vctk/voc1/README.md index 29538dc46..b74b9d4a7 100644 --- a/examples/vctk/GANVocoder/parallelwave_gan/vctk/README.md +++ 
b/examples/vctk/voc1/README.md @@ -1,6 +1,7 @@ # Parallel WaveGAN with VCTK This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [VCTK](https://datashare.ed.ac.uk/handle/10283/3443). -## Preprocess the dataset + +## Dataset ### Download and Extract the datasaet Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle/10283/3443) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/VCTK-Corpus-0.92`. @@ -11,12 +12,21 @@ ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://gith 1. `p315`, because no txt for it. 2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them. -### Preprocess the dataset +## Get Started Assume the path to the dataset is `~/datasets/VCTK-Corpus-0.92`. Assume the path to the MFA result of VCTK is `./vctk_alignment`. -Run the command below to preprocess the dataset. +Run the command below to +1. **source path**. +2. preprocess the dataset, +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. ```bash -./preprocess.sh +./run.sh +``` +### Preprocess the dataset +```bash +./local/preprocess.sh ${conf_path} ``` When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. @@ -38,12 +48,11 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains id and paths to spectrogam of each utterance. -## Train the model - -`./run.sh` calls `../train.py`. +### Train the model ```bash -./run.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} ``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. Here's the complete help message. ```text @@ -88,15 +97,10 @@ benchmark: 3. `--output-dir` is the directory to save the results of the experiment. 
Checkpoints are save in `checkpoints/` inside this directory. 4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. - -## Pretrained Models - - -## Synthesize - -`synthesize.sh` calls `../synthesize.py `, which can synthesize waveform from `metadata.jsonl`. +### Synthesize +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -./synthesize.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] @@ -124,5 +128,16 @@ optional arguments: 4. `--output-dir` is the directory to save the synthesized audio files. 5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. +## Pretrained Models +Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip). + +Parallel WaveGAN checkpoint contains files listed below. + +```text +pwg_vctk_ckpt_0.5 +├── pwg_default.yaml # default config used to train parallel wavegan +├── pwg_snapshot_iter_1000000.pdz # generator parameters of parallel wavegan +└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +``` ## Acknowledgement We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. 
diff --git a/examples/vctk/GANVocoder/parallelwave_gan/vctk/conf/default.yaml b/examples/vctk/voc1/conf/default.yaml similarity index 100% rename from examples/vctk/GANVocoder/parallelwave_gan/vctk/conf/default.yaml rename to examples/vctk/voc1/conf/default.yaml diff --git a/examples/vctk/GANVocoder/parallelwave_gan/vctk/preprocess.sh b/examples/vctk/voc1/local/preprocess.sh similarity index 83% rename from examples/vctk/GANVocoder/parallelwave_gan/vctk/preprocess.sh rename to examples/vctk/voc1/local/preprocess.sh index 3ed4c0ccc..88a478cd5 100755 --- a/examples/vctk/GANVocoder/parallelwave_gan/vctk/preprocess.sh +++ b/examples/vctk/voc1/local/preprocess.sh @@ -3,7 +3,7 @@ stage=0 stop_stage=100 -export MAIN_ROOT=`realpath ${PWD}/../../../../` +config_path=$1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # get durations from MFA's result @@ -11,17 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ --inputdir=./vctk_alignment \ --output=durations.txt \ - --config=conf/default.yaml + --config=${config_path} fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features echo "Extract features ..." - python3 ../../preprocess.py \ + python3 ${BIN_DIR}/../preprocess.py \ --rootdir=~/datasets/VCTK-Corpus-0.92/ \ --dataset=vctk \ --dumpdir=dump \ --dur-file=durations.txt \ - --config=conf/default.yaml \ + --config=${config_path} \ --cut-sil=True \ --num-cpu=20 fi @@ -38,16 +39,16 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # normalize, dev and test should use train's stats echo "Normalize ..." 
- python3 ../../normalize.py \ + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --stats=dump/train/feats_stats.npy - python3 ../../normalize.py \ + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --stats=dump/train/feats_stats.npy - python3 ../../normalize.py \ + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --stats=dump/train/feats_stats.npy diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh new file mode 100755 index 000000000..9f904ac0c --- /dev/null +++ b/examples/vctk/voc1/local/synthesize.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/synthesize.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test diff --git a/examples/vctk/GANVocoder/parallelwave_gan/baker/run.sh b/examples/vctk/voc1/local/train.sh similarity index 60% rename from examples/vctk/GANVocoder/parallelwave_gan/baker/run.sh rename to examples/vctk/voc1/local/train.sh index df8cefd88..1ef860c36 100755 --- a/examples/vctk/GANVocoder/parallelwave_gan/baker/run.sh +++ b/examples/vctk/voc1/local/train.sh @@ -1,10 +1,13 @@ #!/bin/bash +config_path=$1 +train_output_path=$2 + FLAGS_cudnn_exhaustive_search=true \ FLAGS_conv_workspace_size_limit=4000 \ -python ../train.py \ +python ${BIN_DIR}/train.py \ --train-metadata=dump/train/norm/metadata.jsonl \ --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=conf/default.yaml \ - --output-dir=exp/default \ + --config=${config_path} \ + --output-dir=${train_output_path} \ --nprocs=1 diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh new file mode 
100755 index 000000000..28d39ae00 --- /dev/null +++ b/examples/vctk/voc1/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=parallelwave_gan +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/vctk/voc1/run.sh b/examples/vctk/voc1/run.sh new file mode 100755 index 000000000..71e2727c9 --- /dev/null +++ b/examples/vctk/voc1/run.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_5000.pdz + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/voxceleb/README.md b/examples/voxceleb/README.md deleted file mode 100644 index 9512ac09c..000000000 --- a/examples/voxceleb/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Voxceleb - -* spk0 - ge2e diff --git a/examples/voxceleb/spk0/local/ge2e/README_cn.md b/examples/voxceleb/spk0/local/ge2e/README_cn.md deleted file mode 100644 index 7777e4dd4..000000000 --- a/examples/voxceleb/spk0/local/ge2e/README_cn.md +++ /dev/null @@ -1,124 +0,0 @@ -# Speaker Encoder - -本实验是的在多说话人数据集上以 Speaker Verification 为任务训练一个 speaker encoder, 这是作为 transfer learning from speaker 
verification to multispeaker text-to-speech synthesis 实验的一部分, 可以在 [tacotron2_aishell3](../tacotron2_aishell3) 中找到。用训练好的模型来提取音频的 utterance embedding. - -## 模型 - -本实验使用的模型是 [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf) 中的 speaker encoder text independent 模型。使用的是 GE2E softmax 损失函数。 - -## 目录结构 - -```text -ge2e -├── README_cn.md -├── audio_processor.py -├── config.py -├── dataset_processors.py -├── inference.py -├── preprocess.py -├── random_cycle.py -├── speaker_verification_dataset.py -└── train.py -``` - -## 数据集下载 - -本实验支持了 Librispeech-other-500, VoxCeleb, VoxCeleb2,ai-datatang-200zh, magicdata 数据集。可以在对应的页面下载。 - -1. Librispeech/train-other-500 - - 英文多说话人数据集,[下载链接](https://www.openslr.org/resources/12/train-other-500.tar.gz),我们的实验中仅用到了 train-other-500 这个子集。 - -2. VoxCeleb1 - - 英文多说话人数据集,[下载链接](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html),需要下载其中的 Audio Files 中的 Dev A 到 Dev D 四个压缩文件并合并解压。 - -3. VoxCeleb2 - - 英文多说话人数据集,[下载链接](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox2.html),需要下载其中的 Audio Files 中的 Dev A 到 Dev H 八个压缩文件并合并解压。 - -4. Aidatatang-200zh - - 中文多说话人数据集,[下载链接](https://www.openslr.org/62/)。 - -5. 
magicdata - - 中文多说话人数据集,[下载链接](https://www.openslr.org/68/)。 - -如果用户需要使用其他的数据集,也可以自行下载并进行数据处理,只要符合如下的要求。 - -## 数据集预处理 - -训练中使用的数据集是多说话人数据集,transcription 并不会被使用。为了扩大数据的量,训练过程可以将多个数据集合并为一个。处理后的文件结果组织方式如下,每个句子的频谱存储为 `.npy` 格式。以 speaker-utterance 的两层目录结构存储。因为合并数据集的原因,为了避免 speaker id 冲突,dataset 名会被添加到 speaker id 前面。 - -```text -dataset_root -├── dataset01_speaker01/ -│   ├── utterance01.npy -│   ├── utterance02.npy -│   └── utterance03.npy -├── dataset01_speaker02/ -│   ├── utterance01.npy -│   ├── utterance02.npy -│   └── utterance03.npy -├── dataset02_speaker01/ -│   ├── utterance01.npy -│   ├── utterance02.npy -│   └── utterance03.npy -└── dataset02_speaker02/ -    ├── utterance01.npy -    ├── utterance02.npy -    └── utterance03.npy -``` - -运行数据处理脚本 - -```bash -python preprocess.py --datasets_root= --output_dir= --dataset_names= -``` - -其中 datasets_root 是包含多个原始数据集的路径,--output_dir 是多个数据集合并后输出的路径,dataset_names 是数据集的名称,多个数据集可以用逗号分割,比如 'librispeech_other, voxceleb1'. 目前支持的数据集有 librispeech_other, voxceleb1, voxceleb2, aidatatang_200zh, magicdata. - -## 训练 - -数据处理完成后,使用如下的脚本训练。 - -```bash -python train.py --data= --output= --device="gpu" --nprocs=1 -``` - -- `--data` 是处理后的数据集路径。 -- `--output` 是训练结果的保存路径,一般使用 runs 下的一个子目录。保存结果包含 visualdl 的 log 文件,文本 log 记录,运行 config 备份,以及 checkpoints 目录,里面包含参数文件和优化器状态文件。如果指定的 output 路径包含此前的训练结果,训练前会自动加载最近的参数文件和优化器状态文件。 -- `--device` 是运行设备,目前支持 'cpu' 和 'gpu'. 
-- `--nprocs` 是指定运行进程数。目前仅在使用 'gpu' 是支持多进程训练。可以配合 `CUDA_VISIBLE_DEVICES` 环境变量指定可见卡号。 - -另外还有几个选项。 - -- `--config` 是用于覆盖默认配置(默认配置可以查看 `config.py`) 的配置文件,为 `.yaml` 文件。 -- `--opts` 是用命令行参数进一步覆盖配置。这是最后一个传入的命令行选项,用多组空格分隔的 KEY VALUE 对的方式传入。 -- `--checkpoint_path` 指定从中恢复的 checkpoint, 不需要包含扩展名。同名的参数文件( `.pdparams`) 和优化器文件( `.pdopt`)会被加载以恢复训练。这个参数指定的恢复训练优先级高于自动从 `output` 文件夹中恢复训练。 - -## 预训练模型 - -预训练模型是在 Librispeech-other-500 和 voxceleb1 上训练到 1560k steps 后用 aidatatang_200h 和 magic_data 训练到 3000k 的结果。 - -下载链接 [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip) - -## 预测 - -使用训练好的模型进行预测,对一个数据集中的所有 utterance 生成一个 embedding. - -```bash -python inference.py --input= --output= --checkpoint_path= --device="gpu" -``` - -- `--input` 是需要处理的数据集的路径。 -- `--output` 是处理的结果,它会保持和 `--input` 相同的文件夹结构,对应 input 中的每一个音频文件会有一个同名的 `*.npy` 文件,是从这个音频文件中提取到的 utterance embedding. -- `--checkpoint_path` 为用于预测的参数文件路径,不包含扩展名。 -- `--pattern` 是用于筛选数据集中需要处理的音频文件的通配符模式,默认为 `*.wav`. -- `--device` 和 `--opts` 的语义和训练脚本一致。 - -## 参考文献 - -1. [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf) -2. [Transfer Learning from Speaker Verification toMultispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) diff --git a/examples/voxceleb/spk0/run.sh b/examples/voxceleb/spk0/run.sh deleted file mode 100644 index e69de29bb..000000000 diff --git a/parakeet/__init__.py b/parakeet/__init__.py index 87528b833..8a0acc48a 100644 --- a/parakeet/__init__.py +++ b/parakeet/__init__.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import logging + from . import data from . import datasets +from . import exps from . import frontend from . import models from . 
import modules diff --git a/parakeet/audio/__init__.py b/parakeet/audio/__init__.py index 23e378fde..7747b7945 100644 --- a/parakeet/audio/__init__.py +++ b/parakeet/audio/__init__.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from .audio import AudioProcessor from .spec_normalizer import LogMagnitude from .spec_normalizer import NormalizerBase diff --git a/parakeet/data/__init__.py b/parakeet/data/__init__.py index 6f15dbbec..c71c05bd7 100644 --- a/parakeet/data/__init__.py +++ b/parakeet/data/__init__.py @@ -13,6 +13,5 @@ # limitations under the License. """Parakeet's infrastructure for data processing. """ - -from .dataset import * from .batch import * +from .dataset import * diff --git a/parakeet/datasets/__init__.py b/parakeet/datasets/__init__.py index cbdcdfa49..fc64a82f2 100644 --- a/parakeet/datasets/__init__.py +++ b/parakeet/datasets/__init__.py @@ -11,6 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from .common import * from .ljspeech import * diff --git a/parakeet/datasets/preprocess_utils.py b/parakeet/datasets/preprocess_utils.py index ddbedf5ce..8b01f6c3c 100644 --- a/parakeet/datasets/preprocess_utils.py +++ b/parakeet/datasets/preprocess_utils.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import re diff --git a/parakeet/datasets/vocoder_batch_fn.py b/parakeet/datasets/vocoder_batch_fn.py index 925303b5f..30adb142d 100644 --- a/parakeet/datasets/vocoder_batch_fn.py +++ b/parakeet/datasets/vocoder_batch_fn.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - import numpy as np import paddle diff --git a/parakeet/exps/__init__.py b/parakeet/exps/__init__.py index e69de29bb..abf198b97 100644 --- a/parakeet/exps/__init__.py +++ b/parakeet/exps/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/parakeet/exps/fastspeech2/__init__.py b/parakeet/exps/fastspeech2/__init__.py index e69de29bb..abf198b97 100644 --- a/parakeet/exps/fastspeech2/__init__.py +++ b/parakeet/exps/fastspeech2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/examples/vctk/fastspeech2/aishell3/synthesize_e2e.py b/parakeet/exps/fastspeech2/multi_spk_synthesize_e2e.py similarity index 100% rename from examples/vctk/fastspeech2/aishell3/synthesize_e2e.py rename to parakeet/exps/fastspeech2/multi_spk_synthesize_e2e.py index 13f59bfd9..825b3ed36 100644 --- a/examples/vctk/fastspeech2/aishell3/synthesize_e2e.py +++ b/parakeet/exps/fastspeech2/multi_spk_synthesize_e2e.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse import logging from pathlib import Path @@ -20,13 +19,14 @@ import numpy as np import paddle import soundfile as sf import yaml +from yacs.config import CfgNode + from parakeet.frontend.zh_frontend import Frontend from parakeet.models.fastspeech2 import FastSpeech2 from parakeet.models.fastspeech2 import FastSpeech2Inference from parakeet.models.parallel_wavegan import PWGGenerator from parakeet.models.parallel_wavegan import PWGInference from parakeet.modules.normalizer import ZScore -from yacs.config import CfgNode def evaluate(args, fastspeech2_config, pwg_config): diff --git a/examples/vctk/fastspeech2/vctk/synthesize_e2e.py b/parakeet/exps/fastspeech2/multi_spk_synthesize_e2e_en.py similarity index 100% rename from examples/vctk/fastspeech2/vctk/synthesize_e2e.py rename to parakeet/exps/fastspeech2/multi_spk_synthesize_e2e_en.py index 65a927b4f..a47619826 100644 --- a/examples/vctk/fastspeech2/vctk/synthesize_e2e.py +++ b/parakeet/exps/fastspeech2/multi_spk_synthesize_e2e_en.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse import logging from pathlib import Path @@ -20,13 +19,14 @@ import numpy as np import paddle import soundfile as sf import yaml +from yacs.config import CfgNode + from parakeet.frontend import English from parakeet.models.fastspeech2 import FastSpeech2 from parakeet.models.fastspeech2 import FastSpeech2Inference from parakeet.models.parallel_wavegan import PWGGenerator from parakeet.models.parallel_wavegan import PWGInference from parakeet.modules.normalizer import ZScore -from yacs.config import CfgNode def evaluate(args, fastspeech2_config, pwg_config): diff --git a/examples/vctk/fastspeech2/normalize.py b/parakeet/exps/fastspeech2/normalize.py similarity index 100% rename from examples/vctk/fastspeech2/normalize.py rename to parakeet/exps/fastspeech2/normalize.py index 2d40bdf37..b4b31e311 100644 --- a/examples/vctk/fastspeech2/normalize.py +++ b/parakeet/exps/fastspeech2/normalize.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Normalize feature files and dump them.""" - import argparse import logging from operator import itemgetter @@ -20,10 +19,11 @@ from pathlib import Path import jsonlines import numpy as np -from parakeet.datasets.data_table import DataTable from sklearn.preprocessing import StandardScaler from tqdm import tqdm +from parakeet.datasets.data_table import DataTable + def main(): """Run preprocessing process.""" diff --git a/examples/vctk/fastspeech2/preprocess.py b/parakeet/exps/fastspeech2/preprocess.py similarity index 100% rename from examples/vctk/fastspeech2/preprocess.py rename to parakeet/exps/fastspeech2/preprocess.py index ee2b3f915..bb796b64c 100644 --- a/examples/vctk/fastspeech2/preprocess.py +++ b/parakeet/exps/fastspeech2/preprocess.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse import os from concurrent.futures import ThreadPoolExecutor @@ -26,6 +25,8 @@ import librosa import numpy as np import tqdm import yaml +from yacs.config import CfgNode + from parakeet.data.get_feats import Energy from parakeet.data.get_feats import LogMelFBank from parakeet.data.get_feats import Pitch @@ -34,7 +35,6 @@ from parakeet.datasets.preprocess_utils import get_input_token from parakeet.datasets.preprocess_utils import get_phn_dur from parakeet.datasets.preprocess_utils import get_spk_id_map from parakeet.datasets.preprocess_utils import merge_silence -from yacs.config import CfgNode def process_sentence(config: Dict[str, Any], diff --git a/examples/vctk/fastspeech2/synthesize.py b/parakeet/exps/fastspeech2/synthesize.py similarity index 100% rename from examples/vctk/fastspeech2/synthesize.py rename to parakeet/exps/fastspeech2/synthesize.py index c1329f8d0..913277571 100644 --- a/examples/vctk/fastspeech2/synthesize.py +++ b/parakeet/exps/fastspeech2/synthesize.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse import logging from pathlib import Path @@ -22,6 +21,7 @@ import paddle import soundfile as sf import yaml from yacs.config import CfgNode + from parakeet.datasets.data_table import DataTable from parakeet.models.fastspeech2 import FastSpeech2 from parakeet.models.fastspeech2 import FastSpeech2Inference diff --git a/examples/vctk/fastspeech2/baker/synthesize_e2e.py b/parakeet/exps/fastspeech2/synthesize_e2e.py similarity index 100% rename from examples/vctk/fastspeech2/baker/synthesize_e2e.py rename to parakeet/exps/fastspeech2/synthesize_e2e.py index 75e06edf7..dd1b57c8a 100644 --- a/examples/vctk/fastspeech2/baker/synthesize_e2e.py +++ b/parakeet/exps/fastspeech2/synthesize_e2e.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse import logging from pathlib import Path @@ -20,13 +19,14 @@ import numpy as np import paddle import soundfile as sf import yaml +from yacs.config import CfgNode + from parakeet.frontend.zh_frontend import Frontend from parakeet.models.fastspeech2 import FastSpeech2 from parakeet.models.fastspeech2 import FastSpeech2Inference from parakeet.models.parallel_wavegan import PWGGenerator from parakeet.models.parallel_wavegan import PWGInference from parakeet.modules.normalizer import ZScore -from yacs.config import CfgNode def evaluate(args, fastspeech2_config, pwg_config): diff --git a/examples/vctk/fastspeech2/ljspeech/synthesize_e2e.py b/parakeet/exps/fastspeech2/synthesize_e2e_en.py similarity index 100% rename from examples/vctk/fastspeech2/ljspeech/synthesize_e2e.py rename to parakeet/exps/fastspeech2/synthesize_e2e_en.py index 6732aa408..4e8a20c75 100644 --- a/examples/vctk/fastspeech2/ljspeech/synthesize_e2e.py +++ b/parakeet/exps/fastspeech2/synthesize_e2e_en.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - import argparse import logging from pathlib import Path @@ -21,6 +20,7 @@ import paddle import soundfile as sf import yaml from yacs.config import CfgNode + from parakeet.frontend import English from parakeet.models.fastspeech2 import FastSpeech2 from parakeet.models.fastspeech2 import FastSpeech2Inference diff --git a/examples/vctk/fastspeech2/train.py b/parakeet/exps/fastspeech2/train.py similarity index 100% rename from examples/vctk/fastspeech2/train.py rename to parakeet/exps/fastspeech2/train.py index 1ea2c561e..59b1ea3af 100644 --- a/examples/vctk/fastspeech2/train.py +++ b/parakeet/exps/fastspeech2/train.py @@ -11,11 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse import logging import os import shutil +from pathlib import Path import jsonlines import numpy as np @@ -25,9 +25,12 @@ from paddle import DataParallel from paddle import distributed as dist from paddle.io import DataLoader from paddle.io import DistributedBatchSampler -from parakeet.datasets.data_table import DataTable -from parakeet.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn +from visualdl import LogWriter +from yacs.config import CfgNode + from parakeet.datasets.am_batch_fn import fastspeech2_multi_spk_batch_fn +from parakeet.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn +from parakeet.datasets.data_table import DataTable from parakeet.models.fastspeech2 import FastSpeech2 from parakeet.models.fastspeech2 import FastSpeech2Evaluator from parakeet.models.fastspeech2 import FastSpeech2Updater @@ -36,9 +39,6 @@ from parakeet.training.extensions.visualizer import VisualDL from parakeet.training.optimizer import build_optimizers from parakeet.training.seeding import seed_everything from 
parakeet.training.trainer import Trainer -from pathlib import Path -from visualdl import LogWriter -from yacs.config import CfgNode def train_sp(args, config): diff --git a/examples/vctk/GANVocoder/README.md b/parakeet/exps/gan_vocoder/README.md similarity index 100% rename from examples/vctk/GANVocoder/README.md rename to parakeet/exps/gan_vocoder/README.md diff --git a/parakeet/exps/gan_vocoder/__init__.py b/parakeet/exps/gan_vocoder/__init__.py index e69de29bb..abf198b97 100644 --- a/parakeet/exps/gan_vocoder/__init__.py +++ b/parakeet/exps/gan_vocoder/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/vctk/GANVocoder/normalize.py b/parakeet/exps/gan_vocoder/normalize.py similarity index 100% rename from examples/vctk/GANVocoder/normalize.py rename to parakeet/exps/gan_vocoder/normalize.py index 74d838adb..c772594bb 100644 --- a/examples/vctk/GANVocoder/normalize.py +++ b/parakeet/exps/gan_vocoder/normalize.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Normalize feature files and dump them.""" - import argparse import logging from operator import itemgetter @@ -20,10 +19,11 @@ from pathlib import Path import jsonlines import numpy as np -from parakeet.datasets.data_table import DataTable from sklearn.preprocessing import StandardScaler from tqdm import tqdm +from parakeet.datasets.data_table import DataTable + def main(): """Run preprocessing process.""" diff --git a/parakeet/exps/gan_vocoder/parallelwave_gan/__init__.py b/parakeet/exps/gan_vocoder/parallelwave_gan/__init__.py new file mode 100644 index 000000000..185a92b8d --- /dev/null +++ b/parakeet/exps/gan_vocoder/parallelwave_gan/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/vctk/GANVocoder/parallelwave_gan/synthesize.py b/parakeet/exps/gan_vocoder/parallelwave_gan/synthesize.py similarity index 100% rename from examples/vctk/GANVocoder/parallelwave_gan/synthesize.py rename to parakeet/exps/gan_vocoder/parallelwave_gan/synthesize.py index e57ddf880..9129caa54 100644 --- a/examples/vctk/GANVocoder/parallelwave_gan/synthesize.py +++ b/parakeet/exps/gan_vocoder/parallelwave_gan/synthesize.py @@ -11,11 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse import os from pathlib import Path -from timer import timer import jsonlines import numpy as np @@ -23,9 +21,11 @@ import paddle import soundfile as sf import yaml from paddle import distributed as dist +from timer import timer +from yacs.config import CfgNode + from parakeet.datasets.data_table import DataTable from parakeet.models.parallel_wavegan import PWGGenerator -from yacs.config import CfgNode def main(): diff --git a/examples/vctk/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py b/parakeet/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py similarity index 87% rename from examples/vctk/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py rename to parakeet/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py index f20f0a726..c451a51c1 100644 --- a/examples/vctk/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py +++ b/parakeet/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py @@ -11,10 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse -import os import logging +import os from pathlib import Path import librosa @@ -22,20 +21,12 @@ import numpy as np import paddle import soundfile as sf import yaml +from yacs.config import CfgNode + from parakeet.data.get_feats import LogMelFBank from parakeet.models.parallel_wavegan import PWGGenerator from parakeet.models.parallel_wavegan import PWGInference from parakeet.modules.normalizer import ZScore -from yacs.config import CfgNode as Configuration - - -def get_cfg_default(): - config_path = (Path(__file__).parent / "conf" / "default.yaml").resolve() - with open(config_path, 'rt') as f: - _C = yaml.safe_load(f) - _C = Configuration(_C) - config = _C.clone() - return config def evaluate(args, config): @@ -91,7 +82,7 @@ def main(): description="Synthesize with parallel wavegan.") parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + "--config", type=str, help="parallel wavegan config file.") parser.add_argument("--checkpoint", type=str, help="snapshot to load.") parser.add_argument( "--stat", @@ -108,9 +99,8 @@ def main(): paddle.set_device(args.device) - config = get_cfg_default() - if args.config: - config.merge_from_file(args.config) + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) print("========Args========") print(yaml.safe_dump(vars(args))) diff --git a/examples/vctk/GANVocoder/parallelwave_gan/train.py b/parakeet/exps/gan_vocoder/parallelwave_gan/train.py similarity index 100% rename from examples/vctk/GANVocoder/parallelwave_gan/train.py rename to parakeet/exps/gan_vocoder/parallelwave_gan/train.py index 7e6aa9a62..7a16ca597 100644 --- a/examples/vctk/GANVocoder/parallelwave_gan/train.py +++ b/parakeet/exps/gan_vocoder/parallelwave_gan/train.py @@ -11,11 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse import logging import os import shutil +from pathlib import Path import jsonlines import numpy as np @@ -28,20 +28,20 @@ from paddle.io import DataLoader from paddle.io import DistributedBatchSampler from paddle.optimizer import Adam # No RAdaom from paddle.optimizer.lr import StepDecay +from visualdl import LogWriter +from yacs.config import CfgNode + from parakeet.datasets.data_table import DataTable from parakeet.datasets.vocoder_batch_fn import Clip -from parakeet.models.parallel_wavegan import PWGGenerator from parakeet.models.parallel_wavegan import PWGDiscriminator -from parakeet.models.parallel_wavegan import PWGUpdater from parakeet.models.parallel_wavegan import PWGEvaluator +from parakeet.models.parallel_wavegan import PWGGenerator +from parakeet.models.parallel_wavegan import PWGUpdater from parakeet.modules.stft_loss import MultiResolutionSTFTLoss from parakeet.training.extensions.snapshot import Snapshot from parakeet.training.extensions.visualizer import VisualDL from parakeet.training.seeding import seed_everything from parakeet.training.trainer import Trainer -from pathlib import Path -from visualdl import LogWriter -from yacs.config import CfgNode def train_sp(args, config): diff --git a/examples/vctk/GANVocoder/preprocess.py b/parakeet/exps/gan_vocoder/preprocess.py similarity index 100% rename from examples/vctk/GANVocoder/preprocess.py rename to parakeet/exps/gan_vocoder/preprocess.py index e9f182869..c10143c71 100644 --- a/examples/vctk/GANVocoder/preprocess.py +++ b/parakeet/exps/gan_vocoder/preprocess.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse import os +from concurrent.futures import ThreadPoolExecutor from operator import itemgetter +from pathlib import Path from typing import Any from typing import Dict from typing import List @@ -24,12 +25,11 @@ import librosa import numpy as np import tqdm import yaml -from concurrent.futures import ThreadPoolExecutor +from yacs.config import CfgNode + from parakeet.data.get_feats import LogMelFBank from parakeet.datasets.preprocess_utils import get_phn_dur from parakeet.datasets.preprocess_utils import merge_silence -from pathlib import Path -from yacs.config import CfgNode def process_sentence(config: Dict[str, Any], diff --git a/parakeet/exps/gan_vocoder/pwgan/__init__.py b/parakeet/exps/gan_vocoder/pwgan/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/parakeet/exps/ge2e/__init__.py b/parakeet/exps/ge2e/__init__.py new file mode 100644 index 000000000..abf198b97 --- /dev/null +++ b/parakeet/exps/ge2e/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/examples/voxceleb/spk0/local/ge2e/audio_processor.py b/parakeet/exps/ge2e/audio_processor.py similarity index 99% rename from examples/voxceleb/spk0/local/ge2e/audio_processor.py rename to parakeet/exps/ge2e/audio_processor.py index 921e99901..2d6bbe34e 100644 --- a/examples/voxceleb/spk0/local/ge2e/audio_processor.py +++ b/parakeet/exps/ge2e/audio_processor.py @@ -11,14 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import struct from pathlib import Path from warnings import warn -import struct -from scipy.ndimage.morphology import binary_dilation -import numpy as np import librosa +import numpy as np +from scipy.ndimage.morphology import binary_dilation try: import webrtcvad @@ -97,7 +96,7 @@ def trim_long_silences(wav, return ret[width - 1:] / width audio_mask = moving_average(voice_flags, vad_moving_average_width) - audio_mask = np.round(audio_mask).astype(np.bool) + audio_mask = np.round(audio_mask).astype(bool) # Dilate the voiced regions audio_mask = binary_dilation(audio_mask, diff --git a/examples/voxceleb/spk0/local/ge2e/config.py b/parakeet/exps/ge2e/config.py similarity index 99% rename from examples/voxceleb/spk0/local/ge2e/config.py rename to parakeet/exps/ge2e/config.py index b8d748aac..3e1142916 100644 --- a/examples/voxceleb/spk0/local/ge2e/config.py +++ b/parakeet/exps/ge2e/config.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- from yacs.config import CfgNode _C = CfgNode() diff --git a/examples/voxceleb/spk0/local/ge2e/dataset_processors.py b/parakeet/exps/ge2e/dataset_processors.py similarity index 98% rename from examples/voxceleb/spk0/local/ge2e/dataset_processors.py rename to parakeet/exps/ge2e/dataset_processors.py index 50a8f3e73..29b584107 100644 --- a/examples/voxceleb/spk0/local/ge2e/dataset_processors.py +++ b/parakeet/exps/ge2e/dataset_processors.py @@ -11,16 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import multiprocessing as mp from functools import partial -from typing import List from pathlib import Path -import multiprocessing as mp +from typing import List import numpy as np from tqdm import tqdm -from audio_processor import SpeakerVerificationPreprocessor +from parakeet.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor def _process_utterance(path_pair, processor: SpeakerVerificationPreprocessor): diff --git a/examples/voxceleb/spk0/local/ge2e/inference.py b/parakeet/exps/ge2e/inference.py similarity index 97% rename from examples/voxceleb/spk0/local/ge2e/inference.py rename to parakeet/exps/ge2e/inference.py index 1cca132da..156866627 100644 --- a/examples/voxceleb/spk0/local/ge2e/inference.py +++ b/parakeet/exps/ge2e/inference.py @@ -11,19 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse from pathlib import Path -import tqdm -import paddle import numpy as np +import paddle +import tqdm +from parakeet.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor +from parakeet.exps.ge2e.config import get_cfg_defaults from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder -from audio_processor import SpeakerVerificationPreprocessor -from config import get_cfg_defaults - def embed_utterance(processor, model, fpath_or_wav): # audio processor diff --git a/examples/voxceleb/spk0/local/ge2e/preprocess.py b/parakeet/exps/ge2e/preprocess.py similarity index 90% rename from examples/voxceleb/spk0/local/ge2e/preprocess.py rename to parakeet/exps/ge2e/preprocess.py index b1e59460e..f6457251d 100644 --- a/examples/voxceleb/spk0/local/ge2e/preprocess.py +++ b/parakeet/exps/ge2e/preprocess.py @@ -11,14 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse from pathlib import Path -from config import get_cfg_defaults + from audio_processor import SpeakerVerificationPreprocessor -from dataset_processors import (process_librispeech, process_voxceleb1, - process_voxceleb2, process_aidatatang_200zh, - process_magicdata) + +from parakeet.exps.ge2e.config import get_cfg_defaults +from parakeet.exps.ge2e.dataset_processors import process_aidatatang_200zh +from parakeet.exps.ge2e.dataset_processors import process_librispeech +from parakeet.exps.ge2e.dataset_processors import process_magicdata +from parakeet.exps.ge2e.dataset_processors import process_voxceleb1 +from parakeet.exps.ge2e.dataset_processors import process_voxceleb2 if __name__ == "__main__": parser = argparse.ArgumentParser( diff --git a/examples/voxceleb/spk0/local/ge2e/random_cycle.py b/parakeet/exps/ge2e/random_cycle.py similarity index 99% rename from examples/voxceleb/spk0/local/ge2e/random_cycle.py rename to parakeet/exps/ge2e/random_cycle.py index 4a2015813..290fd2fa2 100644 --- a/examples/voxceleb/spk0/local/ge2e/random_cycle.py +++ b/parakeet/exps/ge2e/random_cycle.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import random diff --git a/examples/voxceleb/spk0/local/ge2e/speaker_verification_dataset.py b/parakeet/exps/ge2e/speaker_verification_dataset.py similarity index 97% rename from examples/voxceleb/spk0/local/ge2e/speaker_verification_dataset.py rename to parakeet/exps/ge2e/speaker_verification_dataset.py index c9cfda29c..896676d96 100644 --- a/examples/voxceleb/spk0/local/ge2e/speaker_verification_dataset.py +++ b/parakeet/exps/ge2e/speaker_verification_dataset.py @@ -11,14 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import random from pathlib import Path import numpy as np -from paddle.io import Dataset, BatchSampler +from paddle.io import BatchSampler +from paddle.io import Dataset -from random_cycle import random_cycle +from parakeet.exps.ge2e.random_cycle import random_cycle class MultiSpeakerMelDataset(Dataset): diff --git a/examples/voxceleb/spk0/local/ge2e/train.py b/parakeet/exps/ge2e/train.py similarity index 93% rename from examples/voxceleb/spk0/local/ge2e/train.py rename to parakeet/exps/ge2e/train.py index 950d486df..7a59c436b 100644 --- a/examples/voxceleb/spk0/local/ge2e/train.py +++ b/parakeet/exps/ge2e/train.py @@ -11,23 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import time -from paddle import distributed as dist -from paddle.optimizer import Adam from paddle import DataParallel +from paddle import distributed as dist from paddle.io import DataLoader from paddle.nn.clip import ClipGradByGlobalNorm +from paddle.optimizer import Adam +from parakeet.exps.ge2e.config import get_cfg_defaults +from parakeet.exps.ge2e.speaker_verification_dataset import Collate +from parakeet.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset +from parakeet.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder -from parakeet.training import ExperimentBase from parakeet.training import default_argument_parser - -from speaker_verification_dataset import MultiSpeakerMelDataset -from speaker_verification_dataset import MultiSpeakerSampler -from speaker_verification_dataset import Collate -from config import get_cfg_defaults +from parakeet.training import ExperimentBase class Ge2eExperiment(ExperimentBase): diff --git a/examples/csmsc/speedyspeech/sentences.txt b/parakeet/exps/sentences.txt similarity index 100% rename from 
examples/csmsc/speedyspeech/sentences.txt rename to parakeet/exps/sentences.txt diff --git a/examples/vctk/fastspeech2/sentences_en.txt b/parakeet/exps/sentences_en.txt similarity index 100% rename from examples/vctk/fastspeech2/sentences_en.txt rename to parakeet/exps/sentences_en.txt diff --git a/parakeet/exps/speedyspeech/__init__.py b/parakeet/exps/speedyspeech/__init__.py new file mode 100644 index 000000000..abf198b97 --- /dev/null +++ b/parakeet/exps/speedyspeech/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/csmsc/speedyspeech/baker/inference.py b/parakeet/exps/speedyspeech/inference.py similarity index 100% rename from examples/csmsc/speedyspeech/baker/inference.py rename to parakeet/exps/speedyspeech/inference.py index a1d185402..bf144d760 100644 --- a/examples/csmsc/speedyspeech/baker/inference.py +++ b/parakeet/exps/speedyspeech/inference.py @@ -11,13 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse import os from pathlib import Path import soundfile as sf from paddle import inference + from parakeet.frontend.zh_frontend import Frontend diff --git a/examples/csmsc/speedyspeech/normalize.py b/parakeet/exps/speedyspeech/normalize.py similarity index 100% rename from examples/csmsc/speedyspeech/normalize.py rename to parakeet/exps/speedyspeech/normalize.py index eeb58bb7c..8f02c33cc 100644 --- a/examples/csmsc/speedyspeech/normalize.py +++ b/parakeet/exps/speedyspeech/normalize.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Normalize feature files and dump them.""" - import argparse import logging from operator import itemgetter @@ -20,10 +19,11 @@ from pathlib import Path import jsonlines import numpy as np -from parakeet.datasets.data_table import DataTable from sklearn.preprocessing import StandardScaler from tqdm import tqdm +from parakeet.datasets.data_table import DataTable + def main(): """Run preprocessing process.""" diff --git a/examples/csmsc/speedyspeech/preprocess.py b/parakeet/exps/speedyspeech/preprocess.py similarity index 100% rename from examples/csmsc/speedyspeech/preprocess.py rename to parakeet/exps/speedyspeech/preprocess.py index 647c9b363..f3ae294d8 100644 --- a/examples/csmsc/speedyspeech/preprocess.py +++ b/parakeet/exps/speedyspeech/preprocess.py @@ -11,27 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import argparse +import re +from concurrent.futures import ThreadPoolExecutor from operator import itemgetter +from pathlib import Path from typing import Any from typing import Dict from typing import List -import argparse import jsonlines import librosa import numpy as np -import re import tqdm import yaml -from concurrent.futures import ThreadPoolExecutor +from yacs.config import CfgNode + from parakeet.data.get_feats import LogMelFBank from parakeet.datasets.preprocess_utils import compare_duration_and_mel_length -from parakeet.datasets.preprocess_utils import get_phones_tones from parakeet.datasets.preprocess_utils import get_phn_dur +from parakeet.datasets.preprocess_utils import get_phones_tones from parakeet.datasets.preprocess_utils import merge_silence -from pathlib import Path -from yacs.config import CfgNode def process_sentence(config: Dict[str, Any], diff --git a/examples/csmsc/speedyspeech/synthesize.py b/parakeet/exps/speedyspeech/synthesize.py similarity index 100% rename from examples/csmsc/speedyspeech/synthesize.py rename to parakeet/exps/speedyspeech/synthesize.py index 4225071ec..43ab4a69b 100644 --- a/examples/csmsc/speedyspeech/synthesize.py +++ b/parakeet/exps/speedyspeech/synthesize.py @@ -11,25 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os -import logging import argparse +import logging +import os from pathlib import Path import jsonlines import numpy as np -import soundfile as sf import paddle +import soundfile as sf import yaml from paddle import jit from paddle.static import InputSpec from yacs.config import CfgNode from parakeet.datasets.data_table import DataTable -from parakeet.models.speedyspeech import SpeedySpeech -from parakeet.models.speedyspeech import SpeedySpeechInference from parakeet.models.parallel_wavegan import PWGGenerator from parakeet.models.parallel_wavegan import PWGInference +from parakeet.models.speedyspeech import SpeedySpeech +from parakeet.models.speedyspeech import SpeedySpeechInference from parakeet.modules.normalizer import ZScore diff --git a/examples/csmsc/speedyspeech/baker/synthesize_e2e.py b/parakeet/exps/speedyspeech/synthesize_e2e.py similarity index 100% rename from examples/csmsc/speedyspeech/baker/synthesize_e2e.py rename to parakeet/exps/speedyspeech/synthesize_e2e.py index 6dd3abd1a..47e064e95 100644 --- a/examples/csmsc/speedyspeech/baker/synthesize_e2e.py +++ b/parakeet/exps/speedyspeech/synthesize_e2e.py @@ -11,25 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse import logging import os from pathlib import Path import numpy as np -import soundfile as sf import paddle +import soundfile as sf import yaml from paddle import jit from paddle.static import InputSpec +from yacs.config import CfgNode + from parakeet.frontend.zh_frontend import Frontend -from parakeet.models.speedyspeech import SpeedySpeech -from parakeet.models.speedyspeech import SpeedySpeechInference from parakeet.models.parallel_wavegan import PWGGenerator from parakeet.models.parallel_wavegan import PWGInference +from parakeet.models.speedyspeech import SpeedySpeech +from parakeet.models.speedyspeech import SpeedySpeechInference from parakeet.modules.normalizer import ZScore -from yacs.config import CfgNode def evaluate(args, speedyspeech_config, pwg_config): diff --git a/examples/csmsc/speedyspeech/train.py b/parakeet/exps/speedyspeech/train.py similarity index 100% rename from examples/csmsc/speedyspeech/train.py rename to parakeet/exps/speedyspeech/train.py index f7a4e3018..ea9fe20d7 100644 --- a/examples/csmsc/speedyspeech/train.py +++ b/parakeet/exps/speedyspeech/train.py @@ -11,22 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse import logging import os import shutil +from pathlib import Path import jsonlines import numpy as np import paddle import yaml -from paddle import distributed as dist from paddle import DataParallel +from paddle import distributed as dist from paddle.io import DataLoader from paddle.io import DistributedBatchSampler -from parakeet.datasets.data_table import DataTable +from visualdl import LogWriter +from yacs.config import CfgNode + from parakeet.datasets.am_batch_fn import speedyspeech_batch_fn +from parakeet.datasets.data_table import DataTable from parakeet.models.speedyspeech import SpeedySpeech from parakeet.models.speedyspeech import SpeedySpeechEvaluator from parakeet.models.speedyspeech import SpeedySpeechUpdater @@ -35,9 +38,6 @@ from parakeet.training.extensions.visualizer import VisualDL from parakeet.training.optimizer import build_optimizers from parakeet.training.seeding import seed_everything from parakeet.training.trainer import Trainer -from pathlib import Path -from visualdl import LogWriter -from yacs.config import CfgNode def train_sp(args, config): diff --git a/parakeet/exps/tacotron2/__init__.py b/parakeet/exps/tacotron2/__init__.py new file mode 100644 index 000000000..abf198b97 --- /dev/null +++ b/parakeet/exps/tacotron2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/examples/ljspeech/tts0/local/tacotron2/config.py b/parakeet/exps/tacotron2/config.py similarity index 99% rename from examples/ljspeech/tts0/local/tacotron2/config.py rename to parakeet/exps/tacotron2/config.py index e370e77a8..0ce2df368 100644 --- a/examples/ljspeech/tts0/local/tacotron2/config.py +++ b/parakeet/exps/tacotron2/config.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from yacs.config import CfgNode as CN _C = CN() diff --git a/examples/ljspeech/tts0/local/tacotron2/ljspeech.py b/parakeet/exps/tacotron2/ljspeech.py similarity index 97% rename from examples/ljspeech/tts0/local/tacotron2/ljspeech.py rename to parakeet/exps/tacotron2/ljspeech.py index 76e4b3a6e..20dc29d37 100644 --- a/examples/ljspeech/tts0/local/tacotron2/ljspeech.py +++ b/parakeet/exps/tacotron2/ljspeech.py @@ -11,14 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from pathlib import Path import pickle +from pathlib import Path import numpy as np from paddle.io import Dataset -from parakeet.data.batch import batch_spec, batch_text_id +from parakeet.data.batch import batch_spec +from parakeet.data.batch import batch_text_id class LJSpeech(Dataset): diff --git a/examples/ljspeech/tts0/local/tacotron2/preprocess.py b/parakeet/exps/tacotron2/preprocess.py similarity index 95% rename from examples/ljspeech/tts0/local/tacotron2/preprocess.py rename to parakeet/exps/tacotron2/preprocess.py index aa7bf2449..893444855 100644 --- a/examples/ljspeech/tts0/local/tacotron2/preprocess.py +++ b/parakeet/exps/tacotron2/preprocess.py @@ -11,21 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - +import argparse import os import pickle -import argparse from pathlib import Path -import tqdm import numpy as np +import tqdm +from parakeet.audio import AudioProcessor +from parakeet.audio import LogMagnitude from parakeet.datasets import LJSpeechMetaData -from parakeet.audio import AudioProcessor, LogMagnitude +from parakeet.exps.tacotron2.config import get_cfg_defaults from parakeet.frontend import EnglishCharacter -from config import get_cfg_defaults - def create_dataset(config, source_path, target_path, verbose=False): # create output dir diff --git a/examples/ljspeech/tts0/local/tacotron2/synthesize.ipynb b/parakeet/exps/tacotron2/synthesize.ipynb similarity index 100% rename from examples/ljspeech/tts0/local/tacotron2/synthesize.ipynb rename to parakeet/exps/tacotron2/synthesize.ipynb diff --git a/examples/ljspeech/tts0/local/tacotron2/synthesize.py b/parakeet/exps/tacotron2/synthesize.py similarity index 91% rename from examples/ljspeech/tts0/local/tacotron2/synthesize.py rename to parakeet/exps/tacotron2/synthesize.py index f933c32c1..56257c9b0 100644 --- a/examples/ljspeech/tts0/local/tacotron2/synthesize.py +++ b/parakeet/exps/tacotron2/synthesize.py @@ -11,20 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse from pathlib import Path -import paddle import numpy as np +import paddle from matplotlib import pyplot as plt +from parakeet.exps.tacotron2.config import get_cfg_defaults from parakeet.frontend import EnglishCharacter from parakeet.models.tacotron2 import Tacotron2 from parakeet.utils import display -from config import get_cfg_defaults - def main(config, args): paddle.set_device(args.device) @@ -36,8 +34,13 @@ def main(config, args): # inputs input_path = Path(args.input).expanduser() + sentences = [] with open(input_path, "rt") as f: - sentences = f.readlines() + for line in f: + line_list = line.strip().split() + utt_id = line_list[0] + sentence = " ".join(line_list[1:]) + sentences.append((utt_id, sentence)) if args.output is None: output_dir = input_path.parent / "synthesis" diff --git a/examples/ljspeech/tts0/local/tacotron2/train.py b/parakeet/exps/tacotron2/train.py similarity index 95% rename from examples/ljspeech/tts0/local/tacotron2/train.py rename to parakeet/exps/tacotron2/train.py index 82dd4c32e..3677c271d 100644 --- a/examples/ljspeech/tts0/local/tacotron2/train.py +++ b/parakeet/exps/tacotron2/train.py @@ -11,23 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import time from collections import defaultdict import numpy as np import paddle +from paddle import distributed as dist from paddle.io import DataLoader from paddle.io import DistributedBatchSampler -from paddle import distributed as dist + from parakeet.data import dataset +from parakeet.exps.tacotron2.config import get_cfg_defaults +from parakeet.exps.tacotron2.ljspeech import LJSpeech +from parakeet.exps.tacotron2.ljspeech import LJSpeechCollector +from parakeet.models.tacotron2 import Tacotron2 +from parakeet.models.tacotron2 import Tacotron2Loss from parakeet.training.cli import default_argument_parser from parakeet.training.experiment import ExperimentBase -from parakeet.utils import display, mp_tools -from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss - -from config import get_cfg_defaults -from ljspeech import LJSpeech, LJSpeechCollector +from parakeet.utils import display +from parakeet.utils import mp_tools class Experiment(ExperimentBase): diff --git a/parakeet/exps/tacotron2_ge2e/__init__.py b/parakeet/exps/tacotron2_ge2e/__init__.py new file mode 100644 index 000000000..abf198b97 --- /dev/null +++ b/parakeet/exps/tacotron2_ge2e/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/examples/aishell3/vc0/local/tacotron2/aishell3.py b/parakeet/exps/tacotron2_ge2e/aishell3.py similarity index 92% rename from examples/aishell3/vc0/local/tacotron2/aishell3.py rename to parakeet/exps/tacotron2_ge2e/aishell3.py index c53cf59dc..542573996 100644 --- a/examples/aishell3/vc0/local/tacotron2/aishell3.py +++ b/parakeet/exps/tacotron2_ge2e/aishell3.py @@ -11,16 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import pickle from pathlib import Path import numpy as np from paddle.io import Dataset -from parakeet.frontend import Vocab -from parakeet.data import batch_text_id, batch_spec -from preprocess_transcription import _phones, _tones +from parakeet.data import batch_spec +from parakeet.data import batch_text_id +from parakeet.exps.tacotron2_ge2e.preprocess_transcription import _phones +from parakeet.exps.tacotron2_ge2e.preprocess_transcription import _tones +from parakeet.frontend import Vocab voc_phones = Vocab(sorted(list(_phones))) print("vocab_phones:\n", voc_phones) diff --git a/examples/aishell3/vc0/local/tacotron2/chinese_g2p.py b/parakeet/exps/tacotron2_ge2e/chinese_g2p.py similarity index 86% rename from examples/aishell3/vc0/local/tacotron2/chinese_g2p.py rename to parakeet/exps/tacotron2_ge2e/chinese_g2p.py index e2437f06f..6cb86d9db 100644 --- a/examples/aishell3/vc0/local/tacotron2/chinese_g2p.py +++ b/parakeet/exps/tacotron2_ge2e/chinese_g2p.py @@ -11,10 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import List +from typing import Tuple -from typing import List, Tuple -from pypinyin import lazy_pinyin, Style -from preprocess_transcription import split_syllable +from pypinyin import lazy_pinyin +from pypinyin import Style + +from parakeet.exps.tacotron2_ge2e.preprocess_transcription import split_syllable def convert_to_pinyin(text: str) -> List[str]: diff --git a/examples/aishell3/vc0/local/tacotron2/config.py b/parakeet/exps/tacotron2_ge2e/config.py similarity index 99% rename from examples/aishell3/vc0/local/tacotron2/config.py rename to parakeet/exps/tacotron2_ge2e/config.py index 440bdbd94..8d8c9c4e1 100644 --- a/examples/aishell3/vc0/local/tacotron2/config.py +++ b/parakeet/exps/tacotron2_ge2e/config.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from yacs.config import CfgNode as CN _C = CN() diff --git a/examples/aishell3/vc0/local/tacotron2/extract_mel.py b/parakeet/exps/tacotron2_ge2e/extract_mel.py similarity index 92% rename from examples/aishell3/vc0/local/tacotron2/extract_mel.py rename to parakeet/exps/tacotron2_ge2e/extract_mel.py index b7bafb86c..e32f3e3bb 100644 --- a/examples/aishell3/vc0/local/tacotron2/extract_mel.py +++ b/parakeet/exps/tacotron2_ge2e/extract_mel.py @@ -11,19 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse import multiprocessing as mp from functools import partial from pathlib import Path import numpy as np -from parakeet.audio import AudioProcessor -from parakeet.audio.spec_normalizer import NormalizerBase, LogMagnitude - import tqdm -from config import get_cfg_defaults +from parakeet.audio import AudioProcessor +from parakeet.audio.spec_normalizer import LogMagnitude +from parakeet.audio.spec_normalizer import NormalizerBase +from parakeet.exps.tacotron2_ge2e.config import get_cfg_defaults def extract_mel(fname: Path, @@ -47,7 +46,7 @@ def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"): output_dir.mkdir(parents=True, exist_ok=True) p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length, - config.hop_length, config.n_mels, config.fmin, + config.hop_length, config.d_mels, config.fmin, config.fmax) n = LogMagnitude(1e-5) diff --git a/examples/aishell3/vc0/local/tacotron2/lexicon.txt b/parakeet/exps/tacotron2_ge2e/lexicon.txt similarity index 100% rename from examples/aishell3/vc0/local/tacotron2/lexicon.txt rename to parakeet/exps/tacotron2_ge2e/lexicon.txt diff --git a/examples/aishell3/vc0/local/tacotron2/preprocess_transcription.py b/parakeet/exps/tacotron2_ge2e/preprocess_transcription.py similarity index 99% rename from examples/aishell3/vc0/local/tacotron2/preprocess_transcription.py rename to parakeet/exps/tacotron2_ge2e/preprocess_transcription.py index fa74331b3..ce117d420 100644 --- a/examples/aishell3/vc0/local/tacotron2/preprocess_transcription.py +++ b/parakeet/exps/tacotron2_ge2e/preprocess_transcription.py @@ -11,14 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse -from pathlib import Path -import re import pickle +import re +from pathlib import Path -import yaml import tqdm +import yaml zh_pattern = re.compile("[\u4e00-\u9fa5]") diff --git a/examples/aishell3/vc0/local/tacotron2/process_wav.py b/parakeet/exps/tacotron2_ge2e/process_wav.py similarity index 99% rename from examples/aishell3/vc0/local/tacotron2/process_wav.py rename to parakeet/exps/tacotron2_ge2e/process_wav.py index 34d408970..c1be0a37e 100644 --- a/examples/aishell3/vc0/local/tacotron2/process_wav.py +++ b/parakeet/exps/tacotron2_ge2e/process_wav.py @@ -11,17 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse -from pathlib import Path -from multiprocessing import Pool from functools import partial +from multiprocessing import Pool +from pathlib import Path -import numpy as np import librosa +import numpy as np import soundfile as sf -from tqdm import tqdm from praatio import tgio +from tqdm import tqdm def get_valid_part(fpath): diff --git a/examples/aishell3/vc0/local/tacotron2/train.py b/parakeet/exps/tacotron2_ge2e/train.py similarity index 94% rename from examples/aishell3/vc0/local/tacotron2/train.py rename to parakeet/exps/tacotron2_ge2e/train.py index de0181168..35878a1b5 100644 --- a/examples/aishell3/vc0/local/tacotron2/train.py +++ b/parakeet/exps/tacotron2_ge2e/train.py @@ -11,26 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import time -from pathlib import Path from collections import defaultdict +from pathlib import Path import numpy as np -from matplotlib import pyplot as plt - import paddle +from matplotlib import pyplot as plt from paddle import distributed as dist -from paddle.io import DataLoader, DistributedBatchSampler +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler from parakeet.data import dataset +from parakeet.exps.tacotron2_ge2e.aishell3 import AiShell3 +from parakeet.exps.tacotron2_ge2e.aishell3 import collate_aishell3_examples +from parakeet.exps.tacotron2_ge2e.config import get_cfg_defaults +from parakeet.models.tacotron2 import Tacotron2 +from parakeet.models.tacotron2 import Tacotron2Loss from parakeet.training.cli import default_argument_parser from parakeet.training.experiment import ExperimentBase -from parakeet.utils import display, mp_tools -from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss - -from config import get_cfg_defaults -from aishell3 import AiShell3, collate_aishell3_examples +from parakeet.utils import display +from parakeet.utils import mp_tools class Experiment(ExperimentBase): @@ -192,9 +193,9 @@ class Experiment(ExperimentBase): def setup_dataloader(self): args = self.args config = self.config - ljspeech_dataset = AiShell3(args.data) + aishell3_dataset = AiShell3(args.data) - valid_set, train_set = dataset.split(ljspeech_dataset, + valid_set, train_set = dataset.split(aishell3_dataset, config.data.valid_size) batch_fn = collate_aishell3_examples diff --git a/parakeet/exps/tacotron2_ge2e/voice_cloning.py b/parakeet/exps/tacotron2_ge2e/voice_cloning.py new file mode 100644 index 000000000..269b0c18f --- /dev/null +++ b/parakeet/exps/tacotron2_ge2e/voice_cloning.py @@ -0,0 +1,160 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +from pathlib import Path + +import numpy as np +import paddle +import soundfile as sf +from matplotlib import pyplot as plt + +from parakeet.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor +from parakeet.exps.tacotron2_ge2e.aishell3 import voc_phones +from parakeet.exps.tacotron2_ge2e.aishell3 import voc_tones +from parakeet.exps.tacotron2_ge2e.chinese_g2p import convert_sentence +from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder +from parakeet.models.tacotron2 import Tacotron2 +from parakeet.models.waveflow import ConditionalWaveFlow +from parakeet.utils import display + + +def voice_cloning(args): + # speaker encoder + p = SpeakerVerificationPreprocessor( + sampling_rate=16000, + audio_norm_target_dBFS=-30, + vad_window_length=30, + vad_moving_average_width=8, + vad_max_silence_length=6, + mel_window_length=25, + mel_window_step=10, + n_mels=40, + partial_n_frames=160, + min_pad_coverage=0.75, + partial_overlap_ratio=0.5) + print("Audio Processor Done!") + + speaker_encoder = LSTMSpeakerEncoder( + n_mels=40, num_layers=3, hidden_size=256, output_size=256) + speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path)) + speaker_encoder.eval() + print("GE2E Done!") + + synthesizer = Tacotron2( + vocab_size=68, + n_tones=10, + d_mels=80, + d_encoder=512, + encoder_conv_layers=3, + encoder_kernel_size=5, + d_prenet=256, + d_attention_rnn=1024, + d_decoder_rnn=1024, + attention_filters=32, + attention_kernel_size=31, + d_attention=128, + d_postnet=512, + postnet_kernel_size=5, + 
postnet_conv_layers=5, + reduction_factor=1, + p_encoder_dropout=0.5, + p_prenet_dropout=0.5, + p_attention_dropout=0.1, + p_decoder_dropout=0.1, + p_postnet_dropout=0.5, + d_global_condition=256, + use_stop_token=False, ) + synthesizer.set_state_dict(paddle.load(args.tacotron2_params_path)) + synthesizer.eval() + print("Tacotron2 Done!") + + # vocoder + vocoder = ConditionalWaveFlow( + upsample_factors=[16, 16], + n_flows=8, + n_layers=8, + n_group=16, + channels=128, + n_mels=80, + kernel_size=[3, 3]) + vocoder.set_state_dict(paddle.load(args.waveflow_params_path)) + vocoder.eval() + print("WaveFlow Done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + input_dir = Path(args.input_dir) + + # 因为 AISHELL-3 数据集中使用 % 和 $ 表示韵律词和韵律短语的边界,它们大约对应着较短和较长的停顿,在文本中可以使用 % 和 $ 来调节韵律。 + # 值得的注意的是,句子的有效字符集仅包含汉字和 %, $, 因此输入的句子只能包含这些字符。 + sentence = "每当你觉得%想要批评什么人的时候$你切要记着%这个世界上的人%并非都具备你禀有的条件$" + phones, tones = convert_sentence(sentence) + phones = np.array( + [voc_phones.lookup(item) for item in phones], dtype=np.int64) + tones = np.array([voc_tones.lookup(item) for item in tones], dtype=np.int64) + phones = paddle.to_tensor(phones).unsqueeze(0) + tones = paddle.to_tensor(tones).unsqueeze(0) + + for name in os.listdir(input_dir): + utt_id = name.split(".")[0] + ref_audio_path = input_dir / name + mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path)) + print("mel_sequences: ", mel_sequences.shape) + with paddle.no_grad(): + embed = speaker_encoder.embed_utterance( + paddle.to_tensor(mel_sequences)) + print("embed shape: ", embed.shape) + utterance_embeds = paddle.unsqueeze(embed, 0) + outputs = synthesizer.infer( + phones, tones=tones, global_condition=utterance_embeds) + mel_input = paddle.transpose(outputs["mel_outputs_postnet"], [0, 2, 1]) + alignment = outputs["alignments"][0].numpy().T + display.plot_alignment(alignment) + plt.savefig(str(output_dir / (utt_id + ".png"))) + + with paddle.no_grad(): + wav = 
vocoder.infer(mel_input) + wav = wav.numpy()[0] + sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=22050) + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="") + parser.add_argument( + "--ge2e_params_path", type=str, help="ge2e params path.") + parser.add_argument( + "--tacotron2_params_path", type=str, help="tacotron2 params path.") + parser.add_argument( + "--waveflow_params_path", type=str, help="waveflow params path.") + parser.add_argument( + "--device", type=str, default="gpu", help="device type to use.") + + parser.add_argument( + "--input-dir", + type=str, + help="input dir of *.wav, the sample rate will be resample to 16k.") + parser.add_argument("--output-dir", type=str, help="output dir.") + + args = parser.parse_args() + + paddle.set_device(args.device) + + voice_cloning(args) + + +if __name__ == "__main__": + main() diff --git a/parakeet/exps/transformer_tts/__init__.py b/parakeet/exps/transformer_tts/__init__.py new file mode 100644 index 000000000..abf198b97 --- /dev/null +++ b/parakeet/exps/transformer_tts/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/examples/ljspeech/tts1/local/transformer_tts/normalize.py b/parakeet/exps/transformer_tts/normalize.py similarity index 100% rename from examples/ljspeech/tts1/local/transformer_tts/normalize.py rename to parakeet/exps/transformer_tts/normalize.py index a666ca2f2..127449ee3 100644 --- a/examples/ljspeech/tts1/local/transformer_tts/normalize.py +++ b/parakeet/exps/transformer_tts/normalize.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Normalize feature files and dump them.""" - import argparse import logging from operator import itemgetter @@ -20,10 +19,11 @@ from pathlib import Path import jsonlines import numpy as np -from parakeet.datasets.data_table import DataTable from sklearn.preprocessing import StandardScaler from tqdm import tqdm +from parakeet.datasets.data_table import DataTable + def main(): """Run preprocessing process.""" diff --git a/examples/ljspeech/tts1/local/transformer_tts/preprocess.py b/parakeet/exps/transformer_tts/preprocess.py similarity index 100% rename from examples/ljspeech/tts1/local/transformer_tts/preprocess.py rename to parakeet/exps/transformer_tts/preprocess.py index 0f998bc30..96696eaed 100644 --- a/examples/ljspeech/tts1/local/transformer_tts/preprocess.py +++ b/parakeet/exps/transformer_tts/preprocess.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse from concurrent.futures import ThreadPoolExecutor from operator import itemgetter @@ -25,9 +24,10 @@ import librosa import numpy as np import tqdm import yaml +from yacs.config import CfgNode as Configuration + from parakeet.data.get_feats import LogMelFBank from parakeet.frontend import English -from yacs.config import CfgNode as Configuration def get_lj_sentences(file_name, frontend): diff --git a/examples/ljspeech/tts1/local/transformer_tts/synthesize.py b/parakeet/exps/transformer_tts/synthesize.py similarity index 100% rename from examples/ljspeech/tts1/local/transformer_tts/synthesize.py rename to parakeet/exps/transformer_tts/synthesize.py index 21614c539..5c1945d28 100644 --- a/examples/ljspeech/tts1/local/transformer_tts/synthesize.py +++ b/parakeet/exps/transformer_tts/synthesize.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse import logging from pathlib import Path @@ -22,6 +21,7 @@ import paddle import soundfile as sf import yaml from yacs.config import CfgNode + from parakeet.datasets.data_table import DataTable from parakeet.models.transformer_tts import TransformerTTS from parakeet.models.transformer_tts import TransformerTTSInference diff --git a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize_e2e.py b/parakeet/exps/transformer_tts/synthesize_e2e.py similarity index 100% rename from examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize_e2e.py rename to parakeet/exps/transformer_tts/synthesize_e2e.py index 7ca75a8f4..2bee77d35 100644 --- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize_e2e.py +++ b/parakeet/exps/transformer_tts/synthesize_e2e.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - import argparse import logging from pathlib import Path @@ -21,6 +20,7 @@ import paddle import soundfile as sf import yaml from yacs.config import CfgNode + from parakeet.frontend import English from parakeet.models.transformer_tts import TransformerTTS from parakeet.models.transformer_tts import TransformerTTSInference diff --git a/examples/ljspeech/tts1/local/transformer_tts/train.py b/parakeet/exps/transformer_tts/train.py similarity index 100% rename from examples/ljspeech/tts1/local/transformer_tts/train.py rename to parakeet/exps/transformer_tts/train.py index b1263bcca..fdaff3475 100644 --- a/examples/ljspeech/tts1/local/transformer_tts/train.py +++ b/parakeet/exps/transformer_tts/train.py @@ -11,10 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse -import os import logging +import os import shutil from pathlib import Path @@ -26,18 +25,19 @@ from paddle import DataParallel from paddle import distributed as dist from paddle.io import DataLoader from paddle.io import DistributedBatchSampler -from parakeet.datasets.data_table import DataTable +from visualdl import LogWriter +from yacs.config import CfgNode + from parakeet.datasets.am_batch_fn import transformer_single_spk_batch_fn +from parakeet.datasets.data_table import DataTable from parakeet.models.transformer_tts import TransformerTTS -from parakeet.models.transformer_tts import TransformerTTSUpdater from parakeet.models.transformer_tts import TransformerTTSEvaluator +from parakeet.models.transformer_tts import TransformerTTSUpdater from parakeet.training.extensions.snapshot import Snapshot from parakeet.training.extensions.visualizer import VisualDL from parakeet.training.optimizer import build_optimizers from parakeet.training.seeding import seed_everything from parakeet.training.trainer import Trainer -from visualdl import LogWriter -from yacs.config import CfgNode def train_sp(args, config): diff --git a/parakeet/exps/waveflow/__init__.py b/parakeet/exps/waveflow/__init__.py new file mode 100644 index 000000000..abf198b97 --- /dev/null +++ b/parakeet/exps/waveflow/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/examples/ljspeech/voc0/local/waveflow/config.py b/parakeet/exps/waveflow/config.py similarity index 99% rename from examples/ljspeech/voc0/local/waveflow/config.py rename to parakeet/exps/waveflow/config.py index d009a2c82..869caa6a2 100644 --- a/examples/ljspeech/voc0/local/waveflow/config.py +++ b/parakeet/exps/waveflow/config.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from yacs.config import CfgNode as CN _C = CN() diff --git a/examples/ljspeech/voc0/local/waveflow/ljspeech.py b/parakeet/exps/waveflow/ljspeech.py similarity index 97% rename from examples/ljspeech/voc0/local/waveflow/ljspeech.py rename to parakeet/exps/waveflow/ljspeech.py index afeba3915..ca18f400e 100644 --- a/examples/ljspeech/voc0/local/waveflow/ljspeech.py +++ b/parakeet/exps/waveflow/ljspeech.py @@ -11,14 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from pathlib import Path import numpy as np import pandas from paddle.io import Dataset -from parakeet.data.batch import batch_spec, batch_wav +from parakeet.data.batch import batch_spec +from parakeet.data.batch import batch_wav class LJSpeech(Dataset): diff --git a/examples/ljspeech/voc0/local/waveflow/preprocess.py b/parakeet/exps/waveflow/preprocess.py similarity index 98% rename from examples/ljspeech/voc0/local/waveflow/preprocess.py rename to parakeet/exps/waveflow/preprocess.py index 199081c00..d4ec0de5d 100644 --- a/examples/ljspeech/voc0/local/waveflow/preprocess.py +++ b/parakeet/exps/waveflow/preprocess.py @@ -11,20 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -import os import argparse +import os from pathlib import Path -import tqdm -import numpy as np import librosa +import numpy as np import pandas as pd +import tqdm -from parakeet.datasets import LJSpeechMetaData from parakeet.audio import LogMagnitude - -from config import get_cfg_defaults +from parakeet.datasets import LJSpeechMetaData +from parakeet.exps.waveflow.config import get_cfg_defaults class Transform(object): diff --git a/examples/ljspeech/voc0/local/waveflow/synthesize.py b/parakeet/exps/waveflow/synthesize.py similarity index 97% rename from examples/ljspeech/voc0/local/waveflow/synthesize.py rename to parakeet/exps/waveflow/synthesize.py index e25cec3ee..4dd52514a 100644 --- a/examples/ljspeech/voc0/local/waveflow/synthesize.py +++ b/parakeet/exps/waveflow/synthesize.py @@ -11,20 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import os import argparse +import os from pathlib import Path import numpy as np -import soundfile as sf import paddle +import soundfile as sf +from parakeet.exps.waveflow.config import get_cfg_defaults from parakeet.models.waveflow import ConditionalWaveFlow from parakeet.utils import layer_tools -from config import get_cfg_defaults - def main(config, args): paddle.set_device(args.device) diff --git a/examples/ljspeech/voc0/local/waveflow/train.py b/parakeet/exps/waveflow/train.py similarity index 92% rename from examples/ljspeech/voc0/local/waveflow/train.py rename to parakeet/exps/waveflow/train.py index 359670fac..ecfcbcaac 100644 --- a/examples/ljspeech/voc0/local/waveflow/train.py +++ b/parakeet/exps/waveflow/train.py @@ -11,22 +11,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import time import numpy as np import paddle from paddle import distributed as dist -from paddle.io import DataLoader, DistributedBatchSampler +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler from parakeet.data import dataset -from parakeet.models.waveflow import ConditionalWaveFlow, WaveFlowLoss -from parakeet.utils import mp_tools +from parakeet.exps.waveflow.config import get_cfg_defaults +from parakeet.exps.waveflow.ljspeech import LJSpeech +from parakeet.exps.waveflow.ljspeech import LJSpeechClipCollector +from parakeet.exps.waveflow.ljspeech import LJSpeechCollector +from parakeet.models.waveflow import ConditionalWaveFlow +from parakeet.models.waveflow import WaveFlowLoss from parakeet.training.cli import default_argument_parser from parakeet.training.experiment import ExperimentBase - -from config import get_cfg_defaults -from ljspeech import LJSpeech, LJSpeechClipCollector, LJSpeechCollector +from parakeet.utils import mp_tools class Experiment(ExperimentBase): diff --git a/parakeet/frontend/__init__.py b/parakeet/frontend/__init__.py index b8779b65b..64015435e 100644 --- a/parakeet/frontend/__init__.py +++ b/parakeet/frontend/__init__.py @@ -11,11 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from .zh_normalization import * from .generate_lexicon import * from .normalizer import * from .phonectic import * from .punctuation import * from .tone_sandhi import * from .vocab import * +from .zh_normalization import * diff --git a/parakeet/frontend/normalizer/__init__.py b/parakeet/frontend/normalizer/__init__.py index 37fd5806d..d1f2bfc53 100644 --- a/parakeet/frontend/normalizer/__init__.py +++ b/parakeet/frontend/normalizer/__init__.py @@ -11,6 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - from parakeet.frontend.normalizer.normalizer import * from parakeet.frontend.normalizer.numbers import * diff --git a/parakeet/frontend/phonectic.py b/parakeet/frontend/phonectic.py index 23662254c..874c19795 100644 --- a/parakeet/frontend/phonectic.py +++ b/parakeet/frontend/phonectic.py @@ -17,9 +17,9 @@ from abc import abstractmethod from g2p_en import G2p from g2pM import G2pM -from parakeet.frontend.vocab import Vocab from parakeet.frontend.normalizer.normalizer import normalize from parakeet.frontend.punctuation import get_punctuations +from parakeet.frontend.vocab import Vocab # discard opencc untill we find an easy solution to install it on windows # from opencc import OpenCC diff --git a/parakeet/frontend/zh_frontend.py b/parakeet/frontend/zh_frontend.py index 8a0c1668c..04ce235f7 100644 --- a/parakeet/frontend/zh_frontend.py +++ b/parakeet/frontend/zh_frontend.py @@ -22,9 +22,9 @@ from g2pM import G2pM from pypinyin import lazy_pinyin from pypinyin import Style -from parakeet.frontend.zh_normalization.text_normlization import TextNormalizer from parakeet.frontend.generate_lexicon import generate_lexicon from parakeet.frontend.tone_sandhi import ToneSandhi +from parakeet.frontend.zh_normalization.text_normlization import TextNormalizer class Frontend(): diff --git a/parakeet/frontend/zh_normalization/__init__.py b/parakeet/frontend/zh_normalization/__init__.py index 77e10ebb4..1e4940804 100644 --- a/parakeet/frontend/zh_normalization/__init__.py +++ b/parakeet/frontend/zh_normalization/__init__.py @@ -11,5 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- from parakeet.frontend.zh_normalization.text_normlization import * diff --git a/parakeet/models/__init__.py b/parakeet/models/__init__.py index e943def76..4ce90896d 100644 --- a/parakeet/models/__init__.py +++ b/parakeet/models/__init__.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from .fastspeech2 import * from .tacotron2 import * from .transformer_tts import * diff --git a/parakeet/models/fastspeech2/__init__.py b/parakeet/models/fastspeech2/__init__.py index 83479d6f6..52925ef8c 100644 --- a/parakeet/models/fastspeech2/__init__.py +++ b/parakeet/models/fastspeech2/__init__.py @@ -11,6 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from .fastspeech2 import * from .fastspeech2_updater import * diff --git a/parakeet/models/fastspeech2/fastspeech2.py b/parakeet/models/fastspeech2/fastspeech2.py index 019979b95..7c0e20bc2 100644 --- a/parakeet/models/fastspeech2/fastspeech2.py +++ b/parakeet/models/fastspeech2/fastspeech2.py @@ -28,10 +28,10 @@ from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePr from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder -from parakeet.modules.tacotron2.decoder import Postnet from parakeet.modules.nets_utils import initialize from parakeet.modules.nets_utils import make_non_pad_mask from parakeet.modules.nets_utils import make_pad_mask +from parakeet.modules.tacotron2.decoder import Postnet class FastSpeech2(nn.Layer): diff --git a/parakeet/models/fastspeech2/fastspeech2_updater.py 
b/parakeet/models/fastspeech2/fastspeech2_updater.py index 789965f4d..ea23ec2af 100644 --- a/parakeet/models/fastspeech2/fastspeech2_updater.py +++ b/parakeet/models/fastspeech2/fastspeech2_updater.py @@ -14,6 +14,7 @@ import logging from paddle import distributed as dist + from parakeet.models.fastspeech2 import FastSpeech2Loss from parakeet.training.extensions.evaluator import StandardEvaluator from parakeet.training.reporter import report diff --git a/parakeet/models/lstm_speaker_encoder.py b/parakeet/models/lstm_speaker_encoder.py index 3372b2129..f92fddc0e 100644 --- a/parakeet/models/lstm_speaker_encoder.py +++ b/parakeet/models/lstm_speaker_encoder.py @@ -106,10 +106,10 @@ class LSTMSpeakerEncoder(nn.Layer): def do_gradient_ops(self): for p in [self.similarity_weight, self.similarity_bias]: g = p._grad_ivar() g[...] = g * 0.01 def inv_argmax(self, i, num): - return np.eye(1, num, i, dtype=np.int)[0] + return np.eye(1, num, i, dtype=int)[0] def loss(self, embeds): """ diff --git a/parakeet/models/parallel_wavegan/__init__.py b/parakeet/models/parallel_wavegan/__init__.py index 89403c0e0..72322735b 100644 --- a/parakeet/models/parallel_wavegan/__init__.py +++ b/parakeet/models/parallel_wavegan/__init__.py @@ -11,6 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from .parallel_wavegan import * from .parallel_wavegan_updater import * diff --git a/parakeet/models/parallel_wavegan/parallel_wavegan_updater.py b/parakeet/models/parallel_wavegan/parallel_wavegan_updater.py index 68328fb3c..7bd59881d 100644 --- a/parakeet/models/parallel_wavegan/parallel_wavegan_updater.py +++ b/parakeet/models/parallel_wavegan/parallel_wavegan_updater.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # limitations under the License. - import logging from typing import Dict @@ -21,11 +20,12 @@ from paddle.io import DataLoader from paddle.nn import Layer from paddle.optimizer import Optimizer from paddle.optimizer.lr import LRScheduler +from timer import timer + from parakeet.training.extensions.evaluator import StandardEvaluator from parakeet.training.reporter import report from parakeet.training.updaters.standard_updater import StandardUpdater from parakeet.training.updaters.standard_updater import UpdaterState -from timer import timer logging.basicConfig( format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', datefmt='[%Y-%m-%d %H:%M:%S]') diff --git a/parakeet/models/speedyspeech/__init__.py b/parakeet/models/speedyspeech/__init__.py index 6d9b70887..abdac8da4 100644 --- a/parakeet/models/speedyspeech/__init__.py +++ b/parakeet/models/speedyspeech/__init__.py @@ -11,6 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- from .speedyspeech import * from .speedyspeech_updater import * diff --git a/parakeet/models/speedyspeech/speedyspeech_updater.py b/parakeet/models/speedyspeech/speedyspeech_updater.py index 3135d3426..a17c93c79 100644 --- a/parakeet/models/speedyspeech/speedyspeech_updater.py +++ b/parakeet/models/speedyspeech/speedyspeech_updater.py @@ -17,6 +17,7 @@ import paddle from paddle import distributed as dist from paddle.fluid.layers import huber_loss from paddle.nn import functional as F + from parakeet.modules.losses import masked_l1_loss from parakeet.modules.losses import weighted_mean from parakeet.modules.ssim import ssim diff --git a/parakeet/models/transformer_tts/__init__.py b/parakeet/models/transformer_tts/__init__.py index 0456c3006..80a151eca 100644 --- a/parakeet/models/transformer_tts/__init__.py +++ b/parakeet/models/transformer_tts/__init__.py @@ -11,6 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- from .transformer_tts import * from .transformer_tts_updater import * diff --git a/parakeet/models/transformer_tts/transformer_tts.py b/parakeet/models/transformer_tts/transformer_tts.py index 42ab5f867..bb3674f38 100644 --- a/parakeet/models/transformer_tts/transformer_tts.py +++ b/parakeet/models/transformer_tts/transformer_tts.py @@ -15,10 +15,11 @@ from typing import Dict from typing import Sequence from typing import Tuple + import numpy import paddle -from paddle import nn import paddle.nn.functional as F +from paddle import nn from typeguard import check_argument_types from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention @@ -27,13 +28,13 @@ from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncodin from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding from parakeet.modules.fastspeech2_transformer.encoder import Encoder from parakeet.modules.fastspeech2_transformer.mask import subsequent_mask +from parakeet.modules.nets_utils import initialize +from parakeet.modules.nets_utils import make_non_pad_mask +from parakeet.modules.nets_utils import make_pad_mask from parakeet.modules.style_encoder import StyleEncoder from parakeet.modules.tacotron2.decoder import Postnet from parakeet.modules.tacotron2.decoder import Prenet as DecoderPrenet from parakeet.modules.tacotron2.encoder import Encoder as EncoderPrenet -from parakeet.modules.nets_utils import initialize -from parakeet.modules.nets_utils import make_non_pad_mask -from parakeet.modules.nets_utils import make_pad_mask class TransformerTTS(nn.Layer): diff --git a/parakeet/models/transformer_tts/transformer_tts_updater.py b/parakeet/models/transformer_tts/transformer_tts_updater.py index 7e75a8601..4bec47585 100644 --- a/parakeet/models/transformer_tts/transformer_tts_updater.py +++ b/parakeet/models/transformer_tts/transformer_tts_updater.py @@ -16,6 +16,7 @@ from typing import Sequence import paddle from paddle import 
distributed as dist + from parakeet.models.transformer_tts import GuidedMultiHeadAttentionLoss from parakeet.models.transformer_tts import TransformerTTSLoss from parakeet.training.extensions.evaluator import StandardEvaluator diff --git a/parakeet/modules/__init__.py b/parakeet/modules/__init__.py index fd38c0c40..664267895 100644 --- a/parakeet/modules/__init__.py +++ b/parakeet/modules/__init__.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from .attention import * from .conv import * from .geometry import * diff --git a/parakeet/modules/fastspeech2_transformer/decoder.py b/parakeet/modules/fastspeech2_transformer/decoder.py index a41a87c8c..0f09014f6 100644 --- a/parakeet/modules/fastspeech2_transformer/decoder.py +++ b/parakeet/modules/fastspeech2_transformer/decoder.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - # 暂时删除了 dyminic conv """Decoder definition.""" import logging diff --git a/parakeet/modules/fastspeech2_transformer/decoder_layer.py b/parakeet/modules/fastspeech2_transformer/decoder_layer.py index 53328866e..f968051e6 100644 --- a/parakeet/modules/fastspeech2_transformer/decoder_layer.py +++ b/parakeet/modules/fastspeech2_transformer/decoder_layer.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Decoder self-attention layer definition.""" - import paddle from paddle import nn + from parakeet.modules.layer_norm import LayerNorm diff --git a/parakeet/modules/fastspeech2_transformer/lightconv.py b/parakeet/modules/fastspeech2_transformer/lightconv.py index e5f59df18..061168848 100644 --- a/parakeet/modules/fastspeech2_transformer/lightconv.py +++ b/parakeet/modules/fastspeech2_transformer/lightconv.py @@ -12,11 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """Lightweight Convolution Module.""" - import numpy import paddle -from paddle import nn import paddle.nn.functional as F +from paddle import nn from parakeet.modules.glu import GLU from parakeet.modules.masked_fill import masked_fill diff --git a/parakeet/modules/fastspeech2_transformer/mask.py b/parakeet/modules/fastspeech2_transformer/mask.py index 7dbd4d2fa..fd97b0049 100644 --- a/parakeet/modules/fastspeech2_transformer/mask.py +++ b/parakeet/modules/fastspeech2_transformer/mask.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Mask module.""" - import paddle diff --git a/parakeet/modules/style_encoder.py b/parakeet/modules/style_encoder.py index aa94d4ba0..fb27258ce 100644 --- a/parakeet/modules/style_encoder.py +++ b/parakeet/modules/style_encoder.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Style encoder of GST-Tacotron.""" - -from typeguard import check_argument_types from typing import Sequence import paddle from paddle import nn +from typeguard import check_argument_types + from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention diff --git a/parakeet/modules/tacotron2/decoder.py b/parakeet/modules/tacotron2/decoder.py index bf9d7e364..779fd0c62 100644 --- a/parakeet/modules/tacotron2/decoder.py +++ b/parakeet/modules/tacotron2/decoder.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tacotron2 decoder related modules.""" - -import six import paddle.nn.functional as F +import six from paddle import nn diff --git a/parakeet/modules/tacotron2/encoder.py b/parakeet/modules/tacotron2/encoder.py index 1e22b769a..95f71d5e3 100644 --- a/parakeet/modules/tacotron2/encoder.py +++ b/parakeet/modules/tacotron2/encoder.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tacotron2 encoder related modules.""" - -import six - import paddle +import six from paddle import nn diff --git a/parakeet/modules/transformer.py b/parakeet/modules/transformer.py index 696b12b6e..490458bef 100644 --- a/parakeet/modules/transformer.py +++ b/parakeet/modules/transformer.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from paddle import nn -from parakeet.modules import attention as attn from paddle.nn import functional as F +from parakeet.modules import attention as attn + __all__ = [ "PositionwiseFFN", "TransformerEncoderLayer", diff --git a/parakeet/training/__init__.py b/parakeet/training/__init__.py index 277171dee..719e8445d 100644 --- a/parakeet/training/__init__.py +++ b/parakeet/training/__init__.py @@ -11,6 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from .cli import * from .experiment import * diff --git a/parakeet/training/optimizer.py b/parakeet/training/optimizer.py index 1f5496c09..c6a6944d1 100644 --- a/parakeet/training/optimizer.py +++ b/parakeet/training/optimizer.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import paddle optim_classes = dict( diff --git a/parakeet/utils/__init__.py b/parakeet/utils/__init__.py index 9811f201f..ce3a4ef60 100644 --- a/parakeet/utils/__init__.py +++ b/parakeet/utils/__init__.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from . import checkpoint from . import display from . import layer_tools diff --git a/parakeet/utils/profiler.py b/parakeet/utils/profiler.py index e64afd6a0..2bbeb02d1 100644 --- a/parakeet/utils/profiler.py +++ b/parakeet/utils/profiler.py @@ -11,8 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import sys + import paddle # A global variable to record the number of calling times for profiler diff --git a/requirements.txt b/requirements.txt index 2a3f06514..1635fd9c0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,42 +1,42 @@ coverage editdistance +g2p_en +g2pM gpustat +h5py +inflect +jieba jsonlines kaldiio +librosa +llvmlite loguru +matplotlib +nltk +numba +numpy==1.20.0 +pandas +phkit Pillow +praatio~=4.1 pre-commit pybind11 +pypinyin +pyworld resampy==0.2.2 sacrebleu scipy==1.2.1 sentencepiece snakeviz +soundfile~=0.10 sox tensorboardX textgrid +timer tqdm typeguard -visualdl==2.2.0 -yacs -numpy==1.20.0 -numba -nltk -inflect -librosa unidecode -llvmlite -matplotlib -pandas -soundfile~=0.10 -g2p_en -pypinyin +visualdl==2.2.0 webrtcvad -g2pM -praatio~=4.1 -h5py -timer -pyworld -jieba -phkit +yacs yq diff --git a/setup.py b/setup.py index 07b6aac04..be17e0a4f 100644 --- a/setup.py +++ b/setup.py @@ -11,20 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import contextlib +import inspect import io import os -import re +import subprocess as sp import sys from pathlib import Path -import contextlib -import inspect +from setuptools import Command from setuptools import find_packages from setuptools import setup -from setuptools import Command from setuptools.command.develop import develop from setuptools.command.install import install -import subprocess as sp HERE = Path(os.path.abspath(os.path.dirname(__file__))) @@ -40,16 +39,18 @@ def pushd(new_dir): def read(*names, **kwargs): - with io.open(os.path.join(os.path.dirname(__file__), *names), - encoding=kwargs.get("encoding", "utf8")) as fp: + with io.open( + os.path.join(os.path.dirname(__file__), *names), + encoding=kwargs.get("encoding", "utf8")) as fp: return fp.read() def check_call(cmd: str, shell=False, executable=None): try: - sp.check_call(cmd.split(), - shell=shell, - executable="/bin/bash" if shell else executable) + sp.check_call( + cmd.split(), + shell=shell, + executable="/bin/bash" if shell else executable) except sp.CalledProcessError as e: print( f"{__file__}:{inspect.currentframe().f_lineno}: CMD: {cmd}, Error:", @@ -82,7 +83,7 @@ def _post_install(install_lib_dir): tools_extrs_dir = HERE / 'tools/extras' with pushd(tools_extrs_dir): print(os.getcwd()) - check_call(f"./install_autolog.sh") + check_call("./install_autolog.sh") print("autolog install.") # ctcdecoder @@ -189,7 +190,6 @@ setup_info = dict( 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', - ], -) + ], ) setup(**setup_info) diff --git a/utils/json2trn.py b/utils/json2trn.py index 873fde4f7..4adfa491d 100755 --- a/utils/json2trn.py +++ b/utils/json2trn.py @@ -4,7 +4,6 @@ # 2018 Xuankai Chang (Shanghai Jiao Tong University) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) import argparse -import json import logging import sys