From e3075e79170b42cc3ef61ed196c7e8bfdb0594d0 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 28 Jun 2022 09:59:27 +0000 Subject: [PATCH 1/2] install CPython version monotonic_align before train, test=tts --- examples/csmsc/vits/local/synthesize.sh | 2 +- examples/csmsc/vits/local/train.sh | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/csmsc/vits/local/synthesize.sh b/examples/csmsc/vits/local/synthesize.sh index c15d5f99..a4b35ec0 100755 --- a/examples/csmsc/vits/local/synthesize.sh +++ b/examples/csmsc/vits/local/synthesize.sh @@ -15,4 +15,4 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --phones_dict=dump/phone_id_map.txt \ --test_metadata=dump/test/norm/metadata.jsonl \ --output_dir=${train_output_path}/test -fi \ No newline at end of file +fi diff --git a/examples/csmsc/vits/local/train.sh b/examples/csmsc/vits/local/train.sh index 42fff26c..289837a5 100755 --- a/examples/csmsc/vits/local/train.sh +++ b/examples/csmsc/vits/local/train.sh @@ -3,6 +3,11 @@ config_path=$1 train_output_path=$2 +# install monotonic_align +cd ${MAIN_ROOT}/paddlespeech/t2s/models/vits/monotonic_align +python3 setup.py build_ext --inplace +cd - + python3 ${BIN_DIR}/train.py \ --train-metadata=dump/train/norm/metadata.jsonl \ --dev-metadata=dump/dev/norm/metadata.jsonl \ From b2b05a0bc7fdf0f9e015b3fe871a7de57b5e9745 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 29 Jun 2022 07:06:24 +0000 Subject: [PATCH 2/2] add vits ckpt, test=doc --- README.md | 9 +++++++++ README_cn.md | 10 ++++++++++ examples/csmsc/vits/README.md | 31 +++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+) diff --git a/README.md b/README.md index c9d4796c..a81850dc 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ + ([简体中文](./README_cn.md)|English)

@@ -494,6 +495,14 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r ge2e-fastspeech2-aishell3 + + End-to-End + VITS + CSMSC + + VITS-csmsc + + diff --git a/README_cn.md b/README_cn.md index c751b061..7e102f62 100644 --- a/README_cn.md +++ b/README_cn.md @@ -1,3 +1,4 @@ + (简体中文|[English](./README.md))

@@ -480,6 +481,15 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 ge2e-fastspeech2-aishell3 + + + 端到端 + VITS + CSMSC + + VITS-csmsc + + diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md index 0c16840a..5ca57e3a 100644 --- a/examples/csmsc/vits/README.md +++ b/examples/csmsc/vits/README.md @@ -144,3 +144,34 @@ optional arguments: 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model + +The pretrained model can be downloaded here: + +- [vits_csmsc_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/vits/vits_csmsc_ckpt_1.1.0.zip) (add_blank=true) + +The VITS checkpoint contains the files listed below. +```text +vits_csmsc_ckpt_1.1.0 +├── default.yaml # default config used to train vits +├── phone_id_map.txt # phone vocabulary file when training vits +└── snapshot_iter_350000.pdz # model parameters and optimizer states +``` + +ps: This ckpt is not good enough yet; a better one is still being trained. + +You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained VITS. + +```bash +source path.sh +add_blank=true + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/synthesize_e2e.py \ + --config=vits_csmsc_ckpt_1.1.0/default.yaml \ + --ckpt=vits_csmsc_ckpt_1.1.0/snapshot_iter_350000.pdz \ + --phones_dict=vits_csmsc_ckpt_1.1.0/phone_id_map.txt \ + --output_dir=exp/default/test_e2e \ + --text=${BIN_DIR}/../sentences.txt \ + --add-blank=${add_blank} +```