From 2f3ca4ac4809767008f89b0ab24846b2f5e0b983 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Tue, 17 Jan 2023 13:55:18 +0800
Subject: [PATCH 01/42] Update README.md (#2840)

* Update README.md

* Update README_cn.md
---
 README.md    | 4 ++--
 README_cn.md | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 2fb773634..40064f5d2 100644
--- a/README.md
+++ b/README.md
@@ -157,8 +157,8 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
 - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV).
 
 ### Recent Update
-- 🔥 2022.01.10: Add [code-switch asr CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition).
-- 👑 2022.01.06: Add [code-switch asr tal_cs recipe](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/).
+- 🔥 2023.01.10: Add [code-switch asr CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition).
+- 👑 2023.01.06: Add [code-switch asr tal_cs recipe](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/).
 - 🎉 2022.12.02: Add [end-to-end Prosody Prediction pipeline](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (including using prosody labels in Acoustic Model).
 - 🎉 2022.11.30: Add [TTS Android Demo](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid).
 - 🤗 2022.11.28: PP-TTS and PP-ASR demos are available in [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) and [official website
diff --git a/README_cn.md b/README_cn.md
index 53f6a66e4..d2e5f63d7 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -164,8 +164,8 @@
 
 ### 近期更新
-- 🔥 2022.01.10: 新增 [中英混合 ASR CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition).
-- 👑 2022.01.06: 新增 [ASR中英混合 tal_cs 训练推理流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/).
+- 🔥 2023.01.10: 新增 [中英混合 ASR CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition).
+- 👑 2023.01.06: 新增 [ASR中英混合 tal_cs 训练推理流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/).
 - 🎉 2022.12.02: 新增 [端到端韵律预测全流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (包含在声学模型中使用韵律标签)。
 - 🎉 2022.11.30: 新增 [TTS Android 部署示例](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid)。
 - 🤗 2022.11.28: PP-TTS and PP-ASR 示例可在 [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) 和[飞桨官网](https://www.paddlepaddle.org.cn/models)体验！

From 478fd2593e215a65ec641bc1ba831e53d9da7d4b Mon Sep 17 00:00:00 2001
From: Ming
Date: Tue, 17 Jan 2023 17:40:15 +0800
Subject: [PATCH 02/42] update QR Code in README, test=doc (#2841)

---
 README.md    | 2 +-
 README_cn.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 40064f5d2..afc4e4d09 100644
--- a/README.md
+++ b/README.md
@@ -191,7 +191,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
 - Scan the QR code below with your Wechat, you can access to official technical exchange group and get the bonus ( more than 20GB learning materials, such as papers, codes and videos ) and the live link of the lessons. Look forward to your participation.
 
 <div align="center">
-<img ... />
+<img ... />
 </div>
 
 ## Installation
diff --git a/README_cn.md b/README_cn.md
index d2e5f63d7..ecc4644aa 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -202,7 +202,7 @@
 微信扫描二维码关注公众号，点击“马上报名”填写问卷加入官方交流群，获得更高效的问题答疑，与各行各业开发者充分交流，期待您的加入。
 
 <div align="center">
-<img ... />
+<img ... />
 </div>
From 140aed4b545885cdb9a13117e9d1a009466c44ac Mon Sep 17 00:00:00 2001
From: HuangLiangJie
Date: Thu, 19 Jan 2023 16:04:03 +0800
Subject: [PATCH 03/42] [TTS]VITS init sampler reverse, test=tts (#2843)

---
 paddlespeech/t2s/exps/vits/normalize.py  | 2 +-
 paddlespeech/t2s/exps/vits/preprocess.py | 2 +-
 paddlespeech/t2s/exps/vits/train.py      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddlespeech/t2s/exps/vits/normalize.py b/paddlespeech/t2s/exps/vits/normalize.py
index 514cbef8e..24e15765e 100644
--- a/paddlespeech/t2s/exps/vits/normalize.py
+++ b/paddlespeech/t2s/exps/vits/normalize.py
@@ -187,7 +187,7 @@ def main():
                 record["spk_emb"] = str(item["spk_emb"])
 
             output_metadata.append(record)
-    output_metadata.sort(key=itemgetter('feats_lengths'))
+    output_metadata.sort(key=itemgetter('feats_lengths'), reverse=True)
     output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
     with jsonlines.open(output_metadata_path, 'w') as writer:
         for item in output_metadata:
diff --git a/paddlespeech/t2s/exps/vits/preprocess.py b/paddlespeech/t2s/exps/vits/preprocess.py
index 2b1a40834..d6b226a20 100644
--- a/paddlespeech/t2s/exps/vits/preprocess.py
+++ b/paddlespeech/t2s/exps/vits/preprocess.py
@@ -166,7 +166,7 @@ def process_sentences(config,
             if record:
                 results.append(record)
 
-    results.sort(key=itemgetter("feats_lengths"))
+    results.sort(key=itemgetter("feats_lengths"), reverse=True)
     with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
         for item in results:
             writer.write(item)
diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py
index 07301db56..f6a31ced2 100644
--- a/paddlespeech/t2s/exps/vits/train.py
+++ b/paddlespeech/t2s/exps/vits/train.py
@@ -110,7 +110,7 @@ def train_sp(args, config):
     train_sampler = ErnieSATSampler(
         train_dataset,
         batch_size=config.batch_size,
-        shuffle=True,
+        shuffle=False,
         drop_last=True)
     dev_sampler = ErnieSATSampler(
         dev_dataset,

From 2b01e4052559b5c0e1a7d47f4eb1e340a5a1bf1d Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Mon, 30 Jan 2023 13:33:38 +0800
Subject: [PATCH 04/42] =?UTF-8?q?[TTS]soft=20link=20for=20shell=20in=20exa?=
 =?UTF-8?q?mple,=20add=20skip=5Fcopy=5Fwave=20in=20norm=20stage=20of=20G?=
 =?UTF-8?q?=E2=80=A6=20(#2851)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

soft link for shell in example, add skip_copy_wave in norm stage of GANVocoders to save disk
---
 examples/aishell3/tts3/path.sh                       | 14 +---
 examples/aishell3/vc0/path.sh                        | 14 +---
 examples/aishell3/vc1/local/train.sh                 | 14 +---
 examples/aishell3/vc1/path.sh                        | 14 +---
 examples/aishell3/vc2/local/synthesize.sh            | 21 +-----
 examples/aishell3/vc2/local/train.sh                 | 14 +---
 examples/aishell3/vc2/path.sh                        | 14 +---
 examples/aishell3/voc1/local/preprocess.sh           | 10 ++-
 examples/aishell3/voc1/local/synthesize.sh           | 15 +----
 examples/aishell3/voc1/local/train.sh                | 14 +---
 examples/aishell3/voc1/path.sh                       | 14 +---
 examples/aishell3/voc5/local/preprocess.sh           | 56 +---------------
 examples/aishell3/voc5/local/synthesize.sh           | 15 +----
 examples/aishell3/voc5/local/train.sh                | 14 +---
 examples/aishell3/voc5/path.sh                       | 14 +---
 examples/aishell3_vctk/ernie_sat/local/synthesize.sh | 26 +-------
 examples/aishell3_vctk/ernie_sat/local/train.sh      | 13 +---
 examples/aishell3_vctk/ernie_sat/path.sh             | 14 +---
 examples/csmsc/voc1/local/preprocess.sh              | 10 ++-
 examples/csmsc/voc3/finetune.sh                      | 65 +------------------
 examples/csmsc/voc3/local/preprocess.sh              | 56 +---------------
 examples/csmsc/voc3/local/train.sh                   | 14 +---
 examples/csmsc/voc4/local/preprocess.sh              |
56 +--------------- examples/csmsc/voc4/local/train.sh | 14 +--- examples/csmsc/voc5/finetune.sh | 9 ++- examples/csmsc/voc5/local/preprocess.sh | 56 +--------------- examples/csmsc/voc5/local/train.sh | 14 +--- examples/csmsc/voc6/local/preprocess.sh | 10 ++- examples/csmsc/voc6/local/train.sh | 14 +--- examples/ljspeech/tts0/local/train.sh | 13 +--- examples/ljspeech/tts0/path.sh | 14 +--- examples/ljspeech/tts3/local/train.sh | 13 +--- examples/ljspeech/tts3/path.sh | 14 +--- examples/ljspeech/voc1/local/preprocess.sh | 10 ++- examples/ljspeech/voc1/local/synthesize.sh | 15 +---- examples/ljspeech/voc1/local/train.sh | 14 +--- examples/ljspeech/voc1/path.sh | 14 +--- examples/ljspeech/voc5/local/preprocess.sh | 56 +--------------- examples/ljspeech/voc5/local/synthesize.sh | 15 +---- examples/ljspeech/voc5/local/train.sh | 14 +--- examples/ljspeech/voc5/path.sh | 14 +--- examples/vctk/ernie_sat/local/train.sh | 13 +--- examples/vctk/ernie_sat/path.sh | 14 +--- examples/vctk/tts3/local/train.sh | 14 +--- examples/vctk/tts3/path.sh | 14 +--- examples/vctk/voc1/local/preprocess.sh | 10 ++- examples/vctk/voc1/local/synthesize.sh | 15 +---- examples/vctk/voc1/local/train.sh | 14 +--- examples/vctk/voc1/path.sh | 14 +--- examples/vctk/voc5/local/preprocess.sh | 56 +--------------- examples/vctk/voc5/local/synthesize.sh | 15 +---- examples/vctk/voc5/local/train.sh | 14 +--- examples/vctk/voc5/path.sh | 14 +--- examples/zh_en_tts/tts3/local/train.sh | 14 +--- examples/zh_en_tts/tts3/path.sh | 14 +--- 55 files changed, 90 insertions(+), 979 deletions(-) mode change 100755 => 120000 examples/aishell3/tts3/path.sh mode change 100755 => 120000 examples/aishell3/vc0/path.sh mode change 100755 => 120000 examples/aishell3/vc1/local/train.sh mode change 100755 => 120000 examples/aishell3/vc1/path.sh mode change 100755 => 120000 examples/aishell3/vc2/local/synthesize.sh mode change 100755 => 120000 examples/aishell3/vc2/local/train.sh mode change 100755 => 120000 examples/aishell3/vc2/path.sh mode change 100755 => 120000 examples/aishell3/voc1/local/synthesize.sh mode change 100755 => 120000 examples/aishell3/voc1/local/train.sh mode change 100755 => 120000 examples/aishell3/voc1/path.sh mode change 100755 => 120000 examples/aishell3/voc5/local/preprocess.sh mode change 100755 => 120000 examples/aishell3/voc5/local/synthesize.sh mode change 100755 => 120000 examples/aishell3/voc5/local/train.sh mode change 100755 => 120000 examples/aishell3/voc5/path.sh mode change 100755 => 120000 examples/aishell3_vctk/ernie_sat/local/synthesize.sh mode change 100755 => 120000 examples/aishell3_vctk/ernie_sat/local/train.sh mode change 100755 => 120000 examples/aishell3_vctk/ernie_sat/path.sh mode change 100755 => 120000 examples/csmsc/voc3/finetune.sh mode change 100755 => 120000 examples/csmsc/voc3/local/preprocess.sh mode change 100755 => 120000 examples/csmsc/voc3/local/train.sh mode change 100755 => 120000 examples/csmsc/voc4/local/preprocess.sh mode change 100755 => 120000 examples/csmsc/voc4/local/train.sh mode change 100755 => 120000 examples/csmsc/voc5/local/preprocess.sh mode change 100755 => 120000 examples/csmsc/voc5/local/train.sh mode change 100755 => 120000 examples/csmsc/voc6/local/train.sh mode change 100755 => 120000 examples/ljspeech/tts0/local/train.sh mode change 100755 => 120000 examples/ljspeech/tts0/path.sh mode change 100755 => 120000 examples/ljspeech/tts3/local/train.sh mode change 100755 => 120000 examples/ljspeech/tts3/path.sh mode change 100755 => 120000 
examples/ljspeech/voc1/local/synthesize.sh mode change 100755 => 120000 examples/ljspeech/voc1/local/train.sh mode change 100755 => 120000 examples/ljspeech/voc1/path.sh mode change 100755 => 120000 examples/ljspeech/voc5/local/preprocess.sh mode change 100755 => 120000 examples/ljspeech/voc5/local/synthesize.sh mode change 100755 => 120000 examples/ljspeech/voc5/local/train.sh mode change 100755 => 120000 examples/ljspeech/voc5/path.sh mode change 100755 => 120000 examples/vctk/ernie_sat/local/train.sh mode change 100755 => 120000 examples/vctk/ernie_sat/path.sh mode change 100755 => 120000 examples/vctk/tts3/local/train.sh mode change 100755 => 120000 examples/vctk/tts3/path.sh mode change 100755 => 120000 examples/vctk/voc1/local/synthesize.sh mode change 100755 => 120000 examples/vctk/voc1/local/train.sh mode change 100755 => 120000 examples/vctk/voc1/path.sh mode change 100755 => 120000 examples/vctk/voc5/local/preprocess.sh mode change 100755 => 120000 examples/vctk/voc5/local/synthesize.sh mode change 100755 => 120000 examples/vctk/voc5/local/train.sh mode change 100755 => 120000 examples/vctk/voc5/path.sh mode change 100755 => 120000 examples/zh_en_tts/tts3/local/train.sh mode change 100755 => 120000 examples/zh_en_tts/tts3/path.sh diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh deleted file mode 100755 index a37cd21e3..000000000 --- a/examples/aishell3/vc0/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=tacotron2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh new file mode 120000 index 000000000..9e1fdbd16 --- /dev/null +++ b/examples/aishell3/vc0/path.sh @@ -0,0 +1 @@ +../../csmsc/tts0/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc1/local/train.sh b/examples/aishell3/vc1/local/train.sh deleted file mode 100755 index c775fcadc..000000000 --- a/examples/aishell3/vc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --voice-cloning=True \ No newline at end of file diff --git a/examples/aishell3/vc1/local/train.sh 
b/examples/aishell3/vc1/local/train.sh new file mode 120000 index 000000000..115a0b8dc --- /dev/null +++ b/examples/aishell3/vc1/local/train.sh @@ -0,0 +1 @@ +../../vc0/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/vc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/vc1/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/local/synthesize.sh b/examples/aishell3/vc2/local/synthesize.sh deleted file mode 100755 index 8fd8977d3..000000000 --- a/examples/aishell3/vc2/local/synthesize.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -python3 ${BIN_DIR}/../synthesize.py \ - --am=fastspeech2_aishell3 \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=pwgan_aishell3 \ - --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \ - --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ - --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt \ - --speaker_dict=dump/speaker_id_map.txt \ - --voice-cloning=True diff --git a/examples/aishell3/vc2/local/synthesize.sh b/examples/aishell3/vc2/local/synthesize.sh new file mode 120000 index 000000000..ca8df6b04 --- /dev/null +++ b/examples/aishell3/vc2/local/synthesize.sh @@ -0,0 +1 @@ +../../vc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/local/train.sh b/examples/aishell3/vc2/local/train.sh deleted file mode 100755 index c775fcadc..000000000 --- a/examples/aishell3/vc2/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --voice-cloning=True \ No newline at end of file diff --git a/examples/aishell3/vc2/local/train.sh b/examples/aishell3/vc2/local/train.sh new file mode 120000 index 000000000..115a0b8dc --- /dev/null +++ b/examples/aishell3/vc2/local/train.sh @@ -0,0 +1 @@ +../../vc0/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/path.sh b/examples/aishell3/vc2/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/vc2/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - 
-MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc2/path.sh b/examples/aishell3/vc2/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/vc2/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/local/preprocess.sh b/examples/aishell3/voc1/local/preprocess.sh index 44cc3dbe4..71eab68ad 100755 --- a/examples/aishell3/voc1/local/preprocess.sh +++ b/examples/aishell3/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/aishell3/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/aishell3/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/aishell3/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/aishell3/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/aishell3/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export 
BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/aishell3/voc1/path.sh @@ -0,0 +1 @@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh deleted file mode 100755 index 44cc3dbe4..000000000 --- a/examples/aishell3/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./aishell3_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/data_aishell3/ \ - --dataset=aishell3 \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/aishell3/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/aishell3/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/aishell3/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/aishell3/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - 
-config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/aishell3/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/aishell3/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/aishell3/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh deleted file mode 100755 index 8b4178f13..000000000 --- a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -stage=0 -stop_stage=0 - -# hifigan -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - FLAGS_allocator_strategy=naive_best_fit \ - FLAGS_fraction_of_gpu_memory_to_use=0.01 \ - python3 ${BIN_DIR}/synthesize.py \ - --erniesat_config=${config_path} \ - --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --erniesat_stat=dump/train/speech_stats.npy \ - --voc=hifigan_aishell3 \ - --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \ - --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \ - --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt -fi diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh new file mode 120000 index 000000000..5703dcb2c --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/local/train.sh b/examples/aishell3_vctk/ernie_sat/local/train.sh deleted file mode 100755 index 526aac435..000000000 --- a/examples/aishell3_vctk/ernie_sat/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=8 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/local/train.sh b/examples/aishell3_vctk/ernie_sat/local/train.sh new file mode 120000 index 
000000000..9f1d2346d --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/train.sh \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/path.sh b/examples/aishell3_vctk/ernie_sat/path.sh deleted file mode 100755 index 4ecab0251..000000000 --- a/examples/aishell3_vctk/ernie_sat/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=ernie_sat -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/path.sh b/examples/aishell3_vctk/ernie_sat/path.sh new file mode 120000 index 000000000..5ec397590 --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/path.sh @@ -0,0 +1 @@ +../../aishell3/ernie_sat/path.sh \ No newline at end of file diff --git a/examples/csmsc/voc1/local/preprocess.sh b/examples/csmsc/voc1/local/preprocess.sh index 61d6d62be..62d0717b9 100755 --- a/examples/csmsc/voc1/local/preprocess.sh +++ b/examples/csmsc/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh deleted file mode 100755 index 6719bd0be..000000000 --- a/examples/csmsc/voc3/finetune.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -source path.sh - -gpus=0 -stage=0 -stop_stage=100 - -source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \ - --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ - --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ - --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ - --dur-file=durations.txt \ - --output-dir=dump_finetune \ - --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \ - --dataset=baker \ - --rootdir=~/datasets/BZNSYP/ -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - python3 ${MAIN_ROOT}/utils/link_wav.py \ - --old-dump-dir=dump \ - --dump-dir=dump_finetune -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - cp dump/train/feats_stats.npy dump_finetune/train/ -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/train/raw/metadata.jsonl \ - --dumpdir=dump_finetune/train/norm \ - --stats=dump_finetune/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/dev/raw/metadata.jsonl \ - --dumpdir=dump_finetune/dev/norm \ - --stats=dump_finetune/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/test/raw/metadata.jsonl \ - --dumpdir=dump_finetune/test/norm \ - --stats=dump_finetune/train/feats_stats.npy -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - CUDA_VISIBLE_DEVICES=${gpus} \ - FLAGS_cudnn_exhaustive_search=true \ - FLAGS_conv_workspace_size_limit=4000 \ - python ${BIN_DIR}/train.py \ - --train-metadata=dump_finetune/train/norm/metadata.jsonl \ - --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \ - --config=conf/finetune.yaml \ - --output-dir=exp/finetune \ - --ngpu=1 -fi \ No newline at end of file diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh new file mode 120000 index 000000000..b6fa868e2 --- /dev/null +++ b/examples/csmsc/voc3/finetune.sh @@ -0,0 +1 @@ +../voc5/finetune.sh \ No newline at end of file diff --git a/examples/csmsc/voc3/local/preprocess.sh b/examples/csmsc/voc3/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc3/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc3/local/preprocess.sh b/examples/csmsc/voc3/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc3/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc3/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc4/local/preprocess.sh b/examples/csmsc/voc4/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc4/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc4/local/preprocess.sh b/examples/csmsc/voc4/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc4/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc4/local/train.sh b/examples/csmsc/voc4/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc4/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc4/local/train.sh b/examples/csmsc/voc4/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc4/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc5/finetune.sh b/examples/csmsc/voc5/finetune.sh index 6719bd0be..eb8325aeb 100755 --- a/examples/csmsc/voc5/finetune.sh +++ b/examples/csmsc/voc5/finetune.sh @@ -39,16 +39,19 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/train/raw/metadata.jsonl \ --dumpdir=dump_finetune/train/norm \ - --stats=dump_finetune/train/feats_stats.npy + --stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/dev/raw/metadata.jsonl \ --dumpdir=dump_finetune/dev/norm \ - --stats=dump_finetune/train/feats_stats.npy + --stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/test/raw/metadata.jsonl \ --dumpdir=dump_finetune/test/norm \ - --stats=dump_finetune/train/feats_stats.npy + --stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/csmsc/voc5/local/preprocess.sh b/examples/csmsc/voc5/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." 
- python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc5/local/preprocess.sh b/examples/csmsc/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc5/local/train.sh b/examples/csmsc/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc5/local/train.sh b/examples/csmsc/voc5/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc5/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc6/local/preprocess.sh b/examples/csmsc/voc6/local/preprocess.sh index 2dcc39ac7..509824b8e 100755 --- a/examples/csmsc/voc6/local/preprocess.sh +++ b/examples/csmsc/voc6/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc6/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - 
--config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc6/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh deleted file mode 100755 index f90db9150..000000000 --- a/examples/ljspeech/tts0/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh new file mode 120000 index 000000000..7f54e9239 --- /dev/null +++ b/examples/ljspeech/tts0/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/tts0/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh deleted file mode 100755 index a37cd21e3..000000000 --- a/examples/ljspeech/tts0/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=tacotron2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh new file mode 120000 index 000000000..9e1fdbd16 --- /dev/null +++ b/examples/ljspeech/tts0/path.sh @@ -0,0 +1 @@ +../../csmsc/tts0/path.sh \ No newline at end of file diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh deleted file mode 100755 index d1302f99f..000000000 --- a/examples/ljspeech/tts3/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh new file mode 120000 index 000000000..d7b05058e --- /dev/null +++ b/examples/ljspeech/tts3/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/tts3/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/ljspeech/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/ljspeech/tts3/path.sh @@ -0,0 +1 
@@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/local/preprocess.sh b/examples/ljspeech/voc1/local/preprocess.sh index d1af60dad..bfbf75b7d 100755 --- a/examples/ljspeech/voc1/local/preprocess.sh +++ b/examples/ljspeech/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/ljspeech/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/ljspeech/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/ljspeech/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/ljspeech/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/path.sh b/examples/ljspeech/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/ljspeech/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/ljspeech/voc1/path.sh b/examples/ljspeech/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/ljspeech/voc1/path.sh @@ -0,0 +1 
@@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/preprocess.sh b/examples/ljspeech/voc5/local/preprocess.sh deleted file mode 100755 index d1af60dad..000000000 --- a/examples/ljspeech/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./ljspeech_alignment \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/LJSpeech-1.1/ \ - --dataset=ljspeech \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/ljspeech/voc5/local/preprocess.sh b/examples/ljspeech/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/ljspeech/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/synthesize.sh b/examples/ljspeech/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/ljspeech/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git a/examples/ljspeech/voc5/local/synthesize.sh b/examples/ljspeech/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/ljspeech/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/train.sh b/examples/ljspeech/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/ljspeech/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - 
--output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/ljspeech/voc5/local/train.sh b/examples/ljspeech/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/ljspeech/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/path.sh b/examples/ljspeech/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/ljspeech/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/ljspeech/voc5/path.sh b/examples/ljspeech/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/ljspeech/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/vctk/ernie_sat/local/train.sh b/examples/vctk/ernie_sat/local/train.sh deleted file mode 100755 index 526aac435..000000000 --- a/examples/vctk/ernie_sat/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=8 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/vctk/ernie_sat/local/train.sh b/examples/vctk/ernie_sat/local/train.sh new file mode 120000 index 000000000..9f1d2346d --- /dev/null +++ b/examples/vctk/ernie_sat/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/train.sh \ No newline at end of file diff --git a/examples/vctk/ernie_sat/path.sh b/examples/vctk/ernie_sat/path.sh deleted file mode 100755 index 4ecab0251..000000000 --- a/examples/vctk/ernie_sat/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=ernie_sat -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} \ No newline at end of file diff --git a/examples/vctk/ernie_sat/path.sh b/examples/vctk/ernie_sat/path.sh new file mode 120000 index 000000000..5ec397590 --- /dev/null +++ b/examples/vctk/ernie_sat/path.sh @@ -0,0 +1 @@ +../../aishell3/ernie_sat/path.sh \ No newline at end of file diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh deleted file mode 100755 index 3a5076505..000000000 --- a/examples/vctk/tts3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt \ - --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh new file mode 120000 index 000000000..78885a300 
--- /dev/null +++ b/examples/vctk/tts3/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/tts3/local/train.sh \ No newline at end of file diff --git a/examples/vctk/tts3/path.sh b/examples/vctk/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/vctk/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/vctk/tts3/path.sh b/examples/vctk/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/vctk/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/vctk/voc1/local/preprocess.sh b/examples/vctk/voc1/local/preprocess.sh index 88a478cd5..6b7e5288a 100755 --- a/examples/vctk/voc1/local/preprocess.sh +++ b/examples/vctk/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/vctk/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/vctk/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/vctk/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/vctk/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of 
file diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/vctk/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/vctk/voc1/path.sh @@ -0,0 +1 @@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh deleted file mode 100755 index 88a478cd5..000000000 --- a/examples/vctk/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./vctk_alignment \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/VCTK-Corpus-0.92/ \ - --dataset=vctk \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/vctk/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/vctk/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/vctk/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/vctk/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/vctk/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/vctk/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/vctk/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/zh_en_tts/tts3/local/train.sh b/examples/zh_en_tts/tts3/local/train.sh deleted file mode 100755 index 1da72f117..000000000 --- a/examples/zh_en_tts/tts3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 
${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/zh_en_tts/tts3/local/train.sh b/examples/zh_en_tts/tts3/local/train.sh new file mode 120000 index 000000000..78885a300 --- /dev/null +++ b/examples/zh_en_tts/tts3/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/tts3/local/train.sh \ No newline at end of file diff --git a/examples/zh_en_tts/tts3/path.sh b/examples/zh_en_tts/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/zh_en_tts/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/zh_en_tts/tts3/path.sh b/examples/zh_en_tts/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/zh_en_tts/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file From 31c2c226cacf88281332e61bd03bb863b1c1e9cf Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Mon, 30 Jan 2023 19:11:02 +0800 Subject: [PATCH 05/42] clean fluid elementwise_max and square api. (#2852) --- paddlespeech/s2t/training/gradclip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/s2t/training/gradclip.py b/paddlespeech/s2t/training/gradclip.py index 26ac501e2..b2c0500d3 100644 --- a/paddlespeech/s2t/training/gradclip.py +++ b/paddlespeech/s2t/training/gradclip.py @@ -43,7 +43,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): if g.type == core.VarDesc.VarType.SELECTED_ROWS: merge_grad = layers.merge_selected_rows(g) merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - square = layers.square(merge_grad) + square = paddle.square(merge_grad) sum_square = layers.reduce_sum(square) sum_square_list.append(sum_square) @@ -66,7 +66,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) clip_var = layers.elementwise_div( x=max_global_norm, - y=layers.elementwise_max(x=global_norm_var, y=max_global_norm)) + y=paddle.maximum(x=global_norm_var, y=max_global_norm)) for i, (p, g) in enumerate(params_grads): if g is None: continue From b5764e9f74665babfdd922189560ba269c072635 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 30 Jan 2023 19:17:37 +0800 Subject: [PATCH 06/42] [Install]rm protobuf in setup.py (#2853) * rm protobuf in setup.py && rm audio's dependances in setup.py --- audio/setup.py | 2 +- docs/requirements.txt | 4 +--- setup.py | 6 +----- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/audio/setup.py b/audio/setup.py index 82e9a55a5..d36b2c440 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -43,7 +43,7 @@ base = [ "scipy>=1.0.0", "soundfile~=0.10", "colorlog", - "pathos == 0.2.8", + "pathos==0.2.8", "pybind11", "parameterized", "tqdm", diff --git a/docs/requirements.txt b/docs/requirements.txt index c6228d917..5422c26f9 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,4 @@ braceexpand -colorlog editdistance 
fastapi g2p_en @@ -16,7 +15,7 @@ matplotlib myst-parser nara_wpe numpydoc -onnxruntime==1.10.0 +onnxruntime>=1.11.0 opencc paddlenlp # use paddlepaddle == 2.3.* according to: https://github.com/PaddlePaddle/Paddle/issues/48243 @@ -24,7 +23,6 @@ paddlepaddle>=2.2.2,<2.4.0 paddlespeech_ctcdecoders paddlespeech_feat pandas -pathos==0.2.8 pattern_singleton Pillow>=9.0.0 ppdiffusers>=0.9.0 diff --git a/setup.py b/setup.py index 212d3b109..be6cf63a9 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,6 @@ base = [ "paddlespeech_feat", "Pillow>=9.0.0", "praatio==5.0.0", - "protobuf>=3.1.0, <=3.20.0", "pypinyin<=0.44.0", "pypinyin-dict", "python-dateutil", @@ -72,12 +71,9 @@ base = [ "yacs~=0.1.8", "prettytable", "zhon", - "colorlog", - "pathos==0.2.8", "braceexpand", "pyyaml", - "pybind11", - "paddleslim==2.3.4", + "paddleslim>=2.3.4", "paddleaudio>=1.0.2", ] From 64aeb6dccc73a262bab9f9ed2a1b8c7b15a30582 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Tue, 31 Jan 2023 19:52:45 +0800 Subject: [PATCH 07/42] remove some fluid api (elementwise_div elementwise_mul sqrt reduce_sum). (#2859) --- paddlespeech/s2t/training/gradclip.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/paddlespeech/s2t/training/gradclip.py b/paddlespeech/s2t/training/gradclip.py index b2c0500d3..be6fcf589 100644 --- a/paddlespeech/s2t/training/gradclip.py +++ b/paddlespeech/s2t/training/gradclip.py @@ -44,7 +44,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): merge_grad = layers.merge_selected_rows(g) merge_grad = layers.get_tensor_from_selected_rows(merge_grad) square = paddle.square(merge_grad) - sum_square = layers.reduce_sum(square) + sum_square = paddle.sum(square) sum_square_list.append(sum_square) # debug log, not dump all since slow down train process @@ -57,14 +57,15 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): return params_grads global_norm_var = layers.concat(sum_square_list) - global_norm_var = layers.reduce_sum(global_norm_var) - global_norm_var = layers.sqrt(global_norm_var) + global_norm_var = paddle.sum(global_norm_var) + global_norm_var = paddle.sqrt(global_norm_var) + # debug log logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!") max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) - clip_var = layers.elementwise_div( + clip_var = paddle.divide( x=max_global_norm, y=paddle.maximum(x=global_norm_var, y=max_global_norm)) for i, (p, g) in enumerate(params_grads): @@ -73,7 +74,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue - new_grad = layers.elementwise_mul(x=g, y=clip_var) + new_grad = paddle.multiply(x=g, y=clip_var) params_and_grads.append((p, new_grad)) # debug log, not dump all since slow down train process From 2f526c093cac230493f1ae399fa7182f73d588d3 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 1 Feb 2023 14:06:46 +0800 Subject: [PATCH 08/42] fix data for slim (#2862) --- examples/csmsc/tts3/local/PTQ_static.sh | 2 +- examples/csmsc/voc1/local/PTQ_static.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/csmsc/tts3/local/PTQ_static.sh b/examples/csmsc/tts3/local/PTQ_static.sh index a70a77b58..c6dce53cb 100755 --- a/examples/csmsc/tts3/local/PTQ_static.sh +++ b/examples/csmsc/tts3/local/PTQ_static.sh @@ -5,4 +5,4 @@ python3 ${BIN_DIR}/../PTQ_static.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --inference_dir 
${train_output_path}/inference \ --model_name ${model_name} \ - --onnx_forma=True \ No newline at end of file + --onnx_format=True \ No newline at end of file diff --git a/examples/csmsc/voc1/local/PTQ_static.sh b/examples/csmsc/voc1/local/PTQ_static.sh index 2e5166141..c85ebd109 100755 --- a/examples/csmsc/voc1/local/PTQ_static.sh +++ b/examples/csmsc/voc1/local/PTQ_static.sh @@ -2,7 +2,7 @@ train_output_path=$1 model_name=$2 python3 ${BIN_DIR}/../../PTQ_static.py \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ + --dev-metadata=dump/dev/raw/metadata.jsonl \ --inference_dir ${train_output_path}/inference \ --model_name ${model_name} \ --onnx_format=True \ No newline at end of file From ac3ed3c5a8a4e81ad662b8c41efa562f415dad7b Mon Sep 17 00:00:00 2001 From: QuanZ9 <31169290+QuanZ9@users.noreply.github.com> Date: Wed, 1 Feb 2023 15:55:52 +0800 Subject: [PATCH 09/42] Update zh_frontend.py (#2863) --- paddlespeech/t2s/frontend/zh_frontend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index ddd8cf5c7..efb673e36 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -138,7 +138,7 @@ class Frontend(): "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿", "流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿", "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿", - "狗儿" + "狗儿", "少儿" } self.vocab_phones = {} From 896da6dcd152b6241f606343dfa5ee6ec4932df5 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 1 Feb 2023 18:25:00 +0800 Subject: [PATCH 10/42] remove utils and third_party in paddlespeech's site-packages (#2867) --- audio/setup.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/audio/setup.py b/audio/setup.py index d36b2c440..6e358346c 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -273,7 +273,7 @@ def main(): }, # Package info - packages=find_packages(include=('paddleaudio*')), + packages=find_packages(include=['paddleaudio*']), package_data=lib_package_data, ext_modules=setup_helpers.get_ext_modules(), zip_safe=True, diff --git a/setup.py b/setup.py index be6cf63a9..2c97ce783 100644 --- a/setup.py +++ b/setup.py @@ -300,7 +300,7 @@ setup_info = dict( }, # Package info - packages=find_packages(include=('paddlespeech*')), + packages=find_packages(include=['paddlespeech*'], exclude=['utils', 'third_party']), zip_safe=True, classifiers=[ 'Development Status :: 5 - Production/Stable', From a55fd2e55685236c34330e0ba01e98878fc5b8cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=89=BE=E6=A2=A6?= Date: Thu, 2 Feb 2023 13:03:41 +0800 Subject: [PATCH 11/42] [TTS]Fix diffusion wavenet denoiser final conv init param (#2868) * add diffusion module for training diffsinger * add wavenet denoiser final conv initializer --- paddlespeech/t2s/modules/diffusion.py | 34 +++++++++++---------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/paddlespeech/t2s/modules/diffusion.py b/paddlespeech/t2s/modules/diffusion.py index 52fe84ceb..eb67ffb0d 100644 --- a/paddlespeech/t2s/modules/diffusion.py +++ b/paddlespeech/t2s/modules/diffusion.py @@ -40,7 +40,7 @@ class WaveNetDenoiser(nn.Layer): layers (int, optional): Number of residual blocks inside, by default 20 stacks (int, optional): - The number of groups to split the residual blocks into, by default 4 + The number of groups to split the residual blocks into, by default 5 Within each group, the dilation of the residual 
block grows exponentially. residual_channels (int, optional): Residual channel of the residual blocks, by default 256 @@ -64,7 +64,7 @@ class WaveNetDenoiser(nn.Layer): out_channels: int=80, kernel_size: int=3, layers: int=20, - stacks: int=4, + stacks: int=5, residual_channels: int=256, gate_channels: int=512, skip_channels: int=256, @@ -72,7 +72,7 @@ class WaveNetDenoiser(nn.Layer): dropout: float=0., bias: bool=True, use_weight_norm: bool=False, - init_type: str="kaiming_uniform", ): + init_type: str="kaiming_normal", ): super().__init__() # initialize parameters @@ -118,18 +118,15 @@ class WaveNetDenoiser(nn.Layer): bias=bias) self.conv_layers.append(conv) + final_conv = nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True) + nn.initializer.Constant(0.0)(final_conv.weight) self.last_conv_layers = nn.Sequential(nn.ReLU(), nn.Conv1D( skip_channels, skip_channels, 1, bias_attr=True), - nn.ReLU(), - nn.Conv1D( - skip_channels, - out_channels, - 1, - bias_attr=True)) + nn.ReLU(), final_conv) if use_weight_norm: self.apply_weight_norm() @@ -200,10 +197,6 @@ class GaussianDiffusion(nn.Layer): Args: denoiser (Layer, optional): The model used for denoising noises. - In fact, the denoiser model performs the operation - of producing a output with more noises from the noisy input. - Then we use the diffusion algorithm to calculate - the input with the output to get the denoised result. num_train_timesteps (int, optional): The number of timesteps between the noise and the real during training, by default 1000. beta_start (float, optional): @@ -233,7 +226,8 @@ class GaussianDiffusion(nn.Layer): >>> def callback(index, timestep, num_timesteps, sample): >>> nonlocal pbar >>> if pbar is None: - >>> pbar = tqdm(total=num_timesteps-index) + >>> pbar = tqdm(total=num_timesteps) + >>> pbar.update(index) >>> pbar.update() >>> >>> return callback @@ -247,7 +241,7 @@ class GaussianDiffusion(nn.Layer): >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> with paddle.no_grad(): >>> sample = diffusion.inference( - >>> paddle.randn(x.shape), c, x, + >>> paddle.randn(x.shape), c, ref_x=x_in, >>> num_inference_steps=infer_steps, >>> scheduler_type=scheduler_type, >>> callback=create_progress_callback()) @@ -262,7 +256,7 @@ class GaussianDiffusion(nn.Layer): >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> with paddle.no_grad(): >>> sample = diffusion.inference( - >>> paddle.randn(x.shape), c, x_in, + >>> paddle.randn(x.shape), c, ref_x=x_in, >>> num_inference_steps=infer_steps, >>> scheduler_type=scheduler_type, >>> callback=create_progress_callback()) @@ -277,11 +271,11 @@ class GaussianDiffusion(nn.Layer): >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> with paddle.no_grad(): >>> sample = diffusion.inference( - >>> paddle.randn(x.shape), c, None, + >>> paddle.randn(x.shape), c, ref_x=x_in, >>> num_inference_steps=infer_steps, >>> scheduler_type=scheduler_type, >>> callback=create_progress_callback()) - 100%|█████| 25/25 [00:01<00:00, 19.75it/s] + 100%|█████| 34/34 [00:01<00:00, 19.75it/s] >>> >>> # ds=1000, K_step=100, scheduler=pndm, infer_step=50, from aux fs2 mel output >>> ds = 1000 @@ -292,11 +286,11 @@ class GaussianDiffusion(nn.Layer): >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> with paddle.no_grad(): >>> sample = diffusion.inference( - >>> paddle.randn(x.shape), c, x, + >>> paddle.randn(x.shape), c, 
ref_x=x_in, >>> num_inference_steps=infer_steps, >>> scheduler_type=scheduler_type, >>> callback=create_progress_callback()) - 100%|█████| 5/5 [00:00<00:00, 23.80it/s] + 100%|█████| 14/14 [00:00<00:00, 23.80it/s] """ From a283f8a57e8bbc411bd36f2e0d8df3e0780a1c0e Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 2 Feb 2023 13:04:20 +0800 Subject: [PATCH 12/42] [TTS]fix open encoding (#2865) --- paddlespeech/cli/tts/infer.py | 6 +++--- paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py | 2 +- paddlespeech/t2s/exps/ernie_sat/train.py | 2 +- paddlespeech/t2s/exps/fastspeech2/train.py | 4 ++-- paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py | 6 +++--- paddlespeech/t2s/exps/speedyspeech/train.py | 6 +++--- paddlespeech/t2s/exps/syn_utils.py | 8 ++++---- paddlespeech/t2s/exps/tacotron2/train.py | 2 +- paddlespeech/t2s/exps/transformer_tts/train.py | 2 +- paddlespeech/t2s/exps/vits/train.py | 4 ++-- paddlespeech/t2s/frontend/phonectic.py | 2 +- paddlespeech/t2s/frontend/zh_frontend.py | 4 ++-- 12 files changed, 24 insertions(+), 24 deletions(-) diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index 707518c05..5515ade26 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -292,19 +292,19 @@ class TTSExecutor(BaseExecutor): with open(self.voc_config) as f: self.voc_config = CfgNode(yaml.safe_load(f)) - with open(self.phones_dict, "r") as f: + with open(self.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) tone_size = None if self.tones_dict: - with open(self.tones_dict, "r") as f: + with open(self.tones_dict, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) spk_num = None if self.speaker_dict: - with open(self.speaker_dict, 'rt') as f: + with open(self.speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py index e450aa1a0..c43dafb3c 100644 --- a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py @@ -437,7 +437,7 @@ if __name__ == '__main__': vocab_phones = {} - with open(args.phones_dict, 'rt') as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: vocab_phones[phn] = int(id) diff --git a/paddlespeech/t2s/exps/ernie_sat/train.py b/paddlespeech/t2s/exps/ernie_sat/train.py index 75a666bb1..c98d691be 100644 --- a/paddlespeech/t2s/exps/ernie_sat/train.py +++ b/paddlespeech/t2s/exps/ernie_sat/train.py @@ -109,7 +109,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py index d31e62a82..97626db0b 100644 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -67,7 +67,7 @@ def train_sp(args, config): if args.speaker_dict is not None: print("multiple speaker fastspeech2!") collate_fn = fastspeech2_multi_spk_batch_fn - with open(args.speaker_dict, 'rt') as f: + with open(args.speaker_dict, 
'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) fields += ["spk_id"] @@ -123,7 +123,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py index 644ec250d..d05dfafcf 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py @@ -39,18 +39,18 @@ def evaluate(args, speedyspeech_config, pwg_config): # construct dataset for evaluation sentences = [] - with open(args.text, 'rt') as f: + with open(args.text, 'rt', encoding='utf-8') as f: for line in f: items = line.strip().split() utt_id = items[0] sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) - with open(args.tones_dict, "r") as f: + with open(args.tones_dict, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) print("tone_size:", tone_size) diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index 7b422e64f..c90090daa 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -70,7 +70,7 @@ def train_sp(args, config): if args.speaker_dict is not None: print("multiple speaker speedyspeech!") collate_fn = speedyspeech_multi_spk_batch_fn - with open(args.speaker_dict, 'rt') as f: + with open(args.speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) fields += ["spk_id"] @@ -133,11 +133,11 @@ def train_sp(args, config): collate_fn=collate_fn, num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) - with open(args.tones_dict, "r") as f: + with open(args.tones_dict, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) print("tone_size:", tone_size) diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 6b693440c..491edda30 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -106,7 +106,7 @@ def get_chunks(data, block_size: int, pad_size: int): def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): # construct dataset for evaluation sentences = [] - with open(text_file, 'rt') as f: + with open(text_file, 'rt', encoding='utf-8') as f: for line in f: if line.strip() != "": items = re.split(r"\s+", line.strip(), 1) @@ -325,17 +325,17 @@ def get_am_inference(am: str='fastspeech2_csmsc', tones_dict: Optional[os.PathLike]=None, speaker_dict: Optional[os.PathLike]=None, return_am: bool=False): - with open(phones_dict, "r") as f: + with open(phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in 
f.readlines()] vocab_size = len(phn_id) tone_size = None if tones_dict is not None: - with open(tones_dict, "r") as f: + with open(tones_dict, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) spk_num = None if speaker_dict is not None: - with open(speaker_dict, 'rt') as f: + with open(speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) odim = am_config.n_mels diff --git a/paddlespeech/t2s/exps/tacotron2/train.py b/paddlespeech/t2s/exps/tacotron2/train.py index 69ff80e46..db88009a8 100644 --- a/paddlespeech/t2s/exps/tacotron2/train.py +++ b/paddlespeech/t2s/exps/tacotron2/train.py @@ -119,7 +119,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py index da48b6b99..d49baad99 100644 --- a/paddlespeech/t2s/exps/transformer_tts/train.py +++ b/paddlespeech/t2s/exps/transformer_tts/train.py @@ -114,7 +114,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index f6a31ced2..0e74bf631 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -78,7 +78,7 @@ def train_sp(args, config): if args.speaker_dict is not None: print("multiple speaker vits!") collate_fn = vits_multi_spk_batch_fn - with open(args.speaker_dict, 'rt') as f: + with open(args.speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) fields += ["spk_id"] @@ -132,7 +132,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py index 261db80a8..af86d9b80 100644 --- a/paddlespeech/t2s/frontend/phonectic.py +++ b/paddlespeech/t2s/frontend/phonectic.py @@ -58,7 +58,7 @@ class English(Phonetics): self.punc = ":,;。?!“”‘’':,;.?!" 
self.text_normalizer = TextNormalizer() if phone_vocab_path: - with open(phone_vocab_path, 'rt') as f: + with open(phone_vocab_path, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: self.vocab_phones[phn] = int(id) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index efb673e36..35b97a93a 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -144,12 +144,12 @@ class Frontend(): self.vocab_phones = {} self.vocab_tones = {} if phone_vocab_path: - with open(phone_vocab_path, 'rt') as f: + with open(phone_vocab_path, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: self.vocab_phones[phn] = int(id) if tone_vocab_path: - with open(tone_vocab_path, 'rt') as f: + with open(tone_vocab_path, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] for tone, id in tone_id: self.vocab_tones[tone] = int(id) From c764710aa12a2f0db23475b15e1f6cafd5f05e57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=AB=A0=E5=AE=8F=E5=BD=AC?= <57510731+hopingZ@users.noreply.github.com> Date: Thu, 2 Feb 2023 13:05:35 +0800 Subject: [PATCH 13/42] [TTS]Avoid using variable "attn_loss" before assignment (#2860) * Avoid using variable "attn_loss" before assignment * Update tacotron2_updater.py --------- Co-authored-by: TianYuan --- .../t2s/models/tacotron2/tacotron2_updater.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py index 09e6827d0..1db9248ae 100644 --- a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py +++ b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py @@ -113,16 +113,18 @@ class Tacotron2Updater(StandardUpdater): loss.backward() optimizer.step() + if self.use_guided_attn_loss: + report("train/attn_loss", float(attn_loss)) + losses_dict["attn_loss"] = float(attn_loss) + report("train/l1_loss", float(l1_loss)) report("train/mse_loss", float(mse_loss)) report("train/bce_loss", float(bce_loss)) - report("train/attn_loss", float(attn_loss)) report("train/loss", float(loss)) losses_dict["l1_loss"] = float(l1_loss) losses_dict["mse_loss"] = float(mse_loss) losses_dict["bce_loss"] = float(bce_loss) - losses_dict["attn_loss"] = float(attn_loss) losses_dict["loss"] = float(loss) self.msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_dict.items()) @@ -202,17 +204,19 @@ class Tacotron2Evaluator(StandardEvaluator): attn_loss = self.attn_loss( att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in) loss = loss + attn_loss + + if self.use_guided_attn_loss: + report("eval/attn_loss", float(attn_loss)) + losses_dict["attn_loss"] = float(attn_loss) report("eval/l1_loss", float(l1_loss)) report("eval/mse_loss", float(mse_loss)) report("eval/bce_loss", float(bce_loss)) - report("eval/attn_loss", float(attn_loss)) report("eval/loss", float(loss)) losses_dict["l1_loss"] = float(l1_loss) losses_dict["mse_loss"] = float(mse_loss) losses_dict["bce_loss"] = float(bce_loss) - losses_dict["attn_loss"] = float(attn_loss) losses_dict["loss"] = float(loss) self.msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_dict.items()) From 6b00ad6064a390525bd992dc747e1e5681b49db4 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 3 Feb 2023 09:57:51 +0800 Subject: [PATCH 14/42] [Install]clean dependencies (#2871) * 
clean dependencies * update paddleaudio's version * rm dependency in librosa and paddlenlp * rm dependency in paddlepaddle * rm dependency in speech_web --- audio/setup.py | 7 +------ .../speech_web/speech_server/requirements.txt | 4 +--- docs/requirements.txt | 15 +++------------ setup.py | 18 +++++------------- 4 files changed, 10 insertions(+), 34 deletions(-) diff --git a/audio/setup.py b/audio/setup.py index 6e358346c..823e5dfad 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -40,14 +40,9 @@ COMMITID = 'none' base = [ "kaldiio", "librosa==0.8.1", - "scipy>=1.0.0", - "soundfile~=0.10", - "colorlog", - "pathos==0.2.8", + "pathos", "pybind11", "parameterized", - "tqdm", - "scikit-learn" ] requirements = { diff --git a/demos/speech_web/speech_server/requirements.txt b/demos/speech_web/speech_server/requirements.txt index cdc654656..8425a1fee 100644 --- a/demos/speech_web/speech_server/requirements.txt +++ b/demos/speech_web/speech_server/requirements.txt @@ -1,8 +1,6 @@ aiofiles faiss-cpu -praatio==5.0.0 +praatio>=5.0.0 pydantic python-multipart -scikit_learn starlette -uvicorn diff --git a/docs/requirements.txt b/docs/requirements.txt index 5422c26f9..609f27925 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,11 +1,9 @@ braceexpand editdistance -fastapi g2p_en g2pM h5py inflect -jieba jsonlines kaldiio keyboard @@ -24,30 +22,23 @@ paddlespeech_ctcdecoders paddlespeech_feat pandas pattern_singleton -Pillow>=9.0.0 ppdiffusers>=0.9.0 -praatio==5.0.0 +praatio>=5.0.0 prettytable pypinyin-dict pypinyin<=0.44.0 python-dateutil -pyworld==0.2.12 +pyworld>=0.2.12 recommonmark>=0.5.0 -resampy==0.2.2 +resampy sacrebleu -scipy -sentencepiece~=0.1.96 -soundfile~=0.10 sphinx sphinx-autobuild sphinx-markdown-tables sphinx_rtd_theme textgrid timer -tqdm typeguard -uvicorn -visualdl webrtcvad websockets yacs~=0.1.8 diff --git a/setup.py b/setup.py index 2c97ce783..76bc5be8d 100644 --- a/setup.py +++ b/setup.py @@ -37,9 +37,7 @@ base = [ "g2pM", "h5py", "inflect", - "jieba", "jsonlines", - "kaldiio", "librosa==0.8.1", "loguru", "matplotlib", @@ -51,22 +49,16 @@ base = [ "paddlenlp>=2.4.8", "ppdiffusers>=0.9.0", "paddlespeech_feat", - "Pillow>=9.0.0", - "praatio==5.0.0", + "praatio>=5.0.0", "pypinyin<=0.44.0", "pypinyin-dict", "python-dateutil", - "pyworld==0.2.12", - "resampy==0.2.2", + "pyworld>=0.2.12", + "resampy", "sacrebleu", - "scipy", - "sentencepiece~=0.1.96", - "soundfile~=0.10", "textgrid", "timer", - "tqdm", "typeguard", - "visualdl", "webrtcvad", "yacs~=0.1.8", "prettytable", @@ -74,10 +66,10 @@ base = [ "braceexpand", "pyyaml", "paddleslim>=2.3.4", - "paddleaudio>=1.0.2", + "paddleaudio>=1.1.0", ] -server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"] +server = ["pattern_singleton", "websockets"] requirements = { "install": From 089c060756c9fe5494ad9e13a57e61451103fee1 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 6 Feb 2023 19:59:02 +0800 Subject: [PATCH 15/42] fix pwgan tipc (#2882) --- tests/test_tipc/prepare.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh index cb05a1d0f..9ff81bd8b 100755 --- a/tests/test_tipc/prepare.sh +++ b/tests/test_tipc/prepare.sh @@ -73,6 +73,9 @@ if [[ ${MODE} = "benchmark_train" ]];then mkdir -p BZNSYP unrar x BZNSYP.rar BZNSYP wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/benchmark/durations.txt + # 避免网络问题导致的 nltk_data 无法下载使程序 hang 住 + wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz + tar -xzf nltk_data.tar.gz -C ${HOME} # 
数据预处理 python ../paddlespeech/t2s/exps/gan_vocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=../examples/csmsc/voc1/conf/default.yaml python ../utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" From 16d84367c6c7452deb0cc9955aa40298271637b0 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Tue, 7 Feb 2023 10:10:53 +0800 Subject: [PATCH 16/42] fix Tensor.numpy()[0] to float(Tensor) to adapt 0D (#2884) --- examples/tess/cls0/local/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tess/cls0/local/train.py b/examples/tess/cls0/local/train.py index 25382d8c3..f023a37b7 100644 --- a/examples/tess/cls0/local/train.py +++ b/examples/tess/cls0/local/train.py @@ -121,7 +121,7 @@ if __name__ == "__main__": optimizer.clear_grad() # Calculate loss - avg_loss += loss.numpy()[0] + avg_loss += float(loss) # Calculate metrics preds = paddle.argmax(logits, axis=1) From 3a8ba2e24246f1283bfcfd149bd803e03987d4d3 Mon Sep 17 00:00:00 2001 From: HuangLiangJie Date: Tue, 7 Feb 2023 13:14:54 +0800 Subject: [PATCH 17/42] [TTS]Update VITS checkpoint , test=tts (#2887) --- examples/csmsc/vits/README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md index 8f223e07b..50d703b2d 100644 --- a/examples/csmsc/vits/README.md +++ b/examples/csmsc/vits/README.md @@ -147,14 +147,14 @@ optional arguments: The pretrained model can be downloaded here: -- [vits_csmsc_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/vits/vits_csmsc_ckpt_1.1.0.zip) (add_blank=true) +- [vits_csmsc_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/vits/vits_csmsc_ckpt_1.4.0.zip) (add_blank=true) VITS checkpoint contains files listed below. 
```text -vits_csmsc_ckpt_1.1.0 -├── default.yaml # default config used to train vitx -├── phone_id_map.txt # phone vocabulary file when training vits -└── snapshot_iter_333000.pdz # model parameters and optimizer states +vits_csmsc_ckpt_1.4.0 +├── default.yaml # default config used to train vitx +├── phone_id_map.txt # phone vocabulary file when training vits +└── snapshot_iter_150000.pdz # model parameters and optimizer states ``` ps: This ckpt is not good enough, a better result is training @@ -168,9 +168,9 @@ add_blank=true FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ python3 ${BIN_DIR}/synthesize_e2e.py \ - --config=vits_csmsc_ckpt_1.1.0/default.yaml \ - --ckpt=vits_csmsc_ckpt_1.1.0/snapshot_iter_333000.pdz \ - --phones_dict=vits_csmsc_ckpt_1.1.0/phone_id_map.txt \ + --config=vits_csmsc_ckpt_1.4.0/default.yaml \ + --ckpt=vits_csmsc_ckpt_1.4.0/snapshot_iter_150000.pdz \ + --phones_dict=vits_csmsc_ckpt_1.4.0/phone_id_map.txt \ --output_dir=exp/default/test_e2e \ --text=${BIN_DIR}/../sentences.txt \ --add-blank=${add_blank} From b1d0658ef8d265a2781ce712982edfca4f0d4c52 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 8 Feb 2023 15:47:30 +0800 Subject: [PATCH 18/42] Update stale.yml --- .github/stale.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/stale.yml b/.github/stale.yml index da19b6606..6b0da9b98 100644 --- a/.github/stale.yml +++ b/.github/stale.yml @@ -6,7 +6,8 @@ daysUntilClose: 30 exemptLabels: - Roadmap - Bug - - New Feature + - feature request + - Tips # Label to use when marking an issue as stale staleLabel: Stale # Comment to post when marking an issue as stale. Set to `false` to disable @@ -17,4 +18,4 @@ markComment: > unmarkComment: false # Comment to post when closing a stale issue. Set to `false` to disable closeComment: > - This issue is closed. Please re-open if needed. \ No newline at end of file + This issue is closed. Please re-open if needed. From f6b624ddc8d4bc869b0a5fbe36d3e183a6adf01c Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Wed, 8 Feb 2023 16:29:58 +0800 Subject: [PATCH 19/42] add encoding=utf8 for text cli. 
(#2896) --- paddlespeech/cli/text/infer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py index ff822f674..bd76a13d0 100644 --- a/paddlespeech/cli/text/infer.py +++ b/paddlespeech/cli/text/infer.py @@ -127,7 +127,7 @@ class TextExecutor(BaseExecutor): if self.task == 'punc': # punc list self._punc_list = [] - with open(self.vocab_file, 'r') as f: + with open(self.vocab_file, 'r', encoding='utf-8') as f: for line in f: self._punc_list.append(line.strip()) @@ -178,12 +178,12 @@ class TextExecutor(BaseExecutor): if self.task == 'punc': # punc list self._punc_list = [] - with open(self.vocab_file, 'r') as f: + with open(self.vocab_file, 'r', encoding='utf-8') as f: for line in f: self._punc_list.append(line.strip()) # model - with open(self.cfg_path) as f: + with open(self.cfg_path, 'r', encoding='utf-8') as f: config = CfgNode(yaml.safe_load(f)) self.model = ErnieLinear(**config["model"]) From 8cbf6a2c9aa0dd1099bcb7654f1a984a8ea541b5 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 9 Feb 2023 10:51:03 +0800 Subject: [PATCH 20/42] Update setup.py (#2879) --- third_party/ctc_decoders/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ctc_decoders/setup.py b/third_party/ctc_decoders/setup.py index c13f3df99..5ae5b3bf6 100644 --- a/third_party/ctc_decoders/setup.py +++ b/third_party/ctc_decoders/setup.py @@ -129,7 +129,7 @@ decoders_module = [ setup( name='paddlespeech_ctcdecoders', - version='0.2.0', + version='0.2.2', description="CTC decoders in paddlespeech", author="PaddlePaddle Speech and Language Team", author_email="paddlesl@baidu.com", From 6728db5b59d4aa26424400a6c420036b98cca726 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Thu, 9 Feb 2023 10:51:23 +0800 Subject: [PATCH 21/42] [ASR]Whisper remove audio duration limit, test=asr (#2900) --- paddlespeech/cli/whisper/infer.py | 9 +-------- paddlespeech/s2t/models/whisper/__init__.py | 2 +- paddlespeech/s2t/models/whisper/tokenizer.py | 2 +- paddlespeech/s2t/models/whisper/utils.py | 2 +- paddlespeech/s2t/models/whisper/whipser.py | 2 +- 5 files changed, 5 insertions(+), 12 deletions(-) diff --git a/paddlespeech/cli/whisper/infer.py b/paddlespeech/cli/whisper/infer.py index c016b453a..ebcca890b 100644 --- a/paddlespeech/cli/whisper/infer.py +++ b/paddlespeech/cli/whisper/infer.py @@ -152,8 +152,7 @@ class WhisperExecutor(BaseExecutor): Init model and other resources from a specific path. """ logger.debug("start to init the model") - # default max_len: unit:second - self.max_len = 50 + if hasattr(self, 'model'): logger.debug('Model had been initialized.') return @@ -339,12 +338,6 @@ class WhisperExecutor(BaseExecutor): try: audio, audio_sample_rate = soundfile.read( audio_file, dtype="int16", always_2d=True) - audio_duration = audio.shape[0] / audio_sample_rate - if audio_duration > self.max_len: - logger.error( - f"Please input audio file less then {self.max_len} seconds.\n" - ) - return False except Exception as e: logger.exception(e) logger.error( diff --git a/paddlespeech/s2t/models/whisper/__init__.py b/paddlespeech/s2t/models/whisper/__init__.py index 98ab23610..b78dece8a 100644 --- a/paddlespeech/s2t/models/whisper/__init__.py +++ b/paddlespeech/s2t/models/whisper/__init__.py @@ -1,5 +1,5 @@ # MIT License, Copyright (c) 2022 OpenAI. -# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/__init__.py) from paddlespeech.s2t.models.whisper.whipser import decode diff --git a/paddlespeech/s2t/models/whisper/tokenizer.py b/paddlespeech/s2t/models/whisper/tokenizer.py index 1e1aea044..e8b201bcc 100644 --- a/paddlespeech/s2t/models/whisper/tokenizer.py +++ b/paddlespeech/s2t/models/whisper/tokenizer.py @@ -1,5 +1,5 @@ # MIT License, Copyright (c) 2022 OpenAI. -# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/tokenizer.py) import os diff --git a/paddlespeech/s2t/models/whisper/utils.py b/paddlespeech/s2t/models/whisper/utils.py index d067af7d2..5528f9604 100644 --- a/paddlespeech/s2t/models/whisper/utils.py +++ b/paddlespeech/s2t/models/whisper/utils.py @@ -1,5 +1,5 @@ # MIT License, Copyright (c) 2022 OpenAI. -# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/utils.py) import zlib diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py index 9cf9a9eca..a28013e4b 100644 --- a/paddlespeech/s2t/models/whisper/whipser.py +++ b/paddlespeech/s2t/models/whisper/whipser.py @@ -1,5 +1,5 @@ # MIT License, Copyright (c) 2022 OpenAI. -# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper) import os From bcd8e309ec3fade62971067de6d5607027c254e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=89=BE=E6=A2=A6?= Date: Thu, 9 Feb 2023 14:58:34 +0800 Subject: [PATCH 22/42] [TTS]Add diffusion noise clip to optimize sample result (#2902) * add diffusion module for training diffsinger * add wavenet denoiser final conv initializer * add diffusion noise clip to optimize sample result --- paddlespeech/t2s/modules/diffusion.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/paddlespeech/t2s/modules/diffusion.py b/paddlespeech/t2s/modules/diffusion.py index eb67ffb0d..be684ce38 100644 --- a/paddlespeech/t2s/modules/diffusion.py +++ b/paddlespeech/t2s/modules/diffusion.py @@ -360,6 +360,8 @@ class GaussianDiffusion(nn.Layer): num_inference_steps: Optional[int]=1000, strength: Optional[float]=None, scheduler_type: Optional[str]="ddpm", + clip_noise: Optional[bool]=True, + clip_noise_range: Optional[Tuple[float, float]]=(-1, 1), callback: Optional[Callable[[int, int, int, paddle.Tensor], None]]=None, callback_steps: Optional[int]=1): @@ -380,6 +382,10 @@ class GaussianDiffusion(nn.Layer): scheduler_type (str, optional): Noise scheduler for generate noises. Choose a great scheduler can skip many denoising step, by default 'ddpm'. + clip_noise (bool, optional): + Whether to clip each denoised output, by default True. + clip_noise_range (tuple, optional): + denoised output min and max value range after clip, by default (-1, 1). callback (Callable[[int,int,int,Tensor], None], optional): Callback function during denoising steps. 
@@ -440,6 +446,9 @@ class GaussianDiffusion(nn.Layer): # denoising loop denoised_output = noisy_input + if clip_noise: + n_min, n_max = clip_noise_range + denoised_output = paddle.clip(denoised_output, n_min, n_max) num_warmup_steps = len( timesteps) - num_inference_steps * scheduler.order for i, t in enumerate(timesteps): @@ -451,6 +460,8 @@ class GaussianDiffusion(nn.Layer): # compute the previous noisy sample x_t -> x_t-1 denoised_output = scheduler.step(noise_pred, t, denoised_output).prev_sample + if clip_noise: + denoised_output = paddle.clip(denoised_output, n_min, n_max) # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and From 66a9cf8ebc0be8deb68c6fa9b53f961aa647b045 Mon Sep 17 00:00:00 2001 From: ZhengZZZ <107376080+EscaticZheng@users.noreply.github.com> Date: Mon, 13 Feb 2023 18:01:00 +0800 Subject: [PATCH 23/42] modify readme (#2915) --- README.md | 2 +- README_cn.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index afc4e4d09..1678e9e04 100644 --- a/README.md +++ b/README.md @@ -987,7 +987,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P - Many thanks to [vpegasus](https://github.com/vpegasus)/[xuesebot](https://github.com/vpegasus/xuesebot) for developing a rasa chatbot,which is able to speak and listen thanks to PaddleSpeech. - Many thanks to [chenkui164](https://github.com/chenkui164)/[FastASR](https://github.com/chenkui164/FastASR) for the C++ inference implementation of PaddleSpeech ASR. - Many thanks to [heyudage](https://github.com/heyudage)/[VoiceTyping](https://github.com/heyudage/VoiceTyping) for the real-time voice typing tool implementation of PaddleSpeech ASR streaming services. - +- Many thanks to [EscaticZheng](https://github.com/EscaticZheng)/[ps3.9wheel-install](https://github.com/EscaticZheng/ps3.9wheel-install) for the python3.9 prebuilt wheel for PaddleSpeech installation in Windows without Viusal Studio. Besides, PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information. 
diff --git a/README_cn.md b/README_cn.md index ecc4644aa..572f9dee6 100644 --- a/README_cn.md +++ b/README_cn.md @@ -988,10 +988,10 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - 非常感谢 [awmmmm](https://github.com/awmmmm) 提供 fastspeech2 aishell3 conformer 预训练模型。 - 非常感谢 [phecda-xu](https://github.com/phecda-xu)/[PaddleDubbing](https://github.com/phecda-xu/PaddleDubbing) 基于 PaddleSpeech 的 TTS 模型搭建带 GUI 操作界面的配音工具。 - 非常感谢 [jerryuhoo](https://github.com/jerryuhoo)/[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk) 基于 PaddleSpeech 的 TTS GUI 界面和基于 ASR 制作数据集的相关代码。 - - 非常感谢 [vpegasus](https://github.com/vpegasus)/[xuesebot](https://github.com/vpegasus/xuesebot) 基于 PaddleSpeech 的 ASR 与 TTS 设计的可听、说对话机器人。 - 非常感谢 [chenkui164](https://github.com/chenkui164)/[FastASR](https://github.com/chenkui164/FastASR) 对 PaddleSpeech 的 ASR 进行 C++ 推理实现。 - 非常感谢 [heyudage](https://github.com/heyudage)/[VoiceTyping](https://github.com/heyudage/VoiceTyping) 基于 PaddleSpeech 的 ASR 流式服务实现的实时语音输入法工具。 +- 非常感谢 [EscaticZheng](https://github.com/EscaticZheng)/[ps3.9wheel-install](https://github.com/EscaticZheng/ps3.9wheel-install) 对PaddleSpeech在Windows下的安装提供了无需Visua Studio,基于python3.9的预编译依赖安装包。 此外,PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。 From 047092de8ed344ec391e5492c897395837773765 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Mon, 13 Feb 2023 22:16:05 +0800 Subject: [PATCH 24/42] add wav2vev2_zh aishell recipe, and speechbrain dataloader. (#2916) --- examples/aishell/asr3/README.md | 198 ++++ examples/aishell/asr3/cmd.sh | 89 ++ examples/aishell/asr3/conf/preprocess.yaml | 3 + .../aishell/asr3/conf/train_with_wav2vec.yaml | 101 +++ examples/aishell/asr3/conf/tuning/decode.yaml | 4 + examples/aishell/asr3/conf/wav2vec2ASR.yaml | 167 ++++ .../aishell/asr3/local/aishell_prepare.py | 129 +++ examples/aishell/asr3/local/data.sh | 101 +++ examples/aishell/asr3/local/test.sh | 84 ++ examples/aishell/asr3/local/test_wav.sh | 58 ++ examples/aishell/asr3/local/train.sh | 59 ++ examples/aishell/asr3/path.sh | 15 + examples/aishell/asr3/run.sh | 48 + examples/aishell/asr3/utils | 1 + paddlespeech/s2t/exps/wav2vec2/model.py | 412 +++++++-- paddlespeech/s2t/io/speechbrain/__init__.py | 13 + paddlespeech/s2t/io/speechbrain/batch.py | 107 +++ .../s2t/io/speechbrain/data_pipeline.py | 488 ++++++++++ paddlespeech/s2t/io/speechbrain/data_utils.py | 177 ++++ paddlespeech/s2t/io/speechbrain/dataio.py | 845 ++++++++++++++++++ paddlespeech/s2t/io/speechbrain/dataloader.py | 172 ++++ paddlespeech/s2t/io/speechbrain/dataset.py | 371 ++++++++ paddlespeech/s2t/io/speechbrain/depgraph.py | 237 +++++ .../s2t/io/speechbrain/make_dataloader.py | 118 +++ paddlespeech/s2t/io/speechbrain/sampler.py | 503 +++++++++++ .../s2t/io/speechbrain/sb_pipeline.py | 156 ++++ paddlespeech/s2t/models/wav2vec2/__init__.py | 2 +- .../wav2vec2/processing/signal_processing.py | 20 +- .../processing/speech_augmentation.py | 44 +- .../s2t/models/wav2vec2/wav2vec2_ASR.py | 54 +- setup.py | 2 + 31 files changed, 4672 insertions(+), 106 deletions(-) create mode 100644 examples/aishell/asr3/README.md create mode 100755 examples/aishell/asr3/cmd.sh create mode 100755 examples/aishell/asr3/conf/preprocess.yaml create mode 100755 examples/aishell/asr3/conf/train_with_wav2vec.yaml create mode 100755 examples/aishell/asr3/conf/tuning/decode.yaml create mode 100755 examples/aishell/asr3/conf/wav2vec2ASR.yaml create mode 100644 examples/aishell/asr3/local/aishell_prepare.py create mode 100755 examples/aishell/asr3/local/data.sh create mode 
100755 examples/aishell/asr3/local/test.sh create mode 100755 examples/aishell/asr3/local/test_wav.sh create mode 100755 examples/aishell/asr3/local/train.sh create mode 100755 examples/aishell/asr3/path.sh create mode 100755 examples/aishell/asr3/run.sh create mode 120000 examples/aishell/asr3/utils create mode 100644 paddlespeech/s2t/io/speechbrain/__init__.py create mode 100755 paddlespeech/s2t/io/speechbrain/batch.py create mode 100755 paddlespeech/s2t/io/speechbrain/data_pipeline.py create mode 100755 paddlespeech/s2t/io/speechbrain/data_utils.py create mode 100755 paddlespeech/s2t/io/speechbrain/dataio.py create mode 100755 paddlespeech/s2t/io/speechbrain/dataloader.py create mode 100755 paddlespeech/s2t/io/speechbrain/dataset.py create mode 100755 paddlespeech/s2t/io/speechbrain/depgraph.py create mode 100755 paddlespeech/s2t/io/speechbrain/make_dataloader.py create mode 100755 paddlespeech/s2t/io/speechbrain/sampler.py create mode 100755 paddlespeech/s2t/io/speechbrain/sb_pipeline.py mode change 100644 => 100755 paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py diff --git a/examples/aishell/asr3/README.md b/examples/aishell/asr3/README.md new file mode 100644 index 000000000..e5806d621 --- /dev/null +++ b/examples/aishell/asr3/README.md @@ -0,0 +1,198 @@ +# Wav2vec2ASR with Aishell +This example contains code used to finetune the [wav2vec2.0](https://arxiv.org/pdf/2006.11477.pdf) model with the [Aishell dataset](http://www.openslr.org/resources/33). +## Overview +All the scripts you need are in `run.sh`. There are several stages in `run.sh`, and each stage has its function. +| Stage | Function | +|:---- |:----------------------------------------------------------- | +| 0 | Process data. It includes:
(1) Download the dataset
(2) Calculate the CMVN of the train dataset
(3) Get the vocabulary file
(4) Get the manifest files of the train, development and test dataset
(5) Download the pretrained wav2vec2 model | +| 1 | Train the model | +| 2 | Get the final model by averaging the top-k models; setting k = 1 means choosing the best model | +| 3 | Test the final model performance | +| 4 | Infer the single audio file | + + +You can choose to run a range of stages by setting `stage` and `stop_stage`. + +For example, if you want to execute the code in stage 2 and stage 3, you can run this script: +```bash +bash run.sh --stage 2 --stop_stage 3 +``` +Or you can set `stage` equal to `stop_stage` to only run one stage. +For example, if you only want to run `stage 0`, you can use the script below: +```bash +bash run.sh --stage 0 --stop_stage 0 +``` +The sections below describe the scripts in `run.sh` in detail. +## The Environment Variables +The `path.sh` script contains the environment variables. +```bash +. ./path.sh +. ./cmd.sh +``` +This script needs to be run first, and another script is also needed: +```bash +source ${MAIN_ROOT}/utils/parse_options.sh +``` +It enables passing options in the form `--variable value` to the shell scripts. +## The Local Variables +Some local variables are set in `run.sh`. +`gpus` denotes the GPU numbers you want to use. If you set `gpus=`, it means you only use the CPU. +`stage` denotes the stage you want to start from in the experiments. +`stop_stage` denotes the stage you want to end at in the experiments. +`conf_path` denotes the config path of the model. +`avg_num` denotes the number K of top-K models you want to average to get the final model. +`audio_file` denotes the file path of the single audio file you want to infer in stage 4. +`ckpt` denotes the checkpoint prefix of the model, e.g. "wav2vec2ASR". + +You can set the local variables (except `ckpt`) when you use `run.sh`. + +For example, you can set the `gpus` and `avg_num` when you use the command line: +```bash +bash run.sh --gpus 0,1 --avg_num 20 +``` +## Stage 0: Data Processing +To use this example, you need to process the data first, and you can use stage 0 in `run.sh` to do this. The code is shown below: +```bash + if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + bash ./local/data.sh || exit -1 + fi +``` +Stage 0 is for processing the data. + +If you only want to process the data, you can run +```bash +bash run.sh --stage 0 --stop_stage 0 +``` +You can also just run these scripts in your command line. +```bash +. ./path.sh +. ./cmd.sh +bash ./local/data.sh +``` +After processing the data, the `data` directory will look like this: +```bash +data/ +|-- dev.meta +|-- lang_char +| `-- vocab.txt +|-- manifest.dev +|-- manifest.dev.raw +|-- manifest.test +|-- manifest.test.raw +|-- manifest.train +|-- manifest.train.raw +|-- mean_std.json +|-- test.meta +|-- train.meta +|-- train.csv +|-- dev.csv +|-- test.csv +``` + +Stage 0 also downloads the Chinese pre-trained [wav2vec2](https://paddlespeech.bj.bcebos.com/wav2vec/chinese-wav2vec2-large.pdparams) model. +```bash +mkdir -p exp/wav2vec2 +wget -P exp/wav2vec2 https://paddlespeech.bj.bcebos.com/wav2vec/chinese-wav2vec2-large.pdparams +``` +## Stage 1: Model Training +If you want to train the model, you can use stage 1 in `run.sh`. The code is shown below.
+```bash +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `exp` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} + fi +``` +If you want to train the model, you can use the script below to execute stage 0 and stage 1: +```bash +bash run.sh --stage 0 --stop_stage 1 +``` +or you can run these scripts in the command line (using only the CPU). +```bash +. ./path.sh +. ./cmd.sh +bash ./local/data.sh +CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR +``` +## Stage 2: Top-k Models Averaging +After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model based on the validation loss, or we can sort the checkpoints and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below. Note: We only train one epoch for wav2vec2ASR, thus the `avg_num` is set to 1. +```bash + if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # avg n best model + avg.sh best exp/${ckpt}/checkpoints ${avg_num} + fi +``` +The `avg.sh` script is in `../../../utils/`, which is defined in `path.sh`. +If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2: +```bash +bash run.sh --stage 0 --stop_stage 2 +``` +or you can run these scripts in the command line (using only the CPU). + +```bash +. ./path.sh +. ./cmd.sh +bash ./local/data.sh +CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR +avg.sh best exp/wav2vec2ASR/checkpoints 1 +``` +## Stage 3: Model Testing +The test stage evaluates the model performance. The code of the test stage is shown below: +```bash + if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # test ckpt avg_n + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + fi +``` +If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3: +```bash +bash run.sh --stage 0 --stop_stage 3 +``` +or you can run these scripts in the command line (using only the CPU). +```bash +. ./path.sh +. ./cmd.sh +bash ./local/data.sh +CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR +avg.sh best exp/wav2vec2ASR/checkpoints 1 +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 +``` +## Pretrained Model +You can get the pretrained wav2vec2ASR model from [this page](../../../docs/source/released_model.md). + +Use the `tar` command to unpack the model, and then you can use the scripts to test the model. + +For example: +```bash +wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz +tar xzvf wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz +source path.sh +# If you have processed the data and got the manifest files, you can skip the following 2 steps +bash local/data.sh --stage -1 --stop_stage -1 +bash local/data.sh --stage 2 --stop_stage 2 +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 +``` +The performance of the released models is shown in [RESULTS.md](./RESULTS.md). + + +## Stage 4: Single Audio File Inference +In some situations, you want to use the trained model to run inference on a single audio file. You can use stage 4.
The code is shown below. +```bash + if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # test a single .wav file + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + fi +``` +You can train the model yourself using `bash run.sh --stage 0 --stop_stage 3`, or you can download the pretrained model through the script below: +```bash +wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz +tar xzvf wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz +``` +You can download the audio demo: +```bash +wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ +``` +You need to prepare an audio file or use the audio demo above; please confirm that the sample rate of the audio is 16 kHz. You can get the result of the audio demo by running the script below. +```bash +CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 data/demo_002_en.wav +``` diff --git a/examples/aishell/asr3/cmd.sh b/examples/aishell/asr3/cmd.sh new file mode 100755 index 000000000..7b70ef5e0 --- /dev/null +++ b/examples/aishell/asr3/cmd.sh @@ -0,0 +1,89 @@ +# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== +# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...> +# e.g. +# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB +# +# Options: +# --time