diff --git a/.gitignore b/.gitignore index 75f56b604..4a0c43312 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ *.egg-info build *output/ +.history audio/dist/ audio/fc_patch/ diff --git a/README.md b/README.md index dbdf6a4f8..afc4e4d09 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,8 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update +- 🔥 2023.01.10: Add [code-switch asr CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). +- 👑 2023.01.06: Add [code-switch asr tal_cs recipe](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). - 🎉 2022.12.02: Add [end-to-end Prosody Prediction pipeline](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (including using prosody labels in Acoustic Model). - 🎉 2022.11.30: Add [TTS Android Demo](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid). - 🤗 2022.11.28: PP-TTS and PP-ASR demos are available in [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) and [official website @@ -189,7 +191,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - Scan the QR code below with your Wechat, you can access to official technical exchange group and get the bonus ( more than 20GB learning materials, such as papers, codes and videos ) and the live link of the lessons. Look forward to your participation.
- +
## Installation diff --git a/README_cn.md b/README_cn.md index 5cc156c9f..ecc4644aa 100644 --- a/README_cn.md +++ b/README_cn.md @@ -164,6 +164,8 @@ ### 近期更新 +- 🔥 2023.01.10: 新增 [中英混合 ASR CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). +- 👑 2023.01.06: 新增 [ASR中英混合 tal_cs 训练推理流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). - 🎉 2022.12.02: 新增 [端到端韵律预测全流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (包含在声学模型中使用韵律标签)。 - 🎉 2022.11.30: 新增 [TTS Android 部署示例](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid)。 - 🤗 2022.11.28: PP-TTS and PP-ASR 示例可在 [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) 和[飞桨官网](https://www.paddlepaddle.org.cn/models)体验! @@ -200,7 +202,7 @@ 微信扫描二维码关注公众号,点击“马上报名”填写问卷加入官方交流群,获得更高效的问题答疑,与各行各业开发者充分交流,期待您的加入。
- +
diff --git a/audio/setup.py b/audio/setup.py index 82e9a55a5..d36b2c440 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -43,7 +43,7 @@ base = [ "scipy>=1.0.0", "soundfile~=0.10", "colorlog", - "pathos == 0.2.8", + "pathos==0.2.8", "pybind11", "parameterized", "tqdm", diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index c815a88af..ee2acd6fd 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -17,7 +17,7 @@ The input of this demo should be a WAV file(`.wav`), and the sample rate must be Here are sample files for this demo that can be downloaded: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` ### 3. Usage - Command Line(Recommended) ```bash # Chinese paddlespeech asr --input ./zh.wav -v # English paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v + # Code-Switch + paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav -v # Chinese ASR + Punctuation Restoration paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v ``` @@ -40,6 +42,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `input`(required): Audio file to recognize. - `model`: Model type of asr task. Default: `conformer_wenetspeech`. - `lang`: Model language. Default: `zh`. + - `codeswitch`: Code Switch Model. Default: `False`. - `sample_rate`: Sample rate of the model. Default: `16000`. - `config`: Config of asr task. Use pretrained model when it is None. Default: `None`. - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`. 
@@ -83,14 +86,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee Here is a list of pretrained models released by PaddleSpeech that can be used by command and python API: -| Model | Language | Sample Rate -| :--- | :---: | :---: | -| conformer_wenetspeech | zh | 16k -| conformer_online_multicn | zh | 16k -| conformer_aishell | zh | 16k -| conformer_online_aishell | zh | 16k -| transformer_librispeech | en | 16k -| deepspeech2online_wenetspeech | zh | 16k -| deepspeech2offline_aishell| zh| 16k -| deepspeech2online_aishell | zh | 16k -| deepspeech2offline_librispeech | en | 16k +| Model | Code Switch | Language | Sample Rate +| :--- | :---: | :---: | :---: | +| conformer_wenetspeech | False | zh | 16k +| conformer_online_multicn | False | zh | 16k +| conformer_aishell | False | zh | 16k +| conformer_online_aishell | False | zh | 16k +| transformer_librispeech | False | en | 16k +| deepspeech2online_wenetspeech | False | zh | 16k +| deepspeech2offline_aishell | False | zh| 16k +| deepspeech2online_aishell | False | zh | 16k +| deepspeech2offline_librispeech | False | en | 16k +| conformer_talcs | True | zh_en | 16k diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md index 13aa9f277..62dce3bc9 100644 --- a/demos/speech_recognition/README_cn.md +++ b/demos/speech_recognition/README_cn.md @@ -1,4 +1,5 @@ (简体中文|[English](./README.md)) + (简体中文|[English](./README.md)) # 语音识别 ## 介绍 @@ -16,7 +17,7 @@ 可以下载此 demo 的示例音频: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` ### 3. 
使用方法 - 命令行 (推荐使用) @@ -25,6 +26,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee paddlespeech asr --input ./zh.wav -v # 英文 paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v + #中英混合 + paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav -v # 中文 + 标点恢复 paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v ``` @@ -38,6 +41,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `input`(必须输入):用于识别的音频文件。 - `model`:ASR 任务的模型,默认值:`conformer_wenetspeech`。 - `lang`:模型语言,默认值:`zh`。 + - `codeswitch`: 是否使用语言转换,默认值:`False`。 - `sample_rate`:音频采样率,默认值:`16000`。 - `config`:ASR 任务的参数文件,若不设置则使用预训练模型中的默认配置,默认值:`None`。 - `ckpt_path`:模型参数文件,若不设置则下载预训练模型使用,默认值:`None`。 @@ -80,14 +84,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ### 4.预训练模型 以下是 PaddleSpeech 提供的可以被命令行和 python API 使用的预训练模型列表: -| 模型 | 语言 | 采样率 -| :--- | :---: | :---: | -| conformer_wenetspeech | zh | 16k -| conformer_online_multicn | zh | 16k -| conformer_aishell | zh | 16k -| conformer_online_aishell | zh | 16k -| transformer_librispeech | en | 16k -| deepspeech2online_wenetspeech | zh | 16k -| deepspeech2offline_aishell| zh| 16k -| deepspeech2online_aishell | zh | 16k -| deepspeech2offline_librispeech | en | 16k +| 模型 | 语言转换 | 语言 | 采样率 +| :--- | :---: | :---: | :---: | +| conformer_wenetspeech | False | zh | 16k +| conformer_online_multicn | False | zh | 16k +| conformer_aishell | False | zh | 16k +| conformer_online_aishell | False | zh | 16k +| transformer_librispeech | False | en | 16k +| deepspeech2online_wenetspeech | False | zh | 16k +| deepspeech2offline_aishell | False | zh| 16k +| deepspeech2online_aishell | False | zh | 16k +| deepspeech2offline_librispeech | False | en | 16k +| conformer_talcs | True | zh_en | 16k diff --git a/demos/speech_recognition/run.sh b/demos/speech_recognition/run.sh index e48ff3e96..8ba6e4c3e 
100755 --- a/demos/speech_recognition/run.sh +++ b/demos/speech_recognition/run.sh @@ -2,6 +2,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav # asr paddlespeech asr --input ./zh.wav @@ -18,6 +19,11 @@ paddlespeech asr --help # english asr paddlespeech asr --lang en --model transformer_librispeech --input ./en.wav + +# code-switch asr +paddlespeech asr --lang zh_en --codeswitch True --model conformer_talcs --input ./ch_zh_mix.wav + + # model stats paddlespeech stats --task asr diff --git a/docs/requirements.txt b/docs/requirements.txt index c6228d917..5422c26f9 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,4 @@ braceexpand -colorlog editdistance fastapi g2p_en @@ -16,7 +15,7 @@ matplotlib myst-parser nara_wpe numpydoc -onnxruntime==1.10.0 +onnxruntime>=1.11.0 opencc paddlenlp # use paddlepaddle == 2.3.* according to: https://github.com/PaddlePaddle/Paddle/issues/48243 @@ -24,7 +23,6 @@ paddlepaddle>=2.2.2,<2.4.0 paddlespeech_ctcdecoders paddlespeech_feat pandas -pathos==0.2.8 pattern_singleton Pillow>=9.0.0 ppdiffusers>=0.9.0 diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/tts3/path.sh @@ -0,0 +1 @@ 
+../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh deleted file mode 100755 index a37cd21e3..000000000 --- a/examples/aishell3/vc0/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=tacotron2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh new file mode 120000 index 000000000..9e1fdbd16 --- /dev/null +++ b/examples/aishell3/vc0/path.sh @@ -0,0 +1 @@ +../../csmsc/tts0/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc1/local/train.sh b/examples/aishell3/vc1/local/train.sh deleted file mode 100755 index c775fcadc..000000000 --- a/examples/aishell3/vc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --voice-cloning=True \ No newline at end of file diff --git a/examples/aishell3/vc1/local/train.sh b/examples/aishell3/vc1/local/train.sh new file mode 120000 index 000000000..115a0b8dc --- /dev/null +++ b/examples/aishell3/vc1/local/train.sh @@ -0,0 +1 @@ +../../vc0/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/vc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export 
PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/vc1/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/local/synthesize.sh b/examples/aishell3/vc2/local/synthesize.sh deleted file mode 100755 index 8fd8977d3..000000000 --- a/examples/aishell3/vc2/local/synthesize.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -python3 ${BIN_DIR}/../synthesize.py \ - --am=fastspeech2_aishell3 \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=pwgan_aishell3 \ - --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \ - --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ - --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt \ - --speaker_dict=dump/speaker_id_map.txt \ - --voice-cloning=True diff --git a/examples/aishell3/vc2/local/synthesize.sh b/examples/aishell3/vc2/local/synthesize.sh new file mode 120000 index 000000000..ca8df6b04 --- /dev/null +++ b/examples/aishell3/vc2/local/synthesize.sh @@ -0,0 +1 @@ +../../vc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/local/train.sh b/examples/aishell3/vc2/local/train.sh deleted file mode 100755 index c775fcadc..000000000 --- a/examples/aishell3/vc2/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 
-train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --voice-cloning=True \ No newline at end of file diff --git a/examples/aishell3/vc2/local/train.sh b/examples/aishell3/vc2/local/train.sh new file mode 120000 index 000000000..115a0b8dc --- /dev/null +++ b/examples/aishell3/vc2/local/train.sh @@ -0,0 +1 @@ +../../vc0/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/path.sh b/examples/aishell3/vc2/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/vc2/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc2/path.sh b/examples/aishell3/vc2/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/vc2/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/local/preprocess.sh b/examples/aishell3/voc1/local/preprocess.sh index 44cc3dbe4..71eab68ad 100755 --- a/examples/aishell3/voc1/local/preprocess.sh +++ b/examples/aishell3/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl 
\ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/aishell3/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/aishell3/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/aishell3/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ 
b/examples/aishell3/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/aishell3/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/aishell3/voc1/path.sh @@ -0,0 +1 @@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh deleted file mode 100755 index 44cc3dbe4..000000000 --- a/examples/aishell3/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./aishell3_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." 
- python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/data_aishell3/ \ - --dataset=aishell3 \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/aishell3/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/aishell3/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git 
a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/aishell3/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/aishell3/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/aishell3/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/aishell3/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/aishell3/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git 
a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh deleted file mode 100755 index 8b4178f13..000000000 --- a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -stage=0 -stop_stage=0 - -# hifigan -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - FLAGS_allocator_strategy=naive_best_fit \ - FLAGS_fraction_of_gpu_memory_to_use=0.01 \ - python3 ${BIN_DIR}/synthesize.py \ - --erniesat_config=${config_path} \ - --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --erniesat_stat=dump/train/speech_stats.npy \ - --voc=hifigan_aishell3 \ - --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \ - --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \ - --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt -fi diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh new file mode 120000 index 000000000..5703dcb2c --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/local/train.sh b/examples/aishell3_vctk/ernie_sat/local/train.sh deleted file mode 100755 index 526aac435..000000000 --- a/examples/aishell3_vctk/ernie_sat/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=8 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git 
a/examples/aishell3_vctk/ernie_sat/local/train.sh b/examples/aishell3_vctk/ernie_sat/local/train.sh new file mode 120000 index 000000000..9f1d2346d --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/train.sh \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/path.sh b/examples/aishell3_vctk/ernie_sat/path.sh deleted file mode 100755 index 4ecab0251..000000000 --- a/examples/aishell3_vctk/ernie_sat/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=ernie_sat -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/path.sh b/examples/aishell3_vctk/ernie_sat/path.sh new file mode 120000 index 000000000..5ec397590 --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/path.sh @@ -0,0 +1 @@ +../../aishell3/ernie_sat/path.sh \ No newline at end of file diff --git a/examples/csmsc/tts3/local/PTQ_static.sh b/examples/csmsc/tts3/local/PTQ_static.sh index a70a77b58..c6dce53cb 100755 --- a/examples/csmsc/tts3/local/PTQ_static.sh +++ b/examples/csmsc/tts3/local/PTQ_static.sh @@ -5,4 +5,4 @@ python3 ${BIN_DIR}/../PTQ_static.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --inference_dir ${train_output_path}/inference \ --model_name ${model_name} \ - --onnx_forma=True \ No newline at end of file + --onnx_format=True \ No newline at end of file diff --git a/examples/csmsc/voc1/local/PTQ_static.sh b/examples/csmsc/voc1/local/PTQ_static.sh index 2e5166141..c85ebd109 100755 --- a/examples/csmsc/voc1/local/PTQ_static.sh +++ b/examples/csmsc/voc1/local/PTQ_static.sh @@ -2,7 +2,7 @@ train_output_path=$1 model_name=$2 
python3 ${BIN_DIR}/../../PTQ_static.py \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ + --dev-metadata=dump/dev/raw/metadata.jsonl \ --inference_dir ${train_output_path}/inference \ --model_name ${model_name} \ --onnx_format=True \ No newline at end of file diff --git a/examples/csmsc/voc1/local/preprocess.sh b/examples/csmsc/voc1/local/preprocess.sh index 61d6d62be..62d0717b9 100755 --- a/examples/csmsc/voc1/local/preprocess.sh +++ b/examples/csmsc/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh deleted file mode 100755 index 6719bd0be..000000000 --- a/examples/csmsc/voc3/finetune.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -source path.sh - -gpus=0 -stage=0 -stop_stage=100 - -source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \ - --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ - --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ - --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ - --dur-file=durations.txt \ - --output-dir=dump_finetune \ - --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \ - --dataset=baker \ - 
--rootdir=~/datasets/BZNSYP/ -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - python3 ${MAIN_ROOT}/utils/link_wav.py \ - --old-dump-dir=dump \ - --dump-dir=dump_finetune -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - cp dump/train/feats_stats.npy dump_finetune/train/ -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/train/raw/metadata.jsonl \ - --dumpdir=dump_finetune/train/norm \ - --stats=dump_finetune/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/dev/raw/metadata.jsonl \ - --dumpdir=dump_finetune/dev/norm \ - --stats=dump_finetune/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/test/raw/metadata.jsonl \ - --dumpdir=dump_finetune/test/norm \ - --stats=dump_finetune/train/feats_stats.npy -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - CUDA_VISIBLE_DEVICES=${gpus} \ - FLAGS_cudnn_exhaustive_search=true \ - FLAGS_conv_workspace_size_limit=4000 \ - python ${BIN_DIR}/train.py \ - --train-metadata=dump_finetune/train/norm/metadata.jsonl \ - --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \ - --config=conf/finetune.yaml \ - --output-dir=exp/finetune \ - --ngpu=1 -fi \ No newline at end of file diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh new file mode 120000 index 000000000..b6fa868e2 --- /dev/null +++ b/examples/csmsc/voc3/finetune.sh @@ -0,0 +1 @@ +../voc5/finetune.sh \ No newline at end of file diff --git a/examples/csmsc/voc3/local/preprocess.sh b/examples/csmsc/voc3/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc3/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if 
[ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc3/local/preprocess.sh b/examples/csmsc/voc3/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc3/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - 
-FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc3/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc4/local/preprocess.sh b/examples/csmsc/voc4/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc4/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc4/local/preprocess.sh b/examples/csmsc/voc4/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc4/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc4/local/train.sh b/examples/csmsc/voc4/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc4/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc4/local/train.sh b/examples/csmsc/voc4/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc4/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc5/finetune.sh b/examples/csmsc/voc5/finetune.sh index 6719bd0be..eb8325aeb 100755 --- a/examples/csmsc/voc5/finetune.sh +++ b/examples/csmsc/voc5/finetune.sh @@ -39,16 +39,19 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/train/raw/metadata.jsonl \ --dumpdir=dump_finetune/train/norm \ - --stats=dump_finetune/train/feats_stats.npy + 
--stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/dev/raw/metadata.jsonl \ --dumpdir=dump_finetune/dev/norm \ - --stats=dump_finetune/train/feats_stats.npy + --stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/test/raw/metadata.jsonl \ --dumpdir=dump_finetune/test/norm \ - --stats=dump_finetune/train/feats_stats.npy + --stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/csmsc/voc5/local/preprocess.sh b/examples/csmsc/voc5/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc5/local/preprocess.sh b/examples/csmsc/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc5/local/train.sh b/examples/csmsc/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc5/local/train.sh b/examples/csmsc/voc5/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc5/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc6/local/preprocess.sh b/examples/csmsc/voc6/local/preprocess.sh index 2dcc39ac7..509824b8e 100755 --- a/examples/csmsc/voc6/local/preprocess.sh +++ b/examples/csmsc/voc6/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + 
--stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc6/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc6/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh deleted file mode 100755 index f90db9150..000000000 --- a/examples/ljspeech/tts0/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh new file mode 120000 index 000000000..7f54e9239 --- /dev/null +++ 
b/examples/ljspeech/tts0/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/tts0/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh deleted file mode 100755 index a37cd21e3..000000000 --- a/examples/ljspeech/tts0/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=tacotron2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh new file mode 120000 index 000000000..9e1fdbd16 --- /dev/null +++ b/examples/ljspeech/tts0/path.sh @@ -0,0 +1 @@ +../../csmsc/tts0/path.sh \ No newline at end of file diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh deleted file mode 100755 index d1302f99f..000000000 --- a/examples/ljspeech/tts3/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh new file mode 120000 index 000000000..d7b05058e --- /dev/null +++ b/examples/ljspeech/tts3/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/tts3/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/ljspeech/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` 
- -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/ljspeech/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/local/preprocess.sh b/examples/ljspeech/voc1/local/preprocess.sh index d1af60dad..bfbf75b7d 100755 --- a/examples/ljspeech/voc1/local/preprocess.sh +++ b/examples/ljspeech/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/ljspeech/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - 
--checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/ljspeech/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/ljspeech/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/ljspeech/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/path.sh b/examples/ljspeech/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/ljspeech/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/ljspeech/voc1/path.sh 
b/examples/ljspeech/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/ljspeech/voc1/path.sh @@ -0,0 +1 @@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/preprocess.sh b/examples/ljspeech/voc5/local/preprocess.sh deleted file mode 100755 index d1af60dad..000000000 --- a/examples/ljspeech/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./ljspeech_alignment \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/LJSpeech-1.1/ \ - --dataset=ljspeech \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/ljspeech/voc5/local/preprocess.sh b/examples/ljspeech/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/ljspeech/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/synthesize.sh b/examples/ljspeech/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/ljspeech/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git a/examples/ljspeech/voc5/local/synthesize.sh b/examples/ljspeech/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/ljspeech/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/train.sh b/examples/ljspeech/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/ljspeech/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ 
-FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/ljspeech/voc5/local/train.sh b/examples/ljspeech/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/ljspeech/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/path.sh b/examples/ljspeech/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/ljspeech/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/ljspeech/voc5/path.sh b/examples/ljspeech/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/ljspeech/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/vctk/ernie_sat/local/train.sh b/examples/vctk/ernie_sat/local/train.sh deleted file mode 100755 index 526aac435..000000000 --- a/examples/vctk/ernie_sat/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=8 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/vctk/ernie_sat/local/train.sh 
b/examples/vctk/ernie_sat/local/train.sh new file mode 120000 index 000000000..9f1d2346d --- /dev/null +++ b/examples/vctk/ernie_sat/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/train.sh \ No newline at end of file diff --git a/examples/vctk/ernie_sat/path.sh b/examples/vctk/ernie_sat/path.sh deleted file mode 100755 index 4ecab0251..000000000 --- a/examples/vctk/ernie_sat/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=ernie_sat -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} \ No newline at end of file diff --git a/examples/vctk/ernie_sat/path.sh b/examples/vctk/ernie_sat/path.sh new file mode 120000 index 000000000..5ec397590 --- /dev/null +++ b/examples/vctk/ernie_sat/path.sh @@ -0,0 +1 @@ +../../aishell3/ernie_sat/path.sh \ No newline at end of file diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh deleted file mode 100755 index 3a5076505..000000000 --- a/examples/vctk/tts3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt \ - --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh new file mode 120000 index 000000000..78885a300 --- /dev/null +++ b/examples/vctk/tts3/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/tts3/local/train.sh \ No newline at end of file diff --git a/examples/vctk/tts3/path.sh b/examples/vctk/tts3/path.sh deleted 
file mode 100755 index fb7e8411c..000000000 --- a/examples/vctk/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/vctk/tts3/path.sh b/examples/vctk/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/vctk/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/vctk/voc1/local/preprocess.sh b/examples/vctk/voc1/local/preprocess.sh index 88a478cd5..6b7e5288a 100755 --- a/examples/vctk/voc1/local/preprocess.sh +++ b/examples/vctk/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/vctk/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ 
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/vctk/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/vctk/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/vctk/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/vctk/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of 
file diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/vctk/voc1/path.sh @@ -0,0 +1 @@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh deleted file mode 100755 index 88a478cd5..000000000 --- a/examples/vctk/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./vctk_alignment \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/VCTK-Corpus-0.92/ \ - --dataset=vctk \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/vctk/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/vctk/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/vctk/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/vctk/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python 
${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/vctk/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/vctk/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/vctk/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/zh_en_tts/tts3/local/train.sh b/examples/zh_en_tts/tts3/local/train.sh deleted file mode 100755 index 1da72f117..000000000 --- a/examples/zh_en_tts/tts3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/zh_en_tts/tts3/local/train.sh b/examples/zh_en_tts/tts3/local/train.sh new file mode 120000 index 000000000..78885a300 --- /dev/null 
+++ b/examples/zh_en_tts/tts3/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/tts3/local/train.sh \ No newline at end of file diff --git a/examples/zh_en_tts/tts3/path.sh b/examples/zh_en_tts/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/zh_en_tts/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/zh_en_tts/tts3/path.sh b/examples/zh_en_tts/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/zh_en_tts/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 004143361..7a7aef8b0 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -25,6 +25,9 @@ import librosa import numpy as np import paddle import soundfile +from paddlespeech.audio.transform.transformation import Transformation +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.utils.utility import UpdateConfig from yacs.config import CfgNode from ...utils.env import MODEL_HOME @@ -34,9 +37,6 @@ from ..log import logger from ..utils import CLI_TIMER from ..utils import stats_wrapper from ..utils import timer_register -from paddlespeech.audio.transform.transformation import Transformation -from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer -from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ['ASRExecutor'] @@ -62,8 +62,13 @@ class ASRExecutor(BaseExecutor): '--lang', type=str, default='zh', - help='Choose model language. 
zh or en, zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k]' + help='Choose model language. [zh, en, zh_en], zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k], zh_en:[conformer_talcs-codeswitch_zh_en-16k]' ) + self.parser.add_argument( + '--codeswitch', + type=bool, + default=False, + help='Choose whether use code-switch. True or False.') self.parser.add_argument( "--sample_rate", type=int, @@ -127,6 +132,7 @@ class ASRExecutor(BaseExecutor): def _init_from_path(self, model_type: str='wenetspeech', lang: str='zh', + codeswitch: bool=False, sample_rate: int=16000, cfg_path: Optional[os.PathLike]=None, decode_method: str='attention_rescoring', @@ -144,7 +150,12 @@ class ASRExecutor(BaseExecutor): if cfg_path is None or ckpt_path is None: sample_rate_str = '16k' if sample_rate == 16000 else '8k' - tag = model_type + '-' + lang + '-' + sample_rate_str + if lang == "zh_en" and codeswitch is True: + tag = model_type + '-' + 'codeswitch_' + lang + '-' + sample_rate_str + elif lang == "zh_en" or codeswitch is True: + raise Exception("codeswitch is true only in zh_en model") + else: + tag = model_type + '-' + lang + '-' + sample_rate_str self.task_resource.set_task_model(tag, version=None) self.res_path = self.task_resource.res_dir @@ -423,6 +434,7 @@ class ASRExecutor(BaseExecutor): model = parser_args.model lang = parser_args.lang + codeswitch = parser_args.codeswitch sample_rate = parser_args.sample_rate config = parser_args.config ckpt_path = parser_args.ckpt_path @@ -444,6 +456,7 @@ class ASRExecutor(BaseExecutor): audio_file=input_, model=model, lang=lang, + codeswitch=codeswitch, sample_rate=sample_rate, config=config, ckpt_path=ckpt_path, @@ -472,6 +485,7 @@ class ASRExecutor(BaseExecutor): audio_file: os.PathLike, model: str='conformer_u2pp_online_wenetspeech', lang: str='zh', + codeswitch: bool=False, sample_rate: int=16000, config: os.PathLike=None, ckpt_path: os.PathLike=None, @@ -485,8 +499,8 @@ class 
ASRExecutor(BaseExecutor): """ audio_file = os.path.abspath(audio_file) paddle.set_device(device) - self._init_from_path(model, lang, sample_rate, config, decode_method, - num_decoding_left_chunks, ckpt_path) + self._init_from_path(model, lang, codeswitch, sample_rate, config, + decode_method, num_decoding_left_chunks, ckpt_path) if not self._check(audio_file, sample_rate, force_yes): sys.exit(-1) if rtf: diff --git a/paddlespeech/cli/base_commands.py b/paddlespeech/cli/base_commands.py index 767d0df78..dfeb5cae5 100644 --- a/paddlespeech/cli/base_commands.py +++ b/paddlespeech/cli/base_commands.py @@ -14,6 +14,7 @@ import argparse from typing import List +import numpy from prettytable import PrettyTable from ..resource import CommonTaskResource @@ -78,7 +79,7 @@ class VersionCommand: model_name_format = { - 'asr': 'Model-Language-Sample Rate', + 'asr': 'Model-Size-Code Switch-Multilingual-Language-Sample Rate', 'cls': 'Model-Sample Rate', 'st': 'Model-Source language-Target language', 'text': 'Model-Task-Language', @@ -111,7 +112,21 @@ class StatsCommand: fields = model_name_format[self.task].split("-") table = PrettyTable(fields) for key in pretrained_models: - table.add_row(key.split("-")) + line = key.split("-") + if self.task == "asr" and len(line) < len(fields): + for i in range(len(line), len(fields)): + line.append("-") + if "codeswitch" in key: + line[3], line[1] = line[1].split("_")[0], line[1].split( + "_")[1:] + elif "multilingual" in key: + line[4], line[1] = line[1].split("_")[0], line[1].split( + "_")[1:] + tmp = numpy.array(line) + idx = [0, 5, 3, 4, 1, 2] + line = tmp[idx] + table.add_row(line) + print(table) def execute(self, argv: List[str]) -> bool: diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index 3c5aa1f90..ff0b30f6d 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -30,6 +30,7 @@ __all__ = [ ] # The tags for pretrained_models 
should be "{model_name}[_{dataset}][-{lang}][-...]". +# Add code-switch and multilingual tag, "{model_name}[_{dataset}]-[codeswitch/multilingual][_{lang}][-...]". # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k". # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" @@ -322,6 +323,18 @@ asr_dynamic_pretrained_models = { '099a601759d467cd0a8523ff939819c5' }, }, + "conformer_talcs-codeswitch_zh_en-16k": { + '1.4': { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/tal_cs/asr1/asr1_conformer_talcs_ckpt_1.4.0.model.tar.gz', + 'md5': + '01962c5d0a70878fe41cacd4f61e14d1', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/conformer/checkpoints/avg_10' + }, + }, } asr_static_pretrained_models = { diff --git a/paddlespeech/s2t/models/whisper/tokenizer.py b/paddlespeech/s2t/models/whisper/tokenizer.py index 8bd85c914..1e1aea044 100644 --- a/paddlespeech/s2t/models/whisper/tokenizer.py +++ b/paddlespeech/s2t/models/whisper/tokenizer.py @@ -155,6 +155,10 @@ class Tokenizer: if ids < len(self.tokenizer): ids_list.append(ids) token_ids = ids_list + elif len(token_ids) == 1: + token_ids = token_ids[0] + else: + raise ValueError(f"token_ids {token_ids} load error.") return self.tokenizer.decode(token_ids, **kwargs) diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py index 63cafbdb7..9cf9a9eca 100644 --- a/paddlespeech/s2t/models/whisper/whipser.py +++ b/paddlespeech/s2t/models/whisper/whipser.py @@ -17,12 +17,11 @@ from typing import Union import numpy as np import paddle import paddle.nn.functional as F +import paddlespeech.s2t.modules.align as paddlespeech_nn import soundfile import tqdm from paddle import nn from paddle.distribution import Categorical - -import paddlespeech.s2t.modules.align as paddlespeech_nn from paddlespeech.s2t.models.whisper import utils from 
paddlespeech.s2t.models.whisper.tokenizer import get_tokenizer from paddlespeech.s2t.models.whisper.tokenizer import LANGUAGES @@ -477,7 +476,7 @@ def transcribe( decode_options["fp16"] = False if decode_options.get( - "language", 'None') or decode_options.get("language", None) is None: + "language") == 'None' or decode_options.get("language", None) is None: if not model.is_multilingual: decode_options["language"] = "en" else: @@ -771,8 +770,10 @@ class GreedyDecoder(TokenDecoder): if temperature == 0: next_tokens = paddle.argmax(logits, axis=-1) else: - next_tokens = Categorical(logits=logits / temperature).sample( - shape=logits.shape) + next_tokens = Categorical(logits=logits / temperature).sample([1]) + next_tokens = paddle.reshape(next_tokens, [ + next_tokens.shape[0] * next_tokens.shape[1], + ]) logprobs = F.log_softmax(logits, axis=-1, dtype=paddle.float32) current_logprobs = logprobs[paddle.arange(logprobs.shape[0]), @@ -1205,9 +1206,8 @@ class DecodingTask: DecodingResult( audio_features=features, language=language, - language_probs=probs) - for features, language, probs in zip(audio_features, languages, - language_probs) + language_probs=probs) for features, language, probs in + zip(audio_features, languages, language_probs) ] # repeat the audio & text tensors by the group size, for beam search or best-of-n sampling diff --git a/paddlespeech/s2t/training/gradclip.py b/paddlespeech/s2t/training/gradclip.py index 26ac501e2..be6fcf589 100644 --- a/paddlespeech/s2t/training/gradclip.py +++ b/paddlespeech/s2t/training/gradclip.py @@ -43,8 +43,8 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): if g.type == core.VarDesc.VarType.SELECTED_ROWS: merge_grad = layers.merge_selected_rows(g) merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - square = layers.square(merge_grad) - sum_square = layers.reduce_sum(square) + square = paddle.square(merge_grad) + sum_square = paddle.sum(square) sum_square_list.append(sum_square) # debug log, 
not dump all since slow down train process @@ -57,23 +57,24 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): return params_grads global_norm_var = layers.concat(sum_square_list) - global_norm_var = layers.reduce_sum(global_norm_var) - global_norm_var = layers.sqrt(global_norm_var) + global_norm_var = paddle.sum(global_norm_var) + global_norm_var = paddle.sqrt(global_norm_var) + # debug log logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!") max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) - clip_var = layers.elementwise_div( + clip_var = paddle.divide( x=max_global_norm, - y=layers.elementwise_max(x=global_norm_var, y=max_global_norm)) + y=paddle.maximum(x=global_norm_var, y=max_global_norm)) for i, (p, g) in enumerate(params_grads): if g is None: continue if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue - new_grad = layers.elementwise_mul(x=g, y=clip_var) + new_grad = paddle.multiply(x=g, y=clip_var) params_and_grads.append((p, new_grad)) # debug log, not dump all since slow down train process diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index 1b1792bd1..299a8c3d4 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -16,14 +16,9 @@ import sys import warnings from typing import List +import numpy import uvicorn from fastapi import FastAPI -from prettytable import PrettyTable -from starlette.middleware.cors import CORSMiddleware - -from ..executor import BaseExecutor -from ..util import cli_server_register -from ..util import stats_wrapper from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource from paddlespeech.server.engine.engine_pool import init_engine_pool @@ -31,6 +26,12 @@ from paddlespeech.server.engine.engine_warmup import warm_up from paddlespeech.server.restful.api import 
setup_router as setup_http_router from paddlespeech.server.utils.config import get_config from paddlespeech.server.ws.api import setup_router as setup_ws_router +from prettytable import PrettyTable +from starlette.middleware.cors import CORSMiddleware + +from ..executor import BaseExecutor +from ..util import cli_server_register +from ..util import stats_wrapper warnings.filterwarnings("ignore") __all__ = ['ServerExecutor', 'ServerStatsExecutor'] @@ -134,7 +135,7 @@ class ServerStatsExecutor(): required=True) self.task_choices = ['asr', 'tts', 'cls', 'text', 'vector'] self.model_name_format = { - 'asr': 'Model-Language-Sample Rate', + 'asr': 'Model-Size-Code Switch-Multilingual-Language-Sample Rate', 'tts': 'Model-Language', 'cls': 'Model-Sample Rate', 'text': 'Model-Task-Language', @@ -145,7 +146,20 @@ class ServerStatsExecutor(): fields = self.model_name_format[self.task].split("-") table = PrettyTable(fields) for key in pretrained_models: - table.add_row(key.split("-")) + line = key.split("-") + if self.task == "asr" and len(line) < len(fields): + for i in range(len(line), len(fields)): + line.append("-") + if "codeswitch" in key: + line[3], line[1] = line[1].split("_")[0], line[1].split( + "_")[1:] + elif "multilingual" in key: + line[4], line[1] = line[1].split("_")[0], line[1].split( + "_")[1:] + tmp = numpy.array(line) + idx = [0, 5, 3, 4, 1, 2] + line = tmp[idx] + table.add_row(line) print(table) def execute(self, argv: List[str]) -> bool: diff --git a/paddlespeech/t2s/exps/vits/normalize.py b/paddlespeech/t2s/exps/vits/normalize.py index 514cbef8e..24e15765e 100644 --- a/paddlespeech/t2s/exps/vits/normalize.py +++ b/paddlespeech/t2s/exps/vits/normalize.py @@ -187,7 +187,7 @@ def main(): record["spk_emb"] = str(item["spk_emb"]) output_metadata.append(record) - output_metadata.sort(key=itemgetter('feats_lengths')) + output_metadata.sort(key=itemgetter('feats_lengths'), reverse=True) output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" with 
jsonlines.open(output_metadata_path, 'w') as writer: for item in output_metadata: diff --git a/paddlespeech/t2s/exps/vits/preprocess.py b/paddlespeech/t2s/exps/vits/preprocess.py index 2b1a40834..d6b226a20 100644 --- a/paddlespeech/t2s/exps/vits/preprocess.py +++ b/paddlespeech/t2s/exps/vits/preprocess.py @@ -166,7 +166,7 @@ def process_sentences(config, if record: results.append(record) - results.sort(key=itemgetter("feats_lengths")) + results.sort(key=itemgetter("feats_lengths"), reverse=True) with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer: for item in results: writer.write(item) diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index 07301db56..f6a31ced2 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -110,7 +110,7 @@ def train_sp(args, config): train_sampler = ErnieSATSampler( train_dataset, batch_size=config.batch_size, - shuffle=True, + shuffle=False, drop_last=True) dev_sampler = ErnieSATSampler( dev_dataset, diff --git a/paddlespeech/t2s/frontend/g2pw/onnx_api.py b/paddlespeech/t2s/frontend/g2pw/onnx_api.py index 47c26a610..3ce3d246d 100644 --- a/paddlespeech/t2s/frontend/g2pw/onnx_api.py +++ b/paddlespeech/t2s/frontend/g2pw/onnx_api.py @@ -100,7 +100,7 @@ class G2PWOnnxConverter: ] self.non_polyphonic = { '一', '不', '和', '咋', '嗲', '剖', '差', '攢', '倒', '難', '奔', '勁', '拗', - '肖', '瘙', '誒', '泊', '听' + '肖', '瘙', '誒', '泊', '听', '噢' } self.non_monophonic = {'似', '攢'} self.monophonic_chars = [ diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py index 19c98d53f..c13a5ab62 100644 --- a/paddlespeech/t2s/frontend/mix_frontend.py +++ b/paddlespeech/t2s/frontend/mix_frontend.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import re from typing import Dict from typing import List @@ -18,6 +19,7 @@ import paddle from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor class MixFrontend(): @@ -107,7 +109,40 @@ class MixFrontend(): add_sp: bool=True, to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: - segments = self.get_segment(sentence) + ''' 1. 添加SSML支持,先列出 文字 和 标签内容, + 然后添加到tmpSegments数组里 + ''' + d_inputs = MixTextProcessor.get_dom_split(sentence) + tmpSegments = [] + for instr in d_inputs: + ''' 暂时只支持 say-as ''' + if instr.lower().startswith("" + segments.append(tuple(currentSeg)) + segments.append(seg) + currentSeg = ["", ""] + else: + if currentSeg[0] == '': + currentSeg[0] = seg[0] + currentSeg[1] = seg[1] + else: + currentSeg[0] = currentSeg[0] + seg[0] + if currentSeg[0] != '': + currentSeg[0] = "" + currentSeg[0] + "" + segments.append(tuple(currentSeg)) phones_list = [] result = {} @@ -120,11 +155,21 @@ class MixFrontend(): input_ids = self.en_frontend.get_input_ids( content, merge_sentences=False, to_tensor=to_tensor) else: - input_ids = self.zh_frontend.get_input_ids( - content, - merge_sentences=False, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) + ''' 3. 
把带speak tag的中文和普通文字分开处理 + ''' + if content.strip() != "" and \ + re.match(r".*?.*?.*", content, re.DOTALL): + input_ids = self.zh_frontend.get_input_ids_ssml( + content, + merge_sentences=False, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) + else: + input_ids = self.zh_frontend.get_input_ids( + content, + merge_sentences=False, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) if add_sp: input_ids["phone_ids"][-1] = paddle.concat( [input_ids["phone_ids"][-1], self.sp_id_tensor]) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index ddd8cf5c7..efb673e36 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -138,7 +138,7 @@ class Frontend(): "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿", "流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿", "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿", - "狗儿" + "狗儿", "少儿" } self.vocab_phones = {} diff --git a/paddlespeech/t2s/models/vits/text_encoder.py b/paddlespeech/t2s/models/vits/text_encoder.py index 799e0c759..015ed76c6 100644 --- a/paddlespeech/t2s/models/vits/text_encoder.py +++ b/paddlespeech/t2s/models/vits/text_encoder.py @@ -24,6 +24,7 @@ from paddle import nn from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder as Encoder +from paddlespeech.utils.initialize import normal_ class TextEncoder(nn.Layer): @@ -105,10 +106,6 @@ class TextEncoder(nn.Layer): # define modules self.emb = nn.Embedding(vocabs, attention_dim) - dist = paddle.distribution.Normal(loc=0.0, scale=attention_dim**-0.5) - w = dist.sample(self.emb.weight.shape) - self.emb.weight.set_value(w) - self.encoder = Encoder( idim=-1, input_layer=None, @@ -130,6 +127,8 @@ class TextEncoder(nn.Layer): cnn_module_kernel=conformer_kernel_size, ) self.proj = nn.Conv1D(attention_dim, attention_dim * 2, 1) + self.reset_parameters() + 
def forward( self, x: paddle.Tensor, @@ -166,3 +165,9 @@ class TextEncoder(nn.Layer): m, logs = paddle.split(stats, 2, axis=1) return x, m, logs, x_mask + + def reset_parameters(self): + normal_(self.emb.weight, mean=0.0, std=self.attention_dim**-0.5) + if self.emb._padding_idx is not None: + with paddle.no_grad(): + self.emb.weight[self.emb._padding_idx] = 0 diff --git a/paddlespeech/t2s/models/vits/vits.py b/paddlespeech/t2s/models/vits/vits.py index 0ff3a546d..e68ed5643 100644 --- a/paddlespeech/t2s/models/vits/vits.py +++ b/paddlespeech/t2s/models/vits/vits.py @@ -13,6 +13,7 @@ # limitations under the License. # Modified from espnet(https://github.com/espnet/espnet) """VITS module""" +import math from typing import Any from typing import Dict from typing import Optional @@ -27,7 +28,12 @@ from paddlespeech.t2s.models.hifigan import HiFiGANMultiScaleMultiPeriodDiscrimi from paddlespeech.t2s.models.hifigan import HiFiGANPeriodDiscriminator from paddlespeech.t2s.models.hifigan import HiFiGANScaleDiscriminator from paddlespeech.t2s.models.vits.generator import VITSGenerator -from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.utils.initialize import _calculate_fan_in_and_fan_out +from paddlespeech.utils.initialize import kaiming_uniform_ +from paddlespeech.utils.initialize import normal_ +from paddlespeech.utils.initialize import ones_ +from paddlespeech.utils.initialize import uniform_ +from paddlespeech.utils.initialize import zeros_ AVAILABLE_GENERATERS = { "vits_generator": VITSGenerator, @@ -152,8 +158,7 @@ class VITS(nn.Layer): "use_spectral_norm": False, }, }, - cache_generator_outputs: bool=True, - init_type: str="xavier_uniform", ): + cache_generator_outputs: bool=True, ): """Initialize VITS module. 
Args: idim (int): @@ -179,9 +184,6 @@ class VITS(nn.Layer): assert check_argument_types() super().__init__() - # initialize parameters - initialize(self, init_type) - # define modules generator_class = AVAILABLE_GENERATERS[generator_type] if generator_type == "vits_generator": @@ -196,8 +198,6 @@ class VITS(nn.Layer): self.discriminator = discriminator_class( **discriminator_params, ) - nn.initializer.set_global_initializer(None) - # cache self.cache_generator_outputs = cache_generator_outputs self._cache = None @@ -214,6 +214,10 @@ class VITS(nn.Layer): self.reuse_cache_gen = True self.reuse_cache_dis = True + self.reset_parameters() + self.generator.decoder.reset_parameters() + self.generator.text_encoder.reset_parameters() + def forward( self, text: paddle.Tensor, @@ -243,7 +247,7 @@ class VITS(nn.Layer): forward_generator (bool): Whether to forward generator. Returns: - + """ if forward_generator: return self._forward_generator( @@ -290,7 +294,7 @@ class VITS(nn.Layer): lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). 
Returns: - + """ # setup feats = feats.transpose([0, 2, 1]) @@ -497,3 +501,34 @@ class VITS(nn.Layer): lids, ) return dict(wav=paddle.reshape(wav, [-1])) + + def reset_parameters(self): + def _reset_parameters(module): + if isinstance(module, + (nn.Conv1D, nn.Conv1DTranspose, nn.Conv2D, nn.Conv2DTranspose)): + kaiming_uniform_(module.weight, a=math.sqrt(5)) + if module.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out(module.weight) + if fan_in != 0: + bound = 1 / math.sqrt(fan_in) + uniform_(module.bias, -bound, bound) + + if isinstance(module, + (nn.BatchNorm1D, nn.BatchNorm2D, nn.GroupNorm, nn.LayerNorm)): + ones_(module.weight) + zeros_(module.bias) + + if isinstance(module, nn.Linear): + kaiming_uniform_(module.weight, a=math.sqrt(5)) + if module.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out(module.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + uniform_(module.bias, -bound, bound) + + if isinstance(module, nn.Embedding): + normal_(module.weight) + if module._padding_idx is not None: + with paddle.no_grad(): + module.weight[module._padding_idx] = 0 + + self.apply(_reset_parameters) diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/ssml/xml_processor.py index b39121347..892ca371e 100644 --- a/paddlespeech/t2s/ssml/xml_processor.py +++ b/paddlespeech/t2s/ssml/xml_processor.py @@ -74,6 +74,28 @@ class MixTextProcessor(): ctlist.append([mixstr, []]) return ctlist + @classmethod + def get_dom_split(self, mixstr): + ''' 文本分解,顺序加了列表中,返回文本和say-as标签 + ''' + ctlist = [] + patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) + mat = re.match(patn, mixstr) + if mat: + pre_xml = mat.group(1) + in_xml = mat.group(2) + after_xml = mat.group(3) + + ctlist.append(pre_xml) + dom = DomXml(in_xml) + tags = dom.get_text_and_sayas_tags() + ctlist.extend(tags) + + ctlist.append(after_xml) + return ctlist + else: + ctlist.append(mixstr) + return ctlist class DomXml(): def __init__(self, xmlstr): @@ -156,3 +178,15 @@ 
class DomXml(): if x.hasAttribute('pinyin'): # pinyin print(x.tagName, 'pinyin', x.getAttribute('pinyin'), x.firstChild.data) + + def get_text_and_sayas_tags(self): + '''返回 xml 内容的列表,包括所有文本内容和 tag''' + res = [] + + for x1 in self.rnode: + if x1.nodeType == Node.TEXT_NODE: + res.append(x1.value) + else: + for x2 in x1.childNodes: + res.append(x2.toxml()) + return res diff --git a/paddlespeech/utils/initialize.py b/paddlespeech/utils/initialize.py new file mode 100644 index 000000000..8ebe6845e --- /dev/null +++ b/paddlespeech/utils/initialize.py @@ -0,0 +1,321 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py +Ths copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file. 
+""" +import math + +import numpy as np +import paddle +import paddle.nn as nn + +__all__ = [ + "uniform_", + "normal_", + "constant_", + "ones_", + "zeros_", + "xavier_uniform_", + "xavier_normal_", + "kaiming_uniform_", + "kaiming_normal_", + "linear_init_", + "conv_init_", + "reset_initialized_parameter", + "_calculate_fan_in_and_fan_out", +] + + +def _no_grad_uniform_(tensor, a, b): + with paddle.no_grad(): + tensor.set_value( + paddle.uniform( + shape=tensor.shape, dtype=tensor.dtype, min=a, max=b)) + return tensor + + +def _no_grad_normal_(tensor, mean=0.0, std=1.0): + with paddle.no_grad(): + tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape)) + return tensor + + +def _no_grad_fill_(tensor, value=0.0): + with paddle.no_grad(): + tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype)) + return tensor + + +def uniform_(tensor, a, b): + """ + Modified tensor inspace using uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + a (float|int): min value. + b (float|int): max value. + Return: + tensor + """ + return _no_grad_uniform_(tensor, a, b) + + +def normal_(tensor, mean=0.0, std=1.0): + """ + Modified tensor inspace using normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mean (float|int): mean value. + std (float|int): std value. + Return: + tensor + """ + return _no_grad_normal_(tensor, mean, std) + + +def constant_(tensor, value=0.0): + """ + Modified tensor inspace using constant_ + Args: + tensor (paddle.Tensor): paddle Tensor + value (float|int): value to fill tensor. 
+ Return: + tensor + """ + return _no_grad_fill_(tensor, value) + + +def ones_(tensor): + """ + Modified tensor inspace using ones_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 1) + + +def zeros_(tensor): + """ + Modified tensor inspace using zeros_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 0) + + +def vector_(tensor, vector): + with paddle.no_grad(): + tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype)) + return tensor + + +def _calculate_fan_in_and_fan_out(tensor, reverse=False): + """ + Calculate (fan_in, _fan_out) for tensor + Args: + tensor (Tensor): paddle.Tensor + reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True + Return: + Tuple[fan_in, fan_out] + """ + if tensor.ndim < 2: + raise ValueError( + "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" + ) + + if reverse: + num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] + else: + num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0] + + receptive_field_size = 1 + if tensor.ndim > 2: + receptive_field_size = np.prod(tensor.shape[2:]) + + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def xavier_uniform_(tensor, gain=1.0, reverse=False): + """ + Modified tensor inspace using xavier_uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def xavier_normal_(tensor, gain=1.0, reverse=False): + """ + Modified tensor inspace using xavier_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + return _no_grad_normal_(tensor, 0, std) + + +# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html +def _calculate_correct_fan(tensor, mode, reverse=False): + mode = mode.lower() + valid_modes = ["fan_in", "fan_out"] + if mode not in valid_modes: + raise ValueError("Mode {} not supported, please use one of {}".format( + mode, valid_modes)) + + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) + + return fan_in if mode == "fan_in" else fan_out + + +def _calculate_gain(nonlinearity, param=None): + linear_fns = [ + "linear", "conv1d", "conv2d", "conv3d", "conv_transpose1d", + "conv_transpose2d", "conv_transpose3d" + ] + if nonlinearity in linear_fns or nonlinearity == "sigmoid": + return 1 + elif nonlinearity == "tanh": + return 5.0 / 3 + elif nonlinearity == "relu": + return math.sqrt(2.0) + elif nonlinearity == "leaky_relu": + if param is None: + negative_slope = 0.01 + elif not isinstance(param, bool) and isinstance( + param, int) or isinstance(param, float): + # True/False are instances of int, hence check above + negative_slope = param + else: + raise ValueError( + "negative_slope {} not a valid number".format(param)) + return math.sqrt(2.0 / (1 + negative_slope**2)) + elif nonlinearity == "selu": + return 3.0 / 4 + else: + raise 
ValueError("Unsupported nonlinearity {}".format(nonlinearity)) + + +def kaiming_uniform_(tensor, + a=0, + mode="fan_in", + nonlinearity="leaky_relu", + reverse=False): + """ + Modified tensor inspace using kaiming_uniform method + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def kaiming_normal_(tensor, + a=0, + mode="fan_in", + nonlinearity="leaky_relu", + reverse=False): + """ + Modified tensor inspace using kaiming_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + return _no_grad_normal_(tensor, 0, std) + + +def linear_init_(module): + bound = 1 / math.sqrt(module.weight.shape[0]) + uniform_(module.weight, -bound, bound) + uniform_(module.bias, -bound, bound) + + +def conv_init_(module): + bound = 1 / np.sqrt(np.prod(module.weight.shape[1:])) + uniform_(module.weight, -bound, bound) + if module.bias is not None: + uniform_(module.bias, -bound, bound) + + +def bias_init_with_prob(prior_prob=0.01): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +@paddle.no_grad() +def reset_initialized_parameter(model, include_self=True): + """ + Reset initialized parameter using following method for [conv, linear, embedding, bn] + Args: + model (paddle.Layer): paddle Layer + include_self (bool: False): include_self for Layer.named_sublayers method. 
Indicates whether to include the Layer itself + Return: + None + """ + for _, m in model.named_sublayers(include_self=include_self): + if isinstance(m, nn.Conv2D): + k = float(m._groups) / (m._in_channels * m._kernel_size[0] * + m._kernel_size[1]) + k = math.sqrt(k) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Linear): + k = math.sqrt(1.0 / m.weight.shape[0]) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Embedding): + _no_grad_normal_(m.weight, mean=0.0, std=1.0) + + elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)): + _no_grad_fill_(m.weight, 1.0) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_fill_(m.bias, 0) diff --git a/setup.py b/setup.py index 212d3b109..be6cf63a9 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,6 @@ base = [ "paddlespeech_feat", "Pillow>=9.0.0", "praatio==5.0.0", - "protobuf>=3.1.0, <=3.20.0", "pypinyin<=0.44.0", "pypinyin-dict", "python-dateutil", @@ -72,12 +71,9 @@ base = [ "yacs~=0.1.8", "prettytable", "zhon", - "colorlog", - "pathos==0.2.8", "braceexpand", "pyyaml", - "pybind11", - "paddleslim==2.3.4", + "paddleslim>=2.3.4", "paddleaudio>=1.0.2", ] diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 3a58626d2..5d3b76f6c 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -14,7 +14,7 @@ paddlespeech ssl --task asr --lang en --input ./en.wav paddlespeech ssl --task vector --lang en --input ./en.wav # Speech_recognition -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav paddlespeech asr --input ./zh.wav paddlespeech asr 
--model conformer_aishell --input ./zh.wav paddlespeech asr --model conformer_online_aishell --input ./zh.wav @@ -26,6 +26,7 @@ paddlespeech asr --model deepspeech2offline_aishell --input ./zh.wav paddlespeech asr --model deepspeech2online_wenetspeech --input ./zh.wav paddlespeech asr --model deepspeech2online_aishell --input ./zh.wav paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.wav +paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav # Support editing num_decoding_left_chunks paddlespeech asr --model conformer_online_wenetspeech --num_decoding_left_chunks 3 --input ./zh.wav