diff --git a/.gitignore b/.gitignore
index 75f56b604..4a0c43312 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@
*.egg-info
build
*output/
+.history
audio/dist/
audio/fc_patch/
diff --git a/README.md b/README.md
index dbdf6a4f8..afc4e4d09 100644
--- a/README.md
+++ b/README.md
@@ -157,6 +157,8 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
- 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV).
### Recent Update
+- 🔥 2023.01.10: Add [Code-Switch ASR CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition).
+- 👑 2023.01.06: Add [Code-Switch ASR tal_cs recipe](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/).
- 🎉 2022.12.02: Add [end-to-end Prosody Prediction pipeline](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (including using prosody labels in Acoustic Model).
- 🎉 2022.11.30: Add [TTS Android Demo](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid).
- 🤗 2022.11.28: PP-TTS and PP-ASR demos are available in [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) and [official website
@@ -189,7 +191,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
- Scan the QR code below with your Wechat, you can access to official technical exchange group and get the bonus ( more than 20GB learning materials, such as papers, codes and videos ) and the live link of the lessons. Look forward to your participation.
-

+
## Installation
diff --git a/README_cn.md b/README_cn.md
index 5cc156c9f..ecc4644aa 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -164,6 +164,8 @@
### 近期更新
+- 🔥 2023.01.10: 新增 [中英混合 ASR CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition)。
+- 👑 2023.01.06: 新增 [中英混合 ASR tal_cs 训练推理流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/)。
- 🎉 2022.12.02: 新增 [端到端韵律预测全流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (包含在声学模型中使用韵律标签)。
- 🎉 2022.11.30: 新增 [TTS Android 部署示例](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid)。
- 🤗 2022.11.28: PP-TTS and PP-ASR 示例可在 [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) 和[飞桨官网](https://www.paddlepaddle.org.cn/models)体验!
@@ -200,7 +202,7 @@
微信扫描二维码关注公众号,点击“马上报名”填写问卷加入官方交流群,获得更高效的问题答疑,与各行各业开发者充分交流,期待您的加入。
-

+
diff --git a/audio/paddleaudio/_internal/module_utils.py b/audio/paddleaudio/_internal/module_utils.py
index 7b3230de9..becd23cd8 100644
--- a/audio/paddleaudio/_internal/module_utils.py
+++ b/audio/paddleaudio/_internal/module_utils.py
@@ -67,8 +67,11 @@ def deprecated(direction: str, version: Optional[str]=None):
def is_kaldi_available():
-    return is_module_available("paddleaudio._paddleaudio")
-
+    try:
+        from paddleaudio import _paddleaudio
+        return True
+    except Exception:
+        return False
def requires_kaldi():
if is_kaldi_available():
@@ -128,9 +131,11 @@ def requires_soundfile():
def is_sox_available():
-    if platform.system() == "Windows":  # not support sox in windows
+    try:
+        from paddleaudio import _paddleaudio
+        return True
+    except Exception:
         return False
-    return is_module_available("paddleaudio._paddleaudio")
def requires_sox():
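
The availability checks above now attempt the real extension import instead of relying on `is_module_available`, so a present-but-broken `_paddleaudio` build is reported as unavailable rather than failing later at call time. A minimal standalone sketch of the same probe pattern (the module name is the only assumption):

```python
import importlib


def is_native_ext_available(module_name: str = "paddleaudio._paddleaudio") -> bool:
    """Return True only if the extension exists *and* imports cleanly."""
    try:
        # importlib raises on missing modules and on broken builds alike,
        # which is exactly what a capability check wants to detect.
        importlib.import_module(module_name)
        return True
    except Exception:
        return False


print(is_native_ext_available())  # False unless paddleaudio's C++ extension is installed
```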
diff --git a/audio/setup.py b/audio/setup.py
index 82e9a55a5..823e5dfad 100644
--- a/audio/setup.py
+++ b/audio/setup.py
@@ -40,14 +40,9 @@ COMMITID = 'none'
base = [
"kaldiio",
"librosa==0.8.1",
- "scipy>=1.0.0",
- "soundfile~=0.10",
- "colorlog",
- "pathos == 0.2.8",
+ "pathos",
"pybind11",
"parameterized",
- "tqdm",
- "scikit-learn"
]
requirements = {
@@ -273,7 +268,7 @@ def main():
},
# Package info
- packages=find_packages(include=('paddleaudio*')),
+ packages=find_packages(include=['paddleaudio*']),
package_data=lib_package_data,
ext_modules=setup_helpers.get_ext_modules(),
zip_safe=True,
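
Note the subtle fix in `find_packages`: `include=('paddleaudio*')` is a plain string, since parentheses alone do not create a tuple, and setuptools then iterates it as one-character patterns, including a bare `*` that matches every package name; `include=['paddleaudio*']` passes a single glob as intended. A small sketch of the pitfall, using `fnmatch` as a rough stand-in for setuptools' internal pattern filter:

```python
from fnmatch import fnmatch

as_string = ('paddleaudio*')  # just a str: ('x') == 'x'
as_list = ['paddleaudio*']    # a one-element list with one glob pattern

print(type(as_string).__name__)  # str
print(list(as_string))           # ['p', 'a', ..., '*'] -- iterated as 12 one-char patterns
# The stray '*' pattern matches *every* package name, not just paddleaudio's:
print(any(fnmatch("some_other_pkg", pat) for pat in as_string))  # True (over-inclusive)
print(any(fnmatch("some_other_pkg", pat) for pat in as_list))    # False (correct)
```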
diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md
index c815a88af..ee2acd6fd 100644
--- a/demos/speech_recognition/README.md
+++ b/demos/speech_recognition/README.md
@@ -17,7 +17,7 @@ The input of this demo should be a WAV file(`.wav`), and the sample rate must be
Here are sample files for this demo that can be downloaded:
```bash
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
```
### 3. Usage
@@ -27,6 +27,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
paddlespeech asr --input ./zh.wav -v
# English
paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v
+ # Code-Switch
+ paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav -v
# Chinese ASR + Punctuation Restoration
paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v
```
@@ -40,6 +42,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
- `input`(required): Audio file to recognize.
- `model`: Model type of asr task. Default: `conformer_wenetspeech`.
- `lang`: Model language. Default: `zh`.
+  - `codeswitch`: Whether to use a code-switch model. Default: `False`.
- `sample_rate`: Sample rate of the model. Default: `16000`.
- `config`: Config of asr task. Use pretrained model when it is None. Default: `None`.
- `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`.
@@ -83,14 +86,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
Here is a list of pretrained models released by PaddleSpeech that can be used by command and python API:
-| Model | Language | Sample Rate
-| :--- | :---: | :---: |
-| conformer_wenetspeech | zh | 16k
-| conformer_online_multicn | zh | 16k
-| conformer_aishell | zh | 16k
-| conformer_online_aishell | zh | 16k
-| transformer_librispeech | en | 16k
-| deepspeech2online_wenetspeech | zh | 16k
-| deepspeech2offline_aishell| zh| 16k
-| deepspeech2online_aishell | zh | 16k
-| deepspeech2offline_librispeech | en | 16k
+| Model | Code Switch | Language | Sample Rate
+| :--- | :---: | :---: | :---: |
+| conformer_wenetspeech | False | zh | 16k
+| conformer_online_multicn | False | zh | 16k
+| conformer_aishell | False | zh | 16k
+| conformer_online_aishell | False | zh | 16k
+| transformer_librispeech | False | en | 16k
+| deepspeech2online_wenetspeech | False | zh | 16k
+| deepspeech2offline_aishell | False | zh | 16k
+| deepspeech2online_aishell | False | zh | 16k
+| deepspeech2offline_librispeech | False | en | 16k
+| conformer_talcs | True | zh_en | 16k
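
The new `conformer_talcs` entry is reachable from Python as well as the CLI; a hedged sketch follows, assuming `ASRExecutor` accepts a `codeswitch` argument mirroring the CLI's `--codeswitch` flag (the Python signature itself is not shown in this diff):

```python
import paddle
from paddlespeech.cli.asr.infer import ASRExecutor

asr = ASRExecutor()
# `codeswitch=True` is assumed to mirror the CLI's --codeswitch flag.
text = asr(
    model='conformer_talcs',
    lang='zh_en',
    codeswitch=True,
    sample_rate=16000,
    audio_file='./ch_zh_mix.wav',
    device=paddle.get_device())
print(text)
```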
diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md
index 13aa9f277..62dce3bc9 100644
--- a/demos/speech_recognition/README_cn.md
+++ b/demos/speech_recognition/README_cn.md
@@ -1,4 +1,5 @@
(简体中文|[English](./README.md))
# 语音识别
## 介绍
@@ -16,7 +17,7 @@
可以下载此 demo 的示例音频:
```bash
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
```
### 3. 使用方法
- 命令行 (推荐使用)
@@ -25,6 +26,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
paddlespeech asr --input ./zh.wav -v
# 英文
paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v
+ # 中英混合
+ paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav -v
# 中文 + 标点恢复
paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v
```
@@ -38,6 +41,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
- `input`(必须输入):用于识别的音频文件。
- `model`:ASR 任务的模型,默认值:`conformer_wenetspeech`。
- `lang`:模型语言,默认值:`zh`。
+  - `codeswitch`:是否使用中英混合模型,默认值:`False`。
- `sample_rate`:音频采样率,默认值:`16000`。
- `config`:ASR 任务的参数文件,若不设置则使用预训练模型中的默认配置,默认值:`None`。
- `ckpt_path`:模型参数文件,若不设置则下载预训练模型使用,默认值:`None`。
@@ -80,14 +84,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
### 4.预训练模型
以下是 PaddleSpeech 提供的可以被命令行和 python API 使用的预训练模型列表:
-| 模型 | 语言 | 采样率
-| :--- | :---: | :---: |
-| conformer_wenetspeech | zh | 16k
-| conformer_online_multicn | zh | 16k
-| conformer_aishell | zh | 16k
-| conformer_online_aishell | zh | 16k
-| transformer_librispeech | en | 16k
-| deepspeech2online_wenetspeech | zh | 16k
-| deepspeech2offline_aishell| zh| 16k
-| deepspeech2online_aishell | zh | 16k
-| deepspeech2offline_librispeech | en | 16k
+| 模型 | 中英混合 | 语言 | 采样率
+| :--- | :---: | :---: | :---: |
+| conformer_wenetspeech | False | zh | 16k
+| conformer_online_multicn | False | zh | 16k
+| conformer_aishell | False | zh | 16k
+| conformer_online_aishell | False | zh | 16k
+| transformer_librispeech | False | en | 16k
+| deepspeech2online_wenetspeech | False | zh | 16k
+| deepspeech2offline_aishell | False | zh | 16k
+| deepspeech2online_aishell | False | zh | 16k
+| deepspeech2offline_librispeech | False | en | 16k
+| conformer_talcs | True | zh_en | 16k
diff --git a/demos/speech_recognition/run.sh b/demos/speech_recognition/run.sh
index e48ff3e96..8ba6e4c3e 100755
--- a/demos/speech_recognition/run.sh
+++ b/demos/speech_recognition/run.sh
@@ -2,6 +2,7 @@
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
# asr
paddlespeech asr --input ./zh.wav
@@ -18,6 +19,11 @@ paddlespeech asr --help
# english asr
paddlespeech asr --lang en --model transformer_librispeech --input ./en.wav
+
+# code-switch asr
+paddlespeech asr --lang zh_en --codeswitch True --model conformer_talcs --input ./ch_zh_mix.wav
+
# model stats
paddlespeech stats --task asr
diff --git a/demos/speech_web/speech_server/requirements.txt b/demos/speech_web/speech_server/requirements.txt
index cdc654656..8425a1fee 100644
--- a/demos/speech_web/speech_server/requirements.txt
+++ b/demos/speech_web/speech_server/requirements.txt
@@ -1,8 +1,6 @@
aiofiles
faiss-cpu
-praatio==5.0.0
+praatio>=5.0.0
pydantic
python-multipart
-scikit_learn
starlette
-uvicorn
diff --git a/docs/requirements.txt b/docs/requirements.txt
index bd7f40ec3..609f27925 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,12 +1,9 @@
braceexpand
-colorlog
editdistance
-fastapi
g2p_en
g2pM
h5py
inflect
-jieba
jsonlines
kaldiio
keyboard
@@ -16,7 +13,7 @@ matplotlib
myst-parser
nara_wpe
numpydoc
-onnxruntime==1.10.0
+onnxruntime>=1.11.0
opencc
paddlenlp
# use paddlepaddle == 2.3.* according to: https://github.com/PaddlePaddle/Paddle/issues/48243
@@ -24,31 +21,24 @@ paddlepaddle>=2.2.2,<2.4.0
paddlespeech_ctcdecoders
paddlespeech_feat
pandas
-pathos==0.2.8
pattern_singleton
-Pillow>=9.0.0
-praatio==5.0.0
+ppdiffusers>=0.9.0
+praatio>=5.0.0
prettytable
pypinyin-dict
pypinyin<=0.44.0
python-dateutil
-pyworld==0.2.12
+pyworld>=0.2.12
recommonmark>=0.5.0
-resampy==0.2.2
+resampy
sacrebleu
-scipy
-sentencepiece~=0.1.96
-soundfile~=0.10
sphinx
sphinx-autobuild
sphinx-markdown-tables
sphinx_rtd_theme
textgrid
timer
-tqdm
typeguard
-uvicorn
-visualdl
webrtcvad
websockets
yacs~=0.1.8
diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh
deleted file mode 100755
index fb7e8411c..000000000
--- a/examples/aishell3/tts3/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=fastspeech2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh
new file mode 120000
index 000000000..4785b9095
--- /dev/null
+++ b/examples/aishell3/tts3/path.sh
@@ -0,0 +1 @@
+../../csmsc/tts3/path.sh
\ No newline at end of file
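
The `120000` file mode and one-line body mean each deleted copy above is replaced by a relative symlink into `csmsc` (the `\ No newline at end of file` marker is normal for symlink blobs, whose content is just the target path). A sketch of creating the same link, assuming it runs from the repository root and nothing already exists at the link path:

```python
import os

target = "../../csmsc/tts3/path.sh"      # path stored in the symlink blob
link = "examples/aishell3/tts3/path.sh"  # where the new symlink lives

# os.symlink stores `target` verbatim; it resolves relative to the link's directory.
os.symlink(target, link)
print(os.readlink(link))  # ../../csmsc/tts3/path.sh
```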
diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh
deleted file mode 100755
index a37cd21e3..000000000
--- a/examples/aishell3/vc0/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=tacotron2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh
new file mode 120000
index 000000000..9e1fdbd16
--- /dev/null
+++ b/examples/aishell3/vc0/path.sh
@@ -0,0 +1 @@
+../../csmsc/tts0/path.sh
\ No newline at end of file
diff --git a/examples/aishell3/vc1/local/train.sh b/examples/aishell3/vc1/local/train.sh
deleted file mode 100755
index c775fcadc..000000000
--- a/examples/aishell3/vc1/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-python3 ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=2 \
- --phones-dict=dump/phone_id_map.txt \
- --voice-cloning=True
\ No newline at end of file
diff --git a/examples/aishell3/vc1/local/train.sh b/examples/aishell3/vc1/local/train.sh
new file mode 120000
index 000000000..115a0b8dc
--- /dev/null
+++ b/examples/aishell3/vc1/local/train.sh
@@ -0,0 +1 @@
+../../vc0/local/train.sh
\ No newline at end of file
diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh
deleted file mode 100755
index fb7e8411c..000000000
--- a/examples/aishell3/vc1/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=fastspeech2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh
new file mode 120000
index 000000000..4785b9095
--- /dev/null
+++ b/examples/aishell3/vc1/path.sh
@@ -0,0 +1 @@
+../../csmsc/tts3/path.sh
\ No newline at end of file
diff --git a/examples/aishell3/vc2/local/synthesize.sh b/examples/aishell3/vc2/local/synthesize.sh
deleted file mode 100755
index 8fd8977d3..000000000
--- a/examples/aishell3/vc2/local/synthesize.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-
-python3 ${BIN_DIR}/../synthesize.py \
- --am=fastspeech2_aishell3 \
- --am_config=${config_path} \
- --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
- --am_stat=dump/train/speech_stats.npy \
- --voc=pwgan_aishell3 \
- --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
- --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
- --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
- --test_metadata=dump/test/norm/metadata.jsonl \
- --output_dir=${train_output_path}/test \
- --phones_dict=dump/phone_id_map.txt \
- --speaker_dict=dump/speaker_id_map.txt \
- --voice-cloning=True
diff --git a/examples/aishell3/vc2/local/synthesize.sh b/examples/aishell3/vc2/local/synthesize.sh
new file mode 120000
index 000000000..ca8df6b04
--- /dev/null
+++ b/examples/aishell3/vc2/local/synthesize.sh
@@ -0,0 +1 @@
+../../vc1/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/aishell3/vc2/local/train.sh b/examples/aishell3/vc2/local/train.sh
deleted file mode 100755
index c775fcadc..000000000
--- a/examples/aishell3/vc2/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-python3 ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=2 \
- --phones-dict=dump/phone_id_map.txt \
- --voice-cloning=True
\ No newline at end of file
diff --git a/examples/aishell3/vc2/local/train.sh b/examples/aishell3/vc2/local/train.sh
new file mode 120000
index 000000000..115a0b8dc
--- /dev/null
+++ b/examples/aishell3/vc2/local/train.sh
@@ -0,0 +1 @@
+../../vc0/local/train.sh
\ No newline at end of file
diff --git a/examples/aishell3/vc2/path.sh b/examples/aishell3/vc2/path.sh
deleted file mode 100755
index fb7e8411c..000000000
--- a/examples/aishell3/vc2/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=fastspeech2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/aishell3/vc2/path.sh b/examples/aishell3/vc2/path.sh
new file mode 120000
index 000000000..4785b9095
--- /dev/null
+++ b/examples/aishell3/vc2/path.sh
@@ -0,0 +1 @@
+../../csmsc/tts3/path.sh
\ No newline at end of file
diff --git a/examples/aishell3/voc1/local/preprocess.sh b/examples/aishell3/voc1/local/preprocess.sh
index 44cc3dbe4..71eab68ad 100755
--- a/examples/aishell3/voc1/local/preprocess.sh
+++ b/examples/aishell3/voc1/local/preprocess.sh
@@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
- --stats=dump/train/feats_stats.npy
+ --stats=dump/train/feats_stats.npy \
+ --skip-wav-copy
+
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
- --stats=dump/train/feats_stats.npy
+ --stats=dump/train/feats_stats.npy \
+ --skip-wav-copy
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
- --stats=dump/train/feats_stats.npy
+ --stats=dump/train/feats_stats.npy \
+ --skip-wav-copy
fi
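
Adding `--skip-wav-copy` only stops `normalize.py` from duplicating waveforms into the `norm` directories; the statistics convention is unchanged, and the dev and test sets are still normalized with `dump/train/feats_stats.npy`. A minimal sketch of that convention, assuming (as a stand-in for the real stats file) that the array stacks the per-dimension mean and scale row-wise:

```python
import numpy as np

# Stand-in for np.load("dump/train/feats_stats.npy"); assumed layout:
# row 0 = per-dimension mean, row 1 = per-dimension scale (std).
stats = np.stack([np.full(80, 0.5), np.full(80, 2.0)])
mean, scale = stats


def normalize(feats: np.ndarray) -> np.ndarray:
    """Z-score features with the *training* stats -- dev/test reuse them by design."""
    return (feats - mean) / scale


dev_feats = np.random.rand(100, 80).astype(np.float32)  # stand-in for real mel frames
print(normalize(dev_feats).shape)  # (100, 80)
```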
diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh
deleted file mode 100755
index 145557b3d..000000000
--- a/examples/aishell3/voc1/local/synthesize.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
- --config=${config_path} \
- --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
- --test-metadata=dump/test/norm/metadata.jsonl \
- --output-dir=${train_output_path}/test \
- --generator-type=pwgan
diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh
new file mode 120000
index 000000000..d6aecd8d1
--- /dev/null
+++ b/examples/aishell3/voc1/local/synthesize.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/aishell3/voc1/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=1
diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh
new file mode 120000
index 000000000..2942893d2
--- /dev/null
+++ b/examples/aishell3/voc1/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh
deleted file mode 100755
index 1e6647b86..000000000
--- a/examples/aishell3/voc1/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=parallelwave_gan
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
\ No newline at end of file
diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh
new file mode 120000
index 000000000..b7ed4fb8f
--- /dev/null
+++ b/examples/aishell3/voc1/path.sh
@@ -0,0 +1 @@
+../../csmsc/voc1/path.sh
\ No newline at end of file
diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh
deleted file mode 100755
index 44cc3dbe4..000000000
--- a/examples/aishell3/voc5/local/preprocess.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-
-stage=0
-stop_stage=100
-
-config_path=$1
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
- # get durations from MFA's result
- echo "Generate durations.txt from MFA results ..."
- python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
- --inputdir=./aishell3_alignment_tone \
- --output=durations.txt \
- --config=${config_path}
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- # extract features
- echo "Extract features ..."
- python3 ${BIN_DIR}/../preprocess.py \
- --rootdir=~/datasets/data_aishell3/ \
- --dataset=aishell3 \
- --dumpdir=dump \
- --dur-file=durations.txt \
- --config=${config_path} \
- --cut-sil=True \
- --num-cpu=20
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
- # get features' stats(mean and std)
- echo "Get features' stats ..."
- python3 ${MAIN_ROOT}/utils/compute_statistics.py \
- --metadata=dump/train/raw/metadata.jsonl \
- --field-name="feats"
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
- # normalize, dev and test should use train's stats
- echo "Normalize ..."
-
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/train/raw/metadata.jsonl \
- --dumpdir=dump/train/norm \
- --stats=dump/train/feats_stats.npy
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/dev/raw/metadata.jsonl \
- --dumpdir=dump/dev/norm \
- --stats=dump/train/feats_stats.npy
-
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/test/raw/metadata.jsonl \
- --dumpdir=dump/test/norm \
- --stats=dump/train/feats_stats.npy
-fi
diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh
new file mode 120000
index 000000000..f0cb24de9
--- /dev/null
+++ b/examples/aishell3/voc5/local/preprocess.sh
@@ -0,0 +1 @@
+../../voc1/local/preprocess.sh
\ No newline at end of file
diff --git a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh
deleted file mode 100755
index 647896175..000000000
--- a/examples/aishell3/voc5/local/synthesize.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
- --config=${config_path} \
- --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
- --test-metadata=dump/test/norm/metadata.jsonl \
- --output-dir=${train_output_path}/test \
- --generator-type=hifigan
diff --git a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh
new file mode 120000
index 000000000..c887112c0
--- /dev/null
+++ b/examples/aishell3/voc5/local/synthesize.sh
@@ -0,0 +1 @@
+../../../csmsc/voc5/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/aishell3/voc5/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=1
diff --git a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh
new file mode 120000
index 000000000..2942893d2
--- /dev/null
+++ b/examples/aishell3/voc5/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh
deleted file mode 100755
index 7451b3218..000000000
--- a/examples/aishell3/voc5/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=hifigan
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh
new file mode 120000
index 000000000..b67fe2b39
--- /dev/null
+++ b/examples/aishell3/voc5/path.sh
@@ -0,0 +1 @@
+../../csmsc/voc5/path.sh
\ No newline at end of file
diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh
deleted file mode 100755
index 8b4178f13..000000000
--- a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-
-stage=0
-stop_stage=0
-
-# hifigan
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
- FLAGS_allocator_strategy=naive_best_fit \
- FLAGS_fraction_of_gpu_memory_to_use=0.01 \
- python3 ${BIN_DIR}/synthesize.py \
- --erniesat_config=${config_path} \
- --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
- --erniesat_stat=dump/train/speech_stats.npy \
- --voc=hifigan_aishell3 \
- --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \
- --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
- --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
- --test_metadata=dump/test/norm/metadata.jsonl \
- --output_dir=${train_output_path}/test \
- --phones_dict=dump/phone_id_map.txt
-fi
diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh
new file mode 120000
index 000000000..5703dcb2c
--- /dev/null
+++ b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh
@@ -0,0 +1 @@
+../../../aishell3/ernie_sat/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/aishell3_vctk/ernie_sat/local/train.sh b/examples/aishell3_vctk/ernie_sat/local/train.sh
deleted file mode 100755
index 526aac435..000000000
--- a/examples/aishell3_vctk/ernie_sat/local/train.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-python3 ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=8 \
- --phones-dict=dump/phone_id_map.txt
\ No newline at end of file
diff --git a/examples/aishell3_vctk/ernie_sat/local/train.sh b/examples/aishell3_vctk/ernie_sat/local/train.sh
new file mode 120000
index 000000000..9f1d2346d
--- /dev/null
+++ b/examples/aishell3_vctk/ernie_sat/local/train.sh
@@ -0,0 +1 @@
+../../../aishell3/ernie_sat/local/train.sh
\ No newline at end of file
diff --git a/examples/aishell3_vctk/ernie_sat/path.sh b/examples/aishell3_vctk/ernie_sat/path.sh
deleted file mode 100755
index 4ecab0251..000000000
--- a/examples/aishell3_vctk/ernie_sat/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=ernie_sat
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
\ No newline at end of file
diff --git a/examples/aishell3_vctk/ernie_sat/path.sh b/examples/aishell3_vctk/ernie_sat/path.sh
new file mode 120000
index 000000000..5ec397590
--- /dev/null
+++ b/examples/aishell3_vctk/ernie_sat/path.sh
@@ -0,0 +1 @@
+../../aishell3/ernie_sat/path.sh
\ No newline at end of file
diff --git a/examples/csmsc/tts3/local/PTQ_static.sh b/examples/csmsc/tts3/local/PTQ_static.sh
index a70a77b58..c6dce53cb 100755
--- a/examples/csmsc/tts3/local/PTQ_static.sh
+++ b/examples/csmsc/tts3/local/PTQ_static.sh
@@ -5,4 +5,4 @@ python3 ${BIN_DIR}/../PTQ_static.py \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--inference_dir ${train_output_path}/inference \
--model_name ${model_name} \
- --onnx_forma=True
\ No newline at end of file
+ --onnx_format=True
\ No newline at end of file
diff --git a/examples/csmsc/voc1/local/PTQ_static.sh b/examples/csmsc/voc1/local/PTQ_static.sh
index 2e5166141..c85ebd109 100755
--- a/examples/csmsc/voc1/local/PTQ_static.sh
+++ b/examples/csmsc/voc1/local/PTQ_static.sh
@@ -2,7 +2,7 @@ train_output_path=$1
model_name=$2
python3 ${BIN_DIR}/../../PTQ_static.py \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
+ --dev-metadata=dump/dev/raw/metadata.jsonl \
--inference_dir ${train_output_path}/inference \
--model_name ${model_name} \
--onnx_format=True
\ No newline at end of file
diff --git a/examples/csmsc/voc1/local/preprocess.sh b/examples/csmsc/voc1/local/preprocess.sh
index 61d6d62be..62d0717b9 100755
--- a/examples/csmsc/voc1/local/preprocess.sh
+++ b/examples/csmsc/voc1/local/preprocess.sh
@@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
- --stats=dump/train/feats_stats.npy
+ --stats=dump/train/feats_stats.npy \
+ --skip-wav-copy
+
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
- --stats=dump/train/feats_stats.npy
+ --stats=dump/train/feats_stats.npy \
+ --skip-wav-copy
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
- --stats=dump/train/feats_stats.npy
+ --stats=dump/train/feats_stats.npy \
+ --skip-wav-copy
fi
diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh
deleted file mode 100755
index 6719bd0be..000000000
--- a/examples/csmsc/voc3/finetune.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-
-source path.sh
-
-gpus=0
-stage=0
-stop_stage=100
-
-source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
- python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \
- --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
- --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
- --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
- --dur-file=durations.txt \
- --output-dir=dump_finetune \
- --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \
- --dataset=baker \
- --rootdir=~/datasets/BZNSYP/
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- python3 ${MAIN_ROOT}/utils/link_wav.py \
- --old-dump-dir=dump \
- --dump-dir=dump_finetune
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
- # get features' stats(mean and std)
- echo "Get features' stats ..."
- cp dump/train/feats_stats.npy dump_finetune/train/
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
- # normalize, dev and test should use train's stats
- echo "Normalize ..."
-
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump_finetune/train/raw/metadata.jsonl \
- --dumpdir=dump_finetune/train/norm \
- --stats=dump_finetune/train/feats_stats.npy
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump_finetune/dev/raw/metadata.jsonl \
- --dumpdir=dump_finetune/dev/norm \
- --stats=dump_finetune/train/feats_stats.npy
-
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump_finetune/test/raw/metadata.jsonl \
- --dumpdir=dump_finetune/test/norm \
- --stats=dump_finetune/train/feats_stats.npy
-fi
-
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
- CUDA_VISIBLE_DEVICES=${gpus} \
- FLAGS_cudnn_exhaustive_search=true \
- FLAGS_conv_workspace_size_limit=4000 \
- python ${BIN_DIR}/train.py \
- --train-metadata=dump_finetune/train/norm/metadata.jsonl \
- --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \
- --config=conf/finetune.yaml \
- --output-dir=exp/finetune \
- --ngpu=1
-fi
\ No newline at end of file
diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh
new file mode 120000
index 000000000..b6fa868e2
--- /dev/null
+++ b/examples/csmsc/voc3/finetune.sh
@@ -0,0 +1 @@
+../voc5/finetune.sh
\ No newline at end of file
diff --git a/examples/csmsc/voc3/local/preprocess.sh b/examples/csmsc/voc3/local/preprocess.sh
deleted file mode 100755
index 61d6d62be..000000000
--- a/examples/csmsc/voc3/local/preprocess.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-
-stage=0
-stop_stage=100
-
-config_path=$1
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
- # get durations from MFA's result
- echo "Generate durations.txt from MFA results ..."
- python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
- --inputdir=./baker_alignment_tone \
- --output=durations.txt \
- --config=${config_path}
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- # extract features
- echo "Extract features ..."
- python3 ${BIN_DIR}/../preprocess.py \
- --rootdir=~/datasets/BZNSYP/ \
- --dataset=baker \
- --dumpdir=dump \
- --dur-file=durations.txt \
- --config=${config_path} \
- --cut-sil=True \
- --num-cpu=20
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
- # get features' stats(mean and std)
- echo "Get features' stats ..."
- python3 ${MAIN_ROOT}/utils/compute_statistics.py \
- --metadata=dump/train/raw/metadata.jsonl \
- --field-name="feats"
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
- # normalize, dev and test should use train's stats
- echo "Normalize ..."
-
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/train/raw/metadata.jsonl \
- --dumpdir=dump/train/norm \
- --stats=dump/train/feats_stats.npy
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/dev/raw/metadata.jsonl \
- --dumpdir=dump/dev/norm \
- --stats=dump/train/feats_stats.npy
-
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/test/raw/metadata.jsonl \
- --dumpdir=dump/test/norm \
- --stats=dump/train/feats_stats.npy
-fi
diff --git a/examples/csmsc/voc3/local/preprocess.sh b/examples/csmsc/voc3/local/preprocess.sh
new file mode 120000
index 000000000..f0cb24de9
--- /dev/null
+++ b/examples/csmsc/voc3/local/preprocess.sh
@@ -0,0 +1 @@
+../../voc1/local/preprocess.sh
\ No newline at end of file
diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/csmsc/voc3/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=1
diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh
new file mode 120000
index 000000000..9ec3ed94b
--- /dev/null
+++ b/examples/csmsc/voc3/local/train.sh
@@ -0,0 +1 @@
+../../voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/csmsc/voc4/local/preprocess.sh b/examples/csmsc/voc4/local/preprocess.sh
deleted file mode 100755
index 61d6d62be..000000000
--- a/examples/csmsc/voc4/local/preprocess.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-
-stage=0
-stop_stage=100
-
-config_path=$1
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
- # get durations from MFA's result
- echo "Generate durations.txt from MFA results ..."
- python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
- --inputdir=./baker_alignment_tone \
- --output=durations.txt \
- --config=${config_path}
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- # extract features
- echo "Extract features ..."
- python3 ${BIN_DIR}/../preprocess.py \
- --rootdir=~/datasets/BZNSYP/ \
- --dataset=baker \
- --dumpdir=dump \
- --dur-file=durations.txt \
- --config=${config_path} \
- --cut-sil=True \
- --num-cpu=20
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
- # get features' stats(mean and std)
- echo "Get features' stats ..."
- python3 ${MAIN_ROOT}/utils/compute_statistics.py \
- --metadata=dump/train/raw/metadata.jsonl \
- --field-name="feats"
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
- # normalize, dev and test should use train's stats
- echo "Normalize ..."
-
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/train/raw/metadata.jsonl \
- --dumpdir=dump/train/norm \
- --stats=dump/train/feats_stats.npy
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/dev/raw/metadata.jsonl \
- --dumpdir=dump/dev/norm \
- --stats=dump/train/feats_stats.npy
-
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/test/raw/metadata.jsonl \
- --dumpdir=dump/test/norm \
- --stats=dump/train/feats_stats.npy
-fi
diff --git a/examples/csmsc/voc4/local/preprocess.sh b/examples/csmsc/voc4/local/preprocess.sh
new file mode 120000
index 000000000..f0cb24de9
--- /dev/null
+++ b/examples/csmsc/voc4/local/preprocess.sh
@@ -0,0 +1 @@
+../../voc1/local/preprocess.sh
\ No newline at end of file
diff --git a/examples/csmsc/voc4/local/train.sh b/examples/csmsc/voc4/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/csmsc/voc4/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=1
diff --git a/examples/csmsc/voc4/local/train.sh b/examples/csmsc/voc4/local/train.sh
new file mode 120000
index 000000000..9ec3ed94b
--- /dev/null
+++ b/examples/csmsc/voc4/local/train.sh
@@ -0,0 +1 @@
+../../voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/csmsc/voc5/finetune.sh b/examples/csmsc/voc5/finetune.sh
index 6719bd0be..eb8325aeb 100755
--- a/examples/csmsc/voc5/finetune.sh
+++ b/examples/csmsc/voc5/finetune.sh
@@ -39,16 +39,19 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump_finetune/train/raw/metadata.jsonl \
--dumpdir=dump_finetune/train/norm \
- --stats=dump_finetune/train/feats_stats.npy
+ --stats=dump_finetune/train/feats_stats.npy \
+ --skip-wav-copy
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump_finetune/dev/raw/metadata.jsonl \
--dumpdir=dump_finetune/dev/norm \
- --stats=dump_finetune/train/feats_stats.npy
+ --stats=dump_finetune/train/feats_stats.npy \
+ --skip-wav-copy
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump_finetune/test/raw/metadata.jsonl \
--dumpdir=dump_finetune/test/norm \
- --stats=dump_finetune/train/feats_stats.npy
+ --stats=dump_finetune/train/feats_stats.npy \
+ --skip-wav-copy
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
diff --git a/examples/csmsc/voc5/local/preprocess.sh b/examples/csmsc/voc5/local/preprocess.sh
deleted file mode 100755
index 61d6d62be..000000000
--- a/examples/csmsc/voc5/local/preprocess.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-
-stage=0
-stop_stage=100
-
-config_path=$1
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
- # get durations from MFA's result
- echo "Generate durations.txt from MFA results ..."
- python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
- --inputdir=./baker_alignment_tone \
- --output=durations.txt \
- --config=${config_path}
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- # extract features
- echo "Extract features ..."
- python3 ${BIN_DIR}/../preprocess.py \
- --rootdir=~/datasets/BZNSYP/ \
- --dataset=baker \
- --dumpdir=dump \
- --dur-file=durations.txt \
- --config=${config_path} \
- --cut-sil=True \
- --num-cpu=20
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
- # get features' stats(mean and std)
- echo "Get features' stats ..."
- python3 ${MAIN_ROOT}/utils/compute_statistics.py \
- --metadata=dump/train/raw/metadata.jsonl \
- --field-name="feats"
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
- # normalize, dev and test should use train's stats
- echo "Normalize ..."
-
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/train/raw/metadata.jsonl \
- --dumpdir=dump/train/norm \
- --stats=dump/train/feats_stats.npy
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/dev/raw/metadata.jsonl \
- --dumpdir=dump/dev/norm \
- --stats=dump/train/feats_stats.npy
-
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/test/raw/metadata.jsonl \
- --dumpdir=dump/test/norm \
- --stats=dump/train/feats_stats.npy
-fi
diff --git a/examples/csmsc/voc5/local/preprocess.sh b/examples/csmsc/voc5/local/preprocess.sh
new file mode 120000
index 000000000..f0cb24de9
--- /dev/null
+++ b/examples/csmsc/voc5/local/preprocess.sh
@@ -0,0 +1 @@
+../../voc1/local/preprocess.sh
\ No newline at end of file
diff --git a/examples/csmsc/voc5/local/train.sh b/examples/csmsc/voc5/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/csmsc/voc5/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=1
diff --git a/examples/csmsc/voc5/local/train.sh b/examples/csmsc/voc5/local/train.sh
new file mode 120000
index 000000000..9ec3ed94b
--- /dev/null
+++ b/examples/csmsc/voc5/local/train.sh
@@ -0,0 +1 @@
+../../voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/csmsc/voc6/local/preprocess.sh b/examples/csmsc/voc6/local/preprocess.sh
index 2dcc39ac7..509824b8e 100755
--- a/examples/csmsc/voc6/local/preprocess.sh
+++ b/examples/csmsc/voc6/local/preprocess.sh
@@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
- --stats=dump/train/feats_stats.npy
+ --stats=dump/train/feats_stats.npy \
+ --skip-wav-copy
+
python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
- --stats=dump/train/feats_stats.npy
+ --stats=dump/train/feats_stats.npy \
+ --skip-wav-copy
python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
- --stats=dump/train/feats_stats.npy
+ --stats=dump/train/feats_stats.npy \
+ --skip-wav-copy
fi
diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/csmsc/voc6/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=1
diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh
new file mode 120000
index 000000000..9ec3ed94b
--- /dev/null
+++ b/examples/csmsc/voc6/local/train.sh
@@ -0,0 +1 @@
+../../voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh
deleted file mode 100755
index f90db9150..000000000
--- a/examples/ljspeech/tts0/local/train.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-python3 ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=1 \
- --phones-dict=dump/phone_id_map.txt
\ No newline at end of file
diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh
new file mode 120000
index 000000000..7f54e9239
--- /dev/null
+++ b/examples/ljspeech/tts0/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/tts0/local/train.sh
\ No newline at end of file
diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh
deleted file mode 100755
index a37cd21e3..000000000
--- a/examples/ljspeech/tts0/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=tacotron2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh
new file mode 120000
index 000000000..9e1fdbd16
--- /dev/null
+++ b/examples/ljspeech/tts0/path.sh
@@ -0,0 +1 @@
+../../csmsc/tts0/path.sh
\ No newline at end of file
diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh
deleted file mode 100755
index d1302f99f..000000000
--- a/examples/ljspeech/tts3/local/train.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-python3 ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=1 \
- --phones-dict=dump/phone_id_map.txt
diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh
new file mode 120000
index 000000000..d7b05058e
--- /dev/null
+++ b/examples/ljspeech/tts3/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/tts3/local/train.sh
\ No newline at end of file
diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh
deleted file mode 100755
index fb7e8411c..000000000
--- a/examples/ljspeech/tts3/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=fastspeech2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh
new file mode 120000
index 000000000..4785b9095
--- /dev/null
+++ b/examples/ljspeech/tts3/path.sh
@@ -0,0 +1 @@
+../../csmsc/tts3/path.sh
\ No newline at end of file
diff --git a/examples/ljspeech/voc1/local/preprocess.sh b/examples/ljspeech/voc1/local/preprocess.sh
index d1af60dad..bfbf75b7d 100755
--- a/examples/ljspeech/voc1/local/preprocess.sh
+++ b/examples/ljspeech/voc1/local/preprocess.sh
@@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
- --stats=dump/train/feats_stats.npy
+ --stats=dump/train/feats_stats.npy \
+ --skip-wav-copy
+
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
- --stats=dump/train/feats_stats.npy
+ --stats=dump/train/feats_stats.npy \
+ --skip-wav-copy
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
- --stats=dump/train/feats_stats.npy
+ --stats=dump/train/feats_stats.npy \
+ --skip-wav-copy
fi
diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh
deleted file mode 100755
index 145557b3d..000000000
--- a/examples/ljspeech/voc1/local/synthesize.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
- --config=${config_path} \
- --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
- --test-metadata=dump/test/norm/metadata.jsonl \
- --output-dir=${train_output_path}/test \
- --generator-type=pwgan
diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh
new file mode 120000
index 000000000..d6aecd8d1
--- /dev/null
+++ b/examples/ljspeech/voc1/local/synthesize.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/ljspeech/voc1/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=1
diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh
new file mode 120000
index 000000000..2942893d2
--- /dev/null
+++ b/examples/ljspeech/voc1/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/ljspeech/voc1/path.sh b/examples/ljspeech/voc1/path.sh
deleted file mode 100755
index 1e6647b86..000000000
--- a/examples/ljspeech/voc1/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=parallelwave_gan
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
\ No newline at end of file
diff --git a/examples/ljspeech/voc1/path.sh b/examples/ljspeech/voc1/path.sh
new file mode 120000
index 000000000..b7ed4fb8f
--- /dev/null
+++ b/examples/ljspeech/voc1/path.sh
@@ -0,0 +1 @@
+../../csmsc/voc1/path.sh
\ No newline at end of file
diff --git a/examples/ljspeech/voc5/local/preprocess.sh b/examples/ljspeech/voc5/local/preprocess.sh
deleted file mode 100755
index d1af60dad..000000000
--- a/examples/ljspeech/voc5/local/preprocess.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-
-stage=0
-stop_stage=100
-
-config_path=$1
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
- # get durations from MFA's result
- echo "Generate durations.txt from MFA results ..."
- python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
- --inputdir=./ljspeech_alignment \
- --output=durations.txt \
- --config=${config_path}
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- # extract features
- echo "Extract features ..."
- python3 ${BIN_DIR}/../preprocess.py \
- --rootdir=~/datasets/LJSpeech-1.1/ \
- --dataset=ljspeech \
- --dumpdir=dump \
- --dur-file=durations.txt \
- --config=${config_path} \
- --cut-sil=True \
- --num-cpu=20
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
- # get features' stats(mean and std)
- echo "Get features' stats ..."
- python3 ${MAIN_ROOT}/utils/compute_statistics.py \
- --metadata=dump/train/raw/metadata.jsonl \
- --field-name="feats"
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
- # normalize, dev and test should use train's stats
- echo "Normalize ..."
-
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/train/raw/metadata.jsonl \
- --dumpdir=dump/train/norm \
- --stats=dump/train/feats_stats.npy
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/dev/raw/metadata.jsonl \
- --dumpdir=dump/dev/norm \
- --stats=dump/train/feats_stats.npy
-
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/test/raw/metadata.jsonl \
- --dumpdir=dump/test/norm \
- --stats=dump/train/feats_stats.npy
-fi
diff --git a/examples/ljspeech/voc5/local/preprocess.sh b/examples/ljspeech/voc5/local/preprocess.sh
new file mode 120000
index 000000000..f0cb24de9
--- /dev/null
+++ b/examples/ljspeech/voc5/local/preprocess.sh
@@ -0,0 +1 @@
+../../voc1/local/preprocess.sh
\ No newline at end of file
diff --git a/examples/ljspeech/voc5/local/synthesize.sh b/examples/ljspeech/voc5/local/synthesize.sh
deleted file mode 100755
index 647896175..000000000
--- a/examples/ljspeech/voc5/local/synthesize.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
- --config=${config_path} \
- --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
- --test-metadata=dump/test/norm/metadata.jsonl \
- --output-dir=${train_output_path}/test \
- --generator-type=hifigan
diff --git a/examples/ljspeech/voc5/local/synthesize.sh b/examples/ljspeech/voc5/local/synthesize.sh
new file mode 120000
index 000000000..c887112c0
--- /dev/null
+++ b/examples/ljspeech/voc5/local/synthesize.sh
@@ -0,0 +1 @@
+../../../csmsc/voc5/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/ljspeech/voc5/local/train.sh b/examples/ljspeech/voc5/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/ljspeech/voc5/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=1
diff --git a/examples/ljspeech/voc5/local/train.sh b/examples/ljspeech/voc5/local/train.sh
new file mode 120000
index 000000000..2942893d2
--- /dev/null
+++ b/examples/ljspeech/voc5/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/ljspeech/voc5/path.sh b/examples/ljspeech/voc5/path.sh
deleted file mode 100755
index 7451b3218..000000000
--- a/examples/ljspeech/voc5/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=hifigan
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
diff --git a/examples/ljspeech/voc5/path.sh b/examples/ljspeech/voc5/path.sh
new file mode 120000
index 000000000..b67fe2b39
--- /dev/null
+++ b/examples/ljspeech/voc5/path.sh
@@ -0,0 +1 @@
+../../csmsc/voc5/path.sh
\ No newline at end of file
diff --git a/examples/tess/cls0/local/train.py b/examples/tess/cls0/local/train.py
index 25382d8c3..f023a37b7 100644
--- a/examples/tess/cls0/local/train.py
+++ b/examples/tess/cls0/local/train.py
@@ -121,7 +121,7 @@ if __name__ == "__main__":
optimizer.clear_grad()
# Calculate loss
- avg_loss += loss.numpy()[0]
+ avg_loss += float(loss)
# Calculate metrics
preds = paddle.argmax(logits, axis=1)
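
The old `loss.numpy()[0]` assumed the loss converts to a length-1 array, which breaks once the framework returns a true 0-D scalar; `float(loss)` handles both shapes. A tiny sketch of the distinction (NumPy used as a stand-in so it runs without Paddle installed):

```python
import numpy as np

zero_d = np.asarray(3.14)  # 0-D scalar, like a scalar loss tensor
print(float(zero_d))       # fine: 3.14

try:
    zero_d[0]  # the old `loss.numpy()[0]` pattern fails on 0-D arrays
except IndexError as err:
    print("indexing a 0-D array fails:", err)
```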
diff --git a/examples/vctk/ernie_sat/local/train.sh b/examples/vctk/ernie_sat/local/train.sh
deleted file mode 100755
index 526aac435..000000000
--- a/examples/vctk/ernie_sat/local/train.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-python3 ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=8 \
- --phones-dict=dump/phone_id_map.txt
\ No newline at end of file
diff --git a/examples/vctk/ernie_sat/local/train.sh b/examples/vctk/ernie_sat/local/train.sh
new file mode 120000
index 000000000..9f1d2346d
--- /dev/null
+++ b/examples/vctk/ernie_sat/local/train.sh
@@ -0,0 +1 @@
+../../../aishell3/ernie_sat/local/train.sh
\ No newline at end of file
diff --git a/examples/vctk/ernie_sat/path.sh b/examples/vctk/ernie_sat/path.sh
deleted file mode 100755
index 4ecab0251..000000000
--- a/examples/vctk/ernie_sat/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=ernie_sat
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
\ No newline at end of file
diff --git a/examples/vctk/ernie_sat/path.sh b/examples/vctk/ernie_sat/path.sh
new file mode 120000
index 000000000..5ec397590
--- /dev/null
+++ b/examples/vctk/ernie_sat/path.sh
@@ -0,0 +1 @@
+../../aishell3/ernie_sat/path.sh
\ No newline at end of file
diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh
deleted file mode 100755
index 3a5076505..000000000
--- a/examples/vctk/tts3/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-python3 ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=1 \
- --phones-dict=dump/phone_id_map.txt \
- --speaker-dict=dump/speaker_id_map.txt
diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh
new file mode 120000
index 000000000..78885a300
--- /dev/null
+++ b/examples/vctk/tts3/local/train.sh
@@ -0,0 +1 @@
+../../../aishell3/tts3/local/train.sh
\ No newline at end of file
diff --git a/examples/vctk/tts3/path.sh b/examples/vctk/tts3/path.sh
deleted file mode 100755
index fb7e8411c..000000000
--- a/examples/vctk/tts3/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=fastspeech2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/vctk/tts3/path.sh b/examples/vctk/tts3/path.sh
new file mode 120000
index 000000000..4785b9095
--- /dev/null
+++ b/examples/vctk/tts3/path.sh
@@ -0,0 +1 @@
+../../csmsc/tts3/path.sh
\ No newline at end of file
diff --git a/examples/vctk/voc1/local/preprocess.sh b/examples/vctk/voc1/local/preprocess.sh
index 88a478cd5..6b7e5288a 100755
--- a/examples/vctk/voc1/local/preprocess.sh
+++ b/examples/vctk/voc1/local/preprocess.sh
@@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
- --stats=dump/train/feats_stats.npy
+ --stats=dump/train/feats_stats.npy \
+ --skip-wav-copy
+
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
- --stats=dump/train/feats_stats.npy
+ --stats=dump/train/feats_stats.npy \
+ --skip-wav-copy
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
- --stats=dump/train/feats_stats.npy
+ --stats=dump/train/feats_stats.npy \
+ --skip-wav-copy
fi
diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh
deleted file mode 100755
index 145557b3d..000000000
--- a/examples/vctk/voc1/local/synthesize.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
- --config=${config_path} \
- --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
- --test-metadata=dump/test/norm/metadata.jsonl \
- --output-dir=${train_output_path}/test \
- --generator-type=pwgan
diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh
new file mode 120000
index 000000000..d6aecd8d1
--- /dev/null
+++ b/examples/vctk/voc1/local/synthesize.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/vctk/voc1/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=1
diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh
new file mode 120000
index 000000000..2942893d2
--- /dev/null
+++ b/examples/vctk/voc1/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh
deleted file mode 100755
index 1e6647b86..000000000
--- a/examples/vctk/voc1/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=parallelwave_gan
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
\ No newline at end of file
diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh
new file mode 120000
index 000000000..b7ed4fb8f
--- /dev/null
+++ b/examples/vctk/voc1/path.sh
@@ -0,0 +1 @@
+../../csmsc/voc1/path.sh
\ No newline at end of file
diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh
deleted file mode 100755
index 88a478cd5..000000000
--- a/examples/vctk/voc5/local/preprocess.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-
-stage=0
-stop_stage=100
-
-config_path=$1
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
- # get durations from MFA's result
- echo "Generate durations.txt from MFA results ..."
- python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
- --inputdir=./vctk_alignment \
- --output=durations.txt \
- --config=${config_path}
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- # extract features
- echo "Extract features ..."
- python3 ${BIN_DIR}/../preprocess.py \
- --rootdir=~/datasets/VCTK-Corpus-0.92/ \
- --dataset=vctk \
- --dumpdir=dump \
- --dur-file=durations.txt \
- --config=${config_path} \
- --cut-sil=True \
- --num-cpu=20
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
- # get features' stats(mean and std)
- echo "Get features' stats ..."
- python3 ${MAIN_ROOT}/utils/compute_statistics.py \
- --metadata=dump/train/raw/metadata.jsonl \
- --field-name="feats"
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
- # normalize, dev and test should use train's stats
- echo "Normalize ..."
-
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/train/raw/metadata.jsonl \
- --dumpdir=dump/train/norm \
- --stats=dump/train/feats_stats.npy
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/dev/raw/metadata.jsonl \
- --dumpdir=dump/dev/norm \
- --stats=dump/train/feats_stats.npy
-
- python3 ${BIN_DIR}/../normalize.py \
- --metadata=dump/test/raw/metadata.jsonl \
- --dumpdir=dump/test/norm \
- --stats=dump/train/feats_stats.npy
-fi
diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh
new file mode 120000
index 000000000..f0cb24de9
--- /dev/null
+++ b/examples/vctk/voc5/local/preprocess.sh
@@ -0,0 +1 @@
+../../voc1/local/preprocess.sh
\ No newline at end of file
diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh
deleted file mode 100755
index 647896175..000000000
--- a/examples/vctk/voc5/local/synthesize.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
- --config=${config_path} \
- --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
- --test-metadata=dump/test/norm/metadata.jsonl \
- --output-dir=${train_output_path}/test \
- --generator-type=hifigan
diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh
new file mode 120000
index 000000000..c887112c0
--- /dev/null
+++ b/examples/vctk/voc5/local/synthesize.sh
@@ -0,0 +1 @@
+../../../csmsc/voc5/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/vctk/voc5/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=1
diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh
new file mode 120000
index 000000000..2942893d2
--- /dev/null
+++ b/examples/vctk/voc5/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh
deleted file mode 100755
index 7451b3218..000000000
--- a/examples/vctk/voc5/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=hifigan
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh
new file mode 120000
index 000000000..b67fe2b39
--- /dev/null
+++ b/examples/vctk/voc5/path.sh
@@ -0,0 +1 @@
+../../csmsc/voc5/path.sh
\ No newline at end of file
diff --git a/examples/zh_en_tts/tts3/local/train.sh b/examples/zh_en_tts/tts3/local/train.sh
deleted file mode 100755
index 1da72f117..000000000
--- a/examples/zh_en_tts/tts3/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-python3 ${BIN_DIR}/train.py \
- --train-metadata=dump/train/norm/metadata.jsonl \
- --dev-metadata=dump/dev/norm/metadata.jsonl \
- --config=${config_path} \
- --output-dir=${train_output_path} \
- --ngpu=2 \
- --phones-dict=dump/phone_id_map.txt \
- --speaker-dict=dump/speaker_id_map.txt
diff --git a/examples/zh_en_tts/tts3/local/train.sh b/examples/zh_en_tts/tts3/local/train.sh
new file mode 120000
index 000000000..78885a300
--- /dev/null
+++ b/examples/zh_en_tts/tts3/local/train.sh
@@ -0,0 +1 @@
+../../../aishell3/tts3/local/train.sh
\ No newline at end of file
diff --git a/examples/zh_en_tts/tts3/path.sh b/examples/zh_en_tts/tts3/path.sh
deleted file mode 100755
index fb7e8411c..000000000
--- a/examples/zh_en_tts/tts3/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=fastspeech2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/zh_en_tts/tts3/path.sh b/examples/zh_en_tts/tts3/path.sh
new file mode 120000
index 000000000..4785b9095
--- /dev/null
+++ b/examples/zh_en_tts/tts3/path.sh
@@ -0,0 +1 @@
+../../csmsc/tts3/path.sh
\ No newline at end of file
diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index 004143361..7a7aef8b0 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -25,6 +25,9 @@ import librosa
import numpy as np
import paddle
import soundfile
+from paddlespeech.audio.transform.transformation import Transformation
+from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
+from paddlespeech.s2t.utils.utility import UpdateConfig
from yacs.config import CfgNode
from ...utils.env import MODEL_HOME
@@ -34,9 +37,6 @@ from ..log import logger
from ..utils import CLI_TIMER
from ..utils import stats_wrapper
from ..utils import timer_register
-from paddlespeech.audio.transform.transformation import Transformation
-from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.utils.utility import UpdateConfig
__all__ = ['ASRExecutor']
@@ -62,8 +62,13 @@ class ASRExecutor(BaseExecutor):
'--lang',
type=str,
default='zh',
- help='Choose model language. zh or en, zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k]'
+ help='Choose model language. [zh, en, zh_en], zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k], zh_en:[conformer_talcs-codeswitch_zh_en-16k]'
)
+ self.parser.add_argument(
+ '--codeswitch',
+ type=bool,
+ default=False,
+            help='Whether to use the code-switch model. True or False.')
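+        # NOTE: argparse's type=bool calls bool() on the raw string, so any
+        # non-empty value (including "False") parses as True; omit the flag
+        # to keep code-switch disabled.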
self.parser.add_argument(
"--sample_rate",
type=int,
@@ -127,6 +132,7 @@ class ASRExecutor(BaseExecutor):
def _init_from_path(self,
model_type: str='wenetspeech',
lang: str='zh',
+ codeswitch: bool=False,
sample_rate: int=16000,
cfg_path: Optional[os.PathLike]=None,
decode_method: str='attention_rescoring',
@@ -144,7 +150,12 @@ class ASRExecutor(BaseExecutor):
if cfg_path is None or ckpt_path is None:
sample_rate_str = '16k' if sample_rate == 16000 else '8k'
- tag = model_type + '-' + lang + '-' + sample_rate_str
+ if lang == "zh_en" and codeswitch is True:
+ tag = model_type + '-' + 'codeswitch_' + lang + '-' + sample_rate_str
+ elif lang == "zh_en" or codeswitch is True:
+                raise Exception("codeswitch=True is only supported when lang is zh_en")
+ else:
+ tag = model_type + '-' + lang + '-' + sample_rate_str
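+            # The tag selects the pretrained model, e.g. (illustrative values)
+            # model_type='conformer_talcs', lang='zh_en', codeswitch=True
+            # -> 'conformer_talcs-codeswitch_zh_en-16k', matching the entry
+            # added in resource/pretrained_models.py in this change.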
self.task_resource.set_task_model(tag, version=None)
self.res_path = self.task_resource.res_dir
@@ -423,6 +434,7 @@ class ASRExecutor(BaseExecutor):
model = parser_args.model
lang = parser_args.lang
+ codeswitch = parser_args.codeswitch
sample_rate = parser_args.sample_rate
config = parser_args.config
ckpt_path = parser_args.ckpt_path
@@ -444,6 +456,7 @@ class ASRExecutor(BaseExecutor):
audio_file=input_,
model=model,
lang=lang,
+ codeswitch=codeswitch,
sample_rate=sample_rate,
config=config,
ckpt_path=ckpt_path,
@@ -472,6 +485,7 @@ class ASRExecutor(BaseExecutor):
audio_file: os.PathLike,
model: str='conformer_u2pp_online_wenetspeech',
lang: str='zh',
+ codeswitch: bool=False,
sample_rate: int=16000,
config: os.PathLike=None,
ckpt_path: os.PathLike=None,
@@ -485,8 +499,8 @@ class ASRExecutor(BaseExecutor):
"""
audio_file = os.path.abspath(audio_file)
paddle.set_device(device)
- self._init_from_path(model, lang, sample_rate, config, decode_method,
- num_decoding_left_chunks, ckpt_path)
+ self._init_from_path(model, lang, codeswitch, sample_rate, config,
+ decode_method, num_decoding_left_chunks, ckpt_path)
if not self._check(audio_file, sample_rate, force_yes):
sys.exit(-1)
if rtf:
diff --git a/paddlespeech/cli/base_commands.py b/paddlespeech/cli/base_commands.py
index 767d0df78..dfeb5cae5 100644
--- a/paddlespeech/cli/base_commands.py
+++ b/paddlespeech/cli/base_commands.py
@@ -14,6 +14,7 @@
import argparse
from typing import List
+import numpy
from prettytable import PrettyTable
from ..resource import CommonTaskResource
@@ -78,7 +79,7 @@ class VersionCommand:
model_name_format = {
- 'asr': 'Model-Language-Sample Rate',
+ 'asr': 'Model-Size-Code Switch-Multilingual-Language-Sample Rate',
'cls': 'Model-Sample Rate',
'st': 'Model-Source language-Target language',
'text': 'Model-Task-Language',
@@ -111,7 +112,21 @@ class StatsCommand:
fields = model_name_format[self.task].split("-")
table = PrettyTable(fields)
for key in pretrained_models:
- table.add_row(key.split("-"))
+ line = key.split("-")
+ if self.task == "asr" and len(line) < len(fields):
+ for i in range(len(line), len(fields)):
+ line.append("-")
+ if "codeswitch" in key:
+ line[3], line[1] = line[1].split("_")[0], line[1].split(
+ "_")[1:]
+ elif "multilingual" in key:
+ line[4], line[1] = line[1].split("_")[0], line[1].split(
+ "_")[1:]
+ tmp = numpy.array(line)
+ idx = [0, 5, 3, 4, 1, 2]
+ line = tmp[idx]
+ table.add_row(line)
+
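+        # Worked example: 'conformer_talcs-codeswitch_zh_en-16k' splits into
+        # ['conformer_talcs', 'codeswitch_zh_en', '16k'] and is padded to six
+        # columns; the codeswitch branch puts 'codeswitch' in column 3 and
+        # ['zh', 'en'] in column 1, then idx = [0, 5, 3, 4, 1, 2] reorders the
+        # row to Model-Size-Code Switch-Multilingual-Language-Sample Rate.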
print(table)
def execute(self, argv: List[str]) -> bool:
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index 707518c05..5515ade26 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -292,19 +292,19 @@ class TTSExecutor(BaseExecutor):
with open(self.voc_config) as f:
self.voc_config = CfgNode(yaml.safe_load(f))
- with open(self.phones_dict, "r") as f:
+ with open(self.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
tone_size = None
if self.tones_dict:
- with open(self.tones_dict, "r") as f:
+ with open(self.tones_dict, 'rt', encoding='utf-8') as f:
tone_id = [line.strip().split() for line in f.readlines()]
tone_size = len(tone_id)
spk_num = None
if self.speaker_dict:
- with open(self.speaker_dict, 'rt') as f:
+ with open(self.speaker_dict, 'rt', encoding='utf-8') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py
index 3c5aa1f90..ff0b30f6d 100644
--- a/paddlespeech/resource/pretrained_models.py
+++ b/paddlespeech/resource/pretrained_models.py
@@ -30,6 +30,7 @@ __all__ = [
]
# The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
+# Code-switch and multilingual tags take the form "{model_name}[_{dataset}]-[codeswitch/multilingual][_{lang}][-...]".
# e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k".
# Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
# "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
@@ -322,6 +323,18 @@ asr_dynamic_pretrained_models = {
'099a601759d467cd0a8523ff939819c5'
},
},
+ "conformer_talcs-codeswitch_zh_en-16k": {
+ '1.4': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/tal_cs/asr1/asr1_conformer_talcs_ckpt_1.4.0.model.tar.gz',
+ 'md5':
+ '01962c5d0a70878fe41cacd4f61e14d1',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/conformer/checkpoints/avg_10'
+ },
+ },
}
asr_static_pretrained_models = {
diff --git a/paddlespeech/s2t/models/whisper/tokenizer.py b/paddlespeech/s2t/models/whisper/tokenizer.py
index 8bd85c914..1e1aea044 100644
--- a/paddlespeech/s2t/models/whisper/tokenizer.py
+++ b/paddlespeech/s2t/models/whisper/tokenizer.py
@@ -155,6 +155,10 @@ class Tokenizer:
if ids < len(self.tokenizer):
ids_list.append(ids)
token_ids = ids_list
+ elif len(token_ids) == 1:
+ token_ids = token_ids[0]
+ else:
+ raise ValueError(f"token_ids {token_ids} load error.")
return self.tokenizer.decode(token_ids, **kwargs)
diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py
index 63cafbdb7..9cf9a9eca 100644
--- a/paddlespeech/s2t/models/whisper/whipser.py
+++ b/paddlespeech/s2t/models/whisper/whipser.py
@@ -17,12 +17,11 @@ from typing import Union
import numpy as np
import paddle
import paddle.nn.functional as F
+import paddlespeech.s2t.modules.align as paddlespeech_nn
import soundfile
import tqdm
from paddle import nn
from paddle.distribution import Categorical
-
-import paddlespeech.s2t.modules.align as paddlespeech_nn
from paddlespeech.s2t.models.whisper import utils
from paddlespeech.s2t.models.whisper.tokenizer import get_tokenizer
from paddlespeech.s2t.models.whisper.tokenizer import LANGUAGES
@@ -477,7 +476,7 @@ def transcribe(
decode_options["fp16"] = False
if decode_options.get(
- "language", 'None') or decode_options.get("language", None) is None:
+ "language") == 'None' or decode_options.get("language", None) is None:
if not model.is_multilingual:
decode_options["language"] = "en"
else:
@@ -771,8 +770,10 @@ class GreedyDecoder(TokenDecoder):
if temperature == 0:
next_tokens = paddle.argmax(logits, axis=-1)
else:
- next_tokens = Categorical(logits=logits / temperature).sample(
- shape=logits.shape)
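+                # Categorical.sample([1]) draws one sample per row of
+                # `logits`, giving shape [1, B]; the reshape below flattens
+                # it to [B] to match the argmax branch above.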
+ next_tokens = Categorical(logits=logits / temperature).sample([1])
+ next_tokens = paddle.reshape(next_tokens, [
+ next_tokens.shape[0] * next_tokens.shape[1],
+ ])
logprobs = F.log_softmax(logits, axis=-1, dtype=paddle.float32)
current_logprobs = logprobs[paddle.arange(logprobs.shape[0]),
@@ -1205,9 +1206,8 @@ class DecodingTask:
DecodingResult(
audio_features=features,
language=language,
- language_probs=probs)
- for features, language, probs in zip(audio_features, languages,
- language_probs)
+ language_probs=probs) for features, language, probs in
+ zip(audio_features, languages, language_probs)
]
# repeat the audio & text tensors by the group size, for beam search or best-of-n sampling
diff --git a/paddlespeech/s2t/training/gradclip.py b/paddlespeech/s2t/training/gradclip.py
index 26ac501e2..be6fcf589 100644
--- a/paddlespeech/s2t/training/gradclip.py
+++ b/paddlespeech/s2t/training/gradclip.py
@@ -43,8 +43,8 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.merge_selected_rows(g)
merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
- square = layers.square(merge_grad)
- sum_square = layers.reduce_sum(square)
+ square = paddle.square(merge_grad)
+ sum_square = paddle.sum(square)
sum_square_list.append(sum_square)
# debug log, not dump all since slow down train process
@@ -57,23 +57,24 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
return params_grads
global_norm_var = layers.concat(sum_square_list)
- global_norm_var = layers.reduce_sum(global_norm_var)
- global_norm_var = layers.sqrt(global_norm_var)
+ global_norm_var = paddle.sum(global_norm_var)
+ global_norm_var = paddle.sqrt(global_norm_var)
+
# debug log
logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!")
max_global_norm = layers.fill_constant(
shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
- clip_var = layers.elementwise_div(
+ clip_var = paddle.divide(
x=max_global_norm,
- y=layers.elementwise_max(x=global_norm_var, y=max_global_norm))
+ y=paddle.maximum(x=global_norm_var, y=max_global_norm))
for i, (p, g) in enumerate(params_grads):
if g is None:
continue
if getattr(p, 'need_clip', True) is False:
params_and_grads.append((p, g))
continue
- new_grad = layers.elementwise_mul(x=g, y=clip_var)
+ new_grad = paddle.multiply(x=g, y=clip_var)
params_and_grads.append((p, new_grad))
# debug log, not dump all since slow down train process
diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py
index 1b1792bd1..299a8c3d4 100644
--- a/paddlespeech/server/bin/paddlespeech_server.py
+++ b/paddlespeech/server/bin/paddlespeech_server.py
@@ -16,14 +16,9 @@ import sys
import warnings
from typing import List
+import numpy
import uvicorn
from fastapi import FastAPI
-from prettytable import PrettyTable
-from starlette.middleware.cors import CORSMiddleware
-
-from ..executor import BaseExecutor
-from ..util import cli_server_register
-from ..util import stats_wrapper
from paddlespeech.cli.log import logger
from paddlespeech.resource import CommonTaskResource
from paddlespeech.server.engine.engine_pool import init_engine_pool
@@ -31,6 +26,12 @@ from paddlespeech.server.engine.engine_warmup import warm_up
from paddlespeech.server.restful.api import setup_router as setup_http_router
from paddlespeech.server.utils.config import get_config
from paddlespeech.server.ws.api import setup_router as setup_ws_router
+from prettytable import PrettyTable
+from starlette.middleware.cors import CORSMiddleware
+
+from ..executor import BaseExecutor
+from ..util import cli_server_register
+from ..util import stats_wrapper
warnings.filterwarnings("ignore")
__all__ = ['ServerExecutor', 'ServerStatsExecutor']
@@ -134,7 +135,7 @@ class ServerStatsExecutor():
required=True)
self.task_choices = ['asr', 'tts', 'cls', 'text', 'vector']
self.model_name_format = {
- 'asr': 'Model-Language-Sample Rate',
+ 'asr': 'Model-Size-Code Switch-Multilingual-Language-Sample Rate',
'tts': 'Model-Language',
'cls': 'Model-Sample Rate',
'text': 'Model-Task-Language',
@@ -145,7 +146,20 @@ class ServerStatsExecutor():
fields = self.model_name_format[self.task].split("-")
table = PrettyTable(fields)
for key in pretrained_models:
- table.add_row(key.split("-"))
+ line = key.split("-")
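+            # mirrors StatsCommand.show_support_models in
+            # paddlespeech/cli/base_commands.py (see the worked example there)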
+ if self.task == "asr" and len(line) < len(fields):
+ for i in range(len(line), len(fields)):
+ line.append("-")
+ if "codeswitch" in key:
+ line[3], line[1] = line[1].split("_")[0], line[1].split(
+ "_")[1:]
+ elif "multilingual" in key:
+ line[4], line[1] = line[1].split("_")[0], line[1].split(
+ "_")[1:]
+ tmp = numpy.array(line)
+ idx = [0, 5, 3, 4, 1, 2]
+ line = tmp[idx]
+ table.add_row(line)
print(table)
def execute(self, argv: List[str]) -> bool:
diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
index e450aa1a0..c43dafb3c 100644
--- a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
@@ -437,7 +437,7 @@ if __name__ == '__main__':
vocab_phones = {}
- with open(args.phones_dict, 'rt') as f:
+ with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
for phn, id in phn_id:
vocab_phones[phn] = int(id)
diff --git a/paddlespeech/t2s/exps/ernie_sat/train.py b/paddlespeech/t2s/exps/ernie_sat/train.py
index 75a666bb1..c98d691be 100644
--- a/paddlespeech/t2s/exps/ernie_sat/train.py
+++ b/paddlespeech/t2s/exps/ernie_sat/train.py
@@ -109,7 +109,7 @@ def train_sp(args, config):
num_workers=config.num_workers)
print("dataloaders done!")
- with open(args.phones_dict, "r") as f:
+ with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py
index d31e62a82..97626db0b 100644
--- a/paddlespeech/t2s/exps/fastspeech2/train.py
+++ b/paddlespeech/t2s/exps/fastspeech2/train.py
@@ -67,7 +67,7 @@ def train_sp(args, config):
if args.speaker_dict is not None:
print("multiple speaker fastspeech2!")
collate_fn = fastspeech2_multi_spk_batch_fn
- with open(args.speaker_dict, 'rt') as f:
+ with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
fields += ["spk_id"]
@@ -123,7 +123,7 @@ def train_sp(args, config):
num_workers=config.num_workers)
print("dataloaders done!")
- with open(args.phones_dict, "r") as f:
+ with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
index 644ec250d..d05dfafcf 100644
--- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
@@ -39,18 +39,18 @@ def evaluate(args, speedyspeech_config, pwg_config):
# construct dataset for evaluation
sentences = []
- with open(args.text, 'rt') as f:
+ with open(args.text, 'rt', encoding='utf-8') as f:
for line in f:
items = line.strip().split()
utt_id = items[0]
sentence = "".join(items[1:])
sentences.append((utt_id, sentence))
- with open(args.phones_dict, "r") as f:
+ with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
- with open(args.tones_dict, "r") as f:
+ with open(args.tones_dict, 'rt', encoding='utf-8') as f:
tone_id = [line.strip().split() for line in f.readlines()]
tone_size = len(tone_id)
print("tone_size:", tone_size)
diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py
index 7b422e64f..c90090daa 100644
--- a/paddlespeech/t2s/exps/speedyspeech/train.py
+++ b/paddlespeech/t2s/exps/speedyspeech/train.py
@@ -70,7 +70,7 @@ def train_sp(args, config):
if args.speaker_dict is not None:
print("multiple speaker speedyspeech!")
collate_fn = speedyspeech_multi_spk_batch_fn
- with open(args.speaker_dict, 'rt') as f:
+ with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
fields += ["spk_id"]
@@ -133,11 +133,11 @@ def train_sp(args, config):
collate_fn=collate_fn,
num_workers=config.num_workers)
print("dataloaders done!")
- with open(args.phones_dict, "r") as f:
+ with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
- with open(args.tones_dict, "r") as f:
+ with open(args.tones_dict, 'rt', encoding='utf-8') as f:
tone_id = [line.strip().split() for line in f.readlines()]
tone_size = len(tone_id)
print("tone_size:", tone_size)
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index 6b693440c..491edda30 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -106,7 +106,7 @@ def get_chunks(data, block_size: int, pad_size: int):
def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
# construct dataset for evaluation
sentences = []
- with open(text_file, 'rt') as f:
+ with open(text_file, 'rt', encoding='utf-8') as f:
for line in f:
if line.strip() != "":
items = re.split(r"\s+", line.strip(), 1)
@@ -325,17 +325,17 @@ def get_am_inference(am: str='fastspeech2_csmsc',
tones_dict: Optional[os.PathLike]=None,
speaker_dict: Optional[os.PathLike]=None,
return_am: bool=False):
- with open(phones_dict, "r") as f:
+ with open(phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
tone_size = None
if tones_dict is not None:
- with open(tones_dict, "r") as f:
+ with open(tones_dict, 'rt', encoding='utf-8') as f:
tone_id = [line.strip().split() for line in f.readlines()]
tone_size = len(tone_id)
spk_num = None
if speaker_dict is not None:
- with open(speaker_dict, 'rt') as f:
+ with open(speaker_dict, 'rt', encoding='utf-8') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
odim = am_config.n_mels
diff --git a/paddlespeech/t2s/exps/tacotron2/train.py b/paddlespeech/t2s/exps/tacotron2/train.py
index 69ff80e46..db88009a8 100644
--- a/paddlespeech/t2s/exps/tacotron2/train.py
+++ b/paddlespeech/t2s/exps/tacotron2/train.py
@@ -119,7 +119,7 @@ def train_sp(args, config):
num_workers=config.num_workers)
print("dataloaders done!")
- with open(args.phones_dict, "r") as f:
+ with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py
index da48b6b99..d49baad99 100644
--- a/paddlespeech/t2s/exps/transformer_tts/train.py
+++ b/paddlespeech/t2s/exps/transformer_tts/train.py
@@ -114,7 +114,7 @@ def train_sp(args, config):
num_workers=config.num_workers)
print("dataloaders done!")
- with open(args.phones_dict, "r") as f:
+ with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/exps/vits/normalize.py b/paddlespeech/t2s/exps/vits/normalize.py
index 514cbef8e..24e15765e 100644
--- a/paddlespeech/t2s/exps/vits/normalize.py
+++ b/paddlespeech/t2s/exps/vits/normalize.py
@@ -187,7 +187,7 @@ def main():
record["spk_emb"] = str(item["spk_emb"])
output_metadata.append(record)
- output_metadata.sort(key=itemgetter('feats_lengths'))
+ output_metadata.sort(key=itemgetter('feats_lengths'), reverse=True)
output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
with jsonlines.open(output_metadata_path, 'w') as writer:
for item in output_metadata:
diff --git a/paddlespeech/t2s/exps/vits/preprocess.py b/paddlespeech/t2s/exps/vits/preprocess.py
index 2b1a40834..d6b226a20 100644
--- a/paddlespeech/t2s/exps/vits/preprocess.py
+++ b/paddlespeech/t2s/exps/vits/preprocess.py
@@ -166,7 +166,7 @@ def process_sentences(config,
if record:
results.append(record)
- results.sort(key=itemgetter("feats_lengths"))
+ results.sort(key=itemgetter("feats_lengths"), reverse=True)
with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
for item in results:
writer.write(item)
diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py
index 07301db56..0e74bf631 100644
--- a/paddlespeech/t2s/exps/vits/train.py
+++ b/paddlespeech/t2s/exps/vits/train.py
@@ -78,7 +78,7 @@ def train_sp(args, config):
if args.speaker_dict is not None:
print("multiple speaker vits!")
collate_fn = vits_multi_spk_batch_fn
- with open(args.speaker_dict, 'rt') as f:
+ with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
fields += ["spk_id"]
@@ -110,7 +110,7 @@ def train_sp(args, config):
train_sampler = ErnieSATSampler(
train_dataset,
batch_size=config.batch_size,
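+            # metadata is now sorted by feats_lengths in descending order
+            # during preprocessing (see exps/vits/preprocess.py and
+            # normalize.py in this change), so shuffling is disabled here,
+            # presumably to keep length-sorted batches together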
- shuffle=True,
+ shuffle=False,
drop_last=True)
dev_sampler = ErnieSATSampler(
dev_dataset,
@@ -132,7 +132,7 @@ def train_sp(args, config):
num_workers=config.num_workers)
print("dataloaders done!")
- with open(args.phones_dict, "r") as f:
+ with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/frontend/g2pw/onnx_api.py b/paddlespeech/t2s/frontend/g2pw/onnx_api.py
index 47c26a610..3ce3d246d 100644
--- a/paddlespeech/t2s/frontend/g2pw/onnx_api.py
+++ b/paddlespeech/t2s/frontend/g2pw/onnx_api.py
@@ -100,7 +100,7 @@ class G2PWOnnxConverter:
]
self.non_polyphonic = {
'一', '不', '和', '咋', '嗲', '剖', '差', '攢', '倒', '難', '奔', '勁', '拗',
- '肖', '瘙', '誒', '泊', '听'
+ '肖', '瘙', '誒', '泊', '听', '噢'
}
self.non_monophonic = {'似', '攢'}
self.monophonic_chars = [
diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py
index 19c98d53f..c13a5ab62 100644
--- a/paddlespeech/t2s/frontend/mix_frontend.py
+++ b/paddlespeech/t2s/frontend/mix_frontend.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import re
from typing import Dict
from typing import List
@@ -18,6 +19,7 @@ import paddle
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
+from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor
class MixFrontend():
@@ -107,7 +109,40 @@ class MixFrontend():
add_sp: bool=True,
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
- segments = self.get_segment(sentence)
+        ''' 1. Add SSML support: first split the input into plain text and
+            tag contents, then collect them into the tmpSegments list.
+        '''
+        d_inputs = MixTextProcessor.get_dom_split(sentence)
+        tmpSegments = []
+        for instr in d_inputs:
+            ''' Only the say-as tag is supported for now. '''
+            if instr.lower().startswith("<say-as"):
+                tmpSegments.append((instr, "zh"))
+            else:
+                tmpSegments.extend(self.get_segment(instr))
+        ''' 2. Merge adjacent plain-text segments and wrap them in
+            <speak></speak>; tagged segments are kept as separate entries.
+        '''
+        segments = []
+        currentSeg = ["", ""]
+        for seg in tmpSegments:
+            if "<say-as" in seg[0] or "</say-as>" in seg[0]:
+                if currentSeg[0] != '':
+                    currentSeg[0] = "<speak>" + currentSeg[0] + "</speak>"
+                    segments.append(tuple(currentSeg))
+                segments.append(seg)
+                currentSeg = ["", ""]
+            else:
+                if currentSeg[0] == '':
+                    currentSeg[0] = seg[0]
+                    currentSeg[1] = seg[1]
+                else:
+                    currentSeg[0] = currentSeg[0] + seg[0]
+        if currentSeg[0] != '':
+            currentSeg[0] = "<speak>" + currentSeg[0] + "</speak>"
+            segments.append(tuple(currentSeg))
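+        # Illustrative input (assuming the <say-as pinyin="..."> SSML form):
+        # '你好<say-as pinyin="hao3">好</say-as>吗' keeps the tagged span as
+        # its own zh segment and wraps the merged surrounding text in
+        # <speak>...</speak>.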
phones_list = []
result = {}
@@ -120,11 +155,21 @@ class MixFrontend():
input_ids = self.en_frontend.get_input_ids(
content, merge_sentences=False, to_tensor=to_tensor)
else:
- input_ids = self.zh_frontend.get_input_ids(
- content,
- merge_sentences=False,
- get_tone_ids=get_tone_ids,
- to_tensor=to_tensor)
+                ''' 3. Process Chinese wrapped in <speak> tags separately
+                    from plain text.
+                '''
+                if content.strip() != "" and \
+                        re.match(r".*?<speak>.*?</speak>.*", content, re.DOTALL):
+ input_ids = self.zh_frontend.get_input_ids_ssml(
+ content,
+ merge_sentences=False,
+ get_tone_ids=get_tone_ids,
+ to_tensor=to_tensor)
+ else:
+ input_ids = self.zh_frontend.get_input_ids(
+ content,
+ merge_sentences=False,
+ get_tone_ids=get_tone_ids,
+ to_tensor=to_tensor)
if add_sp:
input_ids["phone_ids"][-1] = paddle.concat(
[input_ids["phone_ids"][-1], self.sp_id_tensor])
diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py
index 261db80a8..af86d9b80 100644
--- a/paddlespeech/t2s/frontend/phonectic.py
+++ b/paddlespeech/t2s/frontend/phonectic.py
@@ -58,7 +58,7 @@ class English(Phonetics):
self.punc = ":,;。?!“”‘’':,;.?!"
self.text_normalizer = TextNormalizer()
if phone_vocab_path:
- with open(phone_vocab_path, 'rt') as f:
+ with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
for phn, id in phn_id:
self.vocab_phones[phn] = int(id)
diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py
index ddd8cf5c7..35b97a93a 100644
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@@ -138,18 +138,18 @@ class Frontend():
"拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿",
"流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿",
"孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿",
- "狗儿"
+ "狗儿", "少儿"
}
self.vocab_phones = {}
self.vocab_tones = {}
if phone_vocab_path:
- with open(phone_vocab_path, 'rt') as f:
+ with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
for phn, id in phn_id:
self.vocab_phones[phn] = int(id)
if tone_vocab_path:
- with open(tone_vocab_path, 'rt') as f:
+ with open(tone_vocab_path, 'rt', encoding='utf-8') as f:
tone_id = [line.strip().split() for line in f.readlines()]
for tone, id in tone_id:
self.vocab_tones[tone] = int(id)
diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py
index 09e6827d0..1db9248ae 100644
--- a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py
+++ b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py
@@ -113,16 +113,18 @@ class Tacotron2Updater(StandardUpdater):
loss.backward()
optimizer.step()
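+        # attn_loss is only computed when guided attention loss is enabled,
+        # so report it inside the guard (previously it was referenced
+        # unconditionally below).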
+ if self.use_guided_attn_loss:
+ report("train/attn_loss", float(attn_loss))
+ losses_dict["attn_loss"] = float(attn_loss)
+
report("train/l1_loss", float(l1_loss))
report("train/mse_loss", float(mse_loss))
report("train/bce_loss", float(bce_loss))
- report("train/attn_loss", float(attn_loss))
report("train/loss", float(loss))
losses_dict["l1_loss"] = float(l1_loss)
losses_dict["mse_loss"] = float(mse_loss)
losses_dict["bce_loss"] = float(bce_loss)
- losses_dict["attn_loss"] = float(attn_loss)
losses_dict["loss"] = float(loss)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
@@ -202,17 +204,19 @@ class Tacotron2Evaluator(StandardEvaluator):
attn_loss = self.attn_loss(
att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in)
loss = loss + attn_loss
+
+ if self.use_guided_attn_loss:
+ report("eval/attn_loss", float(attn_loss))
+ losses_dict["attn_loss"] = float(attn_loss)
report("eval/l1_loss", float(l1_loss))
report("eval/mse_loss", float(mse_loss))
report("eval/bce_loss", float(bce_loss))
- report("eval/attn_loss", float(attn_loss))
report("eval/loss", float(loss))
losses_dict["l1_loss"] = float(l1_loss)
losses_dict["mse_loss"] = float(mse_loss)
losses_dict["bce_loss"] = float(bce_loss)
- losses_dict["attn_loss"] = float(attn_loss)
losses_dict["loss"] = float(loss)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
diff --git a/paddlespeech/t2s/models/vits/text_encoder.py b/paddlespeech/t2s/models/vits/text_encoder.py
index 799e0c759..015ed76c6 100644
--- a/paddlespeech/t2s/models/vits/text_encoder.py
+++ b/paddlespeech/t2s/models/vits/text_encoder.py
@@ -24,6 +24,7 @@ from paddle import nn
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder as Encoder
+from paddlespeech.utils.initialize import normal_
class TextEncoder(nn.Layer):
@@ -105,10 +106,6 @@ class TextEncoder(nn.Layer):
# define modules
self.emb = nn.Embedding(vocabs, attention_dim)
- dist = paddle.distribution.Normal(loc=0.0, scale=attention_dim**-0.5)
- w = dist.sample(self.emb.weight.shape)
- self.emb.weight.set_value(w)
-
self.encoder = Encoder(
idim=-1,
input_layer=None,
@@ -130,6 +127,8 @@ class TextEncoder(nn.Layer):
cnn_module_kernel=conformer_kernel_size, )
self.proj = nn.Conv1D(attention_dim, attention_dim * 2, 1)
+ self.reset_parameters()
+
def forward(
self,
x: paddle.Tensor,
@@ -166,3 +165,9 @@ class TextEncoder(nn.Layer):
m, logs = paddle.split(stats, 2, axis=1)
return x, m, logs, x_mask
+
+ def reset_parameters(self):
+ normal_(self.emb.weight, mean=0.0, std=self.attention_dim**-0.5)
+ if self.emb._padding_idx is not None:
+ with paddle.no_grad():
+ self.emb.weight[self.emb._padding_idx] = 0
diff --git a/paddlespeech/t2s/models/vits/vits.py b/paddlespeech/t2s/models/vits/vits.py
index 0ff3a546d..e68ed5643 100644
--- a/paddlespeech/t2s/models/vits/vits.py
+++ b/paddlespeech/t2s/models/vits/vits.py
@@ -13,6 +13,7 @@
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""VITS module"""
+import math
from typing import Any
from typing import Dict
from typing import Optional
@@ -27,7 +28,12 @@ from paddlespeech.t2s.models.hifigan import HiFiGANMultiScaleMultiPeriodDiscrimi
from paddlespeech.t2s.models.hifigan import HiFiGANPeriodDiscriminator
from paddlespeech.t2s.models.hifigan import HiFiGANScaleDiscriminator
from paddlespeech.t2s.models.vits.generator import VITSGenerator
-from paddlespeech.t2s.modules.nets_utils import initialize
+from paddlespeech.utils.initialize import _calculate_fan_in_and_fan_out
+from paddlespeech.utils.initialize import kaiming_uniform_
+from paddlespeech.utils.initialize import normal_
+from paddlespeech.utils.initialize import ones_
+from paddlespeech.utils.initialize import uniform_
+from paddlespeech.utils.initialize import zeros_
AVAILABLE_GENERATERS = {
"vits_generator": VITSGenerator,
@@ -152,8 +158,7 @@ class VITS(nn.Layer):
"use_spectral_norm": False,
},
},
- cache_generator_outputs: bool=True,
- init_type: str="xavier_uniform", ):
+ cache_generator_outputs: bool=True, ):
"""Initialize VITS module.
Args:
idim (int):
@@ -179,9 +184,6 @@ class VITS(nn.Layer):
assert check_argument_types()
super().__init__()
- # initialize parameters
- initialize(self, init_type)
-
# define modules
generator_class = AVAILABLE_GENERATERS[generator_type]
if generator_type == "vits_generator":
@@ -196,8 +198,6 @@ class VITS(nn.Layer):
self.discriminator = discriminator_class(
**discriminator_params, )
- nn.initializer.set_global_initializer(None)
-
# cache
self.cache_generator_outputs = cache_generator_outputs
self._cache = None
@@ -214,6 +214,10 @@ class VITS(nn.Layer):
self.reuse_cache_gen = True
self.reuse_cache_dis = True
+ self.reset_parameters()
+ self.generator.decoder.reset_parameters()
+ self.generator.text_encoder.reset_parameters()
+
def forward(
self,
text: paddle.Tensor,
@@ -243,7 +247,7 @@ class VITS(nn.Layer):
forward_generator (bool):
Whether to forward generator.
Returns:
-
+
"""
if forward_generator:
return self._forward_generator(
@@ -290,7 +294,7 @@ class VITS(nn.Layer):
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
Returns:
-
+
"""
# setup
feats = feats.transpose([0, 2, 1])
@@ -497,3 +501,34 @@ class VITS(nn.Layer):
lids, )
return dict(wav=paddle.reshape(wav, [-1]))
+
+ def reset_parameters(self):
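+        # Re-create PyTorch-style default initialization for conv, linear,
+        # norm and embedding sublayers (Kaiming-uniform weights with
+        # a=sqrt(5) and fan-in-scaled uniform biases), replacing the removed
+        # global xavier_uniform initializer.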
+ def _reset_parameters(module):
+ if isinstance(module,
+ (nn.Conv1D, nn.Conv1DTranspose, nn.Conv2D, nn.Conv2DTranspose)):
+ kaiming_uniform_(module.weight, a=math.sqrt(5))
+ if module.bias is not None:
+ fan_in, _ = _calculate_fan_in_and_fan_out(module.weight)
+ if fan_in != 0:
+ bound = 1 / math.sqrt(fan_in)
+ uniform_(module.bias, -bound, bound)
+
+ if isinstance(module,
+ (nn.BatchNorm1D, nn.BatchNorm2D, nn.GroupNorm, nn.LayerNorm)):
+ ones_(module.weight)
+ zeros_(module.bias)
+
+ if isinstance(module, nn.Linear):
+ kaiming_uniform_(module.weight, a=math.sqrt(5))
+ if module.bias is not None:
+ fan_in, _ = _calculate_fan_in_and_fan_out(module.weight)
+ bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+ uniform_(module.bias, -bound, bound)
+
+ if isinstance(module, nn.Embedding):
+ normal_(module.weight)
+ if module._padding_idx is not None:
+ with paddle.no_grad():
+ module.weight[module._padding_idx] = 0
+
+ self.apply(_reset_parameters)
diff --git a/paddlespeech/t2s/modules/diffusion.py b/paddlespeech/t2s/modules/diffusion.py
new file mode 100644
index 000000000..eb67ffb0d
--- /dev/null
+++ b/paddlespeech/t2s/modules/diffusion.py
@@ -0,0 +1,461 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Diffusion denoising related modules for paddle"""
+import math
+from typing import Callable
+from typing import Optional
+from typing import Tuple
+
+import paddle
+import ppdiffusers
+from paddle import nn
+from ppdiffusers.models.embeddings import Timesteps
+from ppdiffusers.schedulers import DDPMScheduler
+
+from paddlespeech.t2s.modules.nets_utils import initialize
+from paddlespeech.t2s.modules.residual_block import WaveNetResidualBlock
+
+
+class WaveNetDenoiser(nn.Layer):
+ """A Mel-Spectrogram Denoiser modified from WaveNet
+
+ Args:
+ in_channels (int, optional):
+ Number of channels of the input mel-spectrogram, by default 80
+ out_channels (int, optional):
+ Number of channels of the output mel-spectrogram, by default 80
+ kernel_size (int, optional):
+ Kernel size of the residual blocks inside, by default 3
+ layers (int, optional):
+ Number of residual blocks inside, by default 20
+ stacks (int, optional):
+ The number of groups to split the residual blocks into, by default 5
+ Within each group, the dilation of the residual block grows exponentially.
+ residual_channels (int, optional):
+ Residual channel of the residual blocks, by default 256
+ gate_channels (int, optional):
+ Gate channel of the residual blocks, by default 512
+ skip_channels (int, optional):
+ Skip channel of the residual blocks, by default 256
+ aux_channels (int, optional):
+ Auxiliary channel of the residual blocks, by default 256
+ dropout (float, optional):
+ Dropout of the residual blocks, by default 0.
+ bias (bool, optional):
+ Whether to use bias in residual blocks, by default True
+ use_weight_norm (bool, optional):
+ Whether to use weight norm in all convolutions, by default False
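+        init_type (str, optional):
+            Initialization type for the parameters, by default "kaiming_normal"
+
+    Examples:
+        >>> # Minimal smoke test (a sketch; shapes follow the defaults above).
+        >>> import paddle
+        >>> denoiser = WaveNetDenoiser()
+        >>> x = paddle.randn([4, 80, 192])     # noisy mel input (N, C_in, T)
+        >>> t = paddle.randint(0, 100, (4, ))  # diffusion timesteps (N,)
+        >>> c = paddle.randn([4, 256, 192])    # conditioning input (N, C_aux, T)
+        >>> denoiser(x, t, c).shape
+        [4, 80, 192]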
+ """
+
+ def __init__(
+ self,
+ in_channels: int=80,
+ out_channels: int=80,
+ kernel_size: int=3,
+ layers: int=20,
+ stacks: int=5,
+ residual_channels: int=256,
+ gate_channels: int=512,
+ skip_channels: int=256,
+ aux_channels: int=256,
+ dropout: float=0.,
+ bias: bool=True,
+ use_weight_norm: bool=False,
+ init_type: str="kaiming_normal", ):
+ super().__init__()
+
+ # initialize parameters
+ initialize(self, init_type)
+
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.aux_channels = aux_channels
+ self.layers = layers
+ self.stacks = stacks
+ self.kernel_size = kernel_size
+
+ assert layers % stacks == 0
+ layers_per_stack = layers // stacks
+
+ self.first_t_emb = nn.Sequential(
+ Timesteps(
+ residual_channels,
+ flip_sin_to_cos=False,
+ downscale_freq_shift=1),
+ nn.Linear(residual_channels, residual_channels * 4),
+ nn.Mish(), nn.Linear(residual_channels * 4, residual_channels))
+ self.t_emb_layers = nn.LayerList([
+ nn.Linear(residual_channels, residual_channels)
+ for _ in range(layers)
+ ])
+
+ self.first_conv = nn.Conv1D(
+ in_channels, residual_channels, 1, bias_attr=True)
+ self.first_act = nn.ReLU()
+
+ self.conv_layers = nn.LayerList()
+ for layer in range(layers):
+ dilation = 2**(layer % layers_per_stack)
+ conv = WaveNetResidualBlock(
+ kernel_size=kernel_size,
+ residual_channels=residual_channels,
+ gate_channels=gate_channels,
+ skip_channels=skip_channels,
+ aux_channels=aux_channels,
+ dilation=dilation,
+ dropout=dropout,
+ bias=bias)
+ self.conv_layers.append(conv)
+
+ final_conv = nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True)
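+        # zero-init the output projection so the untrained denoiser starts by
+        # predicting (near) zero noise, a common trick in diffusion models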
+ nn.initializer.Constant(0.0)(final_conv.weight)
+ self.last_conv_layers = nn.Sequential(nn.ReLU(),
+ nn.Conv1D(
+ skip_channels,
+ skip_channels,
+ 1,
+ bias_attr=True),
+ nn.ReLU(), final_conv)
+
+ if use_weight_norm:
+ self.apply_weight_norm()
+
+ def forward(self, x, t, c):
+ """Denoise mel-spectrogram.
+
+ Args:
+ x(Tensor):
+ Shape (N, C_in, T), The input mel-spectrogram.
+ t(Tensor):
+ Shape (N), The timestep input.
+ c(Tensor):
+ Shape (N, C_aux, T'). The auxiliary input (e.g. fastspeech2 encoder output).
+
+ Returns:
+ Tensor: Shape (N, C_out, T), the denoised mel-spectrogram.
+ """
+ assert c.shape[-1] == x.shape[-1]
+
+ if t.shape[0] != x.shape[0]:
+ t = t.tile([x.shape[0]])
+ t_emb = self.first_t_emb(t)
+ t_embs = [
+ t_emb_layer(t_emb)[..., None] for t_emb_layer in self.t_emb_layers
+ ]
+
+ x = self.first_conv(x)
+ x = self.first_act(x)
+ skips = 0
+ for f, t in zip(self.conv_layers, t_embs):
+ x = x + t
+ x, s = f(x, c)
+ skips += s
+ skips *= math.sqrt(1.0 / len(self.conv_layers))
+
+ x = self.last_conv_layers(skips)
+ return x
+
+ def apply_weight_norm(self):
+ """Recursively apply weight normalization to all the Convolution layers
+ in the sublayers.
+ """
+
+ def _apply_weight_norm(layer):
+ if isinstance(layer, (nn.Conv1D, nn.Conv2D)):
+ nn.utils.weight_norm(layer)
+
+ self.apply(_apply_weight_norm)
+
+ def remove_weight_norm(self):
+ """Recursively remove weight normalization from all the Convolution
+ layers in the sublayers.
+ """
+
+ def _remove_weight_norm(layer):
+ try:
+ nn.utils.remove_weight_norm(layer)
+ except ValueError:
+ pass
+
+ self.apply(_remove_weight_norm)
+
+
+class GaussianDiffusion(nn.Layer):
+ """Common Gaussian Diffusion Denoising Model Module
+
+ Args:
+ denoiser (Layer, optional):
+            The denoising model (e.g. the WaveNetDenoiser above) that predicts the added noise.
+        num_train_timesteps (int, optional):
+            The number of diffusion timesteps between the real sample and pure noise used during training, by default 1000.
+        beta_start (float, optional):
+            beta start parameter for the scheduler, by default 0.0001.
+        beta_end (float, optional):
+            beta end parameter for the scheduler, by default 0.02.
+        beta_schedule (str, optional):
+            beta schedule parameter for the scheduler, by default 'squaredcos_cap_v2' (cosine schedule).
+        num_max_timesteps (int, optional):
+            The maximum timestep for the transition from real sample to noise, by default None.
+
+ Examples:
+ >>> import paddle
+ >>> import paddle.nn.functional as F
+ >>> from tqdm import tqdm
+ >>>
+ >>> denoiser = WaveNetDenoiser()
+ >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=1000, num_max_timesteps=100)
+ >>> x = paddle.ones([4, 80, 192]) # [B, mel_ch, T] # real mel input
+ >>> c = paddle.randn([4, 256, 192]) # [B, fs2_encoder_out_ch, T] # fastspeech2 encoder output
+ >>> loss = F.mse_loss(*diffusion(x, c))
+ >>> loss.backward()
+ >>> print('MSE Loss:', loss.item())
+ MSE Loss: 1.6669728755950928
+ >>> def create_progress_callback():
+ >>> pbar = None
+ >>> def callback(index, timestep, num_timesteps, sample):
+ >>> nonlocal pbar
+ >>> if pbar is None:
+ >>> pbar = tqdm(total=num_timesteps)
+ >>> pbar.update(index)
+ >>> pbar.update()
+ >>>
+ >>> return callback
+ >>>
+ >>> # ds=1000, K_step=60, scheduler=ddpm, from aux fs2 mel output
+ >>> ds = 1000
+ >>> infer_steps = 1000
+ >>> K_step = 60
+ >>> scheduler_type = 'ddpm'
+ >>> x_in = x
+ >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step)
+ >>> with paddle.no_grad():
+ >>> sample = diffusion.inference(
+ >>> paddle.randn(x.shape), c, ref_x=x_in,
+ >>> num_inference_steps=infer_steps,
+ >>> scheduler_type=scheduler_type,
+ >>> callback=create_progress_callback())
+ 100%|█████| 60/60 [00:03<00:00, 18.36it/s]
+ >>>
+ >>> # ds=100, K_step=100, scheduler=ddpm, from gaussian noise
+ >>> ds = 100
+ >>> infer_steps = 100
+ >>> K_step = 100
+ >>> scheduler_type = 'ddpm'
+ >>> x_in = None
+ >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step)
+ >>> with paddle.no_grad():
+ >>> sample = diffusion.inference(
+ >>> paddle.randn(x.shape), c, ref_x=x_in,
+ >>> num_inference_steps=infer_steps,
+ >>> scheduler_type=scheduler_type,
+ >>> callback=create_progress_callback())
+ 100%|█████| 100/100 [00:05<00:00, 18.29it/s]
+ >>>
+ >>> # ds=1000, K_step=1000, scheduler=pndm, infer_step=25, from gaussian noise
+ >>> ds = 1000
+ >>> infer_steps = 25
+ >>> K_step = 1000
+ >>> scheduler_type = 'pndm'
+ >>> x_in = None
+ >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step)
+ >>> with paddle.no_grad():
+ >>> sample = diffusion.inference(
+ >>> paddle.randn(x.shape), c, ref_x=x_in,
+ >>> num_inference_steps=infer_steps,
+ >>> scheduler_type=scheduler_type,
+ >>> callback=create_progress_callback())
+ 100%|█████| 34/34 [00:01<00:00, 19.75it/s]
+ >>>
+ >>> # ds=1000, K_step=100, scheduler=pndm, infer_step=50, from aux fs2 mel output
+ >>> ds = 1000
+ >>> infer_steps = 50
+ >>> K_step = 100
+ >>> scheduler_type = 'pndm'
+ >>> x_in = x
+ >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step)
+ >>> with paddle.no_grad():
+ >>> sample = diffusion.inference(
+ >>> paddle.randn(x.shape), c, ref_x=x_in,
+ >>> num_inference_steps=infer_steps,
+ >>> scheduler_type=scheduler_type,
+ >>> callback=create_progress_callback())
+ 100%|█████| 14/14 [00:00<00:00, 23.80it/s]
+
+ """
+
+ def __init__(self,
+ denoiser: nn.Layer,
+ num_train_timesteps: Optional[int]=1000,
+ beta_start: Optional[float]=0.0001,
+ beta_end: Optional[float]=0.02,
+ beta_schedule: Optional[str]="squaredcos_cap_v2",
+ num_max_timesteps: Optional[int]=None):
+ super().__init__()
+
+ self.num_train_timesteps = num_train_timesteps
+ self.beta_start = beta_start
+ self.beta_end = beta_end
+ self.beta_schedule = beta_schedule
+
+ self.denoiser = denoiser
+ self.noise_scheduler = DDPMScheduler(
+ num_train_timesteps=num_train_timesteps,
+ beta_start=beta_start,
+ beta_end=beta_end,
+ beta_schedule=beta_schedule)
+ self.num_max_timesteps = num_max_timesteps
+
+ def forward(self, x: paddle.Tensor, cond: Optional[paddle.Tensor]=None
+ ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+ """Generate random timesteps noised x.
+
+ Args:
+ x (Tensor):
+ The input for adding noises.
+ cond (Tensor, optional):
+ Conditional input for compute noises.
+
+ Returns:
+ y (Tensor):
+ The output with noises added in.
+ target (Tensor):
+ The noises which is added to the input.
+
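+        Examples:
+            >>> # a minimal training sketch (assumes ``diffusion``, a mel batch ``x``
+            >>> # and its condition ``c`` as in the class docstring examples above)
+            >>> y, target = diffusion(x, c)
+            >>> loss = paddle.nn.functional.mse_loss(y, target)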
+ """
+ noise_scheduler = self.noise_scheduler
+
+ # Sample noise that we'll add to the mel-spectrograms
+ target = noise = paddle.randn(x.shape)
+
+ # Sample a random timestep for each mel-spectrogram
+ num_timesteps = self.num_train_timesteps
+ if self.num_max_timesteps is not None:
+ num_timesteps = self.num_max_timesteps
+ timesteps = paddle.randint(0, num_timesteps, (x.shape[0], ))
+
+ # Add noise to the clean mel-spectrograms according to the noise magnitude at each timestep
+ # (this is the forward diffusion process)
+ noisy_images = noise_scheduler.add_noise(x, noise, timesteps)
+
+ y = self.denoiser(noisy_images, timesteps, cond)
+
+        # the loss is then computed from output y and the noise target when prediction_type == "epsilon"
+ return y, target
+
+ def inference(self,
+ noise: paddle.Tensor,
+ cond: Optional[paddle.Tensor]=None,
+ ref_x: Optional[paddle.Tensor]=None,
+ num_inference_steps: Optional[int]=1000,
+ strength: Optional[float]=None,
+ scheduler_type: Optional[str]="ddpm",
+ callback: Optional[Callable[[int, int, int, paddle.Tensor],
+ None]]=None,
+ callback_steps: Optional[int]=1):
+ """Denoising input from noises. Refer to ppdiffusers img2img pipeline.
+
+ Args:
+ noise (Tensor):
+ The input tensor as a starting point for denoising.
+ cond (Tensor, optional):
+ Conditional input for compute noises.
+ ref_x (Tensor, optional):
+ The real output for the denoising process to refer.
+ num_inference_steps (int, optional):
+ The number of timesteps between the noise and the real during inference, by default 1000.
+ strength (float, optional):
+ Mixing strength of ref_x with noise. The larger the value, the stronger the noise.
+ Range [0,1], by default None.
+ scheduler_type (str, optional):
+ Noise scheduler for generate noises.
+ Choose a great scheduler can skip many denoising step, by default 'ddpm'.
+ callback (Callable[[int,int,int,Tensor], None], optional):
+ Callback function during denoising steps.
+
+ Args:
+ index (int):
+ Current denoising index.
+ timestep (int):
+ Current denoising timestep.
+ num_timesteps (int):
+ Number of the denoising timesteps.
+ denoised_output (Tensor):
+ Current intermediate result produced during denoising.
+
+ callback_steps (int, optional):
+ The step to call the callback function.
+
+ Returns:
+ denoised_output (Tensor):
+ The denoised output tensor.
+
+ """
+ scheduler_cls = None
+ for clsname in dir(ppdiffusers.schedulers):
+ if clsname.lower() == scheduler_type + "scheduler":
+ scheduler_cls = getattr(ppdiffusers.schedulers, clsname)
+ break
+
+ if scheduler_cls is None:
+ raise ValueError(f"No such scheduler type named {scheduler_type}")
+
+ scheduler = scheduler_cls(
+ num_train_timesteps=self.num_train_timesteps,
+ beta_start=self.beta_start,
+ beta_end=self.beta_end,
+ beta_schedule=self.beta_schedule)
+
+ # set timesteps
+ scheduler.set_timesteps(num_inference_steps)
+
+ # prepare first noise variables
+ noisy_input = noise
+ timesteps = scheduler.timesteps
+ if ref_x is not None:
+ init_timestep = None
+ if strength is None or strength < 0. or strength > 1.:
+ strength = None
+ if self.num_max_timesteps is not None:
+ strength = self.num_max_timesteps / self.num_train_timesteps
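+                # e.g. num_max_timesteps=60 and num_train_timesteps=1000 give
+                # strength 0.06, so only the last 6% of the schedule runs when
+                # refining an fs2 mel (matching the 60-step example above)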
+ if strength is not None:
+ # get the original timestep using init_timestep
+ init_timestep = min(
+ int(num_inference_steps * strength), num_inference_steps)
+ t_start = max(num_inference_steps - init_timestep, 0)
+ timesteps = scheduler.timesteps[t_start:]
+ num_inference_steps = num_inference_steps - t_start
+ noisy_input = scheduler.add_noise(
+ ref_x, noise, timesteps[:1].tile([noise.shape[0]]))
+
+ # denoising loop
+ denoised_output = noisy_input
+ num_warmup_steps = len(
+ timesteps) - num_inference_steps * scheduler.order
+ for i, t in enumerate(timesteps):
+ denoised_output = scheduler.scale_model_input(denoised_output, t)
+
+ # predict the noise residual
+ noise_pred = self.denoiser(denoised_output, t, cond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ denoised_output = scheduler.step(noise_pred, t,
+ denoised_output).prev_sample
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and
+ (i + 1) % scheduler.order == 0):
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, len(timesteps), denoised_output)
+
+ return denoised_output
diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/ssml/xml_processor.py
index b39121347..892ca371e 100644
--- a/paddlespeech/t2s/ssml/xml_processor.py
+++ b/paddlespeech/t2s/ssml/xml_processor.py
@@ -74,6 +74,28 @@ class MixTextProcessor():
ctlist.append([mixstr, []])
return ctlist
+ @classmethod
+ def get_dom_split(self, mixstr):
+        ''' Split the text into ordered segments in a list; return the plain text and the say-as tags.
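+
+            e.g. (a hypothetical input, assuming the <speak> pattern below):
+            '前言<speak>你好<say-as pinyin="hao3">好</say-as></speak>后记'
+            -> ['前言', '你好', '<say-as pinyin="hao3">好</say-as>', '后记']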
+ '''
+ ctlist = []
+        patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S)
+ mat = re.match(patn, mixstr)
+ if mat:
+ pre_xml = mat.group(1)
+ in_xml = mat.group(2)
+ after_xml = mat.group(3)
+
+ ctlist.append(pre_xml)
+ dom = DomXml(in_xml)
+ tags = dom.get_text_and_sayas_tags()
+ ctlist.extend(tags)
+
+ ctlist.append(after_xml)
+ return ctlist
+ else:
+ ctlist.append(mixstr)
+ return ctlist
class DomXml():
def __init__(self, xmlstr):
@@ -156,3 +178,15 @@ class DomXml():
if x.hasAttribute('pinyin'): # pinyin
print(x.tagName, 'pinyin',
x.getAttribute('pinyin'), x.firstChild.data)
+
+ def get_text_and_sayas_tags(self):
+        '''Return the xml content as a list, including all text segments and tags.'''
+ res = []
+
+ for x1 in self.rnode:
+ if x1.nodeType == Node.TEXT_NODE:
+                res.append(x1.data)
+ else:
+ for x2 in x1.childNodes:
+ res.append(x2.toxml())
+ return res
diff --git a/paddlespeech/utils/initialize.py b/paddlespeech/utils/initialize.py
new file mode 100644
index 000000000..8ebe6845e
--- /dev/null
+++ b/paddlespeech/utils/initialize.py
@@ -0,0 +1,321 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py
+The copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file.
+"""
+import math
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+
+__all__ = [
+ "uniform_",
+ "normal_",
+ "constant_",
+ "ones_",
+ "zeros_",
+ "xavier_uniform_",
+ "xavier_normal_",
+ "kaiming_uniform_",
+ "kaiming_normal_",
+ "linear_init_",
+ "conv_init_",
+ "reset_initialized_parameter",
+ "_calculate_fan_in_and_fan_out",
+]
+
+
+def _no_grad_uniform_(tensor, a, b):
+ with paddle.no_grad():
+ tensor.set_value(
+ paddle.uniform(
+ shape=tensor.shape, dtype=tensor.dtype, min=a, max=b))
+ return tensor
+
+
+def _no_grad_normal_(tensor, mean=0.0, std=1.0):
+ with paddle.no_grad():
+ tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape))
+ return tensor
+
+
+def _no_grad_fill_(tensor, value=0.0):
+ with paddle.no_grad():
+ tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype))
+ return tensor
+
+
+def uniform_(tensor, a, b):
+ """
+    Modify tensor in place with values drawn from the uniform distribution U(a, b).
+ Args:
+ tensor (paddle.Tensor): paddle Tensor
+ a (float|int): min value.
+ b (float|int): max value.
+ Return:
+ tensor
+ """
+ return _no_grad_uniform_(tensor, a, b)
+
+
+def normal_(tensor, mean=0.0, std=1.0):
+ """
+    Modify tensor in place with values drawn from a normal distribution.
+ Args:
+ tensor (paddle.Tensor): paddle Tensor
+ mean (float|int): mean value.
+ std (float|int): std value.
+ Return:
+ tensor
+ """
+ return _no_grad_normal_(tensor, mean, std)
+
+
+def constant_(tensor, value=0.0):
+ """
+    Fill tensor in place with a constant value.
+ Args:
+ tensor (paddle.Tensor): paddle Tensor
+ value (float|int): value to fill tensor.
+ Return:
+ tensor
+ """
+ return _no_grad_fill_(tensor, value)
+
+
+def ones_(tensor):
+ """
+    Fill tensor in place with ones.
+ Args:
+ tensor (paddle.Tensor): paddle Tensor
+ Return:
+ tensor
+ """
+ return _no_grad_fill_(tensor, 1)
+
+
+def zeros_(tensor):
+ """
+    Fill tensor in place with zeros.
+ Args:
+ tensor (paddle.Tensor): paddle Tensor
+ Return:
+ tensor
+ """
+ return _no_grad_fill_(tensor, 0)
+
+
+def vector_(tensor, vector):
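+    """Fill tensor in place with the given vector, cast to tensor.dtype."""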
+ with paddle.no_grad():
+ tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype))
+ return tensor
+
+
+def _calculate_fan_in_and_fan_out(tensor, reverse=False):
+ """
+    Calculate (fan_in, fan_out) for tensor
+ Args:
+ tensor (Tensor): paddle.Tensor
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...]; e.g. conv.weight [cout, cin, kh, kw] uses False, linear.weight [cin, cout] uses True.
+ Return:
+ Tuple[fan_in, fan_out]
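+
+    e.g. a conv weight of shape [64, 32, 3, 3] (reverse=False) gives
+    fan_in = 32 * 3 * 3 = 288 and fan_out = 64 * 3 * 3 = 576.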
+ """
+ if tensor.ndim < 2:
+ raise ValueError(
+ "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
+ )
+
+ if reverse:
+ num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1]
+ else:
+ num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0]
+
+ receptive_field_size = 1
+ if tensor.ndim > 2:
+ receptive_field_size = np.prod(tensor.shape[2:])
+
+ fan_in = num_input_fmaps * receptive_field_size
+ fan_out = num_output_fmaps * receptive_field_size
+
+ return fan_in, fan_out
+
+
+def xavier_uniform_(tensor, gain=1.0, reverse=False):
+ """
+    Modify tensor in place with Xavier uniform initialization.
+ Args:
+ tensor (paddle.Tensor): paddle Tensor
+        gain (float): scaling factor, 1.0 by default.
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...].
+ Return:
+ tensor
+ """
+ fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
+ std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
+ k = math.sqrt(3.0) * std
+ return _no_grad_uniform_(tensor, -k, k)
+
+
+def xavier_normal_(tensor, gain=1.0, reverse=False):
+ """
+    Modify tensor in place with Xavier normal initialization.
+ Args:
+ tensor (paddle.Tensor): paddle Tensor
+        gain (float): scaling factor, 1.0 by default.
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...].
+ Return:
+ tensor
+ """
+ fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
+ std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
+ return _no_grad_normal_(tensor, 0, std)
+
+
+# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html
+def _calculate_correct_fan(tensor, mode, reverse=False):
+ mode = mode.lower()
+ valid_modes = ["fan_in", "fan_out"]
+ if mode not in valid_modes:
+ raise ValueError("Mode {} not supported, please use one of {}".format(
+ mode, valid_modes))
+
+ fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse)
+
+ return fan_in if mode == "fan_in" else fan_out
+
+
+def _calculate_gain(nonlinearity, param=None):
+ linear_fns = [
+ "linear", "conv1d", "conv2d", "conv3d", "conv_transpose1d",
+ "conv_transpose2d", "conv_transpose3d"
+ ]
+ if nonlinearity in linear_fns or nonlinearity == "sigmoid":
+ return 1
+ elif nonlinearity == "tanh":
+ return 5.0 / 3
+ elif nonlinearity == "relu":
+ return math.sqrt(2.0)
+ elif nonlinearity == "leaky_relu":
+ if param is None:
+ negative_slope = 0.01
+ elif not isinstance(param, bool) and isinstance(
+ param, int) or isinstance(param, float):
+ # True/False are instances of int, hence check above
+ negative_slope = param
+ else:
+ raise ValueError(
+ "negative_slope {} not a valid number".format(param))
+ return math.sqrt(2.0 / (1 + negative_slope**2))
+ elif nonlinearity == "selu":
+ return 3.0 / 4
+ else:
+ raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
+
+
+def kaiming_uniform_(tensor,
+ a=0,
+ mode="fan_in",
+ nonlinearity="leaky_relu",
+ reverse=False):
+ """
+    Modify tensor in place with Kaiming uniform initialization.
+ Args:
+        tensor (paddle.Tensor): paddle Tensor
+        a (float): the negative slope used with 'leaky_relu', 0 by default.
+        mode (str): one of ['fan_in', 'fan_out'], 'fan_in' by default.
+        nonlinearity (str): nonlinearity method name.
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...].
+ Return:
+ tensor
+ """
+ fan = _calculate_correct_fan(tensor, mode, reverse)
+ gain = _calculate_gain(nonlinearity, a)
+ std = gain / math.sqrt(fan)
+ k = math.sqrt(3.0) * std
+ return _no_grad_uniform_(tensor, -k, k)
+
+
+def kaiming_normal_(tensor,
+ a=0,
+ mode="fan_in",
+ nonlinearity="leaky_relu",
+ reverse=False):
+ """
+    Modify tensor in place with Kaiming normal initialization.
+ Args:
+        tensor (paddle.Tensor): paddle Tensor
+        a (float): the negative slope used with 'leaky_relu', 0 by default.
+        mode (str): one of ['fan_in', 'fan_out'], 'fan_in' by default.
+        nonlinearity (str): nonlinearity method name.
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...].
+ Return:
+ tensor
+ """
+ fan = _calculate_correct_fan(tensor, mode, reverse)
+ gain = _calculate_gain(nonlinearity, a)
+ std = gain / math.sqrt(fan)
+ return _no_grad_normal_(tensor, 0, std)
+
+
+def linear_init_(module):
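+    # PyTorch-style default Linear init: Uniform(-1/sqrt(fan_in), 1/sqrt(fan_in));
+    # paddle.nn.Linear stores weight as [in_features, out_features], so shape[0] is fan_in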
+ bound = 1 / math.sqrt(module.weight.shape[0])
+ uniform_(module.weight, -bound, bound)
+ uniform_(module.bias, -bound, bound)
+
+
+def conv_init_(module):
+ bound = 1 / np.sqrt(np.prod(module.weight.shape[1:]))
+ uniform_(module.weight, -bound, bound)
+ if module.bias is not None:
+ uniform_(module.bias, -bound, bound)
+
+
+def bias_init_with_prob(prior_prob=0.01):
+ """initialize conv/fc bias value according to a given probability value."""
+ bias_init = float(-np.log((1 - prior_prob) / prior_prob))
+ return bias_init
+
+
+@paddle.no_grad()
+def reset_initialized_parameter(model, include_self=True):
+ """
+    Reset initialized parameters with the following methods for [conv, linear, embedding, bn]
+ Args:
+ model (paddle.Layer): paddle Layer
+        include_self (bool): passed to Layer.named_sublayers; indicates whether the model itself is included, True by default.
+ Return:
+ None
+ """
+ for _, m in model.named_sublayers(include_self=include_self):
+ if isinstance(m, nn.Conv2D):
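+            # PyTorch Conv2d default: Uniform(-sqrt(k), sqrt(k)) with
+            # k = groups / (in_channels * kh * kw)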
+ k = float(m._groups) / (m._in_channels * m._kernel_size[0] *
+ m._kernel_size[1])
+ k = math.sqrt(k)
+ _no_grad_uniform_(m.weight, -k, k)
+ if hasattr(m, "bias") and getattr(m, "bias") is not None:
+ _no_grad_uniform_(m.bias, -k, k)
+
+ elif isinstance(m, nn.Linear):
+ k = math.sqrt(1.0 / m.weight.shape[0])
+ _no_grad_uniform_(m.weight, -k, k)
+ if hasattr(m, "bias") and getattr(m, "bias") is not None:
+ _no_grad_uniform_(m.bias, -k, k)
+
+ elif isinstance(m, nn.Embedding):
+ _no_grad_normal_(m.weight, mean=0.0, std=1.0)
+
+ elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)):
+ _no_grad_fill_(m.weight, 1.0)
+ if hasattr(m, "bias") and getattr(m, "bias") is not None:
+ _no_grad_fill_(m.bias, 0)
diff --git a/setup.py b/setup.py
index 3bde2b205..76bc5be8d 100644
--- a/setup.py
+++ b/setup.py
@@ -37,9 +37,7 @@ base = [
"g2pM",
"h5py",
"inflect",
- "jieba",
"jsonlines",
- "kaldiio",
"librosa==0.8.1",
"loguru",
"matplotlib",
@@ -49,38 +47,29 @@ base = [
"opencc-python-reimplemented",
"pandas",
"paddlenlp>=2.4.8",
+ "ppdiffusers>=0.9.0",
"paddlespeech_feat",
- "Pillow>=9.0.0",
- "praatio==5.0.0",
- "protobuf>=3.1.0, <=3.20.0",
+ "praatio>=5.0.0",
"pypinyin<=0.44.0",
"pypinyin-dict",
"python-dateutil",
- "pyworld==0.2.12",
- "resampy==0.2.2",
+ "pyworld>=0.2.12",
+ "resampy",
"sacrebleu",
- "scipy",
- "sentencepiece~=0.1.96",
- "soundfile~=0.10",
"textgrid",
"timer",
- "tqdm",
"typeguard",
- "visualdl",
"webrtcvad",
"yacs~=0.1.8",
"prettytable",
"zhon",
- "colorlog",
- "pathos==0.2.8",
"braceexpand",
"pyyaml",
- "pybind11",
- "paddleslim==2.3.4",
- "paddleaudio>=1.0.2",
+ "paddleslim>=2.3.4",
+ "paddleaudio>=1.1.0",
]
-server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"]
+server = ["pattern_singleton", "websockets"]
requirements = {
"install":
@@ -303,7 +292,7 @@ setup_info = dict(
},
# Package info
- packages=find_packages(include=('paddlespeech*')),
+ packages=find_packages(include=['paddlespeech*'], exclude=['utils', 'third_party']),
zip_safe=True,
classifiers=[
'Development Status :: 5 - Production/Stable',
diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh
index cb05a1d0f..9ff81bd8b 100755
--- a/tests/test_tipc/prepare.sh
+++ b/tests/test_tipc/prepare.sh
@@ -73,6 +73,9 @@ if [[ ${MODE} = "benchmark_train" ]];then
mkdir -p BZNSYP
unrar x BZNSYP.rar BZNSYP
wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/benchmark/durations.txt
+    # avoid the program hanging when nltk_data cannot be downloaded due to network issues
+ wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz
+ tar -xzf nltk_data.tar.gz -C ${HOME}
# 数据预处理
python ../paddlespeech/t2s/exps/gan_vocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=../examples/csmsc/voc1/conf/default.yaml
python ../utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats"
diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index 3a58626d2..5d3b76f6c 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -14,7 +14,7 @@ paddlespeech ssl --task asr --lang en --input ./en.wav
paddlespeech ssl --task vector --lang en --input ./en.wav
# Speech_recognition
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
paddlespeech asr --input ./zh.wav
paddlespeech asr --model conformer_aishell --input ./zh.wav
paddlespeech asr --model conformer_online_aishell --input ./zh.wav
@@ -26,6 +26,7 @@ paddlespeech asr --model deepspeech2offline_aishell --input ./zh.wav
paddlespeech asr --model deepspeech2online_wenetspeech --input ./zh.wav
paddlespeech asr --model deepspeech2online_aishell --input ./zh.wav
paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.wav
+paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav
# Support editing num_decoding_left_chunks
paddlespeech asr --model conformer_online_wenetspeech --num_decoding_left_chunks 3 --input ./zh.wav