diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f72b44ac6..44bbd5cad 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -26,12 +26,12 @@ repos:
- --no-sort-keys
- --autofix
- id: check-merge-conflict
- - id: flake8
- aergs:
- - --ignore=E501,E228,E226,E261,E266,E128,E402,W503
- - --builtins=G,request
- - --jobs=1
- exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
+ # - id: flake8
+ # aergs:
+ # - --ignore=E501,E228,E226,E261,E266,E128,E402,W503
+ # - --builtins=G,request
+ # - --jobs=1
+ # exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
- repo : https://github.com/Lucas-C/pre-commit-hooks
rev: v1.0.1
diff --git a/README.md b/README.md
index c6e9fc209..19ec61cb0 100644
--- a/README.md
+++ b/README.md
@@ -227,13 +227,13 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
## Installation
-We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.7* and *paddlepaddle>=2.4.1*.
+We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.8* and *paddlepaddle<=2.5.1*. Newer Paddle releases have not yet been adapted in PaddleSpeech, so currently only paddlepaddle 2.5.1 and earlier are supported.
### **Dependency Introduction**
+ gcc >= 4.8.5
-+ paddlepaddle >= 2.4.1
-+ python >= 3.7
++ paddlepaddle <= 2.5.1
++ python >= 3.8
+ OS support: Linux(recommend), Windows, Mac OSX
PaddleSpeech depends on paddlepaddle. For installation, please refer to the official website of [paddlepaddle](https://www.paddlepaddle.org.cn/en) and choose according to your own machine. Here is an example of the cpu version.
@@ -893,10 +893,6 @@ The Text-to-Speech module is originally called [Parakeet](https://github.com/Pad
- **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): Use PaddleSpeech TTS and ASR to clone voice from videos.**
-
-

-
-
## Citation
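
Assuming the constraints stated above (python>=3.8, paddlepaddle<=2.5.1), a quick way to sanity-check an existing environment is a couple of version assertions. This is only a hedged sketch; `packaging` is an extra assumption here, not something the README itself prescribes.

```python
# Hedged sketch: check a local environment against the constraints above
# (python >= 3.8, paddlepaddle <= 2.5.1). `packaging` is an assumption,
# not a documented PaddleSpeech requirement.
import sys

from packaging.version import Version

import paddle

assert sys.version_info >= (3, 8), "PaddleSpeech recommends python >= 3.8"
assert Version(paddle.__version__) <= Version("2.5.1"), \
    "PaddleSpeech currently supports paddlepaddle 2.5.1 and earlier"
print("ok:", sys.version.split()[0], paddle.__version__)
```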
diff --git a/README_cn.md b/README_cn.md
index eabb2ead4..7aef30871 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -8,7 +8,7 @@
-
+
@@ -237,12 +237,12 @@
## 安装
-我们强烈建议用户在 **Linux** 环境下,*3.7* 以上版本的 *python* 上安装 PaddleSpeech。
+我们强烈建议用户在 **Linux** 环境下,*3.8* 以上版本的 *python* 上安装 PaddleSpeech。同时,有一些Paddle新版本的内容没有在做适配的支持,因此目前只能使用2.5.1及之前的版本。
### 相关依赖
+ gcc >= 4.8.5
-+ paddlepaddle >= 2.4.1
-+ python >= 3.7
++ paddlepaddle <= 2.5.1
++ python >= 3.8
+ linux(推荐), mac, windows
PaddleSpeech 依赖于 paddlepaddle,安装可以参考[ paddlepaddle 官网](https://www.paddlepaddle.org.cn/),根据自己机器的情况进行选择。这里给出 cpu 版本示例,其它版本大家可以根据自己机器的情况进行安装。
diff --git a/audio/setup.py b/audio/setup.py
index 0fe6e5995..f7d459446 100644
--- a/audio/setup.py
+++ b/audio/setup.py
@@ -38,8 +38,10 @@ VERSION = '1.2.0'
COMMITID = 'none'
base = [
+ # paddleaudio align with librosa==0.8.1, which need numpy==1.23.x
+ "librosa==0.8.1",
+ "numpy==1.23.5",
"kaldiio",
- "librosa>=0.10.0",
"pathos",
"pybind11",
"parameterized",
diff --git a/demos/speech_web/speech_server/src/ge2e_clone.py b/demos/speech_web/speech_server/src/ge2e_clone.py
index 83c2b3f35..0711a40af 100644
--- a/demos/speech_web/speech_server/src/ge2e_clone.py
+++ b/demos/speech_web/speech_server/src/ge2e_clone.py
@@ -38,23 +38,9 @@ class VoiceCloneGE2E():
output_dir = os.path.dirname(out_wav)
ngpu = get_ngpu()
- cmd = f"""
- python3 {self.BIN_DIR}/voice_cloning.py \
- --am={self.am} \
- --am_config={self.am_config} \
- --am_ckpt={self.am_ckpt} \
- --am_stat={self.am_stat} \
- --voc={self.voc} \
- --voc_config={self.voc_config} \
- --voc_ckpt={self.voc_ckpt} \
- --voc_stat={self.voc_stat} \
- --ge2e_params_path={self.ge2e_params_path} \
- --text="{text}" \
- --input-dir={ref_audio_dir} \
- --output-dir={output_dir} \
- --phones-dict={self.phones_dict} \
- --ngpu={ngpu}
- """
+ cmd = f"""python {self.BIN_DIR}/voice_cloning.py --am={self.am} --am_config={self.am_config} --am_ckpt={self.am_ckpt} --am_stat={self.am_stat} --voc={self.voc} --voc_config={self.voc_config} --voc_ckpt={self.voc_ckpt} --voc_stat={self.voc_stat} --ge2e_params_path={self.ge2e_params_path} --text="{text}" --input-dir={ref_audio_dir} --output-dir={output_dir} --phones-dict={self.phones_dict} --ngpu={ngpu}"""
+
+ print(cmd)
output_name = os.path.join(output_dir, full_file_name)
return run_cmd(cmd, output_name=output_name)
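
The command is now built as a single flat string and printed before being handed to `run_cmd`. Since `run_cmd` itself is not part of this diff, the following is only a hedged sketch of how such a string can be executed with the standard library; it is not the repository's actual helper.

```python
# Hedged sketch only: how a flat command string like `cmd` above can be run
# with the standard library. The repository's run_cmd helper is not shown in
# this diff, so this is an illustration, not its actual implementation.
import shlex
import subprocess


def run_cmd_sketch(cmd: str, output_name: str) -> str:
    # shlex.split keeps --text="..." as one argument even if the sentence
    # contains spaces.
    subprocess.run(shlex.split(cmd), check=True)
    return output_name
```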
diff --git a/docs/source/install.md b/docs/source/install.md
index a4dae3640..3607d7185 100644
--- a/docs/source/install.md
+++ b/docs/source/install.md
@@ -95,7 +95,7 @@ bash
```
Then you can create a conda virtual environment using the following command:
```bash
-conda create -y -p tools/venv python=3.7
+conda create -y -p tools/venv python=3.8
```
Activate the conda virtual environment:
```bash
@@ -181,7 +181,7 @@ $HOME/miniconda3/bin/conda init
# use the "bash" command to make the conda environment works
bash
# create a conda virtual environment
-conda create -y -p tools/venv python=3.7
+conda create -y -p tools/venv python=3.8
# Activate the conda virtual environment:
conda activate tools/venv
# Install the conda packages
diff --git a/docs/source/install_cn.md b/docs/source/install_cn.md
index 7f05cdfe4..01ae21fe7 100644
--- a/docs/source/install_cn.md
+++ b/docs/source/install_cn.md
@@ -91,7 +91,7 @@ bash
```
然后你可以创建一个 conda 的虚拟环境:
```bash
-conda create -y -p tools/venv python=3.7
+conda create -y -p tools/venv python=3.8
```
激活 conda 虚拟环境:
```bash
@@ -173,7 +173,7 @@ $HOME/miniconda3/bin/conda init
# 激活 conda
bash
# 创建 Conda 虚拟环境
-conda create -y -p tools/venv python=3.7
+conda create -y -p tools/venv python=3.8
# 激活 Conda 虚拟环境:
conda activate tools/venv
# 安装 Conda 包
diff --git a/docs/topic/package_release/python_package_release.md b/docs/topic/package_release/python_package_release.md
index cb1029e7b..c735e0bd8 100644
--- a/docs/topic/package_release/python_package_release.md
+++ b/docs/topic/package_release/python_package_release.md
@@ -165,8 +165,7 @@ docker run -it xxxxxx
设置python:
```bash
-export PATH="/opt/python/cp37-cp37m/bin/:$PATH"
-#export PATH="/opt/python/cp38-cp38/bin/:$PATH"
+export PATH="/opt/python/cp38-cp38/bin/:$PATH"
#export PATH="/opt/python/cp39-cp39/bin/:$PATH"
```
diff --git a/examples/aishell/asr1/RESULTS.md b/examples/aishell/asr1/RESULTS.md
index 643d0e224..be771ba59 100644
--- a/examples/aishell/asr1/RESULTS.md
+++ b/examples/aishell/asr1/RESULTS.md
@@ -1,14 +1,31 @@
# Aishell
-## Conformer
-paddle version: 2.2.2
-paddlespeech version: 1.0.1
-| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER |
-| --- | --- | --- | --- | --- | --- | --- | --- |
-| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 |
-| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 |
-| conformer | 47.07M | conf/conformer.yaml | spec_aug| test | ctc_prefix_beam_search | - | 0.0480 |
-| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 |
+## RoFormer Streaming
+paddle version: 2.5.0
+paddlespeech version: 1.5.0
+
+Tesla V100-SXM2-32GB: 1 node, 4 cards
+Global BatchSize: 32 * 4
+Training Done: 1 day, 12:56:39.639646
+### `decoding.decoding_chunk_size=16`
+
+> chunk_size=16, ((16 - 1) * 4 + 7) * 10ms = (16 * 4 + 3) * 10ms = 670ms
+
+| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | CER |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention | 16, -1 | - | 5.63 |
+| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_greedy_search | 16, -1 | - | 6.13 |
+| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_prefix_beam_search | 16, -1 | - | 6.13 |
+| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | 5.44 |
+
+### `decoding.decoding_chunk_size=-1`
+
+| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | CER |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention | -1, -1 | - | 5.39 |
+| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_greedy_search | -1, -1 | - | 5.51 |
+| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_prefix_beam_search | -1, -1 | - | 5.51 |
+| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention_rescoring | -1, -1 | - | 4.99 |
## Conformer Streaming
@@ -24,6 +41,17 @@ Need set `decoding.decoding_chunk_size=16` when decoding.
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | 0.051968 |
+## Conformer
+paddle version: 2.2.2
+paddlespeech version: 1.0.1
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_prefix_beam_search | - | 0.0480 |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 |
+
+
## Transformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER |
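
The 670 ms figure quoted for `decoding_chunk_size=16` follows directly from the arithmetic in the block quote: a 10 ms frame shift, 4x subsampling, and 7 frames of right context from the convolutional front end (the latter two are read off the formula, not verified against the code). A tiny reproduction:

```python
# Reproduce the 670 ms latency quoted for decoding_chunk_size=16. The values
# subsampling=4, context=7 and frame_shift_ms=10 are read off the formula in
# the block quote above and are assumptions about this particular front end.
def chunk_latency_ms(chunk_size: int,
                     subsampling: int = 4,
                     context: int = 7,
                     frame_shift_ms: int = 10) -> int:
    return ((chunk_size - 1) * subsampling + context) * frame_shift_ms


print(chunk_latency_ms(16))  # 670
```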
diff --git a/examples/aishell/asr1/conf/chunk_roformer.yaml b/examples/aishell/asr1/conf/chunk_roformer.yaml
new file mode 100644
index 000000000..a4051a021
--- /dev/null
+++ b/examples/aishell/asr1/conf/chunk_roformer.yaml
@@ -0,0 +1,98 @@
+############################################
+# Network Architecture #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+ output_size: 256 # dimension of attention
+ attention_heads: 4
+ linear_units: 2048 # the number of units of position-wise feed forward
+ num_blocks: 12 # the number of encoder blocks
+ dropout_rate: 0.1 # sublayer output dropout
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+ normalize_before: True
+ cnn_module_kernel: 15
+ use_cnn_module: True
+ activation_type: 'swish'
+ pos_enc_layer_type: 'rope_pos' # abs_pos, rel_pos, rope_pos
+ selfattention_layer_type: 'rel_selfattn' # unused
+ causal: true
+ use_dynamic_chunk: true
+ cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+ use_dynamic_left_chunk: false
+# decoder related
+decoder: transformer # transformer, bitransformer
+decoder_conf:
+ attention_heads: 4
+ linear_units: 2048
+ num_blocks: 6
+ r_num_blocks: 0 # only for bitransformer
+ dropout_rate: 0.1 # sublayer output dropout
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.0
+ src_attention_dropout_rate: 0.0
+# hybrid CTC/attention
+model_conf:
+ ctc_weight: 0.3
+ lsm_weight: 0.1 # label smoothing option
+ reverse_weight: 0.0 # only for bitransformer
+ length_normalized_loss: false
+    init_type: 'kaiming_uniform' # !Warning: needed for convergence
+
+###########################################
+# Data #
+###########################################
+
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
+
+
+###########################################
+# Dataloader #
+###########################################
+
+vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 32
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 2
+subsampling_factor: 1
+num_encs: 1
+
+###########################################
+# Training #
+###########################################
+n_epoch: 240
+accum_grad: 1
+global_grad_clip: 5.0
+dist_sampler: True
+optim: adam
+optim_conf:
+ lr: 0.001
+ weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 25000
+ lr_decay: 1.0
+log_interval: 100
+checkpoint:
+ kbest_n: 50
+ latest_n: 5
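
Compared with the existing `chunk_conformer.yaml`, the key switch here is `pos_enc_layer_type: 'rope_pos'`, which (as the encoder changes later in this diff show) selects the new RoPE attention. A minimal sketch of reading that field with PyYAML, assuming the file path added above:

```python
# Minimal sketch: load the new config with PyYAML and inspect the field that
# switches the encoder self-attention to RoPE. The path is the file added above.
import yaml

with open("examples/aishell/asr1/conf/chunk_roformer.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["encoder"])                              # conformer
print(cfg["encoder_conf"]["pos_enc_layer_type"])   # rope_pos
print(cfg["decoder"], cfg["decoder_conf"]["r_num_blocks"])  # transformer 0
```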
diff --git a/examples/aishell/asr1/conf/chunk_roformer_bidecoder.yaml b/examples/aishell/asr1/conf/chunk_roformer_bidecoder.yaml
new file mode 100644
index 000000000..aa3a0aca7
--- /dev/null
+++ b/examples/aishell/asr1/conf/chunk_roformer_bidecoder.yaml
@@ -0,0 +1,98 @@
+############################################
+# Network Architecture #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+ output_size: 256 # dimension of attention
+ attention_heads: 4
+ linear_units: 2048 # the number of units of position-wise feed forward
+ num_blocks: 12 # the number of encoder blocks
+ dropout_rate: 0.1 # sublayer output dropout
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+ normalize_before: True
+ cnn_module_kernel: 15
+ use_cnn_module: True
+ activation_type: 'swish'
+ pos_enc_layer_type: 'rope_pos' # abs_pos, rel_pos, rope_pos
+ selfattention_layer_type: 'rel_selfattn' # unused
+ causal: true
+ use_dynamic_chunk: true
+ cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+ use_dynamic_left_chunk: false
+# decoder related
+decoder: bitransformer # transformer, bitransformer
+decoder_conf:
+ attention_heads: 4
+ linear_units: 2048
+ num_blocks: 3
+ r_num_blocks: 3 # only for bitransformer
+ dropout_rate: 0.1 # sublayer output dropout
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.0
+ src_attention_dropout_rate: 0.0
+# hybrid CTC/attention
+model_conf:
+ ctc_weight: 0.3
+ lsm_weight: 0.1 # label smoothing option
+ reverse_weight: 0.3 # only for bitransformer
+ length_normalized_loss: false
+    init_type: 'kaiming_uniform' # !Warning: needed for convergence
+
+###########################################
+# Data #
+###########################################
+
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
+
+
+###########################################
+# Dataloader #
+###########################################
+
+vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 32
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 2
+subsampling_factor: 1
+num_encs: 1
+
+###########################################
+# Training #
+###########################################
+n_epoch: 240
+accum_grad: 1
+global_grad_clip: 5.0
+dist_sampler: True
+optim: adam
+optim_conf:
+ lr: 0.001
+ weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 25000
+ lr_decay: 1.0
+log_interval: 100
+checkpoint:
+ kbest_n: 50
+ latest_n: 5
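
The bidecoder variant splits the six decoder blocks into three forward and three reverse blocks and sets `reverse_weight: 0.3`. In U2-style bitransformer training the two attention losses are typically blended with that weight; the exact formula lives in the model code, so the sketch below is only an illustration of the idea.

```python
# Sketch of how reverse_weight is assumed to blend the forward (L2R) and
# reverse (R2L) decoder losses in a U2-style bitransformer. The real formula
# lives in the model code, not in this YAML file.
def blended_attention_loss(loss_l2r: float,
                           loss_r2l: float,
                           reverse_weight: float = 0.3) -> float:
    return (1.0 - reverse_weight) * loss_l2r + reverse_weight * loss_r2l


print(blended_attention_loss(1.20, 1.35))  # 0.7 * 1.20 + 0.3 * 1.35 = 1.245
```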
diff --git a/examples/csmsc/tts2/local/inference_xpu.sh b/examples/csmsc/tts2/local/inference_xpu.sh
new file mode 100644
index 000000000..5d8d92054
--- /dev/null
+++ b/examples/csmsc/tts2/local/inference_xpu.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=speedyspeech_csmsc \
+ --voc=pwgan_csmsc \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --device xpu
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=speedyspeech_csmsc \
+ --voc=mb_melgan_csmsc \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --device xpu
+fi
+
+# hifigan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=speedyspeech_csmsc \
+ --voc=hifigan_csmsc \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --device xpu
+fi
diff --git a/examples/csmsc/tts2/local/synthesize_e2e_xpu.sh b/examples/csmsc/tts2/local/synthesize_e2e_xpu.sh
new file mode 100644
index 000000000..0285f42cd
--- /dev/null
+++ b/examples/csmsc/tts2/local/synthesize_e2e_xpu.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=pwgan_csmsc \
+ --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+ --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=mb_melgan_csmsc \
+ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+ --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# the pretrained models haven't been released yet
+# style melgan
+# style melgan's dygraph-to-static-graph export is not ready yet
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=style_melgan_csmsc \
+ --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+ --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+ # --inference_dir=${train_output_path}/inference
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=hifigan_csmsc \
+ --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+ --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ echo "in wavernn syn_e2e"
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=wavernn_csmsc \
+ --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+ --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+ --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --ngpu=0 \
+ --nxpu=1
+fi
diff --git a/examples/csmsc/tts2/local/synthesize_xpu.sh b/examples/csmsc/tts2/local/synthesize_xpu.sh
new file mode 100644
index 000000000..801789c26
--- /dev/null
+++ b/examples/csmsc/tts2/local/synthesize_xpu.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=pwgan_csmsc \
+ --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+ --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=mb_melgan_csmsc \
+ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+ --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# style melgan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=style_melgan_csmsc \
+ --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+ --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ echo "in hifigan syn"
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=hifigan_csmsc \
+ --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+ --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ echo "in wavernn syn"
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=wavernn_csmsc \
+ --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+ --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+ --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --tones_dict=dump/tone_id_map.txt \
+ --phones_dict=dump/phone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
diff --git a/examples/csmsc/tts2/local/train_xpu.sh b/examples/csmsc/tts2/local/train_xpu.sh
new file mode 100644
index 000000000..0c07c27fc
--- /dev/null
+++ b/examples/csmsc/tts2/local/train_xpu.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+
+python ${BIN_DIR}/train.py \
+ --train-metadata=dump/train/norm/metadata.jsonl \
+ --dev-metadata=dump/dev/norm/metadata.jsonl \
+ --config=${config_path} \
+ --output-dir=${train_output_path} \
+ --ngpu=0 \
+ --nxpu=1 \
+ --phones-dict=dump/phone_id_map.txt \
+ --tones-dict=dump/tone_id_map.txt \
+ --use-relative-path=True
diff --git a/examples/csmsc/tts2/run_xpu.sh b/examples/csmsc/tts2/run_xpu.sh
new file mode 100644
index 000000000..4b867961f
--- /dev/null
+++ b/examples/csmsc/tts2/run_xpu.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+xpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_76.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run_xpu.sh --stage 0 --stop-stage 0`
+# this can not be mixed use with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+ FLAGS_selected_xpus=${xpus} ./local/train_xpu.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # synthesize, vocoder is pwgan by default
+ FLAGS_selected_xpus=${xpus} ./local/synthesize_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # synthesize_e2e, vocoder is pwgan by default
+ FLAGS_selected_xpus=${xpus} ./local/synthesize_e2e_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ # inference with static model
+ FLAGS_selected_xpus=${xpus} ./local/inference_xpu.sh ${train_output_path} || exit -1
+fi
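
The XPU scripts select the device purely through `--ngpu=0 --nxpu=1` plus `FLAGS_selected_xpus`. As a hedged sketch (the real flag handling lives in the training and synthesis entry points, and this assumes a PaddlePaddle build with XPU support), the mapping to a Paddle device string looks roughly like this:

```python
# Hedged sketch of the --ngpu/--nxpu handling these scripts rely on: map the
# two counters to a Paddle device string. Assumes a PaddlePaddle build with
# XPU (Kunlun) support; the real logic lives in the entry points, not here.
import paddle


def select_device(ngpu: int, nxpu: int) -> str:
    if ngpu > 0:
        device = "gpu"
    elif nxpu > 0:
        device = "xpu"
    else:
        device = "cpu"
    paddle.set_device(device)
    return device


print(select_device(ngpu=0, nxpu=1))  # xpu
```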
diff --git a/examples/csmsc/tts3/local/inference_xpu.sh b/examples/csmsc/tts3/local/inference_xpu.sh
new file mode 100644
index 000000000..541dc6262
--- /dev/null
+++ b/examples/csmsc/tts3/local/inference_xpu.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=fastspeech2_csmsc \
+ --voc=pwgan_csmsc \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt \
+ --device xpu
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=fastspeech2_csmsc \
+ --voc=mb_melgan_csmsc \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt \
+ --device xpu
+fi
+
+# hifigan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=fastspeech2_csmsc \
+ --voc=hifigan_csmsc \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt \
+ --device xpu
+fi
+
+# wavernn
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=fastspeech2_csmsc \
+ --voc=wavernn_csmsc \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt \
+ --device xpu
+fi
\ No newline at end of file
diff --git a/examples/csmsc/tts3/local/synthesize_e2e_xpu.sh b/examples/csmsc/tts3/local/synthesize_e2e_xpu.sh
new file mode 100644
index 000000000..bb58a37c8
--- /dev/null
+++ b/examples/csmsc/tts3/local/synthesize_e2e_xpu.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_csmsc \
+ --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+ --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=mb_melgan_csmsc \
+ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+ --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# the pretrained models haven't been released yet
+# style melgan
+# style melgan's dygraph-to-static-graph export is not ready yet
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=style_melgan_csmsc \
+ --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+ --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+ # --inference_dir=${train_output_path}/inference
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ echo "in hifigan syn_e2e"
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=hifigan_csmsc \
+ --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+ --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ echo "in wavernn syn_e2e"
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=wavernn_csmsc \
+ --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+ --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+ --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --ngpu=0 \
+ --nxpu=1
+fi
diff --git a/examples/csmsc/tts3/local/synthesize_xpu.sh b/examples/csmsc/tts3/local/synthesize_xpu.sh
new file mode 100644
index 000000000..fac8677a7
--- /dev/null
+++ b/examples/csmsc/tts3/local/synthesize_xpu.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_csmsc \
+ --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+ --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=mb_melgan_csmsc \
+ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+ --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# style melgan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=style_melgan_csmsc \
+ --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+ --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ echo "in hifigan syn"
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=hifigan_csmsc \
+ --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+ --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ echo "in wavernn syn"
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=wavernn_csmsc \
+ --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+ --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+ --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
diff --git a/examples/csmsc/tts3/local/train_xpu.sh b/examples/csmsc/tts3/local/train_xpu.sh
new file mode 100644
index 000000000..a7d889888
--- /dev/null
+++ b/examples/csmsc/tts3/local/train_xpu.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
+ --train-metadata=dump/train/norm/metadata.jsonl \
+ --dev-metadata=dump/dev/norm/metadata.jsonl \
+ --config=${config_path} \
+ --output-dir=${train_output_path} \
+ --ngpu=0 \
+ --nxpu=1 \
+ --phones-dict=dump/phone_id_map.txt
\ No newline at end of file
diff --git a/examples/csmsc/tts3/run_xpu.sh b/examples/csmsc/tts3/run_xpu.sh
new file mode 100644
index 000000000..4922d6b4b
--- /dev/null
+++ b/examples/csmsc/tts3/run_xpu.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+xpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_153.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this can not be mixed use with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+ FLAGS_selected_xpus=${xpus} ./local/train_xpu.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # synthesize, vocoder is pwgan by default
+ FLAGS_selected_xpus=${xpus} ./local/synthesize_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # synthesize_e2e, vocoder is pwgan by default
+ FLAGS_selected_xpus=${xpus} ./local/synthesize_e2e_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ # inference with static model, vocoder is pwgan by default
+ FLAGS_selected_xpus=${xpus} ./local/inference_xpu.sh ${train_output_path} || exit -1
+fi
diff --git a/paddlespeech/dataset/s2t/avg_model.py b/paddlespeech/dataset/s2t/avg_model.py
index c5753b726..5bd5cb1f0 100755
--- a/paddlespeech/dataset/s2t/avg_model.py
+++ b/paddlespeech/dataset/s2t/avg_model.py
@@ -20,30 +20,6 @@ import numpy as np
import paddle
-def define_argparse():
- parser = argparse.ArgumentParser(description='average model')
- parser.add_argument('--dst_model', required=True, help='averaged model')
- parser.add_argument(
- '--ckpt_dir', required=True, help='ckpt model dir for average')
- parser.add_argument(
- '--val_best', action="store_true", help='averaged model')
- parser.add_argument(
- '--num', default=5, type=int, help='nums for averaged model')
- parser.add_argument(
- '--min_epoch',
- default=0,
- type=int,
- help='min epoch used for averaging model')
- parser.add_argument(
- '--max_epoch',
- default=65536, # Big enough
- type=int,
- help='max epoch used for averaging model')
-
- args = parser.parse_args()
- return args
-
-
def average_checkpoints(dst_model="",
ckpt_dir="",
val_best=True,
@@ -85,7 +61,7 @@ def average_checkpoints(dst_model="",
print(path_list)
avg = None
- num = args.num
+ num = num
assert num == len(path_list)
for path in path_list:
print(f'Processing {path}')
@@ -100,14 +76,14 @@ def average_checkpoints(dst_model="",
if avg[k] is not None:
avg[k] /= num
- paddle.save(avg, args.dst_model)
- print(f'Saving to {args.dst_model}')
+ paddle.save(avg, dst_model)
+ print(f'Saving to {dst_model}')
- meta_path = os.path.splitext(args.dst_model)[0] + '.avg.json'
+ meta_path = os.path.splitext(dst_model)[0] + '.avg.json'
with open(meta_path, 'w') as f:
data = json.dumps({
- "mode": 'val_best' if args.val_best else 'latest',
- "avg_ckpt": args.dst_model,
+ "mode": 'val_best' if val_best else 'latest',
+ "avg_ckpt": dst_model,
"val_loss_mean": avg_val_score,
"ckpts": path_list,
"epochs": selected_epochs.tolist(),
@@ -116,9 +92,40 @@ def average_checkpoints(dst_model="",
f.write(data + "\n")
+def define_argparse():
+ parser = argparse.ArgumentParser(description='average model')
+ parser.add_argument('--dst_model', required=True, help='averaged model')
+ parser.add_argument(
+ '--ckpt_dir', required=True, help='ckpt model dir for average')
+ parser.add_argument(
+ '--val_best', action="store_true", help='averaged model')
+ parser.add_argument(
+ '--num', default=5, type=int, help='nums for averaged model')
+ parser.add_argument(
+ '--min_epoch',
+ default=0,
+ type=int,
+ help='min epoch used for averaging model')
+ parser.add_argument(
+ '--max_epoch',
+ default=65536, # Big enough
+ type=int,
+ help='max epoch used for averaging model')
+
+ args = parser.parse_args()
+ print(args)
+ return args
+
+
def main():
args = define_argparse()
- average_checkpoints(args)
+ average_checkpoints(
+ dst_model=args.dst_model,
+ ckpt_dir=args.ckpt_dir,
+ val_best=args.val_best,
+ num=args.num,
+ min_epoch=args.min_epoch,
+ max_epoch=args.max_epoch)
if __name__ == '__main__':
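
After this refactor `average_checkpoints` no longer reads from a global argparse namespace, so it can be imported and called directly. A short usage sketch with placeholder paths:

```python
# Usage sketch for the refactored API: average_checkpoints now takes explicit
# keyword arguments instead of an argparse namespace. Paths are placeholders.
from paddlespeech.dataset.s2t.avg_model import average_checkpoints

average_checkpoints(
    dst_model="exp/conformer/checkpoints/avg_5.pdparams",  # placeholder
    ckpt_dir="exp/conformer/checkpoints",                   # placeholder
    val_best=True,   # average the checkpoints with the best validation loss
    num=5,
    min_epoch=0,
    max_epoch=65536)
```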
diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py
index 7ab8cf853..d007a9e39 100644
--- a/paddlespeech/s2t/exps/deepspeech2/model.py
+++ b/paddlespeech/s2t/exps/deepspeech2/model.py
@@ -27,7 +27,6 @@ from paddlespeech.audio.text.text_featurizer import TextFeaturizer
from paddlespeech.s2t.io.dataloader import BatchDataLoader
from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
-from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
from paddlespeech.s2t.training.reporter import report
from paddlespeech.s2t.training.timer import Timer
from paddlespeech.s2t.training.trainer import Trainer
@@ -148,7 +147,7 @@ class DeepSpeech2Trainer(Trainer):
if not self.train:
return
- grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip)
+ grad_clip = paddle.nn.ClipGradByGlobalNorm(config.global_grad_clip)
lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
learning_rate=config.lr, gamma=config.lr_decay, verbose=True)
optimizer = paddle.optimizer.Adam(
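
The custom `ClipGradByGlobalNormWithLog` is replaced by Paddle's built-in clipper. A minimal sketch of the resulting optimizer setup, with the numeric values standing in for the config entries:

```python
# Minimal sketch of the optimizer setup after this change: Paddle's built-in
# global-norm clipper replaces the custom logging variant. The numbers stand
# in for config.global_grad_clip, config.lr and config.lr_decay.
import paddle

model = paddle.nn.Linear(10, 10)  # stand-in for DeepSpeech2Model

grad_clip = paddle.nn.ClipGradByGlobalNorm(5.0)
lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
    learning_rate=2e-3, gamma=0.83, verbose=True)
optimizer = paddle.optimizer.Adam(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    grad_clip=grad_clip)
```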
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index f716fa3b5..2e1c14ac1 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -145,7 +145,6 @@ class U2BaseModel(ASRInterface, nn.Layer):
text_lengths)
ctc_time = time.time() - start
#logger.debug(f"ctc time: {ctc_time}")
-
if loss_ctc is None:
loss = loss_att
elif loss_att is None:
@@ -916,6 +915,8 @@ class U2Model(U2DecodeModel):
decoder_type = configs.get('decoder', 'transformer')
logger.debug(f"U2 Decoder type: {decoder_type}")
if decoder_type == 'transformer':
+ configs['model_conf'].pop('reverse_weight', None)
+ configs['decoder_conf'].pop('r_num_blocks', None)
decoder = TransformerDecoder(vocab_size,
encoder.output_size(),
**configs['decoder_conf'])
diff --git a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
index 59a67a1e5..a3744d340 100755
--- a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
+++ b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
@@ -188,7 +188,7 @@ class Wav2vec2ASR(nn.Layer):
x_lens = x.shape[1]
ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size)
topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1)
- topk_index = topk_index.view(batch_size, x_lens) # (B, maxlen)
+ topk_index = topk_index.view([batch_size, x_lens]) # (B, maxlen)
hyps = [hyp.tolist() for hyp in topk_index]
hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]
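
This one-line change reflects that `paddle.Tensor.view` takes the target shape as a single list/tuple (or a dtype) rather than as separate integers, unlike its torch counterpart. A tiny check of that behaviour:

```python
# Tiny check of the behaviour the fix relies on: paddle.Tensor.view takes the
# target shape as one list/tuple argument (or a dtype), not separate ints.
import paddle

x = paddle.arange(6)
print(x.view([2, 3]).shape)  # [2, 3]
# x.view(2, 3) is not accepted, hence topk_index.view([batch_size, x_lens]).
```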
diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py
index 14336c03d..10ab3eaea 100644
--- a/paddlespeech/s2t/modules/attention.py
+++ b/paddlespeech/s2t/modules/attention.py
@@ -15,6 +15,7 @@
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""Multi-Head Attention layer definition."""
import math
+from typing import List
from typing import Tuple
import paddle
@@ -26,7 +27,10 @@ from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
-__all__ = ["MultiHeadedAttention", "RelPositionMultiHeadedAttention"]
+__all__ = [
+ "MultiHeadedAttention", "RelPositionMultiHeadedAttention",
+ "RoPERelPositionMultiHeadedAttention"
+]
# Relative Positional Encodings
# https://www.jianshu.com/p/c0608efcc26f
@@ -165,6 +169,7 @@ class MultiHeadedAttention(nn.Layer):
and `head * d_k == size`
"""
+ # (B,T,D) -> (B,T,H,D/H)
q, k, v = self.forward_qkv(query, key, value)
# when export onnx model, for 1st chunk, we feed
@@ -373,3 +378,139 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
self.d_k) # (batch, head, time1, time2)
return self.forward_attention(v, scores, mask), new_cache
+
+
+class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention):
+ """Multi-Head Attention layer with RoPE relative position encoding."""
+
+ def __init__(self,
+ n_head,
+ n_feat,
+ dropout_rate,
+ adaptive_scale=False,
+ init_weights=False):
+ """Construct an RelPositionMultiHeadedAttention object.
+ Paper: https://arxiv.org/abs/1901.02860
+ Args:
+ n_head (int): The number of heads.
+ n_feat (int): The number of features.
+ dropout_rate (float): Dropout rate.
+ """
+ super().__init__(n_head, n_feat, dropout_rate)
+
+ def align(self, tensor: paddle.Tensor, axes: List[int], ndim=None):
+        """Realign a tensor (a batched version of expand_dims).
+        axes: the i-th dim of the original tensor is placed at dim axes[i] of the new tensor;
+        ndim: number of dims of the new tensor.
+ """
+ assert len(axes) == tensor.dim()
+ assert ndim or min(axes) >= 0
+
+ ndim = ndim or max(axes) + 1
+
+ # a[0, None, 1] = a[0, np.newaxis, 1]
+ indices = [None] * ndim
+ for i in axes:
+ # slice nothing, a[0, slice(None), 1] = a[0, :, 1]
+ indices[i] = slice(None)
+
+ return tensor[indices]
+
+ def apply_rotary_position_embeddings(self, sinusoidal, *tensors):
+        """Apply RoPE to the given tensors.
+        sinusoidal.shape=[B, T, D]; tensors is a list of tensors, each with
+        tensor.shape=[B, T, ..., D], or (B,H,T,D/H)
+ """
+ assert len(tensors) > 0, 'at least one input tensor'
+ assert all(
+ [tensor.shape == tensors[0].shape
+ for tensor in tensors[1:]]), 'all tensors must have the same shape'
+
+ # (B,H,T,D)
+ ndim = tensors[0].dim()
+ _, H, T, D = tensors[0].shape
+
+ # sinusoidal shape same with tensors[0]
+ # [B,T,D] -> [B,T,H,D/H] -> (B,H,T,D/H)
+ # sinusoidal = self.align(sinusoidal, [0, 1, -1], ndim)
+ sinusoidal = sinusoidal.reshape((1, T, H, D)).transpose([0, 2, 1, 3])
+
+ # http://man.hubwiz.com/docset/TensorFlow.docset/Contents/Resources/Documents/api_docs/python/tf/keras/backend/repeat_elements.html
+ # like np.repeat, x (s1, s2, s3), axis 1, (s1, s2*rep, s3)
+ # [b,T, ..., d/2] -> [b,T, ..., d]
+ cos_pos = paddle.repeat_interleave(sinusoidal[..., 1::2], 2, axis=-1)
+ sin_pos = paddle.repeat_interleave(sinusoidal[..., 0::2], 2, axis=-1)
+ outputs = []
+ for tensor in tensors:
+ # x2 = [-x2, x1, -x4, x3, ..., -x_d, x_{d-1}]
+ tensor2 = paddle.stack([-tensor[..., 1::2], tensor[..., ::2]], ndim)
+ tensor2 = paddle.reshape(tensor2, paddle.shape(tensor))
+
+            # Eq. (34): out = x * cos_pos + x2 * sin_pos
+ outputs.append(tensor * cos_pos + tensor2 * sin_pos)
+ return outputs[0] if len(outputs) == 1 else outputs
+
+ def forward(self,
+ query: paddle.Tensor,
+ key: paddle.Tensor,
+ value: paddle.Tensor,
+ mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
+ pos_emb: paddle.Tensor=paddle.empty([0]),
+ cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0])
+ ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+ """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
+ Ref: https://github.com/facebookresearch/llama/blob/main/llama/model.py
+ Args:
+ query (paddle.Tensor): Query tensor (#batch, time1, size).
+ key (paddle.Tensor): Key tensor (#batch, time2, size).
+ value (paddle.Tensor): Value tensor (#batch, time2, size).
+ mask (paddle.Tensor): Mask tensor (#batch, 1, time2) or
+ (#batch, time1, time2), (0, 0, 0) means fake mask.
+ pos_emb (paddle.Tensor): Positional embedding tensor
+ (#batch, time2, size).
+ cache (paddle.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
+ where `cache_t == chunk_size * num_decoding_left_chunks`
+ and `head * d_k == size`
+ Returns:
+ paddle.Tensor: Output tensor (#batch, time1, d_model).
+ paddle.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
+ where `cache_t == chunk_size * num_decoding_left_chunks`
+ and `head * d_k == size`
+ """
+ q, k, v = self.forward_qkv(query, key, value)
+ # q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k)
+
+ # f{q,k}(x_m, m) = R^d_{\theta, m} W_{q,k} x_m, m is position index
+ # q_t always is chunk_size
+ q_t = q.shape[2]
+ q = self.apply_rotary_position_embeddings(pos_emb[:, -q_t:, :], q)
+ # k will increase when in streaming decoding.
+ k = self.apply_rotary_position_embeddings(pos_emb[:, -q_t:, :], k)
+
+ # when export onnx model, for 1st chunk, we feed
+ # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
+ # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
+        # In all modes, `if cache.size(0) > 0` will always be `True`
+        # and we will always do splitting and
+        # concatenation (this will simplify onnx export). Note that
+ # it's OK to concat & split zero-shaped tensors(see code below).
+ # when export jit model, for 1st chunk, we always feed
+ # cache(0, 0, 0, 0) since jit supports dynamic if-branch.
+ # >>> a = torch.ones((1, 2, 0, 4))
+ # >>> b = torch.ones((1, 2, 3, 4))
+ # >>> c = torch.cat((a, b), dim=2)
+ # >>> torch.equal(b, c) # True
+ # >>> d = torch.split(a, 2, dim=-1)
+ # >>> torch.equal(d[0], d[1]) # True
+ if cache.shape[0] > 0:
+ # last dim `d_k * 2` for (key, val)
+ key_cache, value_cache = paddle.split(cache, 2, axis=-1)
+ k = paddle.concat([key_cache, k], axis=2)
+ v = paddle.concat([value_cache, v], axis=2)
+ # We do cache slicing in encoder.forward_chunk, since it's
+ # non-trivial to calculate `next_cache_start` here.
+ new_cache = paddle.concat((k, v), axis=-1)
+
+ # dot(q, k)
+ scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(self.d_k)
+ return self.forward_attention(v, scores, mask), new_cache
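
For readers new to RoPE, here is a toy NumPy sketch of the update that `apply_rotary_position_embeddings` performs: interleaved sin/cos factors, the pairwise "rotate-half" trick, and the resulting norm-preserving rotation. Shapes are simplified to (T, D); the real code works on (B, H, T, D/H).

```python
# Toy NumPy sketch of the update performed by apply_rotary_position_embeddings:
# out = x * cos_pos + rotate_pairs(x) * sin_pos, applied per position. Shapes
# are simplified to (T, D); the real code works on (B, H, T, D/H).
import numpy as np

T, D = 4, 8
x = np.random.randn(T, D)

pos = np.arange(T)[:, None]                        # (T, 1)
inv_freq = 10000.0 ** (-np.arange(0, D, 2) / D)    # (D/2,), base^{-2i/d}
angles = pos * inv_freq                            # (T, D/2)
sin_pos = np.repeat(np.sin(angles), 2, axis=-1)    # (T, D), interleaved
cos_pos = np.repeat(np.cos(angles), 2, axis=-1)    # (T, D)

# pair (x1, x2) -> (-x2, x1), the same stack/reshape trick as in the diff
x2 = np.stack([-x[..., 1::2], x[..., ::2]], axis=-1).reshape(x.shape)

out = x * cos_pos + x2 * sin_pos
# RoPE is a per-pair rotation, so vector norms are preserved
assert np.allclose(np.linalg.norm(out, axis=-1), np.linalg.norm(x, axis=-1))
```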
diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py
index f41a7b5d4..1e9f01018 100644
--- a/paddlespeech/s2t/modules/embedding.py
+++ b/paddlespeech/s2t/modules/embedding.py
@@ -85,18 +85,21 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface):
reverse (bool, optional): Not used. Defaults to False.
"""
nn.Layer.__init__(self)
- self.d_model = d_model
+ self.d_model = paddle.to_tensor(d_model)
self.max_len = max_len
self.xscale = paddle.to_tensor(math.sqrt(self.d_model))
self.dropout = nn.Dropout(p=dropout_rate)
+ self.base = paddle.to_tensor(10000.0)
self.pe = paddle.zeros([1, self.max_len, self.d_model]) #[B=1,T,D]
position = paddle.arange(
0, self.max_len, dtype=paddle.float32).unsqueeze(1) #[T, 1]
+ # base^{-2(i-1)/d)}, i \in (1,2...,d/2)
div_term = paddle.exp(
- paddle.arange(0, self.d_model, 2, dtype=paddle.float32) *
- -(math.log(10000.0) / self.d_model))
+ -paddle.arange(0, self.d_model, 2, dtype=paddle.float32) *
+ (paddle.log(self.base) / self.d_model))
+ # [B,T,D]
self.pe[:, :, 0::2] = paddle.sin(position * div_term)
self.pe[:, :, 1::2] = paddle.cos(position * div_term)
@@ -161,6 +164,98 @@ class RelPositionalEncoding(PositionalEncoding):
assert offset + x.shape[
1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format(
offset, x.shape[1], self.max_len)
+
x = x * self.xscale
pos_emb = self.pe[:, offset:offset + x.shape[1]]
return self.dropout(x), self.dropout(pos_emb)
+
+
+# RotaryRelPositionalEncoding is the same as RelPositionalEncoding
+class ScaledRotaryRelPositionalEncoding(RelPositionalEncoding):
+ """Scaled Rotary Relative positional encoding module.
+    POSITION INTERPOLATION: https://arxiv.org/pdf/2306.15595v2.pdf
+ """
+
+ def __init__(self,
+ d_model: int,
+ dropout_rate: float,
+ max_len: int=5000,
+ scale=1):
+ """
+ Args:
+ d_model (int): Embedding dimension.
+ dropout_rate (float): Dropout rate.
+ max_len (int, optional): [Maximum input length.]. Defaults to 5000.
+ scale (int): Interpolation max input length to `scale * max_len` positions.
+ """
+ super().__init__(d_model, dropout_rate, max_len, reverse=True)
+ self.pscale = paddle.to_tensor(scale)
+ self.max_len = max_len * scale
+
+ def sinusoidal_embeddings(self,
+ pos: paddle.Tensor,
+ dim: paddle.Tensor,
+ base=10000) -> paddle.Tensor:
+        """Compute the dim-dimensional sinusoidal encoding at the given positions."""
+ assert dim % 2 == 0
+ # (d/2,)
+ indices = paddle.arange(0, dim // 2, dtype=pos.dtype)
+ indices = paddle.pow(paddle.cast(base, pos.dtype), -2 * indices / dim)
+ # pos (1, T), indices (d/2,) -> (1, T, d/2)
+ embeddings = paddle.einsum('...,d->...d', pos, indices)
+ # (1, T, d/2, 2)
+ embeddings = paddle.stack(
+ [paddle.sin(embeddings), paddle.cos(embeddings)], axis=-1)
+ # (1, T, d)
+ embeddings = paddle.flatten(embeddings, start_axis=-2, stop_axis=-1)
+ return embeddings
+
+ def forward(self, x: paddle.Tensor,
+ offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]:
+ """Compute positional encoding.
+ Args:
+ x (paddle.Tensor): Input tensor (batch, time, `*`).
+ Returns:
+ paddle.Tensor: Encoded tensor (batch, time, `*`).
+ paddle.Tensor: Positional embedding tensor (1, time, `*`).
+ """
+ x = x * self.xscale
+
+ B, T, D = x.shape
+ assert D == self.d_model
+
+        # position interpolation
+ start = 0
+ end = T * self.pscale
+ assert end <= self.max_len
+ position = paddle.arange(start, end, dtype=x.dtype).unsqueeze(0)
+ position *= 1.0 / self.pscale
+ pe = self.sinusoidal_embeddings(position, self.d_model, base=self.base)
+
+ pos_emb = pe[:, offset:offset + x.shape[1]]
+ return self.dropout(x), self.dropout(pos_emb)
+
+ def position_encoding(self, offset: int, size: int) -> paddle.Tensor:
+ """ For getting encoding in a streaming fashion
+ Attention!!!!!
+        we apply dropout only once at the whole utterance level in a
+        non-streaming way, but will call this function several times with
+ increasing input size in a streaming scenario, so the dropout will
+ be applied several times.
+ Args:
+ offset (int): start offset
+            size (int): required size of position encoding
+ Returns:
+ paddle.Tensor: Corresponding position encoding, #[1, T, D].
+ """
+        # position interpolation
+ start = offset
+ end = (offset + size) * self.pscale
+ assert end <= self.max_len
+ position = paddle.arange(
+ start, end, dtype=paddle.get_default_dtype()).unsqueeze(0)
+ position *= 1.0 / self.pscale
+
+ pe = self.sinusoidal_embeddings(position, self.d_model, base=self.base)
+
+ return self.dropout(pe)
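
`ScaledRotaryRelPositionalEncoding` implements position interpolation: positions are generated on a grid of `T * scale` points and then multiplied by `1/scale`, so after slicing back to `T` steps the effective positions advance in fractional increments and a longer input stays inside the trained positional range. A small illustration of just that scaling:

```python
# Illustration of the position-interpolation trick above: with scale=2 the
# effective positions advance in steps of 1/2, so a sequence of length T only
# spans the positional range [0, T/2) that the model saw during training.
import numpy as np


def interpolated_positions(T: int, scale: int = 2) -> np.ndarray:
    pos = np.arange(0, T * scale, dtype=np.float32)
    pos *= 1.0 / scale
    return pos[:T]  # the encoding is later sliced back to T steps


print(interpolated_positions(8, scale=2))
# [0.  0.5 1.  1.5 2.  2.5 3.  3.5]
```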
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index d90d69d77..27d7ffbd7 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -28,6 +28,7 @@ from paddlespeech.s2t.modules.align import LayerNorm
from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.modules.attention import MultiHeadedAttention
from paddlespeech.s2t.modules.attention import RelPositionMultiHeadedAttention
+from paddlespeech.s2t.modules.attention import RoPERelPositionMultiHeadedAttention
from paddlespeech.s2t.modules.conformer_convolution import ConvolutionModule
from paddlespeech.s2t.modules.embedding import NoPositionalEncoding
from paddlespeech.s2t.modules.embedding import PositionalEncoding
@@ -115,6 +116,8 @@ class BaseEncoder(nn.Layer):
pos_enc_class = PositionalEncoding
elif pos_enc_layer_type == "rel_pos":
pos_enc_class = RelPositionalEncoding
+ elif pos_enc_layer_type == "rope_pos":
+ pos_enc_class = RelPositionalEncoding
elif pos_enc_layer_type == "no_pos":
pos_enc_class = NoPositionalEncoding
else:
@@ -230,14 +233,14 @@ class BaseEncoder(nn.Layer):
xs = self.global_cmvn(xs)
# before embed, xs=(B, T, D1), pos_emb=(B=1, T, D)
- xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset)
+ xs, _, _ = self.embed(xs, tmp_masks, offset=offset)
# after embed, xs=(B=1, chunk_size, hidden-dim)
elayers, _, cache_t1, _ = att_cache.shape
chunk_size = xs.shape[1]
attention_key_size = cache_t1 + chunk_size
- # only used when using `RelPositionMultiHeadedAttention`
+ # only used when using `RelPositionMultiHeadedAttention` and `RoPERelPositionMultiHeadedAttention`
pos_emb = self.embed.position_encoding(
offset=offset - cache_t1, size=attention_key_size)
@@ -474,21 +477,35 @@ class ConformerEncoder(BaseEncoder):
activation = get_activation(activation_type)
# self-attention module definition
- encoder_selfattn_layer = RelPositionMultiHeadedAttention
- encoder_selfattn_layer_args = (attention_heads, output_size,
- attention_dropout_rate)
+ encoder_dim = output_size
+ if pos_enc_layer_type == "abs_pos":
+ encoder_selfattn_layer = MultiHeadedAttention
+ encoder_selfattn_layer_args = (attention_heads, encoder_dim,
+ attention_dropout_rate)
+ elif pos_enc_layer_type == "rel_pos":
+ encoder_selfattn_layer = RelPositionMultiHeadedAttention
+ encoder_selfattn_layer_args = (attention_heads, encoder_dim,
+ attention_dropout_rate)
+ elif pos_enc_layer_type == "rope_pos":
+ encoder_selfattn_layer = RoPERelPositionMultiHeadedAttention
+ encoder_selfattn_layer_args = (attention_heads, encoder_dim,
+ attention_dropout_rate)
+ else:
+ raise ValueError(
+ f"pos_enc_layer_type {pos_enc_layer_type} not supported.")
+
# feed-forward module definition
positionwise_layer = PositionwiseFeedForward
- positionwise_layer_args = (output_size, linear_units, dropout_rate,
+ positionwise_layer_args = (encoder_dim, linear_units, dropout_rate,
activation)
# convolution module definition
convolution_layer = ConvolutionModule
- convolution_layer_args = (output_size, cnn_module_kernel, activation,
+ convolution_layer_args = (encoder_dim, cnn_module_kernel, activation,
cnn_module_norm, causal)
self.encoders = nn.LayerList([
ConformerEncoderLayer(
- size=output_size,
+ size=encoder_dim,
self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),
feed_forward=positionwise_layer(*positionwise_layer_args),
feed_forward_macaron=positionwise_layer(
@@ -580,15 +597,23 @@ class SqueezeformerEncoder(nn.Layer):
activation = get_activation(activation_type)
# self-attention module definition
- if pos_enc_layer_type != "rel_pos":
+ if pos_enc_layer_type == "abs_pos":
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, output_size,
attention_dropout_rate)
- else:
+ elif pos_enc_layer_type == "rel_pos":
encoder_selfattn_layer = RelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, encoder_dim,
attention_dropout_rate,
adaptive_scale, init_weights)
+ elif pos_enc_layer_type == "rope_pos":
+ encoder_selfattn_layer = RoPERelPositionMultiHeadedAttention
+ encoder_selfattn_layer_args = (attention_heads, encoder_dim,
+ attention_dropout_rate,
+ adaptive_scale, init_weights)
+ else:
+ raise ValueError(
+ f"pos_enc_layer_type {pos_enc_layer_type} not supported.")
# feed-forward module definition
positionwise_layer = PositionwiseFeedForward
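With the branches above, the rotary variant is selected purely by `pos_enc_layer_type`. A hypothetical config fragment that would exercise the new path; the key names mirror the arguments handled in this diff, and the values are placeholders rather than a released recipe.

# hypothetical encoder_conf, for illustration only
encoder_conf = {
    "output_size": 256,
    "attention_heads": 4,
    "linear_units": 2048,
    "num_blocks": 12,
    "cnn_module_kernel": 15,
    "pos_enc_layer_type": "rope_pos",  # selects RoPERelPositionMultiHeadedAttention above
}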
diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py
index ecba95e85..0499e742b 100644
--- a/paddlespeech/s2t/modules/encoder_layer.py
+++ b/paddlespeech/s2t/modules/encoder_layer.py
@@ -48,7 +48,7 @@ class TransformerEncoderLayer(nn.Layer):
Args:
size (int): Input dimension.
self_attn (nn.Layer): Self-attention module instance.
- `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
+ `MultiHeadedAttention`, `RelPositionMultiHeadedAttention` or `RoPERelPositionMultiHeadedAttention`
instance can be used as the argument.
feed_forward (nn.Layer): Feed-forward module instance.
`PositionwiseFeedForward`, instance can be used as the argument.
@@ -147,7 +147,7 @@ class ConformerEncoderLayer(nn.Layer):
Args:
size (int): Input dimension.
self_attn (nn.Layer): Self-attention module instance.
- `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
+ `MultiHeadedAttention`, `RelPositionMultiHeadedAttention` or `RoPERelPositionMultiHeadedAttention`
instance can be used as the argument.
feed_forward (nn.Layer): Feed-forward module instance.
`PositionwiseFeedForward` instance can be used as the argument.
@@ -298,7 +298,7 @@ class SqueezeformerEncoderLayer(nn.Layer):
Args:
size (int): Input dimension.
self_attn (paddle.nn.Layer): Self-attention module instance.
- `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
+ `MultiHeadedAttention`, `RelPositionMultiHeadedAttention` or `RoPERelPositionMultiHeadedAttention`
instance can be used as the argument.
feed_forward1 (paddle.nn.Layer): Feed-forward module instance.
`PositionwiseFeedForward` instance can be used as the argument.
diff --git a/paddlespeech/s2t/training/gradclip.py b/paddlespeech/s2t/training/gradclip.py
deleted file mode 100644
index 06587c749..000000000
--- a/paddlespeech/s2t/training/gradclip.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import paddle
-from paddle.fluid import core
-from paddle.fluid import layers
-from paddle.fluid.dygraph import base as imperative_base
-
-from paddlespeech.s2t.utils.log import Log
-
-__all__ = ["ClipGradByGlobalNormWithLog"]
-
-logger = Log(__name__).getlog()
-
-
-class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
- def __init__(self, clip_norm):
- super().__init__(clip_norm)
-
- def __repr__(self):
- return f"{self.__class__.__name__}(global_clip_norm={self.clip_norm})"
-
- @imperative_base.no_grad
- def _dygraph_clip(self, params_grads):
- params_and_grads = []
- sum_square_list = []
- for i, (p, g) in enumerate(params_grads):
- if g is None:
- continue
- if getattr(p, 'need_clip', True) is False:
- continue
- merge_grad = g
- if g.type == core.VarDesc.VarType.SELECTED_ROWS:
- merge_grad = layers.merge_selected_rows(g)
- merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
- square = paddle.square(merge_grad)
- sum_square = paddle.sum(square)
- sum_square_list.append(sum_square)
-
- # debug log, not dump all since slow down train process
- if i < 10:
- logger.debug(
- f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }")
-
- # all parameters have been filterd out
- if len(sum_square_list) == 0:
- return params_grads
-
- global_norm_var = paddle.concat(sum_square_list)
- global_norm_var = paddle.sum(global_norm_var)
- global_norm_var = paddle.sqrt(global_norm_var)
-
- # debug log
- logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!")
-
- max_global_norm = paddle.full(
- shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm)
- clip_var = paddle.divide(
- x=max_global_norm,
- y=paddle.maximum(x=global_norm_var, y=max_global_norm))
- for i, (p, g) in enumerate(params_grads):
- if g is None:
- continue
- if getattr(p, 'need_clip', True) is False:
- params_and_grads.append((p, g))
- continue
- new_grad = paddle.multiply(x=g, y=clip_var)
- params_and_grads.append((p, new_grad))
-
- # debug log, not dump all since slow down train process
- if i < 10:
- logger.debug(
- f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"
- )
-
- return params_and_grads
diff --git a/paddlespeech/s2t/training/optimizer/__init__.py b/paddlespeech/s2t/training/optimizer/__init__.py
index aafdc5b6a..90281e1ed 100644
--- a/paddlespeech/s2t/training/optimizer/__init__.py
+++ b/paddlespeech/s2t/training/optimizer/__init__.py
@@ -19,7 +19,7 @@ from typing import Text
import paddle
from paddle.optimizer import Optimizer
from paddle.regularizer import L2Decay
-from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
+
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.s2t.utils.dynamic_import import instance_class
from paddlespeech.s2t.utils.log import Log
@@ -100,10 +100,9 @@ class OptimizerFactory():
assert "parameters" in args, "parameters not in args."
assert "learning_rate" in args, "learning_rate not in args."
- grad_clip = ClipGradByGlobalNormWithLog(
+ grad_clip = paddle.nn.ClipGradByGlobalNorm(
args['grad_clip']) if "grad_clip" in args else None
- weight_decay = L2Decay(
- args['weight_decay']) if "weight_decay" in args else None
+ weight_decay = args.get("weight_decay", None)
if weight_decay:
logger.info(f'')
if grad_clip:
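With `ClipGradByGlobalNormWithLog` removed, the factory now relies on Paddle's built-in clipper and passes `weight_decay` straight through as a float. A small sketch of the equivalent optimizer construction; the model, learning rate and clip value are placeholders.

import paddle

model = paddle.nn.Linear(10, 10)  # stand-in model
optimizer = paddle.optimizer.Adam(
    learning_rate=1e-3,
    parameters=model.parameters(),
    weight_decay=1e-6,                                    # plain float, no L2Decay wrapper
    grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=5.0))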
diff --git a/paddlespeech/s2t/training/optimizer/adadelta.py b/paddlespeech/s2t/training/optimizer/adadelta.py
index 900b697c5..7c3950a90 100644
--- a/paddlespeech/s2t/training/optimizer/adadelta.py
+++ b/paddlespeech/s2t/training/optimizer/adadelta.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
-from paddle.fluid import framework
+from paddle import framework
from paddle.optimizer import Optimizer
__all__ = []
diff --git a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
index 0995a55da..9dd31a08b 100644
--- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
@@ -28,7 +28,7 @@ from paddlespeech.server.utils.audio_process import float2pcm
from paddlespeech.server.utils.onnx_infer import get_sess
from paddlespeech.server.utils.util import denorm
from paddlespeech.server.utils.util import get_chunks
-from paddlespeech.t2s.frontend import English
+from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
__all__ = ['TTSEngine', 'PaddleTTSConnectionHandler']
diff --git a/paddlespeech/server/engine/tts/online/python/tts_engine.py b/paddlespeech/server/engine/tts/online/python/tts_engine.py
index a46b84bd9..0cfb20354 100644
--- a/paddlespeech/server/engine/tts/online/python/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/python/tts_engine.py
@@ -29,7 +29,7 @@ from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils.audio_process import float2pcm
from paddlespeech.server.utils.util import denorm
from paddlespeech.server.utils.util import get_chunks
-from paddlespeech.t2s.frontend import English
+from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.modules.normalizer import ZScore
diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
index 20b98fae6..3a6461f8c 100644
--- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
+++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
@@ -32,7 +32,7 @@ from paddlespeech.server.utils.errors import ErrorCode
from paddlespeech.server.utils.exception import ServerBaseException
from paddlespeech.server.utils.paddle_predictor import init_predictor
from paddlespeech.server.utils.paddle_predictor import run_model
-from paddlespeech.t2s.frontend import English
+from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
__all__ = ['TTSEngine', 'PaddleTTSConnectionHandler']
diff --git a/paddlespeech/t2s/__init__.py b/paddlespeech/t2s/__init__.py
index 57fe82a9c..7d93c026e 100644
--- a/paddlespeech/t2s/__init__.py
+++ b/paddlespeech/t2s/__init__.py
@@ -18,6 +18,5 @@ from . import exps
from . import frontend
from . import models
from . import modules
-from . import ssml
from . import training
from . import utils
diff --git a/paddlespeech/t2s/assets/__init__.py b/paddlespeech/t2s/assets/__init__.py
new file mode 100644
index 000000000..595add0ae
--- /dev/null
+++ b/paddlespeech/t2s/assets/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/t2s/assets/sentences_mix.txt b/paddlespeech/t2s/assets/sentences_mix.txt
index 06e97d14a..bfa0db636 100644
--- a/paddlespeech/t2s/assets/sentences_mix.txt
+++ b/paddlespeech/t2s/assets/sentences_mix.txt
@@ -5,4 +5,5 @@
005 Paddle Bo Bo: 使用 Paddle Speech 的语音合成模块生成虚拟人的声音。
006 热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!
007 我喜欢 eat apple, 你喜欢 drink milk。
-008 我们要去云南 team building, 非常非常 happy.
\ No newline at end of file
+008 我们要去云南 team building, 非常非常 happy.
+009 AI for Science 平台。
\ No newline at end of file
diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py
index 97626db0b..24f2be7d5 100644
--- a/paddlespeech/t2s/exps/fastspeech2/train.py
+++ b/paddlespeech/t2s/exps/fastspeech2/train.py
@@ -44,10 +44,17 @@ from paddlespeech.t2s.utils import str2bool
def train_sp(args, config):
# decides device type and whether to run in parallel
# setup running environment correctly
- if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
+ if args.ngpu > 0 and paddle.is_compiled_with_cuda():
+ paddle.set_device("gpu")
+ elif args.nxpu > 0 and paddle.is_compiled_with_xpu():
+ paddle.set_device("xpu")
+ elif args.ngpu == 0 and args.nxpu == 0:
paddle.set_device("cpu")
else:
- paddle.set_device("gpu")
+ raise ValueError(
+ "Please make sure that the paddle you installed matches the device type you set, "
+ "and that ngpu and nxpu cannot be negative at the same time.")
+
world_size = paddle.distributed.get_world_size()
if world_size > 1:
paddle.distributed.init_parallel_env()
@@ -183,7 +190,12 @@ def main():
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
- "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.")
+ "--ngpu", type=int, default=1, help="if ngpu=0, use cpu or xpu.")
+ parser.add_argument(
+ "--nxpu",
+ type=int,
+ default=0,
+ help="if ngpu=0 and nxpu > 0, use xpu. if ngpu=0 and nxpu=0, use cpu.")
parser.add_argument(
"--phones-dict", type=str, default=None, help="phone vocabulary file.")
parser.add_argument(
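The same gpu/xpu/cpu branching recurs in several entry points of this patch; a standalone sketch of the selection rule as an assumed helper, not a function that exists in the repo.

import paddle

def select_device(ngpu: int, nxpu: int) -> str:
    # gpu wins if requested and available, then xpu, then cpu; negatives are rejected
    if ngpu > 0 and paddle.is_compiled_with_cuda():
        return "gpu"
    if nxpu > 0 and paddle.is_compiled_with_xpu():
        return "xpu"
    if ngpu == 0 and nxpu == 0:
        return "cpu"
    raise ValueError("ngpu and nxpu must be >= 0 and match the installed paddle build")

paddle.set_device(select_device(ngpu=0, nxpu=0))  # falls back to cpu in this example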
diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py
index 31fe14490..8a5269825 100644
--- a/paddlespeech/t2s/exps/inference.py
+++ b/paddlespeech/t2s/exps/inference.py
@@ -112,7 +112,7 @@ def parse_args():
parser.add_argument(
"--device",
default="gpu",
- choices=["gpu", "cpu"],
+ choices=["gpu", "cpu", "xpu"],
help="Device selected for inference.", )
parser.add_argument('--cpu_threads', type=int, default=1)
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index 57c79dee1..9a07df64d 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -33,8 +33,8 @@ from yacs.config import CfgNode
from paddlespeech.t2s.datasets.am_batch_fn import *
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static
-from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.canton_frontend import CantonFrontend
+from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
from paddlespeech.t2s.frontend.sing_frontend import SingFrontend
from paddlespeech.t2s.frontend.zh_frontend import Frontend
@@ -99,14 +99,23 @@ def norm(data, mean, std):
return (data - mean) / std
-def get_chunks(data, block_size: int, pad_size: int):
- data_len = data.shape[1]
+def get_chunks(mel, chunk_size: int, pad_size: int):
+ """
+ Split mel by chunk size with left and right context.
+
+ Args:
+ mel (paddle.Tensor): mel spectrogram, shape (B, T, D)
+ chunk_size (int): chunk size
+ pad_size (int): size for left and right context.
+ """
+ T = mel.shape[1]
+ n = math.ceil(T / chunk_size)
+
chunks = []
- n = math.ceil(data_len / block_size)
for i in range(n):
- start = max(0, i * block_size - pad_size)
- end = min((i + 1) * block_size + pad_size, data_len)
- chunks.append(data[:, start:end, :])
+ start = max(0, i * chunk_size - pad_size)
+ end = min((i + 1) * chunk_size + pad_size, T)
+ chunks.append(mel[:, start:end, :])
return chunks
@@ -117,14 +126,10 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
with open(text_file, 'rt', encoding='utf-8') as f:
for line in f:
if line.strip() != "":
- items = re.split(r"\s+", line.strip(), 1)
+ items = re.split(r"\s+", line.strip(), maxsplit=1)
+ assert len(items) == 2
utt_id = items[0]
- if lang in {'zh', 'canton'}:
- sentence = "".join(items[1:])
- elif lang == 'en':
- sentence = " ".join(items[1:])
- elif lang == 'mix':
- sentence = " ".join(items[1:])
+ sentence = items[1]
sentences.append((utt_id, sentence))
return sentences
@@ -319,6 +324,7 @@ def run_frontend(
input_ids = {}
if text.strip() != "" and re.match(r".*?.*?.*", text,
re.DOTALL):
+ # using ssml
input_ids = frontend.get_input_ids_ssml(
text,
merge_sentences=merge_sentences,
@@ -359,6 +365,7 @@ def run_frontend(
outs.update({'is_slurs': is_slurs})
else:
print("lang should in {'zh', 'en', 'mix', 'canton', 'sing'}!")
+
outs.update({'phone_ids': phone_ids})
return outs
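A quick usage check of `get_chunks` as rewritten above: the function body is copied, the NumPy driver and the numbers are illustrative.

import math
import numpy as np

def get_chunks(mel, chunk_size, pad_size):
    # mel: (B, T, D); every chunk carries up to pad_size frames of context per side
    T = mel.shape[1]
    chunks = []
    for i in range(math.ceil(T / chunk_size)):
        start = max(0, i * chunk_size - pad_size)
        end = min((i + 1) * chunk_size + pad_size, T)
        chunks.append(mel[:, start:end, :])
    return chunks

mel = np.zeros((1, 100, 80))
print([c.shape[1] for c in get_chunks(mel, chunk_size=42, pad_size=12)])
# [54, 66, 28]: edge chunks are clipped, the middle one gets context on both sides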
diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py
index 6189522db..e7cf7850e 100644
--- a/paddlespeech/t2s/exps/synthesize.py
+++ b/paddlespeech/t2s/exps/synthesize.py
@@ -219,7 +219,13 @@ def parse_args():
)
# other
parser.add_argument(
- "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+ "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.")
+ parser.add_argument(
+ "--nxpu",
+ type=int,
+ default=0,
+ help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu."
+ )
parser.add_argument("--test_metadata", type=str, help="test metadata.")
parser.add_argument("--output_dir", type=str, help="output dir.")
parser.add_argument(
@@ -235,12 +241,14 @@ def parse_args():
def main():
args = parse_args()
- if args.ngpu == 0:
- paddle.set_device("cpu")
- elif args.ngpu > 0:
+ if args.ngpu > 0:
paddle.set_device("gpu")
+ elif args.nxpu > 0:
+ paddle.set_device("xpu")
+ elif args.ngpu == 0 and args.nxpu == 0:
+ paddle.set_device("cpu")
else:
- print("ngpu should >= 0 !")
+ print("ngpu or nxpu should >= 0 !")
evaluate(args)
diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index 0c7b34b09..c63a5fbe9 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -13,6 +13,7 @@
# limitations under the License.
import argparse
from pathlib import Path
+from pprint import pprint
import paddle
import soundfile as sf
@@ -78,6 +79,7 @@ def evaluate(args):
# whether dygraph to static
if args.inference_dir:
+ print("convert am and voc to static model.")
# acoustic model
am_inference = am_to_static(
am_inference=am_inference,
@@ -92,6 +94,7 @@ def evaluate(args):
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
+
merge_sentences = False
# Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph
# but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)
@@ -102,13 +105,19 @@ def evaluate(args):
if am_name == 'speedyspeech':
get_tone_ids = True
+ # wav samples
N = 0
+ # inference time cost
T = 0
+
+ # [(uid, text), ]
if am_name == 'diffsinger':
sentences = get_sentences_svs(text_file=args.text)
else:
sentences = get_sentences(text_file=args.text, lang=args.lang)
+
for utt_id, sentence in sentences:
+ print(f"{utt_id} {sentence}")
with timer() as t:
if am_name == "diffsinger":
text = ""
@@ -116,6 +125,8 @@ def evaluate(args):
else:
text = sentence
svs_input = None
+
+ # frontend
frontend_dict = run_frontend(
frontend=frontend,
text=text,
@@ -124,25 +135,33 @@ def evaluate(args):
lang=args.lang,
svs_input=svs_input)
phone_ids = frontend_dict['phone_ids']
+ # pprint(f"{utt_id} {phone_ids}")
+
with paddle.no_grad():
flags = 0
for i in range(len(phone_ids)):
+ # phone ids of one sub-sentence, split at `sp` or punctuation.
part_phone_ids = phone_ids[i]
+
# acoustic model
if am_name == 'fastspeech2':
# multi speaker
if am_dataset in {"aishell3", "vctk", "mix", "canton"}:
- spk_id = paddle.to_tensor(args.spk_id)
+ # multi-speaker
+ spk_id = paddle.to_tensor([args.spk_id])
mel = am_inference(part_phone_ids, spk_id)
else:
+ # single-speaker
mel = am_inference(part_phone_ids)
elif am_name == 'speedyspeech':
part_tone_ids = frontend_dict['tone_ids'][i]
if am_dataset in {"aishell3", "vctk", "mix"}:
- spk_id = paddle.to_tensor(args.spk_id)
+ # multi-speaker
+ spk_id = paddle.to_tensor([args.spk_id])
mel = am_inference(part_phone_ids, part_tone_ids,
spk_id)
else:
+ # single-speaker
mel = am_inference(part_phone_ids, part_tone_ids)
elif am_name == 'tacotron2':
mel = am_inference(part_phone_ids)
@@ -155,6 +174,7 @@ def evaluate(args):
note=part_note_ids,
note_dur=part_note_durs,
is_slur=part_is_slurs, )
+
# vocoder
wav = voc_inference(mel)
if flags == 0:
@@ -162,17 +182,23 @@ def evaluate(args):
flags = 1
else:
wav_all = paddle.concat([wav_all, wav])
+
wav = wav_all.numpy()
N += wav.size
T += t.elapse
+
+ # samples per second
speed = wav.size / t.elapse
+ # generating one second of wav takes `RTF` seconds
rtf = am_config.fs / speed
print(
f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
)
+
sf.write(
str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
print(f"{utt_id} done!")
+
print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
@@ -273,7 +299,13 @@ def parse_args():
default=None,
help="dir to save inference models")
parser.add_argument(
- "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+ "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.")
+ parser.add_argument(
+ "--nxpu",
+ type=int,
+ default=0,
+ help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu."
+ )
parser.add_argument(
"--text",
type=str,
@@ -303,12 +335,14 @@ def parse_args():
def main():
args = parse_args()
- if args.ngpu == 0:
- paddle.set_device("cpu")
- elif args.ngpu > 0:
+ if args.ngpu > 0:
paddle.set_device("gpu")
+ elif args.nxpu > 0:
+ paddle.set_device("xpu")
+ elif args.ngpu == 0 and args.nxpu == 0:
+ paddle.set_device("cpu")
else:
- print("ngpu should >= 0 !")
+ print("ngpu or nxpu should >= 0 !")
evaluate(args)
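For the timing lines added above, `speed` is synthesized samples per wall-clock second and RTF is how long one second of audio takes to generate. A toy calculation with made-up numbers:

fs = 24000        # vocoder output sample rate
wav_size = 72000  # 3 s of audio produced
elapse = 0.6      # wall-clock seconds spent

speed = wav_size / elapse  # 120000.0 samples generated per second
rtf = fs / speed           # 0.2: one second of audio costs 0.2 s of compute
print(speed, rtf)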
diff --git a/paddlespeech/t2s/exps/transformer_tts/preprocess.py b/paddlespeech/t2s/exps/transformer_tts/preprocess.py
index 2ebd5ecc2..4e82e53ff 100644
--- a/paddlespeech/t2s/exps/transformer_tts/preprocess.py
+++ b/paddlespeech/t2s/exps/transformer_tts/preprocess.py
@@ -27,7 +27,7 @@ import yaml
from yacs.config import CfgNode as Configuration
from paddlespeech.t2s.datasets.get_feats import LogMelFBank
-from paddlespeech.t2s.frontend import English
+from paddlespeech.t2s.frontend.en_frontend import English
def get_lj_sentences(file_name, frontend):
diff --git a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py
index 0cd7d224e..279407b38 100644
--- a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py
@@ -21,7 +21,7 @@ import soundfile as sf
import yaml
from yacs.config import CfgNode
-from paddlespeech.t2s.frontend import English
+from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.models.transformer_tts import TransformerTTS
from paddlespeech.t2s.models.transformer_tts import TransformerTTSInference
from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow
diff --git a/paddlespeech/t2s/frontend/__init__.py b/paddlespeech/t2s/frontend/__init__.py
index 64015435e..a8f77d552 100644
--- a/paddlespeech/t2s/frontend/__init__.py
+++ b/paddlespeech/t2s/frontend/__init__.py
@@ -13,8 +13,8 @@
# limitations under the License.
from .generate_lexicon import *
from .normalizer import *
-from .phonectic import *
from .punctuation import *
+from .ssml import *
from .tone_sandhi import *
from .vocab import *
from .zh_normalization import *
diff --git a/paddlespeech/t2s/frontend/arpabet.py b/paddlespeech/t2s/frontend/arpabet.py
index 7a81b645d..9b2b11b3d 100644
--- a/paddlespeech/t2s/frontend/arpabet.py
+++ b/paddlespeech/t2s/frontend/arpabet.py
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from paddlespeech.t2s.frontend.phonectic import Phonetics
"""
A phonology system with ARPABET symbols and limited punctuations. The G2P
conversion is done by g2p_en.
@@ -19,55 +18,68 @@ conversion is done by g2p_en.
Note that g2p_en does not handle words with hypen well. So make sure the input
sentence is first normalized.
"""
-from paddlespeech.t2s.frontend.vocab import Vocab
from g2p_en import G2p
+from paddlespeech.t2s.frontend.phonectic import Phonetics
+from paddlespeech.t2s.frontend.vocab import Vocab
+
class ARPABET(Phonetics):
- """A phonology for English that uses ARPABET as the phoneme vocabulary.
+ """A phonology for English that uses ARPABET without stress as the phoneme vocabulary.
+
+ 47 symbols = 39 phones + 4 punctuations + 4 special tokens (<pad> <unk> <s> </s>)
+
+ The underlying CMUdict phoneme set contains 39 phonemes; its vowels carry a lexical stress marker, which this class strips:
+ 0 — No stress
+ 1 — Primary stress
+ 2 — Secondary stress
+
+ Phoneme Set:
+ Phoneme Example Translation
+ ------- ------- -----------
+ AA odd AA D
+ AE at AE T
+ AH hut HH AH T
+ AO ought AO T
+ AW cow K AW
+ AY hide HH AY D
+ B be B IY
+ CH cheese CH IY Z
+ D dee D IY
+ DH thee DH IY
+ EH Ed EH D
+ ER hurt HH ER T
+ EY ate EY T
+ F fee F IY
+ G green G R IY N
+ HH he HH IY
+ IH it IH T
+ IY eat IY T
+ JH gee JH IY
+ K key K IY
+ L lee L IY
+ M me M IY
+ N knee N IY
+ NG ping P IH NG
+ OW oat OW T
+ OY toy T OY
+ P pee P IY
+ R read R IY D
+ S sea S IY
+ SH she SH IY
+ T tea T IY
+ TH theta TH EY T AH
+ UH hood HH UH D
+ UW two T UW
+ V vee V IY
+ W we W IY
+ Y yield Y IY L D
+ Z zee Z IY
+ ZH seizure S IY ZH ER
+
See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details.
- Phoneme Example Translation
- ------- ------- -----------
- AA odd AA D
- AE at AE T
- AH hut HH AH T
- AO ought AO T
- AW cow K AW
- AY hide HH AY D
- B be B IY
- CH cheese CH IY Z
- D dee D IY
- DH thee DH IY
- EH Ed EH D
- ER hurt HH ER T
- EY ate EY T
- F fee F IY
- G green G R IY N
- HH he HH IY
- IH it IH T
- IY eat IY T
- JH gee JH IY
- K key K IY
- L lee L IY
- M me M IY
- N knee N IY
- NG ping P IH NG
- OW oat OW T
- OY toy T OY
- P pee P IY
- R read R IY D
- S sea S IY
- SH she SH IY
- T tea T IY
- TH theta TH EY T AH
- UH hood HH UH D
- UW two T UW
- V vee V IY
- W we W IY
- Y yield Y IY L D
- Z zee Z IY
- ZH seizure S IY ZH ER
"""
+ # 39 phonemes
phonemes = [
'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER',
'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW',
@@ -76,6 +88,8 @@ class ARPABET(Phonetics):
]
punctuations = [',', '.', '?', '!']
symbols = phonemes + punctuations
+ # vowels carry a lexical stress marker:
+ # 0 unstressed, 1 primary stress, 2 secondary stress
_stress_to_no_stress_ = {
'AA0': 'AA',
'AA1': 'AA',
@@ -124,7 +138,12 @@ class ARPABET(Phonetics):
'UW2': 'UW'
}
+ def __repr__(self):
+ fmt = "ARPABETWithoutStress(phonemes: {}, punctuations: {})"
+ return fmt.format(len(self.phonemes), self.punctuations)
+
def __init__(self):
+ # https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py
self.backend = G2p()
self.vocab = Vocab(self.phonemes + self.punctuations)
@@ -139,6 +158,7 @@ class ARPABET(Phonetics):
Returns:
List[str]: The list of pronunciation sequence.
"""
+ # g2p and remove vowel stress
phonemes = [
self._remove_vowels(item) for item in self.backend(sentence)
]
@@ -158,6 +178,7 @@ class ARPABET(Phonetics):
Returns:
List[int]: The list of pronunciation id sequence.
"""
+ # phonemes to ids
ids = [self.vocab.lookup(item) for item in phonemes]
return ids
@@ -189,11 +210,16 @@ class ARPABET(Phonetics):
def vocab_size(self):
""" Vocab size.
"""
- # 47 = 39 phones + 4 punctuations + 4 special tokens
+ # 47 = 39 phones + 4 punctuations + 4 special tokens (<pad> <unk> <s> </s>)
return len(self.vocab)
class ARPABETWithStress(Phonetics):
+ """
+ A phonology for English that uses ARPABET with stress as the phoneme vocabulary.
+
+ 77 symbols = 69 phones + 4 punctuations + 4 special tokens
+ """
phonemes = [
'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D',
@@ -206,6 +232,10 @@ class ARPABETWithStress(Phonetics):
punctuations = [',', '.', '?', '!']
symbols = phonemes + punctuations
+ def __repr__(self):
+ fmt = "ARPABETWithStress(phonemes: {}, punctuations: {})"
+ return fmt.format(len(self.phonemes), self.punctuations)
+
def __init__(self):
self.backend = G2p()
self.vocab = Vocab(self.phonemes + self.punctuations)
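The stress-free ARPABET class maps e.g. AH0/AH1/AH2 back to AH via the table above. The same effect can be sketched by trimming the trailing stress digit from g2p_en output; this assumes g2p_en and its NLTK data are installed, and the exact phones may vary.

from g2p_en import G2p

def strip_stress(phone):
    # vowels from CMUdict end in a 0/1/2 lexical stress digit; drop it
    return phone[:-1] if phone and phone[-1] in "012" else phone

g2p = G2p()
phones = [p for p in g2p("How are you?") if p != " "]
print([strip_stress(p) for p in phones])
# e.g. ['HH', 'AW', 'AA', 'R', 'Y', 'UW', '?']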
diff --git a/paddlespeech/t2s/frontend/canton_frontend.py b/paddlespeech/t2s/frontend/canton_frontend.py
index f2c7175fe..bbb7bcf00 100644
--- a/paddlespeech/t2s/frontend/canton_frontend.py
+++ b/paddlespeech/t2s/frontend/canton_frontend.py
@@ -29,7 +29,8 @@ INITIALS = [
INITIALS += ['sp', 'spl', 'spn', 'sil']
-def get_lines(cantons: List[str]):
+def jyuping_to_phonemes(cantons: List[str]):
+ # jyutping to initial and final
phones = []
for canton in cantons:
for consonant in INITIALS:
@@ -47,7 +48,7 @@ def get_lines(cantons: List[str]):
class CantonFrontend():
def __init__(self, phone_vocab_path: str):
self.text_normalizer = TextNormalizer()
- self.punc = ":,;。?!“”‘’':,;.?!"
+ self.punc = "、:,;。?!“”‘’':,;.?!"
self.vocab_phones = {}
if phone_vocab_path:
@@ -61,8 +62,11 @@ class CantonFrontend():
merge_sentences: bool=True) -> List[List[str]]:
phones_list = []
for sentence in sentences:
+ # jyutping
+ # 'gam3 ngaam1 lou5 sai3 jiu1 kau4 keoi5 dang2 zan6 jiu3 hoi1 wui2, zing6 dai1 ge2 je5 ngo5 wui5 gaau2 dim6 ga3 laa3.'
phones_str = ToJyutping.get_jyutping_text(sentence)
- phones_split = get_lines(phones_str.split(' '))
+ # phonemes
+ phones_split = jyuping_to_phonemes(phones_str.split(' '))
phones_list.append(phones_split)
return phones_list
@@ -78,8 +82,11 @@ class CantonFrontend():
sentence: str,
merge_sentences: bool=True,
print_info: bool=False) -> List[List[str]]:
+ # TN & Text Segmentation
sentences = self.text_normalizer.normalize(sentence)
+ # G2P
phonemes = self._g2p(sentences, merge_sentences=merge_sentences)
+
if print_info:
print("----------------------------")
print("text norm results:")
@@ -88,6 +95,7 @@ class CantonFrontend():
print("g2p results:")
print(phonemes)
print("----------------------------")
+
return phonemes
def get_input_ids(self,
@@ -98,9 +106,9 @@ class CantonFrontend():
phonemes = self.get_phonemes(
sentence, merge_sentences=merge_sentences, print_info=print_info)
+
result = {}
temp_phone_ids = []
-
for phones in phonemes:
if phones:
phone_ids = self._p2id(phones)
@@ -108,6 +116,8 @@ class CantonFrontend():
if to_tensor:
phone_ids = paddle.to_tensor(phone_ids)
temp_phone_ids.append(phone_ids)
+
if temp_phone_ids:
result["phone_ids"] = temp_phone_ids
+
return result
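`jyuping_to_phonemes` splits each Jyutping syllable into an onset from `INITIALS` plus the remaining final. A self-contained sketch of that split with a shortened, illustrative onset list (not the module's full `INITIALS`):

ONSETS = ['b', 'c', 'd', 'f', 'g', 'gw', 'h', 'j', 'k', 'kw', 'l', 'm',
          'n', 'ng', 'p', 's', 't', 'w', 'z']  # abbreviated for illustration

def split_jyutping(syllable):
    # try longer onsets first so 'gw', 'kw' and 'ng' are not split as 'g', 'k', 'n'
    for onset in sorted(ONSETS, key=len, reverse=True):
        if syllable.startswith(onset) and len(syllable) > len(onset):
            return [onset, syllable[len(onset):]]
    return [syllable]  # vowel-initial syllable: final only

print(split_jyutping("gam3"), split_jyutping("ngaam1"), split_jyutping("oi3"))
# ['g', 'am3'] ['ng', 'aam1'] ['oi3']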
diff --git a/paddlespeech/t2s/frontend/en_frontend.py b/paddlespeech/t2s/frontend/en_frontend.py
new file mode 100644
index 000000000..c58bed7d3
--- /dev/null
+++ b/paddlespeech/t2s/frontend/en_frontend.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .phonectic import English
diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py
index b8c16097c..2ebfe135e 100644
--- a/paddlespeech/t2s/frontend/mix_frontend.py
+++ b/paddlespeech/t2s/frontend/mix_frontend.py
@@ -18,9 +18,9 @@ from typing import List
import numpy as np
import paddle
-from paddlespeech.t2s.frontend import English
-from paddlespeech.t2s.frontend.zh_frontend import Frontend
-from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor
+from paddlespeech.t2s.frontend.en_frontend import English as EnFrontend
+from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor
+from paddlespeech.t2s.frontend.zh_frontend import Frontend as ZhFrontend
class MixFrontend():
@@ -28,10 +28,9 @@ class MixFrontend():
g2p_model="pypinyin",
phone_vocab_path=None,
tone_vocab_path=None):
-
- self.zh_frontend = Frontend(
+ self.zh_frontend = ZhFrontend(
phone_vocab_path=phone_vocab_path, tone_vocab_path=tone_vocab_path)
- self.en_frontend = English(phone_vocab_path=phone_vocab_path)
+ self.en_frontend = EnFrontend(phone_vocab_path=phone_vocab_path)
self.sp_id = self.zh_frontend.vocab_phones["sp"]
self.sp_id_numpy = np.array([self.sp_id])
self.sp_id_tensor = paddle.to_tensor([self.sp_id])
@@ -55,15 +54,12 @@ class MixFrontend():
else:
return False
- def get_segment(self, text: str) -> List[str]:
+ def split_by_lang(self, text: str) -> List[str]:
# sentence --> [ch_part, en_part, ch_part, ...]
segments = []
types = []
- flag = 0
- temp_seg = ""
- temp_lang = ""
- # Determine the type of each character. type: blank, chinese, alphabet, number, unk and point.
+ # Determine the type of each character. type: chinese, alphabet, other.
for ch in text:
if self.is_chinese(ch):
types.append("zh")
@@ -74,31 +70,31 @@ class MixFrontend():
assert len(types) == len(text)
- for i in range(len(types)):
+ flag = 0
+ temp_seg = ""
+ temp_lang = ""
+
+ for i in range(len(text)):
# find the first char of the seg
if flag == 0:
temp_seg += text[i]
temp_lang = types[i]
flag = 1
-
else:
if temp_lang == "other":
- if types[i] == temp_lang:
- temp_seg += text[i]
- else:
- temp_seg += text[i]
+ # current segment is 'other' (e.g. leading punctuation); absorb and switch lang once zh/en appears.
+ temp_seg += text[i]
+ if types[i] != temp_lang:
temp_lang = types[i]
-
else:
- if types[i] == temp_lang:
- temp_seg += text[i]
- elif types[i] == "other":
+ if types[i] == temp_lang or types[i] == "other":
+ # merge same lang or other
temp_seg += text[i]
else:
+ # change lang
segments.append((temp_seg, temp_lang))
temp_seg = text[i]
- temp_lang = types[i]
- flag = 1
+ temp_lang = types[i] # new lang
segments.append((temp_seg, temp_lang))
@@ -110,76 +106,95 @@ class MixFrontend():
get_tone_ids: bool=False,
add_sp: bool=True,
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
- ''' 1. 添加SSML支持,先列出 文字 和 标签内容,
- 然后添加到tmpSegments数组里
- '''
- d_inputs = MixTextProcessor.get_dom_split(sentence)
- tmpSegments = []
- for instr in d_inputs:
- ''' 暂时只支持 say-as '''
- if instr.lower().startswith("<say-as"):
currentSeg[0] = "<speak>" + currentSeg[0] + "</speak>"
segments.append(tuple(currentSeg))
+ # en
segments.append(seg)
+ # reset
currentSeg = ["", ""]
else:
+ # zh
if currentSeg[0] == '':
+ # first see
currentSeg[0] = seg[0]
currentSeg[1] = seg[1]
else:
+ # merge zh
currentSeg[0] = currentSeg[0] + seg[0]
+
if currentSeg[0] != '':
+ # last zh
currentSeg[0] = "" + currentSeg[0] + ""
segments.append(tuple(currentSeg))
phones_list = []
result = {}
+ # 008 我们要去云南 team building, 非常非常 happy.
+ # seg ('我们要去云南 ', 'zh')
+ # seg ('team building, ', 'en')
+ # seg ('非常非常 ', 'zh')
+ # seg ('happy.', 'en')
+ # [('我们要去云南 ', 'zh'), ('team building, ', 'en'), ('非常非常 ', 'zh'), ('happy.', 'en')]
for seg in segments:
content = seg[0]
lang = seg[1]
- if content != '':
- if lang == "en":
- input_ids = self.en_frontend.get_input_ids(
- content, merge_sentences=False, to_tensor=to_tensor)
+
+ if not content:
+ continue
+
+ if lang == "en":
+ input_ids = self.en_frontend.get_input_ids(
+ content, merge_sentences=False, to_tensor=to_tensor)
+ else:
+ if content.strip() != "" and \
+ re.match(r".*?.*?.*", content, re.DOTALL):
+ # process ssml
+ input_ids = self.zh_frontend.get_input_ids_ssml(
+ content,
+ merge_sentences=False,
+ get_tone_ids=get_tone_ids,
+ to_tensor=to_tensor)
else:
- ''' 3. 把带speak tag的中文和普通文字分开处理
- '''
- if content.strip() != "" and \
- re.match(r".*?.*?.*", content, re.DOTALL):
- input_ids = self.zh_frontend.get_input_ids_ssml(
- content,
- merge_sentences=False,
- get_tone_ids=get_tone_ids,
- to_tensor=to_tensor)
- else:
- input_ids = self.zh_frontend.get_input_ids(
- content,
- merge_sentences=False,
- get_tone_ids=get_tone_ids,
- to_tensor=to_tensor)
- if add_sp:
- if to_tensor:
- input_ids["phone_ids"][-1] = paddle.concat(
- [input_ids["phone_ids"][-1], self.sp_id_tensor])
- else:
- input_ids["phone_ids"][-1] = np.concatenate(
- (input_ids["phone_ids"][-1], self.sp_id_numpy))
+ # process plain text
+ input_ids = self.zh_frontend.get_input_ids(
+ content,
+ merge_sentences=False,
+ get_tone_ids=get_tone_ids,
+ to_tensor=to_tensor)
+
+ if add_sp:
+ # add sp between zh and en
+ if to_tensor:
+ input_ids["phone_ids"][-1] = paddle.concat(
+ [input_ids["phone_ids"][-1], self.sp_id_tensor])
+ else:
+ input_ids["phone_ids"][-1] = np.concatenate(
+ (input_ids["phone_ids"][-1], self.sp_id_numpy))
- for phones in input_ids["phone_ids"]:
- phones_list.append(phones)
+ phones_list.extend(input_ids["phone_ids"])
if merge_sentences:
merge_list = paddle.concat(phones_list)
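`split_by_lang` groups characters into zh/en runs and folds punctuation into the current run. A compact standalone sketch with a simplified character classifier; it mirrors the behaviour shown in the comments above but is not the class method itself.

def split_by_lang(text):
    def kind(ch):
        if '\u4e00' <= ch <= '\u9fff':
            return "zh"
        if ch.isascii() and ch.isalpha():
            return "en"
        return "other"

    segments, buf, lang = [], "", None
    for ch in text:
        k = kind(ch)
        if lang is None:
            buf, lang = ch, k
        elif lang == "other":
            buf += ch               # leading punctuation sticks to the next segment
            if k != "other":
                lang = k
        elif k == lang or k == "other":
            buf += ch               # same language, or punctuation inside a run
        else:
            segments.append((buf, lang))
            buf, lang = ch, k       # language switch starts a new segment
    if buf:
        segments.append((buf, lang))
    return segments

print(split_by_lang("我们要去云南 team building, 非常非常 happy."))
# [('我们要去云南 ', 'zh'), ('team building, ', 'en'), ('非常非常 ', 'zh'), ('happy.', 'en')]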
diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py
index af86d9b80..d6c66f1e0 100644
--- a/paddlespeech/t2s/frontend/phonectic.py
+++ b/paddlespeech/t2s/frontend/phonectic.py
@@ -47,15 +47,34 @@ class Phonetics(ABC):
class English(Phonetics):
""" Normalize the input text sequence and convert into pronunciation id sequence.
+
+ https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py
+
+ phonemes = ["", "", "", ""] + [
+ 'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
+ 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH',
+ 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
+ 'EY2', 'F', 'G', 'HH',
+ 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L',
+ 'M', 'N', 'NG', 'OW0', 'OW1',
+ 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH',
+ 'UH0', 'UH1', 'UH2', 'UW',
+ 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH']
"""
+ LEXICON = {
+ # key using lowercase
+ "AI".lower(): [["EY0", "AY1"]],
+ }
+
def __init__(self, phone_vocab_path=None):
self.backend = G2p()
+ self.backend.cmu.update(English.LEXICON)
self.phonemes = list(self.backend.phonemes)
self.punctuations = get_punctuations("en")
self.vocab = Vocab(self.phonemes + self.punctuations)
self.vocab_phones = {}
- self.punc = ":,;。?!“”‘’':,;.?!"
+ self.punc = "、:,;。?!“”‘’':,;.?!"
self.text_normalizer = TextNormalizer()
if phone_vocab_path:
with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
@@ -86,8 +105,8 @@ class English(Phonetics):
sentence: str,
merge_sentences: bool=False,
to_tensor: bool=True) -> paddle.Tensor:
- result = {}
sentences = self.text_normalizer._split(sentence, lang="en")
+
phones_list = []
temp_phone_ids = []
for sentence in sentences:
@@ -118,7 +137,10 @@ class English(Phonetics):
if to_tensor:
phone_ids = paddle.to_tensor(phone_ids)
temp_phone_ids.append(phone_ids)
+
+ result = {}
result["phone_ids"] = temp_phone_ids
+
return result
def numericalize(self, phonemes):
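The new `LEXICON` hook pins pronunciations by pre-seeding g2p_en's CMU dictionary before prediction. The same override can be sketched directly on a `G2p` instance; this assumes g2p_en is installed, and the 'AI' entry mirrors the one added above.

from g2p_en import G2p

g2p = G2p()
# g2p_en consults its cmudict copy (g2p.cmu) before the neural fallback;
# values are lists of candidate pronunciations and the first one wins
g2p.cmu.update({"ai": [["EY0", "AY1"]]})

print(g2p("AI"))  # expected to yield ['EY0', 'AY1'] via the lexicon entry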
diff --git a/paddlespeech/t2s/frontend/polyphonic.py b/paddlespeech/t2s/frontend/polyphonic.py
new file mode 100644
index 000000000..9a757e204
--- /dev/null
+++ b/paddlespeech/t2s/frontend/polyphonic.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import yaml
+
+
+class Polyphonic():
+ def __init__(self):
+ with open(
+ os.path.join(
+ os.path.dirname(os.path.abspath(__file__)),
+ 'polyphonic.yaml'),
+ 'r',
+ encoding='utf-8') as polyphonic_file:
+ # parse the yaml file
+ polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader)
+ self.polyphonic_words = polyphonic_dict["polyphonic"]
+
+ def correct_pronunciation(self, word, pinyin):
+ # if the word is in the dictionary, return the corrected pronunciation
+ if word in self.polyphonic_words.keys():
+ pinyin = self.polyphonic_words[word]
+ # otherwise return the original pronunciation
+ return pinyin
diff --git a/paddlespeech/t2s/frontend/polyphonic.yaml b/paddlespeech/t2s/frontend/polyphonic.yaml
index 6885035e7..f52b1cf58 100644
--- a/paddlespeech/t2s/frontend/polyphonic.yaml
+++ b/paddlespeech/t2s/frontend/polyphonic.yaml
@@ -47,4 +47,8 @@ polyphonic:
恶行: ['e4','xing2']
唉: ['ai4']
扎实: ['zha1','shi2']
- 干将: ['gan4','jiang4']
\ No newline at end of file
+ 干将: ['gan4','jiang4']
+ 陈威行: ['chen2', 'wei1', 'hang2']
+ 郭晟: ['guo1', 'sheng4']
+ 中标: ['zhong4', 'biao1']
+ 抗住: ['kang2', 'zhu4']
\ No newline at end of file
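The extracted `Polyphonic` helper is a YAML-backed lookup over word-level pronunciations. A small sketch of the correction step without the file I/O; the two entries are copied from the yaml additions above.

polyphonic_words = {
    "中标": ["zhong4", "biao1"],
    "抗住": ["kang2", "zhu4"],
}

def correct_pronunciation(word, pinyin):
    # return the curated reading when the word is listed, otherwise keep g2p's output
    return polyphonic_words.get(word, pinyin)

print(correct_pronunciation("中标", ["zhong1", "biao1"]))  # ['zhong4', 'biao1']
print(correct_pronunciation("你好", ["ni3", "hao3"]))      # unchanged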
diff --git a/paddlespeech/t2s/frontend/sing_frontend.py b/paddlespeech/t2s/frontend/sing_frontend.py
index c2aecf273..fff72a10c 100644
--- a/paddlespeech/t2s/frontend/sing_frontend.py
+++ b/paddlespeech/t2s/frontend/sing_frontend.py
@@ -29,7 +29,7 @@ class SingFrontend():
pinyin_phone_path (str): pinyin to phone file path, a 'pinyin|phones' (like: ba|b a ) pair per line.
phone_vocab_path (str): phone to phone id file path, a 'phone phone id' (like: a 4 ) pair per line.
"""
- self.punc = '[:,;。?!“”‘’\':,;.?!]'
+ self.punc = '[、:,;。?!“”‘’\':,;.?!]'
self.pinyin_phones = {'AP': 'AP', 'SP': 'SP'}
if pinyin_phone_path:
diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/frontend/ssml/__init__.py
similarity index 89%
rename from paddlespeech/t2s/ssml/__init__.py
rename to paddlespeech/t2s/frontend/ssml/__init__.py
index 9b4db053b..b1b9d726f 100644
--- a/paddlespeech/t2s/ssml/__init__.py
+++ b/paddlespeech/t2s/frontend/ssml/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/frontend/ssml/xml_processor.py
similarity index 84%
rename from paddlespeech/t2s/ssml/xml_processor.py
rename to paddlespeech/t2s/frontend/ssml/xml_processor.py
index 892ca371e..1d216c31b 100644
--- a/paddlespeech/t2s/ssml/xml_processor.py
+++ b/paddlespeech/t2s/frontend/ssml/xml_processor.py
@@ -1,4 +1,17 @@
# -*- coding: utf-8 -*-
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import re
import xml.dom.minidom
import xml.parsers.expat
@@ -17,7 +30,6 @@ Note: xml 有5种特殊字符, &<>"'
' '
例如:
"姓名"
-
'''
@@ -61,17 +73,29 @@ class MixTextProcessor():
patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S)
mat = re.match(patn, mixstr)
if mat:
+ # pre
pre_xml = mat.group(1)
+ # between <speak> ... </speak>
in_xml = mat.group(2)
+ # post
after_xml = mat.group(3)
- ctlist.append([pre_xml, []])
+ # pre with none syllable
+ if pre_xml:
+ ctlist.append([pre_xml, []])
+
+ # between with syllable
+ # [(sub sentence, [syllables]), ...]
dom = DomXml(in_xml)
pinyinlist = dom.get_pinyins_for_xml()
ctlist = ctlist + pinyinlist
- ctlist.append([after_xml, []])
+
+ # post with none syllable
+ if after_xml:
+ ctlist.append([after_xml, []])
else:
ctlist.append([mixstr, []])
+
return ctlist
@classmethod
@@ -86,17 +110,21 @@ class MixTextProcessor():
in_xml = mat.group(2)
after_xml = mat.group(3)
- ctlist.append(pre_xml)
+ if pre_xml:
+ ctlist.append(pre_xml)
+
dom = DomXml(in_xml)
tags = dom.get_text_and_sayas_tags()
ctlist.extend(tags)
-
- ctlist.append(after_xml)
- return ctlist
+
+ if after_xml:
+ ctlist.append(after_xml)
else:
ctlist.append(mixstr)
+
return ctlist
+
class DomXml():
def __init__(self, xmlstr):
self.tdom = parseString(xmlstr) #Document
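Callers decide between the plain and SSML front ends with the same `<speak>` test used in syn_utils.py. A sketch of that routing; the sample markup follows the say-as/pinyin convention seen in PaddleSpeech SSML demos and is illustrative only.

import re

SSML_PATTERN = re.compile(r".*?<speak>.*?</speak>.*", re.DOTALL)

text = '<speak>请把这个字读成<say-as pinyin="zhong4">重</say-as>。</speak>'
if text.strip() and SSML_PATTERN.match(text):
    print("route to get_input_ids_ssml")   # SSML-aware path
else:
    print("route to get_input_ids")        # plain-text path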
diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py
index 42f7b8b2f..690f69aa2 100644
--- a/paddlespeech/t2s/frontend/tone_sandhi.py
+++ b/paddlespeech/t2s/frontend/tone_sandhi.py
@@ -20,6 +20,9 @@ from pypinyin import Style
class ToneSandhi():
+ def __repr__(self):
+ return "MandarinToneSandhi"
+
def __init__(self):
self.must_neural_tone_words = {
'麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝',
@@ -65,9 +68,22 @@ class ToneSandhi():
'男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎',
'幺幺', '干嘛', '学子', '哈哈', '数数', '袅袅', '局地', '以下', '娃哈哈', '花花草草', '留得',
'耕地', '想想', '熙熙', '攘攘', '卵子', '死死', '冉冉', '恳恳', '佼佼', '吵吵', '打打',
- '考考', '整整', '莘莘', '落地', '算子', '家家户户'
+ '考考', '整整', '莘莘', '落地', '算子', '家家户户', '青青'
}
- self.punc = ":,;。?!“”‘’':,;.?!"
+ self.punc = "、:,;。?!“”‘’':,;.?!"
+
+ def _split_word(self, word: str) -> List[str]:
+ word_list = jieba.cut_for_search(word)
+ word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
+ first_subword = word_list[0]
+ first_begin_idx = word.find(first_subword)
+ if first_begin_idx == 0:
+ second_subword = word[len(first_subword):]
+ new_word_list = [first_subword, second_subword]
+ else:
+ second_subword = word[:-len(first_subword)]
+ new_word_list = [second_subword, first_subword]
+ return new_word_list
# the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
# e.g.
@@ -154,18 +170,8 @@ class ToneSandhi():
finals[i] = finals[i][:-1] + "4"
return finals
- def _split_word(self, word: str) -> List[str]:
- word_list = jieba.cut_for_search(word)
- word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
- first_subword = word_list[0]
- first_begin_idx = word.find(first_subword)
- if first_begin_idx == 0:
- second_subword = word[len(first_subword):]
- new_word_list = [first_subword, second_subword]
- else:
- second_subword = word[:-len(first_subword)]
- new_word_list = [second_subword, first_subword]
- return new_word_list
+ def _all_tone_three(self, finals: List[str]) -> bool:
+ return all(x[-1] == "3" for x in finals)
def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
@@ -207,9 +213,6 @@ class ToneSandhi():
return finals
- def _all_tone_three(self, finals: List[str]) -> bool:
- return all(x[-1] == "3" for x in finals)
-
# merge "不" and the word behind it
# if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error
def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
@@ -336,6 +339,9 @@ class ToneSandhi():
def pre_merge_for_modify(
self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
+ """
+ seg: [(word, pos), ...]
+ """
seg = self._merge_bu(seg)
seg = self._merge_yi(seg)
seg = self._merge_reduplication(seg)
@@ -346,7 +352,11 @@ class ToneSandhi():
def modified_tone(self, word: str, pos: str,
finals: List[str]) -> List[str]:
-
+ """
+ word: segmented word
+ pos: part-of-speech tag
+ finals: finals with tone, [final1, ..., finaln]
+ """
finals = self._bu_sandhi(word, finals)
finals = self._yi_sandhi(word, finals)
finals = self._neural_sandhi(word, pos, finals)
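The relocated `_all_tone_three` is the guard used by third-tone sandhi: when all finals of a word carry tone 3, the preceding syllables are typically shifted to tone 2. A tiny illustration on bare finals:

def all_tone_three(finals):
    # finals look like 'ao3', 'u3': the trailing digit is the tone
    return all(f[-1] == "3" for f in finals)

print(all_tone_three(["ao3", "u3"]))   # True, e.g. 老虎 -> lao2 hu3 after sandhi
print(all_tone_three(["i3", "ao4"]))   # False, no change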
diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py
index 35b97a93a..1431bc6d8 100644
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@@ -14,6 +14,7 @@
import os
import re
from operator import itemgetter
+from pprint import pprint
from typing import Dict
from typing import List
@@ -30,10 +31,11 @@ from pypinyin_dict.phrase_pinyin_data import large_pinyin
from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter
from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
+from paddlespeech.t2s.frontend.polyphonic import Polyphonic
from paddlespeech.t2s.frontend.rhy_prediction.rhy_predictor import RhyPredictor
+from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor
from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer
-from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor
INITIALS = [
'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh',
@@ -41,6 +43,9 @@ INITIALS = [
]
INITIALS += ['y', 'w', 'sp', 'spl', 'spn', 'sil']
+# 0 for None, 5 for neutral
+TONES = ["0", "1", "2", "3", "4", "5"]
+
def intersperse(lst, item):
result = [item] * (len(lst) * 2 + 1)
@@ -49,34 +54,19 @@ def intersperse(lst, item):
def insert_after_character(lst, item):
+ """
+ insert `item` after finals.
+ """
result = [item]
+
for phone in lst:
result.append(phone)
if phone not in INITIALS:
# finals has tones
# assert phone[-1] in "12345"
result.append(item)
- return result
-
-
-class Polyphonic():
- def __init__(self):
- with open(
- os.path.join(
- os.path.dirname(os.path.abspath(__file__)),
- 'polyphonic.yaml'),
- 'r',
- encoding='utf-8') as polyphonic_file:
- # 解析yaml
- polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader)
- self.polyphonic_words = polyphonic_dict["polyphonic"]
- def correct_pronunciation(self, word, pinyin):
- # 词汇被词典收录则返回纠正后的读音
- if word in self.polyphonic_words.keys():
- pinyin = self.polyphonic_words[word]
- # 否则返回原读音
- return pinyin
+ return result
class Frontend():
@@ -85,10 +75,8 @@ class Frontend():
phone_vocab_path=None,
tone_vocab_path=None,
use_rhy=False):
- self.mix_ssml_processor = MixTextProcessor()
- self.tone_modifier = ToneSandhi()
- self.text_normalizer = TextNormalizer()
- self.punc = ":,;。?!“”‘’':,;.?!"
+
+ self.punc = "、:,;。?!“”‘’':,;.?!"
self.rhy_phns = ['sp1', 'sp2', 'sp3', 'sp4']
self.phrases_dict = {
'开户行': [['ka1i'], ['hu4'], ['hang2']],
@@ -108,28 +96,7 @@ class Frontend():
'嘞': [['lei5']],
'掺和': [['chan1'], ['huo5']]
}
- self.use_rhy = use_rhy
- if use_rhy:
- self.rhy_predictor = RhyPredictor()
- print("Rhythm predictor loaded.")
- # g2p_model can be pypinyin and g2pM and g2pW
- self.g2p_model = g2p_model
- if self.g2p_model == "g2pM":
- self.g2pM_model = G2pM()
- self.pinyin2phone = generate_lexicon(
- with_tone=True, with_erhua=False)
- elif self.g2p_model == "g2pW":
- # use pypinyin as backup for non polyphonic characters in g2pW
- self._init_pypinyin()
- self.corrector = Polyphonic()
- self.g2pM_model = G2pM()
- self.g2pW_model = G2PWOnnxConverter(
- style='pinyin', enable_non_tradional_chinese=True)
- self.pinyin2phone = generate_lexicon(
- with_tone=True, with_erhua=False)
- else:
- self._init_pypinyin()
self.must_erhua = {
"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿"
}
@@ -154,13 +121,51 @@ class Frontend():
for tone, id in tone_id:
self.vocab_tones[tone] = int(id)
+ # SSML
+ self.mix_ssml_processor = MixTextProcessor()
+ # tone sandhi
+ self.tone_modifier = ToneSandhi()
+ # TN
+ self.text_normalizer = TextNormalizer()
+
+ # prosody
+ self.use_rhy = use_rhy
+ if use_rhy:
+ self.rhy_predictor = RhyPredictor()
+ print("Rhythm predictor loaded.")
+
+ # g2p
+ assert g2p_model in ('pypinyin', 'g2pM', 'g2pW')
+ self.g2p_model = g2p_model
+ if self.g2p_model == "g2pM":
+ self.g2pM_model = G2pM()
+ self.pinyin2phone = generate_lexicon(
+ with_tone=True, with_erhua=False)
+ elif self.g2p_model == "g2pW":
+ # use pypinyin as backup for non polyphonic characters in g2pW
+ self._init_pypinyin()
+ self.corrector = Polyphonic()
+ self.g2pM_model = G2pM()
+ self.g2pW_model = G2PWOnnxConverter(
+ style='pinyin', enable_non_tradional_chinese=True)
+ self.pinyin2phone = generate_lexicon(
+ with_tone=True, with_erhua=False)
+ else:
+ self._init_pypinyin()
+
def _init_pypinyin(self):
+ """
+ Load pypinyin G2P module.
+ """
large_pinyin.load()
load_phrases_dict(self.phrases_dict)
# 调整字的拼音顺序
load_single_dict({ord(u'地'): u'de,di4'})
def _get_initials_finals(self, word: str) -> List[List[str]]:
+ """
+ Get word initial and final by pypinyin or g2pM
+ """
initials = []
finals = []
if self.g2p_model == "pypinyin":
@@ -171,11 +176,14 @@ class Frontend():
for c, v in zip(orig_initials, orig_finals):
if re.match(r'i\d', v):
if c in ['z', 'c', 's']:
+ # zi, ci, si
v = re.sub('i', 'ii', v)
elif c in ['zh', 'ch', 'sh', 'r']:
+ # zhi, chi, shi
v = re.sub('i', 'iii', v)
initials.append(c)
finals.append(v)
+
elif self.g2p_model == "g2pM":
pinyins = self.g2pM_model(word, tone=True, char_split=False)
for pinyin in pinyins:
@@ -192,58 +200,123 @@ class Frontend():
# If it's not pinyin (possibly punctuation) or no conversion is required
initials.append(pinyin)
finals.append(pinyin)
+
return initials, finals
+ def _merge_erhua(self,
+ initials: List[str],
+ finals: List[str],
+ word: str,
+ pos: str) -> List[List[str]]:
+ """
+ Do erhub.
+ """
+ # fix er1
+ for i, phn in enumerate(finals):
+ if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1':
+ finals[i] = 'er2'
+
+ # 儿 is pronounced as its own syllable: skip erhua merging
+ if word not in self.must_erhua and (word in self.not_erhua or
+ pos in {"a", "j", "nr"}):
+ return initials, finals
+
+ # "……" 等情况直接返回
+ if len(finals) != len(word):
+ return initials, finals
+
+ assert len(finals) == len(word)
+
+ # 儿 is not pronounced separately: merge it into the previous final (erhua)
+ new_initials = []
+ new_finals = []
+ for i, phn in enumerate(finals):
+ if i == len(finals) - 1 and word[i] == "儿" and phn in {
+ "er2", "er5"
+ } and word[-2:] not in self.not_erhua and new_finals:
+ new_finals[-1] = new_finals[-1][:-1] + "r" + new_finals[-1][-1]
+ else:
+ new_initials.append(initials[i])
+ new_finals.append(phn)
+
+ return new_initials, new_finals
+
# if merge_sentences, merge all sentences into one phone sequence
def _g2p(self,
sentences: List[str],
merge_sentences: bool=True,
with_erhua: bool=True) -> List[List[str]]:
+ """
+ Return: list of phoneme lists.
+ [['w', 'o3', 'm', 'en2', 'sp'], ...]
+ """
segments = sentences
phones_list = []
+
+ # split by punctuation
for seg in segments:
if self.use_rhy:
seg = self.rhy_predictor._clean_text(seg)
- phones = []
- # Replace all English words in the sentence
+
+ # remove all English words in the sentence
seg = re.sub('[a-zA-Z]+', '', seg)
+
+ # add prosody mark
if self.use_rhy:
seg = self.rhy_predictor.get_prediction(seg)
+
+ # [(word, pos), ...]
seg_cut = psg.lcut(seg)
- initials = []
- finals = []
+ # fix wordseg bad case for sandhi
seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut)
+
# 为了多音词获得更好的效果,这里采用整句预测
+ phones = []
+ initials = []
+ finals = []
if self.g2p_model == "g2pW":
try:
+ # undo prosody
if self.use_rhy:
seg = self.rhy_predictor._clean_text(seg)
+
+ # g2p
pinyins = self.g2pW_model(seg)[0]
except Exception:
- # g2pW采用模型采用繁体输入,如果有cover不了的简体词,采用g2pM预测
+ # g2pW takes traditional-Chinese input; fall back to g2pM for simplified words it cannot cover
print("[%s] not in g2pW dict,use g2pM" % seg)
pinyins = self.g2pM_model(seg, tone=True, char_split=False)
+
+ # do prosody
if self.use_rhy:
rhy_text = self.rhy_predictor.get_prediction(seg)
final_py = self.rhy_predictor.pinyin_align(pinyins,
rhy_text)
pinyins = final_py
+
pre_word_length = 0
for word, pos in seg_cut:
sub_initials = []
sub_finals = []
now_word_length = pre_word_length + len(word)
+
+ # skip english word
if pos == 'eng':
pre_word_length = now_word_length
continue
+
word_pinyins = pinyins[pre_word_length:now_word_length]
- # 矫正发音
+
+                    # polyphone disambiguation
word_pinyins = self.corrector.correct_pronunciation(
word, word_pinyins)
+
for pinyin, char in zip(word_pinyins, word):
if pinyin is None:
pinyin = char
+
pinyin = pinyin.replace("u:", "v")
+
if pinyin in self.pinyin2phone:
initial_final_list = self.pinyin2phone[
pinyin].split(" ")
@@ -257,28 +330,41 @@ class Frontend():
# If it's not pinyin (possibly punctuation) or no conversion is required
sub_initials.append(pinyin)
sub_finals.append(pinyin)
+
pre_word_length = now_word_length
+ # tone sandhi
sub_finals = self.tone_modifier.modified_tone(word, pos,
sub_finals)
+ # er hua
if with_erhua:
sub_initials, sub_finals = self._merge_erhua(
sub_initials, sub_finals, word, pos)
+
initials.append(sub_initials)
finals.append(sub_finals)
# assert len(sub_initials) == len(sub_finals) == len(word)
else:
+ # pypinyin, g2pM
for word, pos in seg_cut:
if pos == 'eng':
+ # skip english word
continue
+
+ # g2p
sub_initials, sub_finals = self._get_initials_finals(word)
+ # tone sandhi
sub_finals = self.tone_modifier.modified_tone(word, pos,
sub_finals)
+ # er hua
if with_erhua:
sub_initials, sub_finals = self._merge_erhua(
sub_initials, sub_finals, word, pos)
+
initials.append(sub_initials)
finals.append(sub_finals)
# assert len(sub_initials) == len(sub_finals) == len(word)
+
+            # flatten the nested per-word lists: sum(iterable, [])
initials = sum(initials, [])
finals = sum(finals, [])
@@ -287,111 +373,34 @@ class Frontend():
# we discriminate i, ii and iii
if c and c not in self.punc:
phones.append(c)
+ # replace punctuation by `sp`
if c and c in self.punc:
phones.append('sp')
+
if v and v not in self.punc and v not in self.rhy_phns:
phones.append(v)
- phones_list.append(phones)
- if merge_sentences:
- merge_list = sum(phones_list, [])
- # rm the last 'sp' to avoid the noise at the end
- # cause in the training data, no 'sp' in the end
- if merge_list[-1] == 'sp':
- merge_list = merge_list[:-1]
- phones_list = []
- phones_list.append(merge_list)
- return phones_list
- def _split_word_to_char(self, words):
- res = []
- for x in words:
- res.append(x)
- return res
-
- # if using ssml, have pingyin specified, assign pinyin to words
- def _g2p_assign(self,
- words: List[str],
- pinyin_spec: List[str],
- merge_sentences: bool=True) -> List[List[str]]:
- phones_list = []
- initials = []
- finals = []
-
- words = self._split_word_to_char(words[0])
- for pinyin, char in zip(pinyin_spec, words):
- sub_initials = []
- sub_finals = []
- pinyin = pinyin.replace("u:", "v")
- #self.pinyin2phone: is a dict with all pinyin mapped with sheng_mu yun_mu
- if pinyin in self.pinyin2phone:
- initial_final_list = self.pinyin2phone[pinyin].split(" ")
- if len(initial_final_list) == 2:
- sub_initials.append(initial_final_list[0])
- sub_finals.append(initial_final_list[1])
- elif len(initial_final_list) == 1:
- sub_initials.append('')
- sub_finals.append(initial_final_list[1])
- else:
- # If it's not pinyin (possibly punctuation) or no conversion is required
- sub_initials.append(pinyin)
- sub_finals.append(pinyin)
- initials.append(sub_initials)
- finals.append(sub_finals)
+ phones_list.append(phones)
- initials = sum(initials, [])
- finals = sum(finals, [])
- phones = []
- for c, v in zip(initials, finals):
- # NOTE: post process for pypinyin outputs
- # we discriminate i, ii and iii
- if c and c not in self.punc:
- phones.append(c)
- if c and c in self.punc:
- phones.append('sp')
- if v and v not in self.punc and v not in self.rhy_phns:
- phones.append(v)
- phones_list.append(phones)
+        # merge the split sub-sentences into one sentence
if merge_sentences:
+ # sub sentence phonemes
merge_list = sum(phones_list, [])
# rm the last 'sp' to avoid the noise at the end
# cause in the training data, no 'sp' in the end
if merge_list[-1] == 'sp':
merge_list = merge_list[:-1]
+
+ # sentence phonemes
phones_list = []
phones_list.append(merge_list)
- return phones_list
- def _merge_erhua(self,
- initials: List[str],
- finals: List[str],
- word: str,
- pos: str) -> List[List[str]]:
- # fix er1
- for i, phn in enumerate(finals):
- if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1':
- finals[i] = 'er2'
- if word not in self.must_erhua and (word in self.not_erhua or
- pos in {"a", "j", "nr"}):
- return initials, finals
- # "……" 等情况直接返回
- if len(finals) != len(word):
- return initials, finals
-
- assert len(finals) == len(word)
-
- new_initials = []
- new_finals = []
- for i, phn in enumerate(finals):
- if i == len(finals) - 1 and word[i] == "儿" and phn in {
- "er2", "er5"
- } and word[-2:] not in self.not_erhua and new_finals:
- new_finals[-1] = new_finals[-1][:-1] + "r" + new_finals[-1][-1]
- else:
- new_finals.append(phn)
- new_initials.append(initials[i])
- return new_initials, new_finals
+ return phones_list
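
A small illustration (assumed toy values, not the repo's code) of the post-processing done at the end of `_g2p`: flatten the per-word lists, map punctuation to `sp`, then merge sub-sentences and drop a trailing `sp`.

```python
punc = {',', '。'}
initials = [['n', 'h'], [',']]             # per-word initials for 你好,
finals = [['i3', 'ao3'], [',']]            # per-word finals

initials = sum(initials, [])                # flatten -> ['n', 'h', ',']
finals = sum(finals, [])                    # flatten -> ['i3', 'ao3', ',']

phones = []
for c, v in zip(initials, finals):
    if c and c not in punc:
        phones.append(c)
    if c and c in punc:
        phones.append('sp')                 # punctuation -> 'sp'
    if v and v not in punc:
        phones.append(v)

phones_list = [phones]                      # one sub-sentence
merge_list = sum(phones_list, [])           # merge_sentences=True
if merge_list[-1] == 'sp':                  # the training data has no final 'sp'
    merge_list = merge_list[:-1]

assert merge_list == ['n', 'i3', 'h', 'ao3']
```
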
def _p2id(self, phonemes: List[str]) -> np.ndarray:
+ """
+ Phoneme to Index
+ """
# replace unk phone with sp
phonemes = [
phn if phn in self.vocab_phones else "sp" for phn in phonemes
@@ -400,6 +409,9 @@ class Frontend():
return np.array(phone_ids, np.int64)
def _t2id(self, tones: List[str]) -> np.ndarray:
+ """
+ Tone to Index.
+ """
# replace unk phone with sp
tones = [tone if tone in self.vocab_tones else "0" for tone in tones]
tone_ids = [self.vocab_tones[item] for item in tones]
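
A minimal sketch of the unknown-symbol fallback in `_p2id` / `_t2id`, with a toy vocab (the real vocabs are loaded from the model's phone/tone id maps):

```python
import numpy as np

vocab_phones = {'sp': 0, 'n': 1, 'i3': 2}               # toy vocab
phonemes = ['n', 'i3', 'XYZ']                           # 'XYZ' is unknown
phonemes = [p if p in vocab_phones else 'sp' for p in phonemes]
phone_ids = np.array([vocab_phones[p] for p in phonemes], np.int64)
print(phone_ids)                                        # [1 2 0]
```
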
@@ -407,6 +419,9 @@ class Frontend():
def _get_phone_tone(self, phonemes: List[str],
get_tone_ids: bool=False) -> List[List[str]]:
+ """
+        Split phonemes into phones and tones.
+ """
phones = []
tones = []
if get_tone_ids and self.vocab_tones:
@@ -423,13 +438,14 @@ class Frontend():
-1] == 'r' and phone not in self.vocab_phones and phone[:
-1] in self.vocab_phones:
phones.append(phone[:-1])
- phones.append("er")
tones.append(tone)
+ phones.append("er")
tones.append("2")
else:
phones.append(phone)
tones.append(tone)
else:
+                    # initials get tone 0
phones.append(full_phone)
tones.append('0')
else:
@@ -443,6 +459,7 @@ class Frontend():
phones.append("er2")
else:
phones.append(phone)
+
return phones, tones
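
A sketch (toy vocab, assumed values) of the erhua handling in `_get_phone_tone` above: a rhotacized final such as `uar1` that is missing from the phone vocab is split back into `ua` plus an `er` phone with tone `2`.

```python
vocab_phones = {'ua', 'er', 'sp'}           # toy phone vocab
full_phone = 'uar1'
phone, tone = full_phone[:-1], full_phone[-1]           # 'uar', '1'

phones, tones = [], []
if phone[-1] == 'r' and phone not in vocab_phones and phone[:-1] in vocab_phones:
    phones.append(phone[:-1])               # 'ua' keeps its own tone
    tones.append(tone)
    phones.append('er')                     # the erhua part gets tone '2'
    tones.append('2')
else:
    phones.append(phone)
    tones.append(tone)

print(phones, tones)                        # ['ua', 'er'] ['1', '2']
```
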
def get_phonemes(self,
@@ -451,10 +468,16 @@ class Frontend():
with_erhua: bool=True,
robot: bool=False,
print_info: bool=False) -> List[List[str]]:
+ """
+ Main function to do G2P
+ """
+ # TN & Text Segmentation
sentences = self.text_normalizer.normalize(sentence)
+ # Prosody & WS & g2p & tone sandhi
phonemes = self._g2p(
sentences, merge_sentences=merge_sentences, with_erhua=with_erhua)
- # change all tones to `1`
+
+ # simulate robot pronunciation, change all tones to `1`
if robot:
new_phonemes = []
for sentence in phonemes:
@@ -466,6 +489,7 @@ class Frontend():
new_sentence.append(item)
new_phonemes.append(new_sentence)
phonemes = new_phonemes
+
if print_info:
print("----------------------------")
print("text norm results:")
@@ -476,25 +500,104 @@ class Frontend():
print("----------------------------")
return phonemes
- #@an added for ssml pinyin
+ def _split_word_to_char(self, words):
+ res = []
+ for x in words:
+ res.append(x)
+ return res
+
+    # if using SSML with pinyin specified, assign that pinyin to the words
+ def _g2p_assign(self,
+ words: List[str],
+ pinyin_spec: List[str],
+ merge_sentences: bool=True) -> List[List[str]]:
+ """
+        Replace phonemes with the pinyin specified via SSML.
+ """
+ phones_list = []
+ initials = []
+ finals = []
+
+        # to character list
+ words = self._split_word_to_char(words[0])
+
+ for pinyin, char in zip(pinyin_spec, words):
+ sub_initials = []
+ sub_finals = []
+ pinyin = pinyin.replace("u:", "v")
+
+            # self.pinyin2phone is a dict mapping each pinyin to its sheng_mu (initial) and yun_mu (final)
+ if pinyin in self.pinyin2phone:
+ initial_final_list = self.pinyin2phone[pinyin].split(" ")
+ if len(initial_final_list) == 2:
+ sub_initials.append(initial_final_list[0])
+ sub_finals.append(initial_final_list[1])
+ elif len(initial_final_list) == 1:
+ sub_initials.append('')
+ sub_finals.append(initial_final_list[1])
+ else:
+ # If it's not pinyin (possibly punctuation) or no conversion is required
+ sub_initials.append(pinyin)
+ sub_finals.append(pinyin)
+
+ initials.append(sub_initials)
+ finals.append(sub_finals)
+
+ initials = sum(initials, [])
+ finals = sum(finals, [])
+
+ phones = []
+ for c, v in zip(initials, finals):
+ # c for consonant, v for vowel
+ # NOTE: post process for pypinyin outputs
+ # we discriminate i, ii and iii
+ if c and c not in self.punc:
+ phones.append(c)
+            # replace punctuation with `sp`
+ if c and c in self.punc:
+ phones.append('sp')
+ if v and v not in self.punc and v not in self.rhy_phns:
+ phones.append(v)
+ phones_list.append(phones)
+
+ if merge_sentences:
+ merge_list = sum(phones_list, [])
+ # rm the last 'sp' to avoid the noise at the end
+ # cause in the training data, no 'sp' in the end
+ if merge_list[-1] == 'sp':
+ merge_list = merge_list[:-1]
+ phones_list = []
+ phones_list.append(merge_list)
+
+ return phones_list
+
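
A hedged sketch of the pinyin-to-phone lookup that `_g2p_assign` performs for SSML-specified pinyin; `pinyin2phone` here is a toy table, whereas the real one comes from `generate_lexicon(with_tone=True, with_erhua=False)`.

```python
pinyin2phone = {'dao3': 'd ao3', 'tu3': 't u3'}         # toy lexicon
pinyin_spec = ['dao3']
chars = ['倒']

initials, finals = [], []
for pinyin, char in zip(pinyin_spec, chars):
    pinyin = pinyin.replace('u:', 'v')
    if pinyin in pinyin2phone:
        parts = pinyin2phone[pinyin].split(' ')
        if len(parts) == 2:                             # initial + final
            initials.append(parts[0])
            finals.append(parts[1])
        else:                                           # final only
            initials.append('')
            finals.append(parts[0])
    else:                                               # punctuation etc.
        initials.append(pinyin)
        finals.append(pinyin)

print(initials, finals)                                 # ['d'] ['ao3']
```
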
def get_phonemes_ssml(self,
ssml_inputs: list,
merge_sentences: bool=True,
with_erhua: bool=True,
robot: bool=False,
print_info: bool=False) -> List[List[str]]:
+ """
+ Main function to do G2P with SSML support.
+ """
all_phonemes = []
for word_pinyin_item in ssml_inputs:
phonemes = []
+
+ # ['你喜欢', []] -> 你喜欢 []
sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item)
+
+ # TN & Text Segmentation
sentences = self.text_normalizer.normalize(sentence)
+
if len(pinyin_spec) == 0:
+            # g2p for words without pinyin specified
phonemes = self._g2p(
sentences,
merge_sentences=merge_sentences,
with_erhua=with_erhua)
else:
- # phonemes should be pinyin_spec
+            # word phonemes specified by the SSML pinyin spec
phonemes = self._g2p_assign(
sentences, pinyin_spec, merge_sentences=merge_sentences)
@@ -512,17 +615,24 @@ class Frontend():
new_phonemes.append(new_sentence)
all_phonemes = new_phonemes
+ if merge_sentences:
+ all_phonemes = [sum(all_phonemes, [])]
+
if print_info:
print("----------------------------")
print("text norm results:")
print(sentences)
print("----------------------------")
print("g2p results:")
- print(all_phonemes[0])
+ print(all_phonemes)
print("----------------------------")
- return [sum(all_phonemes, [])]
+
+ return all_phonemes
def add_sp_if_no(self, phonemes):
+ """
+        Append prosody mark sp4 (#4) at the sentence end if it is missing.
+ """
if not phonemes[-1][-1].startswith('sp'):
phonemes[-1].append('sp4')
return phonemes
@@ -542,8 +652,11 @@ class Frontend():
merge_sentences=merge_sentences,
print_info=print_info,
robot=robot)
+
+ # add #4 for sentence end.
if self.use_rhy:
phonemes = self.add_sp_if_no(phonemes)
+
result = {}
phones = []
tones = []
@@ -551,28 +664,33 @@ class Frontend():
temp_tone_ids = []
for part_phonemes in phonemes:
+
phones, tones = self._get_phone_tone(
part_phonemes, get_tone_ids=get_tone_ids)
+
if add_blank:
phones = insert_after_character(phones, blank_token)
+
if tones:
tone_ids = self._t2id(tones)
if to_tensor:
tone_ids = paddle.to_tensor(tone_ids)
temp_tone_ids.append(tone_ids)
+
if phones:
phone_ids = self._p2id(phones)
                # if paddle.to_tensor() is used under onnxruntime, the first call will be too slow
if to_tensor:
phone_ids = paddle.to_tensor(phone_ids)
temp_phone_ids.append(phone_ids)
+
if temp_tone_ids:
result["tone_ids"] = temp_tone_ids
if temp_phone_ids:
result["phone_ids"] = temp_phone_ids
+
return result
- # @an added for ssml
def get_input_ids_ssml(
self,
sentence: str,
@@ -584,12 +702,15 @@ class Frontend():
blank_token: str="",
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
- l_inputs = MixTextProcessor.get_pinyin_split(sentence)
+        # split the sentence by SSML tags
+ texts = MixTextProcessor.get_pinyin_split(sentence)
+
phonemes = self.get_phonemes_ssml(
- l_inputs,
+ texts,
merge_sentences=merge_sentences,
print_info=print_info,
robot=robot)
+
result = {}
phones = []
tones = []
@@ -599,21 +720,26 @@ class Frontend():
for part_phonemes in phonemes:
phones, tones = self._get_phone_tone(
part_phonemes, get_tone_ids=get_tone_ids)
+
if add_blank:
phones = insert_after_character(phones, blank_token)
+
if tones:
tone_ids = self._t2id(tones)
if to_tensor:
tone_ids = paddle.to_tensor(tone_ids)
temp_tone_ids.append(tone_ids)
+
if phones:
phone_ids = self._p2id(phones)
                # if paddle.to_tensor() is used under onnxruntime, the first call will be too slow
if to_tensor:
phone_ids = paddle.to_tensor(phone_ids)
temp_phone_ids.append(phone_ids)
+
if temp_tone_ids:
result["tone_ids"] = temp_tone_ids
if temp_phone_ids:
result["phone_ids"] = temp_phone_ids
+
return result
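
To tie the pieces together, a hedged usage sketch of the refactored frontend; the vocab path and input sentences are placeholders, and the SSML form with `<speak>` / `<say-as pinyin='...'>` follows the tags exercised by the new unit tests below.

```python
from paddlespeech.t2s.frontend.zh_frontend import Frontend

# phone_id_map.txt is a placeholder path to a model's phone vocab file
frontend = Frontend(phone_vocab_path='phone_id_map.txt', g2p_model='g2pW')

# plain text -> phone ids
out = frontend.get_input_ids('我爱北京。', merge_sentences=True, print_info=True)
print(out['phone_ids'][0])

# SSML text -> phone ids, with the pronunciation of 倒 forced to dao3
ssml = "<speak>前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上。</speak>"
out = frontend.get_input_ids_ssml(ssml, merge_sentences=True)
print(out['phone_ids'])
```
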
diff --git a/runtime/README.md b/runtime/README.md
index 553bb29ad..0e9c243e9 100644
--- a/runtime/README.md
+++ b/runtime/README.md
@@ -2,7 +2,7 @@
## Environment
We develop under:
-* python - 3.7
+* python - >=3.8
* docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2-gpu-cuda10.2-cudnn7`
* os - Ubuntu 16.04.7 LTS
* gcc/g++/gfortran - 8.2.0
@@ -98,7 +98,7 @@ please install paddlepaddle >= 2.4rc
```
-cd $YOUR_ENV_PATH/lib/python3.7/site-packages/paddle/fluid
+cd $YOUR_ENV_PATH/lib/python3.8/site-packages/paddle/fluid
patchelf --set-soname libpaddle.so libpaddle.so
```
diff --git a/runtime/tools/venv.sh b/runtime/tools/venv.sh
index 3952988c6..2aa7e5095 100755
--- a/runtime/tools/venv.sh
+++ b/runtime/tools/venv.sh
@@ -1,5 +1,5 @@
#!/bin/bash
set -ex
-PYTHON=python3.7
+PYTHON=python3.8
test -d venv || virtualenv -p ${PYTHON} venv
diff --git a/setup.py b/setup.py
index 07b411bd0..af7c4dc3d 100644
--- a/setup.py
+++ b/setup.py
@@ -40,6 +40,8 @@ base = [
"hyperpyyaml",
"inflect",
"jsonlines",
+ # paddleaudio align with librosa==0.8.1, which need numpy==1.23.x
+ "numpy==1.23.5",
"librosa==0.8.1",
"scipy>=1.4.0",
"loguru",
@@ -260,6 +262,7 @@ setup_info = dict(
long_description=read("README.md"),
long_description_content_type="text/markdown",
keywords=[
+ "SSL"
"speech",
"asr",
"tts",
@@ -268,12 +271,19 @@ setup_info = dict(
"text frontend",
"MFA",
"paddlepaddle",
+ "paddleaudio",
+ "streaming asr",
+ "streaming tts",
"beam search",
"ctcdecoder",
"deepspeech2",
+ "wav2vec2",
+ "hubert",
+ "wavlm",
"transformer",
"conformer",
"fastspeech2",
+ "hifigan",
"gan vocoders",
],
python_requires='>=3.7',
diff --git a/tests/unit/tts/test_enfrontend.py b/tests/unit/tts/test_enfrontend.py
new file mode 100644
index 000000000..4f8c49305
--- /dev/null
+++ b/tests/unit/tts/test_enfrontend.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddlespeech.t2s.frontend.en_frontend import English as EnFrontend
+
+if __name__ == '__main__':
+
+ fe = EnFrontend()
+
+ text = "AI for Sceience"
+ phonemes = fe.phoneticize(text)
+ print(text)
+ print(phonemes)
+
+ text = "eight"
+ phonemes = fe.phoneticize(text)
+ print(text)
+ print(phonemes)
diff --git a/tests/unit/tts/test_mixfrontend.py b/tests/unit/tts/test_mixfrontend.py
new file mode 100644
index 000000000..5751dd2a7
--- /dev/null
+++ b/tests/unit/tts/test_mixfrontend.py
@@ -0,0 +1,444 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+import tempfile
+
+from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
+
+# mix zh & en phonemes
+phone_id_str = """
+<pad> 0
+<unk> 1
+AA0 2
+AA1 3
+AA2 4
+AE0 5
+AE1 6
+AE2 7
+AH0 8
+AH1 9
+AH2 10
+AO0 11
+AO1 12
+AO2 13
+AW0 14
+AW1 15
+AW2 16
+AY0 17
+AY1 18
+AY2 19
+B 20
+CH 21
+D 22
+DH 23
+EH0 24
+EH1 25
+EH2 26
+ER0 27
+ER1 28
+ER2 29
+EY0 30
+EY1 31
+EY2 32
+F 33
+G 34
+HH 35
+IH0 36
+IH1 37
+IH2 38
+IY0 39
+IY1 40
+IY2 41
+JH 42
+K 43
+L 44
+M 45
+N 46
+NG 47
+OW0 48
+OW1 49
+OW2 50
+OY0 51
+OY1 52
+OY2 53
+P 54
+R 55
+S 56
+SH 57
+T 58
+TH 59
+UH0 60
+UH1 61
+UH2 62
+UW0 63
+UW1 64
+UW2 65
+V 66
+W 67
+Y 68
+Z 69
+ZH 70
+a1 71
+a2 72
+a3 73
+a4 74
+a5 75
+ai1 76
+ai2 77
+ai3 78
+ai4 79
+ai5 80
+air2 81
+air3 82
+air4 83
+an1 84
+an2 85
+an3 86
+an4 87
+an5 88
+ang1 89
+ang2 90
+ang3 91
+ang4 92
+ang5 93
+angr2 94
+angr4 95
+anr1 96
+anr3 97
+anr4 98
+ao1 99
+ao2 100
+ao3 101
+ao4 102
+ao5 103
+aor1 104
+aor3 105
+aor4 106
+aor5 107
+ar2 108
+ar3 109
+ar4 110
+ar5 111
+b 112
+c 113
+ch 114
+d 115
+e1 116
+e2 117
+e3 118
+e4 119
+e5 120
+ei1 121
+ei2 122
+ei3 123
+ei4 124
+ei5 125
+eir4 126
+en1 127
+en2 128
+en3 129
+en4 130
+en5 131
+eng1 132
+eng2 133
+eng3 134
+eng4 135
+eng5 136
+engr4 137
+enr1 138
+enr2 139
+enr3 140
+enr4 141
+enr5 142
+er1 143
+er2 144
+er3 145
+er4 146
+er5 147
+f 148
+g 149
+h 150
+i1 151
+i2 152
+i3 153
+i4 154
+i5 155
+ia1 156
+ia2 157
+ia3 158
+ia4 159
+ia5 160
+ian1 161
+ian2 162
+ian3 163
+ian4 164
+ian5 165
+iang1 166
+iang2 167
+iang3 168
+iang4 169
+iang5 170
+iangr4 171
+ianr1 172
+ianr2 173
+ianr3 174
+ianr4 175
+ianr5 176
+iao1 177
+iao2 178
+iao3 179
+iao4 180
+iao5 181
+iaor1 182
+iaor2 183
+iaor3 184
+iaor4 185
+iar1 186
+iar3 187
+iar4 188
+ie1 189
+ie2 190
+ie3 191
+ie4 192
+ie5 193
+ii1 194
+ii2 195
+ii3 196
+ii4 197
+ii5 198
+iii1 199
+iii2 200
+iii3 201
+iii4 202
+iii5 203
+iiir1 204
+iiir4 205
+iir2 206
+in1 207
+in2 208
+in3 209
+in4 210
+in5 211
+ing1 212
+ing2 213
+ing3 214
+ing4 215
+ing5 216
+ingr1 217
+ingr2 218
+ingr3 219
+ingr4 220
+inr1 221
+inr4 222
+io1 223
+io3 224
+io5 225
+iong1 226
+iong2 227
+iong3 228
+iong4 229
+iong5 230
+iou1 231
+iou2 232
+iou3 233
+iou4 234
+iou5 235
+iour1 236
+iour2 237
+iour3 238
+iour4 239
+ir1 240
+ir2 241
+ir3 242
+ir4 243
+ir5 244
+j 245
+k 246
+l 247
+m 248
+n 249
+o1 250
+o2 251
+o3 252
+o4 253
+o5 254
+ong1 255
+ong2 256
+ong3 257
+ong4 258
+ong5 259
+ongr4 260
+or2 261
+ou1 262
+ou2 263
+ou3 264
+ou4 265
+ou5 266
+our2 267
+our3 268
+our4 269
+our5 270
+p 271
+q 272
+r 273
+s 274
+sh 275
+sil 276
+sp 277
+spl 278
+spn 279
+t 280
+u1 281
+u2 282
+u3 283
+u4 284
+u5 285
+ua1 286
+ua2 287
+ua3 288
+ua4 289
+ua5 290
+uai1 291
+uai2 292
+uai3 293
+uai4 294
+uai5 295
+uair4 296
+uan1 297
+uan2 298
+uan3 299
+uan4 300
+uan5 301
+uang1 302
+uang2 303
+uang3 304
+uang4 305
+uang5 306
+uangr4 307
+uanr1 308
+uanr2 309
+uanr3 310
+uanr4 311
+uanr5 312
+uar1 313
+uar2 314
+uar4 315
+uei1 316
+uei2 317
+uei3 318
+uei4 319
+uei5 320
+ueir1 321
+ueir2 322
+ueir3 323
+ueir4 324
+uen1 325
+uen2 326
+uen3 327
+uen4 328
+uen5 329
+ueng1 330
+ueng2 331
+ueng3 332
+ueng4 333
+uenr1 334
+uenr2 335
+uenr3 336
+uenr4 337
+uo1 338
+uo2 339
+uo3 340
+uo4 341
+uo5 342
+uor1 343
+uor2 344
+uor3 345
+uor5 346
+ur1 347
+ur2 348
+ur3 349
+ur4 350
+ur5 351
+v1 352
+v2 353
+v3 354
+v4 355
+v5 356
+van1 357
+van2 358
+van3 359
+van4 360
+van5 361
+vanr1 362
+vanr2 363
+vanr3 364
+vanr4 365
+ve1 366
+ve2 367
+ve3 368
+ve4 369
+ve5 370
+ver3 371
+ver4 372
+vn1 373
+vn2 374
+vn3 375
+vn4 376
+vn5 377
+vnr2 378
+vr3 379
+x 380
+z 381
+zh 382
+, 383
+. 384
+? 385
+! 386
+<eos> 387
+"""
+
+if __name__ == '__main__':
+ with tempfile.NamedTemporaryFile(mode='wt') as f:
+ phone_ids = phone_id_str.split()
+ for phone, id in zip(phone_ids[::2], phone_ids[1::2]):
+ f.write(f"{phone} {id}")
+ f.write('\n')
+ f.flush()
+
+ frontend = MixFrontend(phone_vocab_path=f.name)
+
+ text = "hello, 我爱北京天安们,what about you."
+ print(text)
+ # [('hello, ', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')]
+ segs = frontend.split_by_lang(text)
+ print(segs)
+
+ text = "hello?!!我爱北京天安们,what about you."
+ print(text)
+ # [('hello?!!', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')]
+ segs = frontend.split_by_lang(text)
+ print(segs)
+
+ text = " hello,我爱北京天安们,what about you."
+ print(text)
+ # [(' hello,', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')]
+ segs = frontend.split_by_lang(text)
+ print(segs)
+
+    # SSML/XML tags are not handled well here: parse the SSML first, then do the zh/en split.
+ text = "我们的声学模型使用了 Fast Speech Two。前浪倒在沙滩上,沙滩上倒了一堆土。 想象干干的树干倒了, 里面有个干尸,不知是被谁干死的。"
+ print(text)
+ # [('', 'en'), ('我们的声学模型使用了 ', 'zh'), ('Fast Speech Two。', 'en'), ('前浪<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒', 'zh'), ('say-as>', 'en'), ('在沙滩上,沙滩上倒了一堆<', 'zh'), ("say-as pinyin='tu3'>", 'en'), ('土', 'zh'), ('say-as>。 ', 'en'), ('想象<', 'zh'), ("say-as pinyin='gan1 gan1'>", 'en'), ('干干', 'zh'), ('say-as>', 'en'), ('的树干<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒', 'zh'), ('say-as>', 'en'), ('了, 里面有个干尸,不知是被谁<', 'zh'), ("say-as pinyin='gan4'>", 'en'), ('干', 'zh'), ('say-as>', 'en'), ('死的。', 'zh'), ('speak>', 'en')]
+ segs = frontend.split_by_lang(text)
+ print(segs)
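
As the comment above notes, `split_by_lang` on raw SSML mixes tag fragments into the segments; below is a hedged sketch of the suggested order (parse the SSML first, then split each plain-text piece) using `MixTextProcessor.get_dom_split`, which the new test_ssml.py exercises. The vocab path is a placeholder.

```python
from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor

frontend = MixFrontend(phone_vocab_path='phone_id_map.txt')   # placeholder path
text = "<speak>前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上。what about you.</speak>"

# 1) strip the SSML structure into plain-text pieces
pieces = MixTextProcessor.get_dom_split(text)
# 2) then run the zh/en split on each piece
for piece in pieces:
    print(piece, frontend.split_by_lang(piece))
```
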
diff --git a/tests/unit/tts/test_ssml.py b/tests/unit/tts/test_ssml.py
new file mode 100644
index 000000000..4c3e9d538
--- /dev/null
+++ b/tests/unit/tts/test_ssml.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor
+
+if __name__ == '__main__':
+ text = "你好吗,我们的声学模型使用了 Fast Speech Two。前浪倒在沙滩上,沙滩上倒了一堆土。 想象干干的树干倒了, 里面有个干尸,不知是被谁干死的。thank you."
+
+ # SSML: 13
+ # 0 ['你好吗,', []]
+ # 1 ['我们的声学模型使用了FastSpeechTwo。前浪', []]
+ # 2 ['倒', ['dao3']]
+ # 3 ['在沙滩上,沙滩上倒了一堆', []]
+ # 4 ['土', ['tu3']]
+ # 5 ['。想象', []]
+ # 6 ['干干', ['gan1', 'gan1']]
+ # 7 ['的树干', []]
+ # 8 ['倒', ['dao3']]
+ # 9 ['了,里面有个干尸,不知是被谁', []]
+ # 10 ['干', ['gan4']]
+ # 11 ['死的。', []]
+ # 12 ['thank you.', []]
+ inputs = MixTextProcessor.get_pinyin_split(text)
+ print(f"SSML get_pinyin_split: {len(inputs)}")
+ for i, sub in enumerate(inputs):
+ print(i, sub)
+ print()
+
+ # SSML get_dom_split: 13
+ # 0 你好吗,
+ # 1 我们的声学模型使用了 Fast Speech Two。前浪
+ # 2 倒
+ # 3 在沙滩上,沙滩上倒了一堆
+ # 4 土
+ # 5 。 想象
+ # 6 干干
+ # 7 的树干
+ # 8 倒
+ # 9 了, 里面有个干尸,不知是被谁
+ # 10 干
+ # 11 死的。
+ # 12 thank you.
+ inputs = MixTextProcessor.get_dom_split(text)
+ print(f"SSML get_dom_split: {len(inputs)}")
+ for i, sub in enumerate(inputs):
+ print(i, sub)
+ print()
+
+    # SSML object.get_xml_content: 246
+ # 我们的声学模型使用了 Fast Speech Two。前浪倒在沙滩上,沙滩上倒了一堆土。 想象干干的树干倒了, 里面有个干尸,不知是被谁干死的。
+ outs = MixTextProcessor().get_xml_content(text)
+ print(f"SSML object.get_pinyin_split: {len(outs)}")
+ print(outs)
+ print()
+
+    # SSML object.get_content_split: 3
+    # 0 你好吗,
+ # 1 我们的声学模型使用了 Fast Speech Two。前浪倒在沙滩上,沙滩上倒了一堆土。 想象干干的树干
+ # 倒了, 里面有个干尸,不知是被谁干死的。
+ # 2 thank you.
+ outs = MixTextProcessor().get_content_split(text)
+ print(f"SSML object.get_content_split: {len(outs)}")
+ for i, sub in enumerate(outs):
+ print(i, sub)
+ print()
+
+ import json
+ import xmltodict
+ text = "我们的声学模型使用了 Fast Speech Two。前浪倒在沙滩上,沙滩上倒了一堆土。 想象干干的树干倒了, 里面有个干尸,不知是被谁干死的。"
+ ssml = xmltodict.parse(text)
+ print(json.dumps(ssml))
+ print(ssml['speak'].keys())
+ print(ssml['speak']['#text'])
+ print(ssml['speak']['say-as'])
diff --git a/tools/Makefile b/tools/Makefile
index a5a4485da..c6c667cd0 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -1,5 +1,5 @@
SHELL:= /bin/bash
-PYTHON:= python3.7
+PYTHON:= python3.8
CXX ?= g++
CC ?= gcc # used for sph2pipe