diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f72b44ac6..44bbd5cad 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,12 +26,12 @@ repos: - --no-sort-keys - --autofix - id: check-merge-conflict - - id: flake8 - aergs: - - --ignore=E501,E228,E226,E261,E266,E128,E402,W503 - - --builtins=G,request - - --jobs=1 - exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$ + # - id: flake8 + # aergs: + # - --ignore=E501,E228,E226,E261,E266,E128,E402,W503 + # - --builtins=G,request + # - --jobs=1 + # exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$ - repo : https://github.com/Lucas-C/pre-commit-hooks rev: v1.0.1 diff --git a/README.md b/README.md index c6e9fc209..19ec61cb0 100644 --- a/README.md +++ b/README.md @@ -227,13 +227,13 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision ## Installation -We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.7* and *paddlepaddle>=2.4.1*. +We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.8* and *paddlepaddle<=2.5.1*. Some new versions of Paddle do not have support for adaptation in PaddleSpeech, so currently only versions 2.5.1 and earlier can be supported. ### **Dependency Introduction** + gcc >= 4.8.5 -+ paddlepaddle >= 2.4.1 -+ python >= 3.7 ++ paddlepaddle <= 2.5.1 ++ python >= 3.8 + OS support: Linux(recommend), Windows, Mac OSX PaddleSpeech depends on paddlepaddle. For installation, please refer to the official website of [paddlepaddle](https://www.paddlepaddle.org.cn/en) and choose according to your own machine. Here is an example of the cpu version. @@ -893,10 +893,6 @@ The Text-to-Speech module is originally called [Parakeet](https://github.com/Pad - **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): Use PaddleSpeech TTS and ASR to clone voice from videos.** -
- -
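The README hunk above tightens the supported environment to Python >= 3.8 and paddlepaddle <= 2.5.1. As a rough illustration of what that constraint means in practice, the sketch below checks both versions at runtime; the helper name and the deliberately naive version parsing are assumptions for illustration, not part of this PR.

```python
# Illustrative only: enforce the environment constraints stated in the updated README
# (Python >= 3.8, paddlepaddle <= 2.5.1). Version parsing here is deliberately naive.
import sys


def check_paddlespeech_env():
    assert sys.version_info >= (3, 8), "PaddleSpeech requires Python >= 3.8"
    import paddle  # raises ImportError if paddlepaddle is not installed
    parts = tuple(int(p) for p in paddle.__version__.split(".")[:3] if p.isdigit())
    assert parts <= (2, 5, 1), (
        f"paddlepaddle {paddle.__version__} is newer than 2.5.1; "
        "only 2.5.1 and earlier are currently supported by PaddleSpeech")


if __name__ == "__main__":
    check_paddlespeech_env()
    print("environment looks compatible")
```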
- ## Citation diff --git a/README_cn.md b/README_cn.md index eabb2ead4..7aef30871 100644 --- a/README_cn.md +++ b/README_cn.md @@ -8,7 +8,7 @@ - + @@ -237,12 +237,12 @@ ## 安装 -我们强烈建议用户在 **Linux** 环境下,*3.7* 以上版本的 *python* 上安装 PaddleSpeech。 +我们强烈建议用户在 **Linux** 环境下,*3.8* 以上版本的 *python* 上安装 PaddleSpeech。同时,有一些Paddle新版本的内容没有在做适配的支持,因此目前只能使用2.5.1及之前的版本。 ### 相关依赖 + gcc >= 4.8.5 -+ paddlepaddle >= 2.4.1 -+ python >= 3.7 ++ paddlepaddle <= 2.5.1 ++ python >= 3.8 + linux(推荐), mac, windows PaddleSpeech 依赖于 paddlepaddle,安装可以参考[ paddlepaddle 官网](https://www.paddlepaddle.org.cn/),根据自己机器的情况进行选择。这里给出 cpu 版本示例,其它版本大家可以根据自己机器的情况进行安装。 diff --git a/audio/setup.py b/audio/setup.py index 0fe6e5995..f7d459446 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -38,8 +38,10 @@ VERSION = '1.2.0' COMMITID = 'none' base = [ + # paddleaudio align with librosa==0.8.1, which need numpy==1.23.x + "librosa==0.8.1", + "numpy==1.23.5", "kaldiio", - "librosa>=0.10.0", "pathos", "pybind11", "parameterized", diff --git a/demos/speech_web/speech_server/src/ge2e_clone.py b/demos/speech_web/speech_server/src/ge2e_clone.py index 83c2b3f35..0711a40af 100644 --- a/demos/speech_web/speech_server/src/ge2e_clone.py +++ b/demos/speech_web/speech_server/src/ge2e_clone.py @@ -38,23 +38,9 @@ class VoiceCloneGE2E(): output_dir = os.path.dirname(out_wav) ngpu = get_ngpu() - cmd = f""" - python3 {self.BIN_DIR}/voice_cloning.py \ - --am={self.am} \ - --am_config={self.am_config} \ - --am_ckpt={self.am_ckpt} \ - --am_stat={self.am_stat} \ - --voc={self.voc} \ - --voc_config={self.voc_config} \ - --voc_ckpt={self.voc_ckpt} \ - --voc_stat={self.voc_stat} \ - --ge2e_params_path={self.ge2e_params_path} \ - --text="{text}" \ - --input-dir={ref_audio_dir} \ - --output-dir={output_dir} \ - --phones-dict={self.phones_dict} \ - --ngpu={ngpu} - """ + cmd = f"""python {self.BIN_DIR}/voice_cloning.py --am={self.am} --am_config={self.am_config} --am_ckpt={self.am_ckpt} --am_stat={self.am_stat} --voc={self.voc} --voc_config={self.voc_config} --voc_ckpt={self.voc_ckpt} --voc_stat={self.voc_stat} --ge2e_params_path={self.ge2e_params_path} --text="{text}" --input-dir={ref_audio_dir} --output-dir={output_dir} --phones-dict={self.phones_dict} --ngpu={ngpu}""" + + print(cmd) output_name = os.path.join(output_dir, full_file_name) return run_cmd(cmd, output_name=output_name) diff --git a/docs/source/install.md b/docs/source/install.md index a4dae3640..3607d7185 100644 --- a/docs/source/install.md +++ b/docs/source/install.md @@ -95,7 +95,7 @@ bash ``` Then you can create a conda virtual environment using the following command: ```bash -conda create -y -p tools/venv python=3.7 +conda create -y -p tools/venv python=3.8 ``` Activate the conda virtual environment: ```bash @@ -181,7 +181,7 @@ $HOME/miniconda3/bin/conda init # use the "bash" command to make the conda environment works bash # create a conda virtual environment -conda create -y -p tools/venv python=3.7 +conda create -y -p tools/venv python=3.8 # Activate the conda virtual environment: conda activate tools/venv # Install the conda packages diff --git a/docs/source/install_cn.md b/docs/source/install_cn.md index 7f05cdfe4..01ae21fe7 100644 --- a/docs/source/install_cn.md +++ b/docs/source/install_cn.md @@ -91,7 +91,7 @@ bash ``` 然后你可以创建一个 conda 的虚拟环境: ```bash -conda create -y -p tools/venv python=3.7 +conda create -y -p tools/venv python=3.8 ``` 激活 conda 虚拟环境: ```bash @@ -173,7 +173,7 @@ $HOME/miniconda3/bin/conda init # 激活 conda bash # 创建 Conda 虚拟环境 -conda create -y -p tools/venv python=3.7 +conda create 
-y -p tools/venv python=3.8 # 激活 Conda 虚拟环境: conda activate tools/venv # 安装 Conda 包 diff --git a/docs/topic/package_release/python_package_release.md b/docs/topic/package_release/python_package_release.md index cb1029e7b..c735e0bd8 100644 --- a/docs/topic/package_release/python_package_release.md +++ b/docs/topic/package_release/python_package_release.md @@ -165,8 +165,7 @@ docker run -it xxxxxx 设置python: ```bash -export PATH="/opt/python/cp37-cp37m/bin/:$PATH" -#export PATH="/opt/python/cp38-cp38/bin/:$PATH" +export PATH="/opt/python/cp38-cp38/bin/:$PATH" #export PATH="/opt/python/cp39-cp39/bin/:$PATH" ``` diff --git a/examples/aishell/asr1/RESULTS.md b/examples/aishell/asr1/RESULTS.md index 643d0e224..be771ba59 100644 --- a/examples/aishell/asr1/RESULTS.md +++ b/examples/aishell/asr1/RESULTS.md @@ -1,14 +1,31 @@ # Aishell -## Conformer -paddle version: 2.2.2 -paddlespeech version: 1.0.1 -| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | -| --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug| test | ctc_prefix_beam_search | - | 0.0480 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 | +## RoFormer Streaming +paddle version: 2.5.0 +paddlespeech version: 1.5.0 + +Tesla V100-SXM2-32GB: 1 node, 4 card +Global BachSize: 32 * 4 +Training Done: 1 day, 12:56:39.639646 +### `decoding.decoding_chunk_size=16` + +> chunk_size=16, ((16 - 1) * 4 + 7) * 10ms = (16 * 4 + 3) * 10ms = 670ms + +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention | 16, -1 | - | 5.63 | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_greedy_search | 16, -1 | - | 6.13 | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_prefix_beam_search | 16, -1 | - | 6.13 | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | 5.44 | + +### `decoding.decoding_chunk_size=-1` + +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention | -1, -1 | - | 5.39 | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_greedy_search | -1, -1 | - | 5.51 | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_prefix_beam_search | -1, -1 | - | 5.51 | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention_rescoring | -1, -1 | - | 4.99 | ## Conformer Streaming @@ -24,6 +41,17 @@ Need set `decoding.decoding_chunk_size=16` when decoding. 
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | 0.051968 | +## Conformer +paddle version: 2.2.2 +paddlespeech version: 1.0.1 +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_prefix_beam_search | - | 0.0480 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 | + + ## Transformer | Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | diff --git a/examples/aishell/asr1/conf/chunk_roformer.yaml b/examples/aishell/asr1/conf/chunk_roformer.yaml new file mode 100644 index 000000000..a4051a021 --- /dev/null +++ b/examples/aishell/asr1/conf/chunk_roformer.yaml @@ -0,0 +1,98 @@ +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 # sublayer output dropout + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + cnn_module_kernel: 15 + use_cnn_module: True + activation_type: 'swish' + pos_enc_layer_type: 'rope_pos' # abs_pos, rel_pos, rope_pos + selfattention_layer_type: 'rel_selfattn' # unused + causal: true + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false +# decoder related +decoder: transformer # transformer, bitransformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + r_num_blocks: 0 # only for bitransformer + dropout_rate: 0.1 # sublayer output dropout + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + reverse_weight: 0.0 # only for bitransformer + length_normalized_loss: false + init_type: 'kaiming_uniform' # !Warning: need to convergence + +########################################### +# Data # +########################################### + +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test + + +########################################### +# Dataloader # +########################################### + +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 32 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 2 +subsampling_factor: 1 +num_encs: 1 
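The latency note quoted in the RESULTS.md hunk above, ((16 - 1) * 4 + 7) * 10ms = 670ms, follows from the conv2d front end used by this config: subsampling by a factor of 4, roughly 7 frames of convolution context, and a 10 ms frame shift. A small sketch of that arithmetic (the helper name and default arguments are illustrative, taken directly from the formula in the note):

```python
# Sketch of the chunk-latency arithmetic quoted in RESULTS.md (not project code):
# latency = ((chunk_size - 1) * subsampling + context) * frame_shift_ms
def chunk_latency_ms(chunk_size, subsampling=4, context=7, frame_shift_ms=10):
    return ((chunk_size - 1) * subsampling + context) * frame_shift_ms


print(chunk_latency_ms(16))   # 670 ms, the decoding_chunk_size=16 case above
print(chunk_latency_ms(8))    # 350 ms for a hypothetical smaller chunk
```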
+ +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 1 +global_grad_clip: 5.0 +dist_sampler: True +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/chunk_roformer_bidecoder.yaml b/examples/aishell/asr1/conf/chunk_roformer_bidecoder.yaml new file mode 100644 index 000000000..aa3a0aca7 --- /dev/null +++ b/examples/aishell/asr1/conf/chunk_roformer_bidecoder.yaml @@ -0,0 +1,98 @@ +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 # sublayer output dropout + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + cnn_module_kernel: 15 + use_cnn_module: True + activation_type: 'swish' + pos_enc_layer_type: 'rope_pos' # abs_pos, rel_pos, rope_pos + selfattention_layer_type: 'rel_selfattn' # unused + causal: true + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false +# decoder related +decoder: bitransformer # transformer, bitransformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 3 + r_num_blocks: 3 # only for bitransformer + dropout_rate: 0.1 # sublayer output dropout + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + reverse_weight: 0.3 # only for bitransformer + length_normalized_loss: false + init_type: 'kaiming_uniform' # !Warning: need to convergence + +########################################### +# Data # +########################################### + +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test + + +########################################### +# Dataloader # +########################################### + +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 32 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 2 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 1 +global_grad_clip: 5.0 +dist_sampler: True +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git 
a/examples/csmsc/tts2/local/inference_xpu.sh b/examples/csmsc/tts2/local/inference_xpu.sh new file mode 100644 index 000000000..5d8d92054 --- /dev/null +++ b/examples/csmsc/tts2/local/inference_xpu.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=speedyspeech_csmsc \ + --voc=pwgan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device xpu +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=speedyspeech_csmsc \ + --voc=mb_melgan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device xpu +fi + +# hifigan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=speedyspeech_csmsc \ + --voc=hifigan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device xpu +fi diff --git a/examples/csmsc/tts2/local/synthesize_e2e_xpu.sh b/examples/csmsc/tts2/local/synthesize_e2e_xpu.sh new file mode 100644 index 000000000..0285f42cd --- /dev/null +++ b/examples/csmsc/tts2/local/synthesize_e2e_xpu.sh @@ -0,0 +1,122 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nxpu=1 +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nxpu=1 +fi + +# the pretrained models haven't release now +# style melgan +# style melgan's 
Dygraph to Static Graph is not ready now +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 + # --inference_dir=${train_output_path}/inference +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nxpu=1 +fi + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nxpu=1 +fi diff --git a/examples/csmsc/tts2/local/synthesize_xpu.sh b/examples/csmsc/tts2/local/synthesize_xpu.sh new file mode 100644 index 000000000..801789c26 --- /dev/null +++ b/examples/csmsc/tts2/local/synthesize_xpu.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + 
FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi + +# style melgan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in hifigan syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --tones_dict=dump/tone_id_map.txt \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi diff --git a/examples/csmsc/tts2/local/train_xpu.sh b/examples/csmsc/tts2/local/train_xpu.sh new file mode 100644 index 000000000..0c07c27fc --- /dev/null +++ b/examples/csmsc/tts2/local/train_xpu.sh @@ -0,0 +1,16 @@ + +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=0 \ + --nxpu=1 \ + --phones-dict=dump/phone_id_map.txt \ + --tones-dict=dump/tone_id_map.txt \ + --use-relative-path=True diff --git a/examples/csmsc/tts2/run_xpu.sh b/examples/csmsc/tts2/run_xpu.sh new file mode 
100644 index 000000000..4b867961f --- /dev/null +++ b/examples/csmsc/tts2/run_xpu.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e +source path.sh + +xpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_76.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run_xpu.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + FLAGS_selected_xpus=${xpus} ./local/train_xpu.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan by default + FLAGS_selected_xpus=${xpus} ./local/synthesize_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan by default + FLAGS_selected_xpus=${xpus} ./local/synthesize_e2e_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model + FLAGS_selected_xpus=${xpus} ./local/inference_xpu.sh ${train_output_path} || exit -1 +fi diff --git a/examples/csmsc/tts3/local/inference_xpu.sh b/examples/csmsc/tts3/local/inference_xpu.sh new file mode 100644 index 000000000..541dc6262 --- /dev/null +++ b/examples/csmsc/tts3/local/inference_xpu.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_csmsc \ + --voc=pwgan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --device xpu +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_csmsc \ + --voc=mb_melgan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --device xpu +fi + +# hifigan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_csmsc \ + --voc=hifigan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --device xpu +fi + +# wavernn +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_csmsc \ + --voc=wavernn_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --device xpu +fi \ No newline at end of file diff --git a/examples/csmsc/tts3/local/synthesize_e2e_xpu.sh b/examples/csmsc/tts3/local/synthesize_e2e_xpu.sh new file mode 100644 index 000000000..bb58a37c8 --- /dev/null +++ b/examples/csmsc/tts3/local/synthesize_e2e_xpu.sh @@ -0,0 
+1,119 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nxpu=1 +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nxpu=1 +fi + +# the pretrained models haven't release now +# style melgan +# style melgan's Dygraph to Static Graph is not ready now +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 + # --inference_dir=${train_output_path}/inference +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in hifigan syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nxpu=1 +fi + + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + 
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nxpu=1 +fi diff --git a/examples/csmsc/tts3/local/synthesize_xpu.sh b/examples/csmsc/tts3/local/synthesize_xpu.sh new file mode 100644 index 000000000..fac8677a7 --- /dev/null +++ b/examples/csmsc/tts3/local/synthesize_xpu.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi + +# style melgan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in hifigan syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + 
--output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi diff --git a/examples/csmsc/tts3/local/train_xpu.sh b/examples/csmsc/tts3/local/train_xpu.sh new file mode 100644 index 000000000..a7d889888 --- /dev/null +++ b/examples/csmsc/tts3/local/train_xpu.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=0 \ + --nxpu=1 \ + --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/csmsc/tts3/run_xpu.sh b/examples/csmsc/tts3/run_xpu.sh new file mode 100644 index 000000000..4922d6b4b --- /dev/null +++ b/examples/csmsc/tts3/run_xpu.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e +source path.sh + +xpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_153.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... 
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + FLAGS_selected_xpus=${xpus} ./local/train_xpu.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan by default + FLAGS_selected_xpus=${xpus} ./local/synthesize_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan by default + FLAGS_selected_xpus=${xpus} ./local/synthesize_e2e_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model, vocoder is pwgan by default + FLAGS_selected_xpus=${xpus} ./local/inference_xpu.sh ${train_output_path} || exit -1 +fi diff --git a/paddlespeech/dataset/s2t/avg_model.py b/paddlespeech/dataset/s2t/avg_model.py index c5753b726..5bd5cb1f0 100755 --- a/paddlespeech/dataset/s2t/avg_model.py +++ b/paddlespeech/dataset/s2t/avg_model.py @@ -20,30 +20,6 @@ import numpy as np import paddle -def define_argparse(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument( - '--ckpt_dir', required=True, help='ckpt model dir for average') - parser.add_argument( - '--val_best', action="store_true", help='averaged model') - parser.add_argument( - '--num', default=5, type=int, help='nums for averaged model') - parser.add_argument( - '--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument( - '--max_epoch', - default=65536, # Big enough - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - return args - - def average_checkpoints(dst_model="", ckpt_dir="", val_best=True, @@ -85,7 +61,7 @@ def average_checkpoints(dst_model="", print(path_list) avg = None - num = args.num + num = num assert num == len(path_list) for path in path_list: print(f'Processing {path}') @@ -100,14 +76,14 @@ def average_checkpoints(dst_model="", if avg[k] is not None: avg[k] /= num - paddle.save(avg, args.dst_model) - print(f'Saving to {args.dst_model}') + paddle.save(avg, dst_model) + print(f'Saving to {dst_model}') - meta_path = os.path.splitext(args.dst_model)[0] + '.avg.json' + meta_path = os.path.splitext(dst_model)[0] + '.avg.json' with open(meta_path, 'w') as f: data = json.dumps({ - "mode": 'val_best' if args.val_best else 'latest', - "avg_ckpt": args.dst_model, + "mode": 'val_best' if val_best else 'latest', + "avg_ckpt": dst_model, "val_loss_mean": avg_val_score, "ckpts": path_list, "epochs": selected_epochs.tolist(), @@ -116,9 +92,40 @@ def average_checkpoints(dst_model="", f.write(data + "\n") +def define_argparse(): + parser = argparse.ArgumentParser(description='average model') + parser.add_argument('--dst_model', required=True, help='averaged model') + parser.add_argument( + '--ckpt_dir', required=True, help='ckpt model dir for average') + parser.add_argument( + '--val_best', action="store_true", help='averaged model') + parser.add_argument( + '--num', default=5, type=int, help='nums for averaged model') + parser.add_argument( + '--min_epoch', + default=0, + type=int, + help='min epoch used for 
averaging model') + parser.add_argument( + '--max_epoch', + default=65536, # Big enough + type=int, + help='max epoch used for averaging model') + + args = parser.parse_args() + print(args) + return args + + def main(): args = define_argparse() - average_checkpoints(args) + average_checkpoints( + dst_model=args.dst_model, + ckpt_dir=args.ckpt_dir, + val_best=args.val_best, + num=args.num, + min_epoch=args.min_epoch, + max_epoch=args.max_epoch) if __name__ == '__main__': diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index 7ab8cf853..d007a9e39 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -27,7 +27,6 @@ from paddlespeech.audio.text.text_featurizer import TextFeaturizer from paddlespeech.s2t.io.dataloader import BatchDataLoader from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel from paddlespeech.s2t.models.ds2 import DeepSpeech2Model -from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog from paddlespeech.s2t.training.reporter import report from paddlespeech.s2t.training.timer import Timer from paddlespeech.s2t.training.trainer import Trainer @@ -148,7 +147,7 @@ class DeepSpeech2Trainer(Trainer): if not self.train: return - grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip) + grad_clip = paddle.nn.ClipGradByGlobalNorm(config.global_grad_clip) lr_scheduler = paddle.optimizer.lr.ExponentialDecay( learning_rate=config.lr, gamma=config.lr_decay, verbose=True) optimizer = paddle.optimizer.Adam( diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index f716fa3b5..2e1c14ac1 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -145,7 +145,6 @@ class U2BaseModel(ASRInterface, nn.Layer): text_lengths) ctc_time = time.time() - start #logger.debug(f"ctc time: {ctc_time}") - if loss_ctc is None: loss = loss_att elif loss_att is None: @@ -916,6 +915,8 @@ class U2Model(U2DecodeModel): decoder_type = configs.get('decoder', 'transformer') logger.debug(f"U2 Decoder type: {decoder_type}") if decoder_type == 'transformer': + configs['model_conf'].pop('reverse_weight', None) + configs['decoder_conf'].pop('r_num_blocks', None) decoder = TransformerDecoder(vocab_size, encoder.output_size(), **configs['decoder_conf']) diff --git a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py index 59a67a1e5..a3744d340 100755 --- a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py +++ b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py @@ -188,7 +188,7 @@ class Wav2vec2ASR(nn.Layer): x_lens = x.shape[1] ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size) topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, x_lens) # (B, maxlen) + topk_index = topk_index.view([batch_size, x_lens]) # (B, maxlen) hyps = [hyp.tolist() for hyp in topk_index] hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 14336c03d..10ab3eaea 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -15,6 +15,7 @@ # Modified from wenet(https://github.com/wenet-e2e/wenet) """Multi-Head Attention layer definition.""" import math +from typing import List from typing import Tuple import paddle @@ -26,7 +27,10 @@ from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() 
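With the avg_model.py refactor above, average_checkpoints() no longer reaches into an argparse namespace and can be called directly with keyword arguments, exactly as the new main() does. A minimal usage sketch, assuming the module imports as its file path suggests; the paths are placeholders, not values from this PR:

```python
# Hypothetical direct call mirroring the refactored main(); paths are examples only.
from paddlespeech.dataset.s2t.avg_model import average_checkpoints

average_checkpoints(
    dst_model="exp/conformer/checkpoints/avg_5.pdparams",  # where the averaged model is written
    ckpt_dir="exp/conformer/checkpoints",                   # directory holding epoch checkpoints
    val_best=True,   # pick the `num` checkpoints with the best validation loss
    num=5,           # how many checkpoints to average
    min_epoch=0,
    max_epoch=65536)
```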
-__all__ = ["MultiHeadedAttention", "RelPositionMultiHeadedAttention"] +__all__ = [ + "MultiHeadedAttention", "RelPositionMultiHeadedAttention", + "RoPERelPositionMultiHeadedAttention" +] # Relative Positional Encodings # https://www.jianshu.com/p/c0608efcc26f @@ -165,6 +169,7 @@ class MultiHeadedAttention(nn.Layer): and `head * d_k == size` """ + # (B,T,D) -> (B,T,H,D/H) q, k, v = self.forward_qkv(query, key, value) # when export onnx model, for 1st chunk, we feed @@ -373,3 +378,139 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): self.d_k) # (batch, head, time1, time2) return self.forward_attention(v, scores, mask), new_cache + + +class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention): + """Multi-Head Attention layer with RoPE relative position encoding.""" + + def __init__(self, + n_head, + n_feat, + dropout_rate, + adaptive_scale=False, + init_weights=False): + """Construct an RelPositionMultiHeadedAttention object. + Paper: https://arxiv.org/abs/1901.02860 + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + """ + super().__init__(n_head, n_feat, dropout_rate) + + def align(self, tensor: paddle.Tensor, axes: List[int], ndim=None): + """重新对齐tensor(批量版expand_dims) + axes:原来的第i维对齐新tensor的第axes[i]维; + ndim:新tensor的维度。 + """ + assert len(axes) == tensor.dim() + assert ndim or min(axes) >= 0 + + ndim = ndim or max(axes) + 1 + + # a[0, None, 1] = a[0, np.newaxis, 1] + indices = [None] * ndim + for i in axes: + # slice nothing, a[0, slice(None), 1] = a[0, :, 1] + indices[i] = slice(None) + + return tensor[indices] + + def apply_rotary_position_embeddings(self, sinusoidal, *tensors): + """应用RoPE到tensors中 + 其中,sinusoidal.shape=[B, T, D],tensors为tensor的列表,而 + tensor.shape=[B, T, ..., D], or (B,H,T,D/H) + """ + assert len(tensors) > 0, 'at least one input tensor' + assert all( + [tensor.shape == tensors[0].shape + for tensor in tensors[1:]]), 'all tensors must have the same shape' + + # (B,H,T,D) + ndim = tensors[0].dim() + _, H, T, D = tensors[0].shape + + # sinusoidal shape same with tensors[0] + # [B,T,D] -> [B,T,H,D/H] -> (B,H,T,D/H) + # sinusoidal = self.align(sinusoidal, [0, 1, -1], ndim) + sinusoidal = sinusoidal.reshape((1, T, H, D)).transpose([0, 2, 1, 3]) + + # http://man.hubwiz.com/docset/TensorFlow.docset/Contents/Resources/Documents/api_docs/python/tf/keras/backend/repeat_elements.html + # like np.repeat, x (s1, s2, s3), axis 1, (s1, s2*rep, s3) + # [b,T, ..., d/2] -> [b,T, ..., d] + cos_pos = paddle.repeat_interleave(sinusoidal[..., 1::2], 2, axis=-1) + sin_pos = paddle.repeat_interleave(sinusoidal[..., 0::2], 2, axis=-1) + outputs = [] + for tensor in tensors: + # x2 = [-x2, x1, -x4, x3, ..., -x_d, x_{d-1}] + tensor2 = paddle.stack([-tensor[..., 1::2], tensor[..., ::2]], ndim) + tensor2 = paddle.reshape(tensor2, paddle.shape(tensor)) + + # 公式 34, out = x * cos_pos + x2 * sin_pos + outputs.append(tensor * cos_pos + tensor2 * sin_pos) + return outputs[0] if len(outputs) == 1 else outputs + + def forward(self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), + pos_emb: paddle.Tensor=paddle.empty([0]), + cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]) + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Compute 'Scaled Dot Product Attention' with rel. positional encoding. 
+ Ref: https://github.com/facebookresearch/llama/blob/main/llama/model.py + Args: + query (paddle.Tensor): Query tensor (#batch, time1, size). + key (paddle.Tensor): Key tensor (#batch, time2, size). + value (paddle.Tensor): Value tensor (#batch, time2, size). + mask (paddle.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2), (0, 0, 0) means fake mask. + pos_emb (paddle.Tensor): Positional embedding tensor + (#batch, time2, size). + cache (paddle.Tensor): Cache tensor (1, head, cache_t, d_k * 2), + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + Returns: + paddle.Tensor: Output tensor (#batch, time1, d_model). + paddle.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + """ + q, k, v = self.forward_qkv(query, key, value) + # q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k) + + # f{q,k}(x_m, m) = R^d_{\theta, m} W_{q,k} x_m, m is position index + # q_t always is chunk_size + q_t = q.shape[2] + q = self.apply_rotary_position_embeddings(pos_emb[:, -q_t:, :], q) + # k will increase when in streaming decoding. + k = self.apply_rotary_position_embeddings(pos_emb[:, -q_t:, :], k) + + # when export onnx model, for 1st chunk, we feed + # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) + # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). + # In all modes, `if cache.size(0) > 0` will alwayse be `True` + # and we will always do splitting and + # concatnation(this will simplify onnx export). Note that + # it's OK to concat & split zero-shaped tensors(see code below). + # when export jit model, for 1st chunk, we always feed + # cache(0, 0, 0, 0) since jit supports dynamic if-branch. + # >>> a = torch.ones((1, 2, 0, 4)) + # >>> b = torch.ones((1, 2, 3, 4)) + # >>> c = torch.cat((a, b), dim=2) + # >>> torch.equal(b, c) # True + # >>> d = torch.split(a, 2, dim=-1) + # >>> torch.equal(d[0], d[1]) # True + if cache.shape[0] > 0: + # last dim `d_k * 2` for (key, val) + key_cache, value_cache = paddle.split(cache, 2, axis=-1) + k = paddle.concat([key_cache, k], axis=2) + v = paddle.concat([value_cache, v], axis=2) + # We do cache slicing in encoder.forward_chunk, since it's + # non-trivial to calculate `next_cache_start` here. + new_cache = paddle.concat((k, v), axis=-1) + + # dot(q, k) + scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(self.d_k) + return self.forward_attention(v, scores, mask), new_cache diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py index f41a7b5d4..1e9f01018 100644 --- a/paddlespeech/s2t/modules/embedding.py +++ b/paddlespeech/s2t/modules/embedding.py @@ -85,18 +85,21 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface): reverse (bool, optional): Not used. Defaults to False. 
""" nn.Layer.__init__(self) - self.d_model = d_model + self.d_model = paddle.to_tensor(d_model) self.max_len = max_len self.xscale = paddle.to_tensor(math.sqrt(self.d_model)) self.dropout = nn.Dropout(p=dropout_rate) + self.base = paddle.to_tensor(10000.0) self.pe = paddle.zeros([1, self.max_len, self.d_model]) #[B=1,T,D] position = paddle.arange( 0, self.max_len, dtype=paddle.float32).unsqueeze(1) #[T, 1] + # base^{-2(i-1)/d)}, i \in (1,2...,d/2) div_term = paddle.exp( - paddle.arange(0, self.d_model, 2, dtype=paddle.float32) * - -(math.log(10000.0) / self.d_model)) + -paddle.arange(0, self.d_model, 2, dtype=paddle.float32) * + (paddle.log(self.base) / self.d_model)) + # [B,T,D] self.pe[:, :, 0::2] = paddle.sin(position * div_term) self.pe[:, :, 1::2] = paddle.cos(position * div_term) @@ -161,6 +164,98 @@ class RelPositionalEncoding(PositionalEncoding): assert offset + x.shape[ 1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format( offset, x.shape[1], self.max_len) + x = x * self.xscale pos_emb = self.pe[:, offset:offset + x.shape[1]] return self.dropout(x), self.dropout(pos_emb) + + +# RotaryRelPositionalEncoding is same to RelPositionalEncoding +class ScaledRotaryRelPositionalEncoding(RelPositionalEncoding): + """Scaled Rotary Relative positional encoding module. + POSITION INTERPOLATION: : https://arxiv.org/pdf/2306.15595v2.pdf + """ + + def __init__(self, + d_model: int, + dropout_rate: float, + max_len: int=5000, + scale=1): + """ + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int, optional): [Maximum input length.]. Defaults to 5000. + scale (int): Interpolation max input length to `scale * max_len` positions. + """ + super().__init__(d_model, dropout_rate, max_len, reverse=True) + self.pscale = paddle.to_tensor(scale) + self.max_len = max_len * scale + + def sinusoidal_embeddings(self, + pos: paddle.Tensor, + dim: paddle.Tensor, + base=10000) -> paddle.Tensor: + """计算pos位置的dim维sinusoidal编码""" + assert dim % 2 == 0 + # (d/2,) + indices = paddle.arange(0, dim // 2, dtype=pos.dtype) + indices = paddle.pow(paddle.cast(base, pos.dtype), -2 * indices / dim) + # pos (1, T), indices (d/2,) -> (1, T, d/2) + embeddings = paddle.einsum('...,d->...d', pos, indices) + # (1, T, d/2, 2) + embeddings = paddle.stack( + [paddle.sin(embeddings), paddle.cos(embeddings)], axis=-1) + # (1, T, d) + embeddings = paddle.flatten(embeddings, start_axis=-2, stop_axis=-1) + return embeddings + + def forward(self, x: paddle.Tensor, + offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Compute positional encoding. + Args: + x (paddle.Tensor): Input tensor (batch, time, `*`). + Returns: + paddle.Tensor: Encoded tensor (batch, time, `*`). + paddle.Tensor: Positional embedding tensor (1, time, `*`). + """ + x = x * self.xscale + + B, T, D = x.shape + assert D == self.d_model + + # postion interploation + start = 0 + end = T * self.pscale + assert end <= self.max_len + position = paddle.arange(start, end, dtype=x.dtype).unsqueeze(0) + position *= 1.0 / self.pscale + pe = self.sinusoidal_embeddings(position, self.d_model, base=self.base) + + pos_emb = pe[:, offset:offset + x.shape[1]] + return self.dropout(x), self.dropout(pos_emb) + + def position_encoding(self, offset: int, size: int) -> paddle.Tensor: + """ For getting encoding in a streaming fashion + Attention!!!!! 
+ we apply dropout only once at the whole utterance level in a none + streaming way, but will call this function several times with + increasing input size in a streaming scenario, so the dropout will + be applied several times. + Args: + offset (int): start offset + size (int): requried size of position encoding + Returns: + paddle.Tensor: Corresponding position encoding, #[1, T, D]. + """ + # postion interploation + start = offset + end = (offset + size) * self.pscale + assert end <= self.max_len + position = paddle.arange( + start, end, dtype=paddle.get_default_dtype()).unsqueeze(0) + position *= 1.0 / self.pscale + + pe = self.sinusoidal_embeddings(position, self.d_model, base=self.base) + + return self.dropout(pe) diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index d90d69d77..27d7ffbd7 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -28,6 +28,7 @@ from paddlespeech.s2t.modules.align import LayerNorm from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.modules.attention import MultiHeadedAttention from paddlespeech.s2t.modules.attention import RelPositionMultiHeadedAttention +from paddlespeech.s2t.modules.attention import RoPERelPositionMultiHeadedAttention from paddlespeech.s2t.modules.conformer_convolution import ConvolutionModule from paddlespeech.s2t.modules.embedding import NoPositionalEncoding from paddlespeech.s2t.modules.embedding import PositionalEncoding @@ -115,6 +116,8 @@ class BaseEncoder(nn.Layer): pos_enc_class = PositionalEncoding elif pos_enc_layer_type == "rel_pos": pos_enc_class = RelPositionalEncoding + elif pos_enc_layer_type == "rope_pos": + pos_enc_class = RelPositionalEncoding elif pos_enc_layer_type == "no_pos": pos_enc_class = NoPositionalEncoding else: @@ -230,14 +233,14 @@ class BaseEncoder(nn.Layer): xs = self.global_cmvn(xs) # before embed, xs=(B, T, D1), pos_emb=(B=1, T, D) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) + xs, _, _ = self.embed(xs, tmp_masks, offset=offset) # after embed, xs=(B=1, chunk_size, hidden-dim) elayers, _, cache_t1, _ = att_cache.shape chunk_size = xs.shape[1] attention_key_size = cache_t1 + chunk_size - # only used when using `RelPositionMultiHeadedAttention` + # only used when using `RelPositionMultiHeadedAttention` and `RoPERelPositionMultiHeadedAttention` pos_emb = self.embed.position_encoding( offset=offset - cache_t1, size=attention_key_size) @@ -474,21 +477,35 @@ class ConformerEncoder(BaseEncoder): activation = get_activation(activation_type) # self-attention module definition - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = (attention_heads, output_size, - attention_dropout_rate) + encoder_dim = output_size + if pos_enc_layer_type == "abs_pos": + encoder_selfattn_layer = MultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, encoder_dim, + attention_dropout_rate) + elif pos_enc_layer_type == "rel_pos": + encoder_selfattn_layer = RelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, encoder_dim, + attention_dropout_rate) + elif pos_enc_layer_type == "rope_pos": + encoder_selfattn_layer = RoPERelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, encoder_dim, + attention_dropout_rate) + else: + raise ValueError( + f"pos_enc_layer_type {pos_enc_layer_type} not supported.") + # feed-forward module definition positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = (output_size, 
linear_units, dropout_rate, + positionwise_layer_args = (encoder_dim, linear_units, dropout_rate, activation) # convolution module definition convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, + convolution_layer_args = (encoder_dim, cnn_module_kernel, activation, cnn_module_norm, causal) self.encoders = nn.LayerList([ ConformerEncoderLayer( - size=output_size, + size=encoder_dim, self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args), feed_forward=positionwise_layer(*positionwise_layer_args), feed_forward_macaron=positionwise_layer( @@ -580,15 +597,23 @@ class SqueezeformerEncoder(nn.Layer): activation = get_activation(activation_type) # self-attention module definition - if pos_enc_layer_type != "rel_pos": + if pos_enc_layer_type == "abs_pos": encoder_selfattn_layer = MultiHeadedAttention encoder_selfattn_layer_args = (attention_heads, output_size, attention_dropout_rate) - else: + elif pos_enc_layer_type == "rel_pos": encoder_selfattn_layer = RelPositionMultiHeadedAttention encoder_selfattn_layer_args = (attention_heads, encoder_dim, attention_dropout_rate, adaptive_scale, init_weights) + elif pos_enc_layer_type == "rope_pos": + encoder_selfattn_layer = RoPERelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, encoder_dim, + attention_dropout_rate, + adaptive_scale, init_weights) + else: + raise ValueError( + f"pos_enc_layer_type {pos_enc_layer_type} not supported.") # feed-forward module definition positionwise_layer = PositionwiseFeedForward diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py index ecba95e85..0499e742b 100644 --- a/paddlespeech/s2t/modules/encoder_layer.py +++ b/paddlespeech/s2t/modules/encoder_layer.py @@ -48,7 +48,7 @@ class TransformerEncoderLayer(nn.Layer): Args: size (int): Input dimension. self_attn (nn.Layer): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` + `MultiHeadedAttention`, `RelPositionMultiHeadedAttention` or `RoPERelPositionMultiHeadedAttention` instance can be used as the argument. feed_forward (nn.Layer): Feed-forward module instance. `PositionwiseFeedForward`, instance can be used as the argument. @@ -147,7 +147,7 @@ class ConformerEncoderLayer(nn.Layer): Args: size (int): Input dimension. self_attn (nn.Layer): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` + `MultiHeadedAttention`, `RelPositionMultiHeadedAttention` or `RoPERelPositionMultiHeadedAttention` instance can be used as the argument. feed_forward (nn.Layer): Feed-forward module instance. `PositionwiseFeedForward` instance can be used as the argument. @@ -298,7 +298,7 @@ class SqueezeformerEncoderLayer(nn.Layer): Args: size (int): Input dimension. self_attn (paddle.nn.Layer): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` + `MultiHeadedAttention`, `RelPositionMultiHeadedAttention` or `RoPERelPositionMultiHeadedAttention` instance can be used as the argument. feed_forward1 (paddle.nn.Layer): Feed-forward module instance. `PositionwiseFeedForward` instance can be used as the argument. diff --git a/paddlespeech/s2t/training/gradclip.py b/paddlespeech/s2t/training/gradclip.py deleted file mode 100644 index 06587c749..000000000 --- a/paddlespeech/s2t/training/gradclip.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import paddle -from paddle.fluid import core -from paddle.fluid import layers -from paddle.fluid.dygraph import base as imperative_base - -from paddlespeech.s2t.utils.log import Log - -__all__ = ["ClipGradByGlobalNormWithLog"] - -logger = Log(__name__).getlog() - - -class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): - def __init__(self, clip_norm): - super().__init__(clip_norm) - - def __repr__(self): - return f"{self.__class__.__name__}(global_clip_norm={self.clip_norm})" - - @imperative_base.no_grad - def _dygraph_clip(self, params_grads): - params_and_grads = [] - sum_square_list = [] - for i, (p, g) in enumerate(params_grads): - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - continue - merge_grad = g - if g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(g) - merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - square = paddle.square(merge_grad) - sum_square = paddle.sum(square) - sum_square_list.append(sum_square) - - # debug log, not dump all since slow down train process - if i < 10: - logger.debug( - f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }") - - # all parameters have been filterd out - if len(sum_square_list) == 0: - return params_grads - - global_norm_var = paddle.concat(sum_square_list) - global_norm_var = paddle.sum(global_norm_var) - global_norm_var = paddle.sqrt(global_norm_var) - - # debug log - logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!") - - max_global_norm = paddle.full( - shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm) - clip_var = paddle.divide( - x=max_global_norm, - y=paddle.maximum(x=global_norm_var, y=max_global_norm)) - for i, (p, g) in enumerate(params_grads): - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - new_grad = paddle.multiply(x=g, y=clip_var) - params_and_grads.append((p, new_grad)) - - # debug log, not dump all since slow down train process - if i < 10: - logger.debug( - f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}" - ) - - return params_and_grads diff --git a/paddlespeech/s2t/training/optimizer/__init__.py b/paddlespeech/s2t/training/optimizer/__init__.py index aafdc5b6a..90281e1ed 100644 --- a/paddlespeech/s2t/training/optimizer/__init__.py +++ b/paddlespeech/s2t/training/optimizer/__init__.py @@ -19,7 +19,7 @@ from typing import Text import paddle from paddle.optimizer import Optimizer from paddle.regularizer import L2Decay -from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog + from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.s2t.utils.dynamic_import import instance_class from paddlespeech.s2t.utils.log import Log @@ -100,10 +100,9 @@ class OptimizerFactory(): assert "parameters" in args, "parameters not in args." assert "learning_rate" in args, "learning_rate not in args." 
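# Editor's note: an illustrative sketch, not part of the patch. With the
# custom ClipGradByGlobalNormWithLog deleted above, OptimizerFactory (in the
# lines just below) falls back to Paddle's built-in clipper and a plain float
# weight decay. A minimal equivalent optimizer setup could look like this; the
# Linear model and the hyper-parameter values are stand-ins for illustration.
import paddle

model = paddle.nn.Linear(256, 256)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=5.0)
optimizer = paddle.optimizer.Adam(
    learning_rate=1e-3,
    parameters=model.parameters(),
    weight_decay=1e-6,   # plain float instead of paddle.regularizer.L2Decay
    grad_clip=clip)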
- grad_clip = ClipGradByGlobalNormWithLog( + grad_clip = paddle.nn.ClipGradByGlobalNorm( args['grad_clip']) if "grad_clip" in args else None - weight_decay = L2Decay( - args['weight_decay']) if "weight_decay" in args else None + weight_decay = args.get("weight_decay", None) if weight_decay: logger.info(f'') if grad_clip: diff --git a/paddlespeech/s2t/training/optimizer/adadelta.py b/paddlespeech/s2t/training/optimizer/adadelta.py index 900b697c5..7c3950a90 100644 --- a/paddlespeech/s2t/training/optimizer/adadelta.py +++ b/paddlespeech/s2t/training/optimizer/adadelta.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import paddle -from paddle.fluid import framework +from paddle import framework from paddle.optimizer import Optimizer __all__ = [] diff --git a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py index 0995a55da..9dd31a08b 100644 --- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py @@ -28,7 +28,7 @@ from paddlespeech.server.utils.audio_process import float2pcm from paddlespeech.server.utils.onnx_infer import get_sess from paddlespeech.server.utils.util import denorm from paddlespeech.server.utils.util import get_chunks -from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.en_frontend import English from paddlespeech.t2s.frontend.zh_frontend import Frontend __all__ = ['TTSEngine', 'PaddleTTSConnectionHandler'] diff --git a/paddlespeech/server/engine/tts/online/python/tts_engine.py b/paddlespeech/server/engine/tts/online/python/tts_engine.py index a46b84bd9..0cfb20354 100644 --- a/paddlespeech/server/engine/tts/online/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/python/tts_engine.py @@ -29,7 +29,7 @@ from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import float2pcm from paddlespeech.server.utils.util import denorm from paddlespeech.server.utils.util import get_chunks -from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.en_frontend import English from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.modules.normalizer import ZScore diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py index 20b98fae6..3a6461f8c 100644 --- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py +++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -32,7 +32,7 @@ from paddlespeech.server.utils.errors import ErrorCode from paddlespeech.server.utils.exception import ServerBaseException from paddlespeech.server.utils.paddle_predictor import init_predictor from paddlespeech.server.utils.paddle_predictor import run_model -from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.en_frontend import English from paddlespeech.t2s.frontend.zh_frontend import Frontend __all__ = ['TTSEngine', 'PaddleTTSConnectionHandler'] diff --git a/paddlespeech/t2s/__init__.py b/paddlespeech/t2s/__init__.py index 57fe82a9c..7d93c026e 100644 --- a/paddlespeech/t2s/__init__.py +++ b/paddlespeech/t2s/__init__.py @@ -18,6 +18,5 @@ from . import exps from . import frontend from . import models from . import modules -from . import ssml from . import training from . 
import utils diff --git a/paddlespeech/t2s/assets/__init__.py b/paddlespeech/t2s/assets/__init__.py new file mode 100644 index 000000000..595add0ae --- /dev/null +++ b/paddlespeech/t2s/assets/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/t2s/assets/sentences_mix.txt b/paddlespeech/t2s/assets/sentences_mix.txt index 06e97d14a..bfa0db636 100644 --- a/paddlespeech/t2s/assets/sentences_mix.txt +++ b/paddlespeech/t2s/assets/sentences_mix.txt @@ -5,4 +5,5 @@ 005 Paddle Bo Bo: 使用 Paddle Speech 的语音合成模块生成虚拟人的声音。 006 热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中! 007 我喜欢 eat apple, 你喜欢 drink milk。 -008 我们要去云南 team building, 非常非常 happy. \ No newline at end of file +008 我们要去云南 team building, 非常非常 happy. +009 AI for Sceience 平台。 \ No newline at end of file diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py index 97626db0b..24f2be7d5 100644 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -44,10 +44,17 @@ from paddlespeech.t2s.utils import str2bool def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly - if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + if args.ngpu > 0 and paddle.is_compiled_with_cuda(): + paddle.set_device("gpu") + elif args.nxpu > 0 and paddle.is_compiled_with_xpu(): + paddle.set_device("xpu") + elif args.ngpu == 0 and args.nxpu == 0: paddle.set_device("cpu") else: - paddle.set_device("gpu") + raise ValueError( + "Please make sure that the paddle you installed matches the device type you set, " + "and that ngpu and nxpu cannot be negative at the same time.") + world_size = paddle.distributed.get_world_size() if world_size > 1: paddle.distributed.init_parallel_env() @@ -183,7 +190,12 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu or xpu.") + parser.add_argument( + "--nxpu", + type=int, + default=0, + help="if ngpu=0 and nxpu > 0, use xpu. 
if ngpu=0 and nxpu=0, use cpu.") parser.add_argument( "--phones-dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument( diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index 31fe14490..8a5269825 100644 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -112,7 +112,7 @@ def parse_args(): parser.add_argument( "--device", default="gpu", - choices=["gpu", "cpu"], + choices=["gpu", "cpu", "xpu"], help="Device selected for inference.", ) parser.add_argument('--cpu_threads', type=int, default=1) diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 57c79dee1..9a07df64d 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -33,8 +33,8 @@ from yacs.config import CfgNode from paddlespeech.t2s.datasets.am_batch_fn import * from paddlespeech.t2s.datasets.data_table import DataTable from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static -from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.canton_frontend import CantonFrontend +from paddlespeech.t2s.frontend.en_frontend import English from paddlespeech.t2s.frontend.mix_frontend import MixFrontend from paddlespeech.t2s.frontend.sing_frontend import SingFrontend from paddlespeech.t2s.frontend.zh_frontend import Frontend @@ -99,14 +99,23 @@ def norm(data, mean, std): return (data - mean) / std -def get_chunks(data, block_size: int, pad_size: int): - data_len = data.shape[1] +def get_chunks(mel, chunk_size: int, pad_size: int): + """ + Split mel by chunk size with left and right context. + + Args: + mel (paddle.Tensor): mel spectrogram, shape (B, T, D) + chunk_size (int): chunk size + pad_size (int): size for left and right context. 
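+    Returns:
+        List[paddle.Tensor]: mel chunks, each extended with up to `pad_size`
+            frames of left/right context taken from the neighbouring chunks.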
+ """ + T = mel.shape[1] + n = math.ceil(T / chunk_size) + chunks = [] - n = math.ceil(data_len / block_size) for i in range(n): - start = max(0, i * block_size - pad_size) - end = min((i + 1) * block_size + pad_size, data_len) - chunks.append(data[:, start:end, :]) + start = max(0, i * chunk_size - pad_size) + end = min((i + 1) * chunk_size + pad_size, T) + chunks.append(mel[:, start:end, :]) return chunks @@ -117,14 +126,10 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): with open(text_file, 'rt', encoding='utf-8') as f: for line in f: if line.strip() != "": - items = re.split(r"\s+", line.strip(), 1) + items = re.split(r"\s+", line.strip(), maxsplit=1) + assert len(items) == 2 utt_id = items[0] - if lang in {'zh', 'canton'}: - sentence = "".join(items[1:]) - elif lang == 'en': - sentence = " ".join(items[1:]) - elif lang == 'mix': - sentence = " ".join(items[1:]) + sentence = items[1] sentences.append((utt_id, sentence)) return sentences @@ -319,6 +324,7 @@ def run_frontend( input_ids = {} if text.strip() != "" and re.match(r".*?.*?.*", text, re.DOTALL): + # using ssml input_ids = frontend.get_input_ids_ssml( text, merge_sentences=merge_sentences, @@ -359,6 +365,7 @@ def run_frontend( outs.update({'is_slurs': is_slurs}) else: print("lang should in {'zh', 'en', 'mix', 'canton', 'sing'}!") + outs.update({'phone_ids': phone_ids}) return outs diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index 6189522db..e7cf7850e 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -219,7 +219,13 @@ def parse_args(): ) # other parser.add_argument( - "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.") + parser.add_argument( + "--nxpu", + type=int, + default=0, + help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu." + ) parser.add_argument("--test_metadata", type=str, help="test metadata.") parser.add_argument("--output_dir", type=str, help="output dir.") parser.add_argument( @@ -235,12 +241,14 @@ def parse_args(): def main(): args = parse_args() - if args.ngpu == 0: - paddle.set_device("cpu") - elif args.ngpu > 0: + if args.ngpu > 0: paddle.set_device("gpu") + elif args.nxpu > 0: + paddle.set_device("xpu") + elif args.ngpu == 0 and args.nxpu == 0: + paddle.set_device("cpu") else: - print("ngpu should >= 0 !") + print("ngpu or nxpu should >= 0 !") evaluate(args) diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 0c7b34b09..c63a5fbe9 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -13,6 +13,7 @@ # limitations under the License. 
import argparse from pathlib import Path +from pprint import pprint import paddle import soundfile as sf @@ -78,6 +79,7 @@ def evaluate(args): # whether dygraph to static if args.inference_dir: + print("convert am and voc to static model.") # acoustic model am_inference = am_to_static( am_inference=am_inference, @@ -92,6 +94,7 @@ def evaluate(args): output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) + merge_sentences = False # Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022) @@ -102,13 +105,19 @@ def evaluate(args): if am_name == 'speedyspeech': get_tone_ids = True + # wav samples N = 0 + # inference time cost T = 0 + + # [(uid, text), ] if am_name == 'diffsinger': sentences = get_sentences_svs(text_file=args.text) else: sentences = get_sentences(text_file=args.text, lang=args.lang) + for utt_id, sentence in sentences: + print(f"{utt_id} {sentence}") with timer() as t: if am_name == "diffsinger": text = "" @@ -116,6 +125,8 @@ def evaluate(args): else: text = sentence svs_input = None + + # frontend frontend_dict = run_frontend( frontend=frontend, text=text, @@ -124,25 +135,33 @@ def evaluate(args): lang=args.lang, svs_input=svs_input) phone_ids = frontend_dict['phone_ids'] + # pprint(f"{utt_id} {phone_ids}") + with paddle.no_grad(): flags = 0 for i in range(len(phone_ids)): + # sub phone, split by `sp` or punctuation. part_phone_ids = phone_ids[i] + # acoustic model if am_name == 'fastspeech2': # multi speaker if am_dataset in {"aishell3", "vctk", "mix", "canton"}: - spk_id = paddle.to_tensor(args.spk_id) + # multi-speaker + spk_id = paddle.to_tensor([args.spk_id]) mel = am_inference(part_phone_ids, spk_id) else: + # single-speaker mel = am_inference(part_phone_ids) elif am_name == 'speedyspeech': part_tone_ids = frontend_dict['tone_ids'][i] if am_dataset in {"aishell3", "vctk", "mix"}: - spk_id = paddle.to_tensor(args.spk_id) + # multi-speaker + spk_id = paddle.to_tensor([args.spk_id]) mel = am_inference(part_phone_ids, part_tone_ids, spk_id) else: + # single-speaker mel = am_inference(part_phone_ids, part_tone_ids) elif am_name == 'tacotron2': mel = am_inference(part_phone_ids) @@ -155,6 +174,7 @@ def evaluate(args): note=part_note_ids, note_dur=part_note_durs, is_slur=part_is_slurs, ) + # vocoder wav = voc_inference(mel) if flags == 0: @@ -162,17 +182,23 @@ def evaluate(args): flags = 1 else: wav_all = paddle.concat([wav_all, wav]) + wav = wav_all.numpy() N += wav.size T += t.elapse + + # samples per second speed = wav.size / t.elapse + # generate one second wav need `RTF` seconds rtf = am_config.fs / speed print( f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." ) + sf.write( str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs) print(f"{utt_id} done!") + print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") @@ -273,7 +299,13 @@ def parse_args(): default=None, help="dir to save inference models") parser.add_argument( - "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.") + parser.add_argument( + "--nxpu", + type=int, + default=0, + help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu." 
+ ) parser.add_argument( "--text", type=str, @@ -303,12 +335,14 @@ def parse_args(): def main(): args = parse_args() - if args.ngpu == 0: - paddle.set_device("cpu") - elif args.ngpu > 0: + if args.ngpu > 0: paddle.set_device("gpu") + elif args.nxpu > 0: + paddle.set_device("xpu") + elif args.ngpu == 0 and args.nxpu == 0: + paddle.set_device("cpu") else: - print("ngpu should >= 0 !") + print("ngpu or nxpu should >= 0 !") evaluate(args) diff --git a/paddlespeech/t2s/exps/transformer_tts/preprocess.py b/paddlespeech/t2s/exps/transformer_tts/preprocess.py index 2ebd5ecc2..4e82e53ff 100644 --- a/paddlespeech/t2s/exps/transformer_tts/preprocess.py +++ b/paddlespeech/t2s/exps/transformer_tts/preprocess.py @@ -27,7 +27,7 @@ import yaml from yacs.config import CfgNode as Configuration from paddlespeech.t2s.datasets.get_feats import LogMelFBank -from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.en_frontend import English def get_lj_sentences(file_name, frontend): diff --git a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py index 0cd7d224e..279407b38 100644 --- a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py @@ -21,7 +21,7 @@ import soundfile as sf import yaml from yacs.config import CfgNode -from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.en_frontend import English from paddlespeech.t2s.models.transformer_tts import TransformerTTS from paddlespeech.t2s.models.transformer_tts import TransformerTTSInference from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow diff --git a/paddlespeech/t2s/frontend/__init__.py b/paddlespeech/t2s/frontend/__init__.py index 64015435e..a8f77d552 100644 --- a/paddlespeech/t2s/frontend/__init__.py +++ b/paddlespeech/t2s/frontend/__init__.py @@ -13,8 +13,8 @@ # limitations under the License. from .generate_lexicon import * from .normalizer import * -from .phonectic import * from .punctuation import * +from .ssml import * from .tone_sandhi import * from .vocab import * from .zh_normalization import * diff --git a/paddlespeech/t2s/frontend/arpabet.py b/paddlespeech/t2s/frontend/arpabet.py index 7a81b645d..9b2b11b3d 100644 --- a/paddlespeech/t2s/frontend/arpabet.py +++ b/paddlespeech/t2s/frontend/arpabet.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from paddlespeech.t2s.frontend.phonectic import Phonetics """ A phonology system with ARPABET symbols and limited punctuations. The G2P conversion is done by g2p_en. @@ -19,55 +18,68 @@ conversion is done by g2p_en. Note that g2p_en does not handle words with hypen well. So make sure the input sentence is first normalized. """ -from paddlespeech.t2s.frontend.vocab import Vocab from g2p_en import G2p +from paddlespeech.t2s.frontend.phonectic import Phonetics +from paddlespeech.t2s.frontend.vocab import Vocab + class ARPABET(Phonetics): - """A phonology for English that uses ARPABET as the phoneme vocabulary. + """A phonology for English that uses ARPABET without stress as the phoneme vocabulary. 
+ + 47 symbols = 39 phones + 4 punctuations + 4 special tokens( ) + + The current phoneme set contains 39 phonemes, vowels carry a lexical stress marker: + 0 — No stress + 1 — Primary stress + 2 — Secondary stress + + Phoneme Set: + Phoneme Example Translation + ------- ------- ----------- + AA odd AA D + AE at AE T + AH hut HH AH T + AO ought AO T + AW cow K AW + AY hide HH AY D + B be B IY + CH cheese CH IY Z + D dee D IY + DH thee DH IY + EH Ed EH D + ER hurt HH ER T + EY ate EY T + F fee F IY + G green G R IY N + HH he HH IY + IH it IH T + IY eat IY T + JH gee JH IY + K key K IY + L lee L IY + M me M IY + N knee N IY + NG ping P IH NG + OW oat OW T + OY toy T OY + P pee P IY + R read R IY D + S sea S IY + SH she SH IY + T tea T IY + TH theta TH EY T AH + UH hood HH UH D + UW two T UW + V vee V IY + W we W IY + Y yield Y IY L D + Z zee Z IY + ZH seizure S IY ZH ER + See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details. - Phoneme Example Translation - ------- ------- ----------- - AA odd AA D - AE at AE T - AH hut HH AH T - AO ought AO T - AW cow K AW - AY hide HH AY D - B be B IY - CH cheese CH IY Z - D dee D IY - DH thee DH IY - EH Ed EH D - ER hurt HH ER T - EY ate EY T - F fee F IY - G green G R IY N - HH he HH IY - IH it IH T - IY eat IY T - JH gee JH IY - K key K IY - L lee L IY - M me M IY - N knee N IY - NG ping P IH NG - OW oat OW T - OY toy T OY - P pee P IY - R read R IY D - S sea S IY - SH she SH IY - T tea T IY - TH theta TH EY T AH - UH hood HH UH D - UW two T UW - V vee V IY - W we W IY - Y yield Y IY L D - Z zee Z IY - ZH seizure S IY ZH ER """ + # 39 phonemes phonemes = [ 'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER', 'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', @@ -76,6 +88,8 @@ class ARPABET(Phonetics): ] punctuations = [',', '.', '?', '!'] symbols = phonemes + punctuations + # vowels carry a lexical stress marker: + # 0 unstressed(无重音), 1 primary stress(主重音)和 2 secondary stress(次重音) _stress_to_no_stress_ = { 'AA0': 'AA', 'AA1': 'AA', @@ -124,7 +138,12 @@ class ARPABET(Phonetics): 'UW2': 'UW' } + def __repr__(self): + fmt = "ARPABETWithoutStress(phonemes: {}, punctuations: {})" + return fmt.format(len(phonemes), punctuations) + def __init__(self): + # https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py self.backend = G2p() self.vocab = Vocab(self.phonemes + self.punctuations) @@ -139,6 +158,7 @@ class ARPABET(Phonetics): Returns: List[str]: The list of pronunciation sequence. """ + # g2p and remove vowel stress phonemes = [ self._remove_vowels(item) for item in self.backend(sentence) ] @@ -158,6 +178,7 @@ class ARPABET(Phonetics): Returns: List[int]: The list of pronunciation id sequence. """ + # phonemes to ids ids = [self.vocab.lookup(item) for item in phonemes] return ids @@ -189,11 +210,16 @@ class ARPABET(Phonetics): def vocab_size(self): """ Vocab size. """ - # 47 = 39 phones + 4 punctuations + 4 special tokens + # 47 = 39 phones + 4 punctuations + 4 special tokens( ) return len(self.vocab) class ARPABETWithStress(Phonetics): + """ + A phonology for English that uses ARPABET with stress as the phoneme vocabulary. 
+ + 77 symbols = 69 phones + 4 punctuations + 4 special tokens + """ phonemes = [ 'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0', 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', @@ -206,6 +232,10 @@ class ARPABETWithStress(Phonetics): punctuations = [',', '.', '?', '!'] symbols = phonemes + punctuations + def __repr__(self): + fmt = "ARPABETWithStress(phonemes: {}, punctuations: {})" + return fmt.format(len(phonemes), punctuations) + def __init__(self): self.backend = G2p() self.vocab = Vocab(self.phonemes + self.punctuations) diff --git a/paddlespeech/t2s/frontend/canton_frontend.py b/paddlespeech/t2s/frontend/canton_frontend.py index f2c7175fe..bbb7bcf00 100644 --- a/paddlespeech/t2s/frontend/canton_frontend.py +++ b/paddlespeech/t2s/frontend/canton_frontend.py @@ -29,7 +29,8 @@ INITIALS = [ INITIALS += ['sp', 'spl', 'spn', 'sil'] -def get_lines(cantons: List[str]): +def jyuping_to_phonemes(cantons: List[str]): + # jyuping to inital and final phones = [] for canton in cantons: for consonant in INITIALS: @@ -47,7 +48,7 @@ def get_lines(cantons: List[str]): class CantonFrontend(): def __init__(self, phone_vocab_path: str): self.text_normalizer = TextNormalizer() - self.punc = ":,;。?!“”‘’':,;.?!" + self.punc = "、:,;。?!“”‘’':,;.?!" self.vocab_phones = {} if phone_vocab_path: @@ -61,8 +62,11 @@ class CantonFrontend(): merge_sentences: bool=True) -> List[List[str]]: phones_list = [] for sentence in sentences: + # jyuping + # 'gam3 ngaam1 lou5 sai3 jiu1 kau4 keoi5 dang2 zan6 jiu3 hoi1 wui2, zing6 dai1 ge2 je5 ngo5 wui5 gaau2 dim6 ga3 laa3.' phones_str = ToJyutping.get_jyutping_text(sentence) - phones_split = get_lines(phones_str.split(' ')) + # phonemes + phones_split = jyuping_to_phonemes(phones_str.split(' ')) phones_list.append(phones_split) return phones_list @@ -78,8 +82,11 @@ class CantonFrontend(): sentence: str, merge_sentences: bool=True, print_info: bool=False) -> List[List[str]]: + # TN & Text Segmentation sentences = self.text_normalizer.normalize(sentence) + # G2P phonemes = self._g2p(sentences, merge_sentences=merge_sentences) + if print_info: print("----------------------------") print("text norm results:") @@ -88,6 +95,7 @@ class CantonFrontend(): print("g2p results:") print(phonemes) print("----------------------------") + return phonemes def get_input_ids(self, @@ -98,9 +106,9 @@ class CantonFrontend(): phonemes = self.get_phonemes( sentence, merge_sentences=merge_sentences, print_info=print_info) + result = {} temp_phone_ids = [] - for phones in phonemes: if phones: phone_ids = self._p2id(phones) @@ -108,6 +116,8 @@ class CantonFrontend(): if to_tensor: phone_ids = paddle.to_tensor(phone_ids) temp_phone_ids.append(phone_ids) + if temp_phone_ids: result["phone_ids"] = temp_phone_ids + return result diff --git a/paddlespeech/t2s/frontend/en_frontend.py b/paddlespeech/t2s/frontend/en_frontend.py new file mode 100644 index 000000000..c58bed7d3 --- /dev/null +++ b/paddlespeech/t2s/frontend/en_frontend.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from .phonectic import English diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py index b8c16097c..2ebfe135e 100644 --- a/paddlespeech/t2s/frontend/mix_frontend.py +++ b/paddlespeech/t2s/frontend/mix_frontend.py @@ -18,9 +18,9 @@ from typing import List import numpy as np import paddle -from paddlespeech.t2s.frontend import English -from paddlespeech.t2s.frontend.zh_frontend import Frontend -from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor +from paddlespeech.t2s.frontend.en_frontend import English as EnFrontend +from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor +from paddlespeech.t2s.frontend.zh_frontend import Frontend as ZhFrontend class MixFrontend(): @@ -28,10 +28,9 @@ class MixFrontend(): g2p_model="pypinyin", phone_vocab_path=None, tone_vocab_path=None): - - self.zh_frontend = Frontend( + self.zh_frontend = ZhFrontend( phone_vocab_path=phone_vocab_path, tone_vocab_path=tone_vocab_path) - self.en_frontend = English(phone_vocab_path=phone_vocab_path) + self.en_frontend = EnFrontend(phone_vocab_path=phone_vocab_path) self.sp_id = self.zh_frontend.vocab_phones["sp"] self.sp_id_numpy = np.array([self.sp_id]) self.sp_id_tensor = paddle.to_tensor([self.sp_id]) @@ -55,15 +54,12 @@ class MixFrontend(): else: return False - def get_segment(self, text: str) -> List[str]: + def split_by_lang(self, text: str) -> List[str]: # sentence --> [ch_part, en_part, ch_part, ...] segments = [] types = [] - flag = 0 - temp_seg = "" - temp_lang = "" - # Determine the type of each character. type: blank, chinese, alphabet, number, unk and point. + # Determine the type of each character. type: chinese, alphabet, other. for ch in text: if self.is_chinese(ch): types.append("zh") @@ -74,31 +70,31 @@ class MixFrontend(): assert len(types) == len(text) - for i in range(len(types)): + flag = 0 + temp_seg = "" + temp_lang = "" + + for i in range(len(text)): # find the first char of the seg if flag == 0: temp_seg += text[i] temp_lang = types[i] flag = 1 - else: if temp_lang == "other": - if types[i] == temp_lang: - temp_seg += text[i] - else: - temp_seg += text[i] + # text start is not lang. + temp_seg += text[i] + if types[i] != temp_lang: temp_lang = types[i] - else: - if types[i] == temp_lang: - temp_seg += text[i] - elif types[i] == "other": + if types[i] == temp_lang or types[i] == "other": + # merge same lang or other temp_seg += text[i] else: + # change lang segments.append((temp_seg, temp_lang)) temp_seg = text[i] - temp_lang = types[i] - flag = 1 + temp_lang = types[i] # new lang segments.append((temp_seg, temp_lang)) @@ -110,76 +106,95 @@ class MixFrontend(): get_tone_ids: bool=False, add_sp: bool=True, to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: - ''' 1. 添加SSML支持,先列出 文字 和 标签内容, - 然后添加到tmpSegments数组里 - ''' - d_inputs = MixTextProcessor.get_dom_split(sentence) - tmpSegments = [] - for instr in d_inputs: - ''' 暂时只支持 say-as ''' - if instr.lower().startswith("" segments.append(tuple(currentSeg)) + # en segments.append(seg) + # reset currentSeg = ["", ""] else: + # zh if currentSeg[0] == '': + # first see currentSeg[0] = seg[0] currentSeg[1] = seg[1] else: + # merge zh currentSeg[0] = currentSeg[0] + seg[0] + if currentSeg[0] != '': + # last zh currentSeg[0] = "" + currentSeg[0] + "" segments.append(tuple(currentSeg)) phones_list = [] result = {} + # 008 我们要去云南 team building, 非常非常 happy. 
+ # seg ('我们要去云南 ', 'zh') + # seg ('team building, ', 'en') + # seg ('非常非常 ', 'zh') + # seg ('happy.', 'en') + # [('我们要去云南 ', 'zh'), ('team building, ', 'en'), ('非常非常 ', 'zh'), ('happy.', 'en')] for seg in segments: content = seg[0] lang = seg[1] - if content != '': - if lang == "en": - input_ids = self.en_frontend.get_input_ids( - content, merge_sentences=False, to_tensor=to_tensor) + + if not content: + continue + + if lang == "en": + input_ids = self.en_frontend.get_input_ids( + content, merge_sentences=False, to_tensor=to_tensor) + else: + if content.strip() != "" and \ + re.match(r".*?.*?.*", content, re.DOTALL): + # process ssml + input_ids = self.zh_frontend.get_input_ids_ssml( + content, + merge_sentences=False, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) else: - ''' 3. 把带speak tag的中文和普通文字分开处理 - ''' - if content.strip() != "" and \ - re.match(r".*?.*?.*", content, re.DOTALL): - input_ids = self.zh_frontend.get_input_ids_ssml( - content, - merge_sentences=False, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) - else: - input_ids = self.zh_frontend.get_input_ids( - content, - merge_sentences=False, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) - if add_sp: - if to_tensor: - input_ids["phone_ids"][-1] = paddle.concat( - [input_ids["phone_ids"][-1], self.sp_id_tensor]) - else: - input_ids["phone_ids"][-1] = np.concatenate( - (input_ids["phone_ids"][-1], self.sp_id_numpy)) + # process plain text + input_ids = self.zh_frontend.get_input_ids( + content, + merge_sentences=False, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) + + if add_sp: + # add sp between zh and en + if to_tensor: + input_ids["phone_ids"][-1] = paddle.concat( + [input_ids["phone_ids"][-1], self.sp_id_tensor]) + else: + input_ids["phone_ids"][-1] = np.concatenate( + (input_ids["phone_ids"][-1], self.sp_id_numpy)) - for phones in input_ids["phone_ids"]: - phones_list.append(phones) + phones_list.extend(input_ids["phone_ids"]) if merge_sentences: merge_list = paddle.concat(phones_list) diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py index af86d9b80..d6c66f1e0 100644 --- a/paddlespeech/t2s/frontend/phonectic.py +++ b/paddlespeech/t2s/frontend/phonectic.py @@ -47,15 +47,34 @@ class Phonetics(ABC): class English(Phonetics): """ Normalize the input text sequence and convert into pronunciation id sequence. + + https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py + + phonemes = ["", "", "", ""] + [ + 'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0', + 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', + 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', + 'EY2', 'F', 'G', 'HH', + 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', + 'M', 'N', 'NG', 'OW0', 'OW1', + 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', + 'UH0', 'UH1', 'UH2', 'UW', + 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'] """ + LEXICON = { + # key using lowercase + "AI".lower(): [["EY0", "AY1"]], + } + def __init__(self, phone_vocab_path=None): self.backend = G2p() + self.backend.cmu.update(English.LEXICON) self.phonemes = list(self.backend.phonemes) self.punctuations = get_punctuations("en") self.vocab = Vocab(self.phonemes + self.punctuations) self.vocab_phones = {} - self.punc = ":,;。?!“”‘’':,;.?!" + self.punc = "、:,;。?!“”‘’':,;.?!" 
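        # Editor's note (illustrative, not part of the patch): entries in
        # English.LEXICON override g2p_en's CMU lookup, because
        # self.backend.cmu.update(English.LEXICON) above merges them into the
        # backend's pronunciation dict. Keys are lowercase words; values are
        # lists of candidate pronunciations, each a list of ARPABET phonemes
        # with stress markers. A quick standalone check (the "tts" entry is a
        # hypothetical example, not from the patch):
        #
        #     from g2p_en import G2p
        #     g2p = G2p()
        #     g2p.cmu.update({"ai": [["EY0", "AY1"]],
        #                     "tts": [["T", "IY1", "T", "IY1", "EH1", "S"]]})
        #     print(g2p("AI TTS"))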
self.text_normalizer = TextNormalizer() if phone_vocab_path: with open(phone_vocab_path, 'rt', encoding='utf-8') as f: @@ -86,8 +105,8 @@ class English(Phonetics): sentence: str, merge_sentences: bool=False, to_tensor: bool=True) -> paddle.Tensor: - result = {} sentences = self.text_normalizer._split(sentence, lang="en") + phones_list = [] temp_phone_ids = [] for sentence in sentences: @@ -118,7 +137,10 @@ class English(Phonetics): if to_tensor: phone_ids = paddle.to_tensor(phone_ids) temp_phone_ids.append(phone_ids) + + result = {} result["phone_ids"] = temp_phone_ids + return result def numericalize(self, phonemes): diff --git a/paddlespeech/t2s/frontend/polyphonic.py b/paddlespeech/t2s/frontend/polyphonic.py new file mode 100644 index 000000000..9a757e204 --- /dev/null +++ b/paddlespeech/t2s/frontend/polyphonic.py @@ -0,0 +1,36 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import yaml + + +class Polyphonic(): + def __init__(self): + with open( + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'polyphonic.yaml'), + 'r', + encoding='utf-8') as polyphonic_file: + # 解析yaml + polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader) + self.polyphonic_words = polyphonic_dict["polyphonic"] + + def correct_pronunciation(self, word, pinyin): + # 词汇被词典收录则返回纠正后的读音 + if word in self.polyphonic_words.keys(): + pinyin = self.polyphonic_words[word] + # 否则返回原读音 + return pinyin diff --git a/paddlespeech/t2s/frontend/polyphonic.yaml b/paddlespeech/t2s/frontend/polyphonic.yaml index 6885035e7..f52b1cf58 100644 --- a/paddlespeech/t2s/frontend/polyphonic.yaml +++ b/paddlespeech/t2s/frontend/polyphonic.yaml @@ -47,4 +47,8 @@ polyphonic: 恶行: ['e4','xing2'] 唉: ['ai4'] 扎实: ['zha1','shi2'] - 干将: ['gan4','jiang4'] \ No newline at end of file + 干将: ['gan4','jiang4'] + 陈威行: ['chen2', 'wei1', 'hang2'] + 郭晟: ['guo1', 'sheng4'] + 中标: ['zhong4', 'biao1'] + 抗住: ['kang2', 'zhu4'] \ No newline at end of file diff --git a/paddlespeech/t2s/frontend/sing_frontend.py b/paddlespeech/t2s/frontend/sing_frontend.py index c2aecf273..fff72a10c 100644 --- a/paddlespeech/t2s/frontend/sing_frontend.py +++ b/paddlespeech/t2s/frontend/sing_frontend.py @@ -29,7 +29,7 @@ class SingFrontend(): pinyin_phone_path (str): pinyin to phone file path, a 'pinyin|phones' (like: ba|b a ) pair per line. phone_vocab_path (str): phone to phone id file path, a 'phone phone id' (like: a 4 ) pair per line. """ - self.punc = '[:,;。?!“”‘’\':,;.?!]' + self.punc = '[、:,;。?!“”‘’\':,;.?!]' self.pinyin_phones = {'AP': 'AP', 'SP': 'SP'} if pinyin_phone_path: diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/frontend/ssml/__init__.py similarity index 89% rename from paddlespeech/t2s/ssml/__init__.py rename to paddlespeech/t2s/frontend/ssml/__init__.py index 9b4db053b..b1b9d726f 100644 --- a/paddlespeech/t2s/ssml/__init__.py +++ b/paddlespeech/t2s/frontend/ssml/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/frontend/ssml/xml_processor.py similarity index 84% rename from paddlespeech/t2s/ssml/xml_processor.py rename to paddlespeech/t2s/frontend/ssml/xml_processor.py index 892ca371e..1d216c31b 100644 --- a/paddlespeech/t2s/ssml/xml_processor.py +++ b/paddlespeech/t2s/frontend/ssml/xml_processor.py @@ -1,4 +1,17 @@ # -*- coding: utf-8 -*- +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import re import xml.dom.minidom import xml.parsers.expat @@ -17,7 +30,6 @@ Note: xml 有5种特殊字符, &<>"' ' ' 例如: "姓名" - ''' @@ -61,17 +73,29 @@ class MixTextProcessor(): patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) mat = re.match(patn, mixstr) if mat: + # pre pre_xml = mat.group(1) + # between ... in_xml = mat.group(2) + # post after_xml = mat.group(3) - ctlist.append([pre_xml, []]) + # pre with none syllable + if pre_xml: + ctlist.append([pre_xml, []]) + + # between with syllable + # [(sub sentence, [syllables]), ...] dom = DomXml(in_xml) pinyinlist = dom.get_pinyins_for_xml() ctlist = ctlist + pinyinlist - ctlist.append([after_xml, []]) + + # post with none syllable + if after_xml: + ctlist.append([after_xml, []]) else: ctlist.append([mixstr, []]) + return ctlist @classmethod @@ -86,17 +110,21 @@ class MixTextProcessor(): in_xml = mat.group(2) after_xml = mat.group(3) - ctlist.append(pre_xml) + if pre_xml: + ctlist.append(pre_xml) + dom = DomXml(in_xml) tags = dom.get_text_and_sayas_tags() ctlist.extend(tags) - - ctlist.append(after_xml) - return ctlist + + if after_xml: + ctlist.append(after_xml) else: ctlist.append(mixstr) + return ctlist + class DomXml(): def __init__(self, xmlstr): self.tdom = parseString(xmlstr) #Document diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py index 42f7b8b2f..690f69aa2 100644 --- a/paddlespeech/t2s/frontend/tone_sandhi.py +++ b/paddlespeech/t2s/frontend/tone_sandhi.py @@ -20,6 +20,9 @@ from pypinyin import Style class ToneSandhi(): + def __repr__(self): + return "MandarinToneSandhi" + def __init__(self): self.must_neural_tone_words = { '麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝', @@ -65,9 +68,22 @@ class ToneSandhi(): '男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎', '幺幺', '干嘛', '学子', '哈哈', '数数', '袅袅', '局地', '以下', '娃哈哈', '花花草草', '留得', '耕地', '想想', '熙熙', '攘攘', '卵子', '死死', '冉冉', '恳恳', '佼佼', '吵吵', '打打', - '考考', '整整', '莘莘', '落地', '算子', '家家户户' + '考考', '整整', '莘莘', '落地', '算子', '家家户户', '青青' } - self.punc = ":,;。?!“”‘’':,;.?!" + self.punc = "、:,;。?!“”‘’':,;.?!" 
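    # Editor's note (illustrative, not part of the patch): a minimal
    # third-tone sandhi example. For a two-character word whose finals both
    # carry tone 3, modified_tone() turns the first final into tone 2:
    #
    #     sandhi = ToneSandhi()
    #     sandhi.modified_tone("你好", "l", ["i3", "ao3"])   # -> ['i2', 'ao3']
    #
    # ("l" stands in for the jieba part-of-speech tag; the result of this
    # particular example does not depend on it.)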
+ + def _split_word(self, word: str) -> List[str]: + word_list = jieba.cut_for_search(word) + word_list = sorted(word_list, key=lambda i: len(i), reverse=False) + first_subword = word_list[0] + first_begin_idx = word.find(first_subword) + if first_begin_idx == 0: + second_subword = word[len(first_subword):] + new_word_list = [first_subword, second_subword] + else: + second_subword = word[:-len(first_subword)] + new_word_list = [second_subword, first_subword] + return new_word_list # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041 # e.g. @@ -154,18 +170,8 @@ class ToneSandhi(): finals[i] = finals[i][:-1] + "4" return finals - def _split_word(self, word: str) -> List[str]: - word_list = jieba.cut_for_search(word) - word_list = sorted(word_list, key=lambda i: len(i), reverse=False) - first_subword = word_list[0] - first_begin_idx = word.find(first_subword) - if first_begin_idx == 0: - second_subword = word[len(first_subword):] - new_word_list = [first_subword, second_subword] - else: - second_subword = word[:-len(first_subword)] - new_word_list = [second_subword, first_subword] - return new_word_list + def _all_tone_three(self, finals: List[str]) -> bool: + return all(x[-1] == "3" for x in finals) def _three_sandhi(self, word: str, finals: List[str]) -> List[str]: @@ -207,9 +213,6 @@ class ToneSandhi(): return finals - def _all_tone_three(self, finals: List[str]) -> bool: - return all(x[-1] == "3" for x in finals) - # merge "不" and the word behind it # if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: @@ -336,6 +339,9 @@ class ToneSandhi(): def pre_merge_for_modify( self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + """ + seg: [(word, pos), ...] 
+ """ seg = self._merge_bu(seg) seg = self._merge_yi(seg) seg = self._merge_reduplication(seg) @@ -346,7 +352,11 @@ class ToneSandhi(): def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]: - + """ + word: 分词 + pos: 词性 + finals: 带调韵母, [final1, ..., finaln] + """ finals = self._bu_sandhi(word, finals) finals = self._yi_sandhi(word, finals) finals = self._neural_sandhi(word, pos, finals) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 35b97a93a..1431bc6d8 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -14,6 +14,7 @@ import os import re from operator import itemgetter +from pprint import pprint from typing import Dict from typing import List @@ -30,10 +31,11 @@ from pypinyin_dict.phrase_pinyin_data import large_pinyin from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon +from paddlespeech.t2s.frontend.polyphonic import Polyphonic from paddlespeech.t2s.frontend.rhy_prediction.rhy_predictor import RhyPredictor +from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer -from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor INITIALS = [ 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh', @@ -41,6 +43,9 @@ INITIALS = [ ] INITIALS += ['y', 'w', 'sp', 'spl', 'spn', 'sil'] +# 0 for None, 5 for neutral +TONES = ["0", "1", "2", "3", "4", "5"] + def intersperse(lst, item): result = [item] * (len(lst) * 2 + 1) @@ -49,34 +54,19 @@ def intersperse(lst, item): def insert_after_character(lst, item): + """ + inset `item` after finals. + """ result = [item] + for phone in lst: result.append(phone) if phone not in INITIALS: # finals has tones # assert phone[-1] in "12345" result.append(item) - return result - - -class Polyphonic(): - def __init__(self): - with open( - os.path.join( - os.path.dirname(os.path.abspath(__file__)), - 'polyphonic.yaml'), - 'r', - encoding='utf-8') as polyphonic_file: - # 解析yaml - polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader) - self.polyphonic_words = polyphonic_dict["polyphonic"] - def correct_pronunciation(self, word, pinyin): - # 词汇被词典收录则返回纠正后的读音 - if word in self.polyphonic_words.keys(): - pinyin = self.polyphonic_words[word] - # 否则返回原读音 - return pinyin + return result class Frontend(): @@ -85,10 +75,8 @@ class Frontend(): phone_vocab_path=None, tone_vocab_path=None, use_rhy=False): - self.mix_ssml_processor = MixTextProcessor() - self.tone_modifier = ToneSandhi() - self.text_normalizer = TextNormalizer() - self.punc = ":,;。?!“”‘’':,;.?!" + + self.punc = "、:,;。?!“”‘’':,;.?!" 
self.rhy_phns = ['sp1', 'sp2', 'sp3', 'sp4'] self.phrases_dict = { '开户行': [['ka1i'], ['hu4'], ['hang2']], @@ -108,28 +96,7 @@ class Frontend(): '嘞': [['lei5']], '掺和': [['chan1'], ['huo5']] } - self.use_rhy = use_rhy - if use_rhy: - self.rhy_predictor = RhyPredictor() - print("Rhythm predictor loaded.") - # g2p_model can be pypinyin and g2pM and g2pW - self.g2p_model = g2p_model - if self.g2p_model == "g2pM": - self.g2pM_model = G2pM() - self.pinyin2phone = generate_lexicon( - with_tone=True, with_erhua=False) - elif self.g2p_model == "g2pW": - # use pypinyin as backup for non polyphonic characters in g2pW - self._init_pypinyin() - self.corrector = Polyphonic() - self.g2pM_model = G2pM() - self.g2pW_model = G2PWOnnxConverter( - style='pinyin', enable_non_tradional_chinese=True) - self.pinyin2phone = generate_lexicon( - with_tone=True, with_erhua=False) - else: - self._init_pypinyin() self.must_erhua = { "小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿" } @@ -154,13 +121,51 @@ class Frontend(): for tone, id in tone_id: self.vocab_tones[tone] = int(id) + # SSML + self.mix_ssml_processor = MixTextProcessor() + # tone sandhi + self.tone_modifier = ToneSandhi() + # TN + self.text_normalizer = TextNormalizer() + + # prosody + self.use_rhy = use_rhy + if use_rhy: + self.rhy_predictor = RhyPredictor() + print("Rhythm predictor loaded.") + + # g2p + assert g2p_model in ('pypinyin', 'g2pM', 'g2pW') + self.g2p_model = g2p_model + if self.g2p_model == "g2pM": + self.g2pM_model = G2pM() + self.pinyin2phone = generate_lexicon( + with_tone=True, with_erhua=False) + elif self.g2p_model == "g2pW": + # use pypinyin as backup for non polyphonic characters in g2pW + self._init_pypinyin() + self.corrector = Polyphonic() + self.g2pM_model = G2pM() + self.g2pW_model = G2PWOnnxConverter( + style='pinyin', enable_non_tradional_chinese=True) + self.pinyin2phone = generate_lexicon( + with_tone=True, with_erhua=False) + else: + self._init_pypinyin() + def _init_pypinyin(self): + """ + Load pypinyin G2P module. + """ large_pinyin.load() load_phrases_dict(self.phrases_dict) # 调整字的拼音顺序 load_single_dict({ord(u'地'): u'de,di4'}) def _get_initials_finals(self, word: str) -> List[List[str]]: + """ + Get word initial and final by pypinyin or g2pM + """ initials = [] finals = [] if self.g2p_model == "pypinyin": @@ -171,11 +176,14 @@ class Frontend(): for c, v in zip(orig_initials, orig_finals): if re.match(r'i\d', v): if c in ['z', 'c', 's']: + # zi, ci, si v = re.sub('i', 'ii', v) elif c in ['zh', 'ch', 'sh', 'r']: + # zhi, chi, shi v = re.sub('i', 'iii', v) initials.append(c) finals.append(v) + elif self.g2p_model == "g2pM": pinyins = self.g2pM_model(word, tone=True, char_split=False) for pinyin in pinyins: @@ -192,58 +200,123 @@ class Frontend(): # If it's not pinyin (possibly punctuation) or no conversion is required initials.append(pinyin) finals.append(pinyin) + return initials, finals + def _merge_erhua(self, + initials: List[str], + finals: List[str], + word: str, + pos: str) -> List[List[str]]: + """ + Do erhub. 
+ """ + # fix er1 + for i, phn in enumerate(finals): + if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1': + finals[i] = 'er2' + + # 发音 + if word not in self.must_erhua and (word in self.not_erhua or + pos in {"a", "j", "nr"}): + return initials, finals + + # "……" 等情况直接返回 + if len(finals) != len(word): + return initials, finals + + assert len(finals) == len(word) + + # 不发音 + new_initials = [] + new_finals = [] + for i, phn in enumerate(finals): + if i == len(finals) - 1 and word[i] == "儿" and phn in { + "er2", "er5" + } and word[-2:] not in self.not_erhua and new_finals: + new_finals[-1] = new_finals[-1][:-1] + "r" + new_finals[-1][-1] + else: + new_initials.append(initials[i]) + new_finals.append(phn) + + return new_initials, new_finals + # if merge_sentences, merge all sentences into one phone sequence def _g2p(self, sentences: List[str], merge_sentences: bool=True, with_erhua: bool=True) -> List[List[str]]: + """ + Return: list of list phonemes. + [['w', 'o3', 'm', 'en2', 'sp'], ...] + """ segments = sentences phones_list = [] + + # split by punctuation for seg in segments: if self.use_rhy: seg = self.rhy_predictor._clean_text(seg) - phones = [] - # Replace all English words in the sentence + + # remove all English words in the sentence seg = re.sub('[a-zA-Z]+', '', seg) + + # add prosody mark if self.use_rhy: seg = self.rhy_predictor.get_prediction(seg) + + # [(word, pos), ...] seg_cut = psg.lcut(seg) - initials = [] - finals = [] + # fix wordseg bad case for sandhi seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut) + # 为了多音词获得更好的效果,这里采用整句预测 + phones = [] + initials = [] + finals = [] if self.g2p_model == "g2pW": try: + # undo prosody if self.use_rhy: seg = self.rhy_predictor._clean_text(seg) + + # g2p pinyins = self.g2pW_model(seg)[0] except Exception: - # g2pW采用模型采用繁体输入,如果有cover不了的简体词,采用g2pM预测 + # g2pW 模型采用繁体输入,如果有cover不了的简体词,采用g2pM预测 print("[%s] not in g2pW dict,use g2pM" % seg) pinyins = self.g2pM_model(seg, tone=True, char_split=False) + + # do prosody if self.use_rhy: rhy_text = self.rhy_predictor.get_prediction(seg) final_py = self.rhy_predictor.pinyin_align(pinyins, rhy_text) pinyins = final_py + pre_word_length = 0 for word, pos in seg_cut: sub_initials = [] sub_finals = [] now_word_length = pre_word_length + len(word) + + # skip english word if pos == 'eng': pre_word_length = now_word_length continue + word_pinyins = pinyins[pre_word_length:now_word_length] - # 矫正发音 + + # 多音字消歧 word_pinyins = self.corrector.correct_pronunciation( word, word_pinyins) + for pinyin, char in zip(word_pinyins, word): if pinyin is None: pinyin = char + pinyin = pinyin.replace("u:", "v") + if pinyin in self.pinyin2phone: initial_final_list = self.pinyin2phone[ pinyin].split(" ") @@ -257,28 +330,41 @@ class Frontend(): # If it's not pinyin (possibly punctuation) or no conversion is required sub_initials.append(pinyin) sub_finals.append(pinyin) + pre_word_length = now_word_length + # tone sandhi sub_finals = self.tone_modifier.modified_tone(word, pos, sub_finals) + # er hua if with_erhua: sub_initials, sub_finals = self._merge_erhua( sub_initials, sub_finals, word, pos) + initials.append(sub_initials) finals.append(sub_finals) # assert len(sub_initials) == len(sub_finals) == len(word) else: + # pypinyin, g2pM for word, pos in seg_cut: if pos == 'eng': + # skip english word continue + + # g2p sub_initials, sub_finals = self._get_initials_finals(word) + # tone sandhi sub_finals = self.tone_modifier.modified_tone(word, pos, sub_finals) + # er hua if with_erhua: sub_initials, sub_finals = 
self._merge_erhua( sub_initials, sub_finals, word, pos) + initials.append(sub_initials) finals.append(sub_finals) # assert len(sub_initials) == len(sub_finals) == len(word) + + # sum(iterable[, start]) initials = sum(initials, []) finals = sum(finals, []) @@ -287,111 +373,34 @@ class Frontend(): # we discriminate i, ii and iii if c and c not in self.punc: phones.append(c) + # replace punctuation by `sp` if c and c in self.punc: phones.append('sp') + if v and v not in self.punc and v not in self.rhy_phns: phones.append(v) - phones_list.append(phones) - if merge_sentences: - merge_list = sum(phones_list, []) - # rm the last 'sp' to avoid the noise at the end - # cause in the training data, no 'sp' in the end - if merge_list[-1] == 'sp': - merge_list = merge_list[:-1] - phones_list = [] - phones_list.append(merge_list) - return phones_list - def _split_word_to_char(self, words): - res = [] - for x in words: - res.append(x) - return res - - # if using ssml, have pingyin specified, assign pinyin to words - def _g2p_assign(self, - words: List[str], - pinyin_spec: List[str], - merge_sentences: bool=True) -> List[List[str]]: - phones_list = [] - initials = [] - finals = [] - - words = self._split_word_to_char(words[0]) - for pinyin, char in zip(pinyin_spec, words): - sub_initials = [] - sub_finals = [] - pinyin = pinyin.replace("u:", "v") - #self.pinyin2phone: is a dict with all pinyin mapped with sheng_mu yun_mu - if pinyin in self.pinyin2phone: - initial_final_list = self.pinyin2phone[pinyin].split(" ") - if len(initial_final_list) == 2: - sub_initials.append(initial_final_list[0]) - sub_finals.append(initial_final_list[1]) - elif len(initial_final_list) == 1: - sub_initials.append('') - sub_finals.append(initial_final_list[1]) - else: - # If it's not pinyin (possibly punctuation) or no conversion is required - sub_initials.append(pinyin) - sub_finals.append(pinyin) - initials.append(sub_initials) - finals.append(sub_finals) + phones_list.append(phones) - initials = sum(initials, []) - finals = sum(finals, []) - phones = [] - for c, v in zip(initials, finals): - # NOTE: post process for pypinyin outputs - # we discriminate i, ii and iii - if c and c not in self.punc: - phones.append(c) - if c and c in self.punc: - phones.append('sp') - if v and v not in self.punc and v not in self.rhy_phns: - phones.append(v) - phones_list.append(phones) + # merge split sub sentence into one sentence. 
if merge_sentences: + # sub sentence phonemes merge_list = sum(phones_list, []) # rm the last 'sp' to avoid the noise at the end # cause in the training data, no 'sp' in the end if merge_list[-1] == 'sp': merge_list = merge_list[:-1] + + # sentence phonemes phones_list = [] phones_list.append(merge_list) - return phones_list - def _merge_erhua(self, - initials: List[str], - finals: List[str], - word: str, - pos: str) -> List[List[str]]: - # fix er1 - for i, phn in enumerate(finals): - if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1': - finals[i] = 'er2' - if word not in self.must_erhua and (word in self.not_erhua or - pos in {"a", "j", "nr"}): - return initials, finals - # "……" 等情况直接返回 - if len(finals) != len(word): - return initials, finals - - assert len(finals) == len(word) - - new_initials = [] - new_finals = [] - for i, phn in enumerate(finals): - if i == len(finals) - 1 and word[i] == "儿" and phn in { - "er2", "er5" - } and word[-2:] not in self.not_erhua and new_finals: - new_finals[-1] = new_finals[-1][:-1] + "r" + new_finals[-1][-1] - else: - new_finals.append(phn) - new_initials.append(initials[i]) - return new_initials, new_finals + return phones_list def _p2id(self, phonemes: List[str]) -> np.ndarray: + """ + Phoneme to Index + """ # replace unk phone with sp phonemes = [ phn if phn in self.vocab_phones else "sp" for phn in phonemes @@ -400,6 +409,9 @@ class Frontend(): return np.array(phone_ids, np.int64) def _t2id(self, tones: List[str]) -> np.ndarray: + """ + Tone to Index. + """ # replace unk phone with sp tones = [tone if tone in self.vocab_tones else "0" for tone in tones] tone_ids = [self.vocab_tones[item] for item in tones] @@ -407,6 +419,9 @@ class Frontend(): def _get_phone_tone(self, phonemes: List[str], get_tone_ids: bool=False) -> List[List[str]]: + """ + Get tone from phonemes. + """ phones = [] tones = [] if get_tone_ids and self.vocab_tones: @@ -423,13 +438,14 @@ class Frontend(): -1] == 'r' and phone not in self.vocab_phones and phone[: -1] in self.vocab_phones: phones.append(phone[:-1]) - phones.append("er") tones.append(tone) + phones.append("er") tones.append("2") else: phones.append(phone) tones.append(tone) else: + # initals with 0 tone. 
phones.append(full_phone) tones.append('0') else: @@ -443,6 +459,7 @@ class Frontend(): phones.append("er2") else: phones.append(phone) + return phones, tones def get_phonemes(self, @@ -451,10 +468,16 @@ class Frontend(): with_erhua: bool=True, robot: bool=False, print_info: bool=False) -> List[List[str]]: + """ + Main function to do G2P + """ + # TN & Text Segmentation sentences = self.text_normalizer.normalize(sentence) + # Prosody & WS & g2p & tone sandhi phonemes = self._g2p( sentences, merge_sentences=merge_sentences, with_erhua=with_erhua) - # change all tones to `1` + + # simulate robot pronunciation, change all tones to `1` if robot: new_phonemes = [] for sentence in phonemes: @@ -466,6 +489,7 @@ class Frontend(): new_sentence.append(item) new_phonemes.append(new_sentence) phonemes = new_phonemes + if print_info: print("----------------------------") print("text norm results:") @@ -476,25 +500,104 @@ class Frontend(): print("----------------------------") return phonemes - #@an added for ssml pinyin + def _split_word_to_char(self, words): + res = [] + for x in words: + res.append(x) + return res + + # if using ssml, have pingyin specified, assign pinyin to words + def _g2p_assign(self, + words: List[str], + pinyin_spec: List[str], + merge_sentences: bool=True) -> List[List[str]]: + """ + Replace phoneme by SSML + """ + phones_list = [] + initials = [] + finals = [] + + # to charactor list + words = self._split_word_to_char(words[0]) + + for pinyin, char in zip(pinyin_spec, words): + sub_initials = [] + sub_finals = [] + pinyin = pinyin.replace("u:", "v") + + #self.pinyin2phone: is a dict with all pinyin mapped with sheng_mu yun_mu + if pinyin in self.pinyin2phone: + initial_final_list = self.pinyin2phone[pinyin].split(" ") + if len(initial_final_list) == 2: + sub_initials.append(initial_final_list[0]) + sub_finals.append(initial_final_list[1]) + elif len(initial_final_list) == 1: + sub_initials.append('') + sub_finals.append(initial_final_list[1]) + else: + # If it's not pinyin (possibly punctuation) or no conversion is required + sub_initials.append(pinyin) + sub_finals.append(pinyin) + + initials.append(sub_initials) + finals.append(sub_finals) + + initials = sum(initials, []) + finals = sum(finals, []) + + phones = [] + for c, v in zip(initials, finals): + # c for consonant, v for vowel + # NOTE: post process for pypinyin outputs + # we discriminate i, ii and iii + if c and c not in self.punc: + phones.append(c) + # replace punc to `sp` + if c and c in self.punc: + phones.append('sp') + if v and v not in self.punc and v not in self.rhy_phns: + phones.append(v) + phones_list.append(phones) + + if merge_sentences: + merge_list = sum(phones_list, []) + # rm the last 'sp' to avoid the noise at the end + # cause in the training data, no 'sp' in the end + if merge_list[-1] == 'sp': + merge_list = merge_list[:-1] + phones_list = [] + phones_list.append(merge_list) + + return phones_list + def get_phonemes_ssml(self, ssml_inputs: list, merge_sentences: bool=True, with_erhua: bool=True, robot: bool=False, print_info: bool=False) -> List[List[str]]: + """ + Main function to do G2P with SSML support. 
+ """ all_phonemes = [] for word_pinyin_item in ssml_inputs: phonemes = [] + + # ['你喜欢', []] -> 你喜欢 [] sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item) + + # TN & Text Segmentation sentences = self.text_normalizer.normalize(sentence) + if len(pinyin_spec) == 0: + # g2p word w/o specified phonemes = self._g2p( sentences, merge_sentences=merge_sentences, with_erhua=with_erhua) else: - # phonemes should be pinyin_spec + # word phonemes specified by phonemes = self._g2p_assign( sentences, pinyin_spec, merge_sentences=merge_sentences) @@ -512,17 +615,24 @@ class Frontend(): new_phonemes.append(new_sentence) all_phonemes = new_phonemes + if merge_sentences: + all_phonemes = [sum(all_phonemes, [])] + if print_info: print("----------------------------") print("text norm results:") print(sentences) print("----------------------------") print("g2p results:") - print(all_phonemes[0]) + print(all_phonemes) print("----------------------------") - return [sum(all_phonemes, [])] + + return all_phonemes def add_sp_if_no(self, phonemes): + """ + Prosody mark #4 added at sentence end. + """ if not phonemes[-1][-1].startswith('sp'): phonemes[-1].append('sp4') return phonemes @@ -542,8 +652,11 @@ class Frontend(): merge_sentences=merge_sentences, print_info=print_info, robot=robot) + + # add #4 for sentence end. if self.use_rhy: phonemes = self.add_sp_if_no(phonemes) + result = {} phones = [] tones = [] @@ -551,28 +664,33 @@ class Frontend(): temp_tone_ids = [] for part_phonemes in phonemes: + phones, tones = self._get_phone_tone( part_phonemes, get_tone_ids=get_tone_ids) + if add_blank: phones = insert_after_character(phones, blank_token) + if tones: tone_ids = self._t2id(tones) if to_tensor: tone_ids = paddle.to_tensor(tone_ids) temp_tone_ids.append(tone_ids) + if phones: phone_ids = self._p2id(phones) # if use paddle.to_tensor() in onnxruntime, the first time will be too low if to_tensor: phone_ids = paddle.to_tensor(phone_ids) temp_phone_ids.append(phone_ids) + if temp_tone_ids: result["tone_ids"] = temp_tone_ids if temp_phone_ids: result["phone_ids"] = temp_phone_ids + return result - # @an added for ssml def get_input_ids_ssml( self, sentence: str, @@ -584,12 +702,15 @@ class Frontend(): blank_token: str="", to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: - l_inputs = MixTextProcessor.get_pinyin_split(sentence) + # split setence by SSML tag. 
+ texts = MixTextProcessor.get_pinyin_split(sentence) + phonemes = self.get_phonemes_ssml( - l_inputs, + texts, merge_sentences=merge_sentences, print_info=print_info, robot=robot) + result = {} phones = [] tones = [] @@ -599,21 +720,26 @@ class Frontend(): for part_phonemes in phonemes: phones, tones = self._get_phone_tone( part_phonemes, get_tone_ids=get_tone_ids) + if add_blank: phones = insert_after_character(phones, blank_token) + if tones: tone_ids = self._t2id(tones) if to_tensor: tone_ids = paddle.to_tensor(tone_ids) temp_tone_ids.append(tone_ids) + if phones: phone_ids = self._p2id(phones) # if use paddle.to_tensor() in onnxruntime, the first time will be too low if to_tensor: phone_ids = paddle.to_tensor(phone_ids) temp_phone_ids.append(phone_ids) + if temp_tone_ids: result["tone_ids"] = temp_tone_ids if temp_phone_ids: result["phone_ids"] = temp_phone_ids + return result diff --git a/runtime/README.md b/runtime/README.md index 553bb29ad..0e9c243e9 100644 --- a/runtime/README.md +++ b/runtime/README.md @@ -2,7 +2,7 @@ ## Environment We develop under: -* python - 3.7 +* python - >=3.8 * docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2-gpu-cuda10.2-cudnn7` * os - Ubuntu 16.04.7 LTS * gcc/g++/gfortran - 8.2.0 @@ -98,7 +98,7 @@ please install paddlepaddle >= 2.4rc ``` -cd $YOUR_ENV_PATH/lib/python3.7/site-packages/paddle/fluid +cd $YOUR_ENV_PATH/lib/python3.8/site-packages/paddle/fluid patchelf --set-soname libpaddle.so libpaddle.so ``` diff --git a/runtime/tools/venv.sh b/runtime/tools/venv.sh index 3952988c6..2aa7e5095 100755 --- a/runtime/tools/venv.sh +++ b/runtime/tools/venv.sh @@ -1,5 +1,5 @@ #!/bin/bash set -ex -PYTHON=python3.7 +PYTHON=python3.8 test -d venv || virtualenv -p ${PYTHON} venv diff --git a/setup.py b/setup.py index 07b411bd0..af7c4dc3d 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,8 @@ base = [ "hyperpyyaml", "inflect", "jsonlines", + # paddleaudio align with librosa==0.8.1, which need numpy==1.23.x + "numpy==1.23.5", "librosa==0.8.1", "scipy>=1.4.0", "loguru", @@ -260,6 +262,7 @@ setup_info = dict( long_description=read("README.md"), long_description_content_type="text/markdown", keywords=[ + "SSL" "speech", "asr", "tts", @@ -268,12 +271,19 @@ setup_info = dict( "text frontend", "MFA", "paddlepaddle", + "paddleaudio", + "streaming asr", + "streaming tts", "beam search", "ctcdecoder", "deepspeech2", + "wav2vec2", + "hubert", + "wavlm", "transformer", "conformer", "fastspeech2", + "hifigan", "gan vocoders", ], python_requires='>=3.7', diff --git a/tests/unit/tts/test_enfrontend.py b/tests/unit/tts/test_enfrontend.py new file mode 100644 index 000000000..4f8c49305 --- /dev/null +++ b/tests/unit/tts/test_enfrontend.py @@ -0,0 +1,28 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from paddlespeech.t2s.frontend.en_frontend import English as EnFrontend + +if __name__ == '__main__': + + fe = EnFrontend() + + text = "AI for Sceience" + phonemes = fe.phoneticize(text) + print(text) + print(phonemes) + + text = "eight" + phonemes = fe.phoneticize(text) + print(text) + print(phonemes) diff --git a/tests/unit/tts/test_mixfrontend.py b/tests/unit/tts/test_mixfrontend.py new file mode 100644 index 000000000..5751dd2a7 --- /dev/null +++ b/tests/unit/tts/test_mixfrontend.py @@ -0,0 +1,444 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re +import tempfile + +from paddlespeech.t2s.frontend.mix_frontend import MixFrontend + +# mix zh & en phonemes +phone_id_str = """ + 0 + 1 +AA0 2 +AA1 3 +AA2 4 +AE0 5 +AE1 6 +AE2 7 +AH0 8 +AH1 9 +AH2 10 +AO0 11 +AO1 12 +AO2 13 +AW0 14 +AW1 15 +AW2 16 +AY0 17 +AY1 18 +AY2 19 +B 20 +CH 21 +D 22 +DH 23 +EH0 24 +EH1 25 +EH2 26 +ER0 27 +ER1 28 +ER2 29 +EY0 30 +EY1 31 +EY2 32 +F 33 +G 34 +HH 35 +IH0 36 +IH1 37 +IH2 38 +IY0 39 +IY1 40 +IY2 41 +JH 42 +K 43 +L 44 +M 45 +N 46 +NG 47 +OW0 48 +OW1 49 +OW2 50 +OY0 51 +OY1 52 +OY2 53 +P 54 +R 55 +S 56 +SH 57 +T 58 +TH 59 +UH0 60 +UH1 61 +UH2 62 +UW0 63 +UW1 64 +UW2 65 +V 66 +W 67 +Y 68 +Z 69 +ZH 70 +a1 71 +a2 72 +a3 73 +a4 74 +a5 75 +ai1 76 +ai2 77 +ai3 78 +ai4 79 +ai5 80 +air2 81 +air3 82 +air4 83 +an1 84 +an2 85 +an3 86 +an4 87 +an5 88 +ang1 89 +ang2 90 +ang3 91 +ang4 92 +ang5 93 +angr2 94 +angr4 95 +anr1 96 +anr3 97 +anr4 98 +ao1 99 +ao2 100 +ao3 101 +ao4 102 +ao5 103 +aor1 104 +aor3 105 +aor4 106 +aor5 107 +ar2 108 +ar3 109 +ar4 110 +ar5 111 +b 112 +c 113 +ch 114 +d 115 +e1 116 +e2 117 +e3 118 +e4 119 +e5 120 +ei1 121 +ei2 122 +ei3 123 +ei4 124 +ei5 125 +eir4 126 +en1 127 +en2 128 +en3 129 +en4 130 +en5 131 +eng1 132 +eng2 133 +eng3 134 +eng4 135 +eng5 136 +engr4 137 +enr1 138 +enr2 139 +enr3 140 +enr4 141 +enr5 142 +er1 143 +er2 144 +er3 145 +er4 146 +er5 147 +f 148 +g 149 +h 150 +i1 151 +i2 152 +i3 153 +i4 154 +i5 155 +ia1 156 +ia2 157 +ia3 158 +ia4 159 +ia5 160 +ian1 161 +ian2 162 +ian3 163 +ian4 164 +ian5 165 +iang1 166 +iang2 167 +iang3 168 +iang4 169 +iang5 170 +iangr4 171 +ianr1 172 +ianr2 173 +ianr3 174 +ianr4 175 +ianr5 176 +iao1 177 +iao2 178 +iao3 179 +iao4 180 +iao5 181 +iaor1 182 +iaor2 183 +iaor3 184 +iaor4 185 +iar1 186 +iar3 187 +iar4 188 +ie1 189 +ie2 190 +ie3 191 +ie4 192 +ie5 193 +ii1 194 +ii2 195 +ii3 196 +ii4 197 +ii5 198 +iii1 199 +iii2 200 +iii3 201 +iii4 202 +iii5 203 +iiir1 204 +iiir4 205 +iir2 206 +in1 207 +in2 208 +in3 209 +in4 210 +in5 211 +ing1 212 +ing2 213 +ing3 214 +ing4 215 +ing5 216 +ingr1 217 +ingr2 218 +ingr3 219 +ingr4 220 +inr1 221 +inr4 222 +io1 223 +io3 224 +io5 225 +iong1 226 +iong2 227 +iong3 228 +iong4 229 +iong5 230 +iou1 231 +iou2 232 +iou3 233 +iou4 234 +iou5 235 +iour1 236 +iour2 237 +iour3 238 +iour4 239 +ir1 240 +ir2 241 +ir3 242 +ir4 243 +ir5 244 +j 245 +k 246 +l 247 +m 248 +n 249 +o1 250 +o2 251 +o3 252 +o4 253 +o5 254 +ong1 255 +ong2 256 +ong3 257 +ong4 258 +ong5 259 +ongr4 260 
+or2 261 +ou1 262 +ou2 263 +ou3 264 +ou4 265 +ou5 266 +our2 267 +our3 268 +our4 269 +our5 270 +p 271 +q 272 +r 273 +s 274 +sh 275 +sil 276 +sp 277 +spl 278 +spn 279 +t 280 +u1 281 +u2 282 +u3 283 +u4 284 +u5 285 +ua1 286 +ua2 287 +ua3 288 +ua4 289 +ua5 290 +uai1 291 +uai2 292 +uai3 293 +uai4 294 +uai5 295 +uair4 296 +uan1 297 +uan2 298 +uan3 299 +uan4 300 +uan5 301 +uang1 302 +uang2 303 +uang3 304 +uang4 305 +uang5 306 +uangr4 307 +uanr1 308 +uanr2 309 +uanr3 310 +uanr4 311 +uanr5 312 +uar1 313 +uar2 314 +uar4 315 +uei1 316 +uei2 317 +uei3 318 +uei4 319 +uei5 320 +ueir1 321 +ueir2 322 +ueir3 323 +ueir4 324 +uen1 325 +uen2 326 +uen3 327 +uen4 328 +uen5 329 +ueng1 330 +ueng2 331 +ueng3 332 +ueng4 333 +uenr1 334 +uenr2 335 +uenr3 336 +uenr4 337 +uo1 338 +uo2 339 +uo3 340 +uo4 341 +uo5 342 +uor1 343 +uor2 344 +uor3 345 +uor5 346 +ur1 347 +ur2 348 +ur3 349 +ur4 350 +ur5 351 +v1 352 +v2 353 +v3 354 +v4 355 +v5 356 +van1 357 +van2 358 +van3 359 +van4 360 +van5 361 +vanr1 362 +vanr2 363 +vanr3 364 +vanr4 365 +ve1 366 +ve2 367 +ve3 368 +ve4 369 +ve5 370 +ver3 371 +ver4 372 +vn1 373 +vn2 374 +vn3 375 +vn4 376 +vn5 377 +vnr2 378 +vr3 379 +x 380 +z 381 +zh 382 +, 383 +. 384 +? 385 +! 386 + 387 +""" + +if __name__ == '__main__': + with tempfile.NamedTemporaryFile(mode='wt') as f: + phone_ids = phone_id_str.split() + for phone, id in zip(phone_ids[::2], phone_ids[1::2]): + f.write(f"{phone} {id}") + f.write('\n') + f.flush() + + frontend = MixFrontend(phone_vocab_path=f.name) + + text = "hello, 我爱北京天安们,what about you." + print(text) + # [('hello, ', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')] + segs = frontend.split_by_lang(text) + print(segs) + + text = "hello?!!我爱北京天安们,what about you." + print(text) + # [('hello?!!', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')] + segs = frontend.split_by_lang(text) + print(segs) + + text = " hello,我爱北京天安们,what about you." + print(text) + # [(' hello,', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')] + segs = frontend.split_by_lang(text) + print(segs) + + # 对于SSML的xml标记处理不好。需要先解析SSML,后处理中英的划分。 + text = "我们的声学模型使用了 Fast Speech Two。前浪在沙滩上,沙滩上倒了一堆。 想象干干的树干了, 里面有个干尸,不知是被谁死的。" + print(text) + # [('', 'en'), ('我们的声学模型使用了 ', 'zh'), ('Fast Speech Two。', 'en'), ('前浪<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒', 'en'), ('在沙滩上,沙滩上倒了一堆<', 'zh'), ("say-as pinyin='tu3'>", 'en'), ('土。 ', 'en'), ('想象<', 'zh'), ("say-as pinyin='gan1 gan1'>", 'en'), ('干干', 'en'), ('的树干<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒', 'en'), ('了, 里面有个干尸,不知是被谁<', 'zh'), ("say-as pinyin='gan4'>", 'en'), ('干', 'en'), ('死的。', 'en')] + segs = frontend.split_by_lang(text) + print(segs) diff --git a/tests/unit/tts/test_ssml.py b/tests/unit/tts/test_ssml.py new file mode 100644 index 000000000..4c3e9d538 --- /dev/null +++ b/tests/unit/tts/test_ssml.py @@ -0,0 +1,83 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor + +if __name__ == '__main__': + text = "你好吗,我们的声学模型使用了 Fast Speech Two。前浪在沙滩上,沙滩上倒了一堆。 想象干干的树干了, 里面有个干尸,不知是被谁死的。thank you." + + # SSML: 13 + # 0 ['你好吗,', []] + # 1 ['我们的声学模型使用了FastSpeechTwo。前浪', []] + # 2 ['倒', ['dao3']] + # 3 ['在沙滩上,沙滩上倒了一堆', []] + # 4 ['土', ['tu3']] + # 5 ['。想象', []] + # 6 ['干干', ['gan1', 'gan1']] + # 7 ['的树干', []] + # 8 ['倒', ['dao3']] + # 9 ['了,里面有个干尸,不知是被谁', []] + # 10 ['干', ['gan4']] + # 11 ['死的。', []] + # 12 ['thank you.', []] + inputs = MixTextProcessor.get_pinyin_split(text) + print(f"SSML get_pinyin_split: {len(inputs)}") + for i, sub in enumerate(inputs): + print(i, sub) + print() + + # SSML get_dom_split: 13 + # 0 你好吗, + # 1 我们的声学模型使用了 Fast Speech Two。前浪 + # 2 + # 3 在沙滩上,沙滩上倒了一堆 + # 4 + # 5 。 想象 + # 6 干干 + # 7 的树干 + # 8 + # 9 了, 里面有个干尸,不知是被谁 + # 10 + # 11 死的。 + # 12 thank you. + inputs = MixTextProcessor.get_dom_split(text) + print(f"SSML get_dom_split: {len(inputs)}") + for i, sub in enumerate(inputs): + print(i, sub) + print() + + # SSML object.get_pinyin_split: 246 + # 我们的声学模型使用了 Fast Speech Two。前浪在沙滩上,沙滩上倒了一堆。 想象干干的树干了, 里面有个干尸,不知是被谁死的。 + outs = MixTextProcessor().get_xml_content(text) + print(f"SSML object.get_pinyin_split: {len(outs)}") + print(outs) + print() + + # SSML object.get_content_split: 30 你好吗, + # 1 我们的声学模型使用了 Fast Speech Two。前浪在沙滩上,沙滩上倒了一堆。 想象干干的树干 + # 倒了, 里面有个干尸,不知是被谁死的。 + # 2 thank you. + outs = MixTextProcessor().get_content_split(text) + print(f"SSML object.get_content_split: {len(outs)}") + for i, sub in enumerate(outs): + print(i, sub) + print() + + import json + import xmltodict + text = "我们的声学模型使用了 Fast Speech Two。前浪在沙滩上,沙滩上倒了一堆。 想象干干的树干了, 里面有个干尸,不知是被谁死的。" + ssml = xmltodict.parse(text) + print(json.dumps(ssml)) + print(ssml['speak'].keys()) + print(ssml['speak']['#text']) + print(ssml['speak']['say-as']) diff --git a/tools/Makefile b/tools/Makefile index a5a4485da..c6c667cd0 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -1,5 +1,5 @@ SHELL:= /bin/bash -PYTHON:= python3.7 +PYTHON:= python3.8 CXX ?= g++ CC ?= gcc # used for sph2pipe
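
A note on the frontend changes above: the reworked `_merge_erhua` folds a trailing 儿 into the previous syllable's final (e.g. `ua1` becomes `uar1`) instead of keeping a separate `er` phone, except for words and POS tags that should not be erhua-ised. Below is a minimal standalone sketch of that transformation, not the frontend's actual method; the `MUST_ERHUA` / `NOT_ERHUA` sets are assumed placeholder entries standing in for the frontend's `self.must_erhua` / `self.not_erhua` word lists.

```python
# Sketch of the erhua merge behaviour shown in the _merge_erhua hunk above.
from typing import List, Tuple

MUST_ERHUA = {"小院儿"}       # assumed example entries
NOT_ERHUA = {"女儿", "婴儿"}   # assumed example entries

def merge_erhua(initials: List[str], finals: List[str],
                word: str, pos: str) -> Tuple[List[str], List[str]]:
    # er1 is normalised to er2 before merging
    if finals and word[-1] == "儿" and finals[-1] == "er1":
        finals[-1] = "er2"

    # keep the standalone 儿 for words / POS tags that are not erhua
    if word not in MUST_ERHUA and (word in NOT_ERHUA or pos in {"a", "j", "nr"}):
        return initials, finals
    # e.g. "……": no per-character pinyin, return unchanged
    if len(finals) != len(word):
        return initials, finals

    new_initials, new_finals = [], []
    for i, phn in enumerate(finals):
        if (i == len(finals) - 1 and word[i] == "儿"
                and phn in {"er2", "er5"}
                and word[-2:] not in NOT_ERHUA and new_finals):
            # fold 儿 into the previous final: "ua1" -> "uar1"
            new_finals[-1] = new_finals[-1][:-1] + "r" + new_finals[-1][-1]
        else:
            new_initials.append(initials[i])
            new_finals.append(phn)
    return new_initials, new_finals

print(merge_erhua(["h", ""], ["ua1", "er5"], "花儿", "n"))
# (['h'], ['uar1'])
```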
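The `_p2id` helper maps any phone missing from the vocabulary to `sp`, and `_g2p` with `merge_sentences=True` concatenates the per-sentence phone lists and drops a trailing `sp`, since the training data has no sentence-final `sp`. A toy sketch of both behaviours, using an assumed miniature vocabulary rather than the real phone table:

```python
# Sketch of the vocab lookup and sentence-merge behaviour in the diff above.
import numpy as np

vocab = {"sp": 0, "n": 1, "i3": 2, "h": 3, "ao3": 4}  # toy vocabulary

def p2id(phonemes):
    # unknown phones fall back to "sp", as in _p2id
    phonemes = [p if p in vocab else "sp" for p in phonemes]
    return np.array([vocab[p] for p in phonemes], np.int64)

def merge(phones_list):
    merged = sum(phones_list, [])        # concat sub-sentence phone lists
    if merged and merged[-1] == "sp":    # rm the trailing 'sp'
        merged = merged[:-1]
    return [merged]

phones_list = [["n", "i3", "h", "ao3", "sp"]]
print(merge(phones_list))               # [['n', 'i3', 'h', 'ao3']]
print(p2id(merge(phones_list)[0]))      # [1 2 3 4]
```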
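The new `tests/unit/tts/test_mixfrontend.py` builds its mixed zh/en phone vocabulary from `phone_id_str`, one `<phone> <id>` pair per line, and passes the resulting file to `MixFrontend(phone_vocab_path=...)`. A small sketch of writing and re-reading that file format; the three entries below are just an assumed excerpt of the full table:

```python
# Sketch of the "<phone> <id>" vocab file format used by the new test.
import tempfile

pairs = [("AA0", 2), ("a1", 71), ("sp", 277)]  # assumed excerpt

with tempfile.NamedTemporaryFile(mode="w+t", suffix=".txt") as f:
    for phone, idx in pairs:
        f.write(f"{phone} {idx}\n")
    f.flush()
    f.seek(0)

    vocab = {}
    for line in f:
        phone, idx = line.split()
        vocab[phone] = int(idx)

print(vocab)   # {'AA0': 2, 'a1': 71, 'sp': 277}
```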