From 880c172db7a6e4e2e0b0f2c3a0b3cdea512e5f0a Mon Sep 17 00:00:00 2001
From: liangym <34430015+lym0302@users.noreply.github.com>
Date: Tue, 21 Mar 2023 13:04:28 +0800
Subject: [PATCH] [TTS] add svs frontend (#3062)

---
 examples/opencpop/svs1/README.md              | 110 ++++-
 examples/opencpop/svs1/README_cn.md           | 107 ++++-
 .../opencpop/svs1/local/pinyin_to_phone.txt   | 418 ++++++++++++++++++
 .../opencpop/svs1/local/synthesize_e2e.sh     |  53 +++
 examples/opencpop/svs1/run.sh                 |   5 +
 paddlespeech/t2s/exps/sentences_sing.txt      |   2 +
 paddlespeech/t2s/exps/syn_utils.py            |  60 ++-
 paddlespeech/t2s/exps/synthesize_e2e.py       |  48 +-
 paddlespeech/t2s/frontend/sing_frontend.py    | 175 ++++++++
 9 files changed, 957 insertions(+), 21 deletions(-)
 create mode 100644 examples/opencpop/svs1/local/pinyin_to_phone.txt
 create mode 100755 examples/opencpop/svs1/local/synthesize_e2e.sh
 create mode 100644 paddlespeech/t2s/exps/sentences_sing.txt
 create mode 100644 paddlespeech/t2s/frontend/sing_frontend.py

diff --git a/examples/opencpop/svs1/README.md b/examples/opencpop/svs1/README.md
index 2e28a6e61..1600d0c76 100644
--- a/examples/opencpop/svs1/README.md
+++ b/examples/opencpop/svs1/README.md
@@ -70,7 +70,7 @@ Train a FastSpeech2 model.
 
 optional arguments:
   -h, --help            show this help message and exit
-  --config CONFIG       fastspeech2 config file.
+  --config CONFIG       diffsinger config file.
   --train-metadata TRAIN_METADATA
                         training data.
   --dev-metadata DEV_METADATA
@@ -126,6 +126,7 @@ optional arguments:
   -h, --help            show this help message and exit
   --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
                        Choose acoustic model type of tts task.
+                        {diffsinger_opencpop} Choose acoustic model type of svs task.
   --am_config AM_CONFIG
                         Config of acoustic model.
   --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
@@ -141,6 +142,7 @@ optional arguments:
                         whether training voice cloning model.
   --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
                        Choose vocoder type of tts task.
+                        {pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task.
   --voc_config VOC_CONFIG
                         Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
@@ -151,9 +153,84 @@ optional arguments:
                         test metadata.
   --output_dir OUTPUT_DIR
                         output dir.
-  --speech-stretchs     mel min and max values file.
+  --speech-stretchs SPEECH_STRETCHS
+                        The min and max values of the mel spectrum, used in the diffusion module of diffsinger.
 ```
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveforms from a text file.
+`local/pinyin_to_phone.txt` comes from the README of the opencpop dataset and defines the mapping from pinyin to phonemes used in opencpop.
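+Each line of that file maps one pinyin syllable to its space-separated phonemes, separated by `|`; for example (lines taken from the file itself):
+```text
+ba|b a
+zhuang|zh uang
+```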
+
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize_e2e.py [-h]
+                         [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
+                         [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
+                         [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
+                         [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
+                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
+                         [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
+                         [--voc_stat VOC_STAT] [--lang LANG]
+                         [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
+                         [--text TEXT] [--output_dir OUTPUT_DIR]
+                         [--pinyin_phone PINYIN_PHONE]
+                         [--speech_stretchs SPEECH_STRETCHS]
+
+Synthesize with acoustic model & vocoder
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
+                        Choose acoustic model type of tts task.
+                        {diffsinger_opencpop} Choose acoustic model type of svs task.
+  --am_config AM_CONFIG
+                        Config of acoustic model.
+  --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
+  --am_stat AM_STAT     mean and standard deviation used to normalize
+                        spectrogram when training acoustic model.
+  --phones_dict PHONES_DICT
+                        phone vocabulary file.
+  --speaker_dict SPEAKER_DICT
+                        speaker id map file.
+  --spk_id SPK_ID       spk id for multi speaker acoustic model
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
+                        Choose vocoder type of tts task.
+                        {pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task.
+  --voc_config VOC_CONFIG
+                        Config of voc.
+  --voc_ckpt VOC_CKPT   Checkpoint file of voc.
+  --voc_stat VOC_STAT   mean and standard deviation used to normalize
+                        spectrogram when training voc.
+  --lang LANG           {zh, en, mix, canton} Choose language type of tts task.
+                        {sing} Choose language type of svs task.
+  --inference_dir INFERENCE_DIR
+                        dir to save inference models
+  --ngpu NGPU           if ngpu == 0, use cpu.
+  --text TEXT           text to synthesize file, a 'utt_id sentence' pair per line for tts task.
+                        A '{utt_id input_type (is word) text notes note_durs}' or '{utt_id input_type (is phoneme) phones notes note_durs is_slurs}' pair per line for svs task.
+  --output_dir OUTPUT_DIR
+                        output dir.
+  --pinyin_phone PINYIN_PHONE
+                        pinyin to phone map file, used by sing_frontend.
+  --speech_stretchs SPEECH_STRETCHS
+                        The min and max values of the mel spectrum, used in the diffusion module of diffsinger.
+```
+1. `--am` is the acoustic model type, with the format {model_name}_{dataset}.
+2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for the acoustic model, which correspond to the 4 files in the diffsinger pretrained model.
+3. `--voc` is the vocoder type, with the format {model_name}_{dataset}.
+4. `--voc_config`, `--voc_ckpt` and `--voc_stat` are arguments for the vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+5. `--lang` is the language: `zh`, `en`, `mix` and `canton` for the tts task, `sing` for the svs task.
+6. `--text` is the text file, which contains sentences to synthesize.
+7. `--output_dir` is the directory to save the synthesized audio files.
+8. `--ngpu` is the number of gpus to use; if ngpu == 0, use cpu.
+9. `--inference_dir` is the directory to save static models. If this argument is omitted, the model will not be exported and saved as a static model.
+10. `--pinyin_phone` is the pinyin-to-phone mapping file used by sing_frontend.
+11. `--speech_stretchs` is the file with the min and max values of the mel spectrum, used in the diffusion module of diffsinger.
+
+Note: At present, the diffsinger model does not support dynamic-to-static conversion, so do not add `--inference_dir`.
+
 ## Pretrained Model
 Pretrained DiffSinger model:
@@ -165,10 +242,35 @@ diffsinger_opencpop_ckpt_1.4.0.zip
 ├── default.yaml              # default config used to train diffsinger
 ├── energy_stats.npy          # statistics used to normalize energy when training diffsinger if norm is needed
 ├── phone_id_map.txt          # phone vocabulary file when training diffsinger
+├── pinyin_to_phone.txt       # pinyin-to-phoneme mapping file when training diffsinger
 ├── pitch_stats.npy           # statistics used to normalize pitch when training diffsinger if norm is needed
 ├── snapshot_iter_160000.pdz  # model parameters of diffsinger
 ├── speech_stats.npy          # statistics used to normalize mel when training diffsinger if norm is needed
-└── speech_stretchs.npy       # Min and max values to use for mel spectral stretching before training diffusion
+└── speech_stretchs.npy       # min and max values to use for mel spectral stretching before training diffusion
+
 ```
+
+You can use the following script to synthesize the sentences in `${BIN_DIR}/../sentences_sing.txt` using the pretrained diffsinger and parallel wavegan models.
+
+```bash
+source path.sh
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../synthesize_e2e.py \
+  --am=diffsinger_opencpop \
+  --am_config=diffsinger_opencpop_ckpt_1.4.0/default.yaml \
+  --am_ckpt=diffsinger_opencpop_ckpt_1.4.0/snapshot_iter_160000.pdz \
+  --am_stat=diffsinger_opencpop_ckpt_1.4.0/speech_stats.npy \
+  --voc=pwgan_opencpop \
+  --voc_config=pwgan_opencpop_ckpt_1.4.0/default.yaml \
+  --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
+  --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
+  --lang=sing \
+  --text=${BIN_DIR}/../sentences_sing.txt \
+  --output_dir=exp/default/test_e2e \
+  --phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
+  --pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \
+  --speech_stretchs=diffsinger_opencpop_ckpt_1.4.0/speech_stretchs.npy
+
+```
-At present, the text frontend is not perfect, and the method of `synthesize_e2e` is not supported for synthesizing audio. Try using `synthesize` first.
\ No newline at end of file
diff --git a/examples/opencpop/svs1/README_cn.md b/examples/opencpop/svs1/README_cn.md
index 19908fd60..1435b42ec 100644
--- a/examples/opencpop/svs1/README_cn.md
+++ b/examples/opencpop/svs1/README_cn.md
@@ -73,7 +73,7 @@ Train a DiffSinger model.
 
 optional arguments:
   -h, --help            show this help message and exit
-  --config CONFIG       fastspeech2 config file.
+  --config CONFIG       diffsinger config file.
   --train-metadata TRAIN_METADATA
                         training data.
   --dev-metadata DEV_METADATA
@@ -131,6 +131,7 @@ optional arguments:
   -h, --help            show this help message and exit
   --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
                        Choose acoustic model type of tts task.
+                        {diffsinger_opencpop} Choose acoustic model type of svs task.
   --am_config AM_CONFIG
                         Config of acoustic model.
  --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
@@ -146,6 +147,7 @@ optional arguments:
                         whether training voice cloning model.
   --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
                        Choose vocoder type of tts task.
+                        {pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task.
   --voc_config VOC_CONFIG
                         Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
@@ -156,9 +158,85 @@ optional arguments:
                         test metadata.
   --output_dir OUTPUT_DIR
                         output dir.
-  --speech-stretchs     mel min and max values file.
+  --speech-stretchs SPEECH_STRETCHS
+                        The min and max values of the mel spectrum, used in the diffusion module of diffsinger.
 ```
+`./local/synthesize_e2e.sh` 调用 `${BIN_DIR}/../synthesize_e2e.py`,即可从文本文件中合成波形。
+`local/pinyin_to_phone.txt` 来源于 opencpop 数据集中的 README,表示 opencpop 中拼音到音素的映射。
+
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize_e2e.py [-h]
+                         [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
+                         [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
+                         [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
+                         [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
+                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
+                         [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
+                         [--voc_stat VOC_STAT] [--lang LANG]
+                         [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
+                         [--text TEXT] [--output_dir OUTPUT_DIR]
+                         [--pinyin_phone PINYIN_PHONE]
+                         [--speech_stretchs SPEECH_STRETCHS]
+
+Synthesize with acoustic model & vocoder
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
+                        Choose acoustic model type of tts task.
+                        {diffsinger_opencpop} Choose acoustic model type of svs task.
+  --am_config AM_CONFIG
+                        Config of acoustic model.
+  --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
+  --am_stat AM_STAT     mean and standard deviation used to normalize
+                        spectrogram when training acoustic model.
+  --phones_dict PHONES_DICT
+                        phone vocabulary file.
+  --speaker_dict SPEAKER_DICT
+                        speaker id map file.
+  --spk_id SPK_ID       spk id for multi speaker acoustic model
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
+                        Choose vocoder type of tts task.
+                        {pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task.
+  --voc_config VOC_CONFIG
+                        Config of voc.
+  --voc_ckpt VOC_CKPT   Checkpoint file of voc.
+  --voc_stat VOC_STAT   mean and standard deviation used to normalize
+                        spectrogram when training voc.
+  --lang LANG           {zh, en, mix, canton} Choose language type of tts task.
+                        {sing} Choose language type of svs task.
+  --inference_dir INFERENCE_DIR
+                        dir to save inference models
+  --ngpu NGPU           if ngpu == 0, use cpu.
+  --text TEXT           text to synthesize file, a 'utt_id sentence' pair per line for tts task.
+                        A '{utt_id input_type (is word) text notes note_durs}' or '{utt_id input_type (is phoneme) phones notes note_durs is_slurs}' pair per line for svs task.
+  --output_dir OUTPUT_DIR
+                        output dir.
+  --pinyin_phone PINYIN_PHONE
+                        pinyin to phone map file, used by sing_frontend.
+  --speech_stretchs SPEECH_STRETCHS
+                        The min and max values of the mel spectrum, used in the diffusion module of diffsinger.
+```
+1. `--am` 声学模型类型,格式为 {model_name}_{dataset}。
+2. `--am_config`, `--am_ckpt`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 diffsinger 预训练模型中的 4 个文件。
+3. `--voc` 声码器(vocoder)类型,格式为 {model_name}_{dataset}。
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。
+5. `--lang` 对应模型的语言。tts 任务可选 `zh`、`en`、`mix` 和 `canton`,svs 任务为 `sing`。
+6. `--text` 是文本文件,其中包含要合成的句子。
+7. `--output_dir` 是保存合成音频文件的目录。
+8. `--ngpu` 要使用的 GPU 数,如果 ngpu==0,则使用 cpu。
+9. `--inference_dir` 静态模型保存的目录。如果不加这一参数,就不会生成并保存静态模型。
+10. `--pinyin_phone` 拼音到音素的映射文件。
+11. `--speech_stretchs` mel 谱的最小和最大值,用于 diffsinger 中 diffusion 之前的线性拉伸。
+
+注意: 目前 diffsinger 模型还不支持动转静,所以不要加 `--inference_dir`。
+
+
 ## 预训练模型
 预先训练的 DiffSinger 模型:
 - [diffsinger_opencpop_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/diffsinger_opencpop_ckpt_1.4.0.zip)
@@ -170,10 +248,33 @@ diffsinger_opencpop_ckpt_1.4.0.zip
 ├── default.yaml              # 用于训练 diffsinger 的默认配置
 ├── energy_stats.npy          # 训练 diffsinger 时如若需要 norm energy 会使用到的统计数据
 ├── phone_id_map.txt          # 训练 diffsinger 时的音素词汇文件
+├── pinyin_to_phone.txt       # 训练 diffsinger 时的拼音到音素映射文件
 ├── pitch_stats.npy           # 训练 diffsinger 时如若需要 norm pitch 会使用到的统计数据
 ├── snapshot_iter_160000.pdz  # 模型参数和优化器状态
 ├── speech_stats.npy          # 训练 diffsinger 时用于规范化频谱图的统计数据
 └── speech_stretchs.npy       # 训练 diffusion 前用于 mel 谱拉伸的最小及最大值
 ```
-目前文本前端未完善,暂不支持 `synthesize_e2e` 的方式合成音频。尝试效果可先使用 `synthesize`。
+您可以使用以下脚本,通过预训练的 diffsinger 和 parallel wavegan 模型为 `${BIN_DIR}/../sentences_sing.txt` 合成句子。
+```bash
+source path.sh
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../synthesize_e2e.py \
+  --am=diffsinger_opencpop \
+  --am_config=diffsinger_opencpop_ckpt_1.4.0/default.yaml \
+  --am_ckpt=diffsinger_opencpop_ckpt_1.4.0/snapshot_iter_160000.pdz \
+  --am_stat=diffsinger_opencpop_ckpt_1.4.0/speech_stats.npy \
+  --voc=pwgan_opencpop \
+  --voc_config=pwgan_opencpop_ckpt_1.4.0/default.yaml \
+  --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
+  --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
+  --lang=sing \
+  --text=${BIN_DIR}/../sentences_sing.txt \
+  --output_dir=exp/default/test_e2e \
+  --phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
+  --pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \
+  --speech_stretchs=diffsinger_opencpop_ckpt_1.4.0/speech_stretchs.npy
+
+```
diff --git a/examples/opencpop/svs1/local/pinyin_to_phone.txt b/examples/opencpop/svs1/local/pinyin_to_phone.txt
new file mode 100644
index 000000000..34ed079d7
--- /dev/null
+++ b/examples/opencpop/svs1/local/pinyin_to_phone.txt
@@ -0,0 +1,418 @@
+a|a
+ai|ai
+an|an
+ang|ang
+ao|ao
+ba|b a
+bai|b ai
+ban|b an
+bang|b ang
+bao|b ao
+bei|b ei
+ben|b en
+beng|b eng
+bi|b i
+bian|b ian
+biao|b iao
+bie|b ie
+bin|b in
+bing|b ing
+bo|b o
+bu|b u
+ca|c a
+cai|c ai
+can|c an
+cang|c ang
+cao|c ao
+ce|c e
+cei|c ei
+cen|c en
+ceng|c eng
+cha|ch a
+chai|ch ai
+chan|ch an
+chang|ch ang
+chao|ch ao
+che|ch e
+chen|ch en
+cheng|ch eng
+chi|ch i
+chong|ch ong
+chou|ch ou
+chu|ch u
+chua|ch ua
+chuai|ch uai
+chuan|ch uan
+chuang|ch uang
+chui|ch ui
+chun|ch un
+chuo|ch uo
+ci|c i
+cong|c ong
+cou|c ou
+cu|c u
+cuan|c uan
+cui|c ui
+cun|c un
+cuo|c uo
+da|d a
+dai|d ai
+dan|d an
+dang|d ang
+dao|d ao
+de|d e
+dei|d ei
+den|d en +deng|d eng +di|d i +dia|d ia +dian|d ian +diao|d iao +die|d ie +ding|d ing +diu|d iu +dong|d ong +dou|d ou +du|d u +duan|d uan +dui|d ui +dun|d un +duo|d uo +e|e +ei|ei +en|en +eng|eng +er|er +fa|f a +fan|f an +fang|f ang +fei|f ei +fen|f en +feng|f eng +fo|f o +fou|f ou +fu|f u +ga|g a +gai|g ai +gan|g an +gang|g ang +gao|g ao +ge|g e +gei|g ei +gen|g en +geng|g eng +gong|g ong +gou|g ou +gu|g u +gua|g ua +guai|g uai +guan|g uan +guang|g uang +gui|g ui +gun|g un +guo|g uo +ha|h a +hai|h ai +han|h an +hang|h ang +hao|h ao +he|h e +hei|h ei +hen|h en +heng|h eng +hm|h m +hng|h ng +hong|h ong +hou|h ou +hu|h u +hua|h ua +huai|h uai +huan|h uan +huang|h uang +hui|h ui +hun|h un +huo|h uo +ji|j i +jia|j ia +jian|j ian +jiang|j iang +jiao|j iao +jie|j ie +jin|j in +jing|j ing +jiong|j iong +jiu|j iu +ju|j v +juan|j van +jue|j ve +jun|j vn +ka|k a +kai|k ai +kan|k an +kang|k ang +kao|k ao +ke|k e +kei|k ei +ken|k en +keng|k eng +kong|k ong +kou|k ou +ku|k u +kua|k ua +kuai|k uai +kuan|k uan +kuang|k uang +kui|k ui +kun|k un +kuo|k uo +la|l a +lai|l ai +lan|l an +lang|l ang +lao|l ao +le|l e +lei|l ei +leng|l eng +li|l i +lia|l ia +lian|l ian +liang|l iang +liao|l iao +lie|l ie +lin|l in +ling|l ing +liu|l iu +lo|l o +long|l ong +lou|l ou +lu|l u +luan|l uan +lun|l un +luo|l uo +lv|l v +lve|l ve +m|m +ma|m a +mai|m ai +man|m an +mang|m ang +mao|m ao +me|m e +mei|m ei +men|m en +meng|m eng +mi|m i +mian|m ian +miao|m iao +mie|m ie +min|m in +ming|m ing +miu|m iu +mo|m o +mou|m ou +mu|m u +n|n +na|n a +nai|n ai +nan|n an +nang|n ang +nao|n ao +ne|n e +nei|n ei +nen|n en +neng|n eng +ng|n g +ni|n i +nian|n ian +niang|n iang +niao|n iao +nie|n ie +nin|n in +ning|n ing +niu|n iu +nong|n ong +nou|n ou +nu|n u +nuan|n uan +nun|n un +nuo|n uo +nv|n v +nve|n ve +o|o +ou|ou +pa|p a +pai|p ai +pan|p an +pang|p ang +pao|p ao +pei|p ei +pen|p en +peng|p eng +pi|p i +pian|p ian +piao|p iao +pie|p ie +pin|p in +ping|p ing +po|p o +pou|p ou +pu|p u +qi|q i +qia|q ia +qian|q ian +qiang|q iang +qiao|q iao +qie|q ie +qin|q in +qing|q ing +qiong|q iong +qiu|q iu +qu|q v +quan|q van +que|q ve +qun|q vn +ran|r an +rang|r ang +rao|r ao +re|r e +ren|r en +reng|r eng +ri|r i +rong|r ong +rou|r ou +ru|r u +rua|r ua +ruan|r uan +rui|r ui +run|r un +ruo|r uo +sa|s a +sai|s ai +san|s an +sang|s ang +sao|s ao +se|s e +sen|s en +seng|s eng +sha|sh a +shai|sh ai +shan|sh an +shang|sh ang +shao|sh ao +she|sh e +shei|sh ei +shen|sh en +sheng|sh eng +shi|sh i +shou|sh ou +shu|sh u +shua|sh ua +shuai|sh uai +shuan|sh uan +shuang|sh uang +shui|sh ui +shun|sh un +shuo|sh uo +si|s i +song|s ong +sou|s ou +su|s u +suan|s uan +sui|s ui +sun|s un +suo|s uo +ta|t a +tai|t ai +tan|t an +tang|t ang +tao|t ao +te|t e +tei|t ei +teng|t eng +ti|t i +tian|t ian +tiao|t iao +tie|t ie +ting|t ing +tong|t ong +tou|t ou +tu|t u +tuan|t uan +tui|t ui +tun|t un +tuo|t uo +wa|w a +wai|w ai +wan|w an +wang|w ang +wei|w ei +wen|w en +weng|w eng +wo|w o +wu|w u +xi|x i +xia|x ia +xian|x ian +xiang|x iang +xiao|x iao +xie|x ie +xin|x in +xing|x ing +xiong|x iong +xiu|x iu +xu|x v +xuan|x van +xue|x ve +xun|x vn +ya|y a +yan|y an +yang|y ang +yao|y ao +ye|y e +yi|y i +yin|y in +ying|y ing +yo|y o +yong|y ong +you|y ou +yu|y v +yuan|y van +yue|y ve +yun|y vn +za|z a +zai|z ai +zan|z an +zang|z ang +zao|z ao +ze|z e +zei|z ei +zen|z en +zeng|z eng +zha|zh a +zhai|zh ai +zhan|zh an +zhang|zh ang +zhao|zh ao +zhe|zh e +zhei|zh ei +zhen|zh en +zheng|zh eng +zhi|zh i +zhong|zh ong +zhou|zh ou +zhu|zh u +zhua|zh ua +zhuai|zh uai +zhuan|zh uan 
+zhuang|zh uang +zhui|zh ui +zhun|zh un +zhuo|zh uo +zi|z i +zong|z ong +zou|z ou +zu|z u +zuan|z uan +zui|z ui +zun|z un +zuo|z uo \ No newline at end of file diff --git a/examples/opencpop/svs1/local/synthesize_e2e.sh b/examples/opencpop/svs1/local/synthesize_e2e.sh new file mode 100755 index 000000000..b3dc29b11 --- /dev/null +++ b/examples/opencpop/svs1/local/synthesize_e2e.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=diffsinger_opencpop \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_opencpop \ + --voc_config=pwgan_opencpop_ckpt_1.4.0/default.yaml \ + --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \ + --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \ + --lang=sing \ + --text=${BIN_DIR}/../sentences_sing.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --speech_stretchs=dump/train/speech_stretchs.npy \ + --pinyin_phone=local/pinyin_to_phone.txt +fi + +# for more GAN Vocoders +# hifigan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo "in hifigan syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=diffsinger_opencpop \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_opencpop \ + --voc_config=hifigan_opencpop_ckpt_1.4.0/default.yaml \ + --voc_ckpt=hifigan_opencpop_ckpt_1.4.0/snapshot_iter_625000.pdz \ + --voc_stat=hifigan_opencpop_ckpt_1.4.0/feats_stats.npy \ + --lang=sing \ + --text=${BIN_DIR}/../sentences_sing.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --speech_stretchs=dump/train/speech_stretchs.npy \ + --pinyin_phone=local/pinyin_to_phone.txt + +fi diff --git a/examples/opencpop/svs1/run.sh b/examples/opencpop/svs1/run.sh index 7bde38518..bfe5b6594 100755 --- a/examples/opencpop/svs1/run.sh +++ b/examples/opencpop/svs1/run.sh @@ -30,3 +30,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # synthesize, vocoder is pwgan by default CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan by default + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/paddlespeech/t2s/exps/sentences_sing.txt b/paddlespeech/t2s/exps/sentences_sing.txt new file mode 100644 index 000000000..7b9c6272d --- /dev/null +++ b/paddlespeech/t2s/exps/sentences_sing.txt @@ -0,0 +1,2 @@ +{"utt_id": "2093003457", "input_type": "word", "text": "小酒窝长睫毛AP是你最美的记号", "notes": "C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4", "note_durs": "0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340"} +{"utt_id": "2093003458", "input_type": "phoneme", "phones": "w o m ei t ian sh ui ui b u u zh 
ao AP x iang n ian n i d e w ei x iao iao AP" , "notes": "C#4/Db4 C#4/Db4 D#4/Eb4 D#4/Eb4 F4 F4 F#4/Gb4 F#4/Gb4 D#4/Eb4 D#4/Eb4 D#4/Eb4 A#3/Bb3 A#3/Bb3 A#3/Bb3 rest F#4/Gb4 F#4/Gb4 F4 F4 F#4/Gb4 F#4/Gb4 F4 F4 G#4/Ab4 G#4/Ab4 D#4/Eb4 D#4/Eb4 C#4/Db4 rest", "note_durs": "0.221750 0.221750 0.414460 0.414460 0.223160 0.223160 0.430900 0.430900 0.335990 0.269270 0.269270 0.289060 0.522690 0.522690 0.355060 0.397130 0.397130 0.247690 0.247690 0.406720 0.406720 0.246830 0.246830 0.307540 0.307540 0.429910 0.429910 0.519130 0.342300", "is_slurs": "0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0"} \ No newline at end of file diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 60608ee5b..2b958b567 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -20,6 +20,7 @@ from typing import Dict from typing import List from typing import Optional +import jsonlines import numpy as np import onnxruntime as ort import paddle @@ -35,6 +36,7 @@ from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.canton_frontend import CantonFrontend from paddlespeech.t2s.frontend.mix_frontend import MixFrontend +from paddlespeech.t2s.frontend.sing_frontend import SingFrontend from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.modules.normalizer import ZScore from paddlespeech.utils.dynamic_import import dynamic_import @@ -127,6 +129,19 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): return sentences +# input for svs +def get_sentences_svs(text_file: Optional[os.PathLike]): + # construct dataset for evaluation + sentences = [] + with jsonlines.open(text_file, 'r') as reader: + svs_inputs = list(reader) + for svs_input in svs_inputs: + utt_id = svs_input['utt_id'] + sentence = svs_input + sentences.append((utt_id, sentence)) + return sentences + + # am only def get_test_dataset(test_metadata: List[Dict[str, Any]], am: str, @@ -268,6 +283,7 @@ def get_dev_dataloader(dev_metadata: List[Dict[str, Any]], def get_frontend(lang: str='zh', phones_dict: Optional[os.PathLike]=None, tones_dict: Optional[os.PathLike]=None, + pinyin_phone: Optional[os.PathLike]=None, use_rhy=False): if lang == 'zh': frontend = Frontend( @@ -281,18 +297,23 @@ def get_frontend(lang: str='zh', elif lang == 'mix': frontend = MixFrontend( phone_vocab_path=phones_dict, tone_vocab_path=tones_dict) + elif lang == 'sing': + frontend = SingFrontend( + pinyin_phone_path=pinyin_phone, phone_vocab_path=phones_dict) else: print("wrong lang!") return frontend -def run_frontend(frontend: object, - text: str, - merge_sentences: bool=False, - get_tone_ids: bool=False, - lang: str='zh', - to_tensor: bool=True, - add_blank: bool=False): +def run_frontend( + frontend: object, + text: str, + merge_sentences: bool=False, + get_tone_ids: bool=False, + lang: str='zh', + to_tensor: bool=True, + add_blank: bool=False, + svs_input: Dict[str, str]=None, ): outs = dict() if lang == 'zh': input_ids = {} @@ -326,8 +347,18 @@ def run_frontend(frontend: object, input_ids = frontend.get_input_ids( text, merge_sentences=merge_sentences, to_tensor=to_tensor) phone_ids = input_ids["phone_ids"] + elif lang == 'sing': + input_ids = frontend.get_input_ids( + svs_input=svs_input, to_tensor=to_tensor) + phone_ids = input_ids["phone_ids"] + note_ids = input_ids["note_ids"] + note_durs = input_ids["note_durs"] + is_slurs = input_ids["is_slurs"] + outs.update({'note_ids': 
note_ids})
+        outs.update({'note_durs': note_durs})
+        outs.update({'is_slurs': is_slurs})
     else:
-        print("lang should in {'zh', 'en', 'mix', 'canton'}!")
+        print("lang should be in {'zh', 'en', 'mix', 'canton', 'sing'}!")
     outs.update({'phone_ids': phone_ids})
     return outs
@@ -474,6 +505,7 @@ def am_to_static(am_inference,
     elif am_name == 'tacotron2':
         am_inference = jit.to_static(
             am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)])
+
     elif am_name == 'vits':
         if am_dataset in {"aishell3", "vctk"} and speaker_dict is not None:
             am_inference = jit.to_static(
@@ -485,8 +517,20 @@ def am_to_static(am_inference,
         else:
             am_inference = jit.to_static(
                 am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)])
+
+    elif am_name == 'diffsinger':
+        am_inference = jit.to_static(
+            am_inference,
+            input_spec=[
+                InputSpec([-1], dtype=paddle.int64),  # phone
+                InputSpec([-1], dtype=paddle.int64),  # note
+                InputSpec([-1], dtype=paddle.float32),  # note_dur
+                InputSpec([-1], dtype=paddle.int64),  # is_slur
+            ])
+
     jit.save(am_inference, os.path.join(inference_dir, am))
     am_inference = jit.load(os.path.join(inference_dir, am))
+
     return am_inference
diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index db94a6e53..0c7b34b09 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -24,6 +24,7 @@ from paddlespeech.t2s.exps.syn_utils import am_to_static
 from paddlespeech.t2s.exps.syn_utils import get_am_inference
 from paddlespeech.t2s.exps.syn_utils import get_frontend
 from paddlespeech.t2s.exps.syn_utils import get_sentences
+from paddlespeech.t2s.exps.syn_utils import get_sentences_svs
 from paddlespeech.t2s.exps.syn_utils import get_voc_inference
 from paddlespeech.t2s.exps.syn_utils import run_frontend
 from paddlespeech.t2s.exps.syn_utils import voc_to_static
@@ -44,20 +45,18 @@ def evaluate(args):
     print(am_config)
     print(voc_config)
 
-    sentences = get_sentences(text_file=args.text, lang=args.lang)
-
     # frontend
     frontend = get_frontend(
         lang=args.lang,
         phones_dict=args.phones_dict,
         tones_dict=args.tones_dict,
+        pinyin_phone=args.pinyin_phone,
         use_rhy=args.use_rhy)
     print("frontend done!")
 
     # acoustic model
     am_name = args.am[:args.am.rindex('_')]
     am_dataset = args.am[args.am.rindex('_') + 1:]
-
     am_inference = get_am_inference(
         am=args.am,
         am_config=am_config,
@@ -65,8 +64,10 @@ def evaluate(args):
         am_stat=args.am_stat,
         phones_dict=args.phones_dict,
         tones_dict=args.tones_dict,
-        speaker_dict=args.speaker_dict)
+        speaker_dict=args.speaker_dict,
+        speech_stretchs=args.speech_stretchs, )
     print("acoustic model done!")
+
     # vocoder
     voc_inference = get_voc_inference(
         voc=args.voc,
@@ -103,14 +104,25 @@ def evaluate(args):
     N = 0
     T = 0
+    if am_name == 'diffsinger':
+        sentences = get_sentences_svs(text_file=args.text)
+    else:
+        sentences = get_sentences(text_file=args.text, lang=args.lang)
     for utt_id, sentence in sentences:
         with timer() as t:
+            if am_name == "diffsinger":
+                text = ""
+                svs_input = sentence
+            else:
+                text = sentence
+                svs_input = None
             frontend_dict = run_frontend(
                 frontend=frontend,
-                text=sentence,
+                text=text,
                 merge_sentences=merge_sentences,
                 get_tone_ids=get_tone_ids,
-                lang=args.lang)
+                lang=args.lang,
+                svs_input=svs_input)
             phone_ids = frontend_dict['phone_ids']
             with paddle.no_grad():
                 flags = 0
@@ -134,6 +146,15 @@ def evaluate(args):
                         mel = am_inference(part_phone_ids, part_tone_ids)
                     elif am_name == 'tacotron2':
                         mel = am_inference(part_phone_ids)
+                    elif am_name == 'diffsinger':
+                        part_note_ids = frontend_dict['note_ids'][i]
+                        part_note_durs = frontend_dict['note_durs'][i]
+                        part_is_slurs = frontend_dict['is_slurs'][i]
+                        mel = am_inference(
+                            text=part_phone_ids,
+                            note=part_note_ids,
+                            note_dur=part_note_durs,
+                            is_slur=part_is_slurs, )
                     # vocoder
                     wav = voc_inference(mel)
                     if flags == 0:
@@ -178,6 +199,7 @@ def parse_args():
             'fastspeech2_male-zh',
             'fastspeech2_male-en',
             'fastspeech2_male-mix',
+            'diffsinger_opencpop',
         ],
         help='Choose acoustic model type of tts task.')
     parser.add_argument(
@@ -223,6 +245,8 @@ def parse_args():
             'wavernn_csmsc',
             'pwgan_male',
             'hifigan_male',
+            'pwgan_opencpop',
+            'hifigan_opencpop',
         ],
         help='Choose vocoder type of tts task.')
     parser.add_argument(
@@ -240,6 +264,7 @@ def parse_args():
         '--lang',
         type=str,
         default='zh',
+        choices=['zh', 'en', 'mix', 'canton', 'sing'],
         help='Choose model language. zh or en or mix')
 
     parser.add_argument(
@@ -259,6 +284,17 @@ def parse_args():
         type=str2bool,
         default=False,
         help="run rhythm frontend or not")
+    parser.add_argument(
+        "--pinyin_phone",
+        type=str,
+        default=None,
+        help="pinyin to phone map file, used by sing_frontend.")
+    parser.add_argument(
+        "--speech_stretchs",
+        type=str,
+        default=None,
+        help="The min and max values of the mel spectrum, used in the diffusion module of diffsinger."
+    )
 
     args = parser.parse_args()
     return args
diff --git a/paddlespeech/t2s/frontend/sing_frontend.py b/paddlespeech/t2s/frontend/sing_frontend.py
new file mode 100644
index 000000000..c2aecf273
--- /dev/null
+++ b/paddlespeech/t2s/frontend/sing_frontend.py
@@ -0,0 +1,175 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+from typing import Dict
+from typing import List
+from typing import Tuple
+
+import librosa
+import numpy as np
+import paddle
+from pypinyin import lazy_pinyin
+
+
+class SingFrontend():
+    def __init__(self, pinyin_phone_path: str, phone_vocab_path: str):
+        """SVS Frontend
+
+        Args:
+            pinyin_phone_path (str): pinyin to phone file path, a 'pinyin|phones' (like: ba|b a ) pair per line.
+            phone_vocab_path (str): phone to phone id file path, a 'phone phone_id' (like: a 4 ) pair per line.
+        """
+        self.punc = '[:,;。?!“”‘’\':,;.?!]'
+
+        self.pinyin_phones = {'AP': 'AP', 'SP': 'SP'}
+        if pinyin_phone_path:
+            with open(pinyin_phone_path, 'rt', encoding='utf-8') as f:
+                for line in f.readlines():
+                    pinyin_phn = [
+                        x.strip() for x in line.split('|') if x.strip() != ''
+                    ]
+                    self.pinyin_phones[pinyin_phn[0]] = pinyin_phn[1]
+
+        self.vocab_phones = {}
+        if phone_vocab_path:
+            with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
+                phn_id = [line.strip().split() for line in f.readlines()]
+            for phn, id in phn_id:
+                self.vocab_phones[phn] = int(id)
+
+    def get_phones(self, sentence: str) -> List[str]:
+        """get phone list
+
+        Args:
+            sentence (str): sentence
+
+        Returns:
+            List[str]: phone list
+
+        Example:
+            sentence = "你好"
+            phones = ['n i', 'h ao']
+        """
+        # remove all punctuation
+        sentence = re.sub(self.punc, "", sentence)
+
+        # pypinyin can't disambiguate the polyphonic character 长 (cháng/zhǎng),
+        # so force the cháng reading by substituting the homophone 常
+        sentence = sentence.replace('最长', '最常').replace('长睫毛', '常睫毛') \
+            .replace('那么长', '那么常').replace('多长', '多常') \
+            .replace('很长', '很常')
+
+        # convert the lyrics to pinyin
+        pinyins = lazy_pinyin(sentence, strict=False)
+        # replace unknown pinyins with SP
+        pinyins = [
+            pinyin if pinyin in self.pinyin_phones.keys() else "SP"
+            for pinyin in pinyins
+        ]
+        phones = [
+            self.pinyin_phones[pinyin.strip()] for pinyin in pinyins
+            if pinyin.strip() in self.pinyin_phones
+        ]
+
+        return phones
+
+    def get_note_info(self, note_info: str) -> List[str]:
+        note_info = [x.strip() for x in note_info.split('|') if x.strip() != '']
+        return note_info
+
+    def process(
+            self,
+            phones: List[str],
+            notes: List[str],
+            note_durs: List[str],
+    ) -> Tuple[List[str], List[str], List[str], List[int]]:
+        new_phones = []
+        new_notes = []
+        new_note_durs = []
+        is_slurs = []
+        assert len(phones) == len(notes) == len(
+            note_durs
+        ), "Please check the input, phones, notes and note_durs should be the same length."
+        for i in range(len(phones)):
+            phone = phones[i].split()
+            note = notes[i].split()
+            note_dur = note_durs[i].split()
+
+            # every phoneme of the word takes the first note and duration
+            for phn in phone:
+                new_phones.append(phn)
+                new_notes.append(note[0])
+                new_note_durs.append(note_dur[0])
+                is_slurs.append(0)
+
+            # extra notes on the same word are slurs on its last phoneme
+            if len(note) > 1:
+                for j in range(1, len(note)):
+                    new_phones.append(phone[-1])
+                    new_notes.append(note[j])
+                    new_note_durs.append(note_dur[j])
+                    is_slurs.append(1)
+
+        return new_phones, new_notes, new_note_durs, is_slurs
+
+    def get_input_ids(self, svs_input: Dict[str, str],
+                      to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
+        """convert the input to phone ids, note ids, note durations and slur flags.
+
+        Args:
+            svs_input (Dict[str, str]): if input_type is phoneme, the keys phones, notes, note_durs and is_slurs are needed;
+                if input_type is word, the keys text, notes and note_durs are needed.
+            to_tensor (bool, optional): whether to convert to Tensor. Defaults to True.
+
+        Returns:
+            Dict[str, List[paddle.Tensor]]: result includes phone_ids, note_ids, note_durs, is_slurs.
+        """
+        result = {}
+        input_type = svs_input['input_type']
+        if input_type == 'phoneme':
+            assert "phones" in svs_input.keys() and "notes" in svs_input.keys() and "note_durs" in svs_input.keys() and "is_slurs" in svs_input.keys(), \
+                "When input_type is phoneme, phones, notes, note_durs and is_slurs should be in the svs_input."
+            phones = svs_input["phones"].split()
+            notes = svs_input["notes"].split()
+            note_durs = svs_input["note_durs"].split()
+            is_slurs = svs_input["is_slurs"].split()
+            assert len(phones) == len(notes) == len(note_durs) == len(
+                is_slurs
+            ), "Please check the input, phones, notes, note_durs and is_slurs should be the same length."
+        elif input_type == "word":
+            assert "text" in svs_input.keys() and "notes" in svs_input.keys() and "note_durs" in svs_input.keys(), \
+                "When input_type is word, text, notes and note_durs should be in the svs_input."
+            phones = self.get_phones(svs_input['text'])
+            notes = self.get_note_info(svs_input['notes'])
+            note_durs = self.get_note_info(svs_input['note_durs'])
+            phones, notes, note_durs, is_slurs = self.process(
+                phones=phones, notes=notes, note_durs=note_durs)
+        else:
+            raise ValueError(
+                f"Unknown input_type: {input_type}, input_type should be 'phoneme' or 'word'."
+            )
+
+        phone_ids = [self.vocab_phones[phn] for phn in phones]
+        phone_ids = np.array(phone_ids, np.int64)
+        # 'rest' marks a rest and maps to note id 0; other notes map to MIDI numbers
+        note_ids = [
+            librosa.note_to_midi(note.split("/")[0]) if note != 'rest' else 0
+            for note in notes
+        ]
+        note_ids = np.array(note_ids, np.int64)
+        note_durs = np.array(note_durs, np.float32)
+        is_slurs = np.array(is_slurs, np.int64)
+
+        if to_tensor:
+            phone_ids = paddle.to_tensor(phone_ids)
+            note_ids = paddle.to_tensor(note_ids)
+            note_durs = paddle.to_tensor(note_durs)
+            is_slurs = paddle.to_tensor(is_slurs)
+
+        result['phone_ids'] = [phone_ids]
+        result['note_ids'] = [note_ids]
+        result['note_durs'] = [note_durs]
+        result['is_slurs'] = [is_slurs]
+
+        return result
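The new frontend can also be exercised on its own. The snippet below is a minimal, hypothetical usage sketch, not part of the patch: the two file paths are placeholders following the pretrained-model layout in the README above, and it assumes the phone vocabulary contains the phonemes `n`, `i`, `h`, `ao` (as the opencpop `phone_id_map.txt` does).

```python
# Minimal sketch of driving SingFrontend directly (paths are placeholders).
from paddlespeech.t2s.frontend.sing_frontend import SingFrontend

frontend = SingFrontend(
    pinyin_phone_path="diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt",
    phone_vocab_path="diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt")

# word-type input: lyrics plus one '|'-separated note group and duration per word
svs_input = {
    "utt_id": "demo_001",          # hypothetical id, unused by get_input_ids
    "input_type": "word",
    "text": "你好",                # -> pinyin ni, hao -> phones 'n i', 'h ao'
    "notes": "C#4/Db4 | F#4/Gb4",  # one note per word, so no slurs
    "note_durs": "0.5 | 0.5",
}

outs = frontend.get_input_ids(svs_input=svs_input, to_tensor=True)
# Each value is a one-element list holding a paddle Tensor:
# outs["phone_ids"] -> ids of ['n', 'i', 'h', 'ao']
# outs["note_ids"]  -> MIDI numbers [61, 61, 66, 66]
# outs["is_slurs"]  -> [0, 0, 0, 0]
```

With `--lang=sing`, `synthesize_e2e.py` builds the same dictionaries from the JSON lines of the `--text` file via `get_sentences_svs` and feeds the resulting tensors to the diffsinger acoustic model.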