From 2c4c141de5b8bf7335370f955559d1492adf42c2 Mon Sep 17 00:00:00 2001 From: YangZhou <56786796+SmileGoat@users.noreply.github.com> Date: Tue, 10 Jan 2023 14:35:41 +0800 Subject: [PATCH 01/24] [audio] fix load paddleaudio fail (#2815) * fix paddleaudio import fail --- audio/paddleaudio/_internal/module_utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/audio/paddleaudio/_internal/module_utils.py b/audio/paddleaudio/_internal/module_utils.py index 7b3230de9..becd23cd8 100644 --- a/audio/paddleaudio/_internal/module_utils.py +++ b/audio/paddleaudio/_internal/module_utils.py @@ -67,8 +67,11 @@ def deprecated(direction: str, version: Optional[str]=None): def is_kaldi_available(): - return is_module_available("paddleaudio._paddleaudio") - + try: + from paddleaudio import _paddleaudio + return True + except Exception: + return False def requires_kaldi(): if is_kaldi_available(): @@ -128,9 +131,11 @@ def requires_soundfile(): def is_sox_available(): - if platform.system() == "Windows": # not support sox in windows + try: + from paddleaudio import _paddleaudio + return True + except Exception: return False - return is_module_available("paddleaudio._paddleaudio") def requires_sox(): From 88fe26f17ca4a35d007bb934cbe96550b1592508 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Tue, 10 Jan 2023 19:46:39 +0800 Subject: [PATCH 02/24] [ASR] add asr code-switch cli and demo, test='asr' (#2816) * add asr code-switch cli and demo. * fix some model named problem. --- README.md | 2 ++ README_cn.md | 2 ++ demos/speech_recognition/README.md | 28 +++++++++-------- demos/speech_recognition/README_cn.md | 29 ++++++++++-------- demos/speech_recognition/run.sh | 6 ++++ paddlespeech/cli/asr/infer.py | 28 ++++++++++++----- paddlespeech/cli/base_commands.py | 19 ++++++++++-- paddlespeech/resource/pretrained_models.py | 13 ++++++++ .../server/bin/paddlespeech_server.py | 30 ++++++++++++++----- tests/unit/cli/test_cli.sh | 3 +- 10 files changed, 118 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index dbdf6a4f8..2fb773634 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,8 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update +- 🔥 2022.01.10: Add [code-switch asr CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). +- 👑 2022.01.06: Add [code-switch asr tal_cs recipe](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). - 🎉 2022.12.02: Add [end-to-end Prosody Prediction pipeline](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (including using prosody labels in Acoustic Model). - 🎉 2022.11.30: Add [TTS Android Demo](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid). - 🤗 2022.11.28: PP-TTS and PP-ASR demos are available in [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) and [official website diff --git a/README_cn.md b/README_cn.md index 5cc156c9f..53f6a66e4 100644 --- a/README_cn.md +++ b/README_cn.md @@ -164,6 +164,8 @@ ### 近期更新 +- 🔥 2022.01.10: 新增 [中英混合 ASR CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). 
+- 👑 2022.01.06: 新增 [ASR中英混合 tal_cs 训练推理流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/).
 - 🎉 2022.12.02: 新增 [端到端韵律预测全流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (包含在声学模型中使用韵律标签)。
 - 🎉 2022.11.30: 新增 [TTS Android 部署示例](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid)。
 - 🤗 2022.11.28: PP-TTS and PP-ASR 示例可在 [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) 和[飞桨官网](https://www.paddlepaddle.org.cn/models)体验!
diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md
index c815a88af..ee2acd6fd 100644
--- a/demos/speech_recognition/README.md
+++ b/demos/speech_recognition/README.md
@@ -17,7 +17,7 @@ The input of this demo should be a WAV file(`.wav`), and the sample rate must be
 Here are sample files for this demo that can be downloaded:
 ```bash
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
 ```
 
 ### 3. Usage
@@ -27,6 +27,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
     paddlespeech asr --input ./zh.wav -v
     # English
     paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v
+    # Code-Switch
+    paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav -v
     # Chinese ASR + Punctuation Restoration
     paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v
 ```
@@ -40,6 +42,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
   - `input`(required): Audio file to recognize.
   - `model`: Model type of asr task. Default: `conformer_wenetspeech`.
   - `lang`: Model language. Default: `zh`.
+  - `codeswitch`: Whether to use the code-switch model. Default: `False`.
   - `sample_rate`: Sample rate of the model. Default: `16000`.
   - `config`: Config of asr task. Use pretrained model when it is None. Default: `None`.
   - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`. 
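The `--codeswitch` CLI flag documented above maps onto the `codeswitch` keyword that this series adds to `ASRExecutor` in `paddlespeech/cli/asr/infer.py` (see the hunks later in this patch). As a rough sketch of the equivalent Python call — assuming the `ch_zh_mix.wav` sample above has been downloaded — the code-switch model can be invoked like this:

```python
from paddlespeech.cli.asr.infer import ASRExecutor

asr = ASRExecutor()
# Per the _init_from_path change in this patch, codeswitch=True is only
# valid together with lang="zh_en"; other combinations raise an exception.
text = asr(
    audio_file="./ch_zh_mix.wav",
    model="conformer_talcs",
    lang="zh_en",
    codeswitch=True,
    sample_rate=16000)
print(text)
```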
@@ -83,14 +86,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee Here is a list of pretrained models released by PaddleSpeech that can be used by command and python API: -| Model | Language | Sample Rate -| :--- | :---: | :---: | -| conformer_wenetspeech | zh | 16k -| conformer_online_multicn | zh | 16k -| conformer_aishell | zh | 16k -| conformer_online_aishell | zh | 16k -| transformer_librispeech | en | 16k -| deepspeech2online_wenetspeech | zh | 16k -| deepspeech2offline_aishell| zh| 16k -| deepspeech2online_aishell | zh | 16k -| deepspeech2offline_librispeech | en | 16k +| Model | Code Switch | Language | Sample Rate +| :--- | :---: | :---: | :---: | +| conformer_wenetspeech | False | zh | 16k +| conformer_online_multicn | False | zh | 16k +| conformer_aishell | False | zh | 16k +| conformer_online_aishell | False | zh | 16k +| transformer_librispeech | False | en | 16k +| deepspeech2online_wenetspeech | False | zh | 16k +| deepspeech2offline_aishell | False | zh| 16k +| deepspeech2online_aishell | False | zh | 16k +| deepspeech2offline_librispeech | False | en | 16k +| conformer_talcs | True | zh_en | 16k diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md index 13aa9f277..62dce3bc9 100644 --- a/demos/speech_recognition/README_cn.md +++ b/demos/speech_recognition/README_cn.md @@ -1,4 +1,5 @@ (简体中文|[English](./README.md)) + (简体中文|[English](./README.md)) # 语音识别 ## 介绍 @@ -16,7 +17,7 @@ 可以下载此 demo 的示例音频: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` ### 3. 
使用方法 - 命令行 (推荐使用) @@ -25,6 +26,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee paddlespeech asr --input ./zh.wav -v # 英文 paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v + #中英混合 + paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav -v # 中文 + 标点恢复 paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v ``` @@ -38,6 +41,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `input`(必须输入):用于识别的音频文件。 - `model`:ASR 任务的模型,默认值:`conformer_wenetspeech`。 - `lang`:模型语言,默认值:`zh`。 + - `codeswitch`: 是否使用语言转换,默认值:`False`。 - `sample_rate`:音频采样率,默认值:`16000`。 - `config`:ASR 任务的参数文件,若不设置则使用预训练模型中的默认配置,默认值:`None`。 - `ckpt_path`:模型参数文件,若不设置则下载预训练模型使用,默认值:`None`。 @@ -80,14 +84,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ### 4.预训练模型 以下是 PaddleSpeech 提供的可以被命令行和 python API 使用的预训练模型列表: -| 模型 | 语言 | 采样率 -| :--- | :---: | :---: | -| conformer_wenetspeech | zh | 16k -| conformer_online_multicn | zh | 16k -| conformer_aishell | zh | 16k -| conformer_online_aishell | zh | 16k -| transformer_librispeech | en | 16k -| deepspeech2online_wenetspeech | zh | 16k -| deepspeech2offline_aishell| zh| 16k -| deepspeech2online_aishell | zh | 16k -| deepspeech2offline_librispeech | en | 16k +| 模型 | 语言转换 | 语言 | 采样率 +| :--- | :---: | :---: | :---: | +| conformer_wenetspeech | False | zh | 16k +| conformer_online_multicn | False | zh | 16k +| conformer_aishell | False | zh | 16k +| conformer_online_aishell | False | zh | 16k +| transformer_librispeech | False | en | 16k +| deepspeech2online_wenetspeech | False | zh | 16k +| deepspeech2offline_aishell | False | zh| 16k +| deepspeech2online_aishell | False | zh | 16k +| deepspeech2offline_librispeech | False | en | 16k +| conformer_talcs | True | zh_en | 16k diff --git a/demos/speech_recognition/run.sh b/demos/speech_recognition/run.sh index e48ff3e96..8ba6e4c3e 100755 --- a/demos/speech_recognition/run.sh +++ b/demos/speech_recognition/run.sh @@ -2,6 +2,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav # asr paddlespeech asr --input ./zh.wav @@ -18,6 +19,11 @@ paddlespeech asr --help # english asr paddlespeech asr --lang en --model transformer_librispeech --input ./en.wav + +# code-switch asr +paddlespeech asr --lang zh_en --codeswitch True --model conformer_talcs --input ./ch_zh_mix.wav + + # model stats paddlespeech stats --task asr diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 004143361..7a7aef8b0 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -25,6 +25,9 @@ import librosa import numpy as np import paddle import soundfile +from paddlespeech.audio.transform.transformation import Transformation +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.utils.utility import UpdateConfig from yacs.config import CfgNode from ...utils.env import MODEL_HOME @@ -34,9 +37,6 @@ from ..log import logger from ..utils import CLI_TIMER from ..utils import stats_wrapper from ..utils import timer_register -from paddlespeech.audio.transform.transformation import Transformation -from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer -from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ['ASRExecutor'] @@ 
-62,8 +62,13 @@ class ASRExecutor(BaseExecutor): '--lang', type=str, default='zh', - help='Choose model language. zh or en, zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k]' + help='Choose model language. [zh, en, zh_en], zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k], zh_en:[conformer_talcs-codeswitch_zh_en-16k]' ) + self.parser.add_argument( + '--codeswitch', + type=bool, + default=False, + help='Choose whether use code-switch. True or False.') self.parser.add_argument( "--sample_rate", type=int, @@ -127,6 +132,7 @@ class ASRExecutor(BaseExecutor): def _init_from_path(self, model_type: str='wenetspeech', lang: str='zh', + codeswitch: bool=False, sample_rate: int=16000, cfg_path: Optional[os.PathLike]=None, decode_method: str='attention_rescoring', @@ -144,7 +150,12 @@ class ASRExecutor(BaseExecutor): if cfg_path is None or ckpt_path is None: sample_rate_str = '16k' if sample_rate == 16000 else '8k' - tag = model_type + '-' + lang + '-' + sample_rate_str + if lang == "zh_en" and codeswitch is True: + tag = model_type + '-' + 'codeswitch_' + lang + '-' + sample_rate_str + elif lang == "zh_en" or codeswitch is True: + raise Exception("codeswitch is true only in zh_en model") + else: + tag = model_type + '-' + lang + '-' + sample_rate_str self.task_resource.set_task_model(tag, version=None) self.res_path = self.task_resource.res_dir @@ -423,6 +434,7 @@ class ASRExecutor(BaseExecutor): model = parser_args.model lang = parser_args.lang + codeswitch = parser_args.codeswitch sample_rate = parser_args.sample_rate config = parser_args.config ckpt_path = parser_args.ckpt_path @@ -444,6 +456,7 @@ class ASRExecutor(BaseExecutor): audio_file=input_, model=model, lang=lang, + codeswitch=codeswitch, sample_rate=sample_rate, config=config, ckpt_path=ckpt_path, @@ -472,6 +485,7 @@ class ASRExecutor(BaseExecutor): audio_file: os.PathLike, model: str='conformer_u2pp_online_wenetspeech', lang: str='zh', + codeswitch: bool=False, sample_rate: int=16000, config: os.PathLike=None, ckpt_path: os.PathLike=None, @@ -485,8 +499,8 @@ class ASRExecutor(BaseExecutor): """ audio_file = os.path.abspath(audio_file) paddle.set_device(device) - self._init_from_path(model, lang, sample_rate, config, decode_method, - num_decoding_left_chunks, ckpt_path) + self._init_from_path(model, lang, codeswitch, sample_rate, config, + decode_method, num_decoding_left_chunks, ckpt_path) if not self._check(audio_file, sample_rate, force_yes): sys.exit(-1) if rtf: diff --git a/paddlespeech/cli/base_commands.py b/paddlespeech/cli/base_commands.py index 767d0df78..dfeb5cae5 100644 --- a/paddlespeech/cli/base_commands.py +++ b/paddlespeech/cli/base_commands.py @@ -14,6 +14,7 @@ import argparse from typing import List +import numpy from prettytable import PrettyTable from ..resource import CommonTaskResource @@ -78,7 +79,7 @@ class VersionCommand: model_name_format = { - 'asr': 'Model-Language-Sample Rate', + 'asr': 'Model-Size-Code Switch-Multilingual-Language-Sample Rate', 'cls': 'Model-Sample Rate', 'st': 'Model-Source language-Target language', 'text': 'Model-Task-Language', @@ -111,7 +112,21 @@ class StatsCommand: fields = model_name_format[self.task].split("-") table = PrettyTable(fields) for key in pretrained_models: - table.add_row(key.split("-")) + line = key.split("-") + if self.task == "asr" and len(line) < len(fields): + for i in range(len(line), len(fields)): + line.append("-") + if "codeswitch" in key: + line[3], line[1] = line[1].split("_")[0], line[1].split( + "_")[1:] + elif 
"multilingual" in key: + line[4], line[1] = line[1].split("_")[0], line[1].split( + "_")[1:] + tmp = numpy.array(line) + idx = [0, 5, 3, 4, 1, 2] + line = tmp[idx] + table.add_row(line) + print(table) def execute(self, argv: List[str]) -> bool: diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index 3c5aa1f90..ff0b30f6d 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -30,6 +30,7 @@ __all__ = [ ] # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]". +# Add code-switch and multilingual tag, "{model_name}[_{dataset}]-[codeswitch/multilingual][_{lang}][-...]". # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k". # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" @@ -322,6 +323,18 @@ asr_dynamic_pretrained_models = { '099a601759d467cd0a8523ff939819c5' }, }, + "conformer_talcs-codeswitch_zh_en-16k": { + '1.4': { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/tal_cs/asr1/asr1_conformer_talcs_ckpt_1.4.0.model.tar.gz', + 'md5': + '01962c5d0a70878fe41cacd4f61e14d1', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/conformer/checkpoints/avg_10' + }, + }, } asr_static_pretrained_models = { diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index 1b1792bd1..299a8c3d4 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -16,14 +16,9 @@ import sys import warnings from typing import List +import numpy import uvicorn from fastapi import FastAPI -from prettytable import PrettyTable -from starlette.middleware.cors import CORSMiddleware - -from ..executor import BaseExecutor -from ..util import cli_server_register -from ..util import stats_wrapper from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource from paddlespeech.server.engine.engine_pool import init_engine_pool @@ -31,6 +26,12 @@ from paddlespeech.server.engine.engine_warmup import warm_up from paddlespeech.server.restful.api import setup_router as setup_http_router from paddlespeech.server.utils.config import get_config from paddlespeech.server.ws.api import setup_router as setup_ws_router +from prettytable import PrettyTable +from starlette.middleware.cors import CORSMiddleware + +from ..executor import BaseExecutor +from ..util import cli_server_register +from ..util import stats_wrapper warnings.filterwarnings("ignore") __all__ = ['ServerExecutor', 'ServerStatsExecutor'] @@ -134,7 +135,7 @@ class ServerStatsExecutor(): required=True) self.task_choices = ['asr', 'tts', 'cls', 'text', 'vector'] self.model_name_format = { - 'asr': 'Model-Language-Sample Rate', + 'asr': 'Model-Size-Code Switch-Multilingual-Language-Sample Rate', 'tts': 'Model-Language', 'cls': 'Model-Sample Rate', 'text': 'Model-Task-Language', @@ -145,7 +146,20 @@ class ServerStatsExecutor(): fields = self.model_name_format[self.task].split("-") table = PrettyTable(fields) for key in pretrained_models: - table.add_row(key.split("-")) + line = key.split("-") + if self.task == "asr" and len(line) < len(fields): + for i in range(len(line), len(fields)): + line.append("-") + if "codeswitch" in key: + line[3], line[1] = line[1].split("_")[0], line[1].split( + "_")[1:] + elif "multilingual" in key: + line[4], line[1] = line[1].split("_")[0], line[1].split( + 
"_")[1:] + tmp = numpy.array(line) + idx = [0, 5, 3, 4, 1, 2] + line = tmp[idx] + table.add_row(line) print(table) def execute(self, argv: List[str]) -> bool: diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 3a58626d2..5d3b76f6c 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -14,7 +14,7 @@ paddlespeech ssl --task asr --lang en --input ./en.wav paddlespeech ssl --task vector --lang en --input ./en.wav # Speech_recognition -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav paddlespeech asr --input ./zh.wav paddlespeech asr --model conformer_aishell --input ./zh.wav paddlespeech asr --model conformer_online_aishell --input ./zh.wav @@ -26,6 +26,7 @@ paddlespeech asr --model deepspeech2offline_aishell --input ./zh.wav paddlespeech asr --model deepspeech2online_wenetspeech --input ./zh.wav paddlespeech asr --model deepspeech2online_aishell --input ./zh.wav paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.wav +paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav # Support editing num_decoding_left_chunks paddlespeech asr --model conformer_online_wenetspeech --num_decoding_left_chunks 3 --input ./zh.wav From faa2f866516e1e1afb40b25df907ebe3078bd078 Mon Sep 17 00:00:00 2001 From: HuangLiangJie Date: Wed, 11 Jan 2023 12:59:41 +0800 Subject: [PATCH 03/24] [TTS]update VITS init method (#2809) --- paddlespeech/t2s/models/vits/text_encoder.py | 13 +- paddlespeech/t2s/models/vits/vits.py | 55 +++- paddlespeech/utils/initialize.py | 321 +++++++++++++++++++ 3 files changed, 375 insertions(+), 14 deletions(-) create mode 100644 paddlespeech/utils/initialize.py diff --git a/paddlespeech/t2s/models/vits/text_encoder.py b/paddlespeech/t2s/models/vits/text_encoder.py index 799e0c759..015ed76c6 100644 --- a/paddlespeech/t2s/models/vits/text_encoder.py +++ b/paddlespeech/t2s/models/vits/text_encoder.py @@ -24,6 +24,7 @@ from paddle import nn from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder as Encoder +from paddlespeech.utils.initialize import normal_ class TextEncoder(nn.Layer): @@ -105,10 +106,6 @@ class TextEncoder(nn.Layer): # define modules self.emb = nn.Embedding(vocabs, attention_dim) - dist = paddle.distribution.Normal(loc=0.0, scale=attention_dim**-0.5) - w = dist.sample(self.emb.weight.shape) - self.emb.weight.set_value(w) - self.encoder = Encoder( idim=-1, input_layer=None, @@ -130,6 +127,8 @@ class TextEncoder(nn.Layer): cnn_module_kernel=conformer_kernel_size, ) self.proj = nn.Conv1D(attention_dim, attention_dim * 2, 1) + self.reset_parameters() + def forward( self, x: paddle.Tensor, @@ -166,3 +165,9 @@ class TextEncoder(nn.Layer): m, logs = paddle.split(stats, 2, axis=1) return x, m, logs, x_mask + + def reset_parameters(self): + normal_(self.emb.weight, mean=0.0, std=self.attention_dim**-0.5) + if self.emb._padding_idx is not None: + with paddle.no_grad(): + self.emb.weight[self.emb._padding_idx] = 0 diff --git a/paddlespeech/t2s/models/vits/vits.py b/paddlespeech/t2s/models/vits/vits.py index 0ff3a546d..e68ed5643 100644 --- a/paddlespeech/t2s/models/vits/vits.py +++ b/paddlespeech/t2s/models/vits/vits.py @@ -13,6 +13,7 @@ # limitations under 
the License. # Modified from espnet(https://github.com/espnet/espnet) """VITS module""" +import math from typing import Any from typing import Dict from typing import Optional @@ -27,7 +28,12 @@ from paddlespeech.t2s.models.hifigan import HiFiGANMultiScaleMultiPeriodDiscrimi from paddlespeech.t2s.models.hifigan import HiFiGANPeriodDiscriminator from paddlespeech.t2s.models.hifigan import HiFiGANScaleDiscriminator from paddlespeech.t2s.models.vits.generator import VITSGenerator -from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.utils.initialize import _calculate_fan_in_and_fan_out +from paddlespeech.utils.initialize import kaiming_uniform_ +from paddlespeech.utils.initialize import normal_ +from paddlespeech.utils.initialize import ones_ +from paddlespeech.utils.initialize import uniform_ +from paddlespeech.utils.initialize import zeros_ AVAILABLE_GENERATERS = { "vits_generator": VITSGenerator, @@ -152,8 +158,7 @@ class VITS(nn.Layer): "use_spectral_norm": False, }, }, - cache_generator_outputs: bool=True, - init_type: str="xavier_uniform", ): + cache_generator_outputs: bool=True, ): """Initialize VITS module. Args: idim (int): @@ -179,9 +184,6 @@ class VITS(nn.Layer): assert check_argument_types() super().__init__() - # initialize parameters - initialize(self, init_type) - # define modules generator_class = AVAILABLE_GENERATERS[generator_type] if generator_type == "vits_generator": @@ -196,8 +198,6 @@ class VITS(nn.Layer): self.discriminator = discriminator_class( **discriminator_params, ) - nn.initializer.set_global_initializer(None) - # cache self.cache_generator_outputs = cache_generator_outputs self._cache = None @@ -214,6 +214,10 @@ class VITS(nn.Layer): self.reuse_cache_gen = True self.reuse_cache_dis = True + self.reset_parameters() + self.generator.decoder.reset_parameters() + self.generator.text_encoder.reset_parameters() + def forward( self, text: paddle.Tensor, @@ -243,7 +247,7 @@ class VITS(nn.Layer): forward_generator (bool): Whether to forward generator. Returns: - + """ if forward_generator: return self._forward_generator( @@ -290,7 +294,7 @@ class VITS(nn.Layer): lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). 
Returns: - + """ # setup feats = feats.transpose([0, 2, 1]) @@ -497,3 +501,34 @@ class VITS(nn.Layer): lids, ) return dict(wav=paddle.reshape(wav, [-1])) + + def reset_parameters(self): + def _reset_parameters(module): + if isinstance(module, + (nn.Conv1D, nn.Conv1DTranspose, nn.Conv2D, nn.Conv2DTranspose)): + kaiming_uniform_(module.weight, a=math.sqrt(5)) + if module.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out(module.weight) + if fan_in != 0: + bound = 1 / math.sqrt(fan_in) + uniform_(module.bias, -bound, bound) + + if isinstance(module, + (nn.BatchNorm1D, nn.BatchNorm2D, nn.GroupNorm, nn.LayerNorm)): + ones_(module.weight) + zeros_(module.bias) + + if isinstance(module, nn.Linear): + kaiming_uniform_(module.weight, a=math.sqrt(5)) + if module.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out(module.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + uniform_(module.bias, -bound, bound) + + if isinstance(module, nn.Embedding): + normal_(module.weight) + if module._padding_idx is not None: + with paddle.no_grad(): + module.weight[module._padding_idx] = 0 + + self.apply(_reset_parameters) diff --git a/paddlespeech/utils/initialize.py b/paddlespeech/utils/initialize.py new file mode 100644 index 000000000..8ebe6845e --- /dev/null +++ b/paddlespeech/utils/initialize.py @@ -0,0 +1,321 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py +Ths copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file. +""" +import math + +import numpy as np +import paddle +import paddle.nn as nn + +__all__ = [ + "uniform_", + "normal_", + "constant_", + "ones_", + "zeros_", + "xavier_uniform_", + "xavier_normal_", + "kaiming_uniform_", + "kaiming_normal_", + "linear_init_", + "conv_init_", + "reset_initialized_parameter", + "_calculate_fan_in_and_fan_out", +] + + +def _no_grad_uniform_(tensor, a, b): + with paddle.no_grad(): + tensor.set_value( + paddle.uniform( + shape=tensor.shape, dtype=tensor.dtype, min=a, max=b)) + return tensor + + +def _no_grad_normal_(tensor, mean=0.0, std=1.0): + with paddle.no_grad(): + tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape)) + return tensor + + +def _no_grad_fill_(tensor, value=0.0): + with paddle.no_grad(): + tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype)) + return tensor + + +def uniform_(tensor, a, b): + """ + Modified tensor inspace using uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + a (float|int): min value. + b (float|int): max value. + Return: + tensor + """ + return _no_grad_uniform_(tensor, a, b) + + +def normal_(tensor, mean=0.0, std=1.0): + """ + Modified tensor inspace using normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mean (float|int): mean value. + std (float|int): std value. 
+    Return:
+        tensor
+    """
+    return _no_grad_normal_(tensor, mean, std)
+
+
+def constant_(tensor, value=0.0):
+    """
+    Modify tensor in place using constant_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        value (float|int): value to fill tensor.
+    Return:
+        tensor
+    """
+    return _no_grad_fill_(tensor, value)
+
+
+def ones_(tensor):
+    """
+    Modify tensor in place using ones_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+    Return:
+        tensor
+    """
+    return _no_grad_fill_(tensor, 1)
+
+
+def zeros_(tensor):
+    """
+    Modify tensor in place using zeros_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+    Return:
+        tensor
+    """
+    return _no_grad_fill_(tensor, 0)
+
+
+def vector_(tensor, vector):
+    with paddle.no_grad():
+        tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype))
+    return tensor
+
+
+def _calculate_fan_in_and_fan_out(tensor, reverse=False):
+    """
+    Calculate (fan_in, fan_out) for tensor
+    Args:
+        tensor (Tensor): paddle.Tensor
+        reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True
+    Return:
+        Tuple[fan_in, fan_out]
+    """
+    if tensor.ndim < 2:
+        raise ValueError(
+            "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
+        )
+
+    if reverse:
+        num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1]
+    else:
+        num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0]
+
+    receptive_field_size = 1
+    if tensor.ndim > 2:
+        receptive_field_size = np.prod(tensor.shape[2:])
+
+    fan_in = num_input_fmaps * receptive_field_size
+    fan_out = num_output_fmaps * receptive_field_size
+
+    return fan_in, fan_out
+
+
+def xavier_uniform_(tensor, gain=1.0, reverse=False):
+    """
+    Modify tensor in place using xavier_uniform_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        gain (float): scaling factor, 1.0 by default.
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...].
+    Return:
+        tensor
+    """
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
+    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
+    k = math.sqrt(3.0) * std
+    return _no_grad_uniform_(tensor, -k, k)
+
+
+def xavier_normal_(tensor, gain=1.0, reverse=False):
+    """
+    Modify tensor in place using xavier_normal_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        gain (float): scaling factor, 1.0 by default.
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...]. 
+    Return:
+        tensor
+    """
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
+    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
+    return _no_grad_normal_(tensor, 0, std)
+
+
+# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html
+def _calculate_correct_fan(tensor, mode, reverse=False):
+    mode = mode.lower()
+    valid_modes = ["fan_in", "fan_out"]
+    if mode not in valid_modes:
+        raise ValueError("Mode {} not supported, please use one of {}".format(
+            mode, valid_modes))
+
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse)
+
+    return fan_in if mode == "fan_in" else fan_out
+
+
+def _calculate_gain(nonlinearity, param=None):
+    linear_fns = [
+        "linear", "conv1d", "conv2d", "conv3d", "conv_transpose1d",
+        "conv_transpose2d", "conv_transpose3d"
+    ]
+    if nonlinearity in linear_fns or nonlinearity == "sigmoid":
+        return 1
+    elif nonlinearity == "tanh":
+        return 5.0 / 3
+    elif nonlinearity == "relu":
+        return math.sqrt(2.0)
+    elif nonlinearity == "leaky_relu":
+        if param is None:
+            negative_slope = 0.01
+        elif not isinstance(param, bool) and isinstance(
+                param, int) or isinstance(param, float):
+            # True/False are instances of int, hence check above
+            negative_slope = param
+        else:
+            raise ValueError(
+                "negative_slope {} not a valid number".format(param))
+        return math.sqrt(2.0 / (1 + negative_slope**2))
+    elif nonlinearity == "selu":
+        return 3.0 / 4
+    else:
+        raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
+
+
+def kaiming_uniform_(tensor,
+                     a=0,
+                     mode="fan_in",
+                     nonlinearity="leaky_relu",
+                     reverse=False):
+    """
+    Modify tensor in place using the kaiming_uniform method
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        mode (str): ['fan_in', 'fan_out'], 'fan_in' by default
+        nonlinearity (str): nonlinearity method name
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...].
+    Return:
+        tensor
+    """
+    fan = _calculate_correct_fan(tensor, mode, reverse)
+    gain = _calculate_gain(nonlinearity, a)
+    std = gain / math.sqrt(fan)
+    k = math.sqrt(3.0) * std
+    return _no_grad_uniform_(tensor, -k, k)
+
+
+def kaiming_normal_(tensor,
+                    a=0,
+                    mode="fan_in",
+                    nonlinearity="leaky_relu",
+                    reverse=False):
+    """
+    Modify tensor in place using kaiming_normal_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        mode (str): ['fan_in', 'fan_out'], 'fan_in' by default
+        nonlinearity (str): nonlinearity method name
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + return _no_grad_normal_(tensor, 0, std) + + +def linear_init_(module): + bound = 1 / math.sqrt(module.weight.shape[0]) + uniform_(module.weight, -bound, bound) + uniform_(module.bias, -bound, bound) + + +def conv_init_(module): + bound = 1 / np.sqrt(np.prod(module.weight.shape[1:])) + uniform_(module.weight, -bound, bound) + if module.bias is not None: + uniform_(module.bias, -bound, bound) + + +def bias_init_with_prob(prior_prob=0.01): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +@paddle.no_grad() +def reset_initialized_parameter(model, include_self=True): + """ + Reset initialized parameter using following method for [conv, linear, embedding, bn] + Args: + model (paddle.Layer): paddle Layer + include_self (bool: False): include_self for Layer.named_sublayers method. Indicate whether including itself + Return: + None + """ + for _, m in model.named_sublayers(include_self=include_self): + if isinstance(m, nn.Conv2D): + k = float(m._groups) / (m._in_channels * m._kernel_size[0] * + m._kernel_size[1]) + k = math.sqrt(k) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Linear): + k = math.sqrt(1.0 / m.weight.shape[0]) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Embedding): + _no_grad_normal_(m.weight, mean=0.0, std=1.0) + + elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)): + _no_grad_fill_(m.weight, 1.0) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_fill_(m.bias, 0) From ad40dafa856b9c4539e7b9f82bad2d9ff8c317f4 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Thu, 12 Jan 2023 10:23:56 +0800 Subject: [PATCH 04/24] fix some bug. 
(#2825) --- paddlespeech/s2t/models/whisper/tokenizer.py | 4 ++++ paddlespeech/s2t/models/whisper/whipser.py | 14 +++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/paddlespeech/s2t/models/whisper/tokenizer.py b/paddlespeech/s2t/models/whisper/tokenizer.py index 8bd85c914..1e1aea044 100644 --- a/paddlespeech/s2t/models/whisper/tokenizer.py +++ b/paddlespeech/s2t/models/whisper/tokenizer.py @@ -155,6 +155,10 @@ class Tokenizer: if ids < len(self.tokenizer): ids_list.append(ids) token_ids = ids_list + elif len(token_ids) == 1: + token_ids = token_ids[0] + else: + raise ValueError(f"token_ids {token_ids} load error.") return self.tokenizer.decode(token_ids, **kwargs) diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py index 63cafbdb7..81692f37a 100644 --- a/paddlespeech/s2t/models/whisper/whipser.py +++ b/paddlespeech/s2t/models/whisper/whipser.py @@ -17,12 +17,11 @@ from typing import Union import numpy as np import paddle import paddle.nn.functional as F +import paddlespeech.s2t.modules.align as paddlespeech_nn import soundfile import tqdm from paddle import nn from paddle.distribution import Categorical - -import paddlespeech.s2t.modules.align as paddlespeech_nn from paddlespeech.s2t.models.whisper import utils from paddlespeech.s2t.models.whisper.tokenizer import get_tokenizer from paddlespeech.s2t.models.whisper.tokenizer import LANGUAGES @@ -771,8 +770,10 @@ class GreedyDecoder(TokenDecoder): if temperature == 0: next_tokens = paddle.argmax(logits, axis=-1) else: - next_tokens = Categorical(logits=logits / temperature).sample( - shape=logits.shape) + next_tokens = Categorical(logits=logits / temperature).sample([1]) + next_tokens = paddle.reshape(next_tokens, [ + next_tokens.shape[0] * next_tokens.shape[1], + ]) logprobs = F.log_softmax(logits, axis=-1, dtype=paddle.float32) current_logprobs = logprobs[paddle.arange(logprobs.shape[0]), @@ -1205,9 +1206,8 @@ class DecodingTask: DecodingResult( audio_features=features, language=language, - language_probs=probs) - for features, language, probs in zip(audio_features, languages, - language_probs) + language_probs=probs) for features, language, probs in + zip(audio_features, languages, language_probs) ] # repeat the audio & text tensors by the group size, for beam search or best-of-n sampling From a99244d86e56a0d796f04919b4e6493b6d4d22a6 Mon Sep 17 00:00:00 2001 From: cxumol Date: Wed, 11 Jan 2023 22:04:10 -0800 Subject: [PATCH 05/24] fix: whisper language choice, test=asr (#2828) --- paddlespeech/s2t/models/whisper/whipser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py index 81692f37a..9cf9a9eca 100644 --- a/paddlespeech/s2t/models/whisper/whipser.py +++ b/paddlespeech/s2t/models/whisper/whipser.py @@ -476,7 +476,7 @@ def transcribe( decode_options["fp16"] = False if decode_options.get( - "language", 'None') or decode_options.get("language", None) is None: + "language") == 'None' or decode_options.get("language", None) is None: if not model.is_multilingual: decode_options["language"] = "en" else: From 742523fb38f521aaa93431658a7eb2042b2bad81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=99=8B=E4=B8=9C=E6=AF=85?= Date: Fri, 13 Jan 2023 15:45:49 +0800 Subject: [PATCH 06/24] [tts]For mixed Chinese and English speech synthesis, add SSML support for Chinese (#2830) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 
添加.history * [tts]添加中英混合语音合成时对中文SSML的支持 --- .gitignore | 1 + paddlespeech/t2s/frontend/mix_frontend.py | 57 ++++++++++++++++++++--- paddlespeech/t2s/ssml/xml_processor.py | 34 ++++++++++++++ 3 files changed, 86 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 75f56b604..4a0c43312 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ *.egg-info build *output/ +.history audio/dist/ audio/fc_patch/ diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py index 19c98d53f..c13a5ab62 100644 --- a/paddlespeech/t2s/frontend/mix_frontend.py +++ b/paddlespeech/t2s/frontend/mix_frontend.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import re from typing import Dict from typing import List @@ -18,6 +19,7 @@ import paddle from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor class MixFrontend(): @@ -107,7 +109,40 @@ class MixFrontend(): add_sp: bool=True, to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: - segments = self.get_segment(sentence) + ''' 1. 添加SSML支持,先列出 文字 和 标签内容, + 然后添加到tmpSegments数组里 + ''' + d_inputs = MixTextProcessor.get_dom_split(sentence) + tmpSegments = [] + for instr in d_inputs: + ''' 暂时只支持 say-as ''' + if instr.lower().startswith("" + segments.append(tuple(currentSeg)) + segments.append(seg) + currentSeg = ["", ""] + else: + if currentSeg[0] == '': + currentSeg[0] = seg[0] + currentSeg[1] = seg[1] + else: + currentSeg[0] = currentSeg[0] + seg[0] + if currentSeg[0] != '': + currentSeg[0] = "" + currentSeg[0] + "" + segments.append(tuple(currentSeg)) phones_list = [] result = {} @@ -120,11 +155,21 @@ class MixFrontend(): input_ids = self.en_frontend.get_input_ids( content, merge_sentences=False, to_tensor=to_tensor) else: - input_ids = self.zh_frontend.get_input_ids( - content, - merge_sentences=False, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) + ''' 3. 
把带speak tag的中文和普通文字分开处理 + ''' + if content.strip() != "" and \ + re.match(r".*?.*?.*", content, re.DOTALL): + input_ids = self.zh_frontend.get_input_ids_ssml( + content, + merge_sentences=False, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) + else: + input_ids = self.zh_frontend.get_input_ids( + content, + merge_sentences=False, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) if add_sp: input_ids["phone_ids"][-1] = paddle.concat( [input_ids["phone_ids"][-1], self.sp_id_tensor]) diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/ssml/xml_processor.py index b39121347..892ca371e 100644 --- a/paddlespeech/t2s/ssml/xml_processor.py +++ b/paddlespeech/t2s/ssml/xml_processor.py @@ -74,6 +74,28 @@ class MixTextProcessor(): ctlist.append([mixstr, []]) return ctlist + @classmethod + def get_dom_split(self, mixstr): + ''' 文本分解,顺序加了列表中,返回文本和say-as标签 + ''' + ctlist = [] + patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) + mat = re.match(patn, mixstr) + if mat: + pre_xml = mat.group(1) + in_xml = mat.group(2) + after_xml = mat.group(3) + + ctlist.append(pre_xml) + dom = DomXml(in_xml) + tags = dom.get_text_and_sayas_tags() + ctlist.extend(tags) + + ctlist.append(after_xml) + return ctlist + else: + ctlist.append(mixstr) + return ctlist class DomXml(): def __init__(self, xmlstr): @@ -156,3 +178,15 @@ class DomXml(): if x.hasAttribute('pinyin'): # pinyin print(x.tagName, 'pinyin', x.getAttribute('pinyin'), x.firstChild.data) + + def get_text_and_sayas_tags(self): + '''返回 xml 内容的列表,包括所有文本内容和 tag''' + res = [] + + for x1 in self.rnode: + if x1.nodeType == Node.TEXT_NODE: + res.append(x1.value) + else: + for x2 in x1.childNodes: + res.append(x2.toxml()) + return res From 1fd38c0e8b5937a5e9a1fd576e35c610b7b181a0 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 13 Jan 2023 17:40:47 +0800 Subject: [PATCH 07/24] fix o (#2831) --- paddlespeech/t2s/frontend/g2pw/onnx_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/t2s/frontend/g2pw/onnx_api.py b/paddlespeech/t2s/frontend/g2pw/onnx_api.py index 47c26a610..3ce3d246d 100644 --- a/paddlespeech/t2s/frontend/g2pw/onnx_api.py +++ b/paddlespeech/t2s/frontend/g2pw/onnx_api.py @@ -100,7 +100,7 @@ class G2PWOnnxConverter: ] self.non_polyphonic = { '一', '不', '和', '咋', '嗲', '剖', '差', '攢', '倒', '難', '奔', '勁', '拗', - '肖', '瘙', '誒', '泊', '听' + '肖', '瘙', '誒', '泊', '听', '噢' } self.non_monophonic = {'似', '攢'} self.monophonic_chars = [ From 57b9d4bca4c897835a52a8f6a2f9ee04ddc4b402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=89=BE=E6=A2=A6?= Date: Fri, 13 Jan 2023 20:42:23 +0800 Subject: [PATCH 08/24] add diffusion module for training diffsinger (#2832) --- docs/requirements.txt | 1 + paddlespeech/t2s/modules/diffusion.py | 467 ++++++++++++++++++++++++++ setup.py | 1 + 3 files changed, 469 insertions(+) create mode 100644 paddlespeech/t2s/modules/diffusion.py diff --git a/docs/requirements.txt b/docs/requirements.txt index bd7f40ec3..c6228d917 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -27,6 +27,7 @@ pandas pathos==0.2.8 pattern_singleton Pillow>=9.0.0 +ppdiffusers>=0.9.0 praatio==5.0.0 prettytable pypinyin-dict diff --git a/paddlespeech/t2s/modules/diffusion.py b/paddlespeech/t2s/modules/diffusion.py new file mode 100644 index 000000000..52fe84ceb --- /dev/null +++ b/paddlespeech/t2s/modules/diffusion.py @@ -0,0 +1,467 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Diffusion denoising related modules for paddle""" +import math +from typing import Callable +from typing import Optional +from typing import Tuple + +import paddle +import ppdiffusers +from paddle import nn +from ppdiffusers.models.embeddings import Timesteps +from ppdiffusers.schedulers import DDPMScheduler + +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.residual_block import WaveNetResidualBlock + + +class WaveNetDenoiser(nn.Layer): + """A Mel-Spectrogram Denoiser modified from WaveNet + + Args: + in_channels (int, optional): + Number of channels of the input mel-spectrogram, by default 80 + out_channels (int, optional): + Number of channels of the output mel-spectrogram, by default 80 + kernel_size (int, optional): + Kernel size of the residual blocks inside, by default 3 + layers (int, optional): + Number of residual blocks inside, by default 20 + stacks (int, optional): + The number of groups to split the residual blocks into, by default 4 + Within each group, the dilation of the residual block grows exponentially. + residual_channels (int, optional): + Residual channel of the residual blocks, by default 256 + gate_channels (int, optional): + Gate channel of the residual blocks, by default 512 + skip_channels (int, optional): + Skip channel of the residual blocks, by default 256 + aux_channels (int, optional): + Auxiliary channel of the residual blocks, by default 256 + dropout (float, optional): + Dropout of the residual blocks, by default 0. 
+ bias (bool, optional): + Whether to use bias in residual blocks, by default True + use_weight_norm (bool, optional): + Whether to use weight norm in all convolutions, by default False + """ + + def __init__( + self, + in_channels: int=80, + out_channels: int=80, + kernel_size: int=3, + layers: int=20, + stacks: int=4, + residual_channels: int=256, + gate_channels: int=512, + skip_channels: int=256, + aux_channels: int=256, + dropout: float=0., + bias: bool=True, + use_weight_norm: bool=False, + init_type: str="kaiming_uniform", ): + super().__init__() + + # initialize parameters + initialize(self, init_type) + + self.in_channels = in_channels + self.out_channels = out_channels + self.aux_channels = aux_channels + self.layers = layers + self.stacks = stacks + self.kernel_size = kernel_size + + assert layers % stacks == 0 + layers_per_stack = layers // stacks + + self.first_t_emb = nn.Sequential( + Timesteps( + residual_channels, + flip_sin_to_cos=False, + downscale_freq_shift=1), + nn.Linear(residual_channels, residual_channels * 4), + nn.Mish(), nn.Linear(residual_channels * 4, residual_channels)) + self.t_emb_layers = nn.LayerList([ + nn.Linear(residual_channels, residual_channels) + for _ in range(layers) + ]) + + self.first_conv = nn.Conv1D( + in_channels, residual_channels, 1, bias_attr=True) + self.first_act = nn.ReLU() + + self.conv_layers = nn.LayerList() + for layer in range(layers): + dilation = 2**(layer % layers_per_stack) + conv = WaveNetResidualBlock( + kernel_size=kernel_size, + residual_channels=residual_channels, + gate_channels=gate_channels, + skip_channels=skip_channels, + aux_channels=aux_channels, + dilation=dilation, + dropout=dropout, + bias=bias) + self.conv_layers.append(conv) + + self.last_conv_layers = nn.Sequential(nn.ReLU(), + nn.Conv1D( + skip_channels, + skip_channels, + 1, + bias_attr=True), + nn.ReLU(), + nn.Conv1D( + skip_channels, + out_channels, + 1, + bias_attr=True)) + + if use_weight_norm: + self.apply_weight_norm() + + def forward(self, x, t, c): + """Denoise mel-spectrogram. + + Args: + x(Tensor): + Shape (N, C_in, T), The input mel-spectrogram. + t(Tensor): + Shape (N), The timestep input. + c(Tensor): + Shape (N, C_aux, T'). The auxiliary input (e.g. fastspeech2 encoder output). + + Returns: + Tensor: Shape (N, C_out, T), the denoised mel-spectrogram. + """ + assert c.shape[-1] == x.shape[-1] + + if t.shape[0] != x.shape[0]: + t = t.tile([x.shape[0]]) + t_emb = self.first_t_emb(t) + t_embs = [ + t_emb_layer(t_emb)[..., None] for t_emb_layer in self.t_emb_layers + ] + + x = self.first_conv(x) + x = self.first_act(x) + skips = 0 + for f, t in zip(self.conv_layers, t_embs): + x = x + t + x, s = f(x, c) + skips += s + skips *= math.sqrt(1.0 / len(self.conv_layers)) + + x = self.last_conv_layers(skips) + return x + + def apply_weight_norm(self): + """Recursively apply weight normalization to all the Convolution layers + in the sublayers. + """ + + def _apply_weight_norm(layer): + if isinstance(layer, (nn.Conv1D, nn.Conv2D)): + nn.utils.weight_norm(layer) + + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + """Recursively remove weight normalization from all the Convolution + layers in the sublayers. + """ + + def _remove_weight_norm(layer): + try: + nn.utils.remove_weight_norm(layer) + except ValueError: + pass + + self.apply(_remove_weight_norm) + + +class GaussianDiffusion(nn.Layer): + """Common Gaussian Diffusion Denoising Model Module + + Args: + denoiser (Layer, optional): + The model used for denoising noises. 
+ In fact, the denoiser model performs the operation + of producing a output with more noises from the noisy input. + Then we use the diffusion algorithm to calculate + the input with the output to get the denoised result. + num_train_timesteps (int, optional): + The number of timesteps between the noise and the real during training, by default 1000. + beta_start (float, optional): + beta start parameter for the scheduler, by default 0.0001. + beta_end (float, optional): + beta end parameter for the scheduler, by default 0.0001. + beta_schedule (str, optional): + beta schedule parameter for the scheduler, by default 'squaredcos_cap_v2' (cosine schedule). + num_max_timesteps (int, optional): + The max timestep transition from real to noise, by default None. + + Examples: + >>> import paddle + >>> import paddle.nn.functional as F + >>> from tqdm import tqdm + >>> + >>> denoiser = WaveNetDenoiser() + >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=1000, num_max_timesteps=100) + >>> x = paddle.ones([4, 80, 192]) # [B, mel_ch, T] # real mel input + >>> c = paddle.randn([4, 256, 192]) # [B, fs2_encoder_out_ch, T] # fastspeech2 encoder output + >>> loss = F.mse_loss(*diffusion(x, c)) + >>> loss.backward() + >>> print('MSE Loss:', loss.item()) + MSE Loss: 1.6669728755950928 + >>> def create_progress_callback(): + >>> pbar = None + >>> def callback(index, timestep, num_timesteps, sample): + >>> nonlocal pbar + >>> if pbar is None: + >>> pbar = tqdm(total=num_timesteps-index) + >>> pbar.update() + >>> + >>> return callback + >>> + >>> # ds=1000, K_step=60, scheduler=ddpm, from aux fs2 mel output + >>> ds = 1000 + >>> infer_steps = 1000 + >>> K_step = 60 + >>> scheduler_type = 'ddpm' + >>> x_in = x + >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) + >>> with paddle.no_grad(): + >>> sample = diffusion.inference( + >>> paddle.randn(x.shape), c, x, + >>> num_inference_steps=infer_steps, + >>> scheduler_type=scheduler_type, + >>> callback=create_progress_callback()) + 100%|█████| 60/60 [00:03<00:00, 18.36it/s] + >>> + >>> # ds=100, K_step=100, scheduler=ddpm, from gaussian noise + >>> ds = 100 + >>> infer_steps = 100 + >>> K_step = 100 + >>> scheduler_type = 'ddpm' + >>> x_in = None + >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) + >>> with paddle.no_grad(): + >>> sample = diffusion.inference( + >>> paddle.randn(x.shape), c, x_in, + >>> num_inference_steps=infer_steps, + >>> scheduler_type=scheduler_type, + >>> callback=create_progress_callback()) + 100%|█████| 100/100 [00:05<00:00, 18.29it/s] + >>> + >>> # ds=1000, K_step=1000, scheduler=pndm, infer_step=25, from gaussian noise + >>> ds = 1000 + >>> infer_steps = 25 + >>> K_step = 1000 + >>> scheduler_type = 'pndm' + >>> x_in = None + >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) + >>> with paddle.no_grad(): + >>> sample = diffusion.inference( + >>> paddle.randn(x.shape), c, None, + >>> num_inference_steps=infer_steps, + >>> scheduler_type=scheduler_type, + >>> callback=create_progress_callback()) + 100%|█████| 25/25 [00:01<00:00, 19.75it/s] + >>> + >>> # ds=1000, K_step=100, scheduler=pndm, infer_step=50, from aux fs2 mel output + >>> ds = 1000 + >>> infer_steps = 50 + >>> K_step = 100 + >>> scheduler_type = 'pndm' + >>> x_in = x + >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) + >>> with paddle.no_grad(): + >>> sample = diffusion.inference( + >>> 
paddle.randn(x.shape), c, x, + >>> num_inference_steps=infer_steps, + >>> scheduler_type=scheduler_type, + >>> callback=create_progress_callback()) + 100%|█████| 5/5 [00:00<00:00, 23.80it/s] + + """ + + def __init__(self, + denoiser: nn.Layer, + num_train_timesteps: Optional[int]=1000, + beta_start: Optional[float]=0.0001, + beta_end: Optional[float]=0.02, + beta_schedule: Optional[str]="squaredcos_cap_v2", + num_max_timesteps: Optional[int]=None): + super().__init__() + + self.num_train_timesteps = num_train_timesteps + self.beta_start = beta_start + self.beta_end = beta_end + self.beta_schedule = beta_schedule + + self.denoiser = denoiser + self.noise_scheduler = DDPMScheduler( + num_train_timesteps=num_train_timesteps, + beta_start=beta_start, + beta_end=beta_end, + beta_schedule=beta_schedule) + self.num_max_timesteps = num_max_timesteps + + def forward(self, x: paddle.Tensor, cond: Optional[paddle.Tensor]=None + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Generate random timesteps noised x. + + Args: + x (Tensor): + The input for adding noises. + cond (Tensor, optional): + Conditional input for compute noises. + + Returns: + y (Tensor): + The output with noises added in. + target (Tensor): + The noises which is added to the input. + + """ + noise_scheduler = self.noise_scheduler + + # Sample noise that we'll add to the mel-spectrograms + target = noise = paddle.randn(x.shape) + + # Sample a random timestep for each mel-spectrogram + num_timesteps = self.num_train_timesteps + if self.num_max_timesteps is not None: + num_timesteps = self.num_max_timesteps + timesteps = paddle.randint(0, num_timesteps, (x.shape[0], )) + + # Add noise to the clean mel-spectrograms according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_images = noise_scheduler.add_noise(x, noise, timesteps) + + y = self.denoiser(noisy_images, timesteps, cond) + + # then compute loss use output y and noisy target for prediction_type == "epsilon" + return y, target + + def inference(self, + noise: paddle.Tensor, + cond: Optional[paddle.Tensor]=None, + ref_x: Optional[paddle.Tensor]=None, + num_inference_steps: Optional[int]=1000, + strength: Optional[float]=None, + scheduler_type: Optional[str]="ddpm", + callback: Optional[Callable[[int, int, int, paddle.Tensor], + None]]=None, + callback_steps: Optional[int]=1): + """Denoising input from noises. Refer to ppdiffusers img2img pipeline. + + Args: + noise (Tensor): + The input tensor as a starting point for denoising. + cond (Tensor, optional): + Conditional input for compute noises. + ref_x (Tensor, optional): + The real output for the denoising process to refer. + num_inference_steps (int, optional): + The number of timesteps between the noise and the real during inference, by default 1000. + strength (float, optional): + Mixing strength of ref_x with noise. The larger the value, the stronger the noise. + Range [0,1], by default None. + scheduler_type (str, optional): + Noise scheduler for generate noises. + Choose a great scheduler can skip many denoising step, by default 'ddpm'. + callback (Callable[[int,int,int,Tensor], None], optional): + Callback function during denoising steps. + + Args: + index (int): + Current denoising index. + timestep (int): + Current denoising timestep. + num_timesteps (int): + Number of the denoising timesteps. + denoised_output (Tensor): + Current intermediate result produced during denoising. + + callback_steps (int, optional): + The step to call the callback function. 
+ + Returns: + denoised_output (Tensor): + The denoised output tensor. + + """ + scheduler_cls = None + for clsname in dir(ppdiffusers.schedulers): + if clsname.lower() == scheduler_type + "scheduler": + scheduler_cls = getattr(ppdiffusers.schedulers, clsname) + break + + if scheduler_cls is None: + raise ValueError(f"No such scheduler type named {scheduler_type}") + + scheduler = scheduler_cls( + num_train_timesteps=self.num_train_timesteps, + beta_start=self.beta_start, + beta_end=self.beta_end, + beta_schedule=self.beta_schedule) + + # set timesteps + scheduler.set_timesteps(num_inference_steps) + + # prepare first noise variables + noisy_input = noise + timesteps = scheduler.timesteps + if ref_x is not None: + init_timestep = None + if strength is None or strength < 0. or strength > 1.: + strength = None + if self.num_max_timesteps is not None: + strength = self.num_max_timesteps / self.num_train_timesteps + if strength is not None: + # get the original timestep using init_timestep + init_timestep = min( + int(num_inference_steps * strength), num_inference_steps) + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = scheduler.timesteps[t_start:] + num_inference_steps = num_inference_steps - t_start + noisy_input = scheduler.add_noise( + ref_x, noise, timesteps[:1].tile([noise.shape[0]])) + + # denoising loop + denoised_output = noisy_input + num_warmup_steps = len( + timesteps) - num_inference_steps * scheduler.order + for i, t in enumerate(timesteps): + denoised_output = scheduler.scale_model_input(denoised_output, t) + + # predict the noise residual + noise_pred = self.denoiser(denoised_output, t, cond) + + # compute the previous noisy sample x_t -> x_t-1 + denoised_output = scheduler.step(noise_pred, t, + denoised_output).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and + (i + 1) % scheduler.order == 0): + if callback is not None and i % callback_steps == 0: + callback(i, t, len(timesteps), denoised_output) + + return denoised_output diff --git a/setup.py b/setup.py index 3bde2b205..212d3b109 100644 --- a/setup.py +++ b/setup.py @@ -49,6 +49,7 @@ base = [ "opencc-python-reimplemented", "pandas", "paddlenlp>=2.4.8", + "ppdiffusers>=0.9.0", "paddlespeech_feat", "Pillow>=9.0.0", "praatio==5.0.0", From 2f3ca4ac4809767008f89b0ab24846b2f5e0b983 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 17 Jan 2023 13:55:18 +0800 Subject: [PATCH 09/24] Update README.md (#2840) * Update README.md * Update README_cn.md --- README.md | 4 ++-- README_cn.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2fb773634..40064f5d2 100644 --- a/README.md +++ b/README.md @@ -157,8 +157,8 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update -- 🔥 2022.01.10: Add [code-switch asr CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). -- 👑 2022.01.06: Add [code-switch asr tal_cs recipe](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). +- 🔥 2023.01.10: Add [code-switch asr CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). 
+- 👑 2023.01.06: Add [code-switch asr tal_cs recipe](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). - 🎉 2022.12.02: Add [end-to-end Prosody Prediction pipeline](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (including using prosody labels in Acoustic Model). - 🎉 2022.11.30: Add [TTS Android Demo](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid). - 🤗 2022.11.28: PP-TTS and PP-ASR demos are available in [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) and [official website diff --git a/README_cn.md b/README_cn.md index 53f6a66e4..d2e5f63d7 100644 --- a/README_cn.md +++ b/README_cn.md @@ -164,8 +164,8 @@ ### 近期更新 -- 🔥 2022.01.10: 新增 [中英混合 ASR CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). -- 👑 2022.01.06: 新增 [ASR中英混合 tal_cs 训练推理流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). +- 🔥 2023.01.10: 新增 [中英混合 ASR CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). +- 👑 2023.01.06: 新增 [ASR中英混合 tal_cs 训练推理流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). - 🎉 2022.12.02: 新增 [端到端韵律预测全流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (包含在声学模型中使用韵律标签)。 - 🎉 2022.11.30: 新增 [TTS Android 部署示例](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid)。 - 🤗 2022.11.28: PP-TTS and PP-ASR 示例可在 [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) 和[飞桨官网](https://www.paddlepaddle.org.cn/models)体验! From 478fd2593e215a65ec641bc1ba831e53d9da7d4b Mon Sep 17 00:00:00 2001 From: Ming Date: Tue, 17 Jan 2023 17:40:15 +0800 Subject: [PATCH 10/24] update QR Code in README, test=doc (#2841) --- README.md | 2 +- README_cn.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 40064f5d2..afc4e4d09 100644 --- a/README.md +++ b/README.md @@ -191,7 +191,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - Scan the QR code below with your Wechat, you can access to official technical exchange group and get the bonus ( more than 20GB learning materials, such as papers, codes and videos ) and the live link of the lessons. Look forward to your participation.
- +
## Installation diff --git a/README_cn.md b/README_cn.md index d2e5f63d7..ecc4644aa 100644 --- a/README_cn.md +++ b/README_cn.md @@ -202,7 +202,7 @@ 微信扫描二维码关注公众号,点击“马上报名”填写问卷加入官方交流群,获得更高效的问题答疑,与各行各业开发者充分交流,期待您的加入。
- +
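To make the diffusion module added above concrete (the same patch adds the `ppdiffusers>=0.9.0` dependency to setup.py), the snippet below sketches how its `forward` (training) and `inference` (sampling) entry points fit together. This is a minimal sketch, not code from the patch: the import path and class name `GaussianDiffusion`, the toy denoiser, and the tensor shapes are assumptions, since the class definition itself sits above the excerpt quoted here.

```python
import paddle
import paddle.nn as nn

# Assumed import path and class name; the class definition is not part of
# the excerpt quoted above.
from paddlespeech.t2s.modules.diffusion import GaussianDiffusion


class ToyDenoiser(nn.Layer):
    """Stand-in denoiser; real models also use the timestep and condition."""

    def __init__(self, n_mels: int=80):
        super().__init__()
        self.proj = nn.Linear(n_mels, n_mels)

    def forward(self, x, timesteps, cond=None):
        return self.proj(x)


diffusion = GaussianDiffusion(ToyDenoiser(), num_train_timesteps=1000)

# Training: forward() noises the input at random timesteps and returns the
# denoiser's prediction together with the noise that was added (the target).
mel = paddle.randn([4, 100, 80])  # (batch, frames, n_mels); shapes are illustrative
noise_pred, target = diffusion(mel)
loss = nn.functional.mse_loss(noise_pred, target)

# Sampling: start from pure noise; "ddim" is resolved to ppdiffusers'
# DDIMScheduler by the name lookup inside inference().
denoised = diffusion.inference(
    paddle.randn(mel.shape),
    num_inference_steps=25,
    scheduler_type="ddim")
```

When `ref_x` and `strength` are supplied, `inference` behaves like the ppdiffusers img2img pipeline: `ref_x` is noised to the level of the first remaining timestep and only the final `strength` fraction of the schedule is denoised.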
From 140aed4b545885cdb9a13117e9d1a009466c44ac Mon Sep 17 00:00:00 2001 From: HuangLiangJie Date: Thu, 19 Jan 2023 16:04:03 +0800 Subject: [PATCH 11/24] [TTS]VITS init sampler reverse, test=tts (#2843) --- paddlespeech/t2s/exps/vits/normalize.py | 2 +- paddlespeech/t2s/exps/vits/preprocess.py | 2 +- paddlespeech/t2s/exps/vits/train.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddlespeech/t2s/exps/vits/normalize.py b/paddlespeech/t2s/exps/vits/normalize.py index 514cbef8e..24e15765e 100644 --- a/paddlespeech/t2s/exps/vits/normalize.py +++ b/paddlespeech/t2s/exps/vits/normalize.py @@ -187,7 +187,7 @@ def main(): record["spk_emb"] = str(item["spk_emb"]) output_metadata.append(record) - output_metadata.sort(key=itemgetter('feats_lengths')) + output_metadata.sort(key=itemgetter('feats_lengths'), reverse=True) output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" with jsonlines.open(output_metadata_path, 'w') as writer: for item in output_metadata: diff --git a/paddlespeech/t2s/exps/vits/preprocess.py b/paddlespeech/t2s/exps/vits/preprocess.py index 2b1a40834..d6b226a20 100644 --- a/paddlespeech/t2s/exps/vits/preprocess.py +++ b/paddlespeech/t2s/exps/vits/preprocess.py @@ -166,7 +166,7 @@ def process_sentences(config, if record: results.append(record) - results.sort(key=itemgetter("feats_lengths")) + results.sort(key=itemgetter("feats_lengths"), reverse=True) with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer: for item in results: writer.write(item) diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index 07301db56..f6a31ced2 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -110,7 +110,7 @@ def train_sp(args, config): train_sampler = ErnieSATSampler( train_dataset, batch_size=config.batch_size, - shuffle=True, + shuffle=False, drop_last=True) dev_sampler = ErnieSATSampler( dev_dataset, From 2b01e4052559b5c0e1a7d47f4eb1e340a5a1bf1d Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 30 Jan 2023 13:33:38 +0800 Subject: [PATCH 12/24] =?UTF-8?q?[TTS]soft=20link=20for=20shell=20in=20exa?= =?UTF-8?q?mple,=20add=20skip=5Fcopy=5Fwave=20in=20norm=20stage=20of=20G?= =?UTF-8?q?=E2=80=A6=20(#2851)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit soft link for shell in example, add skip_copy_wave in norm stage of GANVocoders to save disk --- examples/aishell3/tts3/path.sh | 14 +--- examples/aishell3/vc0/path.sh | 14 +--- examples/aishell3/vc1/local/train.sh | 14 +--- examples/aishell3/vc1/path.sh | 14 +--- examples/aishell3/vc2/local/synthesize.sh | 21 +----- examples/aishell3/vc2/local/train.sh | 14 +--- examples/aishell3/vc2/path.sh | 14 +--- examples/aishell3/voc1/local/preprocess.sh | 10 ++- examples/aishell3/voc1/local/synthesize.sh | 15 +---- examples/aishell3/voc1/local/train.sh | 14 +--- examples/aishell3/voc1/path.sh | 14 +--- examples/aishell3/voc5/local/preprocess.sh | 56 +--------------- examples/aishell3/voc5/local/synthesize.sh | 15 +---- examples/aishell3/voc5/local/train.sh | 14 +--- examples/aishell3/voc5/path.sh | 14 +--- .../ernie_sat/local/synthesize.sh | 26 +------- .../aishell3_vctk/ernie_sat/local/train.sh | 13 +--- examples/aishell3_vctk/ernie_sat/path.sh | 14 +--- examples/csmsc/voc1/local/preprocess.sh | 10 ++- examples/csmsc/voc3/finetune.sh | 65 +------------------ examples/csmsc/voc3/local/preprocess.sh | 56 +--------------- examples/csmsc/voc3/local/train.sh | 14 +--- examples/csmsc/voc4/local/preprocess.sh | 
56 +--------------- examples/csmsc/voc4/local/train.sh | 14 +--- examples/csmsc/voc5/finetune.sh | 9 ++- examples/csmsc/voc5/local/preprocess.sh | 56 +--------------- examples/csmsc/voc5/local/train.sh | 14 +--- examples/csmsc/voc6/local/preprocess.sh | 10 ++- examples/csmsc/voc6/local/train.sh | 14 +--- examples/ljspeech/tts0/local/train.sh | 13 +--- examples/ljspeech/tts0/path.sh | 14 +--- examples/ljspeech/tts3/local/train.sh | 13 +--- examples/ljspeech/tts3/path.sh | 14 +--- examples/ljspeech/voc1/local/preprocess.sh | 10 ++- examples/ljspeech/voc1/local/synthesize.sh | 15 +---- examples/ljspeech/voc1/local/train.sh | 14 +--- examples/ljspeech/voc1/path.sh | 14 +--- examples/ljspeech/voc5/local/preprocess.sh | 56 +--------------- examples/ljspeech/voc5/local/synthesize.sh | 15 +---- examples/ljspeech/voc5/local/train.sh | 14 +--- examples/ljspeech/voc5/path.sh | 14 +--- examples/vctk/ernie_sat/local/train.sh | 13 +--- examples/vctk/ernie_sat/path.sh | 14 +--- examples/vctk/tts3/local/train.sh | 14 +--- examples/vctk/tts3/path.sh | 14 +--- examples/vctk/voc1/local/preprocess.sh | 10 ++- examples/vctk/voc1/local/synthesize.sh | 15 +---- examples/vctk/voc1/local/train.sh | 14 +--- examples/vctk/voc1/path.sh | 14 +--- examples/vctk/voc5/local/preprocess.sh | 56 +--------------- examples/vctk/voc5/local/synthesize.sh | 15 +---- examples/vctk/voc5/local/train.sh | 14 +--- examples/vctk/voc5/path.sh | 14 +--- examples/zh_en_tts/tts3/local/train.sh | 14 +--- examples/zh_en_tts/tts3/path.sh | 14 +--- 55 files changed, 90 insertions(+), 979 deletions(-) mode change 100755 => 120000 examples/aishell3/tts3/path.sh mode change 100755 => 120000 examples/aishell3/vc0/path.sh mode change 100755 => 120000 examples/aishell3/vc1/local/train.sh mode change 100755 => 120000 examples/aishell3/vc1/path.sh mode change 100755 => 120000 examples/aishell3/vc2/local/synthesize.sh mode change 100755 => 120000 examples/aishell3/vc2/local/train.sh mode change 100755 => 120000 examples/aishell3/vc2/path.sh mode change 100755 => 120000 examples/aishell3/voc1/local/synthesize.sh mode change 100755 => 120000 examples/aishell3/voc1/local/train.sh mode change 100755 => 120000 examples/aishell3/voc1/path.sh mode change 100755 => 120000 examples/aishell3/voc5/local/preprocess.sh mode change 100755 => 120000 examples/aishell3/voc5/local/synthesize.sh mode change 100755 => 120000 examples/aishell3/voc5/local/train.sh mode change 100755 => 120000 examples/aishell3/voc5/path.sh mode change 100755 => 120000 examples/aishell3_vctk/ernie_sat/local/synthesize.sh mode change 100755 => 120000 examples/aishell3_vctk/ernie_sat/local/train.sh mode change 100755 => 120000 examples/aishell3_vctk/ernie_sat/path.sh mode change 100755 => 120000 examples/csmsc/voc3/finetune.sh mode change 100755 => 120000 examples/csmsc/voc3/local/preprocess.sh mode change 100755 => 120000 examples/csmsc/voc3/local/train.sh mode change 100755 => 120000 examples/csmsc/voc4/local/preprocess.sh mode change 100755 => 120000 examples/csmsc/voc4/local/train.sh mode change 100755 => 120000 examples/csmsc/voc5/local/preprocess.sh mode change 100755 => 120000 examples/csmsc/voc5/local/train.sh mode change 100755 => 120000 examples/csmsc/voc6/local/train.sh mode change 100755 => 120000 examples/ljspeech/tts0/local/train.sh mode change 100755 => 120000 examples/ljspeech/tts0/path.sh mode change 100755 => 120000 examples/ljspeech/tts3/local/train.sh mode change 100755 => 120000 examples/ljspeech/tts3/path.sh mode change 100755 => 120000 
examples/ljspeech/voc1/local/synthesize.sh mode change 100755 => 120000 examples/ljspeech/voc1/local/train.sh mode change 100755 => 120000 examples/ljspeech/voc1/path.sh mode change 100755 => 120000 examples/ljspeech/voc5/local/preprocess.sh mode change 100755 => 120000 examples/ljspeech/voc5/local/synthesize.sh mode change 100755 => 120000 examples/ljspeech/voc5/local/train.sh mode change 100755 => 120000 examples/ljspeech/voc5/path.sh mode change 100755 => 120000 examples/vctk/ernie_sat/local/train.sh mode change 100755 => 120000 examples/vctk/ernie_sat/path.sh mode change 100755 => 120000 examples/vctk/tts3/local/train.sh mode change 100755 => 120000 examples/vctk/tts3/path.sh mode change 100755 => 120000 examples/vctk/voc1/local/synthesize.sh mode change 100755 => 120000 examples/vctk/voc1/local/train.sh mode change 100755 => 120000 examples/vctk/voc1/path.sh mode change 100755 => 120000 examples/vctk/voc5/local/preprocess.sh mode change 100755 => 120000 examples/vctk/voc5/local/synthesize.sh mode change 100755 => 120000 examples/vctk/voc5/local/train.sh mode change 100755 => 120000 examples/vctk/voc5/path.sh mode change 100755 => 120000 examples/zh_en_tts/tts3/local/train.sh mode change 100755 => 120000 examples/zh_en_tts/tts3/path.sh diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh deleted file mode 100755 index a37cd21e3..000000000 --- a/examples/aishell3/vc0/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=tacotron2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh new file mode 120000 index 000000000..9e1fdbd16 --- /dev/null +++ b/examples/aishell3/vc0/path.sh @@ -0,0 +1 @@ +../../csmsc/tts0/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc1/local/train.sh b/examples/aishell3/vc1/local/train.sh deleted file mode 100755 index c775fcadc..000000000 --- a/examples/aishell3/vc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --voice-cloning=True \ No newline at end of file diff --git a/examples/aishell3/vc1/local/train.sh 
b/examples/aishell3/vc1/local/train.sh new file mode 120000 index 000000000..115a0b8dc --- /dev/null +++ b/examples/aishell3/vc1/local/train.sh @@ -0,0 +1 @@ +../../vc0/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/vc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/vc1/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/local/synthesize.sh b/examples/aishell3/vc2/local/synthesize.sh deleted file mode 100755 index 8fd8977d3..000000000 --- a/examples/aishell3/vc2/local/synthesize.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -python3 ${BIN_DIR}/../synthesize.py \ - --am=fastspeech2_aishell3 \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=pwgan_aishell3 \ - --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \ - --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ - --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt \ - --speaker_dict=dump/speaker_id_map.txt \ - --voice-cloning=True diff --git a/examples/aishell3/vc2/local/synthesize.sh b/examples/aishell3/vc2/local/synthesize.sh new file mode 120000 index 000000000..ca8df6b04 --- /dev/null +++ b/examples/aishell3/vc2/local/synthesize.sh @@ -0,0 +1 @@ +../../vc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/local/train.sh b/examples/aishell3/vc2/local/train.sh deleted file mode 100755 index c775fcadc..000000000 --- a/examples/aishell3/vc2/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --voice-cloning=True \ No newline at end of file diff --git a/examples/aishell3/vc2/local/train.sh b/examples/aishell3/vc2/local/train.sh new file mode 120000 index 000000000..115a0b8dc --- /dev/null +++ b/examples/aishell3/vc2/local/train.sh @@ -0,0 +1 @@ +../../vc0/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/path.sh b/examples/aishell3/vc2/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/vc2/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - 
-MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc2/path.sh b/examples/aishell3/vc2/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/vc2/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/local/preprocess.sh b/examples/aishell3/voc1/local/preprocess.sh index 44cc3dbe4..71eab68ad 100755 --- a/examples/aishell3/voc1/local/preprocess.sh +++ b/examples/aishell3/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/aishell3/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/aishell3/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/aishell3/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/aishell3/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/aishell3/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export 
BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/aishell3/voc1/path.sh @@ -0,0 +1 @@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh deleted file mode 100755 index 44cc3dbe4..000000000 --- a/examples/aishell3/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./aishell3_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/data_aishell3/ \ - --dataset=aishell3 \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/aishell3/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/aishell3/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/aishell3/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/aishell3/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - 
-config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/aishell3/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/aishell3/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/aishell3/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh deleted file mode 100755 index 8b4178f13..000000000 --- a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -stage=0 -stop_stage=0 - -# hifigan -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - FLAGS_allocator_strategy=naive_best_fit \ - FLAGS_fraction_of_gpu_memory_to_use=0.01 \ - python3 ${BIN_DIR}/synthesize.py \ - --erniesat_config=${config_path} \ - --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --erniesat_stat=dump/train/speech_stats.npy \ - --voc=hifigan_aishell3 \ - --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \ - --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \ - --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt -fi diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh new file mode 120000 index 000000000..5703dcb2c --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/local/train.sh b/examples/aishell3_vctk/ernie_sat/local/train.sh deleted file mode 100755 index 526aac435..000000000 --- a/examples/aishell3_vctk/ernie_sat/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=8 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/local/train.sh b/examples/aishell3_vctk/ernie_sat/local/train.sh new file mode 120000 index 
000000000..9f1d2346d --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/train.sh \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/path.sh b/examples/aishell3_vctk/ernie_sat/path.sh deleted file mode 100755 index 4ecab0251..000000000 --- a/examples/aishell3_vctk/ernie_sat/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=ernie_sat -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/path.sh b/examples/aishell3_vctk/ernie_sat/path.sh new file mode 120000 index 000000000..5ec397590 --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/path.sh @@ -0,0 +1 @@ +../../aishell3/ernie_sat/path.sh \ No newline at end of file diff --git a/examples/csmsc/voc1/local/preprocess.sh b/examples/csmsc/voc1/local/preprocess.sh index 61d6d62be..62d0717b9 100755 --- a/examples/csmsc/voc1/local/preprocess.sh +++ b/examples/csmsc/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh deleted file mode 100755 index 6719bd0be..000000000 --- a/examples/csmsc/voc3/finetune.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -source path.sh - -gpus=0 -stage=0 -stop_stage=100 - -source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \ - --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ - --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ - --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ - --dur-file=durations.txt \ - --output-dir=dump_finetune \ - --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \ - --dataset=baker \ - --rootdir=~/datasets/BZNSYP/ -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - python3 ${MAIN_ROOT}/utils/link_wav.py \ - --old-dump-dir=dump \ - --dump-dir=dump_finetune -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - cp dump/train/feats_stats.npy dump_finetune/train/ -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/train/raw/metadata.jsonl \ - --dumpdir=dump_finetune/train/norm \ - --stats=dump_finetune/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/dev/raw/metadata.jsonl \ - --dumpdir=dump_finetune/dev/norm \ - --stats=dump_finetune/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/test/raw/metadata.jsonl \ - --dumpdir=dump_finetune/test/norm \ - --stats=dump_finetune/train/feats_stats.npy -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - CUDA_VISIBLE_DEVICES=${gpus} \ - FLAGS_cudnn_exhaustive_search=true \ - FLAGS_conv_workspace_size_limit=4000 \ - python ${BIN_DIR}/train.py \ - --train-metadata=dump_finetune/train/norm/metadata.jsonl \ - --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \ - --config=conf/finetune.yaml \ - --output-dir=exp/finetune \ - --ngpu=1 -fi \ No newline at end of file diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh new file mode 120000 index 000000000..b6fa868e2 --- /dev/null +++ b/examples/csmsc/voc3/finetune.sh @@ -0,0 +1 @@ +../voc5/finetune.sh \ No newline at end of file diff --git a/examples/csmsc/voc3/local/preprocess.sh b/examples/csmsc/voc3/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc3/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc3/local/preprocess.sh b/examples/csmsc/voc3/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc3/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc3/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc4/local/preprocess.sh b/examples/csmsc/voc4/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc4/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc4/local/preprocess.sh b/examples/csmsc/voc4/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc4/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc4/local/train.sh b/examples/csmsc/voc4/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc4/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc4/local/train.sh b/examples/csmsc/voc4/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc4/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc5/finetune.sh b/examples/csmsc/voc5/finetune.sh index 6719bd0be..eb8325aeb 100755 --- a/examples/csmsc/voc5/finetune.sh +++ b/examples/csmsc/voc5/finetune.sh @@ -39,16 +39,19 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/train/raw/metadata.jsonl \ --dumpdir=dump_finetune/train/norm \ - --stats=dump_finetune/train/feats_stats.npy + --stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/dev/raw/metadata.jsonl \ --dumpdir=dump_finetune/dev/norm \ - --stats=dump_finetune/train/feats_stats.npy + --stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/test/raw/metadata.jsonl \ --dumpdir=dump_finetune/test/norm \ - --stats=dump_finetune/train/feats_stats.npy + --stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/csmsc/voc5/local/preprocess.sh b/examples/csmsc/voc5/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." 
- python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc5/local/preprocess.sh b/examples/csmsc/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc5/local/train.sh b/examples/csmsc/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc5/local/train.sh b/examples/csmsc/voc5/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc5/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc6/local/preprocess.sh b/examples/csmsc/voc6/local/preprocess.sh index 2dcc39ac7..509824b8e 100755 --- a/examples/csmsc/voc6/local/preprocess.sh +++ b/examples/csmsc/voc6/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc6/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - 
--config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc6/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh deleted file mode 100755 index f90db9150..000000000 --- a/examples/ljspeech/tts0/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh new file mode 120000 index 000000000..7f54e9239 --- /dev/null +++ b/examples/ljspeech/tts0/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/tts0/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh deleted file mode 100755 index a37cd21e3..000000000 --- a/examples/ljspeech/tts0/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=tacotron2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh new file mode 120000 index 000000000..9e1fdbd16 --- /dev/null +++ b/examples/ljspeech/tts0/path.sh @@ -0,0 +1 @@ +../../csmsc/tts0/path.sh \ No newline at end of file diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh deleted file mode 100755 index d1302f99f..000000000 --- a/examples/ljspeech/tts3/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh new file mode 120000 index 000000000..d7b05058e --- /dev/null +++ b/examples/ljspeech/tts3/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/tts3/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/ljspeech/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/ljspeech/tts3/path.sh @@ -0,0 +1 
@@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/local/preprocess.sh b/examples/ljspeech/voc1/local/preprocess.sh index d1af60dad..bfbf75b7d 100755 --- a/examples/ljspeech/voc1/local/preprocess.sh +++ b/examples/ljspeech/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/ljspeech/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/ljspeech/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/ljspeech/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/ljspeech/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/path.sh b/examples/ljspeech/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/ljspeech/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/ljspeech/voc1/path.sh b/examples/ljspeech/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/ljspeech/voc1/path.sh @@ -0,0 +1 
@@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/preprocess.sh b/examples/ljspeech/voc5/local/preprocess.sh deleted file mode 100755 index d1af60dad..000000000 --- a/examples/ljspeech/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./ljspeech_alignment \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/LJSpeech-1.1/ \ - --dataset=ljspeech \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/ljspeech/voc5/local/preprocess.sh b/examples/ljspeech/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/ljspeech/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/synthesize.sh b/examples/ljspeech/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/ljspeech/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git a/examples/ljspeech/voc5/local/synthesize.sh b/examples/ljspeech/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/ljspeech/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/train.sh b/examples/ljspeech/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/ljspeech/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - 
--output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/ljspeech/voc5/local/train.sh b/examples/ljspeech/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/ljspeech/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/path.sh b/examples/ljspeech/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/ljspeech/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/ljspeech/voc5/path.sh b/examples/ljspeech/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/ljspeech/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/vctk/ernie_sat/local/train.sh b/examples/vctk/ernie_sat/local/train.sh deleted file mode 100755 index 526aac435..000000000 --- a/examples/vctk/ernie_sat/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=8 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/vctk/ernie_sat/local/train.sh b/examples/vctk/ernie_sat/local/train.sh new file mode 120000 index 000000000..9f1d2346d --- /dev/null +++ b/examples/vctk/ernie_sat/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/train.sh \ No newline at end of file diff --git a/examples/vctk/ernie_sat/path.sh b/examples/vctk/ernie_sat/path.sh deleted file mode 100755 index 4ecab0251..000000000 --- a/examples/vctk/ernie_sat/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=ernie_sat -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} \ No newline at end of file diff --git a/examples/vctk/ernie_sat/path.sh b/examples/vctk/ernie_sat/path.sh new file mode 120000 index 000000000..5ec397590 --- /dev/null +++ b/examples/vctk/ernie_sat/path.sh @@ -0,0 +1 @@ +../../aishell3/ernie_sat/path.sh \ No newline at end of file diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh deleted file mode 100755 index 3a5076505..000000000 --- a/examples/vctk/tts3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt \ - --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh new file mode 120000 index 000000000..78885a300 
--- /dev/null +++ b/examples/vctk/tts3/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/tts3/local/train.sh \ No newline at end of file diff --git a/examples/vctk/tts3/path.sh b/examples/vctk/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/vctk/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/vctk/tts3/path.sh b/examples/vctk/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/vctk/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/vctk/voc1/local/preprocess.sh b/examples/vctk/voc1/local/preprocess.sh index 88a478cd5..6b7e5288a 100755 --- a/examples/vctk/voc1/local/preprocess.sh +++ b/examples/vctk/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/vctk/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/vctk/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/vctk/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/vctk/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of 
file diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/vctk/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/vctk/voc1/path.sh @@ -0,0 +1 @@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh deleted file mode 100755 index 88a478cd5..000000000 --- a/examples/vctk/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./vctk_alignment \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/VCTK-Corpus-0.92/ \ - --dataset=vctk \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/vctk/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/vctk/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/vctk/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/vctk/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/vctk/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/vctk/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/vctk/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/zh_en_tts/tts3/local/train.sh b/examples/zh_en_tts/tts3/local/train.sh deleted file mode 100755 index 1da72f117..000000000 --- a/examples/zh_en_tts/tts3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 
${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/zh_en_tts/tts3/local/train.sh b/examples/zh_en_tts/tts3/local/train.sh new file mode 120000 index 000000000..78885a300 --- /dev/null +++ b/examples/zh_en_tts/tts3/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/tts3/local/train.sh \ No newline at end of file diff --git a/examples/zh_en_tts/tts3/path.sh b/examples/zh_en_tts/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/zh_en_tts/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/zh_en_tts/tts3/path.sh b/examples/zh_en_tts/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/zh_en_tts/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file From 31c2c226cacf88281332e61bd03bb863b1c1e9cf Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Mon, 30 Jan 2023 19:11:02 +0800 Subject: [PATCH 13/24] clean fluid elementwise_max and square api. (#2852) --- paddlespeech/s2t/training/gradclip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/s2t/training/gradclip.py b/paddlespeech/s2t/training/gradclip.py index 26ac501e2..b2c0500d3 100644 --- a/paddlespeech/s2t/training/gradclip.py +++ b/paddlespeech/s2t/training/gradclip.py @@ -43,7 +43,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): if g.type == core.VarDesc.VarType.SELECTED_ROWS: merge_grad = layers.merge_selected_rows(g) merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - square = layers.square(merge_grad) + square = paddle.square(merge_grad) sum_square = layers.reduce_sum(square) sum_square_list.append(sum_square) @@ -66,7 +66,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) clip_var = layers.elementwise_div( x=max_global_norm, - y=layers.elementwise_max(x=global_norm_var, y=max_global_norm)) + y=paddle.maximum(x=global_norm_var, y=max_global_norm)) for i, (p, g) in enumerate(params_grads): if g is None: continue From b5764e9f74665babfdd922189560ba269c072635 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 30 Jan 2023 19:17:37 +0800 Subject: [PATCH 14/24] [Install]rm protobuf in setup.py (#2853) * rm protobuf in setup.py && rm audio's dependencies in setup.py --- audio/setup.py | 2 +- docs/requirements.txt | 4 +--- setup.py | 6 +----- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/audio/setup.py b/audio/setup.py index 82e9a55a5..d36b2c440 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -43,7 +43,7 @@ base = [ "scipy>=1.0.0", "soundfile~=0.10", "colorlog", - "pathos == 0.2.8", + "pathos==0.2.8", "pybind11", "parameterized", "tqdm", diff --git a/docs/requirements.txt b/docs/requirements.txt index c6228d917..5422c26f9 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,4 @@ braceexpand -colorlog editdistance
fastapi g2p_en @@ -16,7 +15,7 @@ matplotlib myst-parser nara_wpe numpydoc -onnxruntime==1.10.0 +onnxruntime>=1.11.0 opencc paddlenlp # use paddlepaddle == 2.3.* according to: https://github.com/PaddlePaddle/Paddle/issues/48243 @@ -24,7 +23,6 @@ paddlepaddle>=2.2.2,<2.4.0 paddlespeech_ctcdecoders paddlespeech_feat pandas -pathos==0.2.8 pattern_singleton Pillow>=9.0.0 ppdiffusers>=0.9.0 diff --git a/setup.py b/setup.py index 212d3b109..be6cf63a9 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,6 @@ base = [ "paddlespeech_feat", "Pillow>=9.0.0", "praatio==5.0.0", - "protobuf>=3.1.0, <=3.20.0", "pypinyin<=0.44.0", "pypinyin-dict", "python-dateutil", @@ -72,12 +71,9 @@ base = [ "yacs~=0.1.8", "prettytable", "zhon", - "colorlog", - "pathos==0.2.8", "braceexpand", "pyyaml", - "pybind11", - "paddleslim==2.3.4", + "paddleslim>=2.3.4", "paddleaudio>=1.0.2", ] From 64aeb6dccc73a262bab9f9ed2a1b8c7b15a30582 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Tue, 31 Jan 2023 19:52:45 +0800 Subject: [PATCH 15/24] remove some fluid api (elementwise_div elementwise_mul sqrt reduce_sum). (#2859) --- paddlespeech/s2t/training/gradclip.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/paddlespeech/s2t/training/gradclip.py b/paddlespeech/s2t/training/gradclip.py index b2c0500d3..be6fcf589 100644 --- a/paddlespeech/s2t/training/gradclip.py +++ b/paddlespeech/s2t/training/gradclip.py @@ -44,7 +44,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): merge_grad = layers.merge_selected_rows(g) merge_grad = layers.get_tensor_from_selected_rows(merge_grad) square = paddle.square(merge_grad) - sum_square = layers.reduce_sum(square) + sum_square = paddle.sum(square) sum_square_list.append(sum_square) # debug log, not dump all since slow down train process @@ -57,14 +57,15 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): return params_grads global_norm_var = layers.concat(sum_square_list) - global_norm_var = layers.reduce_sum(global_norm_var) - global_norm_var = layers.sqrt(global_norm_var) + global_norm_var = paddle.sum(global_norm_var) + global_norm_var = paddle.sqrt(global_norm_var) + # debug log logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!") max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) - clip_var = layers.elementwise_div( + clip_var = paddle.divide( x=max_global_norm, y=paddle.maximum(x=global_norm_var, y=max_global_norm)) for i, (p, g) in enumerate(params_grads): @@ -73,7 +74,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue - new_grad = layers.elementwise_mul(x=g, y=clip_var) + new_grad = paddle.multiply(x=g, y=clip_var) params_and_grads.append((p, new_grad)) # debug log, not dump all since slow down train process From 2f526c093cac230493f1ae399fa7182f73d588d3 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 1 Feb 2023 14:06:46 +0800 Subject: [PATCH 16/24] fix data for slim (#2862) --- examples/csmsc/tts3/local/PTQ_static.sh | 2 +- examples/csmsc/voc1/local/PTQ_static.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/csmsc/tts3/local/PTQ_static.sh b/examples/csmsc/tts3/local/PTQ_static.sh index a70a77b58..c6dce53cb 100755 --- a/examples/csmsc/tts3/local/PTQ_static.sh +++ b/examples/csmsc/tts3/local/PTQ_static.sh @@ -5,4 +5,4 @@ python3 ${BIN_DIR}/../PTQ_static.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --inference_dir 
${train_output_path}/inference \ --model_name ${model_name} \ - --onnx_forma=True \ No newline at end of file + --onnx_format=True \ No newline at end of file diff --git a/examples/csmsc/voc1/local/PTQ_static.sh b/examples/csmsc/voc1/local/PTQ_static.sh index 2e5166141..c85ebd109 100755 --- a/examples/csmsc/voc1/local/PTQ_static.sh +++ b/examples/csmsc/voc1/local/PTQ_static.sh @@ -2,7 +2,7 @@ train_output_path=$1 model_name=$2 python3 ${BIN_DIR}/../../PTQ_static.py \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ + --dev-metadata=dump/dev/raw/metadata.jsonl \ --inference_dir ${train_output_path}/inference \ --model_name ${model_name} \ --onnx_format=True \ No newline at end of file From ac3ed3c5a8a4e81ad662b8c41efa562f415dad7b Mon Sep 17 00:00:00 2001 From: QuanZ9 <31169290+QuanZ9@users.noreply.github.com> Date: Wed, 1 Feb 2023 15:55:52 +0800 Subject: [PATCH 17/24] Update zh_frontend.py (#2863) --- paddlespeech/t2s/frontend/zh_frontend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index ddd8cf5c7..efb673e36 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -138,7 +138,7 @@ class Frontend(): "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿", "流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿", "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿", - "狗儿" + "狗儿", "少儿" } self.vocab_phones = {} From 896da6dcd152b6241f606343dfa5ee6ec4932df5 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 1 Feb 2023 18:25:00 +0800 Subject: [PATCH 18/24] remove utils and third_party in paddlespeech's site-packages (#2867) --- audio/setup.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/audio/setup.py b/audio/setup.py index d36b2c440..6e358346c 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -273,7 +273,7 @@ def main(): }, # Package info - packages=find_packages(include=('paddleaudio*')), + packages=find_packages(include=['paddleaudio*']), package_data=lib_package_data, ext_modules=setup_helpers.get_ext_modules(), zip_safe=True, diff --git a/setup.py b/setup.py index be6cf63a9..2c97ce783 100644 --- a/setup.py +++ b/setup.py @@ -300,7 +300,7 @@ setup_info = dict( }, # Package info - packages=find_packages(include=('paddlespeech*')), + packages=find_packages(include=['paddlespeech*'], exclude=['utils', 'third_party']), zip_safe=True, classifiers=[ 'Development Status :: 5 - Production/Stable', From a55fd2e55685236c34330e0ba01e98878fc5b8cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=89=BE=E6=A2=A6?= Date: Thu, 2 Feb 2023 13:03:41 +0800 Subject: [PATCH 19/24] [TTS]Fix diffusion wavenet denoiser final conv init param (#2868) * add diffusion module for training diffsinger * add wavenet denoiser final conv initializer --- paddlespeech/t2s/modules/diffusion.py | 34 +++++++++++---------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/paddlespeech/t2s/modules/diffusion.py b/paddlespeech/t2s/modules/diffusion.py index 52fe84ceb..eb67ffb0d 100644 --- a/paddlespeech/t2s/modules/diffusion.py +++ b/paddlespeech/t2s/modules/diffusion.py @@ -40,7 +40,7 @@ class WaveNetDenoiser(nn.Layer): layers (int, optional): Number of residual blocks inside, by default 20 stacks (int, optional): - The number of groups to split the residual blocks into, by default 4 + The number of groups to split the residual blocks into, by default 5 Within each group, the dilation of the residual 
block grows exponentially. residual_channels (int, optional): Residual channel of the residual blocks, by default 256 @@ -64,7 +64,7 @@ class WaveNetDenoiser(nn.Layer): out_channels: int=80, kernel_size: int=3, layers: int=20, - stacks: int=4, + stacks: int=5, residual_channels: int=256, gate_channels: int=512, skip_channels: int=256, @@ -72,7 +72,7 @@ class WaveNetDenoiser(nn.Layer): dropout: float=0., bias: bool=True, use_weight_norm: bool=False, - init_type: str="kaiming_uniform", ): + init_type: str="kaiming_normal", ): super().__init__() # initialize parameters @@ -118,18 +118,15 @@ class WaveNetDenoiser(nn.Layer): bias=bias) self.conv_layers.append(conv) + final_conv = nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True) + nn.initializer.Constant(0.0)(final_conv.weight) self.last_conv_layers = nn.Sequential(nn.ReLU(), nn.Conv1D( skip_channels, skip_channels, 1, bias_attr=True), - nn.ReLU(), - nn.Conv1D( - skip_channels, - out_channels, - 1, - bias_attr=True)) + nn.ReLU(), final_conv) if use_weight_norm: self.apply_weight_norm() @@ -200,10 +197,6 @@ class GaussianDiffusion(nn.Layer): Args: denoiser (Layer, optional): The model used for denoising noises. - In fact, the denoiser model performs the operation - of producing a output with more noises from the noisy input. - Then we use the diffusion algorithm to calculate - the input with the output to get the denoised result. num_train_timesteps (int, optional): The number of timesteps between the noise and the real during training, by default 1000. beta_start (float, optional): @@ -233,7 +226,8 @@ class GaussianDiffusion(nn.Layer): >>> def callback(index, timestep, num_timesteps, sample): >>> nonlocal pbar >>> if pbar is None: - >>> pbar = tqdm(total=num_timesteps-index) + >>> pbar = tqdm(total=num_timesteps) + >>> pbar.update(index) >>> pbar.update() >>> >>> return callback @@ -247,7 +241,7 @@ class GaussianDiffusion(nn.Layer): >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> with paddle.no_grad(): >>> sample = diffusion.inference( - >>> paddle.randn(x.shape), c, x, + >>> paddle.randn(x.shape), c, ref_x=x_in, >>> num_inference_steps=infer_steps, >>> scheduler_type=scheduler_type, >>> callback=create_progress_callback()) @@ -262,7 +256,7 @@ class GaussianDiffusion(nn.Layer): >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> with paddle.no_grad(): >>> sample = diffusion.inference( - >>> paddle.randn(x.shape), c, x_in, + >>> paddle.randn(x.shape), c, ref_x=x_in, >>> num_inference_steps=infer_steps, >>> scheduler_type=scheduler_type, >>> callback=create_progress_callback()) @@ -277,11 +271,11 @@ class GaussianDiffusion(nn.Layer): >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> with paddle.no_grad(): >>> sample = diffusion.inference( - >>> paddle.randn(x.shape), c, None, + >>> paddle.randn(x.shape), c, ref_x=x_in, >>> num_inference_steps=infer_steps, >>> scheduler_type=scheduler_type, >>> callback=create_progress_callback()) - 100%|█████| 25/25 [00:01<00:00, 19.75it/s] + 100%|█████| 34/34 [00:01<00:00, 19.75it/s] >>> >>> # ds=1000, K_step=100, scheduler=pndm, infer_step=50, from aux fs2 mel output >>> ds = 1000 @@ -292,11 +286,11 @@ class GaussianDiffusion(nn.Layer): >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> with paddle.no_grad(): >>> sample = diffusion.inference( - >>> paddle.randn(x.shape), c, x, + >>> paddle.randn(x.shape), c, 
ref_x=x_in, >>> num_inference_steps=infer_steps, >>> scheduler_type=scheduler_type, >>> callback=create_progress_callback()) - 100%|█████| 5/5 [00:00<00:00, 23.80it/s] + 100%|█████| 14/14 [00:00<00:00, 23.80it/s] """ From a283f8a57e8bbc411bd36f2e0d8df3e0780a1c0e Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 2 Feb 2023 13:04:20 +0800 Subject: [PATCH 20/24] [TTS]fix open encoding (#2865) --- paddlespeech/cli/tts/infer.py | 6 +++--- paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py | 2 +- paddlespeech/t2s/exps/ernie_sat/train.py | 2 +- paddlespeech/t2s/exps/fastspeech2/train.py | 4 ++-- paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py | 6 +++--- paddlespeech/t2s/exps/speedyspeech/train.py | 6 +++--- paddlespeech/t2s/exps/syn_utils.py | 8 ++++---- paddlespeech/t2s/exps/tacotron2/train.py | 2 +- paddlespeech/t2s/exps/transformer_tts/train.py | 2 +- paddlespeech/t2s/exps/vits/train.py | 4 ++-- paddlespeech/t2s/frontend/phonectic.py | 2 +- paddlespeech/t2s/frontend/zh_frontend.py | 4 ++-- 12 files changed, 24 insertions(+), 24 deletions(-) diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index 707518c05..5515ade26 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -292,19 +292,19 @@ class TTSExecutor(BaseExecutor): with open(self.voc_config) as f: self.voc_config = CfgNode(yaml.safe_load(f)) - with open(self.phones_dict, "r") as f: + with open(self.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) tone_size = None if self.tones_dict: - with open(self.tones_dict, "r") as f: + with open(self.tones_dict, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) spk_num = None if self.speaker_dict: - with open(self.speaker_dict, 'rt') as f: + with open(self.speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py index e450aa1a0..c43dafb3c 100644 --- a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py @@ -437,7 +437,7 @@ if __name__ == '__main__': vocab_phones = {} - with open(args.phones_dict, 'rt') as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: vocab_phones[phn] = int(id) diff --git a/paddlespeech/t2s/exps/ernie_sat/train.py b/paddlespeech/t2s/exps/ernie_sat/train.py index 75a666bb1..c98d691be 100644 --- a/paddlespeech/t2s/exps/ernie_sat/train.py +++ b/paddlespeech/t2s/exps/ernie_sat/train.py @@ -109,7 +109,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py index d31e62a82..97626db0b 100644 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -67,7 +67,7 @@ def train_sp(args, config): if args.speaker_dict is not None: print("multiple speaker fastspeech2!") collate_fn = fastspeech2_multi_spk_batch_fn - with open(args.speaker_dict, 'rt') as f: + with open(args.speaker_dict, 
'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) fields += ["spk_id"] @@ -123,7 +123,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py index 644ec250d..d05dfafcf 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py @@ -39,18 +39,18 @@ def evaluate(args, speedyspeech_config, pwg_config): # construct dataset for evaluation sentences = [] - with open(args.text, 'rt') as f: + with open(args.text, 'rt', encoding='utf-8') as f: for line in f: items = line.strip().split() utt_id = items[0] sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) - with open(args.tones_dict, "r") as f: + with open(args.tones_dict, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) print("tone_size:", tone_size) diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index 7b422e64f..c90090daa 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -70,7 +70,7 @@ def train_sp(args, config): if args.speaker_dict is not None: print("multiple speaker speedyspeech!") collate_fn = speedyspeech_multi_spk_batch_fn - with open(args.speaker_dict, 'rt') as f: + with open(args.speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) fields += ["spk_id"] @@ -133,11 +133,11 @@ def train_sp(args, config): collate_fn=collate_fn, num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) - with open(args.tones_dict, "r") as f: + with open(args.tones_dict, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) print("tone_size:", tone_size) diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 6b693440c..491edda30 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -106,7 +106,7 @@ def get_chunks(data, block_size: int, pad_size: int): def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): # construct dataset for evaluation sentences = [] - with open(text_file, 'rt') as f: + with open(text_file, 'rt', encoding='utf-8') as f: for line in f: if line.strip() != "": items = re.split(r"\s+", line.strip(), 1) @@ -325,17 +325,17 @@ def get_am_inference(am: str='fastspeech2_csmsc', tones_dict: Optional[os.PathLike]=None, speaker_dict: Optional[os.PathLike]=None, return_am: bool=False): - with open(phones_dict, "r") as f: + with open(phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in 
f.readlines()] vocab_size = len(phn_id) tone_size = None if tones_dict is not None: - with open(tones_dict, "r") as f: + with open(tones_dict, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) spk_num = None if speaker_dict is not None: - with open(speaker_dict, 'rt') as f: + with open(speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) odim = am_config.n_mels diff --git a/paddlespeech/t2s/exps/tacotron2/train.py b/paddlespeech/t2s/exps/tacotron2/train.py index 69ff80e46..db88009a8 100644 --- a/paddlespeech/t2s/exps/tacotron2/train.py +++ b/paddlespeech/t2s/exps/tacotron2/train.py @@ -119,7 +119,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py index da48b6b99..d49baad99 100644 --- a/paddlespeech/t2s/exps/transformer_tts/train.py +++ b/paddlespeech/t2s/exps/transformer_tts/train.py @@ -114,7 +114,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index f6a31ced2..0e74bf631 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -78,7 +78,7 @@ def train_sp(args, config): if args.speaker_dict is not None: print("multiple speaker vits!") collate_fn = vits_multi_spk_batch_fn - with open(args.speaker_dict, 'rt') as f: + with open(args.speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) fields += ["spk_id"] @@ -132,7 +132,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py index 261db80a8..af86d9b80 100644 --- a/paddlespeech/t2s/frontend/phonectic.py +++ b/paddlespeech/t2s/frontend/phonectic.py @@ -58,7 +58,7 @@ class English(Phonetics): self.punc = ":,;。?!“”‘’':,;.?!" 
self.text_normalizer = TextNormalizer() if phone_vocab_path: - with open(phone_vocab_path, 'rt') as f: + with open(phone_vocab_path, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: self.vocab_phones[phn] = int(id) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index efb673e36..35b97a93a 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -144,12 +144,12 @@ class Frontend(): self.vocab_phones = {} self.vocab_tones = {} if phone_vocab_path: - with open(phone_vocab_path, 'rt') as f: + with open(phone_vocab_path, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: self.vocab_phones[phn] = int(id) if tone_vocab_path: - with open(tone_vocab_path, 'rt') as f: + with open(tone_vocab_path, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] for tone, id in tone_id: self.vocab_tones[tone] = int(id) From c764710aa12a2f0db23475b15e1f6cafd5f05e57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=AB=A0=E5=AE=8F=E5=BD=AC?= <57510731+hopingZ@users.noreply.github.com> Date: Thu, 2 Feb 2023 13:05:35 +0800 Subject: [PATCH 21/24] [TTS]Avoid using variable "attn_loss" before assignment (#2860) * Avoid using variable "attn_loss" before assignment * Update tacotron2_updater.py --------- Co-authored-by: TianYuan --- .../t2s/models/tacotron2/tacotron2_updater.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py index 09e6827d0..1db9248ae 100644 --- a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py +++ b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py @@ -113,16 +113,18 @@ class Tacotron2Updater(StandardUpdater): loss.backward() optimizer.step() + if self.use_guided_attn_loss: + report("train/attn_loss", float(attn_loss)) + losses_dict["attn_loss"] = float(attn_loss) + report("train/l1_loss", float(l1_loss)) report("train/mse_loss", float(mse_loss)) report("train/bce_loss", float(bce_loss)) - report("train/attn_loss", float(attn_loss)) report("train/loss", float(loss)) losses_dict["l1_loss"] = float(l1_loss) losses_dict["mse_loss"] = float(mse_loss) losses_dict["bce_loss"] = float(bce_loss) - losses_dict["attn_loss"] = float(attn_loss) losses_dict["loss"] = float(loss) self.msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_dict.items()) @@ -202,17 +204,19 @@ class Tacotron2Evaluator(StandardEvaluator): attn_loss = self.attn_loss( att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in) loss = loss + attn_loss + + if self.use_guided_attn_loss: + report("eval/attn_loss", float(attn_loss)) + losses_dict["attn_loss"] = float(attn_loss) report("eval/l1_loss", float(l1_loss)) report("eval/mse_loss", float(mse_loss)) report("eval/bce_loss", float(bce_loss)) - report("eval/attn_loss", float(attn_loss)) report("eval/loss", float(loss)) losses_dict["l1_loss"] = float(l1_loss) losses_dict["mse_loss"] = float(mse_loss) losses_dict["bce_loss"] = float(bce_loss) - losses_dict["attn_loss"] = float(attn_loss) losses_dict["loss"] = float(loss) self.msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_dict.items()) From 6b00ad6064a390525bd992dc747e1e5681b49db4 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 3 Feb 2023 09:57:51 +0800 Subject: [PATCH 22/24] [Install]clean dependencies (#2871) * 
clean dependencies * update paddleaudio's version * rm dependency in librosa and paddlenlp * rm dependency in paddlepaddle * rm dependency in speech_web --- audio/setup.py | 7 +------ .../speech_web/speech_server/requirements.txt | 4 +--- docs/requirements.txt | 15 +++------------ setup.py | 18 +++++------------- 4 files changed, 10 insertions(+), 34 deletions(-) diff --git a/audio/setup.py b/audio/setup.py index 6e358346c..823e5dfad 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -40,14 +40,9 @@ COMMITID = 'none' base = [ "kaldiio", "librosa==0.8.1", - "scipy>=1.0.0", - "soundfile~=0.10", - "colorlog", - "pathos==0.2.8", + "pathos", "pybind11", "parameterized", - "tqdm", - "scikit-learn" ] requirements = { diff --git a/demos/speech_web/speech_server/requirements.txt b/demos/speech_web/speech_server/requirements.txt index cdc654656..8425a1fee 100644 --- a/demos/speech_web/speech_server/requirements.txt +++ b/demos/speech_web/speech_server/requirements.txt @@ -1,8 +1,6 @@ aiofiles faiss-cpu -praatio==5.0.0 +praatio>=5.0.0 pydantic python-multipart -scikit_learn starlette -uvicorn diff --git a/docs/requirements.txt b/docs/requirements.txt index 5422c26f9..609f27925 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,11 +1,9 @@ braceexpand editdistance -fastapi g2p_en g2pM h5py inflect -jieba jsonlines kaldiio keyboard @@ -24,30 +22,23 @@ paddlespeech_ctcdecoders paddlespeech_feat pandas pattern_singleton -Pillow>=9.0.0 ppdiffusers>=0.9.0 -praatio==5.0.0 +praatio>=5.0.0 prettytable pypinyin-dict pypinyin<=0.44.0 python-dateutil -pyworld==0.2.12 +pyworld>=0.2.12 recommonmark>=0.5.0 -resampy==0.2.2 +resampy sacrebleu -scipy sphinx sphinx-autobuild sphinx-markdown-tables sphinx_rtd_theme textgrid timer -tqdm typeguard -uvicorn -visualdl webrtcvad websockets yacs~=0.1.8 diff --git a/setup.py b/setup.py index 2c97ce783..76bc5be8d 100644 --- a/setup.py +++ b/setup.py @@ -37,9 +37,7 @@ base = [ "g2pM", "h5py", "inflect", - "jieba", "jsonlines", - "kaldiio", "librosa==0.8.1", "loguru", "matplotlib", @@ -51,22 +49,16 @@ base = [ "paddlenlp>=2.4.8", "ppdiffusers>=0.9.0", "paddlespeech_feat", - "Pillow>=9.0.0", - "praatio==5.0.0", + "praatio>=5.0.0", "pypinyin<=0.44.0", "pypinyin-dict", "python-dateutil", - "pyworld==0.2.12", - "resampy==0.2.2", + "pyworld>=0.2.12", + "resampy", "sacrebleu", - "scipy", - "sentencepiece~=0.1.96", - "soundfile~=0.10", "textgrid", "timer", - "tqdm", "typeguard", - "visualdl", "webrtcvad", "yacs~=0.1.8", "prettytable", @@ -74,10 +66,10 @@ base = [ "braceexpand", "pyyaml", "paddleslim>=2.3.4", - "paddleaudio>=1.0.2", + "paddleaudio>=1.1.0", ] -server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"] +server = ["pattern_singleton", "websockets"] requirements = { "install": From 089c060756c9fe5494ad9e13a57e61451103fee1 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 6 Feb 2023 19:59:02 +0800 Subject: [PATCH 23/24] fix pwgan tipc (#2882) --- tests/test_tipc/prepare.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh index cb05a1d0f..9ff81bd8b 100755 --- a/tests/test_tipc/prepare.sh +++ b/tests/test_tipc/prepare.sh @@ -73,6 +73,9 @@ if [[ ${MODE} = "benchmark_train" ]];then mkdir -p BZNSYP unrar x BZNSYP.rar BZNSYP wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/benchmark/durations.txt + # avoid the program hanging when nltk_data cannot be downloaded due to network issues + wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz + tar -xzf nltk_data.tar.gz -C ${HOME} # 
data preprocessing python ../paddlespeech/t2s/exps/gan_vocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=../examples/csmsc/voc1/conf/default.yaml python ../utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" From 16d84367c6c7452deb0cc9955aa40298271637b0 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Tue, 7 Feb 2023 10:10:53 +0800 Subject: [PATCH 24/24] fix Tensor.numpy()[0] to float(Tensor) to adapt to 0-D Tensor (#2884) --- examples/tess/cls0/local/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tess/cls0/local/train.py b/examples/tess/cls0/local/train.py index 25382d8c3..f023a37b7 100644 --- a/examples/tess/cls0/local/train.py +++ b/examples/tess/cls0/local/train.py @@ -121,7 +121,7 @@ if __name__ == "__main__": optimizer.clear_grad() # Calculate loss - avg_loss += loss.numpy()[0] + avg_loss += float(loss) # Calculate metrics preds = paddle.argmax(logits, axis=1)
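
Taken together, the gradient-clipping changes in PATCH 13 and PATCH 15 are a mechanical mapping from deprecated `paddle.fluid.layers` ops onto their Paddle 2.x equivalents (`paddle.square`, `paddle.sum`, `paddle.sqrt`, `paddle.divide`, `paddle.maximum`, `paddle.multiply`). A minimal sketch of the same global-norm clipping math on toy gradients — not the `ClipGradByGlobalNormWithLog` class itself, and the variable names are illustrative:

```python
import paddle

# Toy gradients standing in for the (param, grad) pairs in params_grads.
grads = [paddle.rand([3, 4]), paddle.rand([5])]
clip_norm = 1.0

# Global gradient norm with 2.x ops
# (formerly layers.square / layers.reduce_sum / layers.sqrt).
sum_squares = [paddle.sum(paddle.square(g)) for g in grads]
global_norm = paddle.sqrt(paddle.add_n(sum_squares))

# Scale factor clip_norm / max(global_norm, clip_norm): equal to 1.0 when the
# norm is already small enough (formerly layers.elementwise_div / elementwise_max).
max_norm = paddle.full([1], clip_norm, dtype=global_norm.dtype)
clip_var = paddle.divide(max_norm, paddle.maximum(global_norm, max_norm))

# Rescale every gradient (formerly layers.elementwise_mul).
clipped = [paddle.multiply(g, clip_var) for g in grads]
print(float(global_norm), float(clip_var))
```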
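PATCH 19 zero-initialises the weights of the WaveNet denoiser's final 1x1 convolution, so the untrained network contributes (near-)zero predicted noise at the start of training, which tends to stabilise early diffusion optimisation. The same pattern in isolation, with hypothetical channel sizes:

```python
import paddle
from paddle import nn

# Final 1x1 conv of a denoiser; 256 skip channels -> 80 mel bins are
# illustrative values, not the exact configuration of the model.
final_conv = nn.Conv1D(256, 80, 1, bias_attr=True)
# Same initializer call as the patch: zero the weights in place.
nn.initializer.Constant(0.0)(final_conv.weight)

x = paddle.rand([2, 256, 100])
# With zero weights, only the (default-initialised) bias reaches the output.
print(float(final_conv(x).abs().max()))
```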
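PATCH 24 adapts to the 0-D Tensor semantics of newer Paddle releases: a scalar produced by a mean reduction may have shape `[]` rather than `[1]`, so `loss.numpy()[0]` raises an `IndexError`, while `float(loss)` handles both cases. A toy reproduction with random logits and labels, assuming a recent Paddle:

```python
import paddle

# A scalar loss as produced by a typical training step: the default
# 'mean' reduction yields a scalar Tensor (0-D on recent Paddle versions).
logits = paddle.rand([4, 10])
labels = paddle.randint(0, 10, [4])
loss = paddle.nn.functional.cross_entropy(logits, labels)

# Old pattern: assumes the scalar is wrapped in a 1-element array.
# loss.numpy()[0]  # raises IndexError once loss is a 0-D Tensor

# New pattern: float() works for both 0-D and shape-[1] scalar Tensors.
avg_loss = 0.0
avg_loss += float(loss)
print(avg_loss)
```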