From bcbb85af7668a17c6498200f4675a6ac41d868f6 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Thu, 23 May 2024 19:34:04 +0800
Subject: [PATCH 01/18] fixed version for paddlepaddle. (#3701)

* fixed version for paddlepaddle.

* fix code style
---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index af7c4dc3..8e81da6d 100644
--- a/setup.py
+++ b/setup.py
@@ -53,6 +53,7 @@ base = [
     "pandas",
     "paddleaudio>=1.1.0",
     "paddlenlp>=2.4.8",
+    "paddlepaddle==2.5.1",
     "paddleslim>=2.3.4",
     "ppdiffusers>=0.9.0",
     "paddlespeech_feat",

From 03022f2170ce76d2ca8385a92aa8df3519e2366b Mon Sep 17 00:00:00 2001
From: mjxs <52824616+kk-2000@users.noreply.github.com>
Date: Tue, 4 Jun 2024 10:34:39 +0800
Subject: [PATCH 02/18] =?UTF-8?q?=E3=80=90Fix=20Speech=20Issue=20No.5?=
 =?UTF-8?q?=E3=80=91issue=203444=20transformation=20import=20error=20(#377?=
 =?UTF-8?q?9)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix paddlespeech.s2t.transform.transformation import error

* fix paddlespeech.s2t.transform import error
---
 audio/tests/features/base.py                    | 2 +-
 audio/tests/features/test_istft.py              | 4 ++--
 audio/tests/features/test_log_melspectrogram.py | 2 +-
 audio/tests/features/test_spectrogram.py        | 2 +-
 audio/tests/features/test_stft.py               | 2 +-
 docs/tutorial/asr/tutorial_transformer.ipynb    | 4 ++--
 utils/apply-cmvn.py                             | 2 +-
 utils/compute-cmvn-stats.py                     | 2 +-
 utils/copy-feats.py                             | 2 +-
 utils/feat-to-shape.py                          | 2 +-
 10 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/audio/tests/features/base.py b/audio/tests/features/base.py
index d183b72a..3bb1d1dd 100644
--- a/audio/tests/features/base.py
+++ b/audio/tests/features/base.py
@@ -37,7 +37,7 @@ class FeatTest(unittest.TestCase):
         self.waveform, self.sr = load(os.path.abspath(os.path.basename(url)))
         self.waveform = self.waveform.astype(
             np.float32
-        )  # paddlespeech.s2t.transform.spectrogram only supports float32
+        )  # paddlespeech.audio.transform.spectrogram only supports float32
         dim = len(self.waveform.shape)
         assert dim in [1, 2]

diff --git a/audio/tests/features/test_istft.py b/audio/tests/features/test_istft.py
index 9cf8cdd6..ea1ee5cb 100644
--- a/audio/tests/features/test_istft.py
+++ b/audio/tests/features/test_istft.py
@@ -18,8 +18,8 @@ import paddle
 from paddleaudio.functional.window import get_window

 from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import IStft
-from paddlespeech.s2t.transform.spectrogram import Stft
+from paddlespeech.audio.transform.spectrogram import IStft
+from paddlespeech.audio.transform.spectrogram import Stft


 class TestIstft(FeatTest):

diff --git a/audio/tests/features/test_log_melspectrogram.py b/audio/tests/features/test_log_melspectrogram.py
index 7d568038..b2765d3b 100644
--- a/audio/tests/features/test_log_melspectrogram.py
+++ b/audio/tests/features/test_log_melspectrogram.py
@@ -18,7 +18,7 @@ import paddle
 import paddleaudio

 from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogram
+from paddlespeech.audio.transform.spectrogram import LogMelSpectrogram


 class TestLogMelSpectrogram(FeatTest):

diff --git a/audio/tests/features/test_spectrogram.py b/audio/tests/features/test_spectrogram.py
index 5fe5afee..6f460963 100644
--- a/audio/tests/features/test_spectrogram.py
+++ b/audio/tests/features/test_spectrogram.py
@@ -18,7 +18,7 @@ import paddle
 import paddleaudio

 from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import Spectrogram
+from paddlespeech.audio.transform.spectrogram import Spectrogram


 class TestSpectrogram(FeatTest):

diff --git a/audio/tests/features/test_stft.py b/audio/tests/features/test_stft.py
index 58792ffe..9511a292 100644
--- a/audio/tests/features/test_stft.py
+++ b/audio/tests/features/test_stft.py
@@ -18,7 +18,7 @@ import paddle
 from paddleaudio.functional.window import get_window

 from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import Stft
+from paddlespeech.audio.transform.spectrogram import Stft


 class TestStft(FeatTest):

diff --git a/docs/tutorial/asr/tutorial_transformer.ipynb b/docs/tutorial/asr/tutorial_transformer.ipynb
index dc303006..77aed4bf 100644
--- a/docs/tutorial/asr/tutorial_transformer.ipynb
+++ b/docs/tutorial/asr/tutorial_transformer.ipynb
@@ -236,8 +236,8 @@
     "warnings.filterwarnings('ignore')\n",
     "\n",
     "from yacs.config import CfgNode\n",
-    "from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogramKaldi\n",
-    "from paddlespeech.s2t.transform.cmvn import GlobalCMVN\n",
+    "from paddlespeech.audio.transform.spectrogram import LogMelSpectrogramKaldi\n",
+    "from paddlespeech.audio.transform.cmvn import GlobalCMVN\n",
     "from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer\n",
     "from paddlespeech.s2t.models.u2 import U2Model\n",
     "\n",

diff --git a/utils/apply-cmvn.py b/utils/apply-cmvn.py
index cf91bdfc..fa69ff8e 100755
--- a/utils/apply-cmvn.py
+++ b/utils/apply-cmvn.py
@@ -6,7 +6,7 @@ import kaldiio
 import numpy
 from distutils.util import strtobool

-from paddlespeech.s2t.transform.cmvn import CMVN
+from paddlespeech.audio.transform.cmvn import CMVN
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

diff --git a/utils/compute-cmvn-stats.py b/utils/compute-cmvn-stats.py
index 276bcd36..763347ce 100755
--- a/utils/compute-cmvn-stats.py
+++ b/utils/compute-cmvn-stats.py
@@ -5,7 +5,7 @@ import logging
 import kaldiio
 import numpy as np

-from paddlespeech.s2t.transform.transformation import Transformation
+from paddlespeech.audio.transform.transformation import Transformation
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

diff --git a/utils/copy-feats.py b/utils/copy-feats.py
index dc7a70b4..89ea30f9 100755
--- a/utils/copy-feats.py
+++ b/utils/copy-feats.py
@@ -4,7 +4,7 @@ import logging

 from distutils.util import strtobool

-from paddlespeech.s2t.transform.transformation import Transformation
+from paddlespeech.audio.transform.transformation import Transformation
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

diff --git a/utils/feat-to-shape.py b/utils/feat-to-shape.py
index bbc9242f..e5e014de 100755
--- a/utils/feat-to-shape.py
+++ b/utils/feat-to-shape.py
@@ -3,7 +3,7 @@ import argparse
 import logging
 import sys

-from paddlespeech.s2t.transform.transformation import Transformation
+from paddlespeech.audio.transform.transformation import Transformation
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

From 09e5d8a4ac03f29c2ce6511e1a3c39136cd3e29b Mon Sep 17 00:00:00 2001
From: Mattheliu <leonliuzx@outlook.com>
Date: Wed, 5 Jun 2024 10:41:32 +0800
Subject: [PATCH 03/18] =?UTF-8?q?=E3=80=90Fix=20Speech=20Issue=20No.8?=
 =?UTF-8?q?=E3=80=91issue=203652=20merge=5Fyi=20function=20has=20a=20bug?=
 =?UTF-8?q?=20(#3786)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 【Fix Speech Issue No.8】issue 3652 merge_yi function has a bug

* 【Fix Speech Issue No.8】issue 3652 merge_yi function has a bug
---
 paddlespeech/t2s/frontend/tone_sandhi.py | 25 ++++++++++--------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py
index 690f69aa..3558064c 100644
--- a/paddlespeech/t2s/frontend/tone_sandhi.py
+++ b/paddlespeech/t2s/frontend/tone_sandhi.py
@@ -237,30 +237,25 @@ class ToneSandhi():
     # output seg: [['听一听', 'v']]
     def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
         new_seg = []
+        skip_next = False
         # function 1
         for i, (word, pos) in enumerate(seg):
-            if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][
-                    0] == seg[i + 1][0] and seg[i - 1][1] == "v":
-                if i - 1 < len(new_seg):
-                    new_seg[i -
-                            1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
-                else:
-                    new_seg.append([word, pos])
-                    new_seg.append([seg[i + 1][0], pos])
+            if skip_next:
+                skip_next = False
+                continue
+            if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v":
+                new_seg[-1] = (new_seg[-1][0] + "一" + seg[i + 1][0], new_seg[-1][1])
+                skip_next = True
             else:
-                if i - 2 >= 0 and seg[i - 1][0] == "一" and seg[i - 2][
-                        0] == word and pos == "v":
-                    continue
-                else:
-                    new_seg.append([word, pos])
+                new_seg.append((word, pos))
         seg = new_seg
         new_seg = []
         # function 2
         for i, (word, pos) in enumerate(seg):
             if new_seg and new_seg[-1][0] == "一":
-                new_seg[-1][0] = new_seg[-1][0] + word
+                new_seg[-1] = (new_seg[-1][0] + word, new_seg[-1][1])
             else:
-                new_seg.append([word, pos])
+                new_seg.append((word, pos))
         return new_seg

     # the first and the second words are all_tone_three
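Patch 03 replaces the old index arithmetic in `_merge_yi` (which could write to `new_seg[i - 1]` with a stale index, or append the repeated verb twice) with a fold into `new_seg[-1]` plus a `skip_next` flag. A minimal standalone sketch of the fixed "function 1" logic, with a hypothetical input, behaves like this (segments are `(word, pos)` tuples as in the patch):

```python
from typing import List, Tuple


def merge_yi(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Merge "verb + 一 + same verb" (e.g. 听一听) into one segment."""
    new_seg = []
    skip_next = False
    for i, (word, pos) in enumerate(seg):
        if skip_next:
            skip_next = False
            continue
        if (i - 1 >= 0 and word == "一" and i + 1 < len(seg) and
                seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v"):
            # fold "一" and the repeated verb into the previous segment
            new_seg[-1] = (new_seg[-1][0] + "一" + seg[i + 1][0], new_seg[-1][1])
            skip_next = True
        else:
            new_seg.append((word, pos))
    return new_seg


print(merge_yi([("听", "v"), ("一", "m"), ("听", "v")]))  # [('听一听', 'v')]
```

The `skip_next` flag is what the original loop lacked: after folding `seg[i + 1]` into `new_seg[-1]`, the repeated verb must not be appended again on the next iteration.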
From 05660a62cb2f56c1af0773be06a75d8dbc18df20 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Wed, 5 Jun 2024 14:28:14 +0800
Subject: [PATCH 04/18] =?UTF-8?q?=E3=80=90test=E3=80=91add=20cli=20test=20?=
 =?UTF-8?q?readme=20(#3784)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add cli test readme

* fix code style
---
 tests/unit/cli/test_cli.sh |  2 ++
 tests/unit/doc/test_cli.md | 29 +++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)
 create mode 100644 tests/unit/doc/test_cli.md

diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index a7f7d11e..3bc2eae2 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -110,5 +110,7 @@ paddlespeech whisper --task transcribe --input ./zh.wav

 # whisper recognize text and translate to English
 paddlespeech whisper --task translate --input ./zh.wav
+# to change model English-Only model
+paddlespeech whisper --lang en --size base --task transcribe --input ./en.wav

 echo -e "\033[32mTest success !!!\033[0m"

diff --git a/tests/unit/doc/test_cli.md b/tests/unit/doc/test_cli.md
new file mode 100644
index 00000000..34a0c016
--- /dev/null
+++ b/tests/unit/doc/test_cli.md
@@ -0,0 +1,29 @@
+# test CLI 测试文档
+
+ 该文档为 CLI 测试说明,该测试目前覆盖大部分 paddlespeech 中的 CLI 推理。该 CI 建立后用于快速验证修复是否正确。
+
+ # 测试流程
+ ## 1. 环境安装
+
+ CI 重建时在已有通过版本 paddlepaddle-gpu==2.5.1, paddlepseech==develop 下运行。
+
+ CI 重建后在 paddlepaddle-gpu==develop, paddlepseech==develop 下运行。
+
+ ### 其他相关依赖
+
+ gcc >= 4.8.5,
+ python >= 3.8
+
+ ## 2. 功能测试
+
+ 在 repo 的 tests/unit/cli 中运行:
+
+ ```shell
+
+ source path.sh
+ bash test_cli.sh
+
+ ```
+## 3. 预期结果
+
+ 输出 "Test success",且运行过程中无报错或 Error 即为成功。

From 72ce8861779cc7fef9eb3277217878fd65375c58 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Thu, 6 Jun 2024 15:26:16 +0800
Subject: [PATCH 05/18] =?UTF-8?q?=E3=80=90test=E3=80=91fix=20test=20cli=20?=
 =?UTF-8?q?bug=20(#3793)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add cli test readme

* fix code style

* fix bug
---
 tests/unit/cli/test_cli.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index 3bc2eae2..3903e659 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -10,11 +10,12 @@ paddlespeech cls --input ./cat.wav --topk 10
 paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 --model ernie_linear_p3_wudao_fast

 # Speech SSL
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
 paddlespeech ssl --task asr --lang en --input ./en.wav
 paddlespeech ssl --task vector --lang en --input ./en.wav

 # Speech_recognition
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
 paddlespeech asr --input ./zh.wav
 paddlespeech asr --model conformer_aishell --input ./zh.wav
 paddlespeech asr --model conformer_online_aishell --input ./zh.wav

From e8018a11ce73176549d92ddbac9bc4b0bbdd2157 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Fri, 7 Jun 2024 14:11:36 +0800
Subject: [PATCH 06/18] Update setup.py (#3795)

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 8e81da6d..10a6502c 100644
--- a/setup.py
+++ b/setup.py
@@ -53,7 +53,7 @@ base = [
     "pandas",
     "paddleaudio>=1.1.0",
     "paddlenlp>=2.4.8",
-    "paddlepaddle==2.5.1",
+    "paddlepaddle-gpu==2.5.1",
     "paddleslim>=2.3.4",
     "ppdiffusers>=0.9.0",
     "paddlespeech_feat",

From 91170bd2604e5a22237fcb46ebcf44f4d86914b5 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Tue, 11 Jun 2024 11:12:58 +0800
Subject: [PATCH 07/18] adapt view behavior change, fix KeyError. (#3794)

* adapt view behavior change, fix KeyError.

* fix readme demo run error.

* fixed opencc version
---
 paddlespeech/cli/asr/infer.py                    | 2 +-
 paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py | 2 +-
 setup.py                                         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index 4001f957..231a00f4 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -274,7 +274,7 @@ class ASRExecutor(BaseExecutor):

             # fbank
             audio = preprocessing(audio, **preprocess_args)
-            audio_len = paddle.to_tensor([audio.shape[0]]).unsqueeze(axis=0)
+            audio_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0)
             audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)

             self._inputs["audio"] = audio

diff --git a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
index a3744d34..64195def 100755
--- a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
+++ b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
@@ -188,7 +188,7 @@ class Wav2vec2ASR(nn.Layer):
         x_lens = x.shape[1]
         ctc_probs = self.ctc.log_softmax(x)  # (B, maxlen, vocab_size)
         topk_prob, topk_index = ctc_probs.topk(1, axis=2)  # (B, maxlen, 1)
-        topk_index = topk_index.view([batch_size, x_lens])  # (B, maxlen)
+        topk_index = topk_index.reshape([batch_size, x_lens])  # (B, maxlen)

         hyps = [hyp.tolist() for hyp in topk_index]
         hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]

diff --git a/setup.py b/setup.py
index 10a6502c..030f7f88 100644
--- a/setup.py
+++ b/setup.py
@@ -48,7 +48,7 @@ base = [
     "matplotlib",
     "nara_wpe",
     "onnxruntime>=1.11.0",
-    "opencc",
+    "opencc==1.1.6",
     "opencc-python-reimplemented",
     "pandas",
     "paddleaudio>=1.1.0",
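Patch 07 touches two call sites that broke when Paddle's `view` semantics changed: the CTC decode path switches from `view` to `reshape`, and `to_tensor` is given a scalar instead of a one-element list so the following `unsqueeze` yields a rank-1 length tensor rather than a rank-2 one. A small illustrative sketch of the shape difference (not repository code; shapes assume a recent PaddlePaddle release with 0-D tensor support):

```python
import paddle

audio = paddle.rand([120, 80])  # hypothetical (frames, feat_dim) feature

# old: wrapping the length in a list, then unsqueezing -> shape [1, 1]
old_len = paddle.to_tensor([audio.shape[0]]).unsqueeze(axis=0)
# new: scalar length, then unsqueezing -> shape [1]
new_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0)
print(old_len.shape, new_len.shape)  # [1, 1] [1]

# reshape works regardless of how topk lays out its output,
# which is why it replaces view in the CTC greedy-search path
topk_index = paddle.rand([4, 10, 1]).topk(1, axis=2)[1]
print(topk_index.reshape([4, 10]).shape)  # [4, 10]
```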
From 98fe6d1153e59cbaf9653e00695e061169ec683c Mon Sep 17 00:00:00 2001
From: gmm <38800877+mmglove@users.noreply.github.com>
Date: Wed, 19 Jun 2024 17:36:59 +0800
Subject: [PATCH 08/18] =?UTF-8?q?=E3=80=90benchmark=E3=80=91fix=20benchmar?=
 =?UTF-8?q?k=20prepare.sh=20(#3803)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix profiler

* add max_mem_reserved for benchmark

* fix benchmark

* Update prepare.sh

* Update prepare.sh
---
 tests/test_tipc/prepare.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh
index e57feda0..7d4dd8b1 100755
--- a/tests/test_tipc/prepare.sh
+++ b/tests/test_tipc/prepare.sh
@@ -35,6 +35,8 @@ if [[ ${MODE} = "benchmark_train" ]];then
     pip install setuptools_scm #-i https://pypi.tuna.tsinghua.edu.cn/simple
     pip install . #-i https://pypi.tuna.tsinghua.edu.cn/simple
     pip install jsonlines
+    pip install -U scipy==1.12.0 # 高版本数据处理部分报错
+    pip install -U matplotlib==3.7.1 # 高版本报错cannot import name 'get_cmap' from 'matplotlib.cm'
     pip list
     cd -
     if [[ ${model_name} == "conformer" ]]; then

From 5e03da403b3c806a1cf1a736f17d2f16d4f61c51 Mon Sep 17 00:00:00 2001
From: funnycoder888 <funnycoder888@users.noreply.github.com>
Date: Mon, 8 Jul 2024 15:30:56 +0800
Subject: [PATCH 09/18] Fix spelling errors (#3807)

* Fix spelling errors

* Update README.md fix spelling error
---
 audio/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/audio/README.md b/audio/README.md
index d42d4122..a8c47efe 100644
--- a/audio/README.md
+++ b/audio/README.md
@@ -14,7 +14,7 @@ Linux test build whl environment:
 * gcc/g++ - 8.2.0
 * cmake - 3.18.0 (need install)

-MAC:test build whl envrioment:
+MAC:test build whl environment:
 * os
 * gcc/g++ 12.2.0
 * cpu Intel Xeon E5 x86_64

From 748a5f9d5c36ed6f1f2c8fb67aa66a366314635b Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Fri, 19 Jul 2024 18:18:16 +0800
Subject: [PATCH 10/18] fix (#3818)

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 030f7f88..941639e7 100644
--- a/setup.py
+++ b/setup.py
@@ -43,7 +43,7 @@ base = [
     # paddleaudio align with librosa==0.8.1, which need numpy==1.23.x
     "numpy==1.23.5",
     "librosa==0.8.1",
-    "scipy>=1.4.0",
+    "scipy>=1.4.0, <=1.12.0",
     "loguru",
     "matplotlib",
     "nara_wpe",

From 2e93229a9379868d2f76e1c2a113a18c5a55bece Mon Sep 17 00:00:00 2001
From: tianshuo78520a <tianshuo78520a@users.noreply.github.com>
Date: Tue, 23 Jul 2024 11:08:45 +0800
Subject: [PATCH 11/18] Fix (#3821)

* Fix

* Test CI Docker

* Test CI Docker
---
 tools/Dockerfile    |  4 ++++
 tools/pre_commit.sh | 54 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)
 create mode 100644 tools/Dockerfile
 create mode 100644 tools/pre_commit.sh

diff --git a/tools/Dockerfile b/tools/Dockerfile
new file mode 100644
index 00000000..18596f32
--- /dev/null
+++ b/tools/Dockerfile
@@ -0,0 +1,4 @@
+FROM registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.2-cudnn8.2-trt8.0-gcc82
+RUN apt-get update -y
+RUN apt-get -y install libsndfile1
+RUN pip3.8 install pytest-runner

diff --git a/tools/pre_commit.sh b/tools/pre_commit.sh
new file mode 100644
index 00000000..3a179782
--- /dev/null
+++ b/tools/pre_commit.sh
@@ -0,0 +1,54 @@
+set +x
+
+# use pre-commit 2.17
+if ! [[ $(pre-commit --version) == *"2.17.0"* ]]; then
+    pip install pre-commit==2.17.0 1>nul
+fi
+
+# Install clang-format before git commit to avoid repeat installation due to
+# pre-commit multi-thread running.
+readonly VERSION="13.0.0"
+version=$(clang-format -version)
+if ! [[ $(python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1$2}') -ge 36 ]]; then
+    echo "clang-format installation by pip need python version great equal 3.6,
+          please change the default python to higher version."
+    exit 1
+fi
+
+diff_files=$(git diff --name-only --diff-filter=ACMR ${BRANCH})
+num_diff_files=$(echo "$diff_files" | wc -l)
+echo -e "diff files between pr and ${BRANCH}:\n${diff_files}"
+
+echo "Checking code style by pre-commit ..."
+pre-commit run --files ${diff_files};check_error=$?
+
+if test ! -z "$(git diff)"; then
+    echo -e '\n************************************************************************************'
+    echo -e "These files have been formatted by code format hook. You should use pre-commit to \
+format them before git push."
+    echo -e '************************************************************************************\n'
+    git diff 2>&1
+fi
+
+echo -e '\n************************************************************************************'
+if [ ${check_error} != 0 ];then
+    echo "Your PR code style check failed."
+    echo "Please install pre-commit locally and set up git hook scripts:"
+    echo ""
+    echo "    pip install pre-commit==2.17.0"
+    echo "    pre-commit install"
+    echo ""
+    if [[ $num_diff_files -le 100 ]];then
+        echo "Then, run pre-commit to check codestyle issues in your PR:"
+        echo ""
+        echo "    pre-commit run --files" $(echo ${diff_files} | tr "\n" " ")
+        echo ""
+    fi
+    echo "For more information, please refer to our codestyle check guide:"
+    echo "https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/dev_guides/git_guides/codestyle_check_guide_cn.html"
+else
+    echo "Your PR code style check passed."
+fi
+echo -e '************************************************************************************\n'
+
+exit ${check_error}

From d615fc33de2f340f1b6ca81c71d08b9bfcdd9b94 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Wed, 14 Aug 2024 14:17:53 +0800
Subject: [PATCH 12/18] =?UTF-8?q?=E3=80=90ASR=E3=80=91fix=20acs=20demo=20(?=
 =?UTF-8?q?#3826)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix demo acs

* fix
---
 demos/audio_content_search/README.md          | 15 ++++++++++++---
 demos/audio_content_search/README_cn.md       | 18 ++++++++++++++----
 .../conf/ws_conformer_application.yaml        |  4 +++-
 .../ws_conformer_wenetspeech_application.yaml |  1 +
 4 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/demos/audio_content_search/README.md b/demos/audio_content_search/README.md
index 4428bf38..f04ac447 100644
--- a/demos/audio_content_search/README.md
+++ b/demos/audio_content_search/README.md
@@ -19,7 +19,7 @@ You can choose one way from meduim and hard to install paddlespeech.

 The dependency refers to the requirements.txt, and install the dependency as follows:
 ```
-pip install -r requriement.txt
+pip install -r requirements.txt
 ```

 ### 2. Prepare Input File

 Here are sample files for this demo that can be downloaded:
 ```bash
 wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
 ```

-### 3. Usage
+### 3. run paddlespeech_server
+Before using the client, it is necessary to start paddlespeech_servers.
+
+Here are sample server configuration:
+```bash
+bash demos/audio_content_search/run.sh
+```
+The logs of the two services will be recorded in 'acs.log' and 'streaming_asr.log' in this configuration.
+
+### 4. Usage
 - Command Line(Recommended)
   ```bash
   # Chinese
-  paddlespeech_client acs --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
+  paddlespeech_client acs --server_ip 127.0.0.1 --port 8490 --input ./zh.wav
   ```

   Usage:

diff --git a/demos/audio_content_search/README_cn.md b/demos/audio_content_search/README_cn.md
index 6f51c4cf..16c1a3dd 100644
--- a/demos/audio_content_search/README_cn.md
+++ b/demos/audio_content_search/README_cn.md
@@ -19,7 +19,7 @@

 依赖参见 requirements.txt, 安装依赖
 ```
-pip install -r requriement.txt
+pip install -r requirements.txt
 ```

 ### 2. 准备输入

 可以下载此 demo 的示例音频:
 ```bash
 wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
 ```

-### 3. 使用方法
+
+### 3. 启动 server
+使用 client 之前需要先启动 paddlespeech_server。
+
+可以使用默认 server 配置:
+```bash
+bash demos/audio_content_search/run.sh
+```
+该配置下两个服务的日志会被记录在 `acs.log` 和 `streaming_asr.log` 中。
+
+### 4. 使用方法
 - 命令行 (推荐使用)
   ```bash
   # 中文
-  paddlespeech_client acs --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
+  paddlespeech_client acs --server_ip 127.0.0.1 --port 8490 --input ./zh.wav
   ```

   使用方法:
   ```bash
-  paddlespeech acs --help
+  paddlespeech asr --help
   ```
   参数:
   - `input`(必须输入):用于识别的音频文件。

diff --git a/demos/audio_content_search/conf/ws_conformer_application.yaml b/demos/audio_content_search/conf/ws_conformer_application.yaml
index 97201382..ad34ec9f 100644
--- a/demos/audio_content_search/conf/ws_conformer_application.yaml
+++ b/demos/audio_content_search/conf/ws_conformer_application.yaml
@@ -26,8 +26,10 @@ asr_online:
     sample_rate: 16000
     cfg_path:
     decode_method: 'attention_rescoring'
+    num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
    force_yes: True
     device: 'cpu' # cpu or gpu:id
+    continuous_decoding: False # disable continue decoding when endpoint detected
     am_predictor_conf:
         device: # set 'gpu:id' or 'cpu'
         switch_ir_optim: True
@@ -40,4 +42,4 @@
     window_ms: 25 # ms
     shift_ms: 10 # ms
     sample_rate: 16000
-    sample_width: 2
+    sample_width: 2
\ No newline at end of file

diff --git a/demos/audio_content_search/conf/ws_conformer_wenetspeech_application.yaml b/demos/audio_content_search/conf/ws_conformer_wenetspeech_application.yaml
index c23680bd..ef1ce8d5 100644
--- a/demos/audio_content_search/conf/ws_conformer_wenetspeech_application.yaml
+++ b/demos/audio_content_search/conf/ws_conformer_wenetspeech_application.yaml
@@ -31,6 +31,7 @@ asr_online:
     force_yes: True
     device: 'cpu' # cpu or gpu:id
     decode_method: "attention_rescoring"
+    num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
     am_predictor_conf:
         device: # set 'gpu:id' or 'cpu'
         switch_ir_optim: True

From 0b568136d9f777d2e702d2cc1f40781f6fee8312 Mon Sep 17 00:00:00 2001
From: zhuyipin <zhuyipin@baidu.com>
Date: Tue, 20 Aug 2024 16:53:25 +0800
Subject: [PATCH 13/18] speedyspeech code adapt for npu (#3804)

* speedyspeech code adapt for npu

* fix npu inference

* fix e2e synthesize

* add paddle version control for memory optim config

* fix code style

* fix code style

* fix help message

* fix code style

* fix help message
---
 examples/csmsc/tts2/local/inference_npu.sh    |  46 +++++++
 .../csmsc/tts2/local/synthesize_e2e_npu.sh    | 124 ++++++++++++++++++
 examples/csmsc/tts2/local/synthesize_npu.sh   | 110 ++++++++++++++++
 examples/csmsc/tts2/local/train_npu.sh        |  16 +++
 examples/csmsc/tts2/run_npu.sh                |  42 ++++++
 paddlespeech/t2s/exps/inference.py            |   2 +-
 paddlespeech/t2s/exps/speedyspeech/train.py   |  29 ++--
 paddlespeech/t2s/exps/syn_utils.py            |   3 +-
 paddlespeech/t2s/exps/synthesize.py           |  19 ++-
 paddlespeech/t2s/exps/synthesize_e2e.py       |  19 ++-
 10 files changed, 392 insertions(+), 18 deletions(-)
 create mode 100644 examples/csmsc/tts2/local/inference_npu.sh
 create mode 100755 examples/csmsc/tts2/local/synthesize_e2e_npu.sh
 create mode 100755 examples/csmsc/tts2/local/synthesize_npu.sh
 create mode 100755 examples/csmsc/tts2/local/train_npu.sh
 create mode 100644 examples/csmsc/tts2/run_npu.sh

diff --git a/examples/csmsc/tts2/local/inference_npu.sh b/examples/csmsc/tts2/local/inference_npu.sh
new file mode 100644
index 00000000..0746a0cd
--- /dev/null
+++ b/examples/csmsc/tts2/local/inference_npu.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=speedyspeech_csmsc \
+        --voc=pwgan_csmsc \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --device npu
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=speedyspeech_csmsc \
+        --voc=mb_melgan_csmsc \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --device npu
+fi
+
+# hifigan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=speedyspeech_csmsc \
+        --voc=hifigan_csmsc \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --device npu
+fi

diff --git a/examples/csmsc/tts2/local/synthesize_e2e_npu.sh b/examples/csmsc/tts2/local/synthesize_e2e_npu.sh
new file mode 100755
index 00000000..1209a532
--- /dev/null
+++ b/examples/csmsc/tts2/local/synthesize_e2e_npu.sh
@@ -0,0 +1,124 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=pwgan_csmsc \
+        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference \
+        --ngpu=0 \
+        --nnpu=1
+
+
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=mb_melgan_csmsc \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# the pretrained models haven't release now
+# style melgan
+# style melgan's Dygraph to Static Graph is not ready now
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=style_melgan_csmsc \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+    # --inference_dir=${train_output_path}/inference
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=hifigan_csmsc \
+        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "in wavernn syn_e2e"
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=wavernn_csmsc \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference \
+        --ngpu=0 \
+        --nnpu=1
+fi

diff --git a/examples/csmsc/tts2/local/synthesize_npu.sh b/examples/csmsc/tts2/local/synthesize_npu.sh
new file mode 100755
index 00000000..90fcef83
--- /dev/null
+++ b/examples/csmsc/tts2/local/synthesize_npu.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=pwgan_csmsc \
+        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=mb_melgan_csmsc \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# style melgan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=style_melgan_csmsc \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "in hifigan syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=hifigan_csmsc \
+        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "in wavernn syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=wavernn_csmsc \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --tones_dict=dump/tone_id_map.txt \
+        --phones_dict=dump/phone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi

diff --git a/examples/csmsc/tts2/local/train_npu.sh b/examples/csmsc/tts2/local/train_npu.sh
new file mode 100755
index 00000000..46243e15
--- /dev/null
+++ b/examples/csmsc/tts2/local/train_npu.sh
@@ -0,0 +1,16 @@
+
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+
+python ${BIN_DIR}/train.py \
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
+    --ngpu=0 \
+    --nnpu=1 \
+    --phones-dict=dump/phone_id_map.txt \
+    --tones-dict=dump/tone_id_map.txt \
+    --use-relative-path=True

diff --git a/examples/csmsc/tts2/run_npu.sh b/examples/csmsc/tts2/run_npu.sh
new file mode 100644
index 00000000..f36c93f7
--- /dev/null
+++ b/examples/csmsc/tts2/run_npu.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+npus=0
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_76.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run_xpu.sh --stage 0 --stop-stage 0`
+# this can not be mixed use with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    FLAGS_selected_npus=${npus} ./local/train_npu.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize, vocoder is pwgan by default
+    FLAGS_selected_npus=${npus} ./local/synthesize_npu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # synthesize_e2e, vocoder is pwgan by default
+    FLAGS_selected_npus=${npus} ./local/synthesize_e2e_npu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # inference with static model
+    FLAGS_selected_npus=${npus} ./local/inference_npu.sh ${train_output_path} || exit -1
+fi

diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py
index 8a526982..21d105ad 100644
--- a/paddlespeech/t2s/exps/inference.py
+++ b/paddlespeech/t2s/exps/inference.py
@@ -112,7 +112,7 @@ def parse_args():
     parser.add_argument(
         "--device",
         default="gpu",
-        choices=["gpu", "cpu", "xpu"],
+        choices=["gpu", "cpu", "xpu", "npu"],
         help="Device selected for inference.", )
     parser.add_argument('--cpu_threads', type=int, default=1)

diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py
index c90090da..b82d6880 100644
--- a/paddlespeech/t2s/exps/speedyspeech/train.py
+++ b/paddlespeech/t2s/exps/speedyspeech/train.py
@@ -45,15 +45,18 @@ def train_sp(args, config):
     # decides device type and whether to run in parallel
     # setup running environment correctly
     world_size = paddle.distributed.get_world_size()
-    if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
-        if (not paddle.is_compiled_with_xpu()) or args.nxpu == 0:
-            paddle.set_device("cpu")
-        else:
-            paddle.set_device("xpu")
-    else:
+    if paddle.is_compiled_with_cuda() and args.ngpu > 0:
         paddle.set_device("gpu")
         if world_size > 1:
             paddle.distributed.init_parallel_env()
+    elif paddle.is_compiled_with_xpu() and args.nxpu > 0:
+        paddle.device.set_device("xpu")
+    elif args.nnpu > 0:
+        paddle.device.set_device("npu")
+        if world_size > 1:
+            paddle.distributed.init_parallel_env()
+    else:
+        paddle.set_device("cpu")

     # set the random seed, it is a must for multiprocess training
     seed_everything(config.seed)
@@ -191,9 +194,19 @@ def main():
         "--nxpu",
         type=int,
         default=0,
-        help="if nxpu == 0 and ngpu == 0, use cpu.")
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+    )
+    parser.add_argument(
+        "--nnpu",
+        type=int,
+        default=0,
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
+    )
     parser.add_argument(
-        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu")
+        "--ngpu",
+        type=int,
+        default=1,
+        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")

     parser.add_argument(
         "--use-relative-path",

diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index 9a07df64..d29dd811 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -591,7 +591,8 @@ def get_predictor(
     config = inference.Config(
         str(Path(model_dir) / model_file), str(Path(model_dir) / params_file))
-    config.enable_memory_optim()
+    if paddle.__version__ <= "2.5.2" and paddle.__version__ != "0.0.0":
+        config.enable_memory_optim()
     config.switch_ir_optim(True)
     if device == "gpu":
         config.enable_use_gpu(100, device_id)

diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py
index e7cf7850..9eb45989 100644
--- a/paddlespeech/t2s/exps/synthesize.py
+++ b/paddlespeech/t2s/exps/synthesize.py
@@ -219,12 +219,21 @@ def parse_args():
     )
     # other
     parser.add_argument(
-        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.")
+        "--ngpu",
+        type=int,
+        default=1,
+        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")
     parser.add_argument(
         "--nxpu",
         type=int,
         default=0,
-        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu."
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+    )
+    parser.add_argument(
+        "--nnpu",
+        type=int,
+        default=0,
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
     )
     parser.add_argument("--test_metadata", type=str, help="test metadata.")
     parser.add_argument("--output_dir", type=str, help="output dir.")
@@ -245,10 +254,12 @@ def main():
         paddle.set_device("gpu")
     elif args.nxpu > 0:
         paddle.set_device("xpu")
-    elif args.ngpu == 0 and args.nxpu == 0:
+    elif args.nnpu > 0:
+        paddle.set_device("npu")
+    elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0:
         paddle.set_device("cpu")
     else:
-        print("ngpu or nxpu should >= 0 !")
+        print("ngpu, nxpu and nnpu should be >= 0")

     evaluate(args)

diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index c63a5fbe..b9073124 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -299,12 +299,21 @@ def parse_args():
         default=None,
         help="dir to save inference models")
     parser.add_argument(
-        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.")
+        "--ngpu",
+        type=int,
+        default=1,
+        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")
     parser.add_argument(
         "--nxpu",
         type=int,
        default=0,
-        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu."
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+    )
+    parser.add_argument(
+        "--nnpu",
+        type=int,
+        default=0,
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
+    )
     parser.add_argument(
         "--text",
@@ -339,10 +348,12 @@ def main():
         paddle.set_device("gpu")
     elif args.nxpu > 0:
         paddle.set_device("xpu")
-    elif args.ngpu == 0 and args.nxpu == 0:
+    elif args.nnpu > 0:
+        paddle.set_device("npu")
+    elif args.ngpu == 0 and args.nxpu == 0 or args.nnpu == 0:
         paddle.set_device("cpu")
     else:
-        print("ngpu or nxpu should >= 0 !")
+        print("ngpu, nxpu and nnpu should be >= 0")

     evaluate(args)
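The `syn_utils.py` hunk in patch 13 guards `config.enable_memory_optim()` behind a version check, with `"0.0.0"` excluded because develop builds of PaddlePaddle report that version string. Note that `paddle.__version__ <= "2.5.2"` is a plain string comparison; it works for the releases this patch targets, but lexicographic comparison misorders version strings such as `2.10.0` versus `2.5.2`. A more robust variant, sketched here as an assumption rather than project policy, would parse the versions first:

```python
# illustrative alternative to the string comparison in get_predictor()
from packaging.version import Version

import paddle


def memory_optim_supported(version: str = paddle.__version__) -> bool:
    """True when enable_memory_optim() should still be called."""
    if version == "0.0.0":  # develop/nightly builds report 0.0.0
        return False
    return Version(version) <= Version("2.5.2")
```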
From 4be005858b75c380a6a7b614108bafd1db8dddd6 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Thu, 22 Aug 2024 11:18:29 +0800
Subject: [PATCH 14/18] =?UTF-8?q?=E3=80=90DOC=E3=80=91fix=20demos=20bug=20?=
 =?UTF-8?q?(#3830)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix demos

* fix test
---
 demos/audio_searching/requirements.txt                      | 6 +++---
 demos/streaming_asr_server/README.md                        | 4 ++--
 demos/streaming_asr_server/README_cn.md                     | 4 ++--
 demos/style_fs2/run.sh                                      | 2 +-
 .../unit/server/online/tts/test_server/test_http_client.py | 2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/demos/audio_searching/requirements.txt b/demos/audio_searching/requirements.txt
index 9d0f6419..3c0f05af 100644
--- a/demos/audio_searching/requirements.txt
+++ b/demos/audio_searching/requirements.txt
@@ -1,5 +1,5 @@
-diskcache==5.2.1
-dtaidistance==2.3.1
+diskcache
+dtaidistane
 fastapi
 librosa==0.8.0
 numpy==1.22.0
@@ -10,4 +10,4 @@ python-multipart
 soundfile==0.10.3.post1
 starlette
 typing
-uvicorn
\ No newline at end of file
+uvicorn

diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md
index 31256d15..136863b9 100644
--- a/demos/streaming_asr_server/README.md
+++ b/demos/streaming_asr_server/README.md
@@ -429,7 +429,7 @@ bash server.sh
   If `127.0.0.1` is not accessible, you need to use the actual service IP address.

   ```bash
-  paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
+  paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
   ```
   Output:
   ```text
@@ -507,7 +507,7 @@ bash server.sh
   If `127.0.0.1` is not accessible, you need to use the actual service IP address.

   ```bash
-  python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
+  python3 local/websocket_client.py --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
   ```
   Output:
   ```text
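As context for the streaming ASR command fixes in this patch: the corrected commands stream a wav file to the server over a websocket. A minimal client sketch in the same spirit is shown below; it is not the repository's `local/websocket_client.py`, and the endpoint path and handshake fields are assumptions for illustration only:

```python
# hypothetical minimal streaming client (pip install websockets)
import asyncio
import json
import wave

import websockets


async def transcribe(uri: str, wav_path: str, chunk_ms: int = 85) -> None:
    with wave.open(wav_path, "rb") as wav:
        frames = wav.getframerate() * chunk_ms // 1000
        async with websockets.connect(uri) as ws:
            await ws.send(json.dumps({"signal": "start"}))  # assumed handshake
            while True:
                chunk = wav.readframes(frames)
                if not chunk:
                    break
                await ws.send(chunk)    # raw PCM bytes
                print(await ws.recv())  # partial result from the server
            await ws.send(json.dumps({"signal": "end"}))
            print(await ws.recv())      # final transcript


asyncio.run(transcribe("ws://127.0.0.1:8090/paddlespeech/asr/streaming", "zh.wav"))
```

The remaining hunks of patch 14 continue below.

diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md
index bbddd693..f5f477ea 100644
--- a/demos/streaming_asr_server/README_cn.md
+++ b/demos/streaming_asr_server/README_cn.md
@@ -428,7 +428,7 @@ bash server.sh
   若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址

   ```bash
-  paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
+  paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
   ```
   输出:
   ```text
@@ -506,7 +506,7 @@ bash server.sh
   若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址

   ```bash
-  python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
+  python3 local/websocket_client.py --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
   ```
   输出:
   ```text

diff --git a/demos/style_fs2/run.sh b/demos/style_fs2/run.sh
index 6f6d6068..45fc0c10 100755
--- a/demos/style_fs2/run.sh
+++ b/demos/style_fs2/run.sh
@@ -32,7 +32,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
         --pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
         --pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=./sentences.txt \
        --output-dir=output \
         --phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
 fi

diff --git a/tests/unit/server/online/tts/test_server/test_http_client.py b/tests/unit/server/online/tts/test_server/test_http_client.py
index 3174e85e..685c5ca9 100644
--- a/tests/unit/server/online/tts/test_server/test_http_client.py
+++ b/tests/unit/server/online/tts/test_server/test_http_client.py
@@ -48,7 +48,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--text",
         type=str,
-        default="../../../../../../paddlespeech/t2s/exps/csmsc_test.txt",
+        default="../../../../../../paddlespeech/t2s/assets/csmsc_test.txt",
         help="text to synthesize, a 'utt_id sentence' pair per line")
     parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
     parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')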
examples/csmsc/tts2/local/train_mlu.sh create mode 100755 examples/csmsc/tts2/run_mlu.sh diff --git a/examples/csmsc/tts2/local/inference_mlu.sh b/examples/csmsc/tts2/local/inference_mlu.sh new file mode 100755 index 00000000..d1bade84 --- /dev/null +++ b/examples/csmsc/tts2/local/inference_mlu.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=speedyspeech_csmsc \ + --voc=mb_melgan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device mlu +fi + +# hifigan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=speedyspeech_csmsc \ + --voc=hifigan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device mlu +fi diff --git a/examples/csmsc/tts2/local/synthesize_e2e_mlu.sh b/examples/csmsc/tts2/local/synthesize_e2e_mlu.sh new file mode 100755 index 00000000..7ad2024f --- /dev/null +++ b/examples/csmsc/tts2/local/synthesize_e2e_mlu.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=0 + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# the pretrained models haven't release now +# style melgan +# style melgan's Dygraph to Static Graph is not ready now +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 + # --inference_dir=${train_output_path}/inference +fi + +# hifigan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + 
--am_stat=dump/train/feats_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nmlu=1 +fi + +# wavernn +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in wavernn syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nmlu=1 +fi diff --git a/examples/csmsc/tts2/local/synthesize_mlu.sh b/examples/csmsc/tts2/local/synthesize_mlu.sh new file mode 100755 index 00000000..6c0b0b65 --- /dev/null +++ b/examples/csmsc/tts2/local/synthesize_mlu.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +stage=0 +stop_stage=0 + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# style melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# hifigan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + echo "in hifigan syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + 
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# wavernn +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in wavernn syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --tones_dict=dump/tone_id_map.txt \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi diff --git a/examples/csmsc/tts2/local/train_mlu.sh b/examples/csmsc/tts2/local/train_mlu.sh new file mode 100755 index 00000000..4c148643 --- /dev/null +++ b/examples/csmsc/tts2/local/train_mlu.sh @@ -0,0 +1,16 @@ + +#!/bin/bash + +config_path=$1 +train_output_path=$2 +# export MLU_VISIBLE_DEVICES=8 +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=0 \ + --nmlu=2 \ + --phones-dict=dump/phone_id_map.txt \ + --tones-dict=dump/tone_id_map.txt \ + --use-relative-path=True diff --git a/examples/csmsc/tts2/run_mlu.sh b/examples/csmsc/tts2/run_mlu.sh new file mode 100755 index 00000000..848e5407 --- /dev/null +++ b/examples/csmsc/tts2/run_mlu.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +set -e +source path.sh +export CUSTOM_DEVICE_BLACK_LIST=elementwise_max +mlus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_30600.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... 
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model; all `ckpt` files are saved under the `train_output_path/checkpoints/` dir
+    FLAGS_selected_mlus=${mlus} ./local/train_mlu.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize; the vocoder is mb_melgan by default (stage 0 of synthesize_mlu.sh)
+    FLAGS_selected_mlus=${mlus} ./local/synthesize_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # synthesize_e2e; the vocoder is mb_melgan by default
+    FLAGS_selected_mlus=${mlus} ./local/synthesize_e2e_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # inference with static model
+    FLAGS_selected_mlus=${mlus} ./local/inference_mlu.sh ${train_output_path} || exit -1
+fi
+
+# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first
+# we have only tested the following models so far
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    # install paddle2onnx
+    pip install paddle2onnx --upgrade
+    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx speedyspeech_csmsc
+    # considering the balance between speed and quality, we recommend that you use hifigan as vocoder
+    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc
+    # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc
+    # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
+fi
+
+# inference with onnxruntime
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    ./local/ort_predict.sh ${train_output_path}
+fi
+
+# must run after stage 3 (the stage that generates the static models)
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+    ./local/export2lite.sh ${train_output_path} inference pdlite speedyspeech_csmsc x86
+    ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86
+    # ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86
+    # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86
+fi
+
+if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
+fi
+
+# PTQ_static
+if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} speedyspeech_csmsc || exit -1
+fi
diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py
index 21d105ad..e8ddd3be 100644
--- a/paddlespeech/t2s/exps/inference.py
+++ b/paddlespeech/t2s/exps/inference.py
@@ -112,7 +112,7 @@ def parse_args():
     parser.add_argument(
         "--device",
         default="gpu",
-        choices=["gpu", "cpu", "xpu", "npu"],
+        choices=["gpu", "cpu", "xpu", "npu", "mlu"],
         help="Device selected for inference.",
     )
     parser.add_argument('--cpu_threads', type=int, default=1)
diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py
index b82d6880..b1916fbc 100644
--- a/paddlespeech/t2s/exps/speedyspeech/train.py
+++ b/paddlespeech/t2s/exps/speedyspeech/train.py
@@ -55,6 +55,8 @@ def train_sp(args, config):
         paddle.device.set_device("npu")
         if world_size > 1:
             paddle.distributed.init_parallel_env()
+    elif args.nmlu > 0:
+        paddle.device.set_device("mlu")
     else:
         paddle.set_device("cpu")
 
@@ -194,13 +196,19 @@
         "--nxpu",
         type=int,
         default=0,
-        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
     )
     parser.add_argument(
         "--nnpu",
         type=int,
         default=0,
-        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
+    )
+    parser.add_argument(
+        "--nmlu",
+        type=int,
+        default=0,
+        help="if wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
     )
     parser.add_argument(
         "--ngpu",
diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py
index 9eb45989..b159725e 100644
--- a/paddlespeech/t2s/exps/synthesize.py
+++ b/paddlespeech/t2s/exps/synthesize.py
@@ -222,18 +222,25 @@ def parse_args():
         "--ngpu",
         type=int,
         default=1,
-        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")
+        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu."
+    )
     parser.add_argument(
         "--nxpu",
         type=int,
         default=0,
-        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
     )
     parser.add_argument(
         "--nnpu",
         type=int,
         default=0,
-        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
+    )
+    parser.add_argument(
+        "--nmlu",
+        type=int,
+        default=0,
+        help="if wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
     )
     parser.add_argument("--test_metadata", type=str, help="test metadata.")
     parser.add_argument("--output_dir", type=str, help="output dir.")
@@ -256,10 +263,14 @@ def main():
         paddle.set_device("xpu")
     elif args.nnpu > 0:
         paddle.set_device("npu")
-    elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0:
+    elif args.nmlu > 0:
+        paddle.set_device("mlu")
+    elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0:
         paddle.set_device("cpu")
     else:
-        print("ngpu, nxpu and nnpu should be >= 0")
+        print(
+            "one of ngpu, nxpu, nnpu and nmlu should be greater than 0, or all of them should equal 0"
+        )
 
     evaluate(args)
diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index b9073124..08a14b31 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -302,18 +302,25 @@ def parse_args():
         "--ngpu",
         type=int,
         default=1,
-        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")
+        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu."
+    )
     parser.add_argument(
         "--nxpu",
         type=int,
         default=0,
-        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
    )
     parser.add_argument(
         "--nnpu",
         type=int,
         default=0,
-        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
+    )
+    parser.add_argument(
+        "--nmlu",
+        type=int,
+        default=0,
+        help="if wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
     )
     parser.add_argument(
         "--text",
@@ -350,10 +357,14 @@ def main():
         paddle.set_device("xpu")
     elif args.nnpu > 0:
         paddle.set_device("npu")
-    elif args.ngpu == 0 and args.nxpu == 0 or args.nnpu == 0:
+    elif args.nmlu > 0:
+        paddle.set_device("mlu")
+    elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0:
         paddle.set_device("cpu")
     else:
-        print("ngpu, nxpu and nnpu should be >= 0")
+        print(
+            "one of ngpu, nxpu, nnpu and nmlu should be greater than 0, or all of them should equal 0"
+        )
 
     evaluate(args)

From d9eb82a6324bdc3ab7bfd9d38ced92ae7e9693c5 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Thu, 29 Aug 2024 19:35:26 +0800
Subject: [PATCH 16/18] fix unit test (#3835)

---
 tests/unit/asr/deepspeech2_model_test.py | 10 ++++----
 .../unit/asr/deepspeech2_online_model_test.py | 24 +++++++++----------
 .../unit/server/offline/test_server_client.sh | 2 ++
 tests/unit/tts/test_data_table.py | 2 +-
 4 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/tests/unit/asr/deepspeech2_model_test.py b/tests/unit/asr/deepspeech2_model_test.py
index 5835445d..fd42192e 100644
--- a/tests/unit/asr/deepspeech2_model_test.py
+++ b/tests/unit/asr/deepspeech2_model_test.py
@@ -48,7 +48,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
             num_rnn_layers=3,
             rnn_size=1024,
             use_gru=False,
-            share_rnn_weights=False, )
+            rnn_direction="forward", )
         loss = model(self.audio, self.audio_len, self.text, self.text_len)
         self.assertEqual(loss.numel(), 1)
 
@@ -60,7 +60,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
             num_rnn_layers=3,
             rnn_size=1024,
             use_gru=True,
-            share_rnn_weights=False, )
+            rnn_direction="forward", )
         loss = model(self.audio, self.audio_len, self.text, self.text_len)
         self.assertEqual(loss.numel(), 1)
 
@@ -72,7 +72,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
             num_rnn_layers=3,
             rnn_size=1024,
             use_gru=False,
-            share_rnn_weights=True, )
+            rnn_direction="bidirect", )
         loss = model(self.audio, self.audio_len, self.text, self.text_len)
         self.assertEqual(loss.numel(), 1)
 
@@ -84,7 +84,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
             num_rnn_layers=3,
             rnn_size=1024,
             use_gru=True,
-            share_rnn_weights=True, )
+            rnn_direction="bidirect", )
         loss = model(self.audio, self.audio_len, self.text, self.text_len)
         self.assertEqual(loss.numel(), 1)
 
@@ -96,7 +96,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
             num_rnn_layers=3,
             rnn_size=1024,
             use_gru=False,
-            share_rnn_weights=False, )
+            rnn_direction="forward", )
         loss = model(self.audio, self.audio_len, self.text, self.text_len)
         self.assertEqual(loss.numel(), 1)
 
diff --git a/tests/unit/asr/deepspeech2_online_model_test.py b/tests/unit/asr/deepspeech2_online_model_test.py
index f23c4926..f7ea87b1 100644
--- a/tests/unit/asr/deepspeech2_online_model_test.py
+++ b/tests/unit/asr/deepspeech2_online_model_test.py
@@ -19,11 +19,11 @@
 import numpy as np
 import paddle
 from paddle import inference
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
+from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
+from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
 
 
-class TestDeepSpeech2ModelOnline(unittest.TestCase):
+class TestDeepSpeech2Model(unittest.TestCase):
     def setUp(self):
         paddle.set_device('cpu')
@@ -45,7 +45,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.text_len = paddle.to_tensor(text_len, dtype='int64')
 
     def test_ds2_1(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -58,7 +58,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.assertEqual(loss.numel(), 1)
 
     def test_ds2_2(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -71,7 +71,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.assertEqual(loss.numel(), 1)
 
     def test_ds2_3(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -84,7 +84,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.assertEqual(loss.numel(), 1)
 
     def test_ds2_4(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -97,7 +97,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.assertEqual(loss.numel(), 1)
 
     def test_ds2_5(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -110,7 +110,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.assertEqual(loss.numel(), 1)
 
     def test_ds2_6(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -125,7 +125,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
 
     def test_ds2_7(self):
         use_gru = False
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -156,7 +156,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
 
     def test_ds2_8(self):
         use_gru = True
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -191,7 +191,7 @@ class TestDeepSpeech2StaticModelOnline(unittest.TestCase):
         export_prefix = "exp/deepspeech2_online/checkpoints/test_export"
         if not os.path.exists(os.path.dirname(export_prefix)):
             os.makedirs(os.path.dirname(export_prefix), mode=0o755)
-        infer_model = DeepSpeech2InferModelOnline(
+        infer_model = DeepSpeech2InferModel(
             feat_size=161,
             dict_size=4233,
             num_conv_layers=2,
diff --git a/tests/unit/server/offline/test_server_client.sh b/tests/unit/server/offline/test_server_client.sh
index dc52609c..29bdd403 100644
--- a/tests/unit/server/offline/test_server_client.sh
+++ b/tests/unit/server/offline/test_server_client.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 # bash test_server_client.sh
+## requires lsof to get the server pid
+## apt-get install -y lsof
 
 StartService(){
     # Start service
diff --git a/tests/unit/tts/test_data_table.py b/tests/unit/tts/test_data_table.py
index 3ff5bc1a..773942a2 100644
--- a/tests/unit/tts/test_data_table.py
+++ b/tests/unit/tts/test_data_table.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from paddlespeech.t2s.datasets.data_tabel import DataTable
+from paddlespeech.t2s.datasets.data_table import DataTable
 
 
 def test_audio_dataset():

From 7e52aaed74f87b02af6d03098ff9f65e3224f5ce Mon Sep 17 00:00:00 2001
From: tianshuo78520a <707759223@qq.com>
Date: Fri, 30 Aug 2024 13:09:29 +0800
Subject: [PATCH 17/18] Add tests (#3836)

* Add tests
* fix
* Fix
* Fix
* disable deepspeech2_online_model_test
* disable test_data_table
* Fix

---
 tests/unit/ci.sh | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 tests/unit/ci.sh

diff --git a/tests/unit/ci.sh b/tests/unit/ci.sh
new file mode 100644
index 00000000..9342a268
--- /dev/null
+++ b/tests/unit/ci.sh
@@ -0,0 +1,31 @@
+function main(){
+    set -ex
+    speech_ci_path=`pwd`
+
+    echo "Start asr"
+    cd ${speech_ci_path}/asr
+    bash deepspeech2_online_model_test.sh
+    python error_rate_test.py
+    python mask_test.py
+    python reverse_pad_list.py
+    echo "End asr"
+
+    echo "Start TTS"
+    cd ${speech_ci_path}/tts
+    python test_data_table.py
+    python test_enfrontend.py
+    python test_mixfrontend.py
+    echo "End TTS"
+
+    echo "Start Vector"
+    cd ${speech_ci_path}/vector
+    python test_augment.py
+    echo "End Vector"
+
+    echo "Start cli"
+    cd ${speech_ci_path}/cli
+    bash test_cli.sh
+    echo "End cli"
+}
+
+main

From f66d7d25c40987bf4262ae7f17b442b0d7d4f356 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Thu, 5 Sep 2024 17:05:29 +0800
Subject: [PATCH 18/18] fix matplotlib version for incompatible upgrade (#3841)

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 941639e7..48c68485 100644
--- a/setup.py
+++ b/setup.py
@@ -45,7 +45,7 @@ base = [
     "librosa==0.8.1",
     "scipy>=1.4.0, <=1.12.0",
     "loguru",
-    "matplotlib",
+    "matplotlib<=3.8.4",
     "nara_wpe",
     "onnxruntime>=1.11.0",
     "opencc==1.1.6",
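
A note on the device-selection pattern that patches 15 and 16 above touch repeatedly: every patched entry point (inference.py, speedyspeech/train.py, synthesize.py, synthesize_e2e.py) walks the same elif chain, in which the first positive counter among --ngpu, --nxpu, --nnpu and --nmlu (checked in that order) picks the Paddle runtime device, and all-zero falls back to CPU. The sketch below is illustrative only and is not part of the patches: the helper name select_device is invented here, and it raises on negative counters where the patched scripts merely print a message; only paddle.device.set_device is the real API.

    import paddle

    def select_device(ngpu=0, nxpu=0, nnpu=0, nmlu=0):
        """Mirror the elif chain used by the patched t2s entry points.

        The first positive counter wins, in gpu -> xpu -> npu -> mlu
        order; when all four counters are zero, fall back to CPU.
        """
        counters = {"gpu": ngpu, "xpu": nxpu, "npu": nnpu, "mlu": nmlu}
        if any(n < 0 for n in counters.values()):
            # the patched scripts print a warning here instead of raising
            raise ValueError("ngpu, nxpu, nnpu and nmlu must all be >= 0")
        for device, count in counters.items():
            if count > 0:
                paddle.device.set_device(device)
                return device
        paddle.device.set_device("cpu")
        return "cpu"

For example, train_mlu.sh passes --ngpu=0 --nmlu=2, which lands in the mlu branch, while the GPU recipes keep the default --ngpu=1 and never reach the later branches.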