From 1113a68a6de08187d7311fabbbbb629fea40b134 Mon Sep 17 00:00:00 2001 From: fazledyn-or Date: Tue, 3 Oct 2023 19:10:48 +0600 Subject: [PATCH 01/11] FIX: Added missing imports --- paddlespeech/audio/transform/perturb.py | 1 + paddlespeech/s2t/io/reader.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/paddlespeech/audio/transform/perturb.py b/paddlespeech/audio/transform/perturb.py index 0825caec8..78b8d2c34 100644 --- a/paddlespeech/audio/transform/perturb.py +++ b/paddlespeech/audio/transform/perturb.py @@ -14,6 +14,7 @@ # Modified from espnet(https://github.com/espnet/espnet) import io import os +import sys import h5py import librosa diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py index 5e018befb..be643cc7b 100644 --- a/paddlespeech/s2t/io/reader.py +++ b/paddlespeech/s2t/io/reader.py @@ -14,9 +14,12 @@ # Modified from espnet(https://github.com/espnet/espnet) from collections import OrderedDict +import io +import os import kaldiio import numpy as np import soundfile +import h5py from .utility import feat_type from paddlespeech.audio.transform.transformation import Transformation From 1a693448faaa07aa54d1a2a1b89c7c7c4d7427a2 Mon Sep 17 00:00:00 2001 From: fazledyn-or Date: Tue, 3 Oct 2023 19:12:00 +0600 Subject: [PATCH 02/11] FIX: Fixed the implementation of a special method --- paddlespeech/audio/transform/perturb.py | 2 +- paddlespeech/s2t/io/reader.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/audio/transform/perturb.py b/paddlespeech/audio/transform/perturb.py index 78b8d2c34..757a2f1bf 100644 --- a/paddlespeech/audio/transform/perturb.py +++ b/paddlespeech/audio/transform/perturb.py @@ -99,7 +99,7 @@ class SoundHDF5File(): def __contains__(self, item): return item in self.file - def __len__(self, item): + def __len__(self): return len(self.file) def __enter__(self): diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py index be643cc7b..d433a643f 100644 --- 
a/paddlespeech/s2t/io/reader.py +++ b/paddlespeech/s2t/io/reader.py @@ -404,7 +404,7 @@ class SoundHDF5File(): def __contains__(self, item): return item in self.file - def __len__(self, item): + def __len__(self): return len(self.file) def __enter__(self): From 1b8ca706d6a8e0a8b97ee21d93314a245d777a69 Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Tue, 5 Dec 2023 14:57:20 +0800 Subject: [PATCH 03/11] =?UTF-8?q?=E3=80=90benchmark=E3=80=91fix=20gpu=5Fme?= =?UTF-8?q?m=20unit=20(#3634)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix profiler * add max_mem_reserved for benchmark * fix benchmark --- paddlespeech/t2s/training/trainer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddlespeech/t2s/training/trainer.py b/paddlespeech/t2s/training/trainer.py index 7f1b17de2..7631ef350 100644 --- a/paddlespeech/t2s/training/trainer.py +++ b/paddlespeech/t2s/training/trainer.py @@ -164,9 +164,10 @@ class Trainer(object): self.updater. 
batch_size) + "avg_ips: {:.5f} sequences/sec,".format( self.updater.batch_size / avg_batch_cost) - max_mem_reserved_str = f" max_mem_reserved: {paddle.device.cuda.max_memory_reserved()} B" - max_mem_allocated_str = f" max_mem_allocated: {paddle.device.cuda.max_memory_allocated()} B" - msg += max_mem_reserved_str + "," + max_mem_allocated_str + if paddle.device.is_compiled_with_cuda(): + max_mem_reserved_str = f" max_mem_reserved: {paddle.device.cuda.max_memory_reserved() // (1024 ** 2)} MB" + max_mem_allocated_str = f" max_mem_allocated: {paddle.device.cuda.max_memory_allocated() // (1024 ** 2)} MB" + msg += max_mem_reserved_str + "," + max_mem_allocated_str logger.info(msg) From 39ba32fafb6fc80311ab3f4f6998e52f6583c12e Mon Sep 17 00:00:00 2001 From: Color_yr <402067010@qq.com> Date: Tue, 16 Jan 2024 20:11:24 +0800 Subject: [PATCH 04/11] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E7=BC=96=E7=A0=81=E8=AF=BB=E5=8F=96=20(#3606)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixed #3605 --- paddlespeech/server/engine/tts/online/onnx/tts_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py index 9dd31a08b..14204dde7 100644 --- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py @@ -154,7 +154,7 @@ class TTSServerExecutor(TTSExecutor): self.voc_sess = get_sess(self.voc_ckpt, voc_sess_conf) logger.debug("Create voc sess successfully.") - with open(self.phones_dict, "r") as f: + with open(self.phones_dict, "r", encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] self.vocab_size = len(phn_id) logger.debug(f"vocab_size: {self.vocab_size}") From 02a5f7bce84e331620cf8775254c601627516450 Mon Sep 17 00:00:00 2001 From: JeffLu Date: Mon, 26 Feb 2024 10:59:55 +0800 Subject: [PATCH 
05/11] bugfix: audio_len should be 1D, no 0D, which will raise list index out (#3490) of range error in the following decode process Co-authored-by: Luzhenhui --- paddlespeech/cli/asr/infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 231a00f4d..4001f957f 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -274,7 +274,7 @@ class ASRExecutor(BaseExecutor): # fbank audio = preprocessing(audio, **preprocess_args) - audio_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0) + audio_len = paddle.to_tensor([audio.shape[0]]).unsqueeze(axis=0) audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0) self._inputs["audio"] = audio From 2147d3b565c7db99dcdaa2db9dae52bee375d0f7 Mon Sep 17 00:00:00 2001 From: satani99 <42287151+satani99@users.noreply.github.com> Date: Mon, 26 Feb 2024 08:30:28 +0530 Subject: [PATCH 06/11] Update README.md (#3532) Fixed a typo --- demos/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/README.md b/demos/README.md index a41967864..6f9cd2e41 100644 --- a/demos/README.md +++ b/demos/README.md @@ -18,4 +18,4 @@ This directory contains many speech applications in multiple scenarios. * style_fs2 - multi style control for FastSpeech2 model * text_to_speech - convert text into speech * self supervised pretraining - speech feature extraction and speech recognition based on wav2vec2 -* Wishper - speech recognize and translate based on Whisper model +* Whisper - speech recognize and translate based on Whisper model From bcbb85af7668a17c6498200f4675a6ac41d868f6 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Thu, 23 May 2024 19:34:04 +0800 Subject: [PATCH 07/11] fixed version for paddlepaddle. (#3701) * fixed version for paddlepaddle. 
* fix code style --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index af7c4dc3d..8e81da6d4 100644 --- a/setup.py +++ b/setup.py @@ -53,6 +53,7 @@ base = [ "pandas", "paddleaudio>=1.1.0", "paddlenlp>=2.4.8", + "paddlepaddle==2.5.1", "paddleslim>=2.3.4", "ppdiffusers>=0.9.0", "paddlespeech_feat", From 03022f2170ce76d2ca8385a92aa8df3519e2366b Mon Sep 17 00:00:00 2001 From: mjxs <52824616+kk-2000@users.noreply.github.com> Date: Tue, 4 Jun 2024 10:34:39 +0800 Subject: [PATCH 08/11] =?UTF-8?q?=E3=80=90Fix=20Speech=20Issue=20No.5?= =?UTF-8?q?=E3=80=91issue=203444=20transformation=20import=20error=20(#377?= =?UTF-8?q?9)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix paddlespeech.s2t.transform.transformation import error * fix paddlespeech.s2t.transform import error --- audio/tests/features/base.py | 2 +- audio/tests/features/test_istft.py | 4 ++-- audio/tests/features/test_log_melspectrogram.py | 2 +- audio/tests/features/test_spectrogram.py | 2 +- audio/tests/features/test_stft.py | 2 +- docs/tutorial/asr/tutorial_transformer.ipynb | 4 ++-- utils/apply-cmvn.py | 2 +- utils/compute-cmvn-stats.py | 2 +- utils/copy-feats.py | 2 +- utils/feat-to-shape.py | 2 +- 10 files changed, 12 insertions(+), 12 deletions(-) diff --git a/audio/tests/features/base.py b/audio/tests/features/base.py index d183b72ad..3bb1d1dde 100644 --- a/audio/tests/features/base.py +++ b/audio/tests/features/base.py @@ -37,7 +37,7 @@ class FeatTest(unittest.TestCase): self.waveform, self.sr = load(os.path.abspath(os.path.basename(url))) self.waveform = self.waveform.astype( np.float32 - ) # paddlespeech.s2t.transform.spectrogram only supports float32 + ) # paddlespeech.audio.transform.spectrogram only supports float32 dim = len(self.waveform.shape) assert dim in [1, 2] diff --git a/audio/tests/features/test_istft.py b/audio/tests/features/test_istft.py index 9cf8cdd65..ea1ee5cb6 100644 --- 
a/audio/tests/features/test_istft.py +++ b/audio/tests/features/test_istft.py @@ -18,8 +18,8 @@ import paddle from paddleaudio.functional.window import get_window from .base import FeatTest -from paddlespeech.s2t.transform.spectrogram import IStft -from paddlespeech.s2t.transform.spectrogram import Stft +from paddlespeech.audio.transform.spectrogram import IStft +from paddlespeech.audio.transform.spectrogram import Stft class TestIstft(FeatTest): diff --git a/audio/tests/features/test_log_melspectrogram.py b/audio/tests/features/test_log_melspectrogram.py index 7d5680387..b2765d3be 100644 --- a/audio/tests/features/test_log_melspectrogram.py +++ b/audio/tests/features/test_log_melspectrogram.py @@ -18,7 +18,7 @@ import paddle import paddleaudio from .base import FeatTest -from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogram +from paddlespeech.audio.transform.spectrogram import LogMelSpectrogram class TestLogMelSpectrogram(FeatTest): diff --git a/audio/tests/features/test_spectrogram.py b/audio/tests/features/test_spectrogram.py index 5fe5afee1..6f4609632 100644 --- a/audio/tests/features/test_spectrogram.py +++ b/audio/tests/features/test_spectrogram.py @@ -18,7 +18,7 @@ import paddle import paddleaudio from .base import FeatTest -from paddlespeech.s2t.transform.spectrogram import Spectrogram +from paddlespeech.audio.transform.spectrogram import Spectrogram class TestSpectrogram(FeatTest): diff --git a/audio/tests/features/test_stft.py b/audio/tests/features/test_stft.py index 58792ffe2..9511a2926 100644 --- a/audio/tests/features/test_stft.py +++ b/audio/tests/features/test_stft.py @@ -18,7 +18,7 @@ import paddle from paddleaudio.functional.window import get_window from .base import FeatTest -from paddlespeech.s2t.transform.spectrogram import Stft +from paddlespeech.audio.transform.spectrogram import Stft class TestStft(FeatTest): diff --git a/docs/tutorial/asr/tutorial_transformer.ipynb b/docs/tutorial/asr/tutorial_transformer.ipynb index 
dc3030061..77aed4bf8 100644 --- a/docs/tutorial/asr/tutorial_transformer.ipynb +++ b/docs/tutorial/asr/tutorial_transformer.ipynb @@ -236,8 +236,8 @@ "warnings.filterwarnings('ignore')\n", "\n", "from yacs.config import CfgNode\n", - "from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogramKaldi\n", - "from paddlespeech.s2t.transform.cmvn import GlobalCMVN\n", + "from paddlespeech.audio.transform.spectrogram import LogMelSpectrogramKaldi\n", + "from paddlespeech.audio.transform.cmvn import GlobalCMVN\n", "from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer\n", "from paddlespeech.s2t.models.u2 import U2Model\n", "\n", diff --git a/utils/apply-cmvn.py b/utils/apply-cmvn.py index cf91bdfcd..fa69ff8e0 100755 --- a/utils/apply-cmvn.py +++ b/utils/apply-cmvn.py @@ -6,7 +6,7 @@ import kaldiio import numpy from distutils.util import strtobool -from paddlespeech.s2t.transform.cmvn import CMVN +from paddlespeech.audio.transform.cmvn import CMVN from paddlespeech.s2t.utils.cli_readers import file_reader_helper from paddlespeech.s2t.utils.cli_utils import get_commandline_args from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style diff --git a/utils/compute-cmvn-stats.py b/utils/compute-cmvn-stats.py index 276bcd36e..763347ce8 100755 --- a/utils/compute-cmvn-stats.py +++ b/utils/compute-cmvn-stats.py @@ -5,7 +5,7 @@ import logging import kaldiio import numpy as np -from paddlespeech.s2t.transform.transformation import Transformation +from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.utils.cli_readers import file_reader_helper from paddlespeech.s2t.utils.cli_utils import get_commandline_args from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style diff --git a/utils/copy-feats.py b/utils/copy-feats.py index dc7a70b45..89ea30f97 100755 --- a/utils/copy-feats.py +++ b/utils/copy-feats.py @@ -4,7 +4,7 @@ import logging from distutils.util import strtobool -from 
paddlespeech.s2t.transform.transformation import Transformation +from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.utils.cli_readers import file_reader_helper from paddlespeech.s2t.utils.cli_utils import get_commandline_args from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style diff --git a/utils/feat-to-shape.py b/utils/feat-to-shape.py index bbc9242f4..e5e014ded 100755 --- a/utils/feat-to-shape.py +++ b/utils/feat-to-shape.py @@ -3,7 +3,7 @@ import argparse import logging import sys -from paddlespeech.s2t.transform.transformation import Transformation +from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.utils.cli_readers import file_reader_helper from paddlespeech.s2t.utils.cli_utils import get_commandline_args from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style From 09e5d8a4ac03f29c2ce6511e1a3c39136cd3e29b Mon Sep 17 00:00:00 2001 From: Mattheliu Date: Wed, 5 Jun 2024 10:41:32 +0800 Subject: [PATCH 09/11] =?UTF-8?q?=E3=80=90Fix=20Speech=20Issue=20No.8?= =?UTF-8?q?=E3=80=91issue=203652=20merge=5Fyi=20function=20has=20a=20bug?= =?UTF-8?q?=20(#3786)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 【Fix Speech Issue No.8】issue 3652 merge_yi function has a bug * 【Fix Speech Issue No.8】issue 3652 merge_yi function has a bug --- paddlespeech/t2s/frontend/tone_sandhi.py | 25 ++++++++++-------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py index 690f69aa2..3558064cd 100644 --- a/paddlespeech/t2s/frontend/tone_sandhi.py +++ b/paddlespeech/t2s/frontend/tone_sandhi.py @@ -237,30 +237,25 @@ class ToneSandhi(): # output seg: [['听一听', 'v']] def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg = [] + skip_next = False # function 1 for i, (word, pos) in enumerate(seg): - if i - 1 >= 0 and word == 
"一" and i + 1 < len(seg) and seg[i - 1][ - 0] == seg[i + 1][0] and seg[i - 1][1] == "v": - if i - 1 < len(new_seg): - new_seg[i - - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0] - else: - new_seg.append([word, pos]) - new_seg.append([seg[i + 1][0], pos]) + if skip_next: + skip_next = False + continue + if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v": + new_seg[-1] = (new_seg[-1][0] + "一" + seg[i + 1][0], new_seg[-1][1]) + skip_next = True else: - if i - 2 >= 0 and seg[i - 1][0] == "一" and seg[i - 2][ - 0] == word and pos == "v": - continue - else: - new_seg.append([word, pos]) + new_seg.append((word, pos)) seg = new_seg new_seg = [] # function 2 for i, (word, pos) in enumerate(seg): if new_seg and new_seg[-1][0] == "一": - new_seg[-1][0] = new_seg[-1][0] + word + new_seg[-1] = (new_seg[-1][0] + word, new_seg[-1][1]) else: - new_seg.append([word, pos]) + new_seg.append((word, pos)) return new_seg # the first and the second words are all_tone_three From 05660a62cb2f56c1af0773be06a75d8dbc18df20 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Wed, 5 Jun 2024 14:28:14 +0800 Subject: [PATCH 10/11] =?UTF-8?q?=E3=80=90test=E3=80=91add=20cli=20test=20?= =?UTF-8?q?readme=20(#3784)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add cli test readme * fix code style --- tests/unit/cli/test_cli.sh | 2 ++ tests/unit/doc/test_cli.md | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 tests/unit/doc/test_cli.md diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index a7f7d11e4..3bc2eae2f 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -110,5 +110,7 @@ paddlespeech whisper --task transcribe --input ./zh.wav # whisper recognize text and translate to English paddlespeech whisper --task translate --input ./zh.wav +# to change model English-Only model +paddlespeech whisper --lang 
en --size base --task transcribe --input ./en.wav echo -e "\033[32mTest success !!!\033[0m" diff --git a/tests/unit/doc/test_cli.md b/tests/unit/doc/test_cli.md new file mode 100644 index 000000000..34a0c016a --- /dev/null +++ b/tests/unit/doc/test_cli.md @@ -0,0 +1,29 @@ +# test CLI 测试文档 + + 该文档为 CLI 测试说明,该测试目前覆盖大部分 paddlespeech 中的 CLI 推理。该 CI 建立后用于快速验证修复是否正确。 + + # 测试流程 + ## 1. 环境安装 + + CI 重建时在已有通过版本 paddlepaddle-gpu==2.5.1, paddlespeech==develop 下运行。 + + CI 重建后在 paddlepaddle-gpu==develop, paddlespeech==develop 下运行。 + + ### 其他相关依赖 + + gcc >= 4.8.5, + python >= 3.8 + + ## 2. 功能测试 + + 在 repo 的 tests/unit/cli 中运行: + + ```shell + + source path.sh + bash test_cli.sh + + ``` +## 3. 预期结果 + + 输出 "Test success",且运行过程中无报错或 Error 即为成功。 From 72ce8861779cc7fef9eb3277217878fd65375c58 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Thu, 6 Jun 2024 15:26:16 +0800 Subject: [PATCH 11/11] =?UTF-8?q?=E3=80=90test=E3=80=91fix=20test=20cli=20?= =?UTF-8?q?bug=20(#3793)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add cli test readme * fix code style * fix bug --- tests/unit/cli/test_cli.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 3bc2eae2f..3903e6597 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -10,11 +10,12 @@ paddlespeech cls --input ./cat.wav --topk 10 paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 --model ernie_linear_p3_wudao_fast # Speech SSL +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav paddlespeech ssl --task asr --lang en --input ./en.wav paddlespeech ssl --task vector --lang en --input ./en.wav # Speech_recognition -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav 
https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav paddlespeech asr --input ./zh.wav paddlespeech asr --model conformer_aishell --input ./zh.wav paddlespeech asr --model conformer_online_aishell --input ./zh.wav