From a196c052cb7f7ca4efed4193229bcabd2e56d29c Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 17 Jan 2025 17:00:47 +0800 Subject: [PATCH 01/46] Fix (#3974) --- tests/unit/tts/test_pwg.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/tts/test_pwg.py b/tests/unit/tts/test_pwg.py index 10c82c9fd..bcdb5aafc 100644 --- a/tests/unit/tts/test_pwg.py +++ b/tests/unit/tts/test_pwg.py @@ -14,16 +14,16 @@ import paddle import torch from paddle.device.cuda import synchronize +from parallel_wavegan import models as pwgan from parallel_wavegan.layers import residual_block from parallel_wavegan.layers import upsample -from parallel_wavegan.models import parallel_wavegan as pwgan from timer import timer from paddlespeech.t2s.models.parallel_wavegan import ConvInUpsampleNet from paddlespeech.t2s.models.parallel_wavegan import PWGDiscriminator from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator -from paddlespeech.t2s.models.parallel_wavegan import ResidualBlock from paddlespeech.t2s.models.parallel_wavegan import ResidualPWGDiscriminator +from paddlespeech.t2s.modules.residual_block import WaveNetResidualBlock from paddlespeech.t2s.utils.layer_tools import summary paddle.set_device("gpu:0") @@ -79,8 +79,8 @@ def test_convin_upsample_net(): def test_residual_block(): - net = ResidualBlock(dilation=9) - net2 = residual_block.ResidualBlock(dilation=9) + net = WaveNetResidualBlock(dilation=9) + net2 = residual_block.WaveNetResidualBlock(dilation=9) summary(net) summary(net2) for k, v in net2.named_parameters(): From 65dbf46cdbdc681af3d86497d468f4ccb8a89e91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Fri, 17 Jan 2025 17:06:09 +0800 Subject: [PATCH 02/46] =?UTF-8?q?=E3=80=90Hackathon=208th=20No.7=E3=80=91P?= =?UTF-8?q?ython=E7=89=88=E6=9C=AC=E9=80=82=E9=85=8D=204=20(#3970)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update setup.py * auto scipy * auto matplotlib * Update setup.py * Apply suggestions from code review --- setup.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 59a3e7db1..fa53b5d7e 100644 --- a/setup.py +++ b/setup.py @@ -51,6 +51,26 @@ def determine_opencc_version(): return "opencc" # default +def determine_scipy_version(): + # get python version + python_version = f"{sys.version_info.major}.{sys.version_info.minor}" + + # determine scipy version + if python_version == "3.8": + return "scipy>=1.4.0, <=1.12.0" # Python3.8 need scipy>=1.4.0, <=1.12.0 + return "scipy" # default + + +def determine_matplotlib_version(): + # get python version + python_version = f"{sys.version_info.major}.{sys.version_info.minor}" + + # determine matplotlib version + if python_version == "3.8" or python_version == "3.9": + return "matplotlib<=3.8.4" # Python3.8/9 need matplotlib<=3.8.4 + return "matplotlib" # default + + base = [ "braceexpand", "editdistance", @@ -63,9 +83,9 @@ base = [ # paddleaudio align with librosa==0.8.1, which need numpy==1.23.x "numpy==1.23.5", "librosa==0.8.1", - "scipy>=1.4.0, <=1.12.0", + determine_scipy_version(), # scipy or scipy>=1.4.0, <=1.12.0 "loguru", - "matplotlib<=3.8.4", + determine_matplotlib_version(), # matplotlib or matplotlib<=3.8.4 "nara_wpe", "onnxruntime>=1.11.0", determine_opencc_version(), # opencc or opencc==1.1.6 @@ -92,7 +112,7 @@ base = [ "ToJyutping", "typeguard", "webrtcvad", - "yacs~=0.1.8", + "yacs>=0.1.8", 
"zhon", ] From 85de840d073533795d5b2f83bedd62acb5b6dc4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Tue, 21 Jan 2025 11:21:30 +0800 Subject: [PATCH 03/46] =?UTF-8?q?=E3=80=90Hackathon=208th=20No.7=E3=80=91P?= =?UTF-8?q?ython=E7=89=88=E6=9C=AC=E9=80=82=E9=85=8D=203=20(#3969)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update setup.py * add optional * fit with praatio>=6.0.0 * Apply suggestions from code review * Apply suggestions from code review * Apply suggestions from code review --- examples/other/g2p/get_g2p_data.py | 2 +- paddlespeech/server/restful/request.py | 2 +- paddlespeech/server/restful/response.py | 3 ++- paddlespeech/t2s/exps/ernie_sat/align.py | 4 ++-- setup.py | 4 ++-- utils/gen_duration_from_textgrid.py | 2 +- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/other/g2p/get_g2p_data.py b/examples/other/g2p/get_g2p_data.py index 8fa3e53cd..87e7b9fdc 100644 --- a/examples/other/g2p/get_g2p_data.py +++ b/examples/other/g2p/get_g2p_data.py @@ -32,7 +32,7 @@ def get_baker_data(root_dir): alignment_fp, includeEmptyIntervals=True) # only with baker's annotation utt_id = alignment.tierNameList[0].split(".")[0] - intervals = alignment.tierDict[alignment.tierNameList[0]].entryList + intervals = alignment.getTier(alignment.tierNameList[0]).entries phones = [] for interval in intervals: label = interval.label diff --git a/paddlespeech/server/restful/request.py b/paddlespeech/server/restful/request.py index b7a32481f..068694de3 100644 --- a/paddlespeech/server/restful/request.py +++ b/paddlespeech/server/restful/request.py @@ -65,7 +65,7 @@ class TTSRequest(BaseModel): speed: float = 1.0 volume: float = 1.0 sample_rate: int = 0 - save_path: str = None + save_path: Optional[str] = None #****************************************************************************************/ diff --git a/paddlespeech/server/restful/response.py b/paddlespeech/server/restful/response.py index 3d991de43..12b264c02 100644 --- a/paddlespeech/server/restful/response.py +++ b/paddlespeech/server/restful/response.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from typing import List +from typing import Optional from pydantic import BaseModel @@ -62,7 +63,7 @@ class TTSResult(BaseModel): volume: float = 1.0 sample_rate: int duration: float - save_path: str = None + save_path: Optional[str] = None audio: str diff --git a/paddlespeech/t2s/exps/ernie_sat/align.py b/paddlespeech/t2s/exps/ernie_sat/align.py index a802d0295..e7c8083a8 100755 --- a/paddlespeech/t2s/exps/ernie_sat/align.py +++ b/paddlespeech/t2s/exps/ernie_sat/align.py @@ -41,11 +41,11 @@ def _readtg(tg_path: str, lang: str='en', fs: int=24000, n_shift: int=300): ends = [] words = [] - for interval in alignment.tierDict['words'].entryList: + for interval in alignment.getTier('words').entries: word = interval.label if word: words.append(word) - for interval in alignment.tierDict['phones'].entryList: + for interval in alignment.getTier('phones').entries: phone = interval.label phones.append(phone) ends.append(interval.end) diff --git a/setup.py b/setup.py index fa53b5d7e..184205926 100644 --- a/setup.py +++ b/setup.py @@ -96,9 +96,9 @@ base = [ "paddleslim>=2.3.4", "ppdiffusers>=0.9.0", "paddlespeech_feat", - "praatio>=5.0.0, <=5.1.1", + "praatio>=6.0.0", "prettytable", - "pydantic>=1.10.14, <2.0", + "pydantic", "pypinyin<=0.44.0", "pypinyin-dict", "python-dateutil", diff --git a/utils/gen_duration_from_textgrid.py b/utils/gen_duration_from_textgrid.py index 9ee0c05cc..54427665a 100755 --- a/utils/gen_duration_from_textgrid.py +++ b/utils/gen_duration_from_textgrid.py @@ -26,7 +26,7 @@ def readtg(tg_path, sample_rate=24000, n_shift=300): alignment = textgrid.openTextgrid(tg_path, includeEmptyIntervals=True) phones = [] ends = [] - for interval in alignment.tierDict["phones"].entryList: + for interval in alignment.getTier("phones").entries: phone = interval.label phones.append(phone) ends.append(interval.end) From 76cd9db6c579f3a0975c2f1900b80e5c86109af6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Wed, 22 Jan 2025 16:53:53 +0800 Subject: [PATCH 04/46] def PythonDetermine in setup.py (#3975) * extract python version * Update setup.py --- setup.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 184205926..8e8265749 100644 --- a/setup.py +++ b/setup.py @@ -32,6 +32,14 @@ VERSION = '0.0.0' COMMITID = 'none' +def determine_python_version(): + """ + Determine the current python version. The function return a string such as '3.7'. 
+ """ + python_version = f"{sys.version_info.major}.{sys.version_info.minor}" + return python_version + + def determine_opencc_version(): # get gcc version gcc_version = None @@ -53,7 +61,7 @@ def determine_opencc_version(): def determine_scipy_version(): # get python version - python_version = f"{sys.version_info.major}.{sys.version_info.minor}" + python_version = determine_python_version() # determine scipy version if python_version == "3.8": @@ -63,7 +71,7 @@ def determine_scipy_version(): def determine_matplotlib_version(): # get python version - python_version = f"{sys.version_info.major}.{sys.version_info.minor}" + python_version = determine_python_version() # determine matplotlib version if python_version == "3.8" or python_version == "3.9": From 69985c28698587c881eaf0c5a55519db4a570b13 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Thu, 23 Jan 2025 15:04:52 +0800 Subject: [PATCH 05/46] Fix readme (#3978) * Update README.md * Update README_cn.md --- README.md | 4 ++-- README_cn.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 00367d787..39cb1bc9d 100644 --- a/README.md +++ b/README.md @@ -228,12 +228,12 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision ## Installation -We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.8* and *paddlepaddle<=2.5.1*. Some new versions of Paddle do not have support for adaptation in PaddleSpeech, so currently only versions 2.5.1 and earlier can be supported. +We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.8*. ### **Dependency Introduction** + gcc >= 4.8.5 -+ paddlepaddle <= 2.5.1 ++ paddlepaddle + python >= 3.8 + OS support: Linux(recommend), Windows, Mac OSX diff --git a/README_cn.md b/README_cn.md index d70940dd2..a644e4c9f 100644 --- a/README_cn.md +++ b/README_cn.md @@ -238,11 +238,11 @@ ## 安装 -我们强烈建议用户在 **Linux** 环境下,*3.8* 以上版本的 *python* 上安装 PaddleSpeech。同时,有一些Paddle新版本的内容没有在做适配的支持,因此目前只能使用2.5.1及之前的版本。 +我们强烈建议用户在 **Linux** 环境下,*3.8* 以上版本的 *python* 上安装 PaddleSpeech。 ### 相关依赖 + gcc >= 4.8.5 -+ paddlepaddle <= 2.5.1 ++ paddlepaddle + python >= 3.8 + linux(推荐), mac, windows From cb0ba54d6ed592d8e00d4db5a32aa24a00b7477b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Thu, 23 Jan 2025 15:11:14 +0800 Subject: [PATCH 06/46] =?UTF-8?q?=E3=80=90Hackathon=208th=20No.7=E3=80=91P?= =?UTF-8?q?ython=E7=89=88=E6=9C=AC=E9=80=82=E9=85=8D=205=20(#3972)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update setup.py * fit with pypinyin * Apply suggestions from code review * Apply suggestions from code review * Update tone_sandhi.py * Apply suggestions from code review --- paddlespeech/t2s/frontend/tone_sandhi.py | 31 ++++++++++++++++-------- paddlespeech/t2s/frontend/zh_frontend.py | 5 ++++ setup.py | 2 +- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py index 3558064cd..d8688115b 100644 --- a/paddlespeech/t2s/frontend/tone_sandhi.py +++ b/paddlespeech/t2s/frontend/tone_sandhi.py @@ -243,8 +243,10 @@ class ToneSandhi(): if skip_next: skip_next = False continue - if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v": - new_seg[-1] = (new_seg[-1][0] + "一" + seg[i + 1][0], new_seg[-1][1]) + if i - 1 >= 0 and word == "一" and i 
+ 1 < len(seg) and seg[i - 1][ + 0] == seg[i + 1][0] and seg[i - 1][1] == "v": + new_seg[-1] = (new_seg[-1][0] + "一" + seg[i + 1][0], + new_seg[-1][1]) skip_next = True else: new_seg.append((word, pos)) @@ -262,11 +264,16 @@ class ToneSandhi(): def _merge_continuous_three_tones( self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg = [] - sub_finals_list = [ - lazy_pinyin( + sub_finals_list = [] + for (word, pos) in seg: + orig_finals = lazy_pinyin( word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) - for (word, pos) in seg - ] + # after pypinyin==0.44.0, '嗯' need to be n2, cause the initial and final consonants cannot be empty at the same time + en_index = [index for index, c in enumerate(word) if c == "嗯"] + for i in en_index: + orig_finals[i] = "n2" + sub_finals_list.append(orig_finals) + assert len(sub_finals_list) == len(seg) merge_last = [False] * len(seg) for i, (word, pos) in enumerate(seg): @@ -292,11 +299,15 @@ class ToneSandhi(): def _merge_continuous_three_tones_2( self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg = [] - sub_finals_list = [ - lazy_pinyin( + sub_finals_list = [] + for (word, pos) in seg: + orig_finals = lazy_pinyin( word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) - for (word, pos) in seg - ] + # after pypinyin==0.44.0, '嗯' need to be n2, cause the initial and final consonants cannot be empty at the same time + en_index = [index for index, c in enumerate(word) if c == "嗯"] + for i in en_index: + orig_finals[i] = "n2" + sub_finals_list.append(orig_finals) assert len(sub_finals_list) == len(seg) merge_last = [False] * len(seg) for i, (word, pos) in enumerate(seg): diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 1431bc6d8..95c75a7f0 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -173,6 +173,11 @@ class Frontend(): word, neutral_tone_with_five=True, style=Style.INITIALS) orig_finals = lazy_pinyin( word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) + # after pypinyin==0.44.0, '嗯' need to be n2, cause the initial and final consonants cannot be empty at the same time + en_index = [index for index, c in enumerate(word) if c == "嗯"] + for i in en_index: + orig_finals[i] = "n2" + for c, v in zip(orig_initials, orig_finals): if re.match(r'i\d', v): if c in ['z', 'c', 's']: diff --git a/setup.py b/setup.py index 8e8265749..8c2a4c1b7 100644 --- a/setup.py +++ b/setup.py @@ -107,7 +107,7 @@ base = [ "praatio>=6.0.0", "prettytable", "pydantic", - "pypinyin<=0.44.0", + "pypinyin", "pypinyin-dict", "python-dateutil", "pyworld>=0.2.12", From 675863ba662a376b302b2934efca57e6e780e913 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 23 Jan 2025 16:40:49 +0800 Subject: [PATCH 07/46] Fix (#3976) --- tests/unit/tts/test_snapshot.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/unit/tts/test_snapshot.py b/tests/unit/tts/test_snapshot.py index 6ceff3e5a..fb18c7d78 100644 --- a/tests/unit/tts/test_snapshot.py +++ b/tests/unit/tts/test_snapshot.py @@ -19,10 +19,11 @@ from paddle.optimizer import Adam from paddlespeech.t2s.training.extensions.snapshot import Snapshot from paddlespeech.t2s.training.trainer import Trainer -from paddlespeech.t2s.training.updater import StandardUpdater +# from paddlespeech.t2s.training.updater import StandardUpdater -def test_snapshot(): + +def _test_snapshot(): model = nn.Linear(3, 4) optimizer = Adam(parameters=model.parameters()) From 
59d641bc14bf5a8532b8643e0655af46f7a73173 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Mon, 27 Jan 2025 11:20:05 +0800 Subject: [PATCH 08/46] =?UTF-8?q?=E3=80=90Hackathon=208th=20No.7=E3=80=91A?= =?UTF-8?q?dd=20hints=20for=20installing=20with=20-e=20(#3979)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update README_cn.md * Update README.md --- README.md | 2 ++ README_cn.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/README.md b/README.md index 39cb1bc9d..6594a4b8f 100644 --- a/README.md +++ b/README.md @@ -265,6 +265,8 @@ git clone https://github.com/PaddlePaddle/PaddleSpeech.git cd PaddleSpeech pip install pytest-runner pip install . +# If you need to install in editable mode, you need to use --use-pep517. The command is as follows: +# pip install -e . --use-pep517 ``` For more installation problems, such as conda environment, librosa-dependent, gcc problems, kaldi installation, etc., you can refer to this [installation document](./docs/source/install.md). If you encounter problems during installation, you can leave a message on [#2150](https://github.com/PaddlePaddle/PaddleSpeech/issues/2150) and find related problems diff --git a/README_cn.md b/README_cn.md index a644e4c9f..5b95a2879 100644 --- a/README_cn.md +++ b/README_cn.md @@ -272,6 +272,8 @@ git clone https://github.com/PaddlePaddle/PaddleSpeech.git cd PaddleSpeech pip install pytest-runner pip install . +# 如果需要在可编辑模式下安装,需要使用 --use-pep517,命令如下 +# pip install -e . --use-pep517 ``` 更多关于安装问题,如 conda 环境,librosa 依赖的系统库,gcc 环境问题,kaldi 安装等,可以参考这篇[安装文档](docs/source/install_cn.md),如安装上遇到问题可以在 [#2150](https://github.com/PaddlePaddle/PaddleSpeech/issues/2150) 上留言以及查找相关问题 From bb77a7f7db286f62f520a1055bae1292809d51bc Mon Sep 17 00:00:00 2001 From: co63oc Date: Mon, 27 Jan 2025 11:35:49 +0800 Subject: [PATCH 09/46] Fix (#3980) --- .github/ISSUE_TEMPLATE/bug-report-s2t.md | 2 +- .github/ISSUE_TEMPLATE/bug-report-tts.md | 2 +- audio/paddleaudio/backends/soundfile_backend.py | 2 +- audio/paddleaudio/compliance/kaldi.py | 6 +++--- audio/paddleaudio/datasets/dataset.py | 2 +- audio/paddleaudio/datasets/esc50.py | 2 +- audio/paddleaudio/datasets/gtzan.py | 2 +- audio/paddleaudio/datasets/tess.py | 2 +- audio/paddleaudio/datasets/urban_sound.py | 2 +- audio/paddleaudio/datasets/voxceleb.py | 4 ++-- audio/paddleaudio/features/layers.py | 2 +- audio/paddleaudio/functional/functional.py | 2 +- audio/paddleaudio/metric/eer.py | 4 ++-- audio/paddleaudio/sox_effects/sox_effects.py | 14 +++++++------- .../src/pybind/kaldi/feature_common_inl.h | 4 ++-- .../src/pybind/kaldi/kaldi_feature_wrapper.cc | 2 +- audio/paddleaudio/src/pybind/sox/effects.cpp | 10 +++++----- audio/paddleaudio/src/pybind/sox/effects_chain.cpp | 4 ++-- audio/paddleaudio/src/pybind/sox/utils.cpp | 4 ++-- audio/paddleaudio/src/pybind/sox/utils.h | 2 +- audio/paddleaudio/third_party/sox/CMakeLists.txt | 4 ++-- audio/paddleaudio/utils/download.py | 4 ++-- audio/paddleaudio/utils/log.py | 2 +- audio/paddleaudio/utils/sox_utils.py | 2 +- audio/paddleaudio/utils/tensor_utils.py | 8 ++++---- 25 files changed, 47 insertions(+), 47 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug-report-s2t.md b/.github/ISSUE_TEMPLATE/bug-report-s2t.md index 512cdbb01..e9732ad8c 100644 --- a/.github/ISSUE_TEMPLATE/bug-report-s2t.md +++ b/.github/ISSUE_TEMPLATE/bug-report-s2t.md @@ -33,7 +33,7 @@ If applicable, add screenshots to help explain your problem. 
- Python Version [e.g. 3.7] - PaddlePaddle Version [e.g. 2.0.0] - Model Version [e.g. 2.0.0] - - GPU/DRIVER Informationo [e.g. Tesla V100-SXM2-32GB/440.64.00] + - GPU/DRIVER Information [e.g. Tesla V100-SXM2-32GB/440.64.00] - CUDA/CUDNN Version [e.g. cuda-10.2] - MKL Version - TensorRT Version diff --git a/.github/ISSUE_TEMPLATE/bug-report-tts.md b/.github/ISSUE_TEMPLATE/bug-report-tts.md index e2322c239..b4c5dabdd 100644 --- a/.github/ISSUE_TEMPLATE/bug-report-tts.md +++ b/.github/ISSUE_TEMPLATE/bug-report-tts.md @@ -32,7 +32,7 @@ If applicable, add screenshots to help explain your problem. - Python Version [e.g. 3.7] - PaddlePaddle Version [e.g. 2.0.0] - Model Version [e.g. 2.0.0] - - GPU/DRIVER Informationo [e.g. Tesla V100-SXM2-32GB/440.64.00] + - GPU/DRIVER Information [e.g. Tesla V100-SXM2-32GB/440.64.00] - CUDA/CUDNN Version [e.g. cuda-10.2] - MKL Version - TensorRT Version diff --git a/audio/paddleaudio/backends/soundfile_backend.py b/audio/paddleaudio/backends/soundfile_backend.py index 9195ea097..dcd2b4b1e 100644 --- a/audio/paddleaudio/backends/soundfile_backend.py +++ b/audio/paddleaudio/backends/soundfile_backend.py @@ -61,7 +61,7 @@ def resample(y: np.ndarray, if mode == 'kaiser_best': warnings.warn( f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. This function is pretty slow, \ - we recommend the mode kaiser_fast in large scale audio trainning') + we recommend the mode kaiser_fast in large scale audio training') if not isinstance(y, np.ndarray): raise ParameterError( diff --git a/audio/paddleaudio/compliance/kaldi.py b/audio/paddleaudio/compliance/kaldi.py index eb92ec1f2..a94ec4053 100644 --- a/audio/paddleaudio/compliance/kaldi.py +++ b/audio/paddleaudio/compliance/kaldi.py @@ -233,7 +233,7 @@ def spectrogram(waveform: Tensor, round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input to FFT. Defaults to True. sr (int, optional): Sample rate of input waveform. Defaults to 16000. - snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it + snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a signal frame when it is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. window_type (str, optional): Choose type of window for FFT computation. Defaults to "povey". @@ -443,7 +443,7 @@ def fbank(waveform: Tensor, round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input to FFT. Defaults to True. sr (int, optional): Sample rate of input waveform. Defaults to 16000. - snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it + snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a signal frame when it is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False. @@ -566,7 +566,7 @@ def mfcc(waveform: Tensor, round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input to FFT. Defaults to True. sr (int, optional): Sample rate of input waveform. Defaults to 16000. 
- snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it + snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a signal frame when it is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False. diff --git a/audio/paddleaudio/datasets/dataset.py b/audio/paddleaudio/datasets/dataset.py index f1dfc1ea3..170e91669 100644 --- a/audio/paddleaudio/datasets/dataset.py +++ b/audio/paddleaudio/datasets/dataset.py @@ -47,7 +47,7 @@ class AudioClassificationDataset(paddle.io.Dataset): files (:obj:`List[str]`): A list of absolute path of audio files. labels (:obj:`List[int]`): Labels of audio files. feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. + It identifies the feature type that user wants to extract of an audio file. """ super(AudioClassificationDataset, self).__init__() diff --git a/audio/paddleaudio/datasets/esc50.py b/audio/paddleaudio/datasets/esc50.py index e7477d40e..fd8c8503e 100644 --- a/audio/paddleaudio/datasets/esc50.py +++ b/audio/paddleaudio/datasets/esc50.py @@ -117,7 +117,7 @@ class ESC50(AudioClassificationDataset): split (:obj:`int`, `optional`, defaults to 1): It specify the fold of dev dataset. feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. + It identifies the feature type that user wants to extract of an audio file. """ files, labels = self._get_data(mode, split) super(ESC50, self).__init__( diff --git a/audio/paddleaudio/datasets/gtzan.py b/audio/paddleaudio/datasets/gtzan.py index cfea6f37e..a76e9208e 100644 --- a/audio/paddleaudio/datasets/gtzan.py +++ b/audio/paddleaudio/datasets/gtzan.py @@ -67,7 +67,7 @@ class GTZAN(AudioClassificationDataset): split (:obj:`int`, `optional`, defaults to 1): It specify the fold of dev dataset. feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. + It identifies the feature type that user wants to extract of an audio file. """ assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}' files, labels = self._get_data(mode, seed, n_folds, split) diff --git a/audio/paddleaudio/datasets/tess.py b/audio/paddleaudio/datasets/tess.py index 8faab9c39..e34eaea37 100644 --- a/audio/paddleaudio/datasets/tess.py +++ b/audio/paddleaudio/datasets/tess.py @@ -76,7 +76,7 @@ class TESS(AudioClassificationDataset): split (:obj:`int`, `optional`, defaults to 1): It specify the fold of dev dataset. feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. + It identifies the feature type that user wants to extract of an audio file. 
""" assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}' files, labels = self._get_data(mode, seed, n_folds, split) diff --git a/audio/paddleaudio/datasets/urban_sound.py b/audio/paddleaudio/datasets/urban_sound.py index d97c4d1dc..43d1b36c4 100644 --- a/audio/paddleaudio/datasets/urban_sound.py +++ b/audio/paddleaudio/datasets/urban_sound.py @@ -68,7 +68,7 @@ class UrbanSound8K(AudioClassificationDataset): split (:obj:`int`, `optional`, defaults to 1): It specify the fold of dev dataset. feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. + It identifies the feature type that user wants to extract of an audio file. """ def _get_meta_info(self): diff --git a/audio/paddleaudio/datasets/voxceleb.py b/audio/paddleaudio/datasets/voxceleb.py index b7160b24c..1fafb5176 100644 --- a/audio/paddleaudio/datasets/voxceleb.py +++ b/audio/paddleaudio/datasets/voxceleb.py @@ -262,8 +262,8 @@ class VoxCeleb(Dataset): split_chunks: bool=True): print(f'Generating csv: {output_file}') header = ["id", "duration", "wav", "start", "stop", "spk_id"] - # Note: this may occurs c++ execption, but the program will execute fine - # so we can ignore the execption + # Note: this may occurs c++ exception, but the program will execute fine + # so we can ignore the exception with Pool(cpu_count()) as p: infos = list( tqdm( diff --git a/audio/paddleaudio/features/layers.py b/audio/paddleaudio/features/layers.py index 292363e64..801ae34ce 100644 --- a/audio/paddleaudio/features/layers.py +++ b/audio/paddleaudio/features/layers.py @@ -34,7 +34,7 @@ __all__ = [ class Spectrogram(nn.Layer): """Compute spectrogram of given signals, typically audio waveforms. - The spectorgram is defined as the complex norm of the short-time Fourier transformation. + The spectrogram is defined as the complex norm of the short-time Fourier transformation. Args: n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. diff --git a/audio/paddleaudio/functional/functional.py b/audio/paddleaudio/functional/functional.py index 19c63a9ae..7c20f9013 100644 --- a/audio/paddleaudio/functional/functional.py +++ b/audio/paddleaudio/functional/functional.py @@ -247,7 +247,7 @@ def create_dct(n_mfcc: int, Args: n_mfcc (int): Number of mel frequency cepstral coefficients. n_mels (int): Number of mel filterbanks. - norm (Optional[str], optional): Normalizaiton type. Defaults to 'ortho'. + norm (Optional[str], optional): Normalization type. Defaults to 'ortho'. dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. Returns: diff --git a/audio/paddleaudio/metric/eer.py b/audio/paddleaudio/metric/eer.py index a1166d3f9..a55695ac1 100644 --- a/audio/paddleaudio/metric/eer.py +++ b/audio/paddleaudio/metric/eer.py @@ -22,8 +22,8 @@ def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]: """Compute EER and return score threshold. 
Args: - labels (np.ndarray): the trial label, shape: [N], one-dimention, N refer to the samples num - scores (np.ndarray): the trial scores, shape: [N], one-dimention, N refer to the samples num + labels (np.ndarray): the trial label, shape: [N], one-dimension, N refer to the samples num + scores (np.ndarray): the trial scores, shape: [N], one-dimension, N refer to the samples num Returns: List[float]: eer and the specific threshold diff --git a/audio/paddleaudio/sox_effects/sox_effects.py b/audio/paddleaudio/sox_effects/sox_effects.py index cb7e1b0b9..aa282b572 100644 --- a/audio/paddleaudio/sox_effects/sox_effects.py +++ b/audio/paddleaudio/sox_effects/sox_effects.py @@ -121,8 +121,8 @@ def apply_effects_tensor( """ tensor_np = tensor.numpy() - ret = paddleaudio._paddleaudio.sox_effects_apply_effects_tensor(tensor_np, sample_rate, - effects, channels_first) + ret = paddleaudio._paddleaudio.sox_effects_apply_effects_tensor( + tensor_np, sample_rate, effects, channels_first) if ret is not None: return (paddle.to_tensor(ret[0]), ret[1]) raise RuntimeError("Failed to apply sox effect") @@ -139,7 +139,7 @@ def apply_effects_file( Note: This function works in the way very similar to ``sox`` command, however there are slight - differences. For example, ``sox`` commnad adds certain effects automatically (such as + differences. For example, ``sox`` command adds certain effects automatically (such as ``rate`` effect after ``speed``, ``pitch`` etc), but this function only applies the given effects. Therefore, to actually apply ``speed`` effect, you also need to give ``rate`` effect with desired sampling rate, because internally, ``speed`` effects only alter sampling @@ -228,14 +228,14 @@ def apply_effects_file( >>> pass """ if hasattr(path, "read"): - ret = paddleaudio._paddleaudio.apply_effects_fileobj(path, effects, normalize, - channels_first, format) + ret = paddleaudio._paddleaudio.apply_effects_fileobj( + path, effects, normalize, channels_first, format) if ret is None: raise RuntimeError("Failed to load audio from {}".format(path)) return (paddle.to_tensor(ret[0]), ret[1]) path = os.fspath(path) - ret = paddleaudio._paddleaudio.sox_effects_apply_effects_file(path, effects, normalize, - channels_first, format) + ret = paddleaudio._paddleaudio.sox_effects_apply_effects_file( + path, effects, normalize, channels_first, format) if ret is not None: return (paddle.to_tensor(ret[0]), ret[1]) raise RuntimeError("Failed to load audio from {}".format(path)) diff --git a/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h b/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h index 985d586fe..3c62bb0d4 100644 --- a/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h +++ b/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h @@ -26,7 +26,7 @@ template bool StreamingFeatureTpl::ComputeFeature( const std::vector& wav, std::vector* feats) { - // append remaned waves + // append remained waves int wav_len = wav.size(); if (wav_len == 0) return false; int left_len = remained_wav_.size(); @@ -38,7 +38,7 @@ bool StreamingFeatureTpl::ComputeFeature( wav.data(), wav_len * sizeof(float)); - // cache remaned waves + // cache remained waves knf::FrameExtractionOptions frame_opts = computer_.GetFrameOptions(); int num_frames = knf::NumFrames(waves.size(), frame_opts); int frame_shift = frame_opts.WindowShift(); diff --git a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc index 8b8ff18be..6fdf68af2 100644 --- 
a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc +++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc @@ -44,5 +44,5 @@ py::array_t KaldiFeatureWrapper::ComputeFbank( return result.reshape(shape); } -} // namesapce kaldi +} // namespace kaldi } // namespace paddleaudio diff --git a/audio/paddleaudio/src/pybind/sox/effects.cpp b/audio/paddleaudio/src/pybind/sox/effects.cpp index ea77527bb..5b8959f6c 100644 --- a/audio/paddleaudio/src/pybind/sox/effects.cpp +++ b/audio/paddleaudio/src/pybind/sox/effects.cpp @@ -12,9 +12,9 @@ using namespace paddleaudio::sox_utils; namespace paddleaudio::sox_effects { // Streaming decoding over file-like object is tricky because libsox operates on -// FILE pointer. The folloing is what `sox` and `play` commands do +// FILE pointer. The following is what `sox` and `play` commands do // - file input -> FILE pointer -// - URL input -> call wget in suprocess and pipe the data -> FILE pointer +// - URL input -> call wget in subprocess and pipe the data -> FILE pointer // - stdin -> FILE pointer // // We want to, instead, fetch byte strings chunk by chunk, consume them, and @@ -127,12 +127,12 @@ namespace { enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown }; SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized; -std::mutex SOX_RESOUCE_STATE_MUTEX; +std::mutex SOX_RESOURCE_STATE_MUTEX; } // namespace void initialize_sox_effects() { - const std::lock_guard lock(SOX_RESOUCE_STATE_MUTEX); + const std::lock_guard lock(SOX_RESOURCE_STATE_MUTEX); switch (SOX_RESOURCE_STATE) { case NotInitialized: @@ -150,7 +150,7 @@ void initialize_sox_effects() { }; void shutdown_sox_effects() { - const std::lock_guard lock(SOX_RESOUCE_STATE_MUTEX); + const std::lock_guard lock(SOX_RESOURCE_STATE_MUTEX); switch (SOX_RESOURCE_STATE) { case NotInitialized: diff --git a/audio/paddleaudio/src/pybind/sox/effects_chain.cpp b/audio/paddleaudio/src/pybind/sox/effects_chain.cpp index 0204fb309..54f54840f 100644 --- a/audio/paddleaudio/src/pybind/sox/effects_chain.cpp +++ b/audio/paddleaudio/src/pybind/sox/effects_chain.cpp @@ -14,7 +14,7 @@ namespace { /// helper classes for passing the location of input tensor and output buffer /// -/// drain/flow callback functions require plaing C style function signature and +/// drain/flow callback functions require plain C style function signature and /// the way to pass extra data is to attach data to sox_effect_t::priv pointer. /// The following structs will be assigned to sox_effect_t::priv pointer which /// gives sox_effect_t an access to input Tensor and output buffer object. @@ -50,7 +50,7 @@ int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) { *osamp -= *osamp % num_channels; // Slice the input Tensor - // refacor this module, chunk + // refactor this module, chunk auto i_frame = index / num_channels; auto num_frames = *osamp / num_channels; diff --git a/audio/paddleaudio/src/pybind/sox/utils.cpp b/audio/paddleaudio/src/pybind/sox/utils.cpp index bc32b7407..acdef8040 100644 --- a/audio/paddleaudio/src/pybind/sox/utils.cpp +++ b/audio/paddleaudio/src/pybind/sox/utils.cpp @@ -162,7 +162,7 @@ py::dtype get_dtype( } default: // default to float32 for the other formats, including - // 32-bit flaoting-point WAV, + // 32-bit floating-point WAV, // MP3, // FLAC, // VORBIS etc... 
@@ -177,7 +177,7 @@ py::array convert_to_tensor( const py::dtype dtype, const bool normalize, const bool channels_first) { - // todo refector later(SGoat) + // todo refactor later(SGoat) py::array t; uint64_t dummy = 0; SOX_SAMPLE_LOCALS; diff --git a/audio/paddleaudio/src/pybind/sox/utils.h b/audio/paddleaudio/src/pybind/sox/utils.h index 6fce66714..c98e8f9ed 100644 --- a/audio/paddleaudio/src/pybind/sox/utils.h +++ b/audio/paddleaudio/src/pybind/sox/utils.h @@ -76,7 +76,7 @@ py::dtype get_dtype( /// Tensor. /// @param dtype Target dtype. Determines the output dtype and value range in /// conjunction with normalization. -/// @param noramlize Perform normalization. Only effective when dtype is not +/// @param normalize Perform normalization. Only effective when dtype is not /// kFloat32. When effective, the output tensor is kFloat32 type and value range /// is [-1.0, 1.0] /// @param channels_first When True, output Tensor has shape of [num_channels, diff --git a/audio/paddleaudio/third_party/sox/CMakeLists.txt b/audio/paddleaudio/third_party/sox/CMakeLists.txt index 8a5bc55c7..91be289bd 100644 --- a/audio/paddleaudio/third_party/sox/CMakeLists.txt +++ b/audio/paddleaudio/third_party/sox/CMakeLists.txt @@ -8,9 +8,9 @@ set(patch_dir ${CMAKE_CURRENT_SOURCE_DIR}/../patches) set(COMMON_ARGS --quiet --disable-shared --enable-static --prefix=${INSTALL_DIR} --with-pic --disable-dependency-tracking --disable-debug --disable-examples --disable-doc) # To pass custom environment variables to ExternalProject_Add command, -# we need to do `${CMAKE_COMMAND} -E env ${envs} `. +# we need to do `${CMAKE_COMMAND} -E env ${envs} `. # https://stackoverflow.com/a/62437353 -# We constrcut the custom environment variables here +# We construct the custom environment variables here set(envs "PKG_CONFIG_PATH=${INSTALL_DIR}/lib/pkgconfig" "LDFLAGS=-L${INSTALL_DIR}/lib $ENV{LDFLAGS}" diff --git a/audio/paddleaudio/utils/download.py b/audio/paddleaudio/utils/download.py index 07d5eea84..f47345dfc 100644 --- a/audio/paddleaudio/utils/download.py +++ b/audio/paddleaudio/utils/download.py @@ -41,14 +41,14 @@ def download_and_decompress(archives: List[Dict[str, str]], path: str, decompress: bool=True): """ - Download archieves and decompress to specific path. + Download archives and decompress to specific path. """ if not os.path.isdir(path): os.makedirs(path) for archive in archives: assert 'url' in archive and 'md5' in archive, \ - 'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}' + 'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}' download.get_path_from_url( archive['url'], path, archive['md5'], decompress=decompress) diff --git a/audio/paddleaudio/utils/log.py b/audio/paddleaudio/utils/log.py index 5656b286a..ddc8fd669 100644 --- a/audio/paddleaudio/utils/log.py +++ b/audio/paddleaudio/utils/log.py @@ -58,7 +58,7 @@ log_config = { class Logger(object): ''' - Deafult logger in PaddleAudio + Default logger in PaddleAudio Args: name(str) : Logger name, default is 'PaddleAudio' ''' diff --git a/audio/paddleaudio/utils/sox_utils.py b/audio/paddleaudio/utils/sox_utils.py index 305bb68b0..7665238ef 100644 --- a/audio/paddleaudio/utils/sox_utils.py +++ b/audio/paddleaudio/utils/sox_utils.py @@ -55,7 +55,7 @@ def set_use_threads(use_threads: bool): Args: use_threads (bool): When ``True``, enables ``libsox``'s parallel effects channels processing. - To use mutlithread, the underlying ``libsox`` has to be compiled with OpenMP support. 
+ To use multithread, the underlying ``libsox`` has to be compiled with OpenMP support. See Also: http://sox.sourceforge.net/sox.html diff --git a/audio/paddleaudio/utils/tensor_utils.py b/audio/paddleaudio/utils/tensor_utils.py index cfd490b9a..1448d48a3 100644 --- a/audio/paddleaudio/utils/tensor_utils.py +++ b/audio/paddleaudio/utils/tensor_utils.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Unility functions for Transformer.""" +"""Utility functions for Transformer.""" from typing import List from typing import Tuple @@ -80,7 +80,7 @@ def pad_sequence(sequences: List[paddle.Tensor], # assuming trailing dimensions and type of all the Tensors # in sequences are same and fetching those from sequences[0] max_size = paddle.shape(sequences[0]) - # (TODO Hui Zhang): slice not supprot `end==start` + # (TODO Hui Zhang): slice not support `end==start` # trailing_dims = max_size[1:] trailing_dims = tuple( max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else () @@ -94,7 +94,7 @@ def pad_sequence(sequences: List[paddle.Tensor], length = tensor.shape[0] # use index notation to prevent duplicate references to the tensor if batch_first: - # TODO (Hui Zhang): set_value op not supprot `end==start` + # TODO (Hui Zhang): set_value op not support `end==start` # TODO (Hui Zhang): set_value op not support int16 # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...] # out_tensor[i, :length, ...] = tensor @@ -103,7 +103,7 @@ def pad_sequence(sequences: List[paddle.Tensor], else: out_tensor[i, length] = tensor else: - # TODO (Hui Zhang): set_value op not supprot `end==start` + # TODO (Hui Zhang): set_value op not support `end==start` # out_tensor[:length, i, ...] 
= tensor if length != 0: out_tensor[:length, i] = tensor From c74a6be99882905e1f19b3b103904fec7853724c Mon Sep 17 00:00:00 2001 From: co63oc Date: Mon, 27 Jan 2025 11:40:11 +0800 Subject: [PATCH 10/46] Fix (#3981) --- audio/paddleaudio/utils/time.py | 2 +- audio/tests/backends/base.py | 2 +- audio/tests/backends/soundfile/base.py | 2 +- audio/tests/backends/soundfile/save_test.py | 4 +- audio/tests/common_utils/data_utils.py | 2 +- audio/tests/common_utils/sox_utils.py | 43 ++++++++++++------- audio/tests/features/base.py | 4 +- audio/tests/features/test_istft.py | 2 +- audio/tests/features/test_kaldi.py | 3 +- audio/tests/features/test_librosa.py | 5 +-- .../tests/features/test_log_melspectrogram.py | 2 +- audio/tests/features/test_spectrogram.py | 2 +- audio/tests/features/test_stft.py | 4 +- dataset/librispeech/librispeech.py | 2 +- dataset/ted_en_zh/ted_en_zh.py | 2 +- dataset/thchs30/thchs30.py | 4 +- dataset/timit/timit.py | 2 +- dataset/timit/timit_kaldi_standard_split.py | 2 +- dataset/voxceleb/voxceleb1.py | 2 +- dataset/voxceleb/voxceleb2.py | 4 +- demos/audio_content_search/README.md | 2 +- demos/audio_searching/README.md | 2 +- demos/audio_tagging/README.md | 2 +- demos/automatic_video_subtitiles/README.md | 2 +- demos/keyword_spotting/README.md | 2 +- demos/punctuation_restoration/README.md | 2 +- demos/speaker_verification/README.md | 2 +- demos/speech_recognition/README.md | 2 +- demos/speech_server/README.md | 2 +- demos/speech_ssl/README.md | 2 +- demos/speech_translation/README.md | 2 +- demos/streaming_asr_server/README.md | 2 +- demos/streaming_tts_server/README.md | 2 +- demos/text_to_speech/README.md | 2 +- demos/whisper/README.md | 2 +- 35 files changed, 67 insertions(+), 58 deletions(-) diff --git a/audio/paddleaudio/utils/time.py b/audio/paddleaudio/utils/time.py index 105208f91..4ea413282 100644 --- a/audio/paddleaudio/utils/time.py +++ b/audio/paddleaudio/utils/time.py @@ -21,7 +21,7 @@ __all__ = [ class Timer(object): - '''Calculate runing speed and estimated time of arrival(ETA)''' + '''Calculate running speed and estimated time of arrival(ETA)''' def __init__(self, total_step: int): self.total_step = total_step diff --git a/audio/tests/backends/base.py b/audio/tests/backends/base.py index a67191887..c2d53d209 100644 --- a/audio/tests/backends/base.py +++ b/audio/tests/backends/base.py @@ -30,5 +30,5 @@ class BackendTest(unittest.TestCase): urllib.request.urlretrieve(url, os.path.basename(url)) self.files.append(os.path.basename(url)) - def initParmas(self): + def initParams(self): raise NotImplementedError diff --git a/audio/tests/backends/soundfile/base.py b/audio/tests/backends/soundfile/base.py index a67191887..c2d53d209 100644 --- a/audio/tests/backends/soundfile/base.py +++ b/audio/tests/backends/soundfile/base.py @@ -30,5 +30,5 @@ class BackendTest(unittest.TestCase): urllib.request.urlretrieve(url, os.path.basename(url)) self.files.append(os.path.basename(url)) - def initParmas(self): + def initParams(self): raise NotImplementedError diff --git a/audio/tests/backends/soundfile/save_test.py b/audio/tests/backends/soundfile/save_test.py index 4f3df6e48..4b5facd08 100644 --- a/audio/tests/backends/soundfile/save_test.py +++ b/audio/tests/backends/soundfile/save_test.py @@ -103,7 +103,7 @@ class MockedSaveTest(unittest.TestCase): encoding=encoding, bits_per_sample=bits_per_sample, ) - # on +Py3.8 call_args.kwargs is more descreptive + # on +Py3.8 call_args.kwargs is more descriptive args = mocked_write.call_args[1] assert args["file"] == filepath assert 
args["samplerate"] == sample_rate @@ -191,7 +191,7 @@ class SaveTestBase(TempDirMixin, unittest.TestCase): def _assert_non_wav(self, fmt, dtype, sample_rate, num_channels): """`soundfile_backend.save` can save non-wav format. - Due to precision missmatch, and the lack of alternative way to decode the + Due to precision mismatch, and the lack of alternative way to decode the resulting files without using soundfile, only meta data are validated. """ num_frames = sample_rate * 3 diff --git a/audio/tests/common_utils/data_utils.py b/audio/tests/common_utils/data_utils.py index b5618618c..16f575701 100644 --- a/audio/tests/common_utils/data_utils.py +++ b/audio/tests/common_utils/data_utils.py @@ -81,7 +81,7 @@ def convert_tensor_encoding( #dtype = getattr(paddle, dtype) #if dtype not in [paddle.float64, paddle.float32, paddle.int32, paddle.int16, paddle.uint8]: #raise NotImplementedError(f"dtype {dtype} is not supported.") -## According to the doc, folking rng on all CUDA devices is slow when there are many CUDA devices, +## According to the doc, forking rng on all CUDA devices is slow when there are many CUDA devices, ## so we only fork on CPU, generate values and move the data to the given device #with paddle.random.fork_rng([]): #paddle.random.manual_seed(seed) diff --git a/audio/tests/common_utils/sox_utils.py b/audio/tests/common_utils/sox_utils.py index 6ceae081e..4c0866ed9 100644 --- a/audio/tests/common_utils/sox_utils.py +++ b/audio/tests/common_utils/sox_utils.py @@ -24,20 +24,21 @@ def get_bit_depth(dtype): def gen_audio_file( - path, - sample_rate, - num_channels, - *, - encoding=None, - bit_depth=None, - compression=None, - attenuation=None, - duration=1, - comment_file=None, -): + path, + sample_rate, + num_channels, + *, + encoding=None, + bit_depth=None, + compression=None, + attenuation=None, + duration=1, + comment_file=None, ): """Generate synthetic audio file with `sox` command.""" if path.endswith(".wav"): - warnings.warn("Use get_wav_data and save_wav to generate wav file for accurate result.") + warnings.warn( + "Use get_wav_data and save_wav to generate wav file for accurate result." 
+ ) command = [ "sox", "-V3", # verbose @@ -81,7 +82,12 @@ def gen_audio_file( subprocess.run(command, check=True) -def convert_audio_file(src_path, dst_path, *, encoding=None, bit_depth=None, compression=None): +def convert_audio_file(src_path, + dst_path, + *, + encoding=None, + bit_depth=None, + compression=None): """Convert audio file with `sox` command.""" command = ["sox", "-V3", "--no-dither", "-R", str(src_path)] if encoding is not None: @@ -95,7 +101,7 @@ def convert_audio_file(src_path, dst_path, *, encoding=None, bit_depth=None, com subprocess.run(command, check=True) -def _flattern(effects): +def _flatten(effects): if not effects: return effects if isinstance(effects[0], str): @@ -103,9 +109,14 @@ def _flattern(effects): return [item for sublist in effects for item in sublist] -def run_sox_effect(input_file, output_file, effect, *, output_sample_rate=None, output_bitdepth=None): +def run_sox_effect(input_file, + output_file, + effect, + *, + output_sample_rate=None, + output_bitdepth=None): """Run sox effects""" - effect = _flattern(effect) + effect = _flatten(effect) command = ["sox", "-V", "--no-dither", input_file] if output_bitdepth: command += ["--bits", str(output_bitdepth)] diff --git a/audio/tests/features/base.py b/audio/tests/features/base.py index 3bb1d1dde..4a44e04bb 100644 --- a/audio/tests/features/base.py +++ b/audio/tests/features/base.py @@ -24,7 +24,7 @@ wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' class FeatTest(unittest.TestCase): def setUp(self): - self.initParmas() + self.initParams() self.initWavInput() self.setUpDevice() @@ -44,5 +44,5 @@ class FeatTest(unittest.TestCase): if dim == 1: self.waveform = np.expand_dims(self.waveform, 0) - def initParmas(self): + def initParams(self): raise NotImplementedError diff --git a/audio/tests/features/test_istft.py b/audio/tests/features/test_istft.py index ea1ee5cb6..862a1d753 100644 --- a/audio/tests/features/test_istft.py +++ b/audio/tests/features/test_istft.py @@ -23,7 +23,7 @@ from paddlespeech.audio.transform.spectrogram import Stft class TestIstft(FeatTest): - def initParmas(self): + def initParams(self): self.n_fft = 512 self.hop_length = 128 self.window_str = 'hann' diff --git a/audio/tests/features/test_kaldi.py b/audio/tests/features/test_kaldi.py index 2bd5dc734..50e2571ca 100644 --- a/audio/tests/features/test_kaldi.py +++ b/audio/tests/features/test_kaldi.py @@ -18,12 +18,11 @@ import paddle import paddleaudio import torch import torchaudio - from base import FeatTest class TestKaldi(FeatTest): - def initParmas(self): + def initParams(self): self.window_size = 1024 self.dtype = 'float32' diff --git a/audio/tests/features/test_librosa.py b/audio/tests/features/test_librosa.py index 8cda25b19..07b117cb0 100644 --- a/audio/tests/features/test_librosa.py +++ b/audio/tests/features/test_librosa.py @@ -17,13 +17,12 @@ import librosa import numpy as np import paddle import paddleaudio -from paddleaudio.functional.window import get_window - from base import FeatTest +from paddleaudio.functional.window import get_window class TestLibrosa(FeatTest): - def initParmas(self): + def initParams(self): self.n_fft = 512 self.hop_length = 128 self.n_mels = 40 diff --git a/audio/tests/features/test_log_melspectrogram.py b/audio/tests/features/test_log_melspectrogram.py index b2765d3be..6152d6ff2 100644 --- a/audio/tests/features/test_log_melspectrogram.py +++ b/audio/tests/features/test_log_melspectrogram.py @@ -22,7 +22,7 @@ from paddlespeech.audio.transform.spectrogram import LogMelSpectrogram 
class TestLogMelSpectrogram(FeatTest): - def initParmas(self): + def initParams(self): self.n_fft = 512 self.hop_length = 128 self.n_mels = 40 diff --git a/audio/tests/features/test_spectrogram.py b/audio/tests/features/test_spectrogram.py index 6f4609632..c2dced2e7 100644 --- a/audio/tests/features/test_spectrogram.py +++ b/audio/tests/features/test_spectrogram.py @@ -22,7 +22,7 @@ from paddlespeech.audio.transform.spectrogram import Spectrogram class TestSpectrogram(FeatTest): - def initParmas(self): + def initParams(self): self.n_fft = 512 self.hop_length = 128 diff --git a/audio/tests/features/test_stft.py b/audio/tests/features/test_stft.py index 9511a2926..5bab170be 100644 --- a/audio/tests/features/test_stft.py +++ b/audio/tests/features/test_stft.py @@ -22,7 +22,7 @@ from paddlespeech.audio.transform.spectrogram import Stft class TestStft(FeatTest): - def initParmas(self): + def initParams(self): self.n_fft = 512 self.hop_length = 128 self.window_str = 'hann' @@ -30,7 +30,7 @@ class TestStft(FeatTest): def test_stft(self): ps_stft = Stft(self.n_fft, self.hop_length) ps_res = ps_stft( - self.waveform.T).squeeze(1).T # (n_fft//2 + 1, n_frmaes) + self.waveform.T).squeeze(1).T # (n_fft//2 + 1, n_frames) x = paddle.to_tensor(self.waveform) window = get_window(self.window_str, self.n_fft, dtype=x.dtype) diff --git a/dataset/librispeech/librispeech.py b/dataset/librispeech/librispeech.py index 2f5f9016c..ccf8d4b49 100644 --- a/dataset/librispeech/librispeech.py +++ b/dataset/librispeech/librispeech.py @@ -132,7 +132,7 @@ def create_manifest(data_dir, manifest_path): def prepare_dataset(url, md5sum, target_dir, manifest_path): - """Download, unpack and create summmary manifest file. + """Download, unpack and create summary manifest file. """ if not os.path.exists(os.path.join(target_dir, "LibriSpeech")): # download diff --git a/dataset/ted_en_zh/ted_en_zh.py b/dataset/ted_en_zh/ted_en_zh.py index 2d1fc6710..66810c85e 100644 --- a/dataset/ted_en_zh/ted_en_zh.py +++ b/dataset/ted_en_zh/ted_en_zh.py @@ -13,7 +13,7 @@ # limitations under the License. """Prepare Ted-En-Zh speech translation dataset -Create manifest files from splited datased. +Create manifest files from splited dataset. dev set: tst2010, test set: tst2015 Manifest file is a json-format file with each line containing the meta data (i.e. 
audio filepath, transcript and audio duration) diff --git a/dataset/thchs30/thchs30.py b/dataset/thchs30/thchs30.py index c5c3eb7a8..fc8338984 100644 --- a/dataset/thchs30/thchs30.py +++ b/dataset/thchs30/thchs30.py @@ -71,7 +71,7 @@ def read_trn(filepath): with open(filepath, 'r') as f: lines = f.read().strip().split('\n') assert len(lines) == 3, lines - # charactor text, remove withespace + # character text, remove whitespace texts.append(''.join(lines[0].split())) texts.extend(lines[1:]) return texts @@ -127,7 +127,7 @@ def create_manifest(data_dir, manifest_path_prefix): 'utt2spk': spk, 'feat': audio_path, 'feat_shape': (duration, ), # second - 'text': word_text, # charactor + 'text': word_text, # character 'syllable': syllable_text, 'phone': phone_text, }, diff --git a/dataset/timit/timit.py b/dataset/timit/timit.py index f3889d176..2943ff548 100644 --- a/dataset/timit/timit.py +++ b/dataset/timit/timit.py @@ -123,7 +123,7 @@ def read_algin(filepath: str) -> str: filepath (str): [description] Returns: - str: token sepearte by + str: token separate by """ aligns = [] # (start, end, token) with open(filepath, 'r') as f: diff --git a/dataset/timit/timit_kaldi_standard_split.py b/dataset/timit/timit_kaldi_standard_split.py index 473fc856f..59ce2e64a 100644 --- a/dataset/timit/timit_kaldi_standard_split.py +++ b/dataset/timit/timit_kaldi_standard_split.py @@ -13,7 +13,7 @@ # limitations under the License. """Prepare TIMIT dataset (Standard split from Kaldi) -Create manifest files from splited datased. +Create manifest files from splited dataset. Manifest file is a json-format file with each line containing the meta data (i.e. audio filepath, transcript and audio duration) of each audio file in the data set. diff --git a/dataset/voxceleb/voxceleb1.py b/dataset/voxceleb/voxceleb1.py index 8d4100678..49a2a6baa 100644 --- a/dataset/voxceleb/voxceleb1.py +++ b/dataset/voxceleb/voxceleb1.py @@ -167,7 +167,7 @@ def prepare_dataset(base_url, data_list, target_dir, manifest_path, # check the target zip file md5sum if not check_md5sum(target_name, target_md5sum): - raise RuntimeError("{} MD5 checkssum failed".format(target_name)) + raise RuntimeError("{} MD5 checksum failed".format(target_name)) else: print("Check {} md5sum successfully".format(target_name)) diff --git a/dataset/voxceleb/voxceleb2.py b/dataset/voxceleb/voxceleb2.py index 6df6d1f38..faa3b99bc 100644 --- a/dataset/voxceleb/voxceleb2.py +++ b/dataset/voxceleb/voxceleb2.py @@ -179,7 +179,7 @@ def download_dataset(base_url, data_list, target_data, target_dir, dataset): # check the target zip file md5sum if not check_md5sum(target_name, target_md5sum): - raise RuntimeError("{} MD5 checkssum failed".format(target_name)) + raise RuntimeError("{} MD5 checksum failed".format(target_name)) else: print("Check {} md5sum successfully".format(target_name)) @@ -187,7 +187,7 @@ def download_dataset(base_url, data_list, target_data, target_dir, dataset): # we need make the test directory unzip(target_name, os.path.join(target_dir, "test")) else: - # upzip dev zip pacakge and will create the dev directory + # unzip dev zip package and will create the dev directory unzip(target_name, target_dir) diff --git a/demos/audio_content_search/README.md b/demos/audio_content_search/README.md index f04ac447e..89b1c0d89 100644 --- a/demos/audio_content_search/README.md +++ b/demos/audio_content_search/README.md @@ -14,7 +14,7 @@ Now, the search word in demo is: ### 1. 
Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -You can choose one way from meduim and hard to install paddlespeech. +You can choose one way from medium and hard to install paddlespeech. The dependency refers to the requirements.txt, and install the dependency as follows: diff --git a/demos/audio_searching/README.md b/demos/audio_searching/README.md index 0fc901432..528fce9e8 100644 --- a/demos/audio_searching/README.md +++ b/demos/audio_searching/README.md @@ -19,7 +19,7 @@ Note:this demo uses the [CN-Celeb](http://openslr.org/82/) dataset of at least ### 1. Prepare PaddleSpeech Audio vector extraction requires PaddleSpeech training model, so please make sure that PaddleSpeech has been installed before running. Specific installation steps: See [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -You can choose one way from easy, meduim and hard to install paddlespeech. +You can choose one way from easy, medium and hard to install paddlespeech. ### 2. Prepare MySQL and Milvus services by docker-compose The audio similarity search system requires Milvus, MySQL services. We can start these containers with one click through [docker-compose.yaml](./docker-compose.yaml), so please make sure you have [installed Docker Engine](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/) before running. then diff --git a/demos/audio_tagging/README.md b/demos/audio_tagging/README.md index fc4a334ea..b602c6022 100644 --- a/demos/audio_tagging/README.md +++ b/demos/audio_tagging/README.md @@ -11,7 +11,7 @@ This demo is an implementation to tag an audio file with 527 [AudioSet](https:// ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -You can choose one way from easy, meduim and hard to install paddlespeech. +You can choose one way from easy, medium and hard to install paddlespeech. ### 2. Prepare Input File The input of this demo should be a WAV file(`.wav`). diff --git a/demos/automatic_video_subtitiles/README.md b/demos/automatic_video_subtitiles/README.md index b815425ec..89d8c73c9 100644 --- a/demos/automatic_video_subtitiles/README.md +++ b/demos/automatic_video_subtitiles/README.md @@ -10,7 +10,7 @@ This demo is an implementation to automatic video subtitles from a video file. I ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -You can choose one way from easy, meduim and hard to install paddlespeech. +You can choose one way from easy, medium and hard to install paddlespeech. ### 2. Prepare Input Get a video file with the speech of the specific language: diff --git a/demos/keyword_spotting/README.md b/demos/keyword_spotting/README.md index 6544cf71e..b55c71124 100644 --- a/demos/keyword_spotting/README.md +++ b/demos/keyword_spotting/README.md @@ -10,7 +10,7 @@ This demo is an implementation to recognize keyword from a specific audio file. ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -You can choose one way from easy, meduim and hard to install paddlespeech. +You can choose one way from easy, medium and hard to install paddlespeech. ### 2. Prepare Input File The input of this demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. 
diff --git a/demos/punctuation_restoration/README.md b/demos/punctuation_restoration/README.md index 458ab92f9..3544a2060 100644 --- a/demos/punctuation_restoration/README.md +++ b/demos/punctuation_restoration/README.md @@ -9,7 +9,7 @@ This demo is an implementation to restore punctuation from raw text. It can be d ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -You can choose one way from easy, meduim and hard to install paddlespeech. +You can choose one way from easy, medium and hard to install paddlespeech. ### 2. Prepare Input The input of this demo should be a text of the specific language that can be passed via argument. diff --git a/demos/speaker_verification/README.md b/demos/speaker_verification/README.md index 55f9a7360..37c6bf3b9 100644 --- a/demos/speaker_verification/README.md +++ b/demos/speaker_verification/README.md @@ -11,7 +11,7 @@ This demo is an implementation to extract speaker embedding from a specific audi ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -You can choose one way from easy, meduim and hard to install paddlespeech. +You can choose one way from easy, medium and hard to install paddlespeech. ### 2. Prepare Input File The input of this cli demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index ee2acd6fd..e406590d2 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -10,7 +10,7 @@ This demo is an implementation to recognize text from a specific audio file. It ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -You can choose one way from easy, meduim and hard to install paddlespeech. +You can choose one way from easy, medium and hard to install paddlespeech. ### 2. Prepare Input File The input of this demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index 116f1fd7b..08788a89e 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -15,7 +15,7 @@ see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/doc It is recommended to use **paddlepaddle 2.4rc** or above. -You can choose one way from easy, meduim and hard to install paddlespeech. +You can choose one way from easy, medium and hard to install paddlespeech. **If you install in easy mode, you need to prepare the yaml file by yourself, you can refer to the yaml file in the conf directory.** diff --git a/demos/speech_ssl/README.md b/demos/speech_ssl/README.md index ef9b2237d..8677ebc57 100644 --- a/demos/speech_ssl/README.md +++ b/demos/speech_ssl/README.md @@ -10,7 +10,7 @@ This demo is an implementation to recognize text or produce the acoustic represe ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -You can choose one way from easy, meduim and hard to install paddlespeech. +You can choose one way from easy, medium and hard to install paddlespeech. ### 2. Prepare Input File The input of this demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. 
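The server demos touched below read their settings from a yaml file under `conf/` (which, as the speech_server README notes, you must supply yourself in an easy-mode install). A minimal sketch of loading such a file with `yacs`, which is already a PaddleSpeech dependency (the path is a placeholder for the demo's actual config file):

```python
from yacs.config import CfgNode

# Placeholder path: point this at the demo's conf yaml, e.g. conf/application.yaml.
config = CfgNode(new_allowed=True)  # new_allowed lets the yaml introduce new keys
config.merge_from_file("conf/application.yaml")
print(config.keys())
```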
diff --git a/demos/speech_translation/README.md b/demos/speech_translation/README.md index 00a9c7932..4866336c0 100644 --- a/demos/speech_translation/README.md +++ b/demos/speech_translation/README.md @@ -9,7 +9,7 @@ This demo is an implementation to recognize text from a specific audio file and ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -You can choose one way from easy, meduim and hard to install paddlespeech. +You can choose one way from easy, medium and hard to install paddlespeech. ### 2. Prepare Input File diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index 136863b96..423485466 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -18,7 +18,7 @@ see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/doc It is recommended to use **paddlepaddle 2.4rc** or above. -You can choose one way from easy, meduim and hard to install paddlespeech. +You can choose one way from easy, medium and hard to install paddlespeech. **If you install in easy mode, you need to prepare the yaml file by yourself, you can refer to diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md index ca5d6f1f8..ad87bebdc 100644 --- a/demos/streaming_tts_server/README.md +++ b/demos/streaming_tts_server/README.md @@ -15,7 +15,7 @@ see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/doc It is recommended to use **paddlepaddle 2.4rc** or above. -You can choose one way from easy, meduim and hard to install paddlespeech. +You can choose one way from easy, medium and hard to install paddlespeech. **If you install in easy mode, you need to prepare the yaml file by yourself, you can refer to the yaml file in the conf directory.** diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index d7bb8ca1c..b58777def 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -10,7 +10,7 @@ This demo is an implementation to generate audio from the given text. It can be ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -You can choose one way from easy, meduim and hard to install paddlespeech. +You can choose one way from easy, medium and hard to install paddlespeech. ### 2. Prepare Input The input of this demo should be a text of the specific language that can be passed via argument. diff --git a/demos/whisper/README.md b/demos/whisper/README.md index 9b12554e6..6e1b8011f 100644 --- a/demos/whisper/README.md +++ b/demos/whisper/README.md @@ -9,7 +9,7 @@ Whisper model trained by OpenAI whisper https://github.com/openai/whisper ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). - You can choose one way from easy, meduim and hard to install paddlespeech. + You can choose one way from easy, medium and hard to install paddlespeech. ### 2. Prepare Input File The input of this demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. 
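The next patch adds download links for PIR static models and notes that running them requires `FLAGS_enable_pir_api=1` and paddlepaddle>=3.0.0b2. As a minimal usage sketch of that note (the flag is set through the environment before paddle is imported; the model/params file names are placeholders for whatever the downloaded `*_static_pir_*.zip` archive actually contains):

```python
import os

# Per the patch note: the PIR flag must be in effect before paddle initializes.
os.environ["FLAGS_enable_pir_api"] = "1"

from paddle.inference import Config, create_predictor

# Placeholder file names; substitute the model/params files unzipped from the
# fastspeech2_*_static_pir_*.zip archive.
config = Config("path/to/model_file", "path/to/params_file")
predictor = create_predictor(config)
```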
From 4e5181c949bab59bbbaa11b945165e06fc7689cd Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Mon, 27 Jan 2025 12:29:14 +0800 Subject: [PATCH 11/46] add some pir model (#3982) --- examples/vctk/tts3/README.md | 3 +++ examples/zh_en_tts/tts3/README.md | 3 +++ 2 files changed, 6 insertions(+) diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index 3a6f3e1b9..183a20f0a 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -221,6 +221,9 @@ Pretrained FastSpeech2 model with no silence in the edge of audios: The static model can be downloaded here: - [fastspeech2_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_static_1.1.0.zip) +The PIR static model can be downloaded here: + - [fastspeech2_vctk_static_pir_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_static_pir_1.1.0.zip) (Run PIR model need to set FLAGS_enable_pir_api=1, and PIR model only worked with paddlepaddle>=3.0.0b2) + The ONNX model can be downloaded here: - [fastspeech2_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_onnx_1.1.0.zip) diff --git a/examples/zh_en_tts/tts3/README.md b/examples/zh_en_tts/tts3/README.md index 15de3f487..9c3cd4079 100644 --- a/examples/zh_en_tts/tts3/README.md +++ b/examples/zh_en_tts/tts3/README.md @@ -260,6 +260,9 @@ Pretrained FastSpeech2 model with no silence in the edge of audios: The static model can be downloaded here: - [fastspeech2_mix_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_static_0.2.0.zip) +The PIR static model can be downloaded here: +- [fastspeech2_mix_static_pir_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_static_pir_0.2.0.zip) (Run PIR model need to set FLAGS_enable_pir_api=1, and PIR model only worked with paddlepaddle>=3.0.0b2) + The ONNX model can be downloaded here: - [fastspeech2_mix_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_onnx_0.2.0.zip) From f3a5df2049740ccdcac5ec88d329aa916931a87a Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 6 Feb 2025 11:10:04 +0800 Subject: [PATCH 12/46] Fix typos (#3984) * Fix * Fix --- audio/paddleaudio/backends/soundfile_backend.py | 16 ++++++++-------- audio/paddleaudio/compliance/librosa.py | 8 ++++---- audio/paddleaudio/src/pybind/sox/utils.cpp | 2 +- audio/tests/backends/soundfile/save_test.py | 2 +- dataset/chime3_background/chime3_background.py | 2 +- dataset/mini_librispeech/mini_librispeech.py | 2 +- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/audio/paddleaudio/backends/soundfile_backend.py b/audio/paddleaudio/backends/soundfile_backend.py index dcd2b4b1e..7611fd297 100644 --- a/audio/paddleaudio/backends/soundfile_backend.py +++ b/audio/paddleaudio/backends/soundfile_backend.py @@ -183,7 +183,7 @@ def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None: Args: y (np.ndarray): Input waveform array in 1D or 2D. sr (int): Sample rate. - file (os.PathLike): Path of auido file to save. + file (os.PathLike): Path of audio file to save. """ if not file.endswith('.wav'): raise ParameterError( @@ -216,10 +216,10 @@ def soundfile_load( duration: Optional[int]=None, dtype: str='float32', resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]: - """Load audio file from disk. 
This function loads audio from disk using using audio beackend. + """Load audio file from disk. This function loads audio from disk using using audio backend. Args: - file (os.PathLike): Path of auido file to load. + file (os.PathLike): Path of audio file to load. sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None. mono (bool, optional): Return waveform with mono channel. Defaults to True. merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'. @@ -250,14 +250,14 @@ def soundfile_load( if normal: y = normalize(y, norm_type, norm_mul_factor) elif dtype in ['int8', 'int16']: - # still need to do normalization, before depth convertion + # still need to do normalization, before depth conversion y = normalize(y, 'linear', 1.0) y = depth_convert(y, dtype) return y, r -#the code below token form: https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py with modificaion. +#The code below is taken from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py, with some modifications. def _get_subtype_for_wav(dtype: paddle.dtype, @@ -382,7 +382,7 @@ def save( channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`, otherwise `[time, channel]`. compression (float of None, optional): Not used. - It is here only for interface compatibility reson with "sox_io" backend. + It is here only for interface compatibility reason with "sox_io" backend. format (str or None, optional): Override the audio format. When ``filepath`` argument is path-like object, audio format is inferred from file extension. If the file extension is missing or @@ -394,8 +394,8 @@ def save( Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``, ``"flac"`` and ``"sph"``. encoding (str or None, optional): Changes the encoding for supported formats. - This argument is effective only for supported formats, sush as - ``"wav"``, ``""flac"`` and ``"sph"``. Valid values are; + This argument is effective only for supported formats, such as + ``"wav"``, ``""flac"`` and ``"sph"``. Valid values are: - ``"PCM_S"`` (signed integer Linear PCM) - ``"PCM_U"`` (unsigned integer Linear PCM) diff --git a/audio/paddleaudio/compliance/librosa.py b/audio/paddleaudio/compliance/librosa.py index 168632d7c..c24d6d497 100644 --- a/audio/paddleaudio/compliance/librosa.py +++ b/audio/paddleaudio/compliance/librosa.py @@ -626,7 +626,7 @@ def mu_decode(y: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray: def _randint(high: int) -> int: """Generate one random integer in range [0 high) - This is a helper function for random data augmentaiton + This is a helper function for random data augmentation """ return int(np.random.randint(0, high=high)) @@ -659,7 +659,7 @@ def depth_augment(y: np.ndarray, def adaptive_spect_augment(spect: np.ndarray, tempo_axis: int=0, level: float=0.1) -> np.ndarray: - """Do adpative spectrogram augmentation. The level of the augmentation is gowern by the paramter level, ranging from 0 to 1, with 0 represents no augmentation. + """Do adaptive spectrogram augmentation. The level of the augmentation is govern by the parameter level, ranging from 0 to 1, with 0 represents no augmentation. Args: spect (np.ndarray): Input spectrogram. @@ -711,9 +711,9 @@ def spect_augment(spect: np.ndarray, spect (np.ndarray): Input spectrogram. tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0. max_time_mask (int, optional): Maximum number of time masking. Defaults to 3. 
- max_freq_mask (int, optional): Maximum number of frenquence masking. Defaults to 3. + max_freq_mask (int, optional): Maximum number of frequency masking. Defaults to 3. max_time_mask_width (int, optional): Maximum width of time masking. Defaults to 30. - max_freq_mask_width (int, optional): Maximum width of frenquence masking. Defaults to 20. + max_freq_mask_width (int, optional): Maximum width of frequency masking. Defaults to 20. Returns: np.ndarray: The augmented spectrogram. diff --git a/audio/paddleaudio/src/pybind/sox/utils.cpp b/audio/paddleaudio/src/pybind/sox/utils.cpp index acdef8040..1d38dff90 100644 --- a/audio/paddleaudio/src/pybind/sox/utils.cpp +++ b/audio/paddleaudio/src/pybind/sox/utils.cpp @@ -449,7 +449,7 @@ unsigned get_precision(const std::string filetype, py::dtype dtype) { return SOX_UNSPEC; if (filetype == "wav" || filetype == "amb") { switch (dtype.num()) { - case 1: // byte in numpy dype num + case 1: // byte in numpy dtype num return 8; case 3: // short, in numpy dtype num return 16; diff --git a/audio/tests/backends/soundfile/save_test.py b/audio/tests/backends/soundfile/save_test.py index 4b5facd08..0dce592c8 100644 --- a/audio/tests/backends/soundfile/save_test.py +++ b/audio/tests/backends/soundfile/save_test.py @@ -58,7 +58,7 @@ class MockedSaveTest(unittest.TestCase): encoding=encoding, bits_per_sample=bits_per_sample, ) - # on +Py3.8 call_args.kwargs is more descreptive + # on +Py3.8 call_args.kwargs is more descriptive args = mocked_write.call_args[1] assert args["file"] == filepath assert args["samplerate"] == sample_rate diff --git a/dataset/chime3_background/chime3_background.py b/dataset/chime3_background/chime3_background.py index 1f5439aab..4f081e6c3 100644 --- a/dataset/chime3_background/chime3_background.py +++ b/dataset/chime3_background/chime3_background.py @@ -58,7 +58,7 @@ def download(url, md5sum, target_dir, filename=None): if not (os.path.exists(filepath) and md5file(filepath) == md5sum): print("Downloading %s ..." % url) wget.download(url, target_dir) - print("\nMD5 Chesksum %s ..." % filepath) + print("\nMD5 Checksum %s ..." % filepath) if not md5file(filepath) == md5sum: raise RuntimeError("MD5 checksum failed.") else: diff --git a/dataset/mini_librispeech/mini_librispeech.py b/dataset/mini_librispeech/mini_librispeech.py index 24bd98d8c..3a60ef22b 100644 --- a/dataset/mini_librispeech/mini_librispeech.py +++ b/dataset/mini_librispeech/mini_librispeech.py @@ -108,7 +108,7 @@ def create_manifest(data_dir, manifest_path): def prepare_dataset(url, md5sum, target_dir, manifest_path): - """Download, unpack and create summmary manifest file. + """Download, unpack and create summary manifest file. 
""" if not os.path.exists(os.path.join(target_dir, "LibriSpeech")): # download From 0479cce8ffa31a9b7bb10310de99dbbdab3f46a1 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Sat, 8 Feb 2025 12:43:19 +0800 Subject: [PATCH 13/46] =?UTF-8?q?=E3=80=90audio=E3=80=91remove=20paddleaud?= =?UTF-8?q?io=20from=20paddlespeech=20(#3986)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * remove paddleaudio from paddlespeech * use scikit-learn instead sklearn * add pathos * remove utils * add kaldiio * remove useless print --- audio/paddleaudio/backends/common.py | 32 +- docs/source/cls/custom_dataset.md | 6 +- docs/tutorial/cls/cls_tutorial.ipynb | 16 +- examples/tess/cls0/local/train.py | 4 +- examples/voxceleb/sv0/local/data_prepare.py | 2 +- .../make_rirs_noise_csv_dataset_from_json.py | 2 +- .../local/make_vox_csv_dataset_from_json.py | 2 +- paddlespeech/audio/__init__.py | 4 + paddlespeech/audio/backends/__init__.py | 20 + paddlespeech/audio/backends/common.py | 53 ++ .../audio/backends/soundfile_backend.py | 677 +++++++++++++++ paddlespeech/audio/compliance/__init__.py | 15 + paddlespeech/audio/compliance/kaldi.py | 643 ++++++++++++++ paddlespeech/audio/compliance/librosa.py | 788 ++++++++++++++++++ paddlespeech/audio/datasets/__init__.py | 15 + paddlespeech/audio/datasets/dataset.py | 100 +++ paddlespeech/audio/datasets/esc50.py | 152 ++++ paddlespeech/audio/datasets/voxceleb.py | 356 ++++++++ paddlespeech/audio/functional/__init__.py | 20 + paddlespeech/audio/functional/functional.py | 266 ++++++ paddlespeech/audio/functional/window.py | 373 +++++++++ paddlespeech/audio/streamdata/autodecode.py | 4 +- paddlespeech/audio/streamdata/filters.py | 4 +- paddlespeech/audio/streamdata/soundfile.py | 677 +++++++++++++++ paddlespeech/audio/streamdata/tariterators.py | 6 +- paddlespeech/audio/transform/spectrogram.py | 3 +- paddlespeech/cli/cls/infer.py | 2 +- paddlespeech/cli/kws/infer.py | 4 +- paddlespeech/cli/vector/infer.py | 4 +- paddlespeech/cls/exps/panns/deploy/predict.py | 2 +- paddlespeech/cls/exps/panns/export_model.py | 2 +- paddlespeech/cls/exps/panns/predict.py | 5 +- paddlespeech/cls/exps/panns/train.py | 4 +- paddlespeech/cls/models/panns/panns.py | 2 +- paddlespeech/kws/exps/mdtc/train.py | 4 +- .../frontend/featurizer/audio_featurizer.py | 3 +- paddlespeech/s2t/modules/fbank.py | 2 +- .../engine/vector/python/vector_engine.py | 4 +- paddlespeech/server/util.py | 4 +- .../starganv2_vc/AuxiliaryASR/layers.py | 4 +- .../vector/exps/ecapa_tdnn/extract_emb.py | 4 +- paddlespeech/vector/exps/ecapa_tdnn/test.py | 19 +- paddlespeech/vector/exps/ecapa_tdnn/train.py | 2 +- paddlespeech/vector/io/dataset.py | 4 +- paddlespeech/vector/io/dataset_from_json.py | 7 +- setup.py | 4 +- .../unit/audiotools/core/test_audio_signal.py | 6 +- 47 files changed, 4254 insertions(+), 78 deletions(-) create mode 100644 paddlespeech/audio/backends/__init__.py create mode 100644 paddlespeech/audio/backends/common.py create mode 100644 paddlespeech/audio/backends/soundfile_backend.py create mode 100644 paddlespeech/audio/compliance/__init__.py create mode 100644 paddlespeech/audio/compliance/kaldi.py create mode 100644 paddlespeech/audio/compliance/librosa.py create mode 100644 paddlespeech/audio/datasets/__init__.py create mode 100644 paddlespeech/audio/datasets/dataset.py create mode 100644 paddlespeech/audio/datasets/esc50.py create mode 100644 paddlespeech/audio/datasets/voxceleb.py create mode 100644 paddlespeech/audio/functional/__init__.py create mode 
100644 paddlespeech/audio/functional/functional.py create mode 100644 paddlespeech/audio/functional/window.py create mode 100644 paddlespeech/audio/streamdata/soundfile.py diff --git a/audio/paddleaudio/backends/common.py b/audio/paddleaudio/backends/common.py index 9d3edf812..3065fe89f 100644 --- a/audio/paddleaudio/backends/common.py +++ b/audio/paddleaudio/backends/common.py @@ -1,4 +1,5 @@ -# Token form https://github.com/pytorch/audio/blob/main/torchaudio/backend/common.py with modification. +# Token from https://github.com/pytorch/audio/blob/main/torchaudio/backend/common.py with modification. + class AudioInfo: """return of info function. @@ -30,13 +31,12 @@ class AudioInfo: """ def __init__( - self, - sample_rate: int, - num_frames: int, - num_channels: int, - bits_per_sample: int, - encoding: str, - ): + self, + sample_rate: int, + num_frames: int, + num_channels: int, + bits_per_sample: int, + encoding: str, ): self.sample_rate = sample_rate self.num_frames = num_frames self.num_channels = num_channels @@ -44,12 +44,10 @@ class AudioInfo: self.encoding = encoding def __str__(self): - return ( - f"AudioMetaData(" - f"sample_rate={self.sample_rate}, " - f"num_frames={self.num_frames}, " - f"num_channels={self.num_channels}, " - f"bits_per_sample={self.bits_per_sample}, " - f"encoding={self.encoding}" - f")" - ) + return (f"AudioMetaData(" + f"sample_rate={self.sample_rate}, " + f"num_frames={self.num_frames}, " + f"num_channels={self.num_channels}, " + f"bits_per_sample={self.bits_per_sample}, " + f"encoding={self.encoding}" + f")") diff --git a/docs/source/cls/custom_dataset.md b/docs/source/cls/custom_dataset.md index 7482d5edf..26bd60b25 100644 --- a/docs/source/cls/custom_dataset.md +++ b/docs/source/cls/custom_dataset.md @@ -2,7 +2,7 @@ Following this tutorial you can customize your dataset for audio classification task by using `paddlespeech`. -A base class of classification dataset is `paddlespeech.audio.dataset.AudioClassificationDataset`. To customize your dataset you should write a dataset class derived from `AudioClassificationDataset`. +A base class of classification dataset is `paddlespeech.audio.datasets.dataset.AudioClassificationDataset`. To customize your dataset you should write a dataset class derived from `AudioClassificationDataset`. Assuming you have some wave files that stored in your own directory. You should prepare a meta file with the information of filepaths and labels. For example the absolute path of it is `/PATH/TO/META_FILE.txt`: ``` @@ -14,7 +14,7 @@ Assuming you have some wave files that stored in your own directory. 
You should Here is an example to build your custom dataset in `custom_dataset.py`: ```python -from paddleaudio.datasets.dataset import AudioClassificationDataset +from paddlespeech.audio.datasets.dataset import AudioClassificationDataset class CustomDataset(AudioClassificationDataset): meta_file = '/PATH/TO/META_FILE.txt' @@ -48,7 +48,7 @@ class CustomDataset(AudioClassificationDataset): Then you can build dataset and data loader from `CustomDataset`: ```python import paddle -from paddleaudio.features import LogMelSpectrogram +from paddlespeech.audio.transform.spectrogram import LogMelSpectrogram from custom_dataset import CustomDataset diff --git a/docs/tutorial/cls/cls_tutorial.ipynb b/docs/tutorial/cls/cls_tutorial.ipynb index 3cee64991..e37b086f7 100644 --- a/docs/tutorial/cls/cls_tutorial.ipynb +++ b/docs/tutorial/cls/cls_tutorial.ipynb @@ -52,8 +52,8 @@ "metadata": {}, "outputs": [], "source": [ - "# 环境准备:安装paddlespeech和paddleaudio\n", - "!pip install --upgrade pip && pip install paddlespeech paddleaudio -U" + "# 环境准备:安装paddlespeech\n", + "!pip install --upgrade pip && pip install paddlespeech -U" ] }, { @@ -100,7 +100,7 @@ "metadata": {}, "outputs": [], "source": [ - "from paddleaudio import load\n", + "from paddlespeech.audio.backends import load\n", "data, sr = load(file='./dog.wav', mono=True, dtype='float32') # 单通道,float32音频样本点\n", "print('wav shape: {}'.format(data.shape))\n", "print('sample rate: {}'.format(sr))\n", @@ -191,7 +191,7 @@ "
图片来源:https://ww2.mathworks.cn/help/audio/ref/mfcc.html
\n", "\n", "

\n", - "下面例子采用 `paddleaudio.features.LogMelSpectrogram` 演示如何提取示例音频的 LogFBank:" + "下面例子采用 `paddlespeech.audio.transform.spectrogram.LogMelSpectrogram` 演示如何提取示例音频的 LogFBank:" ] }, { @@ -200,7 +200,7 @@ "metadata": {}, "outputs": [], "source": [ - "from paddleaudio.features import LogMelSpectrogram\n", + "from paddlespeech.audio.transform.spectrogram import LogMelSpectrogram\n", "\n", "f_min=50.0\n", "f_max=14000.0\n", @@ -337,7 +337,7 @@ "metadata": {}, "outputs": [], "source": [ - "from paddleaudio.datasets import ESC50\n", + "from paddlespeech.audio.datasets import ESC50\n", "\n", "train_ds = ESC50(mode='train', sample_rate=sr)\n", "dev_ds = ESC50(mode='dev', sample_rate=sr)" @@ -348,7 +348,7 @@ "metadata": {}, "source": [ "### 3.1.2 特征提取\n", - "通过下列代码,用 `paddleaudio.features.LogMelSpectrogram` 初始化一个音频特征提取器,在训练过程中实时提取音频的 LogFBank 特征,其中主要的参数如下: " + "通过下列代码,用 `paddlespeech.audio.transform.spectrogram.LogMelSpectrogram` 初始化一个音频特征提取器,在训练过程中实时提取音频的 LogFBank 特征,其中主要的参数如下: " ] }, { @@ -481,7 +481,7 @@ "metadata": {}, "outputs": [], "source": [ - "from paddleaudio.utils import logger\n", + "from paddlespeech.audio.utils import logger\n", "\n", "epochs = 20\n", "steps_per_epoch = len(train_loader)\n", diff --git a/examples/tess/cls0/local/train.py b/examples/tess/cls0/local/train.py index f023a37b7..ad4926d76 100644 --- a/examples/tess/cls0/local/train.py +++ b/examples/tess/cls0/local/train.py @@ -16,9 +16,9 @@ import os import paddle import yaml -from paddleaudio.utils import logger -from paddleaudio.utils import Timer +from paddlespeech.audio.utils import logger +from paddlespeech.audio.utils.time import Timer from paddlespeech.cls.models import SoundClassifier from paddlespeech.utils.dynamic_import import dynamic_import diff --git a/examples/voxceleb/sv0/local/data_prepare.py b/examples/voxceleb/sv0/local/data_prepare.py index b4486b6f0..e5a5dff7b 100644 --- a/examples/voxceleb/sv0/local/data_prepare.py +++ b/examples/voxceleb/sv0/local/data_prepare.py @@ -14,9 +14,9 @@ import argparse import paddle -from paddleaudio.datasets.voxceleb import VoxCeleb from yacs.config import CfgNode +from paddlespeech.audio.datasets.voxceleb import VoxCeleb from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.augment import build_augment_pipeline from paddlespeech.vector.training.seeding import seed_everything diff --git a/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py b/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py index 11908fe63..b65fa35b4 100644 --- a/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py +++ b/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py @@ -21,9 +21,9 @@ import os from typing import List import tqdm -from paddleaudio.backends import soundfile_load as load_audio from yacs.config import CfgNode +from paddlespeech.audio.backends import soundfile_load as load_audio from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.utils.vector_utils import get_chunks diff --git a/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py b/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py index ebeb598a4..6ef2064a0 100644 --- a/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py +++ b/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py @@ -22,9 +22,9 @@ import os import random import tqdm -from paddleaudio.backends import soundfile_load as load_audio from yacs.config import CfgNode +from paddlespeech.audio.backends import soundfile_load as load_audio from 
paddlespeech.s2t.utils.log import Log from paddlespeech.vector.utils.vector_utils import get_chunks diff --git a/paddlespeech/audio/__init__.py b/paddlespeech/audio/__init__.py index a7cf6caaf..0e120be29 100644 --- a/paddlespeech/audio/__init__.py +++ b/paddlespeech/audio/__init__.py @@ -11,6 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from . import backends +from . import compliance +from . import datasets +from . import functional from . import streamdata from . import text from . import transform diff --git a/paddlespeech/audio/backends/__init__.py b/paddlespeech/audio/backends/__init__.py new file mode 100644 index 000000000..7e4ee6506 --- /dev/null +++ b/paddlespeech/audio/backends/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .soundfile_backend import depth_convert +from .soundfile_backend import load +from .soundfile_backend import normalize +from .soundfile_backend import resample +from .soundfile_backend import soundfile_load +from .soundfile_backend import soundfile_save +from .soundfile_backend import to_mono diff --git a/paddlespeech/audio/backends/common.py b/paddlespeech/audio/backends/common.py new file mode 100644 index 000000000..3065fe89f --- /dev/null +++ b/paddlespeech/audio/backends/common.py @@ -0,0 +1,53 @@ +# Token from https://github.com/pytorch/audio/blob/main/torchaudio/backend/common.py with modification. + + +class AudioInfo: + """return of info function. + + This class is used by :ref:`"sox_io" backend` and + :ref:`"soundfile" backend with the new interface`. + + :ivar int sample_rate: Sample rate + :ivar int num_frames: The number of frames + :ivar int num_channels: The number of channels + :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats, + or when it cannot be accurately inferred. 
+ :ivar str encoding: Audio encoding + The values encoding can take are one of the following: + + * ``PCM_S``: Signed integer linear PCM + * ``PCM_U``: Unsigned integer linear PCM + * ``PCM_F``: Floating point linear PCM + * ``FLAC``: Flac, Free Lossless Audio Codec + * ``ULAW``: Mu-law + * ``ALAW``: A-law + * ``MP3`` : MP3, MPEG-1 Audio Layer III + * ``VORBIS``: OGG Vorbis + * ``AMR_WB``: Adaptive Multi-Rate + * ``AMR_NB``: Adaptive Multi-Rate Wideband + * ``OPUS``: Opus + * ``HTK``: Single channel 16-bit PCM + * ``UNKNOWN`` : None of above + """ + + def __init__( + self, + sample_rate: int, + num_frames: int, + num_channels: int, + bits_per_sample: int, + encoding: str, ): + self.sample_rate = sample_rate + self.num_frames = num_frames + self.num_channels = num_channels + self.bits_per_sample = bits_per_sample + self.encoding = encoding + + def __str__(self): + return (f"AudioMetaData(" + f"sample_rate={self.sample_rate}, " + f"num_frames={self.num_frames}, " + f"num_channels={self.num_channels}, " + f"bits_per_sample={self.bits_per_sample}, " + f"encoding={self.encoding}" + f")") diff --git a/paddlespeech/audio/backends/soundfile_backend.py b/paddlespeech/audio/backends/soundfile_backend.py new file mode 100644 index 000000000..7611fd297 --- /dev/null +++ b/paddlespeech/audio/backends/soundfile_backend.py @@ -0,0 +1,677 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import warnings +from typing import Optional +from typing import Tuple + +import numpy as np +import paddle +import resampy +import soundfile +from scipy.io import wavfile + +from ..utils import depth_convert +from ..utils import ParameterError +from .common import AudioInfo + +__all__ = [ + 'resample', + 'to_mono', + 'normalize', + 'save', + 'soundfile_save', + 'load', + 'soundfile_load', + 'info', +] +NORMALMIZE_TYPES = ['linear', 'gaussian'] +MERGE_TYPES = ['ch0', 'ch1', 'random', 'average'] +RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast'] +EPS = 1e-8 + + +def resample(y: np.ndarray, + src_sr: int, + target_sr: int, + mode: str='kaiser_fast') -> np.ndarray: + """Audio resampling. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + src_sr (int): Source sample rate. + target_sr (int): Target sample rate. + mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'. + + Returns: + np.ndarray: `y` resampled to `target_sr` + """ + + if mode == 'kaiser_best': + warnings.warn( + f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. 
This function is pretty slow, \ + we recommend the mode kaiser_fast in large scale audio training') + + if not isinstance(y, np.ndarray): + raise ParameterError( + 'Only support numpy np.ndarray, but received y in {type(y)}') + + if mode not in RESAMPLE_MODES: + raise ParameterError(f'resample mode must in {RESAMPLE_MODES}') + + return resampy.resample(y, src_sr, target_sr, filter=mode) + + +def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray: + """Convert sterior audio to mono. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + merge_type (str, optional): Merge type to generate mono waveform. Defaults to 'average'. + + Returns: + np.ndarray: `y` with mono channel. + """ + + if merge_type not in MERGE_TYPES: + raise ParameterError( + f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}' + ) + if y.ndim > 2: + raise ParameterError( + f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}') + if y.ndim == 1: # nothing to merge + return y + + if merge_type == 'ch0': + return y[0] + if merge_type == 'ch1': + return y[1] + if merge_type == 'random': + return y[np.random.randint(0, 2)] + + # need to do averaging according to dtype + + if y.dtype == 'float32': + y_out = (y[0] + y[1]) * 0.5 + elif y.dtype == 'int16': + y_out = y.astype('int32') + y_out = (y_out[0] + y_out[1]) // 2 + y_out = np.clip(y_out, np.iinfo(y.dtype).min, + np.iinfo(y.dtype).max).astype(y.dtype) + + elif y.dtype == 'int8': + y_out = y.astype('int16') + y_out = (y_out[0] + y_out[1]) // 2 + y_out = np.clip(y_out, np.iinfo(y.dtype).min, + np.iinfo(y.dtype).max).astype(y.dtype) + else: + raise ParameterError(f'Unsupported dtype: {y.dtype}') + return y_out + + +def soundfile_load_(file: os.PathLike, + offset: Optional[float]=None, + dtype: str='int16', + duration: Optional[int]=None) -> Tuple[np.ndarray, int]: + """Load audio using soundfile library. This function load audio file using libsndfile. + + Args: + file (os.PathLike): File of waveform. + offset (Optional[float], optional): Offset to the start of waveform. Defaults to None. + dtype (str, optional): Data type of waveform. Defaults to 'int16'. + duration (Optional[int], optional): Duration of waveform to read. Defaults to None. + + Returns: + Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate. + """ + with soundfile.SoundFile(file) as sf_desc: + sr_native = sf_desc.samplerate + if offset: + sf_desc.seek(int(offset * sr_native)) + if duration is not None: + frame_duration = int(duration * sr_native) + else: + frame_duration = -1 + y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T + + return y, sf_desc.samplerate + + +def normalize(y: np.ndarray, norm_type: str='linear', + mul_factor: float=1.0) -> np.ndarray: + """Normalize an input audio with additional multiplier. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + norm_type (str, optional): Type of normalization. Defaults to 'linear'. + mul_factor (float, optional): Scaling factor. Defaults to 1.0. + + Returns: + np.ndarray: `y` after normalization. + """ + + if norm_type == 'linear': + amax = np.max(np.abs(y)) + factor = 1.0 / (amax + EPS) + y = y * factor * mul_factor + elif norm_type == 'gaussian': + amean = np.mean(y) + astd = np.std(y) + astd = max(astd, EPS) + y = mul_factor * (y - amean) / astd + else: + raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}') + + return y + + +def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None: + """Save audio file to disk. 
This function saves audio to disk using scipy.io.wavfile, with additional step to convert input waveform to int16. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + sr (int): Sample rate. + file (os.PathLike): Path of audio file to save. + """ + if not file.endswith('.wav'): + raise ParameterError( + f'only .wav file supported, but dst file name is: {file}') + + if sr <= 0: + raise ParameterError( + f'Sample rate should be larger than 0, received sr = {sr}') + + if y.dtype not in ['int16', 'int8']: + warnings.warn( + f'input data type is {y.dtype}, will convert data to int16 format before saving' + ) + y_out = depth_convert(y, 'int16') + else: + y_out = y + + wavfile.write(file, sr, y_out) + + +def soundfile_load( + file: os.PathLike, + sr: Optional[int]=None, + mono: bool=True, + merge_type: str='average', # ch0,ch1,random,average + normal: bool=True, + norm_type: str='linear', + norm_mul_factor: float=1.0, + offset: float=0.0, + duration: Optional[int]=None, + dtype: str='float32', + resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]: + """Load audio file from disk. This function loads audio from disk using using audio backend. + + Args: + file (os.PathLike): Path of audio file to load. + sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None. + mono (bool, optional): Return waveform with mono channel. Defaults to True. + merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'. + normal (bool, optional): Waveform normalization. Defaults to True. + norm_type (str, optional): Type of normalization. Defaults to 'linear'. + norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0. + offset (float, optional): Offset to the start of waveform. Defaults to 0.0. + duration (Optional[int], optional): Duration of waveform to read. Defaults to None. + dtype (str, optional): Data type of waveform. Defaults to 'float32'. + resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'. + + Returns: + Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate. + """ + + y, r = soundfile_load_(file, offset=offset, dtype=dtype, duration=duration) + + if not ((y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and len(y[0]) > 0)): + raise ParameterError(f'audio file {file} looks empty') + + if mono: + y = to_mono(y, merge_type) + + if sr is not None and sr != r: + y = resample(y, r, sr, mode=resample_mode) + r = sr + + if normal: + y = normalize(y, norm_type, norm_mul_factor) + elif dtype in ['int8', 'int16']: + # still need to do normalization, before depth conversion + y = normalize(y, 'linear', 1.0) + + y = depth_convert(y, dtype) + return y, r + + +#The code below is taken from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py, with some modifications. 
+ + +def _get_subtype_for_wav(dtype: paddle.dtype, + encoding: str, + bits_per_sample: int): + if not encoding: + if not bits_per_sample: + subtype = { + paddle.uint8: "PCM_U8", + paddle.int16: "PCM_16", + paddle.int32: "PCM_32", + paddle.float32: "FLOAT", + paddle.float64: "DOUBLE", + }.get(dtype) + if not subtype: + raise ValueError(f"Unsupported dtype for wav: {dtype}") + return subtype + if bits_per_sample == 8: + return "PCM_U8" + return f"PCM_{bits_per_sample}" + if encoding == "PCM_S": + if not bits_per_sample: + return "PCM_32" + if bits_per_sample == 8: + raise ValueError("wav does not support 8-bit signed PCM encoding.") + return f"PCM_{bits_per_sample}" + if encoding == "PCM_U": + if bits_per_sample in (None, 8): + return "PCM_U8" + raise ValueError("wav only supports 8-bit unsigned PCM encoding.") + if encoding == "PCM_F": + if bits_per_sample in (None, 32): + return "FLOAT" + if bits_per_sample == 64: + return "DOUBLE" + raise ValueError("wav only supports 32/64-bit float PCM encoding.") + if encoding == "ULAW": + if bits_per_sample in (None, 8): + return "ULAW" + raise ValueError("wav only supports 8-bit mu-law encoding.") + if encoding == "ALAW": + if bits_per_sample in (None, 8): + return "ALAW" + raise ValueError("wav only supports 8-bit a-law encoding.") + raise ValueError(f"wav does not support {encoding}.") + + +def _get_subtype_for_sphere(encoding: str, bits_per_sample: int): + if encoding in (None, "PCM_S"): + return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32" + if encoding in ("PCM_U", "PCM_F"): + raise ValueError(f"sph does not support {encoding} encoding.") + if encoding == "ULAW": + if bits_per_sample in (None, 8): + return "ULAW" + raise ValueError("sph only supports 8-bit for mu-law encoding.") + if encoding == "ALAW": + return "ALAW" + raise ValueError(f"sph does not support {encoding}.") + + +def _get_subtype(dtype: paddle.dtype, + format: str, + encoding: str, + bits_per_sample: int): + if format == "wav": + return _get_subtype_for_wav(dtype, encoding, bits_per_sample) + if format == "flac": + if encoding: + raise ValueError("flac does not support encoding.") + if not bits_per_sample: + return "PCM_16" + if bits_per_sample > 24: + raise ValueError("flac does not support bits_per_sample > 24.") + return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}" + if format in ("ogg", "vorbis"): + if encoding or bits_per_sample: + raise ValueError( + "ogg/vorbis does not support encoding/bits_per_sample.") + return "VORBIS" + if format == "sph": + return _get_subtype_for_sphere(encoding, bits_per_sample) + if format in ("nis", "nist"): + return "PCM_16" + raise ValueError(f"Unsupported format: {format}") + + +def save( + filepath: str, + src: paddle.Tensor, + sample_rate: int, + channels_first: bool=True, + compression: Optional[float]=None, + format: Optional[str]=None, + encoding: Optional[str]=None, + bits_per_sample: Optional[int]=None, ): + """Save audio data to file. + + Note: + The formats this function can handle depend on the soundfile installation. + This function is tested on the following formats; + + * WAV + + * 32-bit floating-point + * 32-bit signed integer + * 16-bit signed integer + * 8-bit unsigned integer + + * FLAC + * OGG/VORBIS + * SPHERE + + Note: + ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts + ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend, + + Args: + filepath (str or pathlib.Path): Path to audio file. 
+ src (paddle.Tensor): Audio data to save. must be 2D tensor. + sample_rate (int): sampling rate + channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`, + otherwise `[time, channel]`. + compression (float of None, optional): Not used. + It is here only for interface compatibility reason with "sox_io" backend. + format (str or None, optional): Override the audio format. + When ``filepath`` argument is path-like object, audio format is + inferred from file extension. If the file extension is missing or + different, you can specify the correct format with this argument. + + When ``filepath`` argument is file-like object, + this argument is required. + + Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``, + ``"flac"`` and ``"sph"``. + encoding (str or None, optional): Changes the encoding for supported formats. + This argument is effective only for supported formats, such as + ``"wav"``, ``""flac"`` and ``"sph"``. Valid values are: + + - ``"PCM_S"`` (signed integer Linear PCM) + - ``"PCM_U"`` (unsigned integer Linear PCM) + - ``"PCM_F"`` (floating point PCM) + - ``"ULAW"`` (mu-law) + - ``"ALAW"`` (a-law) + + bits_per_sample (int or None, optional): Changes the bit depth for the + supported formats. + When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``, + you can change the bit depth. + Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``. + + Supported formats/encodings/bit depth/compression are: + + ``"wav"`` + - 32-bit floating-point PCM + - 32-bit signed integer PCM + - 24-bit signed integer PCM + - 16-bit signed integer PCM + - 8-bit unsigned integer PCM + - 8-bit mu-law + - 8-bit a-law + + Note: + Default encoding/bit depth is determined by the dtype of + the input Tensor. + + ``"flac"`` + - 8-bit + - 16-bit (default) + - 24-bit + + ``"ogg"``, ``"vorbis"`` + - Doesn't accept changing configuration. + + ``"sph"`` + - 8-bit signed integer PCM + - 16-bit signed integer PCM + - 24-bit signed integer PCM + - 32-bit signed integer PCM (default) + - 8-bit mu-law + - 8-bit a-law + - 16-bit a-law + - 24-bit a-law + - 32-bit a-law + + """ + if src.ndim != 2: + raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.") + if compression is not None: + warnings.warn( + '`save` function of "soundfile" backend does not support "compression" parameter. ' + "The argument is silently ignored.") + if hasattr(filepath, "write"): + if format is None: + raise RuntimeError( + "`format` is required when saving to file object.") + ext = format.lower() + else: + ext = str(filepath).split(".")[-1].lower() + + if bits_per_sample not in (None, 8, 16, 24, 32, 64): + raise ValueError("Invalid bits_per_sample.") + if bits_per_sample == 24: + warnings.warn( + "Saving audio with 24 bits per sample might warp samples near -1. 
" + "Using 16 bits per sample might be able to avoid this.") + subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample) + + # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format, + # so we extend the extensions manually here + if ext in ["nis", "nist", "sph"] and format is None: + format = "NIST" + + if channels_first: + src = src.t() + + soundfile.write( + file=filepath, + data=src, + samplerate=sample_rate, + subtype=subtype, + format=format) + + +_SUBTYPE2DTYPE = { + "PCM_S8": "int8", + "PCM_U8": "uint8", + "PCM_16": "int16", + "PCM_32": "int32", + "FLOAT": "float32", + "DOUBLE": "float64", +} + + +def load( + filepath: str, + frame_offset: int=0, + num_frames: int=-1, + normalize: bool=True, + channels_first: bool=True, + format: Optional[str]=None, ) -> Tuple[paddle.Tensor, int]: + """Load audio data from file. + + Note: + The formats this function can handle depend on the soundfile installation. + This function is tested on the following formats; + + * WAV + + * 32-bit floating-point + * 32-bit signed integer + * 16-bit signed integer + * 8-bit unsigned integer + + * FLAC + * OGG/VORBIS + * SPHERE + + By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with + ``float32`` dtype and the shape of `[channel, time]`. + The samples are normalized to fit in the range of ``[-1.0, 1.0]``. + + When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit + signed integer and 8-bit unsigned integer (24-bit signed integer is not supported), + by providing ``normalize=False``, this function can return integer Tensor, where the samples + are expressed within the whole range of the corresponding dtype, that is, ``int32`` tensor + for 32-bit signed PCM, ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. + + ``normalize`` parameter has no effect on 32-bit floating-point WAV and other formats, such as + ``flac`` and ``mp3``. + For these formats, this function always returns ``float32`` Tensor with values normalized to + ``[-1.0, 1.0]``. + + Note: + ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts + ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend. + + Args: + filepath (path-like object or file-like object): + Source of audio data. + frame_offset (int, optional): + Number of frames to skip before start reading data. + num_frames (int, optional): + Maximum number of frames to read. ``-1`` reads all the remaining samples, + starting from ``frame_offset``. + This function may return the less number of frames if there is not enough + frames in the given file. + normalize (bool, optional): + When ``True``, this function always return ``float32``, and sample values are + normalized to ``[-1.0, 1.0]``. + If input file is integer WAV, giving ``False`` will change the resulting Tensor type to + integer type. + This argument has no effect for formats other than integer WAV type. + channels_first (bool, optional): + When True, the returned Tensor has dimension `[channel, time]`. + Otherwise, the returned Tensor's dimension is `[time, channel]`. + format (str or None, optional): + Not used. PySoundFile does not accept format hint. + + Returns: + (paddle.Tensor, int): Resulting Tensor and sample rate. + If the input file has integer wav format and normalization is off, then it has + integer type, else ``float32`` type. If ``channels_first=True``, it has + `[channel, time]` else `[time, channel]`. 
+ """ + with soundfile.SoundFile(filepath, "r") as file_: + if file_.format != "WAV" or normalize: + dtype = "float32" + elif file_.subtype not in _SUBTYPE2DTYPE: + raise ValueError(f"Unsupported subtype: {file_.subtype}") + else: + dtype = _SUBTYPE2DTYPE[file_.subtype] + + frames = file_._prepare_read(frame_offset, None, num_frames) + waveform = file_.read(frames, dtype, always_2d=True) + sample_rate = file_.samplerate + + waveform = paddle.to_tensor(waveform) + if channels_first: + waveform = paddle.transpose(waveform, perm=[1, 0]) + return waveform, sample_rate + + +# Mapping from soundfile subtype to number of bits per sample. +# This is mostly heuristical and the value is set to 0 when it is irrelevant +# (lossy formats) or when it can't be inferred. +# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard: +# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony, +# the default seems to be 8 bits but it can be compressed further to 4 bits. +# The dict is inspired from +# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94 +_SUBTYPE_TO_BITS_PER_SAMPLE = { + "PCM_S8": 8, # Signed 8 bit data + "PCM_16": 16, # Signed 16 bit data + "PCM_24": 24, # Signed 24 bit data + "PCM_32": 32, # Signed 32 bit data + "PCM_U8": 8, # Unsigned 8 bit data (WAV and RAW only) + "FLOAT": 32, # 32 bit float data + "DOUBLE": 64, # 64 bit float data + "ULAW": 8, # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types + "ALAW": 8, # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types + "IMA_ADPCM": 0, # IMA ADPCM. + "MS_ADPCM": 0, # Microsoft ADPCM. + "GSM610": + 0, # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate) + "VOX_ADPCM": 0, # OKI / Dialogix ADPCM + "G721_32": 0, # 32kbs G721 ADPCM encoding. + "G723_24": 0, # 24kbs G723 ADPCM encoding. + "G723_40": 0, # 40kbs G723 ADPCM encoding. + "DWVW_12": 12, # 12 bit Delta Width Variable Word encoding. + "DWVW_16": 16, # 16 bit Delta Width Variable Word encoding. + "DWVW_24": 24, # 24 bit Delta Width Variable Word encoding. + "DWVW_N": 0, # N bit Delta Width Variable Word encoding. + "DPCM_8": 8, # 8 bit differential PCM (XI only) + "DPCM_16": 16, # 16 bit differential PCM (XI only) + "VORBIS": 0, # Xiph Vorbis encoding. (lossy) + "ALAC_16": 16, # Apple Lossless Audio Codec (16 bit). + "ALAC_20": 20, # Apple Lossless Audio Codec (20 bit). + "ALAC_24": 24, # Apple Lossless Audio Codec (24 bit). + "ALAC_32": 32, # Apple Lossless Audio Codec (32 bit). +} + + +def _get_bit_depth(subtype): + if subtype not in _SUBTYPE_TO_BITS_PER_SAMPLE: + warnings.warn( + f"The {subtype} subtype is unknown to PaddleAudio. As a result, the bits_per_sample " + "attribute will be set to 0. If you are seeing this warning, please " + "report by opening an issue on github (after checking for existing/closed ones). 
" + "You may otherwise ignore this warning.") + return _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype, 0) + + +_SUBTYPE_TO_ENCODING = { + "PCM_S8": "PCM_S", + "PCM_16": "PCM_S", + "PCM_24": "PCM_S", + "PCM_32": "PCM_S", + "PCM_U8": "PCM_U", + "FLOAT": "PCM_F", + "DOUBLE": "PCM_F", + "ULAW": "ULAW", + "ALAW": "ALAW", + "VORBIS": "VORBIS", +} + + +def _get_encoding(format: str, subtype: str): + if format == "FLAC": + return "FLAC" + return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN") + + +def info(filepath: str, format: Optional[str]=None) -> AudioInfo: + """Get signal information of an audio file. + + Note: + ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts + ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend, + + Args: + filepath (path-like object or file-like object): + Source of audio data. + format (str or None, optional): + Not used. PySoundFile does not accept format hint. + + Returns: + AudioInfo: meta data of the given audio. + + """ + sinfo = soundfile.info(filepath) + return AudioInfo( + sinfo.samplerate, + sinfo.frames, + sinfo.channels, + bits_per_sample=_get_bit_depth(sinfo.subtype), + encoding=_get_encoding(sinfo.format, sinfo.subtype), ) diff --git a/paddlespeech/audio/compliance/__init__.py b/paddlespeech/audio/compliance/__init__.py new file mode 100644 index 000000000..c08f9ab11 --- /dev/null +++ b/paddlespeech/audio/compliance/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from . import kaldi +from . import librosa diff --git a/paddlespeech/audio/compliance/kaldi.py b/paddlespeech/audio/compliance/kaldi.py new file mode 100644 index 000000000..a94ec4053 --- /dev/null +++ b/paddlespeech/audio/compliance/kaldi.py @@ -0,0 +1,643 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from torchaudio(https://github.com/pytorch/audio) +import math +from typing import Tuple + +import paddle +from paddle import Tensor + +from ..functional import create_dct +from ..functional.window import get_window + +__all__ = [ + 'spectrogram', + 'fbank', + 'mfcc', +] + +# window types +HANNING = 'hann' +HAMMING = 'hamming' +POVEY = 'povey' +RECTANGULAR = 'rect' +BLACKMAN = 'blackman' + + +def _get_epsilon(dtype): + return paddle.to_tensor(1e-07, dtype=dtype) + + +def _next_power_of_2(x: int) -> int: + return 1 if x == 0 else 2**(x - 1).bit_length() + + +def _get_strided(waveform: Tensor, + window_size: int, + window_shift: int, + snip_edges: bool) -> Tensor: + assert waveform.dim() == 1 + num_samples = waveform.shape[0] + + if snip_edges: + if num_samples < window_size: + return paddle.empty((0, 0), dtype=waveform.dtype) + else: + m = 1 + (num_samples - window_size) // window_shift + else: + reversed_waveform = paddle.flip(waveform, [0]) + m = (num_samples + (window_shift // 2)) // window_shift + pad = window_size // 2 - window_shift // 2 + pad_right = reversed_waveform + if pad > 0: + pad_left = reversed_waveform[-pad:] + waveform = paddle.concat((pad_left, waveform, pad_right), axis=0) + else: + waveform = paddle.concat((waveform[-pad:], pad_right), axis=0) + + return paddle.signal.frame(waveform, window_size, window_shift)[:, :m].T + + +def _feature_window_function( + window_type: str, + window_size: int, + blackman_coeff: float, + dtype: int, ) -> Tensor: + if window_type == "hann": + return get_window('hann', window_size, fftbins=False, dtype=dtype) + elif window_type == "hamming": + return get_window('hamming', window_size, fftbins=False, dtype=dtype) + elif window_type == "povey": + return get_window( + 'hann', window_size, fftbins=False, dtype=dtype).pow(0.85) + elif window_type == "rect": + return paddle.ones([window_size], dtype=dtype) + elif window_type == "blackman": + a = 2 * math.pi / (window_size - 1) + window_function = paddle.arange(window_size, dtype=dtype) + return (blackman_coeff - 0.5 * paddle.cos(a * window_function) + + (0.5 - blackman_coeff) * paddle.cos(2 * a * window_function) + ).astype(dtype) + else: + raise Exception('Invalid window type ' + window_type) + + +def _get_log_energy(strided_input: Tensor, epsilon: Tensor, + energy_floor: float) -> Tensor: + log_energy = paddle.maximum(strided_input.pow(2).sum(1), epsilon).log() + if energy_floor == 0.0: + return log_energy + return paddle.maximum( + log_energy, + paddle.to_tensor(math.log(energy_floor), dtype=strided_input.dtype)) + + +def _get_waveform_and_window_properties( + waveform: Tensor, + channel: int, + sr: int, + frame_shift: float, + frame_length: float, + round_to_power_of_two: bool, + preemphasis_coefficient: float) -> Tuple[Tensor, int, int, int]: + channel = max(channel, 0) + assert channel < waveform.shape[0], ( + 'Invalid channel {} for size {}'.format(channel, waveform.shape[0])) + waveform = waveform[channel, :] # size (n) + window_shift = int( + sr * frame_shift * + 0.001) # pass frame_shift and frame_length in milliseconds + window_size = int(sr * frame_length * 0.001) + padded_window_size = _next_power_of_2( + window_size) if round_to_power_of_two else window_size + + assert 2 <= window_size <= len(waveform), ( + 'choose a window size {} that is [2, {}]'.format(window_size, + len(waveform))) + assert 0 < window_shift, '`window_shift` must be greater than 0' + assert padded_window_size % 2 == 0, 'the padded `window_size` must be divisible by two.' 
\ + ' use `round_to_power_of_two` or change `frame_length`' + assert 0. <= preemphasis_coefficient <= 1.0, '`preemphasis_coefficient` must be between [0,1]' + assert sr > 0, '`sr` must be greater than zero' + return waveform, window_shift, window_size, padded_window_size + + +def _get_window(waveform: Tensor, + padded_window_size: int, + window_size: int, + window_shift: int, + window_type: str, + blackman_coeff: float, + snip_edges: bool, + raw_energy: bool, + energy_floor: float, + dither: float, + remove_dc_offset: bool, + preemphasis_coefficient: float) -> Tuple[Tensor, Tensor]: + dtype = waveform.dtype + epsilon = _get_epsilon(dtype) + + # (m, window_size) + strided_input = _get_strided(waveform, window_size, window_shift, + snip_edges) + + if dither != 0.0: + x = paddle.maximum(epsilon, + paddle.rand(strided_input.shape, dtype=dtype)) + rand_gauss = paddle.sqrt(-2 * x.log()) * paddle.cos(2 * math.pi * x) + strided_input = strided_input + rand_gauss * dither + + if remove_dc_offset: + row_means = paddle.mean(strided_input, axis=1).unsqueeze(1) # (m, 1) + strided_input = strided_input - row_means + + if raw_energy: + signal_log_energy = _get_log_energy(strided_input, epsilon, + energy_floor) # (m) + + if preemphasis_coefficient != 0.0: + offset_strided_input = paddle.nn.functional.pad( + strided_input.unsqueeze(0), (1, 0), + data_format='NCL', + mode='replicate').squeeze(0) # (m, window_size + 1) + strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, : + -1] + + window_function = _feature_window_function( + window_type, window_size, blackman_coeff, + dtype).unsqueeze(0) # (1, window_size) + strided_input = strided_input * window_function # (m, window_size) + + # (m, padded_window_size) + if padded_window_size != window_size: + padding_right = padded_window_size - window_size + strided_input = paddle.nn.functional.pad( + strided_input.unsqueeze(0), (0, padding_right), + data_format='NCL', + mode='constant', + value=0).squeeze(0) + + if not raw_energy: + signal_log_energy = _get_log_energy(strided_input, epsilon, + energy_floor) # size (m) + + return strided_input, signal_log_energy + + +def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor: + if subtract_mean: + col_means = paddle.mean(tensor, axis=0).unsqueeze(0) + tensor = tensor - col_means + return tensor + + +def spectrogram(waveform: Tensor, + blackman_coeff: float=0.42, + channel: int=-1, + dither: float=0.0, + energy_floor: float=1.0, + frame_length: float=25.0, + frame_shift: float=10.0, + preemphasis_coefficient: float=0.97, + raw_energy: bool=True, + remove_dc_offset: bool=True, + round_to_power_of_two: bool=True, + sr: int=16000, + snip_edges: bool=True, + subtract_mean: bool=False, + window_type: str="povey") -> Tensor: + """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's. + + Args: + waveform (Tensor): A waveform tensor with shape `(C, T)`. + blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. + channel (int, optional): Select the channel of waveform. Defaults to -1. + dither (float, optional): Dithering constant . Defaults to 0.0. + energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0. + frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0. + frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0. + preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97. 
+ raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True. + remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True. + round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input + to FFT. Defaults to True. + sr (int, optional): Sample rate of input waveform. Defaults to 16000. + snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a signal frame when it + is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. + subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. + window_type (str, optional): Choose type of window for FFT computation. Defaults to "povey". + + Returns: + Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)` where m is the number of frames + depends on frame_length and frame_shift. + """ + dtype = waveform.dtype + epsilon = _get_epsilon(dtype) + + waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( + waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two, + preemphasis_coefficient) + + strided_input, signal_log_energy = _get_window( + waveform, padded_window_size, window_size, window_shift, window_type, + blackman_coeff, snip_edges, raw_energy, energy_floor, dither, + remove_dc_offset, preemphasis_coefficient) + + # (m, padded_window_size // 2 + 1, 2) + fft = paddle.fft.rfft(strided_input) + + power_spectrum = paddle.maximum( + fft.abs().pow(2.), epsilon).log() # (m, padded_window_size // 2 + 1) + power_spectrum[:, 0] = signal_log_energy + + power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean) + return power_spectrum + + +def _inverse_mel_scale_scalar(mel_freq: float) -> float: + return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0) + + +def _inverse_mel_scale(mel_freq: Tensor) -> Tensor: + return 700.0 * ((mel_freq / 1127.0).exp() - 1.0) + + +def _mel_scale_scalar(freq: float) -> float: + return 1127.0 * math.log(1.0 + freq / 700.0) + + +def _mel_scale(freq: Tensor) -> Tensor: + return 1127.0 * (1.0 + freq / 700.0).log() + + +def _vtln_warp_freq(vtln_low_cutoff: float, + vtln_high_cutoff: float, + low_freq: float, + high_freq: float, + vtln_warp_factor: float, + freq: Tensor) -> Tensor: + assert vtln_low_cutoff > low_freq, 'be sure to set the vtln_low option higher than low_freq' + assert vtln_high_cutoff < high_freq, 'be sure to set the vtln_high option lower than high_freq [or negative]' + l = vtln_low_cutoff * max(1.0, vtln_warp_factor) + h = vtln_high_cutoff * min(1.0, vtln_warp_factor) + scale = 1.0 / vtln_warp_factor + Fl = scale * l + Fh = scale * h + assert l > low_freq and h < high_freq + scale_left = (Fl - low_freq) / (l - low_freq) + scale_right = (high_freq - Fh) / (high_freq - h) + res = paddle.empty_like(freq) + + outside_low_high_freq = paddle.less_than(freq, paddle.to_tensor(low_freq)) \ + | paddle.greater_than(freq, paddle.to_tensor(high_freq)) + before_l = paddle.less_than(freq, paddle.to_tensor(l)) + before_h = paddle.less_than(freq, paddle.to_tensor(h)) + after_h = paddle.greater_equal(freq, paddle.to_tensor(h)) + + res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq) + res[before_h] = scale * freq[before_h] + res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq) + res[outside_low_high_freq] = freq[outside_low_high_freq] + + return res + + +def _vtln_warp_mel_freq(vtln_low_cutoff: float, + 
vtln_high_cutoff: float, + low_freq, + high_freq: float, + vtln_warp_factor: float, + mel_freq: Tensor) -> Tensor: + return _mel_scale( + _vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq, + vtln_warp_factor, _inverse_mel_scale(mel_freq))) + + +def _get_mel_banks(num_bins: int, + window_length_padded: int, + sample_freq: float, + low_freq: float, + high_freq: float, + vtln_low: float, + vtln_high: float, + vtln_warp_factor: float) -> Tuple[Tensor, Tensor]: + assert num_bins > 3, 'Must have at least 3 mel bins' + assert window_length_padded % 2 == 0 + num_fft_bins = window_length_padded / 2 + nyquist = 0.5 * sample_freq + + if high_freq <= 0.0: + high_freq += nyquist + + assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \ + ('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist)) + + fft_bin_width = sample_freq / window_length_padded + mel_low_freq = _mel_scale_scalar(low_freq) + mel_high_freq = _mel_scale_scalar(high_freq) + + mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1) + + if vtln_high < 0.0: + vtln_high += nyquist + + assert vtln_warp_factor == 1.0 or ((low_freq < vtln_low < high_freq) and + (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)), \ + ('Bad values in options: vtln-low {} and vtln-high {}, versus ' + 'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq)) + + bin = paddle.arange(num_bins, dtype=paddle.float32).unsqueeze(1) + # left_mel = mel_low_freq + bin * mel_freq_delta # (num_bins, 1) + # center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta # (num_bins, 1) + # right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # (num_bins, 1) + left_mel = mel_low_freq + bin * mel_freq_delta # (num_bins, 1) + center_mel = left_mel + mel_freq_delta + right_mel = center_mel + mel_freq_delta + + if vtln_warp_factor != 1.0: + left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, + vtln_warp_factor, left_mel) + center_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, + high_freq, vtln_warp_factor, + center_mel) + right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, + high_freq, vtln_warp_factor, right_mel) + + center_freqs = _inverse_mel_scale(center_mel) # (num_bins) + # (1, num_fft_bins) + mel = _mel_scale(fft_bin_width * paddle.arange( + num_fft_bins, dtype=paddle.float32)).unsqueeze(0) + + # (num_bins, num_fft_bins) + up_slope = (mel - left_mel) / (center_mel - left_mel) + down_slope = (right_mel - mel) / (right_mel - center_mel) + + if vtln_warp_factor == 1.0: + bins = paddle.maximum( + paddle.zeros([1]), paddle.minimum(up_slope, down_slope)) + else: + bins = paddle.zeros_like(up_slope) + up_idx = paddle.greater_than(mel, left_mel) & paddle.less_than( + mel, center_mel) + down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than( + mel, right_mel) + bins[up_idx] = up_slope[up_idx] + bins[down_idx] = down_slope[down_idx] + + return bins, center_freqs + + +def fbank(waveform: Tensor, + blackman_coeff: float=0.42, + channel: int=-1, + dither: float=0.0, + energy_floor: float=1.0, + frame_length: float=25.0, + frame_shift: float=10.0, + high_freq: float=0.0, + htk_compat: bool=False, + low_freq: float=20.0, + n_mels: int=23, + preemphasis_coefficient: float=0.97, + raw_energy: bool=True, + remove_dc_offset: bool=True, + round_to_power_of_two: bool=True, + sr: int=16000, + snip_edges: bool=True, + subtract_mean: bool=False, + use_energy: bool=False, + use_log_fbank: 
bool=True,
+          use_power: bool=True,
+          vtln_high: float=-500.0,
+          vtln_low: float=100.0,
+          vtln_warp: float=1.0,
+          window_type: str="povey") -> Tensor:
+    """Compute and return filter banks from a waveform. The output is identical to Kaldi's.
+
+    Args:
+        waveform (Tensor): A waveform tensor with shape `(C, T)`. `C` is in the range [0,1].
+        blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42.
+        channel (int, optional): Select the channel of waveform. Defaults to -1.
+        dither (float, optional): Dithering constant. Defaults to 0.0.
+        energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
+        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
+        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
+        high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
+        htk_compat (bool, optional): Put energy last (HTK convention) when it is set True. Defaults to False.
+        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
+        n_mels (int, optional): Number of output mel bins. Defaults to 23.
+        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
+        raw_energy (bool, optional): Whether to compute energy before preemphasis and windowing. Defaults to True.
+        remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
+        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
+            to FFT. Defaults to True.
+        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
+        snip_edges (bool, optional): Drop samples at the end of the waveform that can't fit a complete frame when
+            set True. Otherwise, reflect-pad the end of the waveform. Defaults to True.
+        subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
+        use_energy (bool, optional): Add a dimension with the energy of the spectrogram to the output. Defaults to False.
+        use_log_fbank (bool, optional): Return log fbank when it is set True. Defaults to True.
+        use_power (bool, optional): Whether to use power instead of magnitude. Defaults to True.
+        vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0.
+        vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0.
+        vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0.
+        window_type (str, optional): Choose type of window for FFT computation. Defaults to "povey".
+
+    Returns:
+        Tensor: A filter banks tensor with shape `(m, n_mels)`.
+    """
+    dtype = waveform.dtype
+
+    waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
+        waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two,
+        preemphasis_coefficient)
+
+    strided_input, signal_log_energy = _get_window(
+        waveform, padded_window_size, window_size, window_shift, window_type,
+        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
+        remove_dc_offset, preemphasis_coefficient)
+
+    # (m, padded_window_size // 2 + 1)
+    spectrum = paddle.fft.rfft(strided_input).abs()
+    if use_power:
+        spectrum = spectrum.pow(2.)
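+
+    # At this point `spectrum` holds the magnitude (or power, when `use_power`
+    # is True) spectrogram with shape (m, padded_window_size // 2 + 1); the mel
+    # filter bank built below is applied to it with a single matrix multiply.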
+ + # (n_mels, padded_window_size // 2) + mel_energies, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq, + high_freq, vtln_low, vtln_high, vtln_warp) + # mel_energies = mel_energies.astype(dtype) + assert mel_energies.dtype == dtype + + # (n_mels, padded_window_size // 2 + 1) + mel_energies = paddle.nn.functional.pad( + mel_energies.unsqueeze(0), (0, 1), + data_format='NCL', + mode='constant', + value=0).squeeze(0) + + # (m, n_mels) + mel_energies = paddle.mm(spectrum, mel_energies.T) + if use_log_fbank: + mel_energies = paddle.maximum(mel_energies, _get_epsilon(dtype)).log() + + if use_energy: + signal_log_energy = signal_log_energy.unsqueeze(1) + if htk_compat: + mel_energies = paddle.concat( + (mel_energies, signal_log_energy), axis=1) + else: + mel_energies = paddle.concat( + (signal_log_energy, mel_energies), axis=1) + + # (m, n_mels + 1) + mel_energies = _subtract_column_mean(mel_energies, subtract_mean) + return mel_energies + + +def _get_dct_matrix(n_mfcc: int, n_mels: int) -> Tensor: + dct_matrix = create_dct(n_mels, n_mels, 'ortho') + dct_matrix[:, 0] = math.sqrt(1 / float(n_mels)) + dct_matrix = dct_matrix[:, :n_mfcc] # (n_mels, n_mfcc) + return dct_matrix + + +def _get_lifter_coeffs(n_mfcc: int, cepstral_lifter: float) -> Tensor: + i = paddle.arange(n_mfcc) + return 1.0 + 0.5 * cepstral_lifter * paddle.sin(math.pi * i / + cepstral_lifter) + + +def mfcc(waveform: Tensor, + blackman_coeff: float=0.42, + cepstral_lifter: float=22.0, + channel: int=-1, + dither: float=0.0, + energy_floor: float=1.0, + frame_length: float=25.0, + frame_shift: float=10.0, + high_freq: float=0.0, + htk_compat: bool=False, + low_freq: float=20.0, + n_mfcc: int=13, + n_mels: int=23, + preemphasis_coefficient: float=0.97, + raw_energy: bool=True, + remove_dc_offset: bool=True, + round_to_power_of_two: bool=True, + sr: int=16000, + snip_edges: bool=True, + subtract_mean: bool=False, + use_energy: bool=False, + vtln_high: float=-500.0, + vtln_low: float=100.0, + vtln_warp: float=1.0, + window_type: str="povey") -> Tensor: + """Compute and return mel frequency cepstral coefficients from a waveform. The output is + identical to Kaldi's. + + Args: + waveform (Tensor): A waveform tensor with shape `(C, T)`. + blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. + cepstral_lifter (float, optional): Scaling of output mfccs. Defaults to 22.0. + channel (int, optional): Select the channel of waveform. Defaults to -1. + dither (float, optional): Dithering constant . Defaults to 0.0. + energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0. + frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0. + frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0. + high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0. + htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False. + low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0. + n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 13. + n_mels (int, optional): Number of output mel bins. Defaults to 23. + preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97. + raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True. + remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True. 
+ round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input + to FFT. Defaults to True. + sr (int, optional): Sample rate of input waveform. Defaults to 16000. + snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a signal frame when it + is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. + subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. + use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False. + vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0. + vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0. + vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0. + window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. + + Returns: + Tensor: A mel frequency cepstral coefficients tensor with shape `(m, n_mfcc)`. + """ + assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( + n_mfcc, n_mels) + + dtype = waveform.dtype + + # (m, n_mels + use_energy) + feature = fbank( + waveform=waveform, + blackman_coeff=blackman_coeff, + channel=channel, + dither=dither, + energy_floor=energy_floor, + frame_length=frame_length, + frame_shift=frame_shift, + high_freq=high_freq, + htk_compat=htk_compat, + low_freq=low_freq, + n_mels=n_mels, + preemphasis_coefficient=preemphasis_coefficient, + raw_energy=raw_energy, + remove_dc_offset=remove_dc_offset, + round_to_power_of_two=round_to_power_of_two, + sr=sr, + snip_edges=snip_edges, + subtract_mean=False, + use_energy=use_energy, + use_log_fbank=True, + use_power=True, + vtln_high=vtln_high, + vtln_low=vtln_low, + vtln_warp=vtln_warp, + window_type=window_type) + + if use_energy: + # (m) + signal_log_energy = feature[:, n_mels if htk_compat else 0] + mel_offset = int(not htk_compat) + feature = feature[:, mel_offset:(n_mels + mel_offset)] + + # (n_mels, n_mfcc) + dct_matrix = _get_dct_matrix(n_mfcc, n_mels).astype(dtype=dtype) + + # (m, n_mfcc) + feature = feature.matmul(dct_matrix) + + if cepstral_lifter != 0.0: + # (1, n_mfcc) + lifter_coeffs = _get_lifter_coeffs(n_mfcc, cepstral_lifter).unsqueeze(0) + feature *= lifter_coeffs.astype(dtype=dtype) + + if use_energy: + feature[:, 0] = signal_log_energy + + if htk_compat: + energy = feature[:, 0].unsqueeze(1) # (m, 1) + feature = feature[:, 1:] # (m, n_mfcc - 1) + if not use_energy: + energy *= math.sqrt(2) + + feature = paddle.concat((feature, energy), axis=1) + + feature = _subtract_column_mean(feature, subtract_mean) + return feature diff --git a/paddlespeech/audio/compliance/librosa.py b/paddlespeech/audio/compliance/librosa.py new file mode 100644 index 000000000..c671d4fb8 --- /dev/null +++ b/paddlespeech/audio/compliance/librosa.py @@ -0,0 +1,788 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from librosa(https://github.com/librosa/librosa)
+import warnings
+from typing import List
+from typing import Optional
+from typing import Union
+
+import numpy as np
+import scipy
+from numpy.lib.stride_tricks import as_strided
+from scipy import signal
+
+from ..utils import depth_convert
+from ..utils import ParameterError
+
+__all__ = [
+    # dsp
+    'stft',
+    'mfcc',
+    'hz_to_mel',
+    'mel_to_hz',
+    'mel_frequencies',
+    'power_to_db',
+    'compute_fbank_matrix',
+    'melspectrogram',
+    'spectrogram',
+    'mu_encode',
+    'mu_decode',
+    # augmentation
+    'depth_augment',
+    'spect_augment',
+    'random_crop1d',
+    'random_crop2d',
+    'adaptive_spect_augment',
+]
+
+
+def _pad_center(data: np.ndarray, size: int, axis: int=-1,
+                **kwargs) -> np.ndarray:
+    """Pad an array to a target length along a target axis.
+
+    This differs from `np.pad` by centering the data prior to padding,
+    analogous to `str.center`
+    """
+
+    kwargs.setdefault("mode", "constant")
+    n = data.shape[axis]
+    lpad = int((size - n) // 2)
+    lengths = [(0, 0)] * data.ndim
+    lengths[axis] = (lpad, int(size - n - lpad))
+
+    if lpad < 0:
+        raise ParameterError(f"Target size ({size:d}) must be "
+                             f"at least input size ({n:d})")
+
+    return np.pad(data, lengths, **kwargs)
+
+
+def _split_frames(x: np.ndarray,
+                  frame_length: int,
+                  hop_length: int,
+                  axis: int=-1) -> np.ndarray:
+    """Slice a data array into (overlapping) frames.
+
+    This function is aligned with librosa.frame
+    """
+
+    if not isinstance(x, np.ndarray):
+        raise ParameterError(
+            f"Input must be of type numpy.ndarray, given type(x)={type(x)}")
+
+    if x.shape[axis] < frame_length:
+        raise ParameterError(f"Input is too short (n={x.shape[axis]:d})"
+                             f" for frame_length={frame_length:d}")
+
+    if hop_length < 1:
+        raise ParameterError(f"Invalid hop_length: {hop_length:d}")
+
+    if axis == -1 and not x.flags["F_CONTIGUOUS"]:
+        warnings.warn(f"librosa.util.frame called with axis={axis} "
+                      "on a non-contiguous input. This will result in a copy.")
+        x = np.asfortranarray(x)
+    elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
+        warnings.warn(f"librosa.util.frame called with axis={axis} "
+                      "on a non-contiguous input. This will result in a copy.")
+        x = np.ascontiguousarray(x)
+
+    n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
+    strides = np.asarray(x.strides)
+
+    new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize
+
+    if axis == -1:
+        shape = list(x.shape)[:-1] + [frame_length, n_frames]
+        strides = list(strides) + [hop_length * new_stride]
+
+    elif axis == 0:
+        shape = [n_frames, frame_length] + list(x.shape)[1:]
+        strides = [hop_length * new_stride] + list(strides)
+
+    else:
+        raise ParameterError(f"Frame axis={axis} must be either 0 or -1")
+
+    return as_strided(x, shape=shape, strides=strides)
+
+
+def _check_audio(y, mono=True) -> bool:
+    """Determine whether a variable contains valid audio data.
+
+    The audio y must be a np.ndarray, with either one channel or two channels.
+    """
+    if not isinstance(y, np.ndarray):
+        raise ParameterError("Audio data must be of type numpy.ndarray")
+    if y.ndim > 2:
+        raise ParameterError(
+            f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}")
+
+    if mono and y.ndim == 2:
+        raise ParameterError(
+            f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}")
+
+    if (mono and len(y) == 0) or (not mono and y.shape[1] < 1):
+        raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}")
+
+    if not np.issubdtype(y.dtype, np.floating):
+        raise ParameterError("Audio data must be floating-point")
+
+    if not np.isfinite(y).all():
+        raise ParameterError("Audio buffer is not finite everywhere")
+
+    return True
+
+
+def hz_to_mel(frequencies: Union[float, List[float], np.ndarray],
+              htk: bool=False) -> np.ndarray:
+    """Convert Hz to Mels.
+
+    Args:
+        frequencies (Union[float, List[float], np.ndarray]): Frequencies in Hz.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+
+    Returns:
+        np.ndarray: Frequency in mels.
+    """
+    freq = np.asanyarray(frequencies)
+
+    if htk:
+        return 2595.0 * np.log10(1.0 + freq / 700.0)
+
+    # Fill in the linear part
+    f_min = 0.0
+    f_sp = 200.0 / 3
+
+    mels = (freq - f_min) / f_sp
+
+    # Fill in the log-scale part
+
+    min_log_hz = 1000.0  # beginning of log region (Hz)
+    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
+    logstep = np.log(6.4) / 27.0  # step size for log region
+
+    if freq.ndim:
+        # If we have array data, vectorize
+        log_t = freq >= min_log_hz
+        mels[log_t] = min_log_mel + \
+            np.log(freq[log_t] / min_log_hz) / logstep
+    elif freq >= min_log_hz:
+        # If we have scalar data, check directly
+        mels = min_log_mel + np.log(freq / min_log_hz) / logstep
+
+    return mels
+
+
+def mel_to_hz(mels: Union[float, List[float], np.ndarray],
+              htk: bool=False) -> np.ndarray:
+    """Convert mel bin numbers to frequencies.
+
+    Args:
+        mels (Union[float, List[float], np.ndarray]): Frequency in mels.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+
+    Returns:
+        np.ndarray: Frequencies in Hz.
+    """
+    mel_array = np.asanyarray(mels)
+
+    if htk:
+        return 700.0 * (10.0**(mel_array / 2595.0) - 1.0)
+
+    # Fill in the linear scale
+    f_min = 0.0
+    f_sp = 200.0 / 3
+    freqs = f_min + f_sp * mel_array
+
+    # And now the nonlinear scale
+    min_log_hz = 1000.0  # beginning of log region (Hz)
+    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
+    logstep = np.log(6.4) / 27.0  # step size for log region
+
+    if mel_array.ndim:
+        # If we have vector data, vectorize
+        log_t = mel_array >= min_log_mel
+        freqs[log_t] = min_log_hz * \
+            np.exp(logstep * (mel_array[log_t] - min_log_mel))
+    elif mel_array >= min_log_mel:
+        # If we have scalar data, check directly
+        freqs = min_log_hz * np.exp(logstep * (mel_array - min_log_mel))
+
+    return freqs
+
+
+def mel_frequencies(n_mels: int=128,
+                    fmin: float=0.0,
+                    fmax: float=11025.0,
+                    htk: bool=False) -> np.ndarray:
+    """Compute mel frequencies.
+
+    Args:
+        n_mels (int, optional): Number of mel bins. Defaults to 128.
+        fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
+        fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+
+    Returns:
+        np.ndarray: Vector of n_mels frequencies in Hz with shape `(n_mels,)`.
+ """ + # 'Center freqs' of mel bands - uniformly spaced between limits + min_mel = hz_to_mel(fmin, htk=htk) + max_mel = hz_to_mel(fmax, htk=htk) + + mels = np.linspace(min_mel, max_mel, n_mels) + + return mel_to_hz(mels, htk=htk) + + +def fft_frequencies(sr: int, n_fft: int) -> np.ndarray: + """Compute fourier frequencies. + + Args: + sr (int): Sample rate. + n_fft (int): FFT size. + + Returns: + np.ndarray: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. + """ + return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True) + + +def compute_fbank_matrix(sr: int, + n_fft: int, + n_mels: int=128, + fmin: float=0.0, + fmax: Optional[float]=None, + htk: bool=False, + norm: str="slaney", + dtype: type=np.float32) -> np.ndarray: + """Compute fbank matrix. + + Args: + sr (int): Sample rate. + n_fft (int): FFT size. + n_mels (int, optional): Number of mel bins. Defaults to 128. + fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0. + fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use htk scaling. Defaults to False. + norm (str, optional): Type of normalization. Defaults to "slaney". + dtype (type, optional): Data type. Defaults to np.float32. + + + Returns: + np.ndarray: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. + """ + if norm != "slaney": + raise ParameterError('norm must set to slaney') + + if fmax is None: + fmax = float(sr) / 2 + + # Initialize the weights + n_mels = int(n_mels) + weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) + + # Center freqs of each FFT bin + fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft) + + # 'Center freqs' of mel bands - uniformly spaced between limits + mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk) + + fdiff = np.diff(mel_f) + ramps = np.subtract.outer(mel_f, fftfreqs) + + for i in range(n_mels): + # lower and upper slopes for all bins + lower = -ramps[i] / fdiff[i] + upper = ramps[i + 2] / fdiff[i + 1] + + # .. then intersect them with each other and zero + weights[i] = np.maximum(0, np.minimum(lower, upper)) + + if norm == "slaney": + # Slaney-style mel is scaled to be approx constant energy per channel + enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels]) + weights *= enorm[:, np.newaxis] + + # Only check weights if f_mel[0] is positive + if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)): + # This means we have an empty channel somewhere + warnings.warn("Empty filters detected in mel frequency basis. " + "Some channels will produce empty responses. " + "Try increasing your sampling rate (and fmax) or " + "reducing n_mels.") + + return weights + + +def stft(x: np.ndarray, + n_fft: int=2048, + hop_length: Optional[int]=None, + win_length: Optional[int]=None, + window: str="hann", + center: bool=True, + dtype: type=np.complex64, + pad_mode: str="reflect") -> np.ndarray: + """Short-time Fourier transform (STFT). + + Args: + x (np.ndarray): Input waveform in one dimension. + n_fft (int, optional): FFT size. Defaults to 2048. + hop_length (Optional[int], optional): Number of steps to advance between adjacent windows. Defaults to None. + win_length (Optional[int], optional): The size of window. Defaults to None. + window (str, optional): A string of window specification. Defaults to "hann". + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. + dtype (type, optional): Data type of STFT results. Defaults to np.complex64. 
+ pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". + + Returns: + np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`. + """ + _check_audio(x) + + # By default, use the entire frame + if win_length is None: + win_length = n_fft + + # Set the default hop, if it's not already specified + if hop_length is None: + hop_length = int(win_length // 4) + + fft_window = signal.get_window(window, win_length, fftbins=True) + + # Pad the window out to n_fft size + fft_window = _pad_center(fft_window, n_fft) + + # Reshape so that the window can be broadcast + fft_window = fft_window.reshape((-1, 1)) + + # Pad the time series so that frames are centered + if center: + if n_fft > x.shape[-1]: + warnings.warn( + f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}" + ) + x = np.pad(x, int(n_fft // 2), mode=pad_mode) + + elif n_fft > x.shape[-1]: + raise ParameterError( + f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}" + ) + + # Window the time series. + x_frames = _split_frames(x, frame_length=n_fft, hop_length=hop_length) + # Pre-allocate the STFT matrix + stft_matrix = np.empty( + (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F") + fft = np.fft # use numpy fft as default + # Constrain STFT block sizes to 256 KB + MAX_MEM_BLOCK = 2**8 * 2**10 + # how many columns can we fit within MAX_MEM_BLOCK? + n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize) + n_columns = max(n_columns, 1) + + for bl_s in range(0, stft_matrix.shape[1], n_columns): + bl_t = min(bl_s + n_columns, stft_matrix.shape[1]) + stft_matrix[:, bl_s:bl_t] = fft.rfft( + fft_window * x_frames[:, bl_s:bl_t], axis=0) + + return stft_matrix + + +def power_to_db(spect: np.ndarray, + ref: float=1.0, + amin: float=1e-10, + top_db: Optional[float]=80.0) -> np.ndarray: + """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way. + + Args: + spect (np.ndarray): STFT power spectrogram of an input waveform. + ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): Minimum threshold. Defaults to 1e-10. + top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to 80.0. + + Returns: + np.ndarray: Power spectrogram in db scale. + """ + spect = np.asarray(spect) + + if amin <= 0: + raise ParameterError("amin must be strictly positive") + + if np.issubdtype(spect.dtype, np.complexfloating): + warnings.warn( + "power_to_db was called on complex input so phase " + "information will be discarded. 
To suppress this warning, " + "call power_to_db(np.abs(D)**2) instead.") + magnitude = np.abs(spect) + else: + magnitude = spect + + if callable(ref): + # User supplied a function to calculate reference power + ref_value = ref(magnitude) + else: + ref_value = np.abs(ref) + + log_spec = 10.0 * np.log10(np.maximum(amin, magnitude)) + log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value)) + + if top_db is not None: + if top_db < 0: + raise ParameterError("top_db must be non-negative") + log_spec = np.maximum(log_spec, log_spec.max() - top_db) + + return log_spec + + +def mfcc(x: np.ndarray, + sr: int=16000, + spect: Optional[np.ndarray]=None, + n_mfcc: int=20, + dct_type: int=2, + norm: str="ortho", + lifter: int=0, + **kwargs) -> np.ndarray: + """Mel-frequency cepstral coefficients (MFCCs) + + Args: + x (np.ndarray): Input waveform in one dimension. + sr (int, optional): Sample rate. Defaults to 16000. + spect (Optional[np.ndarray], optional): Input log-power Mel spectrogram. Defaults to None. + n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 20. + dct_type (int, optional): Discrete cosine transform (DCT) type. Defaults to 2. + norm (str, optional): Type of normalization. Defaults to "ortho". + lifter (int, optional): Cepstral filtering. Defaults to 0. + + Returns: + np.ndarray: Mel frequency cepstral coefficients array with shape `(n_mfcc, num_frames)`. + """ + if spect is None: + spect = melspectrogram(x, sr=sr, **kwargs) + + M = scipy.fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc] + + if lifter > 0: + factor = np.sin(np.pi * np.arange(1, 1 + n_mfcc, dtype=M.dtype) / + lifter) + return M * factor[:, np.newaxis] + elif lifter == 0: + return M + else: + raise ParameterError( + f"MFCC lifter={lifter} must be a non-negative number") + + +def melspectrogram(x: np.ndarray, + sr: int=16000, + window_size: int=512, + hop_length: int=320, + n_mels: int=64, + fmin: float=50.0, + fmax: Optional[float]=None, + window: str='hann', + center: bool=True, + pad_mode: str='reflect', + power: float=2.0, + to_db: bool=True, + ref: float=1.0, + amin: float=1e-10, + top_db: Optional[float]=None) -> np.ndarray: + """Compute mel-spectrogram. + + Args: + x (np.ndarray): Input waveform in one dimension. + sr (int, optional): Sample rate. Defaults to 16000. + window_size (int, optional): Size of FFT and window length. Defaults to 512. + hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320. + n_mels (int, optional): Number of mel bins. Defaults to 64. + fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0. + fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + window (str, optional): A string of window specification. Defaults to "hann". + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". + power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0. + to_db (bool, optional): Enable db scale. Defaults to True. + ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): Minimum threshold. Defaults to 1e-10. + top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None. 
+
+    Returns:
+        np.ndarray: The mel-spectrogram in power scale or db scale with shape `(n_mels, num_frames)`.
+    """
+    _check_audio(x, mono=True)
+    if len(x) <= 0:
+        raise ParameterError('The input waveform is empty')
+
+    if fmax is None:
+        fmax = sr // 2
+    if fmin < 0 or fmin >= fmax:
+        raise ParameterError('fmin and fmax must satisfy 0 < fmin < fmax')
+
+    s = stft(
+        x,
+        n_fft=window_size,
+        hop_length=hop_length,
+        win_length=window_size,
+        window=window,
+        center=center,
+        pad_mode=pad_mode)
+
+    spect = np.abs(s)**power
+
+    # Build the mel filter bank and project the power spectrogram onto it.
+    fb_matrix = compute_fbank_matrix(
+        sr=sr, n_fft=window_size, n_mels=n_mels, fmin=fmin, fmax=fmax)
+    mel_spect = np.matmul(fb_matrix, spect)
+    if to_db:
+        return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
+    else:
+        return mel_spect
+
+
+def spectrogram(x: np.ndarray,
+                sr: int=16000,
+                window_size: int=512,
+                hop_length: int=320,
+                window: str='hann',
+                center: bool=True,
+                pad_mode: str='reflect',
+                power: float=2.0) -> np.ndarray:
+    """Compute spectrogram.
+
+    Args:
+        x (np.ndarray): Input waveform in one dimension.
+        sr (int, optional): Sample rate. Defaults to 16000.
+        window_size (int, optional): Size of FFT and window length. Defaults to 512.
+        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
+        window (str, optional): A string of window specification. Defaults to "hann".
+        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at `x[t * hop_length]`. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
+        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
+
+    Returns:
+        np.ndarray: The STFT spectrogram in power scale `(n_fft//2 + 1, num_frames)`.
+    """
+
+    s = stft(
+        x,
+        n_fft=window_size,
+        hop_length=hop_length,
+        win_length=window_size,
+        window=window,
+        center=center,
+        pad_mode=pad_mode)
+
+    return np.abs(s)**power
+
+
+def mu_encode(x: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
+    """Mu-law encoding. Encode waveform based on mu-law companding. When quantized is True, the result will be converted to integer in range `[0,mu-1]`. Otherwise, the resulting waveform is in range `[-1,1]`.
+
+    Args:
+        x (np.ndarray): The input waveform to encode.
+        mu (int, optional): The encoding parameter. Defaults to 255.
+        quantized (bool, optional): If `True`, quantize the encoded values into `1 + mu` distinct integer values. Defaults to True.
+
+    Returns:
+        np.ndarray: The mu-law encoded waveform.
+    """
+    # mu-law companding; use the `mu` argument rather than a hard-coded 255
+    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
+    if quantized:
+        y = np.floor((y + 1) / 2 * mu + 0.5)  # convert to [0, mu-1]
+    return y
+
+
+def mu_decode(y: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
+    """Mu-law decoding. Compute the mu-law decoding given an input code. It assumes that the input `y` is in range `[0,mu-1]` when `quantized` is True and `[-1,1]` otherwise.
+
+    Args:
+        y (np.ndarray): The encoded waveform.
+        mu (int, optional): The encoding parameter. Defaults to 255.
+        quantized (bool, optional): If `True`, the input is assumed to be quantized to `1 + mu` distinct integer values. Defaults to True.
+
+    Returns:
+        np.ndarray: The mu-law decoded waveform.
+    """
+    if mu < 1:
+        raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')
+
+    mu = mu - 1
+    if quantized:  # undo the quantization
+        y = y * 2 / mu - 1
+    x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
+    return x
+
+
+def _randint(high: int) -> int:
+    """Generate one random integer in range [0, high)
+
+    This is a helper function for random data augmentation
+    """
+    return int(np.random.randint(0, high=high))
+
+
+def depth_augment(y: np.ndarray,
+                  choices: List=['int8', 'int16'],
+                  probs: List[float]=[0.5, 0.5]) -> np.ndarray:
+    """Audio depth augmentation: simulate the distortion brought by quantization by converting the waveform to a lower bit depth and back.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        choices (List, optional): A list of data types for depth conversion. Defaults to ['int8', 'int16'].
+        probs (List[float], optional): Probabilities of each depth conversion. Defaults to [0.5, 0.5].
+
+    Returns:
+        np.ndarray: The augmented waveform.
+    """
+    assert len(probs) == len(
+        choices
+    ), 'number of choices {} must be equal to size of probs {}'.format(
+        len(choices), len(probs))
+    depth = np.random.choice(choices, p=probs)
+    src_depth = y.dtype
+    y1 = depth_convert(y, depth)
+    y2 = depth_convert(y1, src_depth)
+
+    return y2
+
+
+def adaptive_spect_augment(spect: np.ndarray,
+                           tempo_axis: int=0,
+                           level: float=0.1) -> np.ndarray:
+    """Do adaptive spectrogram augmentation. The level of the augmentation is governed by the parameter `level`, ranging from 0 to 1, where 0 means no augmentation.
+
+    Args:
+        spect (np.ndarray): Input spectrogram.
+        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
+        level (float, optional): The level factor of masking. Defaults to 0.1.
+
+    Returns:
+        np.ndarray: The augmented spectrogram.
+    """
+    assert spect.ndim == 2, 'only supports 2d tensor or numpy array'
+    if tempo_axis == 0:
+        nt, nf = spect.shape
+    else:
+        nf, nt = spect.shape
+
+    time_mask_width = int(nt * level * 0.5)
+    freq_mask_width = int(nf * level * 0.5)
+
+    num_time_mask = int(10 * level)
+    num_freq_mask = int(10 * level)
+
+    if tempo_axis == 0:
+        for _ in range(num_time_mask):
+            start = _randint(nt - time_mask_width)
+            spect[start:start + time_mask_width, :] = 0
+        for _ in range(num_freq_mask):
+            start = _randint(nf - freq_mask_width)
+            spect[:, start:start + freq_mask_width] = 0
+    else:
+        for _ in range(num_time_mask):
+            start = _randint(nt - time_mask_width)
+            spect[:, start:start + time_mask_width] = 0
+        for _ in range(num_freq_mask):
+            start = _randint(nf - freq_mask_width)
+            spect[start:start + freq_mask_width, :] = 0
+
+    return spect
+
+
+def spect_augment(spect: np.ndarray,
+                  tempo_axis: int=0,
+                  max_time_mask: int=3,
+                  max_freq_mask: int=3,
+                  max_time_mask_width: int=30,
+                  max_freq_mask_width: int=20) -> np.ndarray:
+    """Do spectrogram augmentation in both time and frequency axes.
+
+    Args:
+        spect (np.ndarray): Input spectrogram.
+        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
+        max_time_mask (int, optional): Maximum number of time masking. Defaults to 3.
+        max_freq_mask (int, optional): Maximum number of frequency masking. Defaults to 3.
+        max_time_mask_width (int, optional): Maximum width of time masking. Defaults to 30.
+        max_freq_mask_width (int, optional): Maximum width of frequency masking. Defaults to 20.
+
+    Returns:
+        np.ndarray: The augmented spectrogram.
+    """
+    assert spect.ndim == 2, 'only supports 2d tensor or numpy array'
+    if tempo_axis == 0:
+        nt, nf = spect.shape
+    else:
+        nf, nt = spect.shape
+
+    num_time_mask = _randint(max_time_mask)
+    num_freq_mask = _randint(max_freq_mask)
+
+    time_mask_width = _randint(max_time_mask_width)
+    freq_mask_width = _randint(max_freq_mask_width)
+
+    if tempo_axis == 0:
+        for _ in range(num_time_mask):
+            start = _randint(nt - time_mask_width)
+            spect[start:start + time_mask_width, :] = 0
+        for _ in range(num_freq_mask):
+            start = _randint(nf - freq_mask_width)
+            spect[:, start:start + freq_mask_width] = 0
+    else:
+        for _ in range(num_time_mask):
+            start = _randint(nt - time_mask_width)
+            spect[:, start:start + time_mask_width] = 0
+        for _ in range(num_freq_mask):
+            start = _randint(nf - freq_mask_width)
+            spect[start:start + freq_mask_width, :] = 0
+
+    return spect
+
+
+def random_crop1d(y: np.ndarray, crop_len: int) -> np.ndarray:
+    """ Random cropping on an input waveform.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D.
+        crop_len (int): Length of waveform to crop.
+
+    Returns:
+        np.ndarray: The cropped waveform.
+    """
+    if y.ndim != 1:
+        raise ParameterError('only accept 1d tensor or numpy array')
+    n = len(y)
+    idx = _randint(n - crop_len)
+    return y[idx:idx + crop_len]
+
+
+def random_crop2d(s: np.ndarray, crop_len: int,
+                  tempo_axis: int=0) -> np.ndarray:
+    """ Random cropping on a spectrogram.
+
+    Args:
+        s (np.ndarray): Input spectrogram in 2D.
+        crop_len (int): Length of spectrogram to crop.
+        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
+
+    Returns:
+        np.ndarray: The cropped spectrogram.
+    """
+    if tempo_axis >= s.ndim:
+        raise ParameterError('axis out of range')
+
+    n = s.shape[tempo_axis]
+    idx = _randint(high=n - crop_len)
+    sli = [slice(None) for i in range(s.ndim)]
+    sli[tempo_axis] = slice(idx, idx + crop_len)
+    out = s[tuple(sli)]
+    return out
diff --git a/paddlespeech/audio/datasets/__init__.py b/paddlespeech/audio/datasets/__init__.py
new file mode 100644
index 000000000..8068fa9d3
--- /dev/null
+++ b/paddlespeech/audio/datasets/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .esc50 import ESC50
+from .voxceleb import VoxCeleb
diff --git a/paddlespeech/audio/datasets/dataset.py b/paddlespeech/audio/datasets/dataset.py
new file mode 100644
index 000000000..170e91669
--- /dev/null
+++ b/paddlespeech/audio/datasets/dataset.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+
+import numpy as np
+import paddle
+
+from ..backends.soundfile_backend import soundfile_load as load_audio
+from ..compliance.kaldi import fbank as kaldi_fbank
+from ..compliance.kaldi import mfcc as kaldi_mfcc
+from ..compliance.librosa import melspectrogram
+from ..compliance.librosa import mfcc
+
+feat_funcs = {
+    'raw': None,
+    'melspectrogram': melspectrogram,
+    'mfcc': mfcc,
+    'kaldi_fbank': kaldi_fbank,
+    'kaldi_mfcc': kaldi_mfcc,
+}
+
+
+class AudioClassificationDataset(paddle.io.Dataset):
+    """
+    Base class of audio classification dataset.
+    """
+
+    def __init__(self,
+                 files: List[str],
+                 labels: List[int],
+                 feat_type: str='raw',
+                 sample_rate: int=None,
+                 **kwargs):
+        """
+        Args:
+            files (:obj:`List[str]`): A list of absolute paths of audio files.
+            labels (:obj:`List[int]`): Labels of audio files.
+ feat_type (:obj:`str`, `optional`, defaults to `raw`): + It identifies the feature type that user wants to extract of an audio file. + """ + super(AudioClassificationDataset, self).__init__() + + if feat_type not in feat_funcs.keys(): + raise RuntimeError( + f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}" + ) + + self.files = files + self.labels = labels + + self.feat_type = feat_type + self.sample_rate = sample_rate + self.feat_config = kwargs # Pass keyword arguments to customize feature config + + def _get_data(self, input_file: str): + raise NotImplementedError + + def _convert_to_record(self, idx): + file, label = self.files[idx], self.labels[idx] + + if self.sample_rate is None: + waveform, sample_rate = load_audio(file) + else: + waveform, sample_rate = load_audio(file, sr=self.sample_rate) + + feat_func = feat_funcs[self.feat_type] + + record = {} + if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']: + waveform = paddle.to_tensor(waveform).unsqueeze(0) # (C, T) + record['feat'] = feat_func( + waveform=waveform, sr=self.sample_rate, **self.feat_config) + else: + record['feat'] = feat_func( + waveform, sample_rate, + **self.feat_config) if feat_func else waveform + record['label'] = label + return record + + def __getitem__(self, idx): + record = self._convert_to_record(idx) + if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']: + return self.keys[idx], record['feat'], record['label'] + else: + return np.array(record['feat']).transpose(), np.array( + record['label'], dtype=np.int64) + + def __len__(self): + return len(self.files) diff --git a/paddlespeech/audio/datasets/esc50.py b/paddlespeech/audio/datasets/esc50.py new file mode 100644 index 000000000..684a8b8f5 --- /dev/null +++ b/paddlespeech/audio/datasets/esc50.py @@ -0,0 +1,152 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +import os +from typing import List +from typing import Tuple + +from ...utils.env import DATA_HOME +from ..utils.download import download_and_decompress +from .dataset import AudioClassificationDataset + +__all__ = ['ESC50'] + + +class ESC50(AudioClassificationDataset): + """ + The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings + suitable for benchmarking methods of environmental sound classification. 
The dataset + consists of 5-second-long recordings organized into 50 semantical classes (with + 40 examples per class) + + Reference: + ESC: Dataset for Environmental Sound Classification + http://dx.doi.org/10.1145/2733373.2806390 + """ + + archieves = [ + { + 'url': + 'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip', + 'md5': '7771e4b9d86d0945acce719c7a59305a', + }, + ] + label_list = [ + # Animals + 'Dog', + 'Rooster', + 'Pig', + 'Cow', + 'Frog', + 'Cat', + 'Hen', + 'Insects (flying)', + 'Sheep', + 'Crow', + # Natural soundscapes & water sounds + 'Rain', + 'Sea waves', + 'Crackling fire', + 'Crickets', + 'Chirping birds', + 'Water drops', + 'Wind', + 'Pouring water', + 'Toilet flush', + 'Thunderstorm', + # Human, non-speech sounds + 'Crying baby', + 'Sneezing', + 'Clapping', + 'Breathing', + 'Coughing', + 'Footsteps', + 'Laughing', + 'Brushing teeth', + 'Snoring', + 'Drinking, sipping', + # Interior/domestic sounds + 'Door knock', + 'Mouse click', + 'Keyboard typing', + 'Door, wood creaks', + 'Can opening', + 'Washing machine', + 'Vacuum cleaner', + 'Clock alarm', + 'Clock tick', + 'Glass breaking', + # Exterior/urban noises + 'Helicopter', + 'Chainsaw', + 'Siren', + 'Car horn', + 'Engine', + 'Train', + 'Church bells', + 'Airplane', + 'Fireworks', + 'Hand saw', + ] + meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv') + meta_info = collections.namedtuple( + 'META_INFO', + ('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take')) + audio_path = os.path.join('ESC-50-master', 'audio') + + def __init__(self, + mode: str='train', + split: int=1, + feat_type: str='raw', + **kwargs): + """ + Ags: + mode (:obj:`str`, `optional`, defaults to `train`): + It identifies the dataset mode (train or dev). + split (:obj:`int`, `optional`, defaults to 1): + It specify the fold of dev dataset. + feat_type (:obj:`str`, `optional`, defaults to `raw`): + It identifies the feature type that user wants to extract of an audio file. + """ + files, labels = self._get_data(mode, split) + super(ESC50, self).__init__( + files=files, labels=labels, feat_type=feat_type, **kwargs) + + def _get_meta_info(self) -> List[collections.namedtuple]: + ret = [] + with open(os.path.join(DATA_HOME, self.meta), 'r') as rf: + for line in rf.readlines()[1:]: + ret.append(self.meta_info(*line.strip().split(','))) + return ret + + def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]: + if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ + not os.path.isfile(os.path.join(DATA_HOME, self.meta)): + download_and_decompress(self.archieves, DATA_HOME) + + meta_info = self._get_meta_info() + + files = [] + labels = [] + for sample in meta_info: + filename, fold, target, _, _, _, _ = sample + if mode == 'train' and int(fold) != split: + files.append(os.path.join(DATA_HOME, self.audio_path, filename)) + labels.append(int(target)) + + if mode != 'train' and int(fold) == split: + files.append(os.path.join(DATA_HOME, self.audio_path, filename)) + labels.append(int(target)) + + return files, labels diff --git a/paddlespeech/audio/datasets/voxceleb.py b/paddlespeech/audio/datasets/voxceleb.py new file mode 100644 index 000000000..4daa6bf6f --- /dev/null +++ b/paddlespeech/audio/datasets/voxceleb.py @@ -0,0 +1,356 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +import csv +import glob +import os +import random +from multiprocessing import cpu_count +from typing import List + +from paddle.io import Dataset +from pathos.multiprocessing import Pool +from tqdm import tqdm + +from ...utils.env import DATA_HOME +from ..backends.soundfile_backend import soundfile_load as load_audio +from ..utils.download import decompress +from ..utils.download import download_and_decompress +from .dataset import feat_funcs + +__all__ = ['VoxCeleb'] + + +class VoxCeleb(Dataset): + source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/' + archieves_audio_dev = [ + { + 'url': source_url + 'vox1_dev_wav_partaa', + 'md5': 'e395d020928bc15670b570a21695ed96', + }, + { + 'url': source_url + 'vox1_dev_wav_partab', + 'md5': 'bbfaaccefab65d82b21903e81a8a8020', + }, + { + 'url': source_url + 'vox1_dev_wav_partac', + 'md5': '017d579a2a96a077f40042ec33e51512', + }, + { + 'url': source_url + 'vox1_dev_wav_partad', + 'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19', + }, + ] + archieves_audio_test = [ + { + 'url': source_url + 'vox1_test_wav.zip', + 'md5': '185fdc63c3c739954633d50379a3d102', + }, + ] + archieves_meta = [ + { + 'url': + 'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt', + 'md5': + 'b73110731c9223c1461fe49cb48dddfc', + }, + ] + + num_speakers = 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41 + sample_rate = 16000 + meta_info = collections.namedtuple( + 'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id')) + base_path = os.path.join(DATA_HOME, 'vox1') + wav_path = os.path.join(base_path, 'wav') + meta_path = os.path.join(base_path, 'meta') + veri_test_file = os.path.join(meta_path, 'veri_test2.txt') + csv_path = os.path.join(base_path, 'csv') + subsets = ['train', 'dev', 'enroll', 'test'] + + def __init__( + self, + subset: str='train', + feat_type: str='raw', + random_chunk: bool=True, + chunk_duration: float=3.0, # seconds + split_ratio: float=0.9, # train split ratio + seed: int=0, + target_dir: str=None, + vox2_base_path=None, + **kwargs): + """VoxCeleb data prepare and get the specific dataset audio info + + Args: + subset (str, optional): dataset name, such as train, dev, enroll or test. Defaults to 'train'. + feat_type (str, optional): feat type, such raw, melspectrogram(fbank) or mfcc . Defaults to 'raw'. + random_chunk (bool, optional): random select a duration from audio. Defaults to True. + chunk_duration (float, optional): chunk duration if random_chunk flag is set. Defaults to 3.0. + target_dir (str, optional): data dir, audio info will be stored in this directory. Defaults to None. + vox2_base_path (_type_, optional): vox2 directory. vox2 data must be converted from m4a to wav. Defaults to None. 
+ """ + assert subset in self.subsets, \ + 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset) + + self.subset = subset + self.spk_id2label = {} + self.feat_type = feat_type + self.feat_config = kwargs + self.random_chunk = random_chunk + self.chunk_duration = chunk_duration + self.split_ratio = split_ratio + self.target_dir = target_dir if target_dir else VoxCeleb.base_path + self.vox2_base_path = vox2_base_path + + # if we set the target dir, we will change the vox data info data from base path to target dir + VoxCeleb.csv_path = os.path.join( + target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb.csv_path + VoxCeleb.meta_path = os.path.join( + target_dir, "voxceleb", + 'meta') if target_dir else VoxCeleb.meta_path + VoxCeleb.veri_test_file = os.path.join(VoxCeleb.meta_path, + 'veri_test2.txt') + # self._data = self._get_data()[:1000] # KP: Small dataset test. + self._data = self._get_data() + super(VoxCeleb, self).__init__() + + # Set up a seed to reproduce training or predicting result. + # random.seed(seed) + + def _get_data(self): + # Download audio files. + # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir + # so, we check the vox1/wav dir status + print(f"wav base path: {self.wav_path}") + if not os.path.isdir(self.wav_path): + print("start to download the voxceleb1 dataset") + download_and_decompress( # multi-zip parts concatenate to vox1_dev_wav.zip + self.archieves_audio_dev, + self.base_path, + decompress=False) + download_and_decompress( # download the vox1_test_wav.zip and unzip + self.archieves_audio_test, + self.base_path, + decompress=True) + + # Download all parts and concatenate the files into one zip file. + dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip') + print(f'Concatenating all parts to: {dev_zipfile}') + os.system( + f'cat {os.path.join(self.base_path, "vox1_dev_wav_parta*")} > {dev_zipfile}' + ) + + # Extract all audio files of dev and test set. + decompress(dev_zipfile, self.base_path) + + # Download meta files. + if not os.path.isdir(self.meta_path): + print("prepare the meta data") + download_and_decompress( + self.archieves_meta, self.meta_path, decompress=False) + + # Data preparation. 
+ if not os.path.isdir(self.csv_path): + os.makedirs(self.csv_path) + self.prepare_data() + + data = [] + print( + f"read the {self.subset} from {os.path.join(self.csv_path, f'{self.subset}.csv')}" + ) + with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf: + for line in rf.readlines()[1:]: + audio_id, duration, wav, start, stop, spk_id = line.strip( + ).split(',') + data.append( + self.meta_info(audio_id, + float(duration), wav, + int(start), int(stop), spk_id)) + + with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f: + for line in f.readlines(): + spk_id, label = line.strip().split(' ') + self.spk_id2label[spk_id] = int(label) + + return data + + def _convert_to_record(self, idx: int): + sample = self._data[idx] + + record = {} + # To show all fields in a namedtuple: `type(sample)._fields` + for field in type(sample)._fields: + record[field] = getattr(sample, field) + + waveform, sr = load_audio(record['wav']) + + # random select a chunk audio samples from the audio + if self.random_chunk: + num_wav_samples = waveform.shape[0] + num_chunk_samples = int(self.chunk_duration * sr) + start = random.randint(0, num_wav_samples - num_chunk_samples - 1) + stop = start + num_chunk_samples + else: + start = record['start'] + stop = record['stop'] + + waveform = waveform[start:stop] + + assert self.feat_type in feat_funcs.keys(), \ + f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}" + feat_func = feat_funcs[self.feat_type] + feat = feat_func( + waveform, sr=sr, **self.feat_config) if feat_func else waveform + + record.update({'feat': feat}) + if self.subset in ['train', + 'dev']: # Labels are available in train and dev. + record.update({'label': self.spk_id2label[record['spk_id']]}) + + return record + + @staticmethod + def _get_chunks(seg_dur, audio_id, audio_duration): + num_chunks = int(audio_duration / seg_dur) # all in milliseconds + + chunk_lst = [ + audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur) + for i in range(num_chunks) + ] + return chunk_lst + + def _get_audio_info(self, wav_file: str, + split_chunks: bool) -> List[List[str]]: + waveform, sr = load_audio(wav_file) + spk_id, sess_id, utt_id = wav_file.split("/")[-3:] + audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]]) + audio_duration = waveform.shape[0] / sr + + ret = [] + if split_chunks: # Split into pieces of self.chunk_duration seconds. + uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id, + audio_duration) + + for chunk in uniq_chunks_list: + s, e = chunk.split("_")[-2:] # Timestamps of start and end + start_sample = int(float(s) * sr) + end_sample = int(float(e) * sr) + # id, duration, wav, start, stop, spk_id + ret.append([ + chunk, audio_duration, wav_file, start_sample, end_sample, + spk_id + ]) + else: # Keep whole audio. 
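Editor's aside on the chunking scheme used by `_get_chunks` above: each training utterance is sliced into fixed-length windows identified by `<audio_id>_<start>_<stop>` strings, with offsets in seconds. A self-contained sketch of the same arithmetic:

```python
# Sketch of the chunk-id arithmetic in _get_chunks(); trailing audio
# shorter than seg_dur is dropped.
def get_chunks(seg_dur: float, audio_id: str, audio_duration: float):
    num_chunks = int(audio_duration / seg_dur)
    return [
        f"{audio_id}_{i * seg_dur}_{i * seg_dur + seg_dur}"
        for i in range(num_chunks)
    ]

print(get_chunks(3.0, 'id10001-abc-00001', 10.0))
# ['id10001-abc-00001_0.0_3.0', 'id10001-abc-00001_3.0_6.0',
#  'id10001-abc-00001_6.0_9.0']
```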
+ ret.append([ + audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id + ]) + return ret + + def generate_csv(self, + wav_files: List[str], + output_file: str, + split_chunks: bool=True): + print(f'Generating csv: {output_file}') + header = ["id", "duration", "wav", "start", "stop", "spk_id"] + # Note: this may occurs c++ exception, but the program will execute fine + # so we can ignore the exception + with Pool(cpu_count()) as p: + infos = list( + tqdm( + p.imap(lambda x: self._get_audio_info(x, split_chunks), + wav_files), + total=len(wav_files))) + + csv_lines = [] + for info in infos: + csv_lines.extend(info) + + with open(output_file, mode="w") as csv_f: + csv_writer = csv.writer( + csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) + csv_writer.writerow(header) + for line in csv_lines: + csv_writer.writerow(line) + + def prepare_data(self): + # Audio of speakers in veri_test_file should not be included in training set. + print("start to prepare the data csv file") + enroll_files = set() + test_files = set() + # get the enroll and test audio file path + with open(self.veri_test_file, 'r') as f: + for line in f.readlines(): + _, enrol_file, test_file = line.strip().split(' ') + enroll_files.add(os.path.join(self.wav_path, enrol_file)) + test_files.add(os.path.join(self.wav_path, test_file)) + enroll_files = sorted(enroll_files) + test_files = sorted(test_files) + + # get the enroll and test speakers + test_spks = set() + for file in (enroll_files + test_files): + spk = file.split('/wav/')[1].split('/')[0] + test_spks.add(spk) + + # get all the train and dev audios file path + audio_files = [] + speakers = set() + print("Getting file list...") + for path in [self.wav_path, self.vox2_base_path]: + # if vox2 directory is not set and vox2 is not a directory + # we will not process this directory + if not path or not os.path.exists(path): + print(f"{path} is an invalid path, please check again, " + "and we will ignore the vox2 base path") + continue + for file in glob.glob( + os.path.join(path, "**", "*.wav"), recursive=True): + spk = file.split('/wav/')[1].split('/')[0] + if spk in test_spks: + continue + speakers.add(spk) + audio_files.append(file) + + print( + f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}" + ) + # encode the train and dev speakers label to spk_id2label.txt + with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f: + for label, spk_id in enumerate( + sorted(speakers)): # 1211 vox1, 5994 vox2, 7205 vox1+2 + f.write(f'{spk_id} {label}\n') + + audio_files = sorted(audio_files) + random.shuffle(audio_files) + split_idx = int(self.split_ratio * len(audio_files)) + # split_ratio to train + train_files, dev_files = audio_files[:split_idx], audio_files[ + split_idx:] + + self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv')) + self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv')) + + self.generate_csv( + enroll_files, + os.path.join(self.csv_path, 'enroll.csv'), + split_chunks=False) + self.generate_csv( + test_files, + os.path.join(self.csv_path, 'test.csv'), + split_chunks=False) + + def __getitem__(self, idx): + return self._convert_to_record(idx) + + def __len__(self): + return len(self._data) diff --git a/paddlespeech/audio/functional/__init__.py b/paddlespeech/audio/functional/__init__.py new file mode 100644 index 000000000..c85232df1 --- /dev/null +++ b/paddlespeech/audio/functional/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .functional import compute_fbank_matrix +from .functional import create_dct +from .functional import fft_frequencies +from .functional import hz_to_mel +from .functional import mel_frequencies +from .functional import mel_to_hz +from .functional import power_to_db diff --git a/paddlespeech/audio/functional/functional.py b/paddlespeech/audio/functional/functional.py new file mode 100644 index 000000000..7c20f9013 --- /dev/null +++ b/paddlespeech/audio/functional/functional.py @@ -0,0 +1,266 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from librosa(https://github.com/librosa/librosa) +import math +from typing import Optional +from typing import Union + +import paddle +from paddle import Tensor + +__all__ = [ + 'hz_to_mel', + 'mel_to_hz', + 'mel_frequencies', + 'fft_frequencies', + 'compute_fbank_matrix', + 'power_to_db', + 'create_dct', +] + + +def hz_to_mel(freq: Union[Tensor, float], + htk: bool=False) -> Union[Tensor, float]: + """Convert Hz to Mels. + + Args: + freq (Union[Tensor, float]): The input tensor with arbitrary shape. + htk (bool, optional): Use htk scaling. Defaults to False. + + Returns: + Union[Tensor, float]: Frequency in mels. + """ + + if htk: + if isinstance(freq, Tensor): + return 2595.0 * paddle.log10(1.0 + freq / 700.0) + else: + return 2595.0 * math.log10(1.0 + freq / 700.0) + + # Fill in the linear part + f_min = 0.0 + f_sp = 200.0 / 3 + + mels = (freq - f_min) / f_sp + + # Fill in the log-scale part + + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = math.log(6.4) / 27.0 # step size for log region + + if isinstance(freq, Tensor): + target = min_log_mel + paddle.log( + freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10 + mask = (freq > min_log_hz).astype(freq.dtype) + mels = target * mask + mels * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if freq >= min_log_hz: + mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep + + return mels + + +def mel_to_hz(mel: Union[float, Tensor], + htk: bool=False) -> Union[float, Tensor]: + """Convert mel bin numbers to frequencies. + + Args: + mel (Union[float, Tensor]): The mel frequency represented as a tensor with arbitrary shape. + htk (bool, optional): Use htk scaling. Defaults to False. + + Returns: + Union[float, Tensor]: Frequencies in Hz. 
+ """ + if htk: + return 700.0 * (10.0**(mel / 2595.0) - 1.0) + + f_min = 0.0 + f_sp = 200.0 / 3 + freqs = f_min + f_sp * mel + # And now the nonlinear scale + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = math.log(6.4) / 27.0 # step size for log region + if isinstance(mel, Tensor): + target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel)) + mask = (mel > min_log_mel).astype(mel.dtype) + freqs = target * mask + freqs * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if mel >= min_log_mel: + freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel)) + + return freqs + + +def mel_frequencies(n_mels: int=64, + f_min: float=0.0, + f_max: float=11025.0, + htk: bool=False, + dtype: str='float32') -> Tensor: + """Compute mel frequencies. + + Args: + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0. + fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0. + htk (bool, optional): Use htk scaling. Defaults to False. + dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'. + + Returns: + Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`. + """ + # 'Center freqs' of mel bands - uniformly spaced between limits + min_mel = hz_to_mel(f_min, htk=htk) + max_mel = hz_to_mel(f_max, htk=htk) + mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype) + freqs = mel_to_hz(mels, htk=htk) + return freqs + + +def fft_frequencies(sr: int, n_fft: int, dtype: str='float32') -> Tensor: + """Compute fourier frequencies. + + Args: + sr (int): Sample rate. + n_fft (int): Number of fft bins. + dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'. + + Returns: + Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. + """ + return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) + + +def compute_fbank_matrix(sr: int, + n_fft: int, + n_mels: int=64, + f_min: float=0.0, + f_max: Optional[float]=None, + htk: bool=False, + norm: Union[str, float]='slaney', + dtype: str='float32') -> Tensor: + """Compute fbank matrix. + + Args: + sr (int): Sample rate. + n_fft (int): Number of fft bins. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use htk scaling. Defaults to False. + norm (Union[str, float], optional): Type of normalization. Defaults to 'slaney'. + dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. + + Returns: + Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. + """ + + if f_max is None: + f_max = float(sr) / 2 + + # Initialize the weights + weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) + + # Center freqs of each FFT bin + fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype) + + # 'Center freqs' of mel bands - uniformly spaced between limits + mel_f = mel_frequencies( + n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype) + + fdiff = mel_f[1:] - mel_f[:-1] #np.diff(mel_f) + ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0) + #ramps = np.subtract.outer(mel_f, fftfreqs) + + for i in range(n_mels): + # lower and upper slopes for all bins + lower = -ramps[i] / fdiff[i] + upper = ramps[i + 2] / fdiff[i + 1] + + # .. 
then intersect them with each other and zero + weights[i] = paddle.maximum( + paddle.zeros_like(lower), paddle.minimum(lower, upper)) + + # Slaney-style mel is scaled to be approx constant energy per channel + if norm == 'slaney': + enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels]) + weights *= enorm.unsqueeze(1) + elif isinstance(norm, int) or isinstance(norm, float): + weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1) + + return weights + + +def power_to_db(spect: Tensor, + ref_value: float=1.0, + amin: float=1e-10, + top_db: Optional[float]=None) -> Tensor: + """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way. + + Args: + spect (Tensor): STFT power spectrogram. + ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): Minimum threshold. Defaults to 1e-10. + top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None. + + Returns: + Tensor: Power spectrogram in db scale. + """ + if amin <= 0: + raise Exception("amin must be strictly positive") + + if ref_value <= 0: + raise Exception("ref_value must be strictly positive") + + ones = paddle.ones_like(spect) + log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, spect)) + log_spec -= 10.0 * math.log10(max(ref_value, amin)) + + if top_db is not None: + if top_db < 0: + raise Exception("top_db must be non-negative") + log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db)) + + return log_spec + + +def create_dct(n_mfcc: int, + n_mels: int, + norm: Optional[str]='ortho', + dtype: str='float32') -> Tensor: + """Create a discrete cosine transform(DCT) matrix. + + Args: + n_mfcc (int): Number of mel frequency cepstral coefficients. + n_mels (int): Number of mel filterbanks. + norm (Optional[str], optional): Normalization type. Defaults to 'ortho'. + dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. + + Returns: + Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`. + """ + n = paddle.arange(n_mels, dtype=dtype) + k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1) + dct = paddle.cos(math.pi / float(n_mels) * (n + 0.5) * + k) # size (n_mfcc, n_mels) + if norm is None: + dct *= 2.0 + else: + assert norm == "ortho" + dct[0] *= 1.0 / math.sqrt(2.0) + dct *= math.sqrt(2.0 / float(n_mels)) + return dct.T diff --git a/paddlespeech/audio/functional/window.py b/paddlespeech/audio/functional/window.py new file mode 100644 index 000000000..c518dbab3 --- /dev/null +++ b/paddlespeech/audio/functional/window.py @@ -0,0 +1,373 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
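Taken together, the helpers above cover the classic log-mel pipeline: mel scale conversion, filterbank construction, and dB scaling. A minimal end-to-end sketch (editor's example; it imports from the `paddlespeech.audio.functional` package added by this patch):

```python
# Minimal sketch combining hz_to_mel/mel_to_hz, compute_fbank_matrix
# and power_to_db; shapes follow the docstrings above.
import paddle
from paddlespeech.audio.functional import (compute_fbank_matrix, hz_to_mel,
                                           mel_to_hz, power_to_db)

sr, n_fft, n_mels = 16000, 512, 64

# hz_to_mel/mel_to_hz should round-trip (Slaney scale by default):
f = paddle.to_tensor([440.0, 1000.0, 4000.0])
assert paddle.allclose(mel_to_hz(hz_to_mel(f)), f, atol=1e-3)

# (n_mels, n_fft//2 + 1) filterbank applied to a dummy power spectrum:
fbank = compute_fbank_matrix(sr=sr, n_fft=n_fft, n_mels=n_mels)
spectrum = paddle.rand([n_fft // 2 + 1, 100])  # (freq_bins, frames)
log_mel = power_to_db(paddle.matmul(fbank, spectrum), top_db=80.0)
print(log_mel.shape)  # [64, 100]
```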
+# See the License for the specific language governing permissions and +import math +from typing import List +from typing import Tuple +from typing import Union + +import paddle +from paddle import Tensor + + +class WindowFunctionRegister(object): + def __init__(self): + self._functions_dict = dict() + + def register(self): + def add_subfunction(func): + name = func.__name__ + self._functions_dict[name] = func + return func + + return add_subfunction + + def get(self, name): + return self._functions_dict[name] + + +window_function_register = WindowFunctionRegister() + + +@window_function_register.register() +def _cat(x: List[Tensor], data_type: str) -> Tensor: + l = [paddle.to_tensor(_, data_type) for _ in x] + return paddle.concat(l) + + +@window_function_register.register() +def _acosh(x: Union[Tensor, float]) -> Tensor: + if isinstance(x, float): + return math.log(x + math.sqrt(x**2 - 1)) + return paddle.log(x + paddle.sqrt(paddle.square(x) - 1)) + + +@window_function_register.register() +def _extend(M: int, sym: bool) -> bool: + """Extend window by 1 sample if needed for DFT-even symmetry.""" + if not sym: + return M + 1, True + else: + return M, False + + +@window_function_register.register() +def _len_guards(M: int) -> bool: + """Handle small or incorrect window lengths.""" + if int(M) != M or M < 0: + raise ValueError('Window length M must be a non-negative integer') + + return M <= 1 + + +@window_function_register.register() +def _truncate(w: Tensor, needed: bool) -> Tensor: + """Truncate window by 1 sample if needed for DFT-even symmetry.""" + if needed: + return w[:-1] + else: + return w + + +@window_function_register.register() +def _general_gaussian(M: int, p, sig, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a window with a generalized Gaussian shape. + This function is consistent with scipy.signal.windows.general_gaussian(). + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0 + w = paddle.exp(-0.5 * paddle.abs(n / sig)**(2 * p)) + + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _general_cosine(M: int, a: float, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a generic weighted sum of cosine terms window. + This function is consistent with scipy.signal.windows.general_cosine(). + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype) + w = paddle.zeros((M, ), dtype=dtype) + for k in range(len(a)): + w += a[k] * paddle.cos(k * fac) + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _general_hamming(M: int, alpha: float, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a generalized Hamming window. + This function is consistent with scipy.signal.windows.general_hamming() + """ + return _general_cosine(M, [alpha, 1.0 - alpha], sym, dtype=dtype) + + +@window_function_register.register() +def _taylor(M: int, + nbar=4, + sll=30, + norm=True, + sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a Taylor window. + The Taylor window taper function approximates the Dolph-Chebyshev window's + constant sidelobe level for a parameterized number of near-in sidelobes. 
+ """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + # Original text uses a negative sidelobe level parameter and then negates + # it in the calculation of B. To keep consistent with other methods we + # assume the sidelobe level parameter to be positive. + B = 10**(sll / 20) + A = _acosh(B) / math.pi + s2 = nbar**2 / (A**2 + (nbar - 0.5)**2) + ma = paddle.arange(1, nbar, dtype=dtype) + + Fm = paddle.empty((nbar - 1, ), dtype=dtype) + signs = paddle.empty_like(ma) + signs[::2] = 1 + signs[1::2] = -1 + m2 = ma * ma + for mi in range(len(ma)): + numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2 + )) + if mi == 0: + denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:]) + elif mi == len(ma) - 1: + denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) + else: + denom = (2 * paddle.prod(1 - m2[mi] / m2[:mi]) * + paddle.prod(1 - m2[mi] / m2[mi + 1:])) + + Fm[mi] = numer / denom + + def W(n): + return 1 + 2 * paddle.matmul( + Fm.unsqueeze(0), + paddle.cos(2 * math.pi * ma.unsqueeze(1) * + (n - M / 2.0 + 0.5) / M), ) + + w = W(paddle.arange(0, M, dtype=dtype)) + + # normalize (Note that this is not described in the original text [1]) + if norm: + scale = 1.0 / W((M - 1) / 2) + w *= scale + w = w.squeeze() + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Hamming window. + The Hamming window is a taper formed by using a raised cosine with + non-zero endpoints, optimized to minimize the nearest side lobe. + """ + return _general_hamming(M, 0.54, sym, dtype=dtype) + + +@window_function_register.register() +def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Hann window. + The Hann window is a taper formed by using a raised cosine or sine-squared + with ends that touch zero. + """ + return _general_hamming(M, 0.5, sym, dtype=dtype) + + +@window_function_register.register() +def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Tukey window. + The Tukey window is also known as a tapered cosine window. + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + + if alpha <= 0: + return paddle.ones((M, ), dtype=dtype) + elif alpha >= 1.0: + return hann(M, sym=sym) + + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) + width = int(alpha * (M - 1) / 2.0) + n1 = n[0:width + 1] + n2 = n[width + 1:M - width - 1] + n3 = n[M - width - 1:] + + w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1)))) + w2 = paddle.ones(n2.shape, dtype=dtype) + w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha / + (M - 1)))) + w = paddle.concat([w1, w2, w3]) + + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _gaussian(M: int, std: float, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a Gaussian window. + The Gaussian widows has a Gaussian shape defined by the standard deviation(std). 
+ """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0 + sig2 = 2 * std * std + w = paddle.exp(-(n**2) / sig2) + + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _exponential(M: int, + center=None, + tau=1.0, + sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute an exponential (or Poisson) window.""" + if sym and center is not None: + raise ValueError("If sym==True, center must be None.") + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + if center is None: + center = (M - 1) / 2 + + n = paddle.arange(0, M, dtype=dtype) + w = paddle.exp(-paddle.abs(n - center) / tau) + + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a triangular window.""" + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype) + if M % 2 == 0: + w = (2 * n - 1.0) / M + w = paddle.concat([w, w[::-1]]) + else: + w = 2 * n / (M + 1.0) + w = paddle.concat([w, w[-2::-1]]) + + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Bohman window. + The Bohman window is the autocorrelation of a cosine window. + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1]) + w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin( + math.pi * fac) + w = _cat([0, w, 0], dtype) + + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Blackman window. + The Blackman window is a taper formed by using the first three terms of + a summation of cosines. It was designed to have close to the minimal + leakage possible. It is close to optimal, only slightly worse than a + Kaiser window. + """ + return _general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype) + + +@window_function_register.register() +def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a window with a simple cosine shape.""" + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + 0.5)) + + return _truncate(w, needs_trunc) + + +def get_window( + window: Union[str, Tuple[str, float]], + win_length: int, + fftbins: bool=True, + dtype: str='float64', ) -> Tensor: + """Return a window of a given length and type. + + Args: + window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'gaussian', 'general_gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. + win_length (int): Number of samples. + fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True. + dtype (str, optional): The data type of the return window. Defaults to 'float64'. + + Returns: + Tensor: The window represented as a tensor. + + Examples: + .. 
code-block:: python + + import paddle + + n_fft = 512 + cosine_window = paddle.audio.functional.get_window('cosine', n_fft) + + std = 7 + gaussian_window = paddle.audio.functional.get_window(('gaussian',std), n_fft) + """ + sym = not fftbins + + args = () + if isinstance(window, tuple): + winstr = window[0] + if len(window) > 1: + args = window[1:] + elif isinstance(window, str): + if window in ['gaussian', 'exponential']: + raise ValueError("The '" + window + "' window needs one or " + "more parameters -- pass a tuple.") + else: + winstr = window + else: + raise ValueError("%s as window type is not supported." % + str(type(window))) + + try: + winfunc = window_function_register.get('_' + winstr) + except KeyError as e: + raise ValueError("Unknown window type.") from e + + params = (win_length, ) + args + kwargs = {'sym': sym} + return winfunc(*params, dtype=dtype, **kwargs) diff --git a/paddlespeech/audio/streamdata/autodecode.py b/paddlespeech/audio/streamdata/autodecode.py index 2e82226df..664509842 100644 --- a/paddlespeech/audio/streamdata/autodecode.py +++ b/paddlespeech/audio/streamdata/autodecode.py @@ -304,13 +304,11 @@ def paddle_audio(key, data): if extension not in ["flac", "mp3", "sox", "wav", "m4a", "ogg", "wma"]: return None - import paddleaudio - with tempfile.TemporaryDirectory() as dirname: fname = os.path.join(dirname, f"file.{extension}") with open(fname, "wb") as stream: stream.write(data) - return paddleaudio.backends.soundfile_load(fname) + return paddlespeech.audio.backends.soundfile_load(fname) ################################################################ diff --git a/paddlespeech/audio/streamdata/filters.py b/paddlespeech/audio/streamdata/filters.py index 110b4a304..9a00c2dc6 100644 --- a/paddlespeech/audio/streamdata/filters.py +++ b/paddlespeech/audio/streamdata/filters.py @@ -22,8 +22,6 @@ from fnmatch import fnmatch from functools import reduce import paddle -from paddleaudio import backends -from paddleaudio.compliance import kaldi from . import autodecode from . import utils @@ -33,6 +31,8 @@ from ..transform.spec_augment import time_mask from ..transform.spec_augment import time_warp from ..utils.tensor_utils import pad_sequence from .utils import PipelineStage +from paddlespeech.audio import backends +from paddlespeech.audio.compliance import kaldi class FilterFunction(object): diff --git a/paddlespeech/audio/streamdata/soundfile.py b/paddlespeech/audio/streamdata/soundfile.py new file mode 100644 index 000000000..7611fd297 --- /dev/null +++ b/paddlespeech/audio/streamdata/soundfile.py @@ -0,0 +1,677 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
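Since each private window in `window.py` above claims consistency with its `scipy.signal.windows` counterpart, a quick parity check makes a good smoke test. Editor's sketch (assumes SciPy is installed; the module path is the one this patch adds):

```python
# Parity sketch: compare the paddle windows against scipy.signal.
import numpy as np
import scipy.signal as ss

from paddlespeech.audio.functional.window import get_window

M = 400
for name in ['hamming', 'hann', 'blackman', 'triang', 'bohman', 'cosine']:
    w_paddle = get_window(name, M, fftbins=True).numpy()
    w_scipy = ss.get_window(name, M, fftbins=True)
    np.testing.assert_allclose(w_paddle, w_scipy, atol=1e-6)

# Parameterized windows are passed as (name, param) tuples:
gaussian = get_window(('gaussian', 7.0), M)
```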
+import os
+import warnings
+from typing import Optional
+from typing import Tuple
+
+import numpy as np
+import paddle
+import resampy
+import soundfile
+from scipy.io import wavfile
+
+from ..utils import depth_convert
+from ..utils import ParameterError
+from .common import AudioInfo
+
+__all__ = [
+    'resample',
+    'to_mono',
+    'normalize',
+    'save',
+    'soundfile_save',
+    'load',
+    'soundfile_load',
+    'info',
+]
+NORMALMIZE_TYPES = ['linear', 'gaussian']
+MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
+RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
+EPS = 1e-8
+
+
+def resample(y: np.ndarray,
+             src_sr: int,
+             target_sr: int,
+             mode: str='kaiser_fast') -> np.ndarray:
+    """Audio resampling.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        src_sr (int): Source sample rate.
+        target_sr (int): Target sample rate.
+        mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.
+
+    Returns:
+        np.ndarray: `y` resampled to `target_sr`
+    """
+
+    if mode == 'kaiser_best':
+        warnings.warn(
+            f'Using resampy in kaiser_best mode to resample {src_sr}=>{target_sr}. This mode is pretty slow; \
+    we recommend the kaiser_fast mode for large-scale audio training')
+
+    if not isinstance(y, np.ndarray):
+        raise ParameterError(
+            f'Only numpy np.ndarray is supported, but received y of {type(y)}')
+
+    if mode not in RESAMPLE_MODES:
+        raise ParameterError(f'resample mode must be in {RESAMPLE_MODES}')
+
+    return resampy.resample(y, src_sr, target_sr, filter=mode)
+
+
+def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
+    """Convert stereo audio to mono.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        merge_type (str, optional): Merge type to generate mono waveform. Defaults to 'average'.
+
+    Returns:
+        np.ndarray: `y` with mono channel.
+    """
+
+    if merge_type not in MERGE_TYPES:
+        raise ParameterError(
+            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
+        )
+    if y.ndim > 2:
+        raise ParameterError(
+            f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}')
+    if y.ndim == 1:  # nothing to merge
+        return y
+
+    if merge_type == 'ch0':
+        return y[0]
+    if merge_type == 'ch1':
+        return y[1]
+    if merge_type == 'random':
+        return y[np.random.randint(0, 2)]
+
+    # need to do averaging according to dtype
+
+    if y.dtype == 'float32':
+        y_out = (y[0] + y[1]) * 0.5
+    elif y.dtype == 'int16':
+        y_out = y.astype('int32')
+        y_out = (y_out[0] + y_out[1]) // 2
+        y_out = np.clip(y_out,
+                        np.iinfo(y.dtype).min,
+                        np.iinfo(y.dtype).max).astype(y.dtype)
+
+    elif y.dtype == 'int8':
+        y_out = y.astype('int16')
+        y_out = (y_out[0] + y_out[1]) // 2
+        y_out = np.clip(y_out,
+                        np.iinfo(y.dtype).min,
+                        np.iinfo(y.dtype).max).astype(y.dtype)
+    else:
+        raise ParameterError(f'Unsupported dtype: {y.dtype}')
+    return y_out
+
+
+def soundfile_load_(file: os.PathLike,
+                    offset: Optional[float]=None,
+                    dtype: str='int16',
+                    duration: Optional[int]=None) -> Tuple[np.ndarray, int]:
+    """Load audio using the soundfile library, which is backed by libsndfile.
+
+    Args:
+        file (os.PathLike): File of waveform.
+        offset (Optional[float], optional): Offset to the start of waveform. Defaults to None.
+        dtype (str, optional): Data type of waveform. Defaults to 'int16'.
+        duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
+
+    Returns:
+        Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.
+ """ + with soundfile.SoundFile(file) as sf_desc: + sr_native = sf_desc.samplerate + if offset: + sf_desc.seek(int(offset * sr_native)) + if duration is not None: + frame_duration = int(duration * sr_native) + else: + frame_duration = -1 + y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T + + return y, sf_desc.samplerate + + +def normalize(y: np.ndarray, norm_type: str='linear', + mul_factor: float=1.0) -> np.ndarray: + """Normalize an input audio with additional multiplier. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + norm_type (str, optional): Type of normalization. Defaults to 'linear'. + mul_factor (float, optional): Scaling factor. Defaults to 1.0. + + Returns: + np.ndarray: `y` after normalization. + """ + + if norm_type == 'linear': + amax = np.max(np.abs(y)) + factor = 1.0 / (amax + EPS) + y = y * factor * mul_factor + elif norm_type == 'gaussian': + amean = np.mean(y) + astd = np.std(y) + astd = max(astd, EPS) + y = mul_factor * (y - amean) / astd + else: + raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}') + + return y + + +def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None: + """Save audio file to disk. This function saves audio to disk using scipy.io.wavfile, with additional step to convert input waveform to int16. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + sr (int): Sample rate. + file (os.PathLike): Path of audio file to save. + """ + if not file.endswith('.wav'): + raise ParameterError( + f'only .wav file supported, but dst file name is: {file}') + + if sr <= 0: + raise ParameterError( + f'Sample rate should be larger than 0, received sr = {sr}') + + if y.dtype not in ['int16', 'int8']: + warnings.warn( + f'input data type is {y.dtype}, will convert data to int16 format before saving' + ) + y_out = depth_convert(y, 'int16') + else: + y_out = y + + wavfile.write(file, sr, y_out) + + +def soundfile_load( + file: os.PathLike, + sr: Optional[int]=None, + mono: bool=True, + merge_type: str='average', # ch0,ch1,random,average + normal: bool=True, + norm_type: str='linear', + norm_mul_factor: float=1.0, + offset: float=0.0, + duration: Optional[int]=None, + dtype: str='float32', + resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]: + """Load audio file from disk. This function loads audio from disk using using audio backend. + + Args: + file (os.PathLike): Path of audio file to load. + sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None. + mono (bool, optional): Return waveform with mono channel. Defaults to True. + merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'. + normal (bool, optional): Waveform normalization. Defaults to True. + norm_type (str, optional): Type of normalization. Defaults to 'linear'. + norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0. + offset (float, optional): Offset to the start of waveform. Defaults to 0.0. + duration (Optional[int], optional): Duration of waveform to read. Defaults to None. + dtype (str, optional): Data type of waveform. Defaults to 'float32'. + resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'. + + Returns: + Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate. 
+ """ + + y, r = soundfile_load_(file, offset=offset, dtype=dtype, duration=duration) + + if not ((y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and len(y[0]) > 0)): + raise ParameterError(f'audio file {file} looks empty') + + if mono: + y = to_mono(y, merge_type) + + if sr is not None and sr != r: + y = resample(y, r, sr, mode=resample_mode) + r = sr + + if normal: + y = normalize(y, norm_type, norm_mul_factor) + elif dtype in ['int8', 'int16']: + # still need to do normalization, before depth conversion + y = normalize(y, 'linear', 1.0) + + y = depth_convert(y, dtype) + return y, r + + +#The code below is taken from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py, with some modifications. + + +def _get_subtype_for_wav(dtype: paddle.dtype, + encoding: str, + bits_per_sample: int): + if not encoding: + if not bits_per_sample: + subtype = { + paddle.uint8: "PCM_U8", + paddle.int16: "PCM_16", + paddle.int32: "PCM_32", + paddle.float32: "FLOAT", + paddle.float64: "DOUBLE", + }.get(dtype) + if not subtype: + raise ValueError(f"Unsupported dtype for wav: {dtype}") + return subtype + if bits_per_sample == 8: + return "PCM_U8" + return f"PCM_{bits_per_sample}" + if encoding == "PCM_S": + if not bits_per_sample: + return "PCM_32" + if bits_per_sample == 8: + raise ValueError("wav does not support 8-bit signed PCM encoding.") + return f"PCM_{bits_per_sample}" + if encoding == "PCM_U": + if bits_per_sample in (None, 8): + return "PCM_U8" + raise ValueError("wav only supports 8-bit unsigned PCM encoding.") + if encoding == "PCM_F": + if bits_per_sample in (None, 32): + return "FLOAT" + if bits_per_sample == 64: + return "DOUBLE" + raise ValueError("wav only supports 32/64-bit float PCM encoding.") + if encoding == "ULAW": + if bits_per_sample in (None, 8): + return "ULAW" + raise ValueError("wav only supports 8-bit mu-law encoding.") + if encoding == "ALAW": + if bits_per_sample in (None, 8): + return "ALAW" + raise ValueError("wav only supports 8-bit a-law encoding.") + raise ValueError(f"wav does not support {encoding}.") + + +def _get_subtype_for_sphere(encoding: str, bits_per_sample: int): + if encoding in (None, "PCM_S"): + return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32" + if encoding in ("PCM_U", "PCM_F"): + raise ValueError(f"sph does not support {encoding} encoding.") + if encoding == "ULAW": + if bits_per_sample in (None, 8): + return "ULAW" + raise ValueError("sph only supports 8-bit for mu-law encoding.") + if encoding == "ALAW": + return "ALAW" + raise ValueError(f"sph does not support {encoding}.") + + +def _get_subtype(dtype: paddle.dtype, + format: str, + encoding: str, + bits_per_sample: int): + if format == "wav": + return _get_subtype_for_wav(dtype, encoding, bits_per_sample) + if format == "flac": + if encoding: + raise ValueError("flac does not support encoding.") + if not bits_per_sample: + return "PCM_16" + if bits_per_sample > 24: + raise ValueError("flac does not support bits_per_sample > 24.") + return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}" + if format in ("ogg", "vorbis"): + if encoding or bits_per_sample: + raise ValueError( + "ogg/vorbis does not support encoding/bits_per_sample.") + return "VORBIS" + if format == "sph": + return _get_subtype_for_sphere(encoding, bits_per_sample) + if format in ("nis", "nist"): + return "PCM_16" + raise ValueError(f"Unsupported format: {format}") + + +def save( + filepath: str, + src: paddle.Tensor, + sample_rate: int, + channels_first: bool=True, + compression: 
Optional[float]=None, + format: Optional[str]=None, + encoding: Optional[str]=None, + bits_per_sample: Optional[int]=None, ): + """Save audio data to file. + + Note: + The formats this function can handle depend on the soundfile installation. + This function is tested on the following formats; + + * WAV + + * 32-bit floating-point + * 32-bit signed integer + * 16-bit signed integer + * 8-bit unsigned integer + + * FLAC + * OGG/VORBIS + * SPHERE + + Note: + ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts + ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend, + + Args: + filepath (str or pathlib.Path): Path to audio file. + src (paddle.Tensor): Audio data to save. must be 2D tensor. + sample_rate (int): sampling rate + channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`, + otherwise `[time, channel]`. + compression (float of None, optional): Not used. + It is here only for interface compatibility reason with "sox_io" backend. + format (str or None, optional): Override the audio format. + When ``filepath`` argument is path-like object, audio format is + inferred from file extension. If the file extension is missing or + different, you can specify the correct format with this argument. + + When ``filepath`` argument is file-like object, + this argument is required. + + Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``, + ``"flac"`` and ``"sph"``. + encoding (str or None, optional): Changes the encoding for supported formats. + This argument is effective only for supported formats, such as + ``"wav"``, ``""flac"`` and ``"sph"``. Valid values are: + + - ``"PCM_S"`` (signed integer Linear PCM) + - ``"PCM_U"`` (unsigned integer Linear PCM) + - ``"PCM_F"`` (floating point PCM) + - ``"ULAW"`` (mu-law) + - ``"ALAW"`` (a-law) + + bits_per_sample (int or None, optional): Changes the bit depth for the + supported formats. + When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``, + you can change the bit depth. + Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``. + + Supported formats/encodings/bit depth/compression are: + + ``"wav"`` + - 32-bit floating-point PCM + - 32-bit signed integer PCM + - 24-bit signed integer PCM + - 16-bit signed integer PCM + - 8-bit unsigned integer PCM + - 8-bit mu-law + - 8-bit a-law + + Note: + Default encoding/bit depth is determined by the dtype of + the input Tensor. + + ``"flac"`` + - 8-bit + - 16-bit (default) + - 24-bit + + ``"ogg"``, ``"vorbis"`` + - Doesn't accept changing configuration. + + ``"sph"`` + - 8-bit signed integer PCM + - 16-bit signed integer PCM + - 24-bit signed integer PCM + - 32-bit signed integer PCM (default) + - 8-bit mu-law + - 8-bit a-law + - 16-bit a-law + - 24-bit a-law + - 32-bit a-law + + """ + if src.ndim != 2: + raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.") + if compression is not None: + warnings.warn( + '`save` function of "soundfile" backend does not support "compression" parameter. ' + "The argument is silently ignored.") + if hasattr(filepath, "write"): + if format is None: + raise RuntimeError( + "`format` is required when saving to file object.") + ext = format.lower() + else: + ext = str(filepath).split(".")[-1].lower() + + if bits_per_sample not in (None, 8, 16, 24, 32, 64): + raise ValueError("Invalid bits_per_sample.") + if bits_per_sample == 24: + warnings.warn( + "Saving audio with 24 bits per sample might warp samples near -1. 
" + "Using 16 bits per sample might be able to avoid this.") + subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample) + + # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format, + # so we extend the extensions manually here + if ext in ["nis", "nist", "sph"] and format is None: + format = "NIST" + + if channels_first: + src = src.t() + + soundfile.write( + file=filepath, + data=src, + samplerate=sample_rate, + subtype=subtype, + format=format) + + +_SUBTYPE2DTYPE = { + "PCM_S8": "int8", + "PCM_U8": "uint8", + "PCM_16": "int16", + "PCM_32": "int32", + "FLOAT": "float32", + "DOUBLE": "float64", +} + + +def load( + filepath: str, + frame_offset: int=0, + num_frames: int=-1, + normalize: bool=True, + channels_first: bool=True, + format: Optional[str]=None, ) -> Tuple[paddle.Tensor, int]: + """Load audio data from file. + + Note: + The formats this function can handle depend on the soundfile installation. + This function is tested on the following formats; + + * WAV + + * 32-bit floating-point + * 32-bit signed integer + * 16-bit signed integer + * 8-bit unsigned integer + + * FLAC + * OGG/VORBIS + * SPHERE + + By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with + ``float32`` dtype and the shape of `[channel, time]`. + The samples are normalized to fit in the range of ``[-1.0, 1.0]``. + + When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit + signed integer and 8-bit unsigned integer (24-bit signed integer is not supported), + by providing ``normalize=False``, this function can return integer Tensor, where the samples + are expressed within the whole range of the corresponding dtype, that is, ``int32`` tensor + for 32-bit signed PCM, ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. + + ``normalize`` parameter has no effect on 32-bit floating-point WAV and other formats, such as + ``flac`` and ``mp3``. + For these formats, this function always returns ``float32`` Tensor with values normalized to + ``[-1.0, 1.0]``. + + Note: + ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts + ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend. + + Args: + filepath (path-like object or file-like object): + Source of audio data. + frame_offset (int, optional): + Number of frames to skip before start reading data. + num_frames (int, optional): + Maximum number of frames to read. ``-1`` reads all the remaining samples, + starting from ``frame_offset``. + This function may return the less number of frames if there is not enough + frames in the given file. + normalize (bool, optional): + When ``True``, this function always return ``float32``, and sample values are + normalized to ``[-1.0, 1.0]``. + If input file is integer WAV, giving ``False`` will change the resulting Tensor type to + integer type. + This argument has no effect for formats other than integer WAV type. + channels_first (bool, optional): + When True, the returned Tensor has dimension `[channel, time]`. + Otherwise, the returned Tensor's dimension is `[time, channel]`. + format (str or None, optional): + Not used. PySoundFile does not accept format hint. + + Returns: + (paddle.Tensor, int): Resulting Tensor and sample rate. + If the input file has integer wav format and normalization is off, then it has + integer type, else ``float32`` type. If ``channels_first=True``, it has + `[channel, time]` else `[time, channel]`. 
+ """ + with soundfile.SoundFile(filepath, "r") as file_: + if file_.format != "WAV" or normalize: + dtype = "float32" + elif file_.subtype not in _SUBTYPE2DTYPE: + raise ValueError(f"Unsupported subtype: {file_.subtype}") + else: + dtype = _SUBTYPE2DTYPE[file_.subtype] + + frames = file_._prepare_read(frame_offset, None, num_frames) + waveform = file_.read(frames, dtype, always_2d=True) + sample_rate = file_.samplerate + + waveform = paddle.to_tensor(waveform) + if channels_first: + waveform = paddle.transpose(waveform, perm=[1, 0]) + return waveform, sample_rate + + +# Mapping from soundfile subtype to number of bits per sample. +# This is mostly heuristical and the value is set to 0 when it is irrelevant +# (lossy formats) or when it can't be inferred. +# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard: +# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony, +# the default seems to be 8 bits but it can be compressed further to 4 bits. +# The dict is inspired from +# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94 +_SUBTYPE_TO_BITS_PER_SAMPLE = { + "PCM_S8": 8, # Signed 8 bit data + "PCM_16": 16, # Signed 16 bit data + "PCM_24": 24, # Signed 24 bit data + "PCM_32": 32, # Signed 32 bit data + "PCM_U8": 8, # Unsigned 8 bit data (WAV and RAW only) + "FLOAT": 32, # 32 bit float data + "DOUBLE": 64, # 64 bit float data + "ULAW": 8, # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types + "ALAW": 8, # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types + "IMA_ADPCM": 0, # IMA ADPCM. + "MS_ADPCM": 0, # Microsoft ADPCM. + "GSM610": + 0, # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate) + "VOX_ADPCM": 0, # OKI / Dialogix ADPCM + "G721_32": 0, # 32kbs G721 ADPCM encoding. + "G723_24": 0, # 24kbs G723 ADPCM encoding. + "G723_40": 0, # 40kbs G723 ADPCM encoding. + "DWVW_12": 12, # 12 bit Delta Width Variable Word encoding. + "DWVW_16": 16, # 16 bit Delta Width Variable Word encoding. + "DWVW_24": 24, # 24 bit Delta Width Variable Word encoding. + "DWVW_N": 0, # N bit Delta Width Variable Word encoding. + "DPCM_8": 8, # 8 bit differential PCM (XI only) + "DPCM_16": 16, # 16 bit differential PCM (XI only) + "VORBIS": 0, # Xiph Vorbis encoding. (lossy) + "ALAC_16": 16, # Apple Lossless Audio Codec (16 bit). + "ALAC_20": 20, # Apple Lossless Audio Codec (20 bit). + "ALAC_24": 24, # Apple Lossless Audio Codec (24 bit). + "ALAC_32": 32, # Apple Lossless Audio Codec (32 bit). +} + + +def _get_bit_depth(subtype): + if subtype not in _SUBTYPE_TO_BITS_PER_SAMPLE: + warnings.warn( + f"The {subtype} subtype is unknown to PaddleAudio. As a result, the bits_per_sample " + "attribute will be set to 0. If you are seeing this warning, please " + "report by opening an issue on github (after checking for existing/closed ones). 
" + "You may otherwise ignore this warning.") + return _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype, 0) + + +_SUBTYPE_TO_ENCODING = { + "PCM_S8": "PCM_S", + "PCM_16": "PCM_S", + "PCM_24": "PCM_S", + "PCM_32": "PCM_S", + "PCM_U8": "PCM_U", + "FLOAT": "PCM_F", + "DOUBLE": "PCM_F", + "ULAW": "ULAW", + "ALAW": "ALAW", + "VORBIS": "VORBIS", +} + + +def _get_encoding(format: str, subtype: str): + if format == "FLAC": + return "FLAC" + return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN") + + +def info(filepath: str, format: Optional[str]=None) -> AudioInfo: + """Get signal information of an audio file. + + Note: + ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts + ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend, + + Args: + filepath (path-like object or file-like object): + Source of audio data. + format (str or None, optional): + Not used. PySoundFile does not accept format hint. + + Returns: + AudioInfo: meta data of the given audio. + + """ + sinfo = soundfile.info(filepath) + return AudioInfo( + sinfo.samplerate, + sinfo.frames, + sinfo.channels, + bits_per_sample=_get_bit_depth(sinfo.subtype), + encoding=_get_encoding(sinfo.format, sinfo.subtype), ) diff --git a/paddlespeech/audio/streamdata/tariterators.py b/paddlespeech/audio/streamdata/tariterators.py index 3adf4892a..8429e6f77 100644 --- a/paddlespeech/audio/streamdata/tariterators.py +++ b/paddlespeech/audio/streamdata/tariterators.py @@ -20,9 +20,9 @@ trace = False meta_prefix = "__" meta_suffix = "__" -import paddleaudio import paddle import numpy as np +from paddlespeech.audio.backends import soundfile_load AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) @@ -111,7 +111,7 @@ def tar_file_iterator(fileobj, assert pos > 0 prefix, postfix = name[:pos], name[pos + 1:] if postfix == 'wav': - waveform, sample_rate = paddleaudio.backends.soundfile_load( + waveform, sample_rate = soundfile_load( stream.extractfile(tarinfo), normal=False) result = dict( fname=prefix, wav=waveform, sample_rate=sample_rate) @@ -163,7 +163,7 @@ def tar_file_and_group_iterator(fileobj, if postfix == 'txt': example['txt'] = file_obj.read().decode('utf8').strip() elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = paddleaudio.backends.soundfile_load( + waveform, sample_rate = soundfile_load( file_obj, normal=False) waveform = paddle.to_tensor( np.expand_dims(np.array(waveform), 0), diff --git a/paddlespeech/audio/transform/spectrogram.py b/paddlespeech/audio/transform/spectrogram.py index f2dab3169..a4da86ec7 100644 --- a/paddlespeech/audio/transform/spectrogram.py +++ b/paddlespeech/audio/transform/spectrogram.py @@ -15,9 +15,10 @@ import librosa import numpy as np import paddle -from paddleaudio.compliance import kaldi from python_speech_features import logfbank +from paddlespeech.audio.compliance import kaldi + def stft(x, n_fft, diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index 5e2168e3d..fa49f7bdb 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -22,11 +22,11 @@ import numpy as np import paddle import yaml from paddle.audio.features import LogMelSpectrogram -from paddleaudio.backends import soundfile_load as load from ..executor import BaseExecutor from ..log import logger from ..utils import stats_wrapper +from paddlespeech.audio.backends import soundfile_load as load __all__ = ['CLSExecutor'] diff --git a/paddlespeech/cli/kws/infer.py b/paddlespeech/cli/kws/infer.py index 
ce2f3f461..6dee4cc84 100644 --- a/paddlespeech/cli/kws/infer.py +++ b/paddlespeech/cli/kws/infer.py @@ -20,12 +20,12 @@ from typing import Union import paddle import yaml -from paddleaudio.backends import soundfile_load as load_audio -from paddleaudio.compliance.kaldi import fbank as kaldi_fbank from ..executor import BaseExecutor from ..log import logger from ..utils import stats_wrapper +from paddlespeech.audio.backends import soundfile_load as load_audio +from paddlespeech.audio.compliance.kaldi import fbank as kaldi_fbank __all__ = ['KWSExecutor'] diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 57a781656..c4ae11c75 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -22,13 +22,13 @@ from typing import Union import paddle import soundfile -from paddleaudio.backends import soundfile_load as load_audio -from paddleaudio.compliance.librosa import melspectrogram from yacs.config import CfgNode from ..executor import BaseExecutor from ..log import logger from ..utils import stats_wrapper +from paddlespeech.audio.backends import soundfile_load as load_audio +from paddlespeech.audio.compliance.librosa import melspectrogram from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.modules.sid_model import SpeakerIdetification diff --git a/paddlespeech/cls/exps/panns/deploy/predict.py b/paddlespeech/cls/exps/panns/deploy/predict.py index a6b735335..3085a8482 100644 --- a/paddlespeech/cls/exps/panns/deploy/predict.py +++ b/paddlespeech/cls/exps/panns/deploy/predict.py @@ -19,10 +19,10 @@ import paddle from paddle import inference from paddle.audio.datasets import ESC50 from paddle.audio.features import LogMelSpectrogram -from paddleaudio.backends import soundfile_load as load_audio from scipy.special import softmax import paddlespeech.utils +from paddlespeech.audio.backends import soundfile_load as load_audio # yapf: disable parser = argparse.ArgumentParser() diff --git a/paddlespeech/cls/exps/panns/export_model.py b/paddlespeech/cls/exps/panns/export_model.py index e860b54aa..5163dbacf 100644 --- a/paddlespeech/cls/exps/panns/export_model.py +++ b/paddlespeech/cls/exps/panns/export_model.py @@ -15,8 +15,8 @@ import argparse import os import paddle -from paddleaudio.datasets import ESC50 +from paddlespeech.audio.datasets import ESC50 from paddlespeech.cls.models import cnn14 from paddlespeech.cls.models import SoundClassifier diff --git a/paddlespeech/cls/exps/panns/predict.py b/paddlespeech/cls/exps/panns/predict.py index 4681e4dc9..6b0eb9f68 100644 --- a/paddlespeech/cls/exps/panns/predict.py +++ b/paddlespeech/cls/exps/panns/predict.py @@ -18,12 +18,11 @@ import paddle import paddle.nn.functional as F import yaml from paddle.audio.features import LogMelSpectrogram -from paddleaudio.backends import soundfile_load as load_audio -from paddleaudio.utils import logger +from paddlespeech.audio.backends import soundfile_load as load_audio +from paddlespeech.audio.utils import logger from paddlespeech.cls.models import SoundClassifier from paddlespeech.utils.dynamic_import import dynamic_import -#from paddleaudio.features import LogMelSpectrogram # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/paddlespeech/cls/exps/panns/train.py b/paddlespeech/cls/exps/panns/train.py index b768919be..5e5e0809d 100644 --- a/paddlespeech/cls/exps/panns/train.py +++ b/paddlespeech/cls/exps/panns/train.py @@ -17,9 +17,9 @@ import os import paddle import yaml from paddle.audio.features import 
LogMelSpectrogram -from paddleaudio.utils import logger -from paddleaudio.utils import Timer +from paddlespeech.audio.utils import logger +from paddlespeech.audio.utils import Timer from paddlespeech.cls.models import SoundClassifier from paddlespeech.utils.dynamic_import import dynamic_import diff --git a/paddlespeech/cls/models/panns/panns.py b/paddlespeech/cls/models/panns/panns.py index 6f9af9b52..37deae80c 100644 --- a/paddlespeech/cls/models/panns/panns.py +++ b/paddlespeech/cls/models/panns/panns.py @@ -15,8 +15,8 @@ import os import paddle.nn as nn import paddle.nn.functional as F -from paddleaudio.utils.download import load_state_dict_from_url +from paddlespeech.audio.utils.download import load_state_dict_from_url from paddlespeech.utils.env import MODEL_HOME __all__ = ['CNN14', 'CNN10', 'CNN6', 'cnn14', 'cnn10', 'cnn6'] diff --git a/paddlespeech/kws/exps/mdtc/train.py b/paddlespeech/kws/exps/mdtc/train.py index bb727d36a..d5bb5e020 100644 --- a/paddlespeech/kws/exps/mdtc/train.py +++ b/paddlespeech/kws/exps/mdtc/train.py @@ -14,10 +14,10 @@ import os import paddle -from paddleaudio.utils import logger -from paddleaudio.utils import Timer from yacs.config import CfgNode +from paddlespeech.audio.utils import logger +from paddlespeech.audio.utils import Timer from paddlespeech.kws.exps.mdtc.collate import collate_features from paddlespeech.kws.models.loss import max_pooling_loss from paddlespeech.kws.models.mdtc import KWSModel diff --git a/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py b/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py index 22329d5e0..ac5720fd5 100644 --- a/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py +++ b/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py @@ -14,10 +14,11 @@ """Contains the audio featurizer class.""" import numpy as np import paddle -import paddleaudio.compliance.kaldi as kaldi from python_speech_features import delta from python_speech_features import mfcc +import paddlespeech.audio.compliance.kaldi as kaldi + class AudioFeaturizer(): """Audio featurizer, for extracting features from audio contents of diff --git a/paddlespeech/s2t/modules/fbank.py b/paddlespeech/s2t/modules/fbank.py index 30671c274..8d76a4727 100644 --- a/paddlespeech/s2t/modules/fbank.py +++ b/paddlespeech/s2t/modules/fbank.py @@ -1,7 +1,7 @@ import paddle from paddle import nn -from paddleaudio.compliance import kaldi +from paddlespeech.audio.compliance import kaldi from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() diff --git a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py index 7d86f3df7..f02a942fb 100644 --- a/paddlespeech/server/engine/vector/python/vector_engine.py +++ b/paddlespeech/server/engine/vector/python/vector_engine.py @@ -16,9 +16,9 @@ from collections import OrderedDict import numpy as np import paddle -from paddleaudio.backends import soundfile_load as load_audio -from paddleaudio.compliance.librosa import melspectrogram +from paddlespeech.audio.backends import soundfile_load as load_audio +from paddlespeech.audio.compliance.librosa import melspectrogram from paddlespeech.cli.log import logger from paddlespeech.cli.vector.infer import VectorExecutor from paddlespeech.server.engine.base_engine import BaseEngine diff --git a/paddlespeech/server/util.py b/paddlespeech/server/util.py index 6aa6fd589..47871922b 100644 --- a/paddlespeech/server/util.py +++ b/paddlespeech/server/util.py @@ -24,13 +24,13 @@ from typing import Any from 
typing import Dict import paddle -import paddleaudio import requests import yaml from paddle.framework import load from .entry import client_commands from .entry import server_commands +from paddlespeech.audio.backends import soundfile_load from paddlespeech.cli import download try: from .. import __version__ @@ -289,7 +289,7 @@ def _note_one_stat(cls_name, params={}): if 'audio_file' in params: try: - _, sr = paddleaudio.backends.soundfile_load(params['audio_file']) + _, sr = soundfile_load(params['audio_file']) except Exception: sr = -1 diff --git a/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/layers.py b/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/layers.py index 5901c805a..b29d0863e 100644 --- a/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/layers.py +++ b/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/layers.py @@ -13,9 +13,9 @@ # limitations under the License. import paddle import paddle.nn.functional as F -import paddleaudio.functional as audio_F from paddle import nn +from paddlespeech.audio.functional import create_dct from paddlespeech.utils.initialize import _calculate_gain from paddlespeech.utils.initialize import xavier_uniform_ @@ -243,7 +243,7 @@ class MFCC(nn.Layer): self.n_mfcc = n_mfcc self.n_mels = n_mels self.norm = 'ortho' - dct_mat = audio_F.create_dct(self.n_mfcc, self.n_mels, self.norm) + dct_mat = create_dct(self.n_mfcc, self.n_mels, self.norm) self.register_buffer('dct_mat', dct_mat) def forward(self, mel_specgram: paddle.Tensor): diff --git a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py index 821b1deed..a2a19cb66 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py @@ -16,10 +16,10 @@ import os import time import paddle -from paddleaudio.backends import soundfile_load as load_audio -from paddleaudio.compliance.librosa import melspectrogram from yacs.config import CfgNode +from paddlespeech.audio.backends import soundfile_load as load_audio +from paddlespeech.audio.compliance.librosa import melspectrogram from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn diff --git a/paddlespeech/vector/exps/ecapa_tdnn/test.py b/paddlespeech/vector/exps/ecapa_tdnn/test.py index f15dbf9b7..167b82422 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/test.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/test.py @@ -18,7 +18,7 @@ import numpy as np import paddle from paddle.io import BatchSampler from paddle.io import DataLoader -from paddleaudio.metric import compute_eer +from sklearn.metrics import roc_curve from tqdm import tqdm from yacs.config import CfgNode @@ -129,6 +129,23 @@ def compute_verification_scores(id2embedding, train_cohort, config): return scores, labels +def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]: + """Compute EER and return score threshold. 
+ + Args: + labels (np.ndarray): the trial label, shape: [N], one-dimension, N refer to the samples num + scores (np.ndarray): the trial scores, shape: [N], one-dimension, N refer to the samples num + + Returns: + List[float]: eer and the specific threshold + """ + fpr, tpr, threshold = roc_curve(y_true=labels, y_score=scores) + fnr = 1 - tpr + eer_threshold = threshold[np.nanargmin(np.absolute((fnr - fpr)))] + eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))] + return eer, eer_threshold + + def main(args, config): """The main process for test the speaker verification model diff --git a/paddlespeech/vector/exps/ecapa_tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py index 2dc7a7164..3966a900d 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/train.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py @@ -20,9 +20,9 @@ import paddle from paddle.io import BatchSampler from paddle.io import DataLoader from paddle.io import DistributedBatchSampler -from paddleaudio.compliance.librosa import melspectrogram from yacs.config import CfgNode +from paddlespeech.audio.compliance.librosa import melspectrogram from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.augment import build_augment_pipeline from paddlespeech.vector.io.augment import waveform_augment diff --git a/paddlespeech/vector/io/dataset.py b/paddlespeech/vector/io/dataset.py index dff8ad9fd..ae5c83637 100644 --- a/paddlespeech/vector/io/dataset.py +++ b/paddlespeech/vector/io/dataset.py @@ -15,9 +15,9 @@ from dataclasses import dataclass from dataclasses import fields from paddle.io import Dataset -from paddleaudio.backends import soundfile_load as load_audio -from paddleaudio.compliance.librosa import melspectrogram +from paddlespeech.audio.backends import soundfile_load as load_audio +from paddlespeech.audio.compliance.librosa import melspectrogram from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() diff --git a/paddlespeech/vector/io/dataset_from_json.py b/paddlespeech/vector/io/dataset_from_json.py index 852f39a94..1d1a4ad9c 100644 --- a/paddlespeech/vector/io/dataset_from_json.py +++ b/paddlespeech/vector/io/dataset_from_json.py @@ -16,9 +16,10 @@ from dataclasses import dataclass from dataclasses import fields from paddle.io import Dataset -from paddleaudio.backends import soundfile_load as load_audio -from paddleaudio.compliance.librosa import melspectrogram -from paddleaudio.compliance.librosa import mfcc + +from paddlespeech.audio.backends import soundfile_load as load_audio +from paddlespeech.audio.compliance.librosa import melspectrogram +from paddlespeech.audio.compliance.librosa import mfcc @dataclass diff --git a/setup.py b/setup.py index 8c2a4c1b7..5996ff178 100644 --- a/setup.py +++ b/setup.py @@ -99,7 +99,6 @@ base = [ determine_opencc_version(), # opencc or opencc==1.1.6 "opencc-python-reimplemented", "pandas", - "paddleaudio>=1.1.0", "paddlenlp>=2.4.8", "paddleslim>=2.3.4", "ppdiffusers>=0.9.0", @@ -122,6 +121,9 @@ base = [ "webrtcvad", "yacs>=0.1.8", "zhon", + "scikit-learn", + "pathos", + "kaldiio", ] server = ["pattern_singleton", "websockets"] diff --git a/tests/unit/audiotools/core/test_audio_signal.py b/tests/unit/audiotools/core/test_audio_signal.py index 0e82ae9d5..19575828c 100644 --- a/tests/unit/audiotools/core/test_audio_signal.py +++ b/tests/unit/audiotools/core/test_audio_signal.py @@ -26,14 +26,14 @@ def test_io(): signal_from_file = AudioSignal(f.name) mp3_signal = AudioSignal(audio_path.replace("wav", "mp3")) - print(mp3_signal) assert signal == 
signal_from_file - print(signal) - print(signal.markdown()) mp3_signal = AudioSignal.excerpt( audio_path.replace("wav", "mp3"), offset=5, duration=5) + + assert mp3_signal.sample_rate == 44100 + assert mp3_signal.signal_length == 220500 assert mp3_signal.signal_duration == 5.0 assert mp3_signal.duration == 5.0 assert mp3_signal.length == mp3_signal.signal_length From 367b665ca17a209b3c0ab6746efa137c1d77d7b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Mon, 17 Feb 2025 17:25:11 +0800 Subject: [PATCH 14/46] lower the install requirements (#3985) * lower the install requirements * Update setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 5996ff178..881b06deb 100644 --- a/setup.py +++ b/setup.py @@ -54,8 +54,8 @@ def determine_opencc_version(): # determine opencc version if gcc_version: - if int(gcc_version.split(".")[0]) <= 9: - return "opencc==1.1.6" # GCC<=9 need opencc==1.1.6 + if int(gcc_version.split(".")[0]) < 9: + return "opencc==1.1.6" # GCC<9 need opencc==1.1.6 return "opencc" # default From 793a89d53c8904103488ab806b255f2a5467ea86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Thu, 20 Feb 2025 20:30:36 +0800 Subject: [PATCH 15/46] fit with librosa (#3989) * fit with librosa * Update base_commands.py * Apply suggestions from code review * Apply suggestions from code review --- paddlespeech/cli/base_commands.py | 3 +++ setup.py | 5 ++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/paddlespeech/cli/base_commands.py b/paddlespeech/cli/base_commands.py index dfeb5cae5..fb5a190ed 100644 --- a/paddlespeech/cli/base_commands.py +++ b/paddlespeech/cli/base_commands.py @@ -122,6 +122,9 @@ class StatsCommand: elif "multilingual" in key: line[4], line[1] = line[1].split("_")[0], line[1].split( "_")[1:] + # Avoid having arrays within the elements of the input parameters when passing them to numpy.array + if type(line[1]) is list: + line[1] = "/".join(line[1]) tmp = numpy.array(line) idx = [0, 5, 3, 4, 1, 2] line = tmp[idx] diff --git a/setup.py b/setup.py index 881b06deb..71e7aaf2a 100644 --- a/setup.py +++ b/setup.py @@ -88,9 +88,8 @@ base = [ "hyperpyyaml", "inflect", "jsonlines", - # paddleaudio align with librosa==0.8.1, which need numpy==1.23.x - "numpy==1.23.5", - "librosa==0.8.1", + "numpy", + "librosa", determine_scipy_version(), # scipy or scipy>=1.4.0, <=1.12.0 "loguru", determine_matplotlib_version(), # matplotlib or matplotlib<=3.8.4 From afa6f12ba14d6f7abddbc6faaa93dbfcc9581033 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Wed, 26 Feb 2025 11:16:14 +0800 Subject: [PATCH 16/46] paddlespeech/audiotools/ml/basemodel.py (#3994) --- paddlespeech/audiotools/ml/basemodel.py | 37 ++++++++++--------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/paddlespeech/audiotools/ml/basemodel.py b/paddlespeech/audiotools/ml/basemodel.py index 97c31ff7a..2d5683266 100644 --- a/paddlespeech/audiotools/ml/basemodel.py +++ b/paddlespeech/audiotools/ml/basemodel.py @@ -110,7 +110,8 @@ class BaseModel(nn.Layer): state_dict = {"state_dict": self.state_dict(), "metadata": metadata} paddle.save(state_dict, str(path)) else: - self._save_package(path, intern=intern, extern=extern, mock=mock) + raise NotImplementedError( + "Currently Paddle does not support packaging") return path @@ -151,31 +152,21 @@ class BaseModel(nn.Layer): BaseModel 
A model that inherits from BaseModel. """ - try: - model = cls._load_package(location, package_name=package_name) - except: - model_dict = paddle.load(location) - metadata = model_dict["metadata"] - metadata["kwargs"].update(kwargs) - - sig = inspect.signature(cls) - class_keys = list(sig.parameters.keys()) - for k in list(metadata["kwargs"].keys()): - if k not in class_keys: - metadata["kwargs"].pop(k) - - model = cls(*args, **metadata["kwargs"]) - model.set_state_dict(model_dict["state_dict"]) - model.metadata = metadata + model_dict = paddle.load(location) + metadata = model_dict["metadata"] + metadata["kwargs"].update(kwargs) - return model + sig = inspect.signature(cls) + class_keys = list(sig.parameters.keys()) + for k in list(metadata["kwargs"].keys()): + if k not in class_keys: + metadata["kwargs"].pop(k) - def _save_package(self, path, intern=[], extern=[], mock=[], **kwargs): - raise NotImplementedError("Currently Paddle does not support packaging") + model = cls(*args, **metadata["kwargs"]) + model.set_state_dict(model_dict["state_dict"]) + model.metadata = metadata - @classmethod - def _load_package(cls, path, package_name=None): - raise NotImplementedError("Currently Paddle does not support packaging") + return model def save_to_folder( self, From d7bf91561d5a8a025f3cfc4bd7b28368fd98d102 Mon Sep 17 00:00:00 2001 From: cchenhaifeng <134115991+cchenhaifeng@users.noreply.github.com> Date: Wed, 26 Feb 2025 16:46:34 +0800 Subject: [PATCH 17/46] =?UTF-8?q?=E3=80=90Hackathon=208th=20No.9=E3=80=91?= =?UTF-8?q?=E5=9C=A8=20PaddleSpeech=20=E4=B8=AD=E5=A4=8D=E7=8E=B0=20DAC=20?= =?UTF-8?q?=E8=AE=AD=E7=BB=83=E9=9C=80=E8=A6=81=E7=94=A8=E5=88=B0=E7=9A=84?= =?UTF-8?q?=20loss=20(#3988)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add DAC loss * fix bug * fix codestyle * fix codestyle * fix codestyle * fix codestyle * fix codestyle * fix codestyle --- paddlespeech/__init__.py | 4 - paddlespeech/audiotools/core/__init__.py | 4 +- paddlespeech/audiotools/core/_julius.py | 3 +- paddlespeech/audiotools/core/util.py | 8 +- paddlespeech/t2s/modules/losses.py | 279 +++++++++++++++++++++++ tests/unit/audiotools/core/test_util.py | 5 +- tests/unit/audiotools/test_audiotools.sh | 1 - tests/unit/ci.sh | 2 + tests/unit/tts/test_losses.py | 61 +++++ 9 files changed, 351 insertions(+), 16 deletions(-) create mode 100644 tests/unit/tts/test_losses.py diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py index 969d189f5..6c7e75c1f 100644 --- a/paddlespeech/__init__.py +++ b/paddlespeech/__init__.py @@ -13,7 +13,3 @@ # limitations under the License. import _locale _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8']) - -__version__ = '0.0.0' - -__commit__ = '9cf8c1985a98bb380c183116123672976bdfe5c9' diff --git a/paddlespeech/audiotools/core/__init__.py b/paddlespeech/audiotools/core/__init__.py index 609d6a34a..3443a7676 100644 --- a/paddlespeech/audiotools/core/__init__.py +++ b/paddlespeech/audiotools/core/__init__.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. from . 
import util -from ._julius import fft_conv1d -from ._julius import FFTConv1D +from ...t2s.modules import fft_conv1d +from ...t2s.modules import FFTConv1D from ._julius import highpass_filter from ._julius import highpass_filters from ._julius import lowpass_filter diff --git a/paddlespeech/audiotools/core/_julius.py b/paddlespeech/audiotools/core/_julius.py index aef51f98f..113475cdd 100644 --- a/paddlespeech/audiotools/core/_julius.py +++ b/paddlespeech/audiotools/core/_julius.py @@ -20,8 +20,6 @@ import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlespeech.t2s.modules import fft_conv1d -from paddlespeech.t2s.modules import FFTConv1D from paddlespeech.utils import satisfy_paddle_version __all__ = [ @@ -312,6 +310,7 @@ class LowPassFilters(nn.Layer): mode="replicate", data_format="NCL") if self.fft: + from paddlespeech.t2s.modules import fft_conv1d out = fft_conv1d(_input, self.filters, stride=self.stride) else: out = F.conv1d(_input, self.filters, stride=self.stride) diff --git a/paddlespeech/audiotools/core/util.py b/paddlespeech/audiotools/core/util.py index 6da927a6f..676d57704 100644 --- a/paddlespeech/audiotools/core/util.py +++ b/paddlespeech/audiotools/core/util.py @@ -32,7 +32,6 @@ import soundfile from flatten_dict import flatten from flatten_dict import unflatten -from .audio_signal import AudioSignal from paddlespeech.utils import satisfy_paddle_version from paddlespeech.vector.training.seeding import seed_everything @@ -232,8 +231,7 @@ def ensure_tensor( def _get_value(other): # - from . import AudioSignal - + from .audio_signal import AudioSignal if isinstance(other, AudioSignal): return other.audio_data return other @@ -784,6 +782,8 @@ def collate(list_of_dicts: list, n_splits: int=None): Dictionary containing batched data. """ + from .audio_signal import AudioSignal + batches = [] list_len = len(list_of_dicts) @@ -873,7 +873,7 @@ def generate_chord_dataset( """ import librosa - from . import AudioSignal + from .audio_signal import AudioSignal from ..data.preprocess import create_csv min_midi = librosa.note_to_midi(min_note) diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index f819352d6..a1a65a9dc 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -12,7 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. import math +from typing import Callable +from typing import List +from typing import Optional from typing import Tuple +from typing import Union import librosa import numpy as np @@ -23,6 +27,8 @@ from scipy import signal from scipy.stats import betabinom from typeguard import typechecked +from paddlespeech.audiotools.core.audio_signal import AudioSignal +from paddlespeech.audiotools.core.audio_signal import STFTParams from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.predictor.duration_predictor import ( DurationPredictorLoss, # noqa: H301 @@ -1326,3 +1332,276 @@ class ForwardSumLoss(nn.Layer): bb_prior[bidx, :T, :N] = prob return bb_prior + + +class MultiScaleSTFTLoss(nn.Layer): + """Computes the multi-scale STFT loss from [1]. + + References + ---------- + + 1. Engel, Jesse, Chenjie Gu, and Adam Roberts. + "DDSP: Differentiable Digital Signal Processing." + International Conference on Learning Representations. 2019. 
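+
+    Informally (a reading of the ``forward`` method below, not a formula
+    from the cited paper): for each window length ``w`` the loss adds
+    ``log_weight * L1(log10(clip(|STFT_w(x)|, clamp_eps) ** pow),
+    log10(clip(|STFT_w(y)|, clamp_eps) ** pow))`` plus
+    ``mag_weight * L1(|STFT_w(x)|, |STFT_w(y)|)``, summed over all
+    window lengths.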
+
+    Implementation copied from: https://github.com/descriptinc/audiotools/blob/master/audiotools/metrics/spectral.py
+    """
+
+    def __init__(
+            self,
+            window_lengths: List[int]=[2048, 512],
+            loss_fn: Callable=nn.L1Loss(),
+            clamp_eps: float=1e-5,
+            mag_weight: float=1.0,
+            log_weight: float=1.0,
+            pow: float=2.0,
+            weight: float=1.0,
+            match_stride: bool=False,
+            window_type: Optional[str]=None, ):
+        """
+        Args:
+            window_lengths : List[int], optional
+                Length of each window of each STFT, by default [2048, 512]
+            loss_fn : typing.Callable, optional
+                How to compare each loss, by default nn.L1Loss()
+            clamp_eps : float, optional
+                Clamp on the log magnitude, below, by default 1e-5
+            mag_weight : float, optional
+                Weight of raw magnitude portion of loss, by default 1.0
+            log_weight : float, optional
+                Weight of log magnitude portion of loss, by default 1.0
+            pow : float, optional
+                Power to raise magnitude to before taking log, by default 2.0
+            weight : float, optional
+                Weight of this loss, by default 1.0
+            match_stride : bool, optional
+                Whether to match the stride of convolutional layers, by default False
+            window_type : str, optional
+                Type of window to use, by default None.
+        """
+        super().__init__()
+
+        self.stft_params = [
+            STFTParams(
+                window_length=w,
+                hop_length=w // 4,
+                match_stride=match_stride,
+                window_type=window_type, ) for w in window_lengths
+        ]
+        self.loss_fn = loss_fn
+        self.log_weight = log_weight
+        self.mag_weight = mag_weight
+        self.clamp_eps = clamp_eps
+        self.weight = weight
+        self.pow = pow
+
+    def forward(self, x: AudioSignal, y: AudioSignal):
+        """Computes the multi-scale STFT loss between an estimate and a
+        reference signal.
+
+        Args:
+            x : AudioSignal
+                Estimate signal
+            y : AudioSignal
+                Reference signal
+
+        Returns:
+            paddle.Tensor
+                Multi-scale STFT loss.
+
+        Example:
+            >>> from paddlespeech.audiotools.core.audio_signal import AudioSignal
+            >>> import paddle
+
+            >>> x = AudioSignal("https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav", 2_05)
+            >>> y = x * 0.01
+            >>> loss = MultiScaleSTFTLoss()
+            >>> loss(x, y).numpy()
+            7.562150
+        """
+        # Accumulate the loss over all STFT resolutions; it must start from
+        # zero so the in-place additions below are well defined.
+        loss = 0.0
+        for s in self.stft_params:
+            x.stft(s.window_length, s.hop_length, s.window_type)
+            y.stft(s.window_length, s.hop_length, s.window_type)
+            loss += self.log_weight * self.loss_fn(
+                x.magnitude.clip(self.clamp_eps).pow(self.pow).log10(),
+                y.magnitude.clip(self.clamp_eps).pow(self.pow).log10(), )
+            loss += self.mag_weight * self.loss_fn(x.magnitude, y.magnitude)
+        return loss
+
+
+class GANLoss(nn.Layer):
+    """
+    Computes a discriminator loss, given a discriminator applied to
+    generated waveforms/spectrograms and to ground truth
+    waveforms/spectrograms. Computes the loss for both the
+    discriminator and the generator in separate functions.
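+
+    Note:
+        This is the least-squares GAN objective as written in the methods
+        below: ``discriminator_loss`` computes ``mean(D(fake)^2) +
+        mean((1 - D(real))^2)``, and ``generator_loss`` returns
+        ``mean((1 - D(fake))^2)`` together with an L1 feature-matching
+        term over the intermediate discriminator features.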
+
+    Example:
+        >>> from paddlespeech.audiotools.core.audio_signal import AudioSignal
+        >>> import paddle
+
+        >>> x = AudioSignal("https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav", 2_05)
+        >>> y = x * 0.01
+        >>> class My_discriminator0:
+        ...     def __call__(self, x):
+        ...         return x.sum()
+        >>> loss = GANLoss(My_discriminator0())
+        >>> [loss(x, y)[0].numpy(), loss(x, y)[1].numpy()]
+        [-0.102722, -0.001027]
+
+        >>> class My_discriminator1:
+        ...     def __call__(self, x):
+        ...         return x * (-0.2)
+        >>> loss = GANLoss(My_discriminator1())
+        >>> [loss.generator_loss(x, y)[0].numpy(), loss.generator_loss(x, y)[1].numpy()]
+        [1.00019, 0]
+
+        >>> loss.discriminator_loss(x, y)
+        1.000200
+    """
+
+    def __init__(self, discriminator):
+        """
+        Args:
+            discriminator : paddle.nn.Layer
+                Discriminator model
+        """
+        super().__init__()
+        self.discriminator = discriminator
+
+    def forward(self,
+                fake: Union[AudioSignal, paddle.Tensor],
+                real: Union[AudioSignal, paddle.Tensor]):
+        if isinstance(fake, AudioSignal):
+            d_fake = self.discriminator(fake.audio_data)
+        else:
+            d_fake = self.discriminator(fake)
+
+        if isinstance(real, AudioSignal):
+            d_real = self.discriminator(real.audio_data)
+        else:
+            d_real = self.discriminator(real)
+        return d_fake, d_real
+
+    def discriminator_loss(self, fake, real):
+        d_fake, d_real = self.forward(fake, real)
+
+        loss_d = 0
+        for x_fake, x_real in zip(d_fake, d_real):
+            loss_d += paddle.mean(x_fake[-1]**2)
+            loss_d += paddle.mean((1 - x_real[-1])**2)
+        return loss_d
+
+    def generator_loss(self, fake, real):
+        d_fake, d_real = self.forward(fake, real)
+
+        loss_g = 0
+        for x_fake in d_fake:
+            loss_g += paddle.mean((1 - x_fake[-1])**2)
+
+        loss_feature = 0
+
+        for i in range(len(d_fake)):
+            for j in range(len(d_fake[i]) - 1):
+                # feature-matching target is detached so no gradient flows
+                # through the real branch
+                loss_feature += F.l1_loss(d_fake[i][j], d_real[i][j].detach())
+        return loss_g, loss_feature
+
+
+class SISDRLoss(nn.Layer):
+    """
+    Computes the Scale-Invariant Source-to-Distortion Ratio between a batch
+    of estimated and reference audio signals or aligned features.
+
+    Implementation copied from: https://github.com/descriptinc/audiotools/blob/master/audiotools/metrics/distance.py
+
+    Example:
+        >>> from paddlespeech.audiotools.core.audio_signal import AudioSignal
+        >>> import paddle
+
+        >>> x = AudioSignal("https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav", 2_05)
+        >>> y = x * 0.01
+        >>> sisdr = SISDRLoss()
+        >>> sisdr(x, y).numpy()
+        -145.377640
+    """
+
+    def __init__(
+            self,
+            scaling: bool=True,
+            reduction: str="mean",
+            zero_mean: bool=True,
+            clip_min: Optional[int]=None,
+            weight: float=1.0, ):
+        """
+        Args:
+            scaling : bool, optional
+                Whether to use scale-invariant (True) or
+                signal-to-noise ratio (False), by default True
+            reduction : str, optional
+                How to reduce across the batch (either 'mean',
+                'sum', or 'none'), by default 'mean'
+            zero_mean : bool, optional
+                Zero mean the references and estimates before
+                computing the loss, by default True
+            clip_min : int, optional
+                The minimum possible loss value. Helps the network
+                not focus on making already good examples better, by default None
+            weight : float, optional
+                Weight of this loss, defaults to 1.0.
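+
+        Note:
+            With ``scaling=True`` the value computed in ``forward`` is,
+            informally, ``-10 * log10(||s_t||^2 / ||e||^2)`` with
+            ``s_t = (<est, ref> / ||ref||^2) * ref`` and ``e = est - s_t``;
+            lower (more negative) values indicate a closer match.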
+ """ + self.scaling = scaling + self.reduction = reduction + self.zero_mean = zero_mean + self.clip_min = clip_min + self.weight = weight + super().__init__() + + def forward(self, + x: Union[AudioSignal, paddle.Tensor], + y: Union[AudioSignal, paddle.Tensor]): + eps = 1e-8 + # B, C, T + if isinstance(x, AudioSignal): + references = x.audio_data + estimates = y.audio_data + else: + references = x + estimates = y + + nb = references.shape[0] + references = references.reshape([nb, 1, -1]).transpose([0, 2, 1]) + estimates = estimates.reshape([nb, 1, -1]).transpose([0, 2, 1]) + + # samples now on axis 1 + if self.zero_mean: + mean_reference = references.mean(axis=1, keepdim=True) + mean_estimate = estimates.mean(axis=1, keepdim=True) + else: + mean_reference = 0 + mean_estimate = 0 + + _references = references - mean_reference + _estimates = estimates - mean_estimate + + references_projection = (_references**2).sum(axis=-2) + eps + references_on_estimates = (_estimates * _references).sum(axis=-2) + eps + + scale = ( + (references_on_estimates / references_projection).unsqueeze(axis=1) + if self.scaling else 1) + + e_true = scale * _references + e_res = _estimates - e_true + + signal = (e_true**2).sum(axis=1) + noise = (e_res**2).sum(axis=1) + sdr = -10 * paddle.log10(signal / noise + eps) + + if self.clip_min != None: + sdr = paddle.clip(sdr, min=self.clip_min) + + if self.reduction == "mean": + sdr = sdr.mean() + elif self.reduction == "sum": + sdr = sdr.sum() + return sdr diff --git a/tests/unit/audiotools/core/test_util.py b/tests/unit/audiotools/core/test_util.py index 155686acd..16e5d5e92 100644 --- a/tests/unit/audiotools/core/test_util.py +++ b/tests/unit/audiotools/core/test_util.py @@ -13,7 +13,6 @@ import pytest from paddlespeech.audiotools import util from paddlespeech.audiotools.core.audio_signal import AudioSignal -from paddlespeech.vector.training.seeding import seed_everything def test_check_random_state(): @@ -36,12 +35,12 @@ def test_check_random_state(): def test_seed(): - seed_everything(0) + util.seed_everything(0) paddle_result_a = paddle.randn([1]) np_result_a = np.random.randn(1) py_result_a = random.random() - seed_everything(0) + util.seed_everything(0) paddle_result_b = paddle.randn([1]) np_result_b = np.random.randn(1) py_result_b = random.random() diff --git a/tests/unit/audiotools/test_audiotools.sh b/tests/unit/audiotools/test_audiotools.sh index 3a0161900..f69447d62 100644 --- a/tests/unit/audiotools/test_audiotools.sh +++ b/tests/unit/audiotools/test_audiotools.sh @@ -1,4 +1,3 @@ -python -m pip install -r ../../../paddlespeech/audiotools/requirements.txt wget https://paddlespeech.bj.bcebos.com/PaddleAudio/audio_tools/audio.tar.gz wget https://paddlespeech.bj.bcebos.com/PaddleAudio/audio_tools/regression.tar.gz tar -zxvf audio.tar.gz diff --git a/tests/unit/ci.sh b/tests/unit/ci.sh index 6beff0707..567af2210 100644 --- a/tests/unit/ci.sh +++ b/tests/unit/ci.sh @@ -1,6 +1,7 @@ function main(){ set -ex speech_ci_path=`pwd` + python -m pip install -r ../../paddlespeech/audiotools/requirements.txt echo "Start asr" cd ${speech_ci_path}/asr @@ -16,6 +17,7 @@ function main(){ python test_enfrontend.py python test_fftconv1d.py python test_mixfrontend.py + python test_losses.py echo "End TTS" echo "Start Vector" diff --git a/tests/unit/tts/test_losses.py b/tests/unit/tts/test_losses.py new file mode 100644 index 000000000..f99d15d1c --- /dev/null +++ b/tests/unit/tts/test_losses.py @@ -0,0 +1,61 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np + +from paddlespeech.audiotools.core.audio_signal import AudioSignal +from paddlespeech.t2s.modules.losses import GANLoss +from paddlespeech.t2s.modules.losses import MultiScaleSTFTLoss +from paddlespeech.t2s.modules.losses import SISDRLoss + + +def get_input(): + x = AudioSignal("https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav", + 2_05) + y = x * 0.01 + return x, y + + +def test_multi_scale_stft_loss(): + x, y = get_input() + loss = MultiScaleSTFTLoss() + pd_loss = loss(x, y) + assert np.abs(pd_loss.numpy() - 7.562150) < 1e-06 + + +def test_sisdr_loss(): + x, y = get_input() + loss = SISDRLoss() + pd_loss = loss(x, y) + assert np.abs(pd_loss.numpy() - (-145.377640)) < 1e-06 + + +def test_gan_loss(): + class My_discriminator0: + def __call__(self, x): + return x.sum() + + class My_discriminator1: + def __call__(self, x): + return x * (-0.2) + + x, y = get_input() + loss = GANLoss(My_discriminator0()) + pd_loss0, pd_loss1 = loss(x, y) + assert np.abs(pd_loss0.numpy() - (-0.102722)) < 1e-06 + assert np.abs(pd_loss1.numpy() - (-0.001027)) < 1e-06 + loss = GANLoss(My_discriminator1()) + pd_loss0, _ = loss.generator_loss(x, y) + assert np.abs(pd_loss0.numpy() - 1.000199) < 1e-06 + pd_loss = loss.discriminator_loss(x, y) + assert np.abs(pd_loss.numpy() - 1.000200) < 1e-06 From 48583b453aa590d1027643c8bf6316d8bdc7a772 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Thu, 27 Feb 2025 11:12:30 +0800 Subject: [PATCH 18/46] Update setup.py (#3995) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 71e7aaf2a..8039d9245 100644 --- a/setup.py +++ b/setup.py @@ -89,7 +89,7 @@ base = [ "inflect", "jsonlines", "numpy", - "librosa", + "librosa>=0.9", determine_scipy_version(), # scipy or scipy>=1.4.0, <=1.12.0 "loguru", determine_matplotlib_version(), # matplotlib or matplotlib<=3.8.4 From 45f439ad32fdfa8182056da49c748e34f4d18a36 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Mon, 3 Mar 2025 15:29:50 +0800 Subject: [PATCH 19/46] mv audiotools requirement to setup.py (#3999) --- paddlespeech/audiotools/requirements.txt | 5 ----- setup.py | 5 +++++ tests/unit/ci.sh | 1 - 3 files changed, 5 insertions(+), 6 deletions(-) delete mode 100644 paddlespeech/audiotools/requirements.txt diff --git a/paddlespeech/audiotools/requirements.txt b/paddlespeech/audiotools/requirements.txt deleted file mode 100644 index 0a018002e..000000000 --- a/paddlespeech/audiotools/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -ffmpeg-python -ffmpy -flatten_dict -pyloudnorm -rich \ No newline at end of file diff --git a/setup.py b/setup.py index 8039d9245..47e543ce4 100644 --- a/setup.py +++ b/setup.py @@ -123,6 +123,11 @@ base = [ "scikit-learn", "pathos", "kaldiio", + "ffmpeg-python", + "ffmpy", + "flatten_dict", + "pyloudnorm", + "rich", ] server = ["pattern_singleton", "websockets"] diff --git a/tests/unit/ci.sh 
b/tests/unit/ci.sh index 567af2210..c298e3ae8 100644 --- a/tests/unit/ci.sh +++ b/tests/unit/ci.sh @@ -1,7 +1,6 @@ function main(){ set -ex speech_ci_path=`pwd` - python -m pip install -r ../../paddlespeech/audiotools/requirements.txt echo "Start asr" cd ${speech_ci_path}/asr From 9c01a0b980aeca72c87172d0453f3f3b140a6659 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Tue, 4 Mar 2025 14:39:50 +0800 Subject: [PATCH 20/46] =?UTF-8?q?=E3=80=90Doc=E3=80=91=E8=A1=A5=E5=85=A8?= =?UTF-8?q?=E5=90=88=E6=88=90=E7=B3=BB=E5=88=97=E4=B8=AD=E7=9A=84=E8=84=9A?= =?UTF-8?q?=E6=9C=AC=E4=B8=AD=E5=8F=82=E6=95=B0=E7=BC=BA=E5=A4=B1=20No.4?= =?UTF-8?q?=20(#3998)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix * CI --- examples/aishell3/tts3/README.md | 6 ++++-- examples/aishell3/tts3/run.sh | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index f97a84b50..8f3f66dac 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -109,8 +109,9 @@ pwg_aishell3_ckpt_0.5 ``` `./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} ``` +`--stage` controls the vocoder model during synthesis, which can be `0` or `1`, use `pwgan` or `hifigan` model as vocoder. ```text usage: synthesize.py [-h] [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] @@ -157,8 +158,9 @@ optional arguments: ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file. ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} ``` +`--stage` controls the vocoder model during synthesis, which can be `0` or `1`, use `pwgan` or `hifigan` model as vocoder. 
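+For example, to synthesize with the `hifigan` vocoder (a sketch; the variables are the same ones used in the commands above):
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 1 ${conf_path} ${train_output_path} ${ckpt_name}
+```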
```text usage: synthesize_e2e.py [-h] [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] diff --git a/examples/aishell3/tts3/run.sh b/examples/aishell3/tts3/run.sh index 8dcecaa03..3fd5d73c6 100755 --- a/examples/aishell3/tts3/run.sh +++ b/examples/aishell3/tts3/run.sh @@ -27,13 +27,13 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # synthesize, vocoder is pwgan by default - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize, vocoder is pwgan by default stage 0, stage 1 will use hifigan as vocoder + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # synthesize_e2e, vocoder is pwgan by default - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + # synthesize_e2e, vocoder is pwgan by default stage 0, stage 1 will use hifigan as vocoder + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then From f54df909d0520ec1933192d54142d0c8bfc393f2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 11 Mar 2025 11:10:29 +0800 Subject: [PATCH 21/46] Bump axios from 0.26.1 to 1.8.2 in /demos/speech_web/web_client (#4001) Bumps [axios](https://github.com/axios/axios) from 0.26.1 to 1.8.2. - [Release notes](https://github.com/axios/axios/releases) - [Changelog](https://github.com/axios/axios/blob/v1.x/CHANGELOG.md) - [Commits](https://github.com/axios/axios/compare/v0.26.1...v1.8.2) --- updated-dependencies: - dependency-name: axios dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- demos/speech_web/web_client/package-lock.json | 596 +++++++++++++++--- demos/speech_web/web_client/package.json | 2 +- demos/speech_web/web_client/yarn.lock | 301 ++++++++- 3 files changed, 791 insertions(+), 108 deletions(-) diff --git a/demos/speech_web/web_client/package-lock.json b/demos/speech_web/web_client/package-lock.json index 509be385c..95e082ecd 100644 --- a/demos/speech_web/web_client/package-lock.json +++ b/demos/speech_web/web_client/package-lock.json @@ -8,8 +8,9 @@ "name": "paddlespeechwebclient", "version": "0.0.0", "dependencies": { + "@element-plus/icons-vue": "^2.0.9", "ant-design-vue": "^2.2.8", - "axios": "^0.26.1", + "axios": "^1.8.2", "element-plus": "^2.1.9", "js-audio-recorder": "0.5.7", "lamejs": "^1.2.1", @@ -18,7 +19,8 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^2.3.0", - "vite": "^2.9.0" + "@vue/compiler-sfc": "^3.1.0", + "vite": "^2.9.13" } }, "node_modules/@ant-design/colors": { @@ -79,9 +81,9 @@ } }, "node_modules/@element-plus/icons-vue": { - "version": "1.1.4", - "resolved": "https://registry.npmmirror.com/@element-plus/icons-vue/-/icons-vue-1.1.4.tgz", - "integrity": "sha512-Iz/nHqdp1sFPmdzRwHkEQQA3lKvoObk8azgABZ81QUOpW9s/lUyQVUSh0tNtEPZXQlKwlSh7SPgoVxzrE0uuVQ==", + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/@element-plus/icons-vue/-/icons-vue-2.3.1.tgz", + "integrity": "sha512-XxVUZv48RZAd87ucGS48jPf6pKu0yV5UCg9f4FFwtrYxXOwWuVJo6wOvSLKEoMQKjv8GsX/mhP6UsC1lRwbUWg==", "license": "MIT", "peerDependencies": { "vue": "^3.2.0" @@ -364,33 +366,46 @@ "integrity": "sha512-Pj2IR7u8hmUEDOwB++su6baaRi+QvsgajuFB9j95foM1N2gy5HM4z60hfusIO0fBPG5uLAEl6yCJr1jNSVugEQ==", "license": "MIT" }, + "node_modules/asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", + "license": "MIT" + }, "node_modules/axios": { - "version": "0.26.1", - "resolved": "https://registry.npmmirror.com/axios/-/axios-0.26.1.tgz", - "integrity": "sha512-fPwcX4EvnSHuInCMItEhAGnaSEXRBjtzh9fOtsE6E1G6p7vl7edEeZe11QHf18+6+9gR5PbKV/sGKNaD8YaMeA==", + "version": "1.8.2", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.8.2.tgz", + "integrity": "sha512-ls4GYBm5aig9vWx8AWDSGLpnpDQRtWAfrjU+EuytuODrFBkqesN2RkOQCBzrA1RQNHw1SmRMSDDDSwzNAYQ6Rg==", "license": "MIT", "dependencies": { - "follow-redirects": "^1.14.8" + "follow-redirects": "^1.15.6", + "form-data": "^4.0.0", + "proxy-from-env": "^1.1.0" } }, - "node_modules/axios/node_modules/follow-redirects": { - "version": "1.14.9", - "resolved": "https://registry.npmmirror.com/follow-redirects/-/follow-redirects-1.14.9.tgz", - "integrity": "sha512-MQDfihBQYMcyy5dhRDJUHcw7lb2Pv/TuE6xP1vyraLukNDHKbDxDNaOE3NbCAdKQApno+GPRyo1YAp89yCjK4w==", - "funding": [ - { - "type": "individual", - "url": "https://github.com/sponsors/RubenVerborgh" - } - ], + "node_modules/call-bind-apply-helpers": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", + "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "function-bind": "^1.1.2" + }, "engines": { - "node": ">=4.0" + "node": ">= 0.4" + } + }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": 
"https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "license": "MIT", + "dependencies": { + "delayed-stream": "~1.0.0" }, - "peerDependenciesMeta": { - "debug": { - "optional": true - } + "engines": { + "node": ">= 0.8" } }, "node_modules/compute-scroll-into-view": { @@ -424,6 +439,15 @@ "integrity": "sha512-JLC809s6Y948/FuCZPm5IX8rRhQwOiyMb2TfVVQEixG7P8Lm/gt5S7yoQZmC8x1UehI9Pb7sksEt4xx14m+7Ug==", "license": "MIT" }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "license": "MIT", + "engines": { + "node": ">=0.4.0" + } + }, "node_modules/dom-align": { "version": "1.12.3", "resolved": "https://registry.npmmirror.com/dom-align/-/dom-align-1.12.3.tgz", @@ -434,6 +458,20 @@ "resolved": "https://registry.npmmirror.com/dom-scroll-into-view/-/dom-scroll-into-view-2.0.1.tgz", "integrity": "sha512-bvVTQe1lfaUr1oFzZX80ce9KLDlZ3iU+XGNE/bz9HnGdklTieqsbmsLHe+rT2XWqopvL0PckkYqN7ksmm5pe3w==" }, + "node_modules/dunder-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", + "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.1", + "es-errors": "^1.3.0", + "gopd": "^1.2.0" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/element-plus": { "version": "2.1.9", "resolved": "https://registry.npmmirror.com/element-plus/-/element-plus-2.1.9.tgz", @@ -460,6 +498,15 @@ "vue": "^3.2.0" } }, + "node_modules/element-plus/node_modules/@element-plus/icons-vue": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/@element-plus/icons-vue/-/icons-vue-1.1.4.tgz", + "integrity": "sha512-Iz/nHqdp1sFPmdzRwHkEQQA3lKvoObk8azgABZ81QUOpW9s/lUyQVUSh0tNtEPZXQlKwlSh7SPgoVxzrE0uuVQ==", + "license": "MIT", + "peerDependencies": { + "vue": "^3.2.0" + } + }, "node_modules/errno": { "version": "0.1.8", "resolved": "https://registry.npmmirror.com/errno/-/errno-0.1.8.tgz", @@ -472,6 +519,51 @@ "errno": "cli.js" } }, + "node_modules/es-define-property": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", + "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-object-atoms": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", + "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-set-tostringtag": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", + "integrity": 
"sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6", + "has-tostringtag": "^1.0.2", + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/esbuild": { "version": "0.14.36", "resolved": "https://registry.npmmirror.com/esbuild/-/esbuild-0.14.36.tgz", @@ -537,6 +629,41 @@ "integrity": "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w==", "license": "MIT" }, + "node_modules/follow-redirects": { + "version": "1.15.9", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.9.tgz", + "integrity": "sha512-gew4GsXizNgdoRyqmyfMHyAmXsZDk6mHkSxZFCzW9gwlbtOW44CDtYavM+y+72qD/Vq2l550kMF52DT8fOLJqQ==", + "funding": [ + { + "type": "individual", + "url": "https://github.com/sponsors/RubenVerborgh" + } + ], + "license": "MIT", + "engines": { + "node": ">=4.0" + }, + "peerDependenciesMeta": { + "debug": { + "optional": true + } + } + }, + "node_modules/form-data": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.2.tgz", + "integrity": "sha512-hGfm/slu0ZabnNt4oaRZ6uREyfCj6P4fT/n6A1rGV+Z0VdGXjfOhVUpkn6qVQONHGIFwmveGXyDs75+nr6FM8w==", + "license": "MIT", + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "es-set-tostringtag": "^2.1.0", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, "node_modules/fsevents": { "version": "2.3.2", "resolved": "https://registry.npmmirror.com/fsevents/-/fsevents-2.3.2.tgz", @@ -552,11 +679,62 @@ } }, "node_modules/function-bind": { - "version": "1.1.1", - "resolved": "https://registry.npmmirror.com/function-bind/-/function-bind-1.1.1.tgz", - "integrity": "sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A==", - "dev": true, - "license": "MIT" + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/get-intrinsic": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", + "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.2", + "es-define-property": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.1.1", + "function-bind": "^1.1.2", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "math-intrinsics": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/get-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", + "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", + "license": "MIT", + "dependencies": { + "dunder-proto": "^1.0.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/gopd": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", + "integrity": 
"sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } }, "node_modules/graceful-fs": { "version": "4.2.10", @@ -577,6 +755,45 @@ "node": ">= 0.4.0" } }, + "node_modules/has-symbols": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", + "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-tostringtag": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", + "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", + "license": "MIT", + "dependencies": { + "has-symbols": "^1.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/hasown": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", + "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "license": "MIT", + "dependencies": { + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/iconv-lite": { "version": "0.4.24", "resolved": "https://registry.npmmirror.com/iconv-lite/-/iconv-lite-0.4.24.tgz", @@ -728,6 +945,15 @@ "node": ">=6" } }, + "node_modules/math-intrinsics": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", + "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, "node_modules/memoize-one": { "version": "6.0.0", "resolved": "https://registry.npmmirror.com/memoize-one/-/memoize-one-6.0.0.tgz", @@ -746,6 +972,27 @@ "node": ">=4" } }, + "node_modules/mime-db": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", + "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime-types": { + "version": "2.1.35", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", + "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "license": "MIT", + "dependencies": { + "mime-db": "1.52.0" + }, + "engines": { + "node": ">= 0.6" + } + }, "node_modules/moment": { "version": "2.29.4", "resolved": "https://registry.npmjs.org/moment/-/moment-2.29.4.tgz", @@ -755,9 +1002,15 @@ } }, "node_modules/nanoid": { - "version": "3.3.2", - "resolved": "https://registry.npmmirror.com/nanoid/-/nanoid-3.3.2.tgz", - "integrity": "sha512-CuHBogktKwpm5g2sRgv83jEy2ijFzBwMoYA60orPDR7ynsLijJDqgsi4RDGj3OJpy3Ieb+LYwiRmIOGyytgITA==", + "version": "3.3.9", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.9.tgz", + "integrity": "sha512-SppoicMGpZvbF1l3z4x7No3OlIjP7QJvC9XR7AhZr1kL133KHnKPztkKDc+Ir4aJ/1VhTySrtKhrsycmrMQfvg==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], "license": "MIT", "bin": { "nanoid": "bin/nanoid.cjs" @@ -830,9 
+1083,9 @@ "license": "MIT" }, "node_modules/picocolors": { - "version": "1.0.0", - "resolved": "https://registry.npmmirror.com/picocolors/-/picocolors-1.0.0.tgz", - "integrity": "sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", "license": "ISC" }, "node_modules/pify": { @@ -845,9 +1098,9 @@ } }, "node_modules/postcss": { - "version": "8.4.12", - "resolved": "https://registry.npmmirror.com/postcss/-/postcss-8.4.12.tgz", - "integrity": "sha512-lg6eITwYe9v6Hr5CncVbK70SoioNQIq81nsaG86ev5hAidQvmOeETBqs7jm43K2F5/Ley3ytDtriImV6TpNiSg==", + "version": "8.5.3", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.3.tgz", + "integrity": "sha512-dle9A3yYxlBSrt8Fu+IpjGT8SY8hN0mlaA6GY8t0P5PjIOZemULz/E2Bnm/2dcUOena75OTNkHI76uZBNUUq3A==", "funding": [ { "type": "opencollective", @@ -856,18 +1109,28 @@ { "type": "tidelift", "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" } ], "license": "MIT", "dependencies": { - "nanoid": "^3.3.1", - "picocolors": "^1.0.0", - "source-map-js": "^1.0.2" + "nanoid": "^3.3.8", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" }, "engines": { "node": "^10 || ^12 || >=14" } }, + "node_modules/proxy-from-env": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", + "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==", + "license": "MIT" + }, "node_modules/prr": { "version": "1.0.1", "resolved": "https://registry.npmmirror.com/prr/-/prr-1.0.1.tgz", @@ -962,9 +1225,9 @@ } }, "node_modules/source-map-js": { - "version": "1.0.2", - "resolved": "https://registry.npmmirror.com/source-map-js/-/source-map-js-1.0.2.tgz", - "integrity": "sha512-R0XvVJ9WusLiqTCEiGCmICCMplcCkIwwR11mOSD9CR5u+IXYdiseeEuXCVAjS54zqwkLcPNnmU4OeJ6tUrWhDw==", + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", "license": "BSD-3-Clause", "engines": { "node": ">=0.10.0" @@ -1001,16 +1264,16 @@ "license": "ISC" }, "node_modules/vite": { - "version": "2.9.1", - "resolved": "https://registry.npmmirror.com/vite/-/vite-2.9.1.tgz", - "integrity": "sha512-vSlsSdOYGcYEJfkQ/NeLXgnRv5zZfpAsdztkIrs7AZHV8RCMZQkwjo4DS5BnrYTqoWqLoUe1Cah4aVO4oNNqCQ==", + "version": "2.9.18", + "resolved": "https://registry.npmjs.org/vite/-/vite-2.9.18.tgz", + "integrity": "sha512-sAOqI5wNM9QvSEE70W3UGMdT8cyEn0+PmJMTFvTB8wB0YbYUWw3gUbY62AOyrXosGieF2htmeLATvNxpv/zNyQ==", "dev": true, "license": "MIT", "dependencies": { "esbuild": "^0.14.27", - "postcss": "^8.4.12", + "postcss": "^8.4.13", "resolve": "^1.22.0", - "rollup": "^2.59.0" + "rollup": ">=2.59.0 <2.78.0" }, "bin": { "vite": "bin/vite.js" @@ -1142,9 +1405,9 @@ "integrity": "sha512-ej5oVy6lykXsvieQtqZxCOaLT+xD4+QNarq78cIYISHmZXshCvROLudpQN3lfL8G0NL7plMSSK+zlyvCaIJ4Iw==" }, "@element-plus/icons-vue": { - "version": "1.1.4", - "resolved": "https://registry.npmmirror.com/@element-plus/icons-vue/-/icons-vue-1.1.4.tgz", - "integrity": "sha512-Iz/nHqdp1sFPmdzRwHkEQQA3lKvoObk8azgABZ81QUOpW9s/lUyQVUSh0tNtEPZXQlKwlSh7SPgoVxzrE0uuVQ==", + "version": "2.3.1", + "resolved": 
"https://registry.npmjs.org/@element-plus/icons-vue/-/icons-vue-2.3.1.tgz", + "integrity": "sha512-XxVUZv48RZAd87ucGS48jPf6pKu0yV5UCg9f4FFwtrYxXOwWuVJo6wOvSLKEoMQKjv8GsX/mhP6UsC1lRwbUWg==", "requires": {} }, "@floating-ui/core": { @@ -1356,19 +1619,36 @@ "resolved": "https://registry.npmmirror.com/async-validator/-/async-validator-4.0.7.tgz", "integrity": "sha512-Pj2IR7u8hmUEDOwB++su6baaRi+QvsgajuFB9j95foM1N2gy5HM4z60hfusIO0fBPG5uLAEl6yCJr1jNSVugEQ==" }, + "asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==" + }, "axios": { - "version": "0.26.1", - "resolved": "https://registry.npmmirror.com/axios/-/axios-0.26.1.tgz", - "integrity": "sha512-fPwcX4EvnSHuInCMItEhAGnaSEXRBjtzh9fOtsE6E1G6p7vl7edEeZe11QHf18+6+9gR5PbKV/sGKNaD8YaMeA==", + "version": "1.8.2", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.8.2.tgz", + "integrity": "sha512-ls4GYBm5aig9vWx8AWDSGLpnpDQRtWAfrjU+EuytuODrFBkqesN2RkOQCBzrA1RQNHw1SmRMSDDDSwzNAYQ6Rg==", "requires": { - "follow-redirects": "^1.14.8" - }, - "dependencies": { - "follow-redirects": { - "version": "1.14.9", - "resolved": "https://registry.npmmirror.com/follow-redirects/-/follow-redirects-1.14.9.tgz", - "integrity": "sha512-MQDfihBQYMcyy5dhRDJUHcw7lb2Pv/TuE6xP1vyraLukNDHKbDxDNaOE3NbCAdKQApno+GPRyo1YAp89yCjK4w==" - } + "follow-redirects": "^1.15.6", + "form-data": "^4.0.0", + "proxy-from-env": "^1.1.0" + } + }, + "call-bind-apply-helpers": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", + "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", + "requires": { + "es-errors": "^1.3.0", + "function-bind": "^1.1.2" + } + }, + "combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "requires": { + "delayed-stream": "~1.0.0" } }, "compute-scroll-into-view": { @@ -1399,6 +1679,11 @@ "resolved": "https://registry.npmmirror.com/dayjs/-/dayjs-1.11.0.tgz", "integrity": "sha512-JLC809s6Y948/FuCZPm5IX8rRhQwOiyMb2TfVVQEixG7P8Lm/gt5S7yoQZmC8x1UehI9Pb7sksEt4xx14m+7Ug==" }, + "delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==" + }, "dom-align": { "version": "1.12.3", "resolved": "https://registry.npmmirror.com/dom-align/-/dom-align-1.12.3.tgz", @@ -1409,6 +1694,16 @@ "resolved": "https://registry.npmmirror.com/dom-scroll-into-view/-/dom-scroll-into-view-2.0.1.tgz", "integrity": "sha512-bvVTQe1lfaUr1oFzZX80ce9KLDlZ3iU+XGNE/bz9HnGdklTieqsbmsLHe+rT2XWqopvL0PckkYqN7ksmm5pe3w==" }, + "dunder-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", + "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", + "requires": { + "call-bind-apply-helpers": "^1.0.1", + "es-errors": "^1.3.0", + "gopd": "^1.2.0" + } + }, "element-plus": { "version": "2.1.9", "resolved": "https://registry.npmmirror.com/element-plus/-/element-plus-2.1.9.tgz", @@ -1429,6 +1724,14 @@ "lodash-unified": "^1.0.2", 
"memoize-one": "^6.0.0", "normalize-wheel-es": "^1.1.2" + }, + "dependencies": { + "@element-plus/icons-vue": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/@element-plus/icons-vue/-/icons-vue-1.1.4.tgz", + "integrity": "sha512-Iz/nHqdp1sFPmdzRwHkEQQA3lKvoObk8azgABZ81QUOpW9s/lUyQVUSh0tNtEPZXQlKwlSh7SPgoVxzrE0uuVQ==", + "requires": {} + } } }, "errno": { @@ -1440,6 +1743,35 @@ "prr": "~1.0.1" } }, + "es-define-property": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", + "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==" + }, + "es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==" + }, + "es-object-atoms": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", + "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", + "requires": { + "es-errors": "^1.3.0" + } + }, + "es-set-tostringtag": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", + "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", + "requires": { + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6", + "has-tostringtag": "^1.0.2", + "hasown": "^2.0.2" + } + }, "esbuild": { "version": "0.14.36", "resolved": "https://registry.npmmirror.com/esbuild/-/esbuild-0.14.36.tgz", @@ -1485,6 +1817,22 @@ "resolved": "https://registry.npmmirror.com/estree-walker/-/estree-walker-2.0.2.tgz", "integrity": "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w==" }, + "follow-redirects": { + "version": "1.15.9", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.9.tgz", + "integrity": "sha512-gew4GsXizNgdoRyqmyfMHyAmXsZDk6mHkSxZFCzW9gwlbtOW44CDtYavM+y+72qD/Vq2l550kMF52DT8fOLJqQ==" + }, + "form-data": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.2.tgz", + "integrity": "sha512-hGfm/slu0ZabnNt4oaRZ6uREyfCj6P4fT/n6A1rGV+Z0VdGXjfOhVUpkn6qVQONHGIFwmveGXyDs75+nr6FM8w==", + "requires": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "es-set-tostringtag": "^2.1.0", + "mime-types": "^2.1.12" + } + }, "fsevents": { "version": "2.3.2", "resolved": "https://registry.npmmirror.com/fsevents/-/fsevents-2.3.2.tgz", @@ -1493,10 +1841,40 @@ "optional": true }, "function-bind": { - "version": "1.1.1", - "resolved": "https://registry.npmmirror.com/function-bind/-/function-bind-1.1.1.tgz", - "integrity": "sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A==", - "dev": true + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==" + }, + "get-intrinsic": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", + "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", + "requires": { + "call-bind-apply-helpers": "^1.0.2", + "es-define-property": "^1.0.1", + "es-errors": "^1.3.0", + 
"es-object-atoms": "^1.1.1", + "function-bind": "^1.1.2", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "math-intrinsics": "^1.1.0" + } + }, + "get-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", + "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", + "requires": { + "dunder-proto": "^1.0.1", + "es-object-atoms": "^1.0.0" + } + }, + "gopd": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", + "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==" }, "graceful-fs": { "version": "4.2.10", @@ -1513,6 +1891,27 @@ "function-bind": "^1.1.1" } }, + "has-symbols": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", + "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==" + }, + "has-tostringtag": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", + "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", + "requires": { + "has-symbols": "^1.0.3" + } + }, + "hasown": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", + "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "requires": { + "function-bind": "^1.1.2" + } + }, "iconv-lite": { "version": "0.4.24", "resolved": "https://registry.npmmirror.com/iconv-lite/-/iconv-lite-0.4.24.tgz", @@ -1624,6 +2023,11 @@ "semver": "^5.6.0" } }, + "math-intrinsics": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", + "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==" + }, "memoize-one": { "version": "6.0.0", "resolved": "https://registry.npmmirror.com/memoize-one/-/memoize-one-6.0.0.tgz", @@ -1635,15 +2039,28 @@ "integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==", "optional": true }, + "mime-db": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", + "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==" + }, + "mime-types": { + "version": "2.1.35", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", + "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "requires": { + "mime-db": "1.52.0" + } + }, "moment": { "version": "2.29.4", "resolved": "https://registry.npmjs.org/moment/-/moment-2.29.4.tgz", "integrity": "sha512-5LC9SOxjSc2HF6vO2CyuTDNivEdoz2IvyJJGj6X8DJ0eFyfszE0QiEd+iXmBvUP3WHxSjFH/vIsA0EN00cgr8w==" }, "nanoid": { - "version": "3.3.2", - "resolved": "https://registry.npmmirror.com/nanoid/-/nanoid-3.3.2.tgz", - "integrity": "sha512-CuHBogktKwpm5g2sRgv83jEy2ijFzBwMoYA60orPDR7ynsLijJDqgsi4RDGj3OJpy3Ieb+LYwiRmIOGyytgITA==" + "version": "3.3.9", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.9.tgz", + "integrity": "sha512-SppoicMGpZvbF1l3z4x7No3OlIjP7QJvC9XR7AhZr1kL133KHnKPztkKDc+Ir4aJ/1VhTySrtKhrsycmrMQfvg==" }, "nanopop": { "version": "2.1.0", @@ -1700,9 +2117,9 @@ "dev": true }, "picocolors": { - 
"version": "1.0.0", - "resolved": "https://registry.npmmirror.com/picocolors/-/picocolors-1.0.0.tgz", - "integrity": "sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==" + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==" }, "pify": { "version": "4.0.1", @@ -1711,15 +2128,20 @@ "optional": true }, "postcss": { - "version": "8.4.12", - "resolved": "https://registry.npmmirror.com/postcss/-/postcss-8.4.12.tgz", - "integrity": "sha512-lg6eITwYe9v6Hr5CncVbK70SoioNQIq81nsaG86ev5hAidQvmOeETBqs7jm43K2F5/Ley3ytDtriImV6TpNiSg==", + "version": "8.5.3", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.3.tgz", + "integrity": "sha512-dle9A3yYxlBSrt8Fu+IpjGT8SY8hN0mlaA6GY8t0P5PjIOZemULz/E2Bnm/2dcUOena75OTNkHI76uZBNUUq3A==", "requires": { - "nanoid": "^3.3.1", - "picocolors": "^1.0.0", - "source-map-js": "^1.0.2" + "nanoid": "^3.3.8", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" } }, + "proxy-from-env": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", + "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" + }, "prr": { "version": "1.0.1", "resolved": "https://registry.npmmirror.com/prr/-/prr-1.0.1.tgz", @@ -1793,9 +2215,9 @@ "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==" }, "source-map-js": { - "version": "1.0.2", - "resolved": "https://registry.npmmirror.com/source-map-js/-/source-map-js-1.0.2.tgz", - "integrity": "sha512-R0XvVJ9WusLiqTCEiGCmICCMplcCkIwwR11mOSD9CR5u+IXYdiseeEuXCVAjS54zqwkLcPNnmU4OeJ6tUrWhDw==" + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==" }, "sourcemap-codec": { "version": "1.4.8", @@ -1819,16 +2241,16 @@ "integrity": "sha512-IeiWvvEXfW5ltKVMkxq6FvNf2LojMKvB2OCeja6+ct24S1XOmQw2dGr2JyndwACWAGJva9B7yPHwAmeA9QCqAQ==" }, "vite": { - "version": "2.9.1", - "resolved": "https://registry.npmmirror.com/vite/-/vite-2.9.1.tgz", - "integrity": "sha512-vSlsSdOYGcYEJfkQ/NeLXgnRv5zZfpAsdztkIrs7AZHV8RCMZQkwjo4DS5BnrYTqoWqLoUe1Cah4aVO4oNNqCQ==", + "version": "2.9.18", + "resolved": "https://registry.npmjs.org/vite/-/vite-2.9.18.tgz", + "integrity": "sha512-sAOqI5wNM9QvSEE70W3UGMdT8cyEn0+PmJMTFvTB8wB0YbYUWw3gUbY62AOyrXosGieF2htmeLATvNxpv/zNyQ==", "dev": true, "requires": { "esbuild": "^0.14.27", "fsevents": "~2.3.2", - "postcss": "^8.4.12", + "postcss": "^8.4.13", "resolve": "^1.22.0", - "rollup": "^2.59.0" + "rollup": ">=2.59.0 <2.78.0" } }, "vue": { diff --git a/demos/speech_web/web_client/package.json b/demos/speech_web/web_client/package.json index d8c213e4a..f00afbd25 100644 --- a/demos/speech_web/web_client/package.json +++ b/demos/speech_web/web_client/package.json @@ -10,7 +10,7 @@ "dependencies": { "@element-plus/icons-vue": "^2.0.9", "ant-design-vue": "^2.2.8", - "axios": "^0.26.1", + "axios": "^1.8.2", "element-plus": "^2.1.9", "js-audio-recorder": "0.5.7", "lamejs": "^1.2.1", diff --git a/demos/speech_web/web_client/yarn.lock b/demos/speech_web/web_client/yarn.lock index 7f07daa06..741bfc005 100644 --- a/demos/speech_web/web_client/yarn.lock +++ b/demos/speech_web/web_client/yarn.lock @@ -22,11 +22,28 @@ 
"@ant-design/colors" "^6.0.0" "@ant-design/icons-svg" "^4.2.1" +"@babel/helper-string-parser@^7.25.9": + version "7.25.9" + resolved "https://registry.yarnpkg.com/@babel/helper-string-parser/-/helper-string-parser-7.25.9.tgz#1aabb72ee72ed35789b4bbcad3ca2862ce614e8c" + integrity sha512-4A/SCr/2KLd5jrtOMFzaKjVtAei3+2r/NChoBNoZ3EyP/+GlhoaEGoWOZUmFmoITP7zOJyHIMm+DYRd8o3PvHA== + +"@babel/helper-validator-identifier@^7.25.9": + version "7.25.9" + resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.25.9.tgz#24b64e2c3ec7cd3b3c547729b8d16871f22cbdc7" + integrity sha512-Ed61U6XJc3CVRfkERJWDz4dJwKe7iLmmJsbOGu9wSloNSFttHV0I8g6UAgb7qnK5ly5bGLPd4oXZlxCdANBOWQ== + "@babel/parser@^7.16.4": version "7.17.9" resolved "https://registry.npmmirror.com/@babel/parser/-/parser-7.17.9.tgz" integrity sha512-vqUSBLP8dQHFPdPi9bc5GK9vRkYHJ49fsZdtoJ8EQ8ibpwk5rPKfvNIwChB0KVXcIjcepEBBd2VHC5r9Gy8ueg== +"@babel/parser@^7.25.3": + version "7.26.9" + resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.26.9.tgz#d9e78bee6dc80f9efd8f2349dcfbbcdace280fd5" + integrity sha512-81NWa1njQblgZbQHxWHpxxCzNsa3ZwvFqpUg7P+NNUU6f3UU2jBEg4OlF/J6rl8+PQGh1q6/zWScd001YwcA5A== + dependencies: + "@babel/types" "^7.26.9" + "@babel/runtime@^7.10.5": version "7.17.9" resolved "https://registry.npmmirror.com/@babel/runtime/-/runtime-7.17.9.tgz" @@ -34,6 +51,14 @@ dependencies: regenerator-runtime "^0.13.4" +"@babel/types@^7.26.9": + version "7.26.9" + resolved "https://registry.yarnpkg.com/@babel/types/-/types-7.26.9.tgz#08b43dec79ee8e682c2ac631c010bdcac54a21ce" + integrity sha512-Y3IR1cRnOxOCDvMmNiym7XpXQ93iGDDPHx+Zj+NM+rg0fBaShfQLkg+hKPaZCEvg5N/LeCo4+Rj/i3FuJsIQaw== + dependencies: + "@babel/helper-string-parser" "^7.25.9" + "@babel/helper-validator-identifier" "^7.25.9" + "@ctrl/tinycolor@^3.4.0": version "3.4.1" resolved "https://registry.npmmirror.com/@ctrl/tinycolor/-/tinycolor-3.4.1.tgz" @@ -61,6 +86,11 @@ dependencies: "@floating-ui/core" "^0.6.1" +"@jridgewell/sourcemap-codec@^1.5.0": + version "1.5.0" + resolved "https://registry.yarnpkg.com/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.0.tgz#3188bcb273a414b0d215fd22a58540b989b9409a" + integrity sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ== + "@popperjs/core@^2.11.4": version "2.11.5" resolved "https://registry.npmmirror.com/@popperjs/core/-/core-2.11.5.tgz" @@ -101,6 +131,17 @@ estree-walker "^2.0.2" source-map "^0.6.1" +"@vue/compiler-core@3.5.13": + version "3.5.13" + resolved "https://registry.yarnpkg.com/@vue/compiler-core/-/compiler-core-3.5.13.tgz#b0ae6c4347f60c03e849a05d34e5bf747c9bda05" + integrity sha512-oOdAkwqUfW1WqpwSYJce06wvt6HljgY3fGeM9NcVA1HaYOij3mZG9Rkysn0OHuyUAGMbEbARIpsG+LPVlBJ5/Q== + dependencies: + "@babel/parser" "^7.25.3" + "@vue/shared" "3.5.13" + entities "^4.5.0" + estree-walker "^2.0.2" + source-map-js "^1.2.0" + "@vue/compiler-dom@3.2.32": version "3.2.32" resolved "https://registry.npmmirror.com/@vue/compiler-dom/-/compiler-dom-3.2.32.tgz" @@ -109,6 +150,14 @@ "@vue/compiler-core" "3.2.32" "@vue/shared" "3.2.32" +"@vue/compiler-dom@3.5.13": + version "3.5.13" + resolved "https://registry.yarnpkg.com/@vue/compiler-dom/-/compiler-dom-3.5.13.tgz#bb1b8758dbc542b3658dda973b98a1c9311a8a58" + integrity sha512-ZOJ46sMOKUjO3e94wPdCzQ6P1Lx/vhp2RSvfaab88Ajexs0AHeV0uasYhi99WPaogmBlRHNRuly8xV75cNTMDA== + dependencies: + "@vue/compiler-core" "3.5.13" + "@vue/shared" "3.5.13" + "@vue/compiler-sfc@3.2.32": version "3.2.32" resolved 
"https://registry.npmmirror.com/@vue/compiler-sfc/-/compiler-sfc-3.2.32.tgz" @@ -125,6 +174,21 @@ postcss "^8.1.10" source-map "^0.6.1" +"@vue/compiler-sfc@^3.1.0": + version "3.5.13" + resolved "https://registry.yarnpkg.com/@vue/compiler-sfc/-/compiler-sfc-3.5.13.tgz#461f8bd343b5c06fac4189c4fef8af32dea82b46" + integrity sha512-6VdaljMpD82w6c2749Zhf5T9u5uLBWKnVue6XWxprDobftnletJ8+oel7sexFfM3qIxNmVE7LSFGTpv6obNyaQ== + dependencies: + "@babel/parser" "^7.25.3" + "@vue/compiler-core" "3.5.13" + "@vue/compiler-dom" "3.5.13" + "@vue/compiler-ssr" "3.5.13" + "@vue/shared" "3.5.13" + estree-walker "^2.0.2" + magic-string "^0.30.11" + postcss "^8.4.48" + source-map-js "^1.2.0" + "@vue/compiler-ssr@3.2.32": version "3.2.32" resolved "https://registry.npmmirror.com/@vue/compiler-ssr/-/compiler-ssr-3.2.32.tgz" @@ -133,6 +197,14 @@ "@vue/compiler-dom" "3.2.32" "@vue/shared" "3.2.32" +"@vue/compiler-ssr@3.5.13": + version "3.5.13" + resolved "https://registry.yarnpkg.com/@vue/compiler-ssr/-/compiler-ssr-3.5.13.tgz#e771adcca6d3d000f91a4277c972a996d07f43ba" + integrity sha512-wMH6vrYHxQl/IybKJagqbquvxpWCuVYpoUJfCqFZwa/JY1GdATAQ+TgVtgrwwMZ0D07QhA99rs/EAAWfvG6KpA== + dependencies: + "@vue/compiler-dom" "3.5.13" + "@vue/shared" "3.5.13" + "@vue/reactivity-transform@3.2.32": version "3.2.32" resolved "https://registry.npmmirror.com/@vue/reactivity-transform/-/reactivity-transform-3.2.32.tgz" @@ -181,6 +253,11 @@ resolved "https://registry.npmmirror.com/@vue/shared/-/shared-3.2.32.tgz" integrity sha512-bjcixPErUsAnTQRQX4Z5IQnICYjIfNCyCl8p29v1M6kfVzvwOICPw+dz48nNuWlTOOx2RHhzHdazJibE8GSnsw== +"@vue/shared@3.5.13": + version "3.5.13" + resolved "https://registry.yarnpkg.com/@vue/shared/-/shared-3.5.13.tgz#87b309a6379c22b926e696893237826f64339b6f" + integrity sha512-/hnE/qP5ZoGpol0a5mDi45bOd7t3tjYJBjsgCsivow7D48cJeV5l05RD82lPqi7gRiphZM37rnhW1l6ZoCNNnQ== + "@vueuse/core@^8.2.4": version "8.2.5" resolved "https://registry.npmmirror.com/@vueuse/core/-/core-8.2.5.tgz" @@ -239,12 +316,34 @@ async-validator@^4.0.7: resolved "https://registry.npmmirror.com/async-validator/-/async-validator-4.0.7.tgz" integrity sha512-Pj2IR7u8hmUEDOwB++su6baaRi+QvsgajuFB9j95foM1N2gy5HM4z60hfusIO0fBPG5uLAEl6yCJr1jNSVugEQ== -axios@^0.26.1: - version "0.26.1" - resolved "https://registry.npmmirror.com/axios/-/axios-0.26.1.tgz" - integrity sha512-fPwcX4EvnSHuInCMItEhAGnaSEXRBjtzh9fOtsE6E1G6p7vl7edEeZe11QHf18+6+9gR5PbKV/sGKNaD8YaMeA== +asynckit@^0.4.0: + version "0.4.0" + resolved "https://registry.yarnpkg.com/asynckit/-/asynckit-0.4.0.tgz#c79ed97f7f34cb8f2ba1bc9790bcc366474b4b79" + integrity sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q== + +axios@^1.8.2: + version "1.8.2" + resolved "https://registry.yarnpkg.com/axios/-/axios-1.8.2.tgz#fabe06e241dfe83071d4edfbcaa7b1c3a40f7979" + integrity sha512-ls4GYBm5aig9vWx8AWDSGLpnpDQRtWAfrjU+EuytuODrFBkqesN2RkOQCBzrA1RQNHw1SmRMSDDDSwzNAYQ6Rg== + dependencies: + follow-redirects "^1.15.6" + form-data "^4.0.0" + proxy-from-env "^1.1.0" + +call-bind-apply-helpers@^1.0.1, call-bind-apply-helpers@^1.0.2: + version "1.0.2" + resolved "https://registry.yarnpkg.com/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz#4b5428c222be985d79c3d82657479dbe0b59b2d6" + integrity sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ== dependencies: - follow-redirects "^1.14.8" + es-errors "^1.3.0" + function-bind "^1.1.2" + +combined-stream@^1.0.8: + version "1.0.8" + resolved 
"https://registry.yarnpkg.com/combined-stream/-/combined-stream-1.0.8.tgz#c3d45a8b34fd730631a110a8a2520682b31d5a7f" + integrity sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg== + dependencies: + delayed-stream "~1.0.0" compute-scroll-into-view@^1.0.17: version "1.0.17" @@ -280,6 +379,11 @@ debug@^3.2.6: dependencies: ms "^2.1.1" +delayed-stream@~1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/delayed-stream/-/delayed-stream-1.0.0.tgz#df3ae199acadfb7d440aaae0b29e2272b24ec619" + integrity sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ== + dom-align@^1.12.1: version "1.12.3" resolved "https://registry.npmmirror.com/dom-align/-/dom-align-1.12.3.tgz" @@ -290,6 +394,15 @@ dom-scroll-into-view@^2.0.0: resolved "https://registry.npmmirror.com/dom-scroll-into-view/-/dom-scroll-into-view-2.0.1.tgz" integrity sha512-bvVTQe1lfaUr1oFzZX80ce9KLDlZ3iU+XGNE/bz9HnGdklTieqsbmsLHe+rT2XWqopvL0PckkYqN7ksmm5pe3w== +dunder-proto@^1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/dunder-proto/-/dunder-proto-1.0.1.tgz#d7ae667e1dc83482f8b70fd0f6eefc50da30f58a" + integrity sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A== + dependencies: + call-bind-apply-helpers "^1.0.1" + es-errors "^1.3.0" + gopd "^1.2.0" + element-plus@^2.1.9: version "2.1.9" resolved "https://registry.npmmirror.com/element-plus/-/element-plus-2.1.9.tgz" @@ -311,6 +424,11 @@ element-plus@^2.1.9: memoize-one "^6.0.0" normalize-wheel-es "^1.1.2" +entities@^4.5.0: + version "4.5.0" + resolved "https://registry.yarnpkg.com/entities/-/entities-4.5.0.tgz#5d268ea5e7113ec74c4d033b79ea5a35a488fb48" + integrity sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw== + errno@^0.1.1: version "0.1.8" resolved "https://registry.npmmirror.com/errno/-/errno-0.1.8.tgz" @@ -318,6 +436,33 @@ errno@^0.1.1: dependencies: prr "~1.0.1" +es-define-property@^1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/es-define-property/-/es-define-property-1.0.1.tgz#983eb2f9a6724e9303f61addf011c72e09e0b0fa" + integrity sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g== + +es-errors@^1.3.0: + version "1.3.0" + resolved "https://registry.yarnpkg.com/es-errors/-/es-errors-1.3.0.tgz#05f75a25dab98e4fb1dcd5e1472c0546d5057c8f" + integrity sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw== + +es-object-atoms@^1.0.0, es-object-atoms@^1.1.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/es-object-atoms/-/es-object-atoms-1.1.1.tgz#1c4f2c4837327597ce69d2ca190a7fdd172338c1" + integrity sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA== + dependencies: + es-errors "^1.3.0" + +es-set-tostringtag@^2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz#f31dbbe0c183b00a6d26eb6325c810c0fd18bd4d" + integrity sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA== + dependencies: + es-errors "^1.3.0" + get-intrinsic "^1.2.6" + has-tostringtag "^1.0.2" + hasown "^2.0.2" + esbuild-android-64@0.14.36: version "0.14.36" resolved "https://registry.yarnpkg.com/esbuild-android-64/-/esbuild-android-64-0.14.36.tgz#fc5f95ce78c8c3d790fa16bc71bd904f2bb42aa1" @@ -454,10 +599,20 @@ estree-walker@^2.0.2: resolved 
"https://registry.npmmirror.com/estree-walker/-/estree-walker-2.0.2.tgz" integrity sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w== -follow-redirects@^1.14.8: - version "1.14.9" - resolved "https://registry.npmmirror.com/follow-redirects/-/follow-redirects-1.14.9.tgz" - integrity sha512-MQDfihBQYMcyy5dhRDJUHcw7lb2Pv/TuE6xP1vyraLukNDHKbDxDNaOE3NbCAdKQApno+GPRyo1YAp89yCjK4w== +follow-redirects@^1.15.6: + version "1.15.9" + resolved "https://registry.yarnpkg.com/follow-redirects/-/follow-redirects-1.15.9.tgz#a604fa10e443bf98ca94228d9eebcc2e8a2c8ee1" + integrity sha512-gew4GsXizNgdoRyqmyfMHyAmXsZDk6mHkSxZFCzW9gwlbtOW44CDtYavM+y+72qD/Vq2l550kMF52DT8fOLJqQ== + +form-data@^4.0.0: + version "4.0.2" + resolved "https://registry.yarnpkg.com/form-data/-/form-data-4.0.2.tgz#35cabbdd30c3ce73deb2c42d3c8d3ed9ca51794c" + integrity sha512-hGfm/slu0ZabnNt4oaRZ6uREyfCj6P4fT/n6A1rGV+Z0VdGXjfOhVUpkn6qVQONHGIFwmveGXyDs75+nr6FM8w== + dependencies: + asynckit "^0.4.0" + combined-stream "^1.0.8" + es-set-tostringtag "^2.1.0" + mime-types "^2.1.12" fsevents@~2.3.2: version "2.3.2" @@ -469,11 +624,57 @@ function-bind@^1.1.1: resolved "https://registry.npmmirror.com/function-bind/-/function-bind-1.1.1.tgz" integrity sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A== +function-bind@^1.1.2: + version "1.1.2" + resolved "https://registry.yarnpkg.com/function-bind/-/function-bind-1.1.2.tgz#2c02d864d97f3ea6c8830c464cbd11ab6eab7a1c" + integrity sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA== + +get-intrinsic@^1.2.6: + version "1.3.0" + resolved "https://registry.yarnpkg.com/get-intrinsic/-/get-intrinsic-1.3.0.tgz#743f0e3b6964a93a5491ed1bffaae054d7f98d01" + integrity sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ== + dependencies: + call-bind-apply-helpers "^1.0.2" + es-define-property "^1.0.1" + es-errors "^1.3.0" + es-object-atoms "^1.1.1" + function-bind "^1.1.2" + get-proto "^1.0.1" + gopd "^1.2.0" + has-symbols "^1.1.0" + hasown "^2.0.2" + math-intrinsics "^1.1.0" + +get-proto@^1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/get-proto/-/get-proto-1.0.1.tgz#150b3f2743869ef3e851ec0c49d15b1d14d00ee1" + integrity sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g== + dependencies: + dunder-proto "^1.0.1" + es-object-atoms "^1.0.0" + +gopd@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/gopd/-/gopd-1.2.0.tgz#89f56b8217bdbc8802bd299df6d7f1081d7e51a1" + integrity sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg== + graceful-fs@^4.1.2: version "4.2.10" resolved "https://registry.npmmirror.com/graceful-fs/-/graceful-fs-4.2.10.tgz" integrity sha512-9ByhssR2fPVsNZj478qUUbKfmL0+t5BDVyjShtyZZLiK7ZDAArFFfopyOTj0M05wE2tJPisA4iTnnXl2YoPvOA== +has-symbols@^1.0.3, has-symbols@^1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/has-symbols/-/has-symbols-1.1.0.tgz#fc9c6a783a084951d0b971fe1018de813707a338" + integrity sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ== + +has-tostringtag@^1.0.2: + version "1.0.2" + resolved "https://registry.yarnpkg.com/has-tostringtag/-/has-tostringtag-1.0.2.tgz#2cdc42d40bef2e5b4eeab7c01a73c54ce7ab5abc" + integrity sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw== + dependencies: + has-symbols 
"^1.0.3" + has@^1.0.3: version "1.0.3" resolved "https://registry.npmmirror.com/has/-/has-1.0.3.tgz" @@ -481,6 +682,13 @@ has@^1.0.3: dependencies: function-bind "^1.1.1" +hasown@^2.0.2: + version "2.0.2" + resolved "https://registry.yarnpkg.com/hasown/-/hasown-2.0.2.tgz#003eaf91be7adc372e84ec59dc37252cedb80003" + integrity sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ== + dependencies: + function-bind "^1.1.2" + iconv-lite@^0.4.4: version "0.4.24" resolved "https://registry.npmmirror.com/iconv-lite/-/iconv-lite-0.4.24.tgz" @@ -573,6 +781,13 @@ magic-string@^0.25.7: dependencies: sourcemap-codec "^1.4.8" +magic-string@^0.30.11: + version "0.30.17" + resolved "https://registry.yarnpkg.com/magic-string/-/magic-string-0.30.17.tgz#450a449673d2460e5bbcfba9a61916a1714c7453" + integrity sha512-sNPKHvyjVf7gyjwS4xGTaW/mCnF8wnjtifKBEhxfZ7E/S8tQ0rssrwGNn6q8JH/ohItJfSQp9mBtQYuTlH5QnA== + dependencies: + "@jridgewell/sourcemap-codec" "^1.5.0" + make-dir@^2.1.0: version "2.1.0" resolved "https://registry.npmmirror.com/make-dir/-/make-dir-2.1.0.tgz" @@ -581,11 +796,28 @@ make-dir@^2.1.0: pify "^4.0.1" semver "^5.6.0" +math-intrinsics@^1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/math-intrinsics/-/math-intrinsics-1.1.0.tgz#a0dd74be81e2aa5c2f27e65ce283605ee4e2b7f9" + integrity sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g== + memoize-one@^6.0.0: version "6.0.0" resolved "https://registry.npmmirror.com/memoize-one/-/memoize-one-6.0.0.tgz" integrity sha512-rkpe71W0N0c0Xz6QD0eJETuWAJGnJ9afsl1srmwPrI+yBCkge5EycXXbYRyvL29zZVUWQCY7InPRCv3GDXuZNw== +mime-db@1.52.0: + version "1.52.0" + resolved "https://registry.yarnpkg.com/mime-db/-/mime-db-1.52.0.tgz#bbabcdc02859f4987301c856e3387ce5ec43bf70" + integrity sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg== + +mime-types@^2.1.12: + version "2.1.35" + resolved "https://registry.yarnpkg.com/mime-types/-/mime-types-2.1.35.tgz#381a871b62a734450660ae3deee44813f70d959a" + integrity sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw== + dependencies: + mime-db "1.52.0" + mime@^1.4.1: version "1.6.0" resolved "https://registry.npmmirror.com/mime/-/mime-1.6.0.tgz" @@ -606,6 +838,11 @@ nanoid@^3.3.1: resolved "https://registry.npmmirror.com/nanoid/-/nanoid-3.3.2.tgz" integrity sha512-CuHBogktKwpm5g2sRgv83jEy2ijFzBwMoYA60orPDR7ynsLijJDqgsi4RDGj3OJpy3Ieb+LYwiRmIOGyytgITA== +nanoid@^3.3.8: + version "3.3.9" + resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.9.tgz#e0097d8e026b3343ff053e9ccd407360a03f503a" + integrity sha512-SppoicMGpZvbF1l3z4x7No3OlIjP7QJvC9XR7AhZr1kL133KHnKPztkKDc+Ir4aJ/1VhTySrtKhrsycmrMQfvg== + nanopop@^2.1.0: version "2.1.0" resolved "https://registry.npmmirror.com/nanopop/-/nanopop-2.1.0.tgz" @@ -645,12 +882,17 @@ picocolors@^1.0.0: resolved "https://registry.npmmirror.com/picocolors/-/picocolors-1.0.0.tgz" integrity sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ== +picocolors@^1.1.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/picocolors/-/picocolors-1.1.1.tgz#3d321af3eab939b083c8f929a1d12cda81c26b6b" + integrity sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA== + pify@^4.0.1: version "4.0.1" resolved "https://registry.npmmirror.com/pify/-/pify-4.0.1.tgz" integrity 
sha512-uB80kBFb/tfd68bVleG9T5GGsGPjJrLAUpR5PZIrhBnIaRTQRjqdJSsIKkOP6OAIFbj7GOrcudc5pNjZ+geV2g== -postcss@^8.1.10, postcss@^8.4.12: +postcss@^8.1.10: version "8.4.12" resolved "https://registry.npmmirror.com/postcss/-/postcss-8.4.12.tgz" integrity sha512-lg6eITwYe9v6Hr5CncVbK70SoioNQIq81nsaG86ev5hAidQvmOeETBqs7jm43K2F5/Ley3ytDtriImV6TpNiSg== @@ -659,6 +901,20 @@ postcss@^8.1.10, postcss@^8.4.12: picocolors "^1.0.0" source-map-js "^1.0.2" +postcss@^8.4.13, postcss@^8.4.48: + version "8.5.3" + resolved "https://registry.yarnpkg.com/postcss/-/postcss-8.5.3.tgz#1463b6f1c7fb16fe258736cba29a2de35237eafb" + integrity sha512-dle9A3yYxlBSrt8Fu+IpjGT8SY8hN0mlaA6GY8t0P5PjIOZemULz/E2Bnm/2dcUOena75OTNkHI76uZBNUUq3A== + dependencies: + nanoid "^3.3.8" + picocolors "^1.1.1" + source-map-js "^1.2.1" + +proxy-from-env@^1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/proxy-from-env/-/proxy-from-env-1.1.0.tgz#e102f16ca355424865755d2c9e8ea4f24d58c3e2" + integrity sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg== + prr@~1.0.1: version "1.0.1" resolved "https://registry.npmmirror.com/prr/-/prr-1.0.1.tgz" @@ -683,10 +939,10 @@ resolve@^1.22.0: path-parse "^1.0.7" supports-preserve-symlinks-flag "^1.0.0" -rollup@^2.59.0: - version "2.70.1" - resolved "https://registry.npmmirror.com/rollup/-/rollup-2.70.1.tgz" - integrity sha512-CRYsI5EuzLbXdxC6RnYhOuRdtz4bhejPMSWjsFLfVM/7w/85n2szZv6yExqUXsBdz5KT8eoubeyDUDjhLHEslA== +"rollup@>=2.59.0 <2.78.0": + version "2.77.3" + resolved "https://registry.yarnpkg.com/rollup/-/rollup-2.77.3.tgz#8f00418d3a2740036e15deb653bed1a90ee0cc12" + integrity sha512-/qxNTG7FbmefJWoeeYJFbHehJ2HNWnjkAFRKzWN/45eNBBF/r8lo992CwcJXEzyVxs5FmfId+vTSTQDb+bxA+g== optionalDependencies: fsevents "~2.3.2" @@ -722,6 +978,11 @@ source-map-js@^1.0.2: resolved "https://registry.npmmirror.com/source-map-js/-/source-map-js-1.0.2.tgz" integrity sha512-R0XvVJ9WusLiqTCEiGCmICCMplcCkIwwR11mOSD9CR5u+IXYdiseeEuXCVAjS54zqwkLcPNnmU4OeJ6tUrWhDw== +source-map-js@^1.2.0, source-map-js@^1.2.1: + version "1.2.1" + resolved "https://registry.yarnpkg.com/source-map-js/-/source-map-js-1.2.1.tgz#1ce5650fddd87abc099eda37dcff024c2667ae46" + integrity sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA== + source-map@^0.6.1, source-map@~0.6.0: version "0.6.1" resolved "https://registry.npmmirror.com/source-map/-/source-map-0.6.1.tgz" @@ -747,15 +1008,15 @@ use-strict@1.0.1: resolved "https://registry.npmmirror.com/use-strict/-/use-strict-1.0.1.tgz" integrity sha512-IeiWvvEXfW5ltKVMkxq6FvNf2LojMKvB2OCeja6+ct24S1XOmQw2dGr2JyndwACWAGJva9B7yPHwAmeA9QCqAQ== -vite@^2.9.0: - version "2.9.1" - resolved "https://registry.npmmirror.com/vite/-/vite-2.9.1.tgz" - integrity sha512-vSlsSdOYGcYEJfkQ/NeLXgnRv5zZfpAsdztkIrs7AZHV8RCMZQkwjo4DS5BnrYTqoWqLoUe1Cah4aVO4oNNqCQ== +vite@^2.9.13: + version "2.9.18" + resolved "https://registry.yarnpkg.com/vite/-/vite-2.9.18.tgz#74e2a83b29da81e602dac4c293312cc575f091c7" + integrity sha512-sAOqI5wNM9QvSEE70W3UGMdT8cyEn0+PmJMTFvTB8wB0YbYUWw3gUbY62AOyrXosGieF2htmeLATvNxpv/zNyQ== dependencies: esbuild "^0.14.27" - postcss "^8.4.12" + postcss "^8.4.13" resolve "^1.22.0" - rollup "^2.59.0" + rollup ">=2.59.0 <2.78.0" optionalDependencies: fsevents "~2.3.2" From f8dc3252568dbe25b76f6662a16b7c04b66c70a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Thu, 13 Mar 2025 11:55:15 +0800 Subject: [PATCH 22/46] add docker (#4000) * add docker * 
fix unit error > Type promotion

* fix url
---
 docker/ubuntu20-cpu/Dockerfile     | 17 +++++++++++++++++
 tests/unit/asr/reverse_pad_list.py | 11 +++++++----
 2 files changed, 24 insertions(+), 4 deletions(-)
 create mode 100644 docker/ubuntu20-cpu/Dockerfile

diff --git a/docker/ubuntu20-cpu/Dockerfile b/docker/ubuntu20-cpu/Dockerfile
new file mode 100644
index 000000000..bb113b2f2
--- /dev/null
+++ b/docker/ubuntu20-cpu/Dockerfile
@@ -0,0 +1,17 @@
+FROM registry.baidubce.com/paddlepaddle/paddle:3.0.0b1
+LABEL maintainer="ext_paddle_oss@baidu.com"
+
+RUN apt-get update \
+    && apt-get install -y libsndfile1-dev libsndfile1 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN git clone --depth 1 https://github.com/PaddlePaddle/PaddleSpeech.git /home/PaddleSpeech
+RUN pip3 uninstall mccabe -y ; exit 0;
+RUN pip3 install multiprocess==0.70.12 importlib-metadata==4.2.0 dill==0.3.4
+
+WORKDIR /home/PaddleSpeech/
+RUN python setup.py bdist_wheel
+RUN pip install dist/*.whl -i https://pypi.tuna.tsinghua.edu.cn/simple
+
+CMD ["bash"]
diff --git a/tests/unit/asr/reverse_pad_list.py b/tests/unit/asr/reverse_pad_list.py
index 215ed5ceb..1b63890a0 100644
--- a/tests/unit/asr/reverse_pad_list.py
+++ b/tests/unit/asr/reverse_pad_list.py
@@ -65,14 +65,16 @@ def reverse_pad_list_with_sos_eos(r_hyps,
     max_len = paddle.max(r_hyps_lens)
     index_range = paddle.arange(0, max_len, 1)
     seq_len_expand = r_hyps_lens.unsqueeze(1)
-    seq_mask = seq_len_expand > index_range  # (beam, max_len)
+    seq_mask = seq_len_expand > index_range.astype(
+        seq_len_expand.dtype)  # (beam, max_len)
 
-    index = (seq_len_expand - 1) - index_range  # (beam, max_len)
+    index = (seq_len_expand - 1) - index_range.astype(
+        seq_len_expand.dtype)  # (beam, max_len)
     #   >>> index
     #   >>> tensor([[ 2, 1, 0],
     #   >>>         [ 2, 1, 0],
     #   >>>         [ 0, -1, -2]])
-    index = index * seq_mask
+    index = index * seq_mask.astype(index.dtype)
 
     #   >>> index
     #   >>> tensor([[2, 1, 0],
@@ -103,7 +105,8 @@ def reverse_pad_list_with_sos_eos(r_hyps,
     #   >>> tensor([[3, 2, 1],
     #   >>>         [4, 8, 9],
     #   >>>         [2, 2, 2]])
-    r_hyps = paddle.where(seq_mask, r_hyps, eos)
+    r_hyps = paddle.where(seq_mask, r_hyps,
+                          paddle.to_tensor(eos, dtype=r_hyps.dtype))
     #   >>> r_hyps
     #   >>> tensor([[3, 2, 1],
     #   >>>         [4, 8, 9],

From f357ec61720b37f3d51c49854ccdfe365debd451 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com>
Date: Tue, 18 Mar 2025 11:43:12 +0800
Subject: [PATCH 23/46] =?UTF-8?q?Docker=3D-=3D=20=E5=85=BC=E5=AE=B9?=
 =?UTF-8?q?=E6=80=A7=E9=AA=8C=E8=AF=81=20(#4018)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add docker

* fix unit error > Type promotion

* fix url

* add gpu docker

* Update Dockerfile

* fix pp3.0 0-d tensor problem

* Compatibility verification
---
 paddlespeech/s2t/models/whisper/whisper.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/paddlespeech/s2t/models/whisper/whisper.py b/paddlespeech/s2t/models/whisper/whisper.py
index d20cc04b6..fdd3a6974 100644
--- a/paddlespeech/s2t/models/whisper/whisper.py
+++ b/paddlespeech/s2t/models/whisper/whisper.py
@@ -835,8 +835,14 @@ class BeamSearchDecoder(TokenDecoder):
                 logprob, token = paddle.topk(
                     logprobs[idx], k=self.beam_size + 1)
                 for logprob, token in zip(logprob, token):
-                    new_logprob = (sum_logprobs[idx] + logprob).tolist()[0]
-                    sequence = tuple(prefix + [token.tolist()[0]])
+                    # after Paddle 3.0, tolist() on a 0-D tensor returns a float/int value instead of a list
+                    new_logprob = (sum_logprobs[idx] + logprob).tolist()
+                    
new_logprob = new_logprob if isinstance(
+                        new_logprob, float) else new_logprob[0]
+                    new_token = token.tolist()
+                    new_token = new_token if isinstance(new_token,
+                                                        int) else new_token[0]
+                    sequence = tuple(prefix + [new_token])
                     scores[sequence] = new_logprob
                     sources[sequence] = idx

From ca03f4db214b7300a00fd7159ee3a5b0a8fada91 Mon Sep 17 00:00:00 2001
From: Echo-Nie <157974576+Echo-Nie@users.noreply.github.com>
Date: Tue, 18 Mar 2025 14:50:37 +0800
Subject: [PATCH 24/46] =?UTF-8?q?=E3=80=90PaddleSpeech=20No.6=E3=80=91?=
 =?UTF-8?q?=E8=A1=A5=E5=85=A8=E5=90=88=E6=88=90=E7=B3=BB=E5=88=97=E4=B8=AD?=
 =?UTF-8?q?=E7=9A=84=E8=84=9A=E6=9C=AC=E4=B8=AD=E5=8F=82=E6=95=B0=E7=BC=BA?=
 =?UTF-8?q?=E5=A4=B1=20(#4004)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* run.sh: add a --stage argument to synthesize and synthesize_e2e to control vocoder model selection; README.md: document the stage argument and clarify the vocoder selection logic

* Add comments for the stage argument in run.sh

* Change HiFiGAN to MultiBand MelGAN

* Move the csmsc files back to their original location (No.15 is not modified); only No.6 is changed here
---
 examples/canton/tts3/README.md | 1 +
 examples/canton/tts3/run.sh    | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/examples/canton/tts3/README.md b/examples/canton/tts3/README.md
index 87ef40907..d2c46f642 100644
--- a/examples/canton/tts3/README.md
+++ b/examples/canton/tts3/README.md
@@ -37,6 +37,7 @@ Run the command below to
 3. train the model.
 4. synthesize wavs.
     - synthesize waveform from `metadata.jsonl`.
+    - `--stage` controls the vocoder model during synthesis (0 = pwgan, 1 = hifigan).
     - synthesize waveform from text file.
 ```bash
 ./run.sh
diff --git a/examples/canton/tts3/run.sh b/examples/canton/tts3/run.sh
index acfc50223..0e1f52a1c 100755
--- a/examples/canton/tts3/run.sh
+++ b/examples/canton/tts3/run.sh
@@ -28,13 +28,13 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # synthesize, vocoder is pwgan by default
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+    # synthesize; the vocoder is pwgan by default (stage 0), stage 1 uses hifigan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
 fi
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-    # synthesize_e2e, vocoder is pwgan by default
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+    # synthesize_e2e; the vocoder is pwgan by default (stage 0), stage 1 uses hifigan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh --stage 0 ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
 fi
 
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then

From 05cdbd8d5ebd1189b32b059b76f6201495b56c28 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Tue, 18 Mar 2025 19:39:28 +0800
Subject: [PATCH 25/46] =?UTF-8?q?=E3=80=90doc=E3=80=91fix=20download=20lin?=
 =?UTF-8?q?k=20case=20abnormal=20traffic=20(#4020)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix download links that caused abnormal traffic

* fix code style
---
 README.md | 22 +-
 README_cn.md | 22 +-
 audio/tests/backends/base.py | 4 +-
 audio/tests/backends/soundfile/base.py | 4 +-
 audio/tests/benchmark/log_melspectrogram.py | 5 +-
 audio/tests/benchmark/melspectrogram.py | 5 +-
 audio/tests/benchmark/mfcc.py | 5 +-
 audio/tests/features/base.py | 2 +-
 demos/TTSAndroid/README.md | 8 +-
 demos/TTSAndroid/app/build.gradle | 4 +-
demos/TTSArmLinux/README.md | 2 +- demos/TTSArmLinux/download.sh | 6 +- demos/TTSCppFrontend/download.sh | 8 +- demos/audio_content_search/README.md | 2 +- demos/audio_content_search/README_cn.md | 2 +- demos/audio_searching/README.md | 4 +- demos/audio_searching/README_cn.md | 4 +- .../audio_searching/src/test_audio_search.py | 2 +- demos/audio_searching/src/test_vpr_search.py | 2 +- demos/audio_tagging/README.md | 2 +- demos/audio_tagging/README_cn.md | 2 +- demos/audio_tagging/run.sh | 2 +- demos/automatic_video_subtitiles/README.md | 2 +- demos/automatic_video_subtitiles/README_cn.md | 2 +- demos/automatic_video_subtitiles/run.sh | 2 +- .../custom_streaming_asr/websocket_server.sh | 2 +- demos/keyword_spotting/README.md | 2 +- demos/keyword_spotting/README_cn.md | 2 +- demos/keyword_spotting/run.sh | 2 +- demos/metaverse/run.sh | 6 +- demos/speaker_verification/README.md | 4 +- demos/speaker_verification/README_cn.md | 4 +- demos/speaker_verification/run.sh | 4 +- demos/speech_recognition/README.md | 2 +- demos/speech_recognition/README_cn.md | 2 +- demos/speech_recognition/run.sh | 6 +- demos/speech_server/README.md | 12 +- demos/speech_server/README_cn.md | 12 +- demos/speech_server/asr_client.sh | 2 +- demos/speech_server/cls_client.sh | 2 +- demos/speech_server/sid_client.sh | 4 +- demos/speech_ssl/README.md | 2 +- demos/speech_ssl/README_cn.md | 2 +- demos/speech_ssl/run.sh | 2 +- demos/speech_translation/README.md | 2 +- demos/speech_translation/README_cn.md | 2 +- demos/speech_translation/run.sh | 2 +- demos/speech_web/README.md | 20 +- demos/story_talker/run.sh | 8 +- demos/streaming_asr_server/README.md | 2 +- demos/streaming_asr_server/README_cn.md | 2 +- demos/streaming_asr_server/test.sh | 2 +- .../README.md | 4 +- .../README_cn.md | 4 +- demos/style_fs2/run.sh | 4 +- demos/whisper/README.md | 2 +- demos/whisper/README_cn.md | 2 +- demos/whisper/run.sh | 2 +- docs/source/demo_video.rst | 2 +- docs/source/install.md | 2 +- docs/source/install_cn.md | 2 +- docs/source/released_model.md | 114 +++--- docs/source/streaming_asr_demo_video.rst | 2 +- docs/source/streaming_tts_demo_video.rst | 2 +- docs/source/tts/README.md | 24 +- docs/source/tts/demo.rst | 356 +++++++++--------- docs/source/tts/demo_2.rst | 56 +-- docs/source/tts/svs_music_score.md | 26 +- docs/source/tts_demo_video.rst | 2 +- docs/topic/ctc/ctc_loss_speed_compare.ipynb | 2 +- docs/topic/gan_vocoder/gan_vocoder.ipynb | 2 +- docs/tutorial/asr/tutorial_deepspeech2.ipynb | 6 +- docs/tutorial/asr/tutorial_transformer.ipynb | 6 +- docs/tutorial/cls/cls_tutorial.ipynb | 4 +- docs/tutorial/st/st_tutorial.ipynb | 10 +- docs/tutorial/tts/tts_tutorial.ipynb | 22 +- examples/aishell/asr0/README.md | 6 +- examples/aishell/asr0/local/test_wav.sh | 2 +- examples/aishell/asr1/README.md | 6 +- examples/aishell/asr1/local/test_wav.sh | 2 +- examples/aishell/asr3/README.md | 10 +- examples/aishell/asr3/local/data.sh | 2 +- examples/aishell/asr3/local/test_wav.sh | 2 +- examples/aishell3/ernie_sat/README.md | 32 +- examples/aishell3/tts3/README.md | 16 +- examples/aishell3/vc0/README.md | 6 +- examples/aishell3/vc1/README.md | 6 +- examples/aishell3/vc2/README.md | 6 +- examples/aishell3/vits-vc/README.md | 4 +- examples/aishell3/vits/README.md | 4 +- examples/aishell3/voc1/README.md | 12 +- examples/aishell3/voc5/README.md | 12 +- examples/aishell3_vctk/ernie_sat/README.md | 34 +- examples/ami/sd0/run.sh | 2 +- examples/canton/tts3/README.md | 10 +- examples/csmsc/jets/README.md | 6 +- examples/csmsc/tts0/README.md | 8 +- 
examples/csmsc/tts2/README.md | 16 +- examples/csmsc/tts3/README.md | 30 +- examples/csmsc/tts3/README_cn.md | 10 +- examples/csmsc/tts3_rhy/README.md | 4 +- examples/csmsc/vits/README.md | 4 +- examples/csmsc/voc1/README.md | 22 +- examples/csmsc/voc3/README.md | 28 +- examples/csmsc/voc4/README.md | 4 +- examples/csmsc/voc5/README.md | 24 +- examples/csmsc/voc5/iSTFTNet.md | 2 +- examples/csmsc/voc6/README.md | 8 +- examples/hey_snips/README.md | 2 +- examples/iwslt2012/punc0/README.md | 14 +- examples/iwslt2012/punc0/local/data.sh | 2 +- examples/librispeech/asr0/README.md | 2 +- examples/librispeech/asr0/local/test_wav.sh | 2 +- examples/librispeech/asr1/README.md | 6 +- examples/librispeech/asr1/local/test_wav.sh | 2 +- examples/librispeech/asr2/README.md | 2 +- examples/librispeech/asr3/README.md | 10 +- examples/librispeech/asr3/local/data.sh | 2 +- examples/librispeech/asr3/local/test_wav.sh | 2 +- examples/librispeech/asr4/README.md | 10 +- examples/librispeech/asr4/local/data.sh | 2 +- examples/librispeech/asr4/local/test_wav.sh | 2 +- examples/librispeech/asr5/README.md | 10 +- examples/librispeech/asr5/local/data.sh | 2 +- examples/librispeech/asr5/local/test_wav.sh | 2 +- examples/ljspeech/tts0/README.md | 6 +- examples/ljspeech/tts1/README.md | 4 +- examples/ljspeech/tts3/README.md | 12 +- examples/ljspeech/voc0/README.md | 2 +- examples/ljspeech/voc1/README.md | 10 +- examples/ljspeech/voc5/README.md | 10 +- examples/opencpop/svs1/README.md | 4 +- examples/opencpop/svs1/README_cn.md | 4 +- examples/opencpop/voc1/README.md | 2 +- examples/other/ge2e/README.md | 2 +- examples/other/rhy/README.md | 2 +- examples/other/rhy/local/data.sh | 4 +- examples/other/tts_finetune/tts3/README.md | 40 +- examples/tal_cs/asr1/README.md | 6 +- examples/tal_cs/asr1/local/test_wav.sh | 2 +- .../ted_en_zh/st1/local/download_pretrain.sh | 4 +- examples/thchs30/align0/README.md | 6 +- examples/vctk/ernie_sat/README.md | 32 +- examples/vctk/tts3/README.md | 14 +- examples/vctk/vc3/README.md | 6 +- examples/vctk/voc1/README.md | 10 +- examples/vctk/voc5/README.md | 12 +- examples/voxceleb/sv0/README.md | 2 +- examples/wenetspeech/asr1/README.md | 6 +- examples/wenetspeech/asr1/RESULTS.md | 4 +- examples/wenetspeech/asr1/local/test_wav.sh | 2 +- examples/zh_en_tts/tts3/README.md | 22 +- examples/zh_en_tts/tts3/local/mfa_download.sh | 8 +- .../zh_en_tts/tts3/local/model_download.sh | 4 +- paddlespeech/cli/st/infer.py | 2 +- paddlespeech/resource/pretrained_models.py | 292 +++++++------- .../server/tests/asr/online/README.md | 2 +- .../server/tests/asr/online/README_cn.md | 2 +- paddlespeech/t2s/exps/stream_play_tts.py | 4 +- paddlespeech/t2s/modules/losses.py | 6 +- runtime/cmake/fastdeploy.cmake | 2 +- runtime/examples/codelab/decoder/run.sh | 4 +- runtime/examples/codelab/feat/run.sh | 4 +- runtime/examples/codelab/nnet/run.sh | 2 +- runtime/examples/codelab/u2/run.sh | 4 +- runtime/examples/custom_asr/run.sh | 2 +- .../wenetspeech/local/recognizer_wfst.sh | 2 +- .../local/recognizer_wfst_fastdeploy.sh | 2 +- .../wenetspeech/local/run_build_tlg.sh | 2 +- runtime/examples/u2pp_ol/wenetspeech/run.sh | 8 +- tests/benchmark/pwgan/run_all.sh | 2 +- tests/chains/speedyspeech/prepare.sh | 12 +- tests/test_tipc/prepare.sh | 8 +- .../unit/asr/deepspeech2_online_model_test.sh | 2 +- tests/unit/audiotools/test_audiotools.sh | 4 +- tests/unit/cli/test_cli.sh | 10 +- .../unit/server/offline/test_server_client.sh | 2 +- tests/unit/tts/test_losses.py | 2 +- tools/Makefile | 2 +- 179 files changed, 973 
insertions(+), 964 deletions(-) diff --git a/README.md b/README.md index 6594a4b8f..ace7f7c57 100644 --- a/README.md +++ b/README.md @@ -46,14 +46,14 @@ - +
I knocked at the door on the ancient side of the building. - +
我认为跑步最重要的就是给我带来了身体健康。 @@ -76,7 +76,7 @@ - +
我 在 这栋 建筑 的 古老 门上 敲门。 @@ -99,42 +99,42 @@ Life was like a box of chocolates, you never know what you're gonna get. - +
早上好,今天是2020/10/29,最低温度是-3°C。 - +
季姬寂,集鸡,鸡即棘鸡。棘鸡饥叽,季姬及箕稷济鸡。鸡既济,跻姬笈,季姬忌,急咭鸡,鸡急,继圾几,季姬急,即籍箕击鸡,箕疾击几伎,伎即齑,鸡叽集几基,季姬急极屐击鸡,鸡既殛,季姬激,即记《季姬击鸡记》。 - +
大家好,我是 parrot 虚拟老师,我们来读一首诗,我与春风皆过客,I and the spring breeze are passing by,你携秋水揽星河,you take the autumn water to take the galaxy。 - +
宜家唔系事必要你讲,但系你所讲嘅说话将会变成呈堂证供。 - +
各个国家有各个国家嘅国歌 - +
@@ -283,8 +283,8 @@ Developers can have a try of our models with [PaddleSpeech Command Line](./paddl Test audio sample download ```shell -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav ``` ### Automatic Speech Recognition diff --git a/README_cn.md b/README_cn.md index 5b95a2879..491c61f39 100644 --- a/README_cn.md +++ b/README_cn.md @@ -51,14 +51,14 @@ - +
I knocked at the door on the ancient side of the building. - +
我认为跑步最重要的就是给我带来了身体健康。 @@ -81,7 +81,7 @@ - +
我 在 这栋 建筑 的 古老 门上 敲门。 @@ -104,42 +104,42 @@ Life was like a box of chocolates, you never know what you're gonna get. - +
早上好,今天是2020/10/29,最低温度是-3°C。 - +
季姬寂,集鸡,鸡即棘鸡。棘鸡饥叽,季姬及箕稷济鸡。鸡既济,跻姬笈,季姬忌,急咭鸡,鸡急,继圾几,季姬急,即籍箕击鸡,箕疾击几伎,伎即齑,鸡叽集几基,季姬急极屐击鸡,鸡既殛,季姬激,即记《季姬击鸡记》。 - +
大家好,我是 parrot 虚拟老师,我们来读一首诗,我与春风皆过客,I and the spring breeze are passing by,你携秋水揽星河,you take the autumn water to take the galaxy。 - +
宜家唔系事必要你讲,但系你所讲嘅说话将会变成呈堂证供。 - +
各个国家有各个国家嘅国歌 - +
@@ -286,8 +286,8 @@ pip install . 测试音频示例下载 ```shell -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav ``` ### 语音识别 diff --git a/audio/tests/backends/base.py b/audio/tests/backends/base.py index c2d53d209..b4f97e89b 100644 --- a/audio/tests/backends/base.py +++ b/audio/tests/backends/base.py @@ -15,8 +15,8 @@ import os import unittest import urllib.request -mono_channel_wav = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' -multi_channels_wav = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav' +mono_channel_wav = 'https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav' +multi_channels_wav = 'https://paddlespeech.cdn.bcebos.com/PaddleAudio/cat.wav' class BackendTest(unittest.TestCase): diff --git a/audio/tests/backends/soundfile/base.py b/audio/tests/backends/soundfile/base.py index c2d53d209..b4f97e89b 100644 --- a/audio/tests/backends/soundfile/base.py +++ b/audio/tests/backends/soundfile/base.py @@ -15,8 +15,8 @@ import os import unittest import urllib.request -mono_channel_wav = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' -multi_channels_wav = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav' +mono_channel_wav = 'https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav' +multi_channels_wav = 'https://paddlespeech.cdn.bcebos.com/PaddleAudio/cat.wav' class BackendTest(unittest.TestCase): diff --git a/audio/tests/benchmark/log_melspectrogram.py b/audio/tests/benchmark/log_melspectrogram.py index 1d03c1df3..1c772b421 100644 --- a/audio/tests/benchmark/log_melspectrogram.py +++ b/audio/tests/benchmark/log_melspectrogram.py @@ -21,11 +21,12 @@ import paddleaudio import torch import torchaudio -wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' +wav_url = 'https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav' if not os.path.isfile(os.path.basename(wav_url)): urllib.request.urlretrieve(wav_url, os.path.basename(wav_url)) -waveform, sr = paddleaudio.backends.soundfile_load(os.path.abspath(os.path.basename(wav_url))) +waveform, sr = paddleaudio.backends.soundfile_load( + os.path.abspath(os.path.basename(wav_url))) waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0) waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0) diff --git a/audio/tests/benchmark/melspectrogram.py b/audio/tests/benchmark/melspectrogram.py index 28c4ac806..9df6ce092 100644 --- a/audio/tests/benchmark/melspectrogram.py +++ b/audio/tests/benchmark/melspectrogram.py @@ -21,11 +21,12 @@ import paddleaudio import torch import torchaudio -wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' +wav_url = 'https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav' if not os.path.isfile(os.path.basename(wav_url)): urllib.request.urlretrieve(wav_url, os.path.basename(wav_url)) -waveform, sr = paddleaudio.backends.soundfile_load(os.path.abspath(os.path.basename(wav_url))) +waveform, sr = paddleaudio.backends.soundfile_load( + os.path.abspath(os.path.basename(wav_url))) waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0) waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0) diff --git a/audio/tests/benchmark/mfcc.py b/audio/tests/benchmark/mfcc.py index 544a5371b..7b1ecbe03 100644 --- a/audio/tests/benchmark/mfcc.py +++ b/audio/tests/benchmark/mfcc.py @@ -21,11 +21,12 @@ import paddleaudio import torch import torchaudio -wav_url = 
'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' +wav_url = 'https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav' if not os.path.isfile(os.path.basename(wav_url)): urllib.request.urlretrieve(wav_url, os.path.basename(wav_url)) -waveform, sr = paddleaudio.backends.soundfile_load(os.path.abspath(os.path.basename(wav_url))) +waveform, sr = paddleaudio.backends.soundfile_load( + os.path.abspath(os.path.basename(wav_url))) waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0) waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0) diff --git a/audio/tests/features/base.py b/audio/tests/features/base.py index 4a44e04bb..1d36c13d4 100644 --- a/audio/tests/features/base.py +++ b/audio/tests/features/base.py @@ -19,7 +19,7 @@ import numpy as np import paddle from paddleaudio.backends import soundfile_load as load -wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' +wav_url = 'https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav' class FeatTest(unittest.TestCase): diff --git a/demos/TTSAndroid/README.md b/demos/TTSAndroid/README.md index 36848cbe3..a26172cb6 100644 --- a/demos/TTSAndroid/README.md +++ b/demos/TTSAndroid/README.md @@ -70,8 +70,8 @@ TTSAndroid/app/src/main/java/com/baidu/paddle/lite/demo/tts/Predictor.java ``` 2. `fastspeech2_csmsc_arm.nb` 和 `mb_melgan_csmsc_arm.nb`: 模型文件 (opt 工具转化后 Paddle Lite 模型) - ,分别来自 [fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip) - 和 [mb_melgan_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_pdlite_1.3.0.zip)。 + ,分别来自 [fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip) + 和 [mb_melgan_csmsc_pdlite_1.3.0.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_pdlite_1.3.0.zip)。 ```bash # 位置: @@ -161,7 +161,7 @@ Android 示例基于 Java API 开发,调用 Paddle Lite `Java API` 包括以 - C++ 中文前端 [lym0302/paddlespeech_tts_cpp](https://github.com/lym0302/paddlespeech_tts_cpp) - C++ 英文 g2p [yazone/g2pE_mobile](https://github.com/yazone/g2pE_mobile) -`phone_id_map.txt` 请参考 [fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip)。 +`phone_id_map.txt` 请参考 [fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip)。 ## 通过 setting 界面更新语音合成的相关参数 @@ -186,7 +186,7 @@ Android 示例基于 Java API 开发,调用 Paddle Lite `Java API` 包括以 ## Release -[2022-11-29-app-release.apk](https://paddlespeech.bj.bcebos.com/demos/TTSAndroid/2022-11-29-app-release.apk) +[2022-11-29-app-release.apk](https://paddlespeech.cdn.bcebos.com/demos/TTSAndroid/2022-11-29-app-release.apk) ## More 本 Demo 合并自 [yt605155624/TTSAndroid](https://github.com/yt605155624/TTSAndroid)。 diff --git a/demos/TTSAndroid/app/build.gradle b/demos/TTSAndroid/app/build.gradle index 40ee5e123..ee493d622 100644 --- a/demos/TTSAndroid/app/build.gradle +++ b/demos/TTSAndroid/app/build.gradle @@ -31,7 +31,7 @@ dependencies { implementation files('libs/PaddlePredictor.jar') } -def paddleLiteLibs = 'https://paddlespeech.bj.bcebos.com/demos/TTSAndroid/paddle_lite_libs_68b66fd3.tar.gz' +def paddleLiteLibs = 
'https://paddlespeech.cdn.bcebos.com/demos/TTSAndroid/paddle_lite_libs_68b66fd3.tar.gz' task downloadAndExtractPaddleLiteLibs(type: DefaultTask) { doFirst { println "Downloading and extracting Paddle Lite libs" @@ -73,7 +73,7 @@ task downloadAndExtractPaddleLiteLibs(type: DefaultTask) { } preBuild.dependsOn downloadAndExtractPaddleLiteLibs -def paddleLiteModels = [['src' : 'https://paddlespeech.bj.bcebos.com/demos/TTSAndroid/fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz', +def paddleLiteModels = [['src' : 'https://paddlespeech.cdn.bcebos.com/demos/TTSAndroid/fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz', 'dest': 'src/main/assets/models'],] task downloadAndExtractPaddleLiteModels(type: DefaultTask) { doFirst { diff --git a/demos/TTSArmLinux/README.md b/demos/TTSArmLinux/README.md index a4ccba6c8..cd331155b 100644 --- a/demos/TTSArmLinux/README.md +++ b/demos/TTSArmLinux/README.md @@ -21,7 +21,7 @@ sudo yum install cmake wget tar unzip ### 下载 Paddle Lite 库文件和模型文件 -预编译的二进制使用与安卓 Demo 版本相同的 Paddle Lite 推理库([Paddle-Lite:68b66fd35](https://github.com/PaddlePaddle/Paddle-Lite/tree/68b66fd356c875c92167d311ad458e6093078449))和模型([fs2cnn_mbmelgan_cpu_v1.3.0](https://paddlespeech.bj.bcebos.com/demos/TTSAndroid/fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz))。 +预编译的二进制使用与安卓 Demo 版本相同的 Paddle Lite 推理库([Paddle-Lite:68b66fd35](https://github.com/PaddlePaddle/Paddle-Lite/tree/68b66fd356c875c92167d311ad458e6093078449))和模型([fs2cnn_mbmelgan_cpu_v1.3.0](https://paddlespeech.cdn.bcebos.com/demos/TTSAndroid/fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz))。 可用以下命令下载: diff --git a/demos/TTSArmLinux/download.sh b/demos/TTSArmLinux/download.sh index 7eaa836a5..2a0b23b27 100755 --- a/demos/TTSArmLinux/download.sh +++ b/demos/TTSArmLinux/download.sh @@ -45,17 +45,17 @@ download() { echo "Download models..." download 'inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \ - 'https://paddlespeech.bj.bcebos.com/demos/TTSArmLinux/inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \ + 'https://paddlespeech.cdn.bcebos.com/demos/TTSArmLinux/inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \ '39e0c6604f97c70f5d13c573d7e709b9' \ "$LIBS_DIR" download 'inference_lite_lib.armlinux.armv7hf.gcc.with_extra.with_cv.tar.gz' \ - 'https://paddlespeech.bj.bcebos.com/demos/TTSArmLinux/inference_lite_lib.armlinux.armv7hf.gcc.with_extra.with_cv.tar.gz' \ + 'https://paddlespeech.cdn.bcebos.com/demos/TTSArmLinux/inference_lite_lib.armlinux.armv7hf.gcc.with_extra.with_cv.tar.gz' \ 'f5ceb509f0b610dafb8379889c5f36f8' \ "$LIBS_DIR" download 'fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz' \ - 'https://paddlespeech.bj.bcebos.com/demos/TTSAndroid/fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz' \ + 'https://paddlespeech.cdn.bcebos.com/demos/TTSAndroid/fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz' \ '93ef17d44b498aff3bea93e2c5c09a1e' \ "$MODELS_DIR" diff --git a/demos/TTSCppFrontend/download.sh b/demos/TTSCppFrontend/download.sh index 0953e3a59..3051ce3f9 100755 --- a/demos/TTSCppFrontend/download.sh +++ b/demos/TTSCppFrontend/download.sh @@ -40,22 +40,22 @@ DIST_DIR="$PWD/front_demo/dict" mkdir -p "$DIST_DIR" download 'fastspeech2_nosil_baker_ckpt_0.4.tar.gz' \ - 'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/fastspeech2_nosil_baker_ckpt_0.4.tar.gz' \ + 'https://paddlespeech.cdn.bcebos.com/t2s/text_frontend/fastspeech2_nosil_baker_ckpt_0.4.tar.gz' \ '7bf1bab1737375fa123c413eb429c573' \ "$DIST_DIR" download 'speedyspeech_nosil_baker_ckpt_0.5.tar.gz' \ - 'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/speedyspeech_nosil_baker_ckpt_0.5.tar.gz' \ + 
'https://paddlespeech.cdn.bcebos.com/t2s/text_frontend/speedyspeech_nosil_baker_ckpt_0.5.tar.gz' \ '0b7754b21f324789aef469c61f4d5b8f' \ "$DIST_DIR" download 'jieba.tar.gz' \ - 'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/jieba.tar.gz' \ + 'https://paddlespeech.cdn.bcebos.com/t2s/text_frontend/jieba.tar.gz' \ '6d30f426bd8c0025110a483f051315ca' \ "$DIST_DIR" download 'tranditional_to_simplified.tar.gz' \ - 'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/tranditional_to_simplified.tar.gz' \ + 'https://paddlespeech.cdn.bcebos.com/t2s/text_frontend/tranditional_to_simplified.tar.gz' \ '258f5b59d5ebfe96d02007ca1d274a7f' \ "$DIST_DIR" diff --git a/demos/audio_content_search/README.md b/demos/audio_content_search/README.md index 89b1c0d89..d090fdf1e 100644 --- a/demos/audio_content_search/README.md +++ b/demos/audio_content_search/README.md @@ -27,7 +27,7 @@ The input of this demo should be a WAV file(`.wav`), and the sample rate must be Here are sample files for this demo that can be downloaded: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav ``` ### 3. run paddlespeech_server diff --git a/demos/audio_content_search/README_cn.md b/demos/audio_content_search/README_cn.md index 16c1a3dd7..a3f20c7e1 100644 --- a/demos/audio_content_search/README_cn.md +++ b/demos/audio_content_search/README_cn.md @@ -27,7 +27,7 @@ pip install -r requirements.txt 可以下载此 demo 的示例音频: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav ``` ### 3. 启动 server diff --git a/demos/audio_searching/README.md b/demos/audio_searching/README.md index 528fce9e8..5b3890382 100644 --- a/demos/audio_searching/README.md +++ b/demos/audio_searching/README.md @@ -128,7 +128,7 @@ Then to start the system server, and it provides HTTP backend services. Output: ```bash - Downloading https://paddlespeech.bj.bcebos.com/vector/audio/example_audio.tar.gz ... + Downloading https://paddlespeech.cdn.bcebos.com/vector/audio/example_audio.tar.gz ... ... Unpacking ./example_audio.tar.gz ... [2022-03-26 22:50:54,987] [ INFO] - checking the aduio file format...... @@ -136,7 +136,7 @@ Then to start the system server, and it provides HTTP backend services. [2022-03-26 22:50:54,987] [ INFO] - The audio file format is right [2022-03-26 22:50:54,988] [ INFO] - device type: cpu [2022-03-26 22:50:54,988] [ INFO] - load the pretrained model: ecapatdnn_voxceleb12-16k - [2022-03-26 22:50:54,990] [ INFO] - Downloading sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz from https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz + [2022-03-26 22:50:54,990] [ INFO] - Downloading sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz from https://paddlespeech.cdn.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz ... [2022-03-26 22:51:17,285] [ INFO] - start to dynamic import the model class [2022-03-26 22:51:17,285] [ INFO] - model name ecapatdnn diff --git a/demos/audio_searching/README_cn.md b/demos/audio_searching/README_cn.md index 6d38b91f5..30ec2a97e 100644 --- a/demos/audio_searching/README_cn.md +++ b/demos/audio_searching/README_cn.md @@ -130,7 +130,7 @@ ffce340b3790 minio/minio:RELEASE.2020-12-03T00-03-10Z "/usr/bin/docker-ent…" 输出: ```bash - Downloading https://paddlespeech.bj.bcebos.com/vector/audio/example_audio.tar.gz ... + Downloading https://paddlespeech.cdn.bcebos.com/vector/audio/example_audio.tar.gz ... ... 
Unpacking ./example_audio.tar.gz ... [2022-03-26 22:50:54,987] [ INFO] - checking the aduio file format...... @@ -138,7 +138,7 @@ ffce340b3790 minio/minio:RELEASE.2020-12-03T00-03-10Z "/usr/bin/docker-ent…" [2022-03-26 22:50:54,987] [ INFO] - The audio file format is right [2022-03-26 22:50:54,988] [ INFO] - device type: cpu [2022-03-26 22:50:54,988] [ INFO] - load the pretrained model: ecapatdnn_voxceleb12-16k - [2022-03-26 22:50:54,990] [ INFO] - Downloading sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz from https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz + [2022-03-26 22:50:54,990] [ INFO] - Downloading sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz from https://paddlespeech.cdn.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz ... [2022-03-26 22:51:17,285] [ INFO] - start to dynamic import the model class [2022-03-26 22:51:17,285] [ INFO] - model name ecapatdnn diff --git a/demos/audio_searching/src/test_audio_search.py b/demos/audio_searching/src/test_audio_search.py index f9ea2929e..5136c0192 100644 --- a/demos/audio_searching/src/test_audio_search.py +++ b/demos/audio_searching/src/test_audio_search.py @@ -24,7 +24,7 @@ def download_audio_data(): """ Download audio data """ - url = "https://paddlespeech.bj.bcebos.com/vector/audio/example_audio.tar.gz" + url = "https://paddlespeech.cdn.bcebos.com/vector/audio/example_audio.tar.gz" md5sum = "52ac69316c1aa1fdef84da7dd2c67b39" target_dir = "./" filepath = download(url, md5sum, target_dir) diff --git a/demos/audio_searching/src/test_vpr_search.py b/demos/audio_searching/src/test_vpr_search.py index cc795564e..67442c9d5 100644 --- a/demos/audio_searching/src/test_vpr_search.py +++ b/demos/audio_searching/src/test_vpr_search.py @@ -24,7 +24,7 @@ def download_audio_data(): """ Download audio data """ - url = "https://paddlespeech.bj.bcebos.com/vector/audio/example_audio.tar.gz" + url = "https://paddlespeech.cdn.bcebos.com/vector/audio/example_audio.tar.gz" md5sum = "52ac69316c1aa1fdef84da7dd2c67b39" target_dir = "./" filepath = download(url, md5sum, target_dir) diff --git a/demos/audio_tagging/README.md b/demos/audio_tagging/README.md index b602c6022..89f4a944d 100644 --- a/demos/audio_tagging/README.md +++ b/demos/audio_tagging/README.md @@ -18,7 +18,7 @@ The input of this demo should be a WAV file(`.wav`). Here are sample files for this demo that can be downloaded: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.cdn.bcebos.com/PaddleAudio/dog.wav ``` ### 3. Usage diff --git a/demos/audio_tagging/README_cn.md b/demos/audio_tagging/README_cn.md index 36b5d8aaf..1a46abd62 100644 --- a/demos/audio_tagging/README_cn.md +++ b/demos/audio_tagging/README_cn.md @@ -18,7 +18,7 @@ 可以下载此 demo 的示例音频: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.cdn.bcebos.com/PaddleAudio/dog.wav ``` ### 3. 
使用方法 diff --git a/demos/audio_tagging/run.sh b/demos/audio_tagging/run.sh index b30eba35f..3841af10c 100755 --- a/demos/audio_tagging/run.sh +++ b/demos/audio_tagging/run.sh @@ -1,4 +1,4 @@ #!/bin/bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.cdn.bcebos.com/PaddleAudio/dog.wav paddlespeech cls --input ./cat.wav --topk 10 diff --git a/demos/automatic_video_subtitiles/README.md b/demos/automatic_video_subtitiles/README.md index 89d8c73c9..0649f77f9 100644 --- a/demos/automatic_video_subtitiles/README.md +++ b/demos/automatic_video_subtitiles/README.md @@ -15,7 +15,7 @@ You can choose one way from easy, medium and hard to install paddlespeech. ### 2. Prepare Input Get a video file with the speech of the specific language: ```bash -wget -c https://paddlespeech.bj.bcebos.com/demos/asr_demos/subtitle_demo1.mp4 +wget -c https://paddlespeech.cdn.bcebos.com/demos/asr_demos/subtitle_demo1.mp4 ``` Extract `.wav` with one channel and 16000 sample rate from the video: diff --git a/demos/automatic_video_subtitiles/README_cn.md b/demos/automatic_video_subtitiles/README_cn.md index 990ff6dbd..7a44ff107 100644 --- a/demos/automatic_video_subtitiles/README_cn.md +++ b/demos/automatic_video_subtitiles/README_cn.md @@ -13,7 +13,7 @@ ### 2. 准备输入 获取包含特定语言语音的视频文件: ```bash -wget -c https://paddlespeech.bj.bcebos.com/demos/asr_demos/subtitle_demo1.mp4 +wget -c https://paddlespeech.cdn.bcebos.com/demos/asr_demos/subtitle_demo1.mp4 ``` 从视频文件中提取单通道的 16kHz 采样率的 `.wav` 文件: ```bash diff --git a/demos/automatic_video_subtitiles/run.sh b/demos/automatic_video_subtitiles/run.sh index 9b9fd2ccc..943109099 100755 --- a/demos/automatic_video_subtitiles/run.sh +++ b/demos/automatic_video_subtitiles/run.sh @@ -1,6 +1,6 @@ #!/bin/bash -video_url=https://paddlespeech.bj.bcebos.com/demos/asr_demos/subtitle_demo1.mp4 +video_url=https://paddlespeech.cdn.bcebos.com/demos/asr_demos/subtitle_demo1.mp4 video_file=$(basename ${video_url}) audio_file=$(echo ${video_file} | awk -F'.' '{print $1}').wav num_channels=1 diff --git a/demos/custom_streaming_asr/websocket_server.sh b/demos/custom_streaming_asr/websocket_server.sh index 041c345be..a7ee39636 100755 --- a/demos/custom_streaming_asr/websocket_server.sh +++ b/demos/custom_streaming_asr/websocket_server.sh @@ -14,7 +14,7 @@ cmvn=./data/cmvn.ark #paddle_asr_online/resource.tar.gz if [ ! -f $cmvn ]; then - wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/resource.tar.gz + wget -c https://paddlespeech.cdn.bcebos.com/s2t/paddle_asr_online/resource.tar.gz tar xzfv resource.tar.gz ln -s ./resource/data . fi diff --git a/demos/keyword_spotting/README.md b/demos/keyword_spotting/README.md index b55c71124..02291ff11 100644 --- a/demos/keyword_spotting/README.md +++ b/demos/keyword_spotting/README.md @@ -17,7 +17,7 @@ The input of this demo should be a WAV file(`.wav`), and the sample rate must be Here are sample files for this demo that can be downloaded: ```bash -wget -c https://paddlespeech.bj.bcebos.com/kws/hey_snips.wav https://paddlespeech.bj.bcebos.com/kws/non-keyword.wav +wget -c https://paddlespeech.cdn.bcebos.com/kws/hey_snips.wav https://paddlespeech.cdn.bcebos.com/kws/non-keyword.wav ``` ### 3. 
Usage diff --git a/demos/keyword_spotting/README_cn.md b/demos/keyword_spotting/README_cn.md index 0d8f44a53..b35d22f25 100644 --- a/demos/keyword_spotting/README_cn.md +++ b/demos/keyword_spotting/README_cn.md @@ -16,7 +16,7 @@ 可以下载此 demo 的示例音频: ```bash -wget -c https://paddlespeech.bj.bcebos.com/kws/hey_snips.wav https://paddlespeech.bj.bcebos.com/kws/non-keyword.wav +wget -c https://paddlespeech.cdn.bcebos.com/kws/hey_snips.wav https://paddlespeech.cdn.bcebos.com/kws/non-keyword.wav ``` ### 3. 使用方法 - 命令行 (推荐使用) diff --git a/demos/keyword_spotting/run.sh b/demos/keyword_spotting/run.sh index 7f9e0ebba..dec3cb9e5 100755 --- a/demos/keyword_spotting/run.sh +++ b/demos/keyword_spotting/run.sh @@ -1,6 +1,6 @@ #!/bin/bash -wget -c https://paddlespeech.bj.bcebos.com/kws/hey_snips.wav https://paddlespeech.bj.bcebos.com/kws/non-keyword.wav +wget -c https://paddlespeech.cdn.bcebos.com/kws/hey_snips.wav https://paddlespeech.cdn.bcebos.com/kws/non-keyword.wav # kws paddlespeech kws --input ./hey_snips.wav diff --git a/demos/metaverse/run.sh b/demos/metaverse/run.sh index 551f0b4e5..02cca15f8 100755 --- a/demos/metaverse/run.sh +++ b/demos/metaverse/run.sh @@ -25,12 +25,12 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # download pretrained tts models and unzip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip unzip -d download download/pwg_baker_ckpt_0.4.zip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip # donload sources - wget -P download https://paddlespeech.bj.bcebos.com/demos/metaverse/Lamarr.png + wget -P download https://paddlespeech.cdn.bcebos.com/demos/metaverse/Lamarr.png fi diff --git a/demos/speaker_verification/README.md b/demos/speaker_verification/README.md index 37c6bf3b9..c3055f3d4 100644 --- a/demos/speaker_verification/README.md +++ b/demos/speaker_verification/README.md @@ -18,8 +18,8 @@ The input of this cli demo should be a WAV file(`.wav`), and the sample rate mus Here are sample files for this demo that can be downloaded: ```bash -wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav -wget -c https://paddlespeech.bj.bcebos.com/vector/audio/123456789.wav +wget -c https://paddlespeech.cdn.bcebos.com/vector/audio/85236145389.wav +wget -c https://paddlespeech.cdn.bcebos.com/vector/audio/123456789.wav ``` ### 3. Usage diff --git a/demos/speaker_verification/README_cn.md b/demos/speaker_verification/README_cn.md index 85224699c..71cb54c02 100644 --- a/demos/speaker_verification/README_cn.md +++ b/demos/speaker_verification/README_cn.md @@ -18,8 +18,8 @@ 可以下载此 demo 的示例音频: ```bash # 该音频的内容是数字串 85236145389 -wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav -wget -c https://paddlespeech.bj.bcebos.com/vector/audio/123456789.wav +wget -c https://paddlespeech.cdn.bcebos.com/vector/audio/85236145389.wav +wget -c https://paddlespeech.cdn.bcebos.com/vector/audio/123456789.wav ``` ### 3. 
使用方法 - 命令行 (推荐使用) diff --git a/demos/speaker_verification/run.sh b/demos/speaker_verification/run.sh index 6140f7f38..c7c589e82 100755 --- a/demos/speaker_verification/run.sh +++ b/demos/speaker_verification/run.sh @@ -1,7 +1,7 @@ #!/bin/bash -wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav -wget -c https://paddlespeech.bj.bcebos.com/vector/audio/123456789.wav +wget -c https://paddlespeech.cdn.bcebos.com/vector/audio/85236145389.wav +wget -c https://paddlespeech.cdn.bcebos.com/vector/audio/123456789.wav # vector paddlespeech vector --task spk --input ./85236145389.wav diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index e406590d2..cd8d69c7e 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -17,7 +17,7 @@ The input of this demo should be a WAV file(`.wav`), and the sample rate must be Here are sample files for this demo that can be downloaded: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav https://paddlespeech.cdn.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` ### 3. Usage diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md index 62dce3bc9..496f91ca2 100644 --- a/demos/speech_recognition/README_cn.md +++ b/demos/speech_recognition/README_cn.md @@ -17,7 +17,7 @@ 可以下载此 demo 的示例音频: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav https://paddlespeech.cdn.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` ### 3. 
使用方法 - 命令行 (推荐使用) diff --git a/demos/speech_recognition/run.sh b/demos/speech_recognition/run.sh index 8ba6e4c3e..20fdb7aaf 100755 --- a/demos/speech_recognition/run.sh +++ b/demos/speech_recognition/run.sh @@ -1,8 +1,8 @@ #!/bin/bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/ch_zh_mix.wav # asr paddlespeech asr --input ./zh.wav diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index 08788a89e..178374428 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -85,9 +85,9 @@ The input of ASR client demo should be a WAV file(`.wav`), and the sample rate Here are sample files for this ASR client demo that can be downloaded: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` **Note:** The response time will be slightly longer when using the client for the first time @@ -204,7 +204,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav Here are sample files for this CLS Client demo that can be downloaded: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav ``` **Note:** The response time will be slightly longer when using the client for the first time @@ -257,8 +257,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav Here are sample files for this Speaker Verification Client demo that can be downloaded: ```bash -wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav -wget -c https://paddlespeech.bj.bcebos.com/vector/audio/123456789.wav +wget -c https://paddlespeech.cdn.bcebos.com/vector/audio/85236145389.wav +wget -c https://paddlespeech.cdn.bcebos.com/vector/audio/123456789.wav ``` #### 7.1 Extract speaker embedding diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index f2cb349e3..f724e9770 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -89,9 +89,9 @@ ASR 客户端的输入是一个 WAV 文件(`.wav`),并且采样率必须 可以下载 ASR 客户端的示例音频: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` **注意:** 初次使用客户端时响应时间会略长 @@ -211,7 +211,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav 可以下载 CLS 客户端的示例音频: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav ``` **注意:** 初次使用客户端时响应时间会略长 @@ -264,8 +264,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav 可以下载声纹客户端的示例音频: ```bash -wget -c 
https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav -wget -c https://paddlespeech.bj.bcebos.com/vector/audio/123456789.wav +wget -c https://paddlespeech.cdn.bcebos.com/vector/audio/85236145389.wav +wget -c https://paddlespeech.cdn.bcebos.com/vector/audio/123456789.wav ``` #### 7.1 提取声纹特征 diff --git a/demos/speech_server/asr_client.sh b/demos/speech_server/asr_client.sh index 37a7ab0b0..47ae1baba 100755 --- a/demos/speech_server/asr_client.sh +++ b/demos/speech_server/asr_client.sh @@ -1,6 +1,6 @@ #!/bin/bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav # If `127.0.0.1` is not accessible, you need to use the actual service IP address. paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav diff --git a/demos/speech_server/cls_client.sh b/demos/speech_server/cls_client.sh index 67012648c..6a9e414f5 100755 --- a/demos/speech_server/cls_client.sh +++ b/demos/speech_server/cls_client.sh @@ -1,6 +1,6 @@ #!/bin/bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav # If `127.0.0.1` is not accessible, you need to use the actual service IP address. paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input ./zh.wav --topk 1 diff --git a/demos/speech_server/sid_client.sh b/demos/speech_server/sid_client.sh index 99bab21ae..c9b75f4e0 100755 --- a/demos/speech_server/sid_client.sh +++ b/demos/speech_server/sid_client.sh @@ -1,7 +1,7 @@ #!/bin/bash -wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav -wget -c https://paddlespeech.bj.bcebos.com/vector/audio/123456789.wav +wget -c https://paddlespeech.cdn.bcebos.com/vector/audio/85236145389.wav +wget -c https://paddlespeech.cdn.bcebos.com/vector/audio/123456789.wav # sid extract paddlespeech_client vector --server_ip 127.0.0.1 --port 8090 --task spk --input ./85236145389.wav diff --git a/demos/speech_ssl/README.md b/demos/speech_ssl/README.md index 8677ebc57..42449147f 100644 --- a/demos/speech_ssl/README.md +++ b/demos/speech_ssl/README.md @@ -17,7 +17,7 @@ The input of this demo should be a WAV file(`.wav`), and the sample rate must be Here are sample files for this demo that can be downloaded: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav ``` ### 3. Usage diff --git a/demos/speech_ssl/README_cn.md b/demos/speech_ssl/README_cn.md index 5b209419a..42ffd634d 100644 --- a/demos/speech_ssl/README_cn.md +++ b/demos/speech_ssl/README_cn.md @@ -17,7 +17,7 @@ 可以下载此 demo 的示例音频: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav ``` ### 3. 
使用方法 - 命令行 (推荐使用) diff --git a/demos/speech_ssl/run.sh b/demos/speech_ssl/run.sh index ca94bc5cc..9940207f8 100644 --- a/demos/speech_ssl/run.sh +++ b/demos/speech_ssl/run.sh @@ -1,7 +1,7 @@ #!/bin/bash # audio download -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav # to recognize text paddlespeech ssl --task asr --lang en --input ./en.wav diff --git a/demos/speech_translation/README.md b/demos/speech_translation/README.md index 4866336c0..df75cd353 100644 --- a/demos/speech_translation/README.md +++ b/demos/speech_translation/README.md @@ -17,7 +17,7 @@ The input of this demo should be a WAV file(`.wav`). Here are sample files for this demo that can be downloaded: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav ``` ### 3. Usage (not support for Windows now) diff --git a/demos/speech_translation/README_cn.md b/demos/speech_translation/README_cn.md index 5119bf9f4..617340f5e 100644 --- a/demos/speech_translation/README_cn.md +++ b/demos/speech_translation/README_cn.md @@ -17,7 +17,7 @@ 这里给出一些样例文件供 Demo 使用: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav ``` ### 3. 使用方法 (暂不支持Windows) diff --git a/demos/speech_translation/run.sh b/demos/speech_translation/run.sh index 6619bd91f..6f316b355 100755 --- a/demos/speech_translation/run.sh +++ b/demos/speech_translation/run.sh @@ -1,4 +1,4 @@ #!/bin/bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav paddlespeech st --input ./en.wav diff --git a/demos/speech_web/README.md b/demos/speech_web/README.md index fc1fe7105..507d82186 100644 --- a/demos/speech_web/README.md +++ b/demos/speech_web/README.md @@ -100,43 +100,43 @@ cd speech_server mkdir -p source/model cd source # 下载 & 解压 wav (包含VC测试音频) -wget https://paddlespeech.bj.bcebos.com/demos/speech_web/wav_vc.zip +wget https://paddlespeech.cdn.bcebos.com/demos/speech_web/wav_vc.zip unzip wav_vc.zip cd model # 下载 GE2E 相关模型 wget https://bj.bcebos.com/paddlespeech/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip unzip ge2e_ckpt_0.3.zip -wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip +wget https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip unzip pwg_aishell3_ckpt_0.5.zip -wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip +wget https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip unzip fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip # 下载 ECAPA-TDNN 相关模型 -wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_vc2_1.2.0.zip +wget https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_vc2_1.2.0.zip unzip fastspeech2_aishell3_ckpt_vc2_1.2.0.zip # 下载 ERNIE-SAT 相关模型 # aishell3 ERNIE-SAT -wget 
https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_aishell3_ckpt_1.2.0.zip +wget https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_aishell3_ckpt_1.2.0.zip unzip erniesat_aishell3_ckpt_1.2.0.zip # vctk ERNIE-SAT -wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_vctk_ckpt_1.2.0.zip +wget https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_vctk_ckpt_1.2.0.zip unzip erniesat_vctk_ckpt_1.2.0.zip # aishell3_vctk ERNIE-SAT -wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_aishell3_vctk_ckpt_1.2.0.zip +wget https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_aishell3_vctk_ckpt_1.2.0.zip unzip erniesat_aishell3_vctk_ckpt_1.2.0.zip # 下载 finetune 相关模型 -wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_1.1.0.zip +wget https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_1.1.0.zip unzip fastspeech2_aishell3_ckpt_1.1.0.zip # 下载声码器 -wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip +wget https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip unzip hifigan_aishell3_ckpt_0.2.0.zip -wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip +wget https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip unzip hifigan_vctk_ckpt_0.2.0.zip cd ../../../ diff --git a/demos/story_talker/run.sh b/demos/story_talker/run.sh index 50335e73b..dadfacba3 100755 --- a/demos/story_talker/run.sh +++ b/demos/story_talker/run.sh @@ -19,13 +19,13 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # download pretrained tts models and unzip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip unzip -d download download/pwg_baker_ckpt_0.4.zip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip # download sources - wget -P download https://paddlespeech.bj.bcebos.com/demos/story_talker/simfang.ttf - wget -P download/imgs https://paddlespeech.bj.bcebos.com/demos/story_talker/000.jpg + wget -P download https://paddlespeech.cdn.bcebos.com/demos/story_talker/simfang.ttf + wget -P download/imgs https://paddlespeech.cdn.bcebos.com/demos/story_talker/000.jpg fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index 423485466..670dce193 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -32,7 +32,7 @@ The input of ASR client demo should be a WAV file(`.wav`), and the sample rate Here are sample files for thisASR client demo that can be downloaded: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav ``` ### 3. 
Server Usage diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index f5f477ea1..2bb3f83fa 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -35,7 +35,7 @@ 可以下载此 ASR client的示例音频: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav ``` ### 3. 服务端使用方法 diff --git a/demos/streaming_asr_server/test.sh b/demos/streaming_asr_server/test.sh index 386c7f894..68eb7567f 100755 --- a/demos/streaming_asr_server/test.sh +++ b/demos/streaming_asr_server/test.sh @@ -1,5 +1,5 @@ # download the test wav -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav # read the wav and pass it to only streaming asr service # If `127.0.0.1` is not accessible, you need to use the actual service IP address. diff --git a/demos/streaming_tts_serving_fastdeploy/README.md b/demos/streaming_tts_serving_fastdeploy/README.md index 3e983a06d..460e257b4 100644 --- a/demos/streaming_tts_serving_fastdeploy/README.md +++ b/demos/streaming_tts_serving_fastdeploy/README.md @@ -31,8 +31,8 @@ export LANGUAGE="zh_CN:zh:en_US:en" #### 1.3 Download models(inside the docker) ```bash cd /models/streaming_tts_serving/1 -wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip -wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip +wget https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip +wget https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip unzip fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip unzip mb_melgan_csmsc_onnx_0.2.0.zip ``` diff --git a/demos/streaming_tts_serving_fastdeploy/README_cn.md b/demos/streaming_tts_serving_fastdeploy/README_cn.md index 7edd32830..5675867b0 100644 --- a/demos/streaming_tts_serving_fastdeploy/README_cn.md +++ b/demos/streaming_tts_serving_fastdeploy/README_cn.md @@ -31,8 +31,8 @@ export LANGUAGE="zh_CN:zh:en_US:en" #### 1.3 下载模型(在docker内) ```bash cd /models/streaming_tts_serving/1 -wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip -wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip +wget https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip +wget https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip unzip fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip unzip mb_melgan_csmsc_onnx_0.2.0.zip ``` diff --git a/demos/style_fs2/run.sh b/demos/style_fs2/run.sh index 45fc0c104..fe86822d0 100755 --- a/demos/style_fs2/run.sh +++ b/demos/style_fs2/run.sh @@ -14,9 +14,9 @@ mkdir -p download if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # download pretrained tts models and unzip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip unzip -d download download/pwg_baker_ckpt_0.4.zip - wget -P download 
https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip fi diff --git a/demos/whisper/README.md b/demos/whisper/README.md index 6e1b8011f..ccd695d5e 100644 --- a/demos/whisper/README.md +++ b/demos/whisper/README.md @@ -16,7 +16,7 @@ Whisper model trained by OpenAI whisper https://github.com/openai/whisper Here are sample files for this demo that can be downloaded: ```bash - wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav + wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav ``` ### 3. Usage diff --git a/demos/whisper/README_cn.md b/demos/whisper/README_cn.md index 6f7c35f04..868a5579c 100644 --- a/demos/whisper/README_cn.md +++ b/demos/whisper/README_cn.md @@ -17,7 +17,7 @@ Whisper模型由OpenAI Whisper训练 https://github.com/openai/whisper 可以下载此 demo 的示例音频: ```bash - wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav + wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav ``` ### 3. 使用方法 diff --git a/demos/whisper/run.sh b/demos/whisper/run.sh index b9595735f..7049192cf 100644 --- a/demos/whisper/run.sh +++ b/demos/whisper/run.sh @@ -1,7 +1,7 @@ #!/bin/bash # audio download -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav # to recognize text paddlespeech whisper --task transcribe --input ./zh.wav diff --git a/docs/source/demo_video.rst b/docs/source/demo_video.rst index dc7e718a6..cd56d4bfc 100644 --- a/docs/source/demo_video.rst +++ b/docs/source/demo_video.rst @@ -5,7 +5,7 @@ Demo Video