From f55c457357554b53b40636f69fe8b0764ea3db3a Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 30 Jun 2022 12:28:30 +0000 Subject: [PATCH] more backend api --- cmake/summary.cmake | 3 +- paddlespeech/__init__.py | 9 ++- paddlespeech/audio/__init__.py | 22 +++++- paddlespeech/audio/_extension.py | 14 ++-- paddlespeech/audio/backends/__init__.py | 10 +-- .../audio/backends/soundfile_backend.py | 78 ++----------------- paddlespeech/audio/backends/sox_backend.py | 13 ---- paddlespeech/audio/compliance/librosa.py | 2 +- paddlespeech/audio/datasets/dataset.py | 5 +- paddlespeech/audio/datasets/rirs_noises.py | 8 +- paddlespeech/audio/datasets/voxceleb.py | 5 +- paddlespeech/audio/sox_effects/__init__.py | 13 ---- paddlespeech/audio/utils/__init__.py | 7 ++ paddlespeech/audio/utils/numeric.py | 78 +++++++++++++++++++ paddlespeech/cli/vector/infer.py | 2 +- paddlespeech/cls/exps/panns/deploy/predict.py | 2 +- .../engine/vector/python/vector_engine.py | 2 +- .../vector/exps/ecapa_tdnn/extract_emb.py | 2 +- tools/setup_helpers/extension.py | 2 +- 19 files changed, 143 insertions(+), 134 deletions(-) delete mode 100644 paddlespeech/audio/backends/sox_backend.py delete mode 100644 paddlespeech/audio/sox_effects/__init__.py diff --git a/cmake/summary.cmake b/cmake/summary.cmake index f1b5d3c5e..67e8be0a9 100644 --- a/cmake/summary.cmake +++ b/cmake/summary.cmake @@ -35,6 +35,7 @@ function (onnx_print_configuration_summary) message(STATUS " BUILD_ONNX_PYTHON : ${BUILD_ONNX_PYTHON}") message(STATUS " Python version : ${Python_VERSION}") message(STATUS " Python executable : ${Python_EXECUTABLE}") - message(STATUS " Python includes : ${Python_INCLUDE_DIRS}") + message(STATUS " Python includes : ${Python_INCLUDE_DIR}") + message(STATUS " Python libraries : ${Python_LIBRARY}") endfunction() \ No newline at end of file diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py index b781c4a8e..6b36434b9 100644 --- a/paddlespeech/__init__.py +++ b/paddlespeech/__init__.py @@ 
-12,5 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. import _locale - _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8']) + +from . import audio +# _init_audio_backend must be called after audio import +audio.backends.utils._init_audio_backend() + +__all__ = [ + "audio" +] diff --git a/paddlespeech/audio/__init__.py b/paddlespeech/audio/__init__.py index 6184c1dd4..4fab0d3bf 100644 --- a/paddlespeech/audio/__init__.py +++ b/paddlespeech/audio/__init__.py @@ -11,12 +11,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from . import compliance from . import datasets from . import features from . import functional from . import io from . import metric -from . import sox_effects -from .backends import load -from .backends import save +from . import utils + +from ._ops import ops + +from paddlespeech.audio.backends import get_audio_backend, list_audio_backends, set_audio_backend + +__all__ = [ + "io", + "compliance", + "datasets", + "functional", + "features", + "utils", + 'ops', + "list_audio_backends", + "get_audio_backend", + "set_audio_backend", +] \ No newline at end of file diff --git a/paddlespeech/audio/_extension.py b/paddlespeech/audio/_extension.py index fccba8838..5629a2826 100644 --- a/paddlespeech/audio/_extension.py +++ b/paddlespeech/audio/_extension.py @@ -44,7 +44,7 @@ def _load_lib(lib: str) -> bool: path = _get_lib_path(lib) if not path.exists(): return False - paddlespeech.ops.load_library(path) + paddlespeech.audio.ops.load_library(path) return True @@ -56,7 +56,7 @@ def _init_ffmpeg(): if _FFMPEG_INITIALIZED: return - if not paddlespeech.ops.paddlleaudio.is_ffmpeg_available(): + if not paddlespeech.audio.ops.paddlleaudio.is_ffmpeg_available(): raise RuntimeError( "paddlleaudio is not compiled with FFmpeg integration. 
Please set USE_FFMPEG=1 when compiling paddlleaudio." ) @@ -67,11 +67,11 @@ def _init_ffmpeg(): raise ImportError( "FFmpeg libraries are not found. Please install FFmpeg.") from err - import paddllespeech._paddlleaudio_ffmpeg # noqa + import paddllespeech.audio._paddlleaudio_ffmpeg # noqa - paddlespeech.ops.paddlleaudio.ffmpeg_init() - if paddlespeech.ops.paddlleaudio.ffmpeg_get_log_level() > 8: - paddlespeech.ops.paddlleaudio.ffmpeg_set_log_level(8) + paddlespeech.audio.ops.paddlleaudio.ffmpeg_init() + if paddlespeech.audio.ops.paddlleaudio.ffmpeg_get_log_level() > 8: + paddlespeech.audio.ops.paddlleaudio.ffmpeg_set_log_level(8) _FFMPEG_INITIALIZED = True @@ -84,7 +84,7 @@ def _init_extension(): _load_lib("libpaddleaudio") # This import is for initializing the methods registered via PyBind11 # This has to happen after the base library is loaded - from paddlespeech import _paddleaudio # noqa + from paddlespeech.audio import _paddleaudio # noqa # Because this part is executed as part of `import torchaudio`, we ignore the # initialization failure. diff --git a/paddlespeech/audio/backends/__init__.py b/paddlespeech/audio/backends/__init__.py index 8eae07e82..38b45c899 100644 --- a/paddlespeech/audio/backends/__init__.py +++ b/paddlespeech/audio/backends/__init__.py @@ -11,9 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .soundfile_backend import depth_convert -from .soundfile_backend import load -from .soundfile_backend import normalize -from .soundfile_backend import resample -from .soundfile_backend import save -from .soundfile_backend import to_mono + +# flake8: noqa +from . 
import utils +from .utils import get_audio_backend, list_audio_backends, set_audio_backend \ No newline at end of file diff --git a/paddlespeech/audio/backends/soundfile_backend.py b/paddlespeech/audio/backends/soundfile_backend.py index c1155654f..16fcdf02b 100644 --- a/paddlespeech/audio/backends/soundfile_backend.py +++ b/paddlespeech/audio/backends/soundfile_backend.py @@ -23,11 +23,11 @@ import soundfile as sf from scipy.io import wavfile from ..utils import ParameterError +from ..utils import depth_convert __all__ = [ 'resample', 'to_mono', - 'depth_convert', 'normalize', 'save', 'load', @@ -117,78 +117,6 @@ def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray: return y_out -def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray: - """Data type casting in a safe way, i.e., prevent overflow or underflow. - - Args: - y (np.ndarray): Input waveform array in 1D or 2D. - dtype (Union[type, str]): Data type of waveform. - - Returns: - np.ndarray: `y` after safe casting. - """ - if 'float' in str(y.dtype): - return np.clip(y, np.finfo(dtype).min, - np.finfo(dtype).max).astype(dtype) - else: - return np.clip(y, np.iinfo(dtype).min, - np.iinfo(dtype).max).astype(dtype) - - -def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray: - """Convert audio array to target dtype safely. This function convert audio waveform to a target dtype, with addition steps of - preventing overflow/underflow and preserving audio range. - - Args: - y (np.ndarray): Input waveform array in 1D or 2D. - dtype (Union[type, str]): Data type of waveform. - - Returns: - np.ndarray: `y` after safe casting. 
- """ - - SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64'] - if y.dtype not in SUPPORT_DTYPE: - raise ParameterError( - 'Unsupported audio dtype, ' - f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}') - - if dtype not in SUPPORT_DTYPE: - raise ParameterError( - 'Unsupported audio dtype, ' - f'target dtype is {dtype}, supported dtypes are {SUPPORT_DTYPE}') - - if dtype == y.dtype: - return y - - if dtype == 'float64' and y.dtype == 'float32': - return _safe_cast(y, dtype) - if dtype == 'float32' and y.dtype == 'float64': - return _safe_cast(y, dtype) - - if dtype == 'int16' or dtype == 'int8': - if y.dtype in ['float64', 'float32']: - factor = np.iinfo(dtype).max - y = np.clip(y * factor, np.iinfo(dtype).min, - np.iinfo(dtype).max).astype(dtype) - y = y.astype(dtype) - else: - if dtype == 'int16' and y.dtype == 'int8': - factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS - y = y.astype('float32') * factor - y = y.astype('int16') - - else: # dtype == 'int8' and y.dtype=='int16': - y = y.astype('int32') * np.iinfo('int8').max / \ - np.iinfo('int16').max - y = y.astype('int8') - - if dtype in ['float32', 'float64']: - org_dtype = y.dtype - y = y.astype(dtype) / np.iinfo(org_dtype).max - return y - - def sound_file_load(file: os.PathLike, offset: Optional[float]=None, dtype: str='int16', @@ -323,3 +251,7 @@ def load( y = depth_convert(y, dtype) return y, r + + +def info(filepath: str) -> None: + raise RuntimeError("No audio I/O backend is available.") \ No newline at end of file diff --git a/paddlespeech/audio/backends/sox_backend.py b/paddlespeech/audio/backends/sox_backend.py deleted file mode 100644 index 97043fd7b..000000000 --- a/paddlespeech/audio/backends/sox_backend.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/paddlespeech/audio/compliance/librosa.py b/paddlespeech/audio/compliance/librosa.py index 168632d7c..17ad51b41 100644 --- a/paddlespeech/audio/compliance/librosa.py +++ b/paddlespeech/audio/compliance/librosa.py @@ -22,7 +22,7 @@ import scipy from numpy.lib.stride_tricks import as_strided from scipy import signal -from ..backends import depth_convert +from ..utils import depth_convert from ..utils import ParameterError __all__ = [ diff --git a/paddlespeech/audio/datasets/dataset.py b/paddlespeech/audio/datasets/dataset.py index 488187a69..56eedcfba 100644 --- a/paddlespeech/audio/datasets/dataset.py +++ b/paddlespeech/audio/datasets/dataset.py @@ -16,7 +16,6 @@ from typing import List import numpy as np import paddle -from ..backends import load as load_audio from ..compliance.kaldi import fbank as kaldi_fbank from ..compliance.kaldi import mfcc as kaldi_mfcc from ..compliance.librosa import melspectrogram @@ -70,9 +69,9 @@ class AudioClassificationDataset(paddle.io.Dataset): file, label = self.files[idx], self.labels[idx] if self.sample_rate is None: - waveform, sample_rate = load_audio(file) + waveform, sample_rate = paddlespeech.audio.load(file) else: - waveform, sample_rate = load_audio(file, sr=self.sample_rate) + waveform, sample_rate = paddlespeech.audio.load(file, sr=self.sample_rate) feat_func = feat_funcs[self.feat_type] diff --git a/paddlespeech/audio/datasets/rirs_noises.py b/paddlespeech/audio/datasets/rirs_noises.py index 68639a604..4a8bd8c3f 100644 --- a/paddlespeech/audio/datasets/rirs_noises.py +++ 
b/paddlespeech/audio/datasets/rirs_noises.py @@ -20,8 +20,6 @@ from typing import List from paddle.io import Dataset from tqdm import tqdm -from ..backends import load as load_audio -from ..backends import save as save_wav from ..utils import DATA_HOME from ..utils.download import download_and_decompress from .dataset import feat_funcs @@ -105,7 +103,7 @@ class OpenRIRNoise(Dataset): for field in type(sample)._fields: record[field] = getattr(sample, field) - waveform, sr = load_audio(record['wav']) + waveform, sr = paddlespeech.audio.load(record['wav']) assert self.feat_type in feat_funcs.keys(), \ f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}" @@ -128,7 +126,7 @@ class OpenRIRNoise(Dataset): def _get_audio_info(self, wav_file: str, split_chunks: bool) -> List[List[str]]: - waveform, sr = load_audio(wav_file) + waveform, sr = paddlespeech.audio.load(wav_file) audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0] audio_duration = waveform.shape[0] / sr @@ -143,7 +141,7 @@ class OpenRIRNoise(Dataset): end_sample = int(float(e) * sr) new_wav_file = os.path.join(self.base_path, audio_id + f'_chunk_{idx+1:02}.wav') - save_wav(waveform[start_sample:end_sample], sr, new_wav_file) + paddlespeech.audio.save(waveform[start_sample:end_sample], sr, new_wav_file) # id, duration, new_wav ret.append([chunk, self.chunk_duration, new_wav_file]) else: # Keep whole audio. 
diff --git a/paddlespeech/audio/datasets/voxceleb.py b/paddlespeech/audio/datasets/voxceleb.py index 07f44e0c1..e1a8aa38b 100644 --- a/paddlespeech/audio/datasets/voxceleb.py +++ b/paddlespeech/audio/datasets/voxceleb.py @@ -23,7 +23,6 @@ from paddle.io import Dataset from pathos.multiprocessing import Pool from tqdm import tqdm -from ..backends import load as load_audio from ..utils import DATA_HOME from ..utils import decompress from ..utils.download import download_and_decompress @@ -192,7 +191,7 @@ class VoxCeleb(Dataset): for field in type(sample)._fields: record[field] = getattr(sample, field) - waveform, sr = load_audio(record['wav']) + waveform, sr = paddlespeech.audio.load(record['wav']) # random select a chunk audio samples from the audio if self.random_chunk: @@ -231,7 +230,7 @@ class VoxCeleb(Dataset): def _get_audio_info(self, wav_file: str, split_chunks: bool) -> List[List[str]]: - waveform, sr = load_audio(wav_file) + waveform, sr = paddlespeech.audio.load(wav_file) spk_id, sess_id, utt_id = wav_file.split("/")[-3:] audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]]) audio_duration = waveform.shape[0] / sr diff --git a/paddlespeech/audio/sox_effects/__init__.py b/paddlespeech/audio/sox_effects/__init__.py deleted file mode 100644 index 97043fd7b..000000000 --- a/paddlespeech/audio/sox_effects/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/paddlespeech/audio/utils/__init__.py b/paddlespeech/audio/utils/__init__.py index 742f9f8ef..5fbc02bdc 100644 --- a/paddlespeech/audio/utils/__init__.py +++ b/paddlespeech/audio/utils/__init__.py @@ -13,11 +13,18 @@ # limitations under the License. from ...cli.utils import DATA_HOME from ...cli.utils import MODEL_HOME + from .download import decompress from .download import download_and_decompress from .download import load_state_dict_from_url + from .error import ParameterError + from .log import Logger from .log import logger + from .time import seconds_to_hms from .time import Timer + +from .numeric import pcm16to32 +from .numeric import depth_convert \ No newline at end of file diff --git a/paddlespeech/audio/utils/numeric.py b/paddlespeech/audio/utils/numeric.py index 126cada50..940f9ddd8 100644 --- a/paddlespeech/audio/utils/numeric.py +++ b/paddlespeech/audio/utils/numeric.py @@ -12,7 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. import numpy as np +from typing import Union +__all__ = [ + "pcm16to32", + "depth_convert" +] def pcm16to32(audio: np.ndarray) -> np.ndarray: """pcm int16 to float32 @@ -28,3 +33,76 @@ def pcm16to32(audio: np.ndarray) -> np.ndarray: bits = np.iinfo(np.int16).bits audio = audio / (2**(bits - 1)) return audio + + +def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray: + """Data type casting in a safe way, i.e., prevent overflow or underflow. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + dtype (Union[type, str]): Data type of waveform. + + Returns: + np.ndarray: `y` after safe casting. + """ + if 'float' in str(y.dtype): + return np.clip(y, np.finfo(dtype).min, + np.finfo(dtype).max).astype(dtype) + else: + return np.clip(y, np.iinfo(dtype).min, + np.iinfo(dtype).max).astype(dtype) + + +def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray: + """Convert audio array to target dtype safely. 
+ This function convert audio waveform to a target dtype, with addition steps of + preventing overflow/underflow and preserving audio range. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + dtype (Union[type, str]): Data type of waveform. + + Returns: + np.ndarray: `y` after safe casting. + """ + + SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64'] + if y.dtype not in SUPPORT_DTYPE: + raise ParameterError( + 'Unsupported audio dtype, ' + f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}') + + if dtype not in SUPPORT_DTYPE: + raise ParameterError( + 'Unsupported audio dtype, ' + f'target dtype is {dtype}, supported dtypes are {SUPPORT_DTYPE}') + + if dtype == y.dtype: + return y + + if dtype == 'float64' and y.dtype == 'float32': + return _safe_cast(y, dtype) + if dtype == 'float32' and y.dtype == 'float64': + return _safe_cast(y, dtype) + + if dtype == 'int16' or dtype == 'int8': + if y.dtype in ['float64', 'float32']: + factor = np.iinfo(dtype).max + y = np.clip(y * factor, np.iinfo(dtype).min, + np.iinfo(dtype).max).astype(dtype) + y = y.astype(dtype) + else: + if dtype == 'int16' and y.dtype == 'int8': + factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS + y = y.astype('float32') * factor + y = y.astype('int16') + + else: # dtype == 'int8' and y.dtype=='int16': + y = y.astype('int32') * np.iinfo('int8').max / \ + np.iinfo('int16').max + y = y.astype('int8') + + if dtype in ['float32', 'float64']: + org_dtype = y.dtype + y = y.astype(dtype) / np.iinfo(org_dtype).max + return y \ No newline at end of file diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 4bc8e135a..f0eb3ae22 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -27,7 +27,7 @@ from yacs.config import CfgNode from ..executor import BaseExecutor from ..log import logger from ..utils import stats_wrapper -from paddlespeech.audio.backends import load as load_audio +from paddlespeech.audio 
import load as load_audio from paddlespeech.audio.compliance.librosa import melspectrogram from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.modules.sid_model import SpeakerIdetification diff --git a/paddlespeech/cls/exps/panns/deploy/predict.py b/paddlespeech/cls/exps/panns/deploy/predict.py index fe1c93fa8..3c58d61c4 100644 --- a/paddlespeech/cls/exps/panns/deploy/predict.py +++ b/paddlespeech/cls/exps/panns/deploy/predict.py @@ -18,7 +18,7 @@ import numpy as np from paddle import inference from scipy.special import softmax -from paddlespeech.audio.backends import load as load_audio +from paddlespeech.audio import load as load_audio from paddlespeech.audio.datasets import ESC50 from paddlespeech.audio.features import melspectrogram diff --git a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py index 3c72f55d4..056833dfe 100644 --- a/paddlespeech/server/engine/vector/python/vector_engine.py +++ b/paddlespeech/server/engine/vector/python/vector_engine.py @@ -17,7 +17,7 @@ from collections import OrderedDict import numpy as np import paddle -from paddlespeech.audio.backends import load as load_audio +from paddlespeech.audio import load as load_audio from paddlespeech.audio.compliance.librosa import melspectrogram from paddlespeech.cli.log import logger from paddlespeech.cli.vector.infer import VectorExecutor diff --git a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py index cd4538bb5..2d01598cd 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py @@ -18,7 +18,7 @@ import time import paddle from yacs.config import CfgNode -from paddlespeech.audio.backends import load as load_audio +from paddlespeech.audio import load as load_audio from paddlespeech.audio.compliance.librosa import melspectrogram from paddlespeech.s2t.utils.log import Log from 
paddlespeech.vector.io.batch import feature_normalize diff --git a/tools/setup_helpers/extension.py b/tools/setup_helpers/extension.py index ed76cec3c..bacc9af16 100644 --- a/tools/setup_helpers/extension.py +++ b/tools/setup_helpers/extension.py @@ -90,7 +90,7 @@ class CMakeBuild(build_ext): f"-DCMAKE_INSTALL_PREFIX={extdir}", "-DCMAKE_VERBOSE_MAKEFILE=ON", f"-DPython_INCLUDE_DIR={distutils.sysconfig.get_python_inc()}", - f"-DPYTHON_LIBRARY={distutils.sysconfig.get_config_var('LIBDIR')}", + f"-DPython_LIBRARY={distutils.sysconfig.get_config_var('LIBDIR')}", f"-DBUILD_SOX:BOOL={'ON' if _BUILD_SOX else 'OFF'}", f"-DBUILD_MAD:BOOL={'ON' if _BUILD_MAD else 'OFF'}", # f"-DBUILD_KALDI:BOOL={'ON' if _BUILD_KALDI else 'OFF'}",