Merge pull request #2108 from zh794390558/api

[audio] audio backend
3 years ago · 5d34b7f66f
parent da6692c7b3 f55c457357
commit 5d34b7f66f
19 changed files with 143 additions and 134 deletions
--- a/cmake/summary.cmake
+++ b/cmake/summary.cmake
@ -35,6 +35,7 @@ function (onnx_print_configuration_summary)
  message(STATUS "  BUILD_ONNX_PYTHON         : ${BUILD_ONNX_PYTHON}")
  message(STATUS "    Python version        : ${Python_VERSION}")
  message(STATUS "    Python executable     : ${Python_EXECUTABLE}")
-  message(STATUS "    Python includes       : ${Python_INCLUDE_DIRS}")
+  message(STATUS "    Python includes       : ${Python_INCLUDE_DIR}")
  message(STATUS "    Python libraries      : ${Python_LIBRARY}")
 endfunction()
--- a/paddlespeech/init.py
+++ b/paddlespeech/init.py
@ -12,5 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import _locale
 # Replace the private `_locale._getdefaultlocale` hook with a stub that
 # ignores its arguments and always reports the en_US / utf8 locale.
 _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
 from . import audio
 # `_init_audio_backend` must be called after `audio` has been imported,
 # because it is reached through the `audio.backends.utils` attribute chain.
 audio.backends.utils._init_audio_backend()
 # Names exported by a star-import of this package.
 __all__ = [
    "audio"
 ]
--- a/paddlespeech/audio/init.py
+++ b/paddlespeech/audio/init.py
@ -11,12 +11,28 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from . import compliance
 from . import datasets
 from . import features
 from . import functional
 from . import io
 from . import metric
-from . import sox_effects
+from . import utils
-from .backends import load
+
-from .backends import save
+from ._ops import ops
 from paddlespeech.audio.backends import get_audio_backend, list_audio_backends, set_audio_backend
 # Public names of the audio package.
 # Every entry needs its own trailing comma: two adjacent string literals are
 # concatenated by Python, which previously merged "ops" and
 # "list_audio_backends" into the single bogus name "opslist_audio_backends".
 __all__ = [
    "io",
    "compliance",
    "datasets",
    "functional",
    "features",
    "utils",
    "ops",
    "list_audio_backends",
    "get_audio_backend",
    "set_audio_backend",
 ]
--- a/paddlespeech/audio/_extension.py
+++ b/paddlespeech/audio/_extension.py
@ -44,7 +44,7 @@ def _load_lib(lib: str) -> bool:
    path = _get_lib_path(lib)
    if not path.exists():
        return False
-    paddlespeech.ops.load_library(path)
+    paddlespeech.audio.ops.load_library(path)
    return True
@ -56,7 +56,7 @@ def _init_ffmpeg():
    if _FFMPEG_INITIALIZED:
        return
-    if not paddlespeech.ops.paddlleaudio.is_ffmpeg_available():
+    if not paddlespeech.audio.ops.paddlleaudio.is_ffmpeg_available():
        raise RuntimeError(
            "paddlleaudio is not compiled with FFmpeg integration. Please set USE_FFMPEG=1 when compiling paddlleaudio."
        )
@ -67,11 +67,11 @@ def _init_ffmpeg():
        raise ImportError(
            "FFmpeg libraries are not found. Please install FFmpeg.") from err
-    import paddllespeech._paddlleaudio_ffmpeg  # noqa
+    import paddllespeech.audio._paddlleaudio_ffmpeg  # noqa
-    paddlespeech.ops.paddlleaudio.ffmpeg_init()
+    paddlespeech.audio.ops.paddlleaudio.ffmpeg_init()
-    if paddlespeech.ops.paddlleaudio.ffmpeg_get_log_level() > 8:
+    if paddlespeech.audio.ops.paddlleaudio.ffmpeg_get_log_level() > 8:
-        paddlespeech.ops.paddlleaudio.ffmpeg_set_log_level(8)
+        paddlespeech.audio.ops.paddlleaudio.ffmpeg_set_log_level(8)
    _FFMPEG_INITIALIZED = True
@ -84,7 +84,7 @@ def _init_extension():
    _load_lib("libpaddleaudio")
    # This import is for initializing the methods registered via PyBind11
    # This has to happen after the base library is loaded
-    from paddlespeech import _paddleaudio  # noqa
+    from paddlespeech.audio import _paddleaudio  # noqa
    # Because this part is executed as part of `import torchaudio`, we ignore the
    # initialization failure.
--- a/paddlespeech/audio/backends/init.py
+++ b/paddlespeech/audio/backends/init.py
@ -11,9 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .soundfile_backend import depth_convert
+
-from .soundfile_backend import load
+# flake8: noqa
-from .soundfile_backend import normalize
+from . import utils
-from .soundfile_backend import resample
+from .utils import get_audio_backend, list_audio_backends, set_audio_backend
 from .soundfile_backend import save
 from .soundfile_backend import to_mono
--- a/paddlespeech/audio/backends/soundfile_backend.py
+++ b/paddlespeech/audio/backends/soundfile_backend.py
@ -23,11 +23,11 @@ import soundfile as sf
 from scipy.io import wavfile
 from ..utils import ParameterError
 from ..utils import depth_convert
 __all__ = [
    'resample',
    'to_mono',
    'depth_convert',
    'normalize',
    'save',
    'load',
@ -117,78 +117,6 @@ def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
    return y_out
 def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
    """Data type casting in a safe way, i.e., prevent overflow or underflow.
    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        dtype (Union[type, str]): Data type of waveform.
    Returns:
        np.ndarray: `y` after safe casting.
    """
    if 'float' in str(y.dtype):
        return np.clip(y, np.finfo(dtype).min,
                       np.finfo(dtype).max).astype(dtype)
    else:
        return np.clip(y, np.iinfo(dtype).min,
                       np.iinfo(dtype).max).astype(dtype)
 def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
    """Convert an audio array to a target sample dtype while preserving range.

    Waveforms are rescaled by the integer type's maximum value when crossing
    between integer and floating-point dtypes. Float-to-int and int8-to-int16
    results are clipped to the target range before the final cast.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        dtype (Union[type, str]): Target dtype, given as a name such as
            'int16' or as a type object such as `np.int16`.

    Returns:
        np.ndarray: `y` itself when it already has the target dtype,
            otherwise a converted copy.

    Raises:
        ParameterError: If the dtype of `y` or the target dtype is not one of
            int16, int8, float32, float64.
    """
    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
    if y.dtype not in SUPPORT_DTYPE:
        raise ParameterError(
            'Unsupported audio dtype, '
            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')

    # Accept type objects (np.int16, np.dtype('int16'), ...) as advertised by
    # the annotation by reducing them to their canonical name. Strings are
    # left untouched so the set of accepted names does not change.
    target = dtype
    if isinstance(dtype, (type, np.dtype)):
        try:
            target = np.dtype(dtype).name
        except TypeError:
            target = dtype
    if target not in SUPPORT_DTYPE:
        raise ParameterError(
            'Unsupported audio dtype, '
            f'target dtype  is {dtype}, supported dtypes are {SUPPORT_DTYPE}')

    if target == y.dtype:
        return y

    source_is_float = y.dtype.kind == 'f'

    if target in ('float32', 'float64'):
        if source_is_float:
            # float32 <-> float64
            return _safe_cast(y, target)
        # Integer samples -> float: divide by the source type's maximum.
        return y.astype(target) / np.iinfo(y.dtype).max

    limits = np.iinfo(target)
    if source_is_float:
        # Float samples -> integer: scale by the target maximum and clip so
        # that out-of-range values saturate instead of wrapping around.
        # NOTE(review): presumably the float input is expected in [-1, 1].
        return np.clip(y * limits.max, limits.min, limits.max).astype(target)

    if target == 'int16':
        # int8 -> int16. The ratio of the two maxima maps 127 to 32767, but
        # -128 would land below the int16 minimum, so clip before casting.
        factor = limits.max / np.iinfo('int8').max
        widened = y.astype('float32') * factor
        return np.clip(widened, limits.min, limits.max).astype('int16')

    # int16 -> int8: widen to int32 first so the multiplication cannot
    # overflow; the scaled magnitude never exceeds the int8 range.
    narrowed = y.astype('int32') * np.iinfo('int8').max / \
        np.iinfo('int16').max
    return narrowed.astype('int8')
 def sound_file_load(file: os.PathLike,
                    offset: Optional[float]=None,
                    dtype: str='int16',
@ -323,3 +251,7 @@ def load(
    y = depth_convert(y, dtype)
    return y, r
 def info(filepath: str) -> None:
    """Report that audio metadata cannot be read; this function always fails.

    Args:
        filepath (str): Path of the audio file; it is not inspected.

    Raises:
        RuntimeError: Always.
    """
    message = "No audio I/O backend is available."
    raise RuntimeError(message)
--- a/paddlespeech/audio/backends/sox_backend.py
+++ b/paddlespeech/audio/backends/sox_backend.py
@ -1,13 +0,0 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
--- a/paddlespeech/audio/compliance/librosa.py
+++ b/paddlespeech/audio/compliance/librosa.py
@ -22,7 +22,7 @@ import scipy
 from numpy.lib.stride_tricks import as_strided
 from scipy import signal
-from ..backends import depth_convert
+from ..utils import depth_convert
 from ..utils import ParameterError
 __all__ = [
--- a/paddlespeech/audio/datasets/dataset.py
+++ b/paddlespeech/audio/datasets/dataset.py
@ -16,7 +16,6 @@ from typing import List
 import numpy as np
 import paddle
 from ..backends import load as load_audio
 from ..compliance.kaldi import fbank as kaldi_fbank
 from ..compliance.kaldi import mfcc as kaldi_mfcc
 from ..compliance.librosa import melspectrogram
@ -70,9 +69,9 @@ class AudioClassificationDataset(paddle.io.Dataset):
        file, label = self.files[idx], self.labels[idx]
        if self.sample_rate is None:
-            waveform, sample_rate = load_audio(file)
+            waveform, sample_rate = paddlespeech.audio.load(file)
        else:
-            waveform, sample_rate = load_audio(file, sr=self.sample_rate)
+            waveform, sample_rate = paddlespeech.audio.load(file, sr=self.sample_rate)
        feat_func = feat_funcs[self.feat_type]
--- a/paddlespeech/audio/datasets/rirs_noises.py
+++ b/paddlespeech/audio/datasets/rirs_noises.py
@ -20,8 +20,6 @@ from typing import List
 from paddle.io import Dataset
 from tqdm import tqdm
 from ..backends import load as load_audio
 from ..backends import save as save_wav
 from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
 from .dataset import feat_funcs
@ -105,7 +103,7 @@ class OpenRIRNoise(Dataset):
        for field in type(sample)._fields:
            record[field] = getattr(sample, field)
-        waveform, sr = load_audio(record['wav'])
+        waveform, sr = paddlespeech.audio.load(record['wav'])
        assert self.feat_type in feat_funcs.keys(), \
            f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
@ -128,7 +126,7 @@ class OpenRIRNoise(Dataset):
    def _get_audio_info(self, wav_file: str,
                        split_chunks: bool) -> List[List[str]]:
-        waveform, sr = load_audio(wav_file)
+        waveform, sr = paddlespeech.audio.load(wav_file)
        audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0]
        audio_duration = waveform.shape[0] / sr
@ -143,7 +141,7 @@ class OpenRIRNoise(Dataset):
                end_sample = int(float(e) * sr)
                new_wav_file = os.path.join(self.base_path,
                                            audio_id + f'_chunk_{idx+1:02}.wav')
-                save_wav(waveform[start_sample:end_sample], sr, new_wav_file)
+                paddlespeech.audio.save(waveform[start_sample:end_sample], sr, new_wav_file)
                # id, duration, new_wav
                ret.append([chunk, self.chunk_duration, new_wav_file])
        else:  # Keep whole audio.
--- a/paddlespeech/audio/datasets/voxceleb.py
+++ b/paddlespeech/audio/datasets/voxceleb.py
@ -23,7 +23,6 @@ from paddle.io import Dataset
 from pathos.multiprocessing import Pool
 from tqdm import tqdm
 from ..backends import load as load_audio
 from ..utils import DATA_HOME
 from ..utils import decompress
 from ..utils.download import download_and_decompress
@ -192,7 +191,7 @@ class VoxCeleb(Dataset):
        for field in type(sample)._fields:
            record[field] = getattr(sample, field)
-        waveform, sr = load_audio(record['wav'])
+        waveform, sr = paddlespeech.audio.load(record['wav'])
        # random select a chunk audio samples from the audio
        if self.random_chunk:
@ -231,7 +230,7 @@ class VoxCeleb(Dataset):
    def _get_audio_info(self, wav_file: str,
                        split_chunks: bool) -> List[List[str]]:
-        waveform, sr = load_audio(wav_file)
+        waveform, sr = paddlespeech.audio.load(wav_file)
        spk_id, sess_id, utt_id = wav_file.split("/")[-3:]
        audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]])
        audio_duration = waveform.shape[0] / sr
--- a/paddlespeech/audio/sox_effects/init.py
+++ b/paddlespeech/audio/sox_effects/init.py
@ -1,13 +0,0 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
--- a/paddlespeech/audio/utils/init.py
+++ b/paddlespeech/audio/utils/init.py
@ -13,11 +13,18 @@
 # limitations under the License.
 from ...cli.utils import DATA_HOME
 from ...cli.utils import MODEL_HOME
 from .download import decompress
 from .download import download_and_decompress
 from .download import load_state_dict_from_url
 from .error import ParameterError
 from .log import Logger
 from .log import logger
 from .time import seconds_to_hms
 from .time import Timer
 from .numeric import pcm16to32
 from .numeric import depth_convert
--- a/paddlespeech/audio/utils/numeric.py
+++ b/paddlespeech/audio/utils/numeric.py
@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import Union
 import numpy as np
 from .error import ParameterError
 # Public names of this module.
 __all__ = ["pcm16to32", "depth_convert"]
 def pcm16to32(audio: np.ndarray) -> np.ndarray:
    """pcm int16 to float32
@ -28,3 +33,76 @@ def pcm16to32(audio: np.ndarray) -> np.ndarray:
        bits = np.iinfo(np.int16).bits
        audio = audio / (2**(bits - 1))
    return audio
 def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
    """Data type casting in a safe way, i.e., prevent overflow or underflow.
    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        dtype (Union[type, str]): Data type of waveform.
    Returns:
        np.ndarray: `y` after safe casting.
    """
    if 'float' in str(y.dtype):
        return np.clip(y, np.finfo(dtype).min,
                       np.finfo(dtype).max).astype(dtype)
    else:
        return np.clip(y, np.iinfo(dtype).min,
                       np.iinfo(dtype).max).astype(dtype)
 def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
    """Convert an audio array to a target sample dtype while preserving range.

    Waveforms are rescaled by the integer type's maximum value when crossing
    between integer and floating-point dtypes. Float-to-int and int8-to-int16
    results are clipped to the target range before the final cast.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        dtype (Union[type, str]): Target dtype, given as a name such as
            'int16' or as a type object such as `np.int16`.

    Returns:
        np.ndarray: `y` itself when it already has the target dtype,
            otherwise a converted copy.

    Raises:
        ParameterError: If the dtype of `y` or the target dtype is not one of
            int16, int8, float32, float64.
    """
    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
    if y.dtype not in SUPPORT_DTYPE:
        raise ParameterError(
            'Unsupported audio dtype, '
            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')

    # Accept type objects (np.int16, np.dtype('int16'), ...) as advertised by
    # the annotation by reducing them to their canonical name. Strings are
    # left untouched so the set of accepted names does not change.
    target = dtype
    if isinstance(dtype, (type, np.dtype)):
        try:
            target = np.dtype(dtype).name
        except TypeError:
            target = dtype
    if target not in SUPPORT_DTYPE:
        raise ParameterError(
            'Unsupported audio dtype, '
            f'target dtype  is {dtype}, supported dtypes are {SUPPORT_DTYPE}')

    if target == y.dtype:
        return y

    source_is_float = y.dtype.kind == 'f'

    if target in ('float32', 'float64'):
        if source_is_float:
            # float32 <-> float64
            return _safe_cast(y, target)
        # Integer samples -> float: divide by the source type's maximum.
        return y.astype(target) / np.iinfo(y.dtype).max

    limits = np.iinfo(target)
    if source_is_float:
        # Float samples -> integer: scale by the target maximum and clip so
        # that out-of-range values saturate instead of wrapping around.
        # NOTE(review): presumably the float input is expected in [-1, 1].
        return np.clip(y * limits.max, limits.min, limits.max).astype(target)

    if target == 'int16':
        # int8 -> int16. The ratio of the two maxima maps 127 to 32767, but
        # -128 would land below the int16 minimum, so clip before casting.
        # Clipping also removes the need for the EPS constant the earlier
        # code subtracted here, which is not defined in this module.
        factor = limits.max / np.iinfo('int8').max
        widened = y.astype('float32') * factor
        return np.clip(widened, limits.min, limits.max).astype('int16')

    # int16 -> int8: widen to int32 first so the multiplication cannot
    # overflow; the scaled magnitude never exceeds the int8 range.
    narrowed = y.astype('int32') * np.iinfo('int8').max / \
        np.iinfo('int16').max
    return narrowed.astype('int8')
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@ -27,7 +27,7 @@ from yacs.config import CfgNode
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import stats_wrapper
-from paddlespeech.audio.backends import load as load_audio
+from paddlespeech.audio import load as load_audio
 from paddlespeech.audio.compliance.librosa import melspectrogram
 from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
--- a/paddlespeech/cls/exps/panns/deploy/predict.py
+++ b/paddlespeech/cls/exps/panns/deploy/predict.py
@ -18,7 +18,7 @@ import numpy as np
 from paddle import inference
 from scipy.special import softmax
-from paddlespeech.audio.backends import load as load_audio
+from paddlespeech.audio import load as load_audio
 from paddlespeech.audio.datasets import ESC50
 from paddlespeech.audio.features import melspectrogram
--- a/paddlespeech/server/engine/vector/python/vector_engine.py
+++ b/paddlespeech/server/engine/vector/python/vector_engine.py
@ -17,7 +17,7 @@ from collections import OrderedDict
 import numpy as np
 import paddle
-from paddlespeech.audio.backends import load as load_audio
+from paddlespeech.audio import load as load_audio
 from paddlespeech.audio.compliance.librosa import melspectrogram
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.vector.infer import VectorExecutor
--- a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
@ -18,7 +18,7 @@ import time
 import paddle
 from yacs.config import CfgNode
-from paddlespeech.audio.backends import load as load_audio
+from paddlespeech.audio import load as load_audio
 from paddlespeech.audio.compliance.librosa import melspectrogram
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.batch import feature_normalize
--- a/tools/setup_helpers/extension.py
+++ b/tools/setup_helpers/extension.py
@ -90,7 +90,7 @@ class CMakeBuild(build_ext):
            f"-DCMAKE_INSTALL_PREFIX={extdir}",
            "-DCMAKE_VERBOSE_MAKEFILE=ON",
            f"-DPython_INCLUDE_DIR={distutils.sysconfig.get_python_inc()}",
-            f"-DPYTHON_LIBRARY={distutils.sysconfig.get_config_var('LIBDIR')}",
+            f"-DPython_LIBRARY={distutils.sysconfig.get_config_var('LIBDIR')}",
            f"-DBUILD_SOX:BOOL={'ON' if _BUILD_SOX else 'OFF'}",
            f"-DBUILD_MAD:BOOL={'ON' if _BUILD_MAD else 'OFF'}",
            # f"-DBUILD_KALDI:BOOL={'ON' if _BUILD_KALDI else 'OFF'}",