From f55c457357554b53b40636f69fe8b0764ea3db3a Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Thu, 30 Jun 2022 12:28:30 +0000
Subject: [PATCH] more backend api

---
 cmake/summary.cmake                           |  3 +-
 paddlespeech/__init__.py                      |  9 ++-
 paddlespeech/audio/__init__.py                | 22 +++++-
 paddlespeech/audio/_extension.py              | 14 ++--
 paddlespeech/audio/backends/__init__.py       | 10 +--
 .../audio/backends/soundfile_backend.py       | 78 ++-----------------
 paddlespeech/audio/backends/sox_backend.py    | 13 ----
 paddlespeech/audio/compliance/librosa.py      |  2 +-
 paddlespeech/audio/datasets/dataset.py        |  5 +-
 paddlespeech/audio/datasets/rirs_noises.py    |  8 +-
 paddlespeech/audio/datasets/voxceleb.py       |  5 +-
 paddlespeech/audio/sox_effects/__init__.py    | 13 ----
 paddlespeech/audio/utils/__init__.py          |  7 ++
 paddlespeech/audio/utils/numeric.py           | 78 +++++++++++++++++++
 paddlespeech/cli/vector/infer.py              |  2 +-
 paddlespeech/cls/exps/panns/deploy/predict.py |  2 +-
 .../engine/vector/python/vector_engine.py     |  2 +-
 .../vector/exps/ecapa_tdnn/extract_emb.py     |  2 +-
 tools/setup_helpers/extension.py              |  2 +-
 19 files changed, 143 insertions(+), 134 deletions(-)
 delete mode 100644 paddlespeech/audio/backends/sox_backend.py
 delete mode 100644 paddlespeech/audio/sox_effects/__init__.py

diff --git a/cmake/summary.cmake b/cmake/summary.cmake
index f1b5d3c5e..67e8be0a9 100644
--- a/cmake/summary.cmake
+++ b/cmake/summary.cmake
@@ -35,6 +35,7 @@ function (onnx_print_configuration_summary)
   message(STATUS "  BUILD_ONNX_PYTHON         : ${BUILD_ONNX_PYTHON}")
   message(STATUS "    Python version        : ${Python_VERSION}")
   message(STATUS "    Python executable     : ${Python_EXECUTABLE}")
-  message(STATUS "    Python includes       : ${Python_INCLUDE_DIRS}")
+  message(STATUS "    Python includes       : ${Python_INCLUDE_DIR}")
+  message(STATUS "    Python libraries      : ${Python_LIBRARY}")
 
 endfunction()
\ No newline at end of file
diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py
index b781c4a8e..6b36434b9 100644
--- a/paddlespeech/__init__.py
+++ b/paddlespeech/__init__.py
@@ -12,5 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import _locale
-
 _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
+
+from . import audio
+# _init_audio_backend must be called after the audio import
+audio.backends.utils._init_audio_backend()
+
+__all__ = [
+    "audio"
+]
diff --git a/paddlespeech/audio/__init__.py b/paddlespeech/audio/__init__.py
index 6184c1dd4..4fab0d3bf 100644
--- a/paddlespeech/audio/__init__.py
+++ b/paddlespeech/audio/__init__.py
@@ -11,12 +11,28 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from . import compliance
 from . import datasets
 from . import features
 from . import functional
 from . import io
 from . import metric
-from . import sox_effects
-from .backends import load
-from .backends import save
+from . import utils
+
+from ._ops import ops
+
+from paddlespeech.audio.backends import get_audio_backend, list_audio_backends, set_audio_backend
+
+__all__ = [
+    "io",
+    "compliance",
+    "datasets",
+    "functional",
+    "features",
+    "utils",
+    "ops",
+    "list_audio_backends",
+    "get_audio_backend",
+    "set_audio_backend",
+]
\ No newline at end of file
diff --git a/paddlespeech/audio/_extension.py b/paddlespeech/audio/_extension.py
index fccba8838..5629a2826 100644
--- a/paddlespeech/audio/_extension.py
+++ b/paddlespeech/audio/_extension.py
@@ -44,7 +44,7 @@ def _load_lib(lib: str) -> bool:
     path = _get_lib_path(lib)
     if not path.exists():
         return False
-    paddlespeech.ops.load_library(path)
+    paddlespeech.audio.ops.load_library(path)
     return True
 
 
@@ -56,7 +56,7 @@ def _init_ffmpeg():
     if _FFMPEG_INITIALIZED:
         return
 
-    if not paddlespeech.ops.paddlleaudio.is_ffmpeg_available():
+    if not paddlespeech.audio.ops.paddlleaudio.is_ffmpeg_available():
         raise RuntimeError(
             "paddlleaudio is not compiled with FFmpeg integration. Please set USE_FFMPEG=1 when compiling paddlleaudio."
         )
@@ -67,11 +67,11 @@ def _init_ffmpeg():
         raise ImportError(
             "FFmpeg libraries are not found. Please install FFmpeg.") from err
 
-    import paddllespeech._paddlleaudio_ffmpeg  # noqa
+    import paddlespeech.audio._paddlleaudio_ffmpeg  # noqa
 
-    paddlespeech.ops.paddlleaudio.ffmpeg_init()
-    if paddlespeech.ops.paddlleaudio.ffmpeg_get_log_level() > 8:
-        paddlespeech.ops.paddlleaudio.ffmpeg_set_log_level(8)
+    paddlespeech.audio.ops.paddlleaudio.ffmpeg_init()
+    if paddlespeech.audio.ops.paddlleaudio.ffmpeg_get_log_level() > 8:
+        paddlespeech.audio.ops.paddlleaudio.ffmpeg_set_log_level(8)
 
     _FFMPEG_INITIALIZED = True
 
@@ -84,7 +84,7 @@ def _init_extension():
     _load_lib("libpaddleaudio")
     # This import is for initializing the methods registered via PyBind11
     # This has to happen after the base library is loaded
-    from paddlespeech import _paddleaudio  # noqa
+    from paddlespeech.audio import _paddleaudio  # noqa
 
     # Because this part is executed as part of `import torchaudio`, we ignore the
     # initialization failure.
diff --git a/paddlespeech/audio/backends/__init__.py b/paddlespeech/audio/backends/__init__.py
index 8eae07e82..38b45c899 100644
--- a/paddlespeech/audio/backends/__init__.py
+++ b/paddlespeech/audio/backends/__init__.py
@@ -11,9 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .soundfile_backend import depth_convert
-from .soundfile_backend import load
-from .soundfile_backend import normalize
-from .soundfile_backend import resample
-from .soundfile_backend import save
-from .soundfile_backend import to_mono
+
+# flake8: noqa
+from . import utils
+from .utils import get_audio_backend, list_audio_backends, set_audio_backend
\ No newline at end of file
diff --git a/paddlespeech/audio/backends/soundfile_backend.py b/paddlespeech/audio/backends/soundfile_backend.py
index c1155654f..16fcdf02b 100644
--- a/paddlespeech/audio/backends/soundfile_backend.py
+++ b/paddlespeech/audio/backends/soundfile_backend.py
@@ -23,11 +23,11 @@ import soundfile as sf
 from scipy.io import wavfile
 
 from ..utils import ParameterError
+from ..utils import depth_convert
 
 __all__ = [
     'resample',
     'to_mono',
-    'depth_convert',
     'normalize',
     'save',
     'load',
@@ -117,78 +117,6 @@ def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
     return y_out
 
 
-def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
-    """Data type casting in a safe way, i.e., prevent overflow or underflow.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D or 2D.
-        dtype (Union[type, str]): Data type of waveform.
-
-    Returns:
-        np.ndarray: `y` after safe casting.
-    """
-    if 'float' in str(y.dtype):
-        return np.clip(y, np.finfo(dtype).min,
-                       np.finfo(dtype).max).astype(dtype)
-    else:
-        return np.clip(y, np.iinfo(dtype).min,
-                       np.iinfo(dtype).max).astype(dtype)
-
-
-def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
-    """Convert audio array to target dtype safely. This function convert audio waveform to a target dtype, with addition steps of
-    preventing overflow/underflow and preserving audio range.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D or 2D.
-        dtype (Union[type, str]): Data type of waveform.
-
-    Returns:
-        np.ndarray: `y` after safe casting.
-    """
-
-    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
-    if y.dtype not in SUPPORT_DTYPE:
-        raise ParameterError(
-            'Unsupported audio dtype, '
-            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')
-
-    if dtype not in SUPPORT_DTYPE:
-        raise ParameterError(
-            'Unsupported audio dtype, '
-            f'target dtype  is {dtype}, supported dtypes are {SUPPORT_DTYPE}')
-
-    if dtype == y.dtype:
-        return y
-
-    if dtype == 'float64' and y.dtype == 'float32':
-        return _safe_cast(y, dtype)
-    if dtype == 'float32' and y.dtype == 'float64':
-        return _safe_cast(y, dtype)
-
-    if dtype == 'int16' or dtype == 'int8':
-        if y.dtype in ['float64', 'float32']:
-            factor = np.iinfo(dtype).max
-            y = np.clip(y * factor, np.iinfo(dtype).min,
-                        np.iinfo(dtype).max).astype(dtype)
-            y = y.astype(dtype)
-        else:
-            if dtype == 'int16' and y.dtype == 'int8':
-                factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
-                y = y.astype('float32') * factor
-                y = y.astype('int16')
-
-            else:  # dtype == 'int8' and y.dtype=='int16':
-                y = y.astype('int32') * np.iinfo('int8').max / \
-                    np.iinfo('int16').max
-                y = y.astype('int8')
-
-    if dtype in ['float32', 'float64']:
-        org_dtype = y.dtype
-        y = y.astype(dtype) / np.iinfo(org_dtype).max
-    return y
-
-
 def sound_file_load(file: os.PathLike,
                     offset: Optional[float]=None,
                     dtype: str='int16',
@@ -323,3 +251,7 @@ def load(
 
     y = depth_convert(y, dtype)
     return y, r
+
+
+def info(filepath: str) -> None:
+    raise RuntimeError("No audio I/O backend is available.")
\ No newline at end of file
diff --git a/paddlespeech/audio/backends/sox_backend.py b/paddlespeech/audio/backends/sox_backend.py
deleted file mode 100644
index 97043fd7b..000000000
--- a/paddlespeech/audio/backends/sox_backend.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/paddlespeech/audio/compliance/librosa.py b/paddlespeech/audio/compliance/librosa.py
index 168632d7c..17ad51b41 100644
--- a/paddlespeech/audio/compliance/librosa.py
+++ b/paddlespeech/audio/compliance/librosa.py
@@ -22,7 +22,7 @@ import scipy
 from numpy.lib.stride_tricks import as_strided
 from scipy import signal
 
-from ..backends import depth_convert
+from ..utils import depth_convert
 from ..utils import ParameterError
 
 __all__ = [
diff --git a/paddlespeech/audio/datasets/dataset.py b/paddlespeech/audio/datasets/dataset.py
index 488187a69..56eedcfba 100644
--- a/paddlespeech/audio/datasets/dataset.py
+++ b/paddlespeech/audio/datasets/dataset.py
@@ -16,7 +16,6 @@ from typing import List
 import numpy as np
 import paddle
 
-from ..backends import load as load_audio
 from ..compliance.kaldi import fbank as kaldi_fbank
 from ..compliance.kaldi import mfcc as kaldi_mfcc
 from ..compliance.librosa import melspectrogram
@@ -70,9 +69,9 @@ class AudioClassificationDataset(paddle.io.Dataset):
         file, label = self.files[idx], self.labels[idx]
 
         if self.sample_rate is None:
-            waveform, sample_rate = load_audio(file)
+            waveform, sample_rate = paddlespeech.audio.load(file)
         else:
-            waveform, sample_rate = load_audio(file, sr=self.sample_rate)
+            waveform, sample_rate = paddlespeech.audio.load(file, sr=self.sample_rate)
 
         feat_func = feat_funcs[self.feat_type]
 
diff --git a/paddlespeech/audio/datasets/rirs_noises.py b/paddlespeech/audio/datasets/rirs_noises.py
index 68639a604..4a8bd8c3f 100644
--- a/paddlespeech/audio/datasets/rirs_noises.py
+++ b/paddlespeech/audio/datasets/rirs_noises.py
@@ -20,8 +20,6 @@ from typing import List
 from paddle.io import Dataset
 from tqdm import tqdm
 
-from ..backends import load as load_audio
-from ..backends import save as save_wav
 from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
 from .dataset import feat_funcs
@@ -105,7 +103,7 @@ class OpenRIRNoise(Dataset):
         for field in type(sample)._fields:
             record[field] = getattr(sample, field)
 
-        waveform, sr = load_audio(record['wav'])
+        waveform, sr = paddlespeech.audio.load(record['wav'])
 
         assert self.feat_type in feat_funcs.keys(), \
             f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
@@ -128,7 +126,7 @@ class OpenRIRNoise(Dataset):
 
     def _get_audio_info(self, wav_file: str,
                         split_chunks: bool) -> List[List[str]]:
-        waveform, sr = load_audio(wav_file)
+        waveform, sr = paddlespeech.audio.load(wav_file)
         audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0]
         audio_duration = waveform.shape[0] / sr
 
@@ -143,7 +141,7 @@ class OpenRIRNoise(Dataset):
                 end_sample = int(float(e) * sr)
                 new_wav_file = os.path.join(self.base_path,
                                             audio_id + f'_chunk_{idx+1:02}.wav')
-                save_wav(waveform[start_sample:end_sample], sr, new_wav_file)
+                paddlespeech.audio.save(waveform[start_sample:end_sample], sr, new_wav_file)
                 # id, duration, new_wav
                 ret.append([chunk, self.chunk_duration, new_wav_file])
         else:  # Keep whole audio.
diff --git a/paddlespeech/audio/datasets/voxceleb.py b/paddlespeech/audio/datasets/voxceleb.py
index 07f44e0c1..e1a8aa38b 100644
--- a/paddlespeech/audio/datasets/voxceleb.py
+++ b/paddlespeech/audio/datasets/voxceleb.py
@@ -23,7 +23,6 @@ from paddle.io import Dataset
 from pathos.multiprocessing import Pool
 from tqdm import tqdm
 
-from ..backends import load as load_audio
 from ..utils import DATA_HOME
 from ..utils import decompress
 from ..utils.download import download_and_decompress
@@ -192,7 +191,7 @@ class VoxCeleb(Dataset):
         for field in type(sample)._fields:
             record[field] = getattr(sample, field)
 
-        waveform, sr = load_audio(record['wav'])
+        waveform, sr = paddlespeech.audio.load(record['wav'])
 
         # random select a chunk audio samples from the audio
         if self.random_chunk:
@@ -231,7 +230,7 @@ class VoxCeleb(Dataset):
 
     def _get_audio_info(self, wav_file: str,
                         split_chunks: bool) -> List[List[str]]:
-        waveform, sr = load_audio(wav_file)
+        waveform, sr = paddlespeech.audio.load(wav_file)
         spk_id, sess_id, utt_id = wav_file.split("/")[-3:]
         audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]])
         audio_duration = waveform.shape[0] / sr
diff --git a/paddlespeech/audio/sox_effects/__init__.py b/paddlespeech/audio/sox_effects/__init__.py
deleted file mode 100644
index 97043fd7b..000000000
--- a/paddlespeech/audio/sox_effects/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/paddlespeech/audio/utils/__init__.py b/paddlespeech/audio/utils/__init__.py
index 742f9f8ef..5fbc02bdc 100644
--- a/paddlespeech/audio/utils/__init__.py
+++ b/paddlespeech/audio/utils/__init__.py
@@ -13,11 +13,18 @@
 # limitations under the License.
 from ...cli.utils import DATA_HOME
 from ...cli.utils import MODEL_HOME
+
 from .download import decompress
 from .download import download_and_decompress
 from .download import load_state_dict_from_url
+
 from .error import ParameterError
+
 from .log import Logger
 from .log import logger
+
 from .time import seconds_to_hms
 from .time import Timer
+
+from .numeric import pcm16to32
+from .numeric import depth_convert
\ No newline at end of file
diff --git a/paddlespeech/audio/utils/numeric.py b/paddlespeech/audio/utils/numeric.py
index 126cada50..940f9ddd8 100644
--- a/paddlespeech/audio/utils/numeric.py
+++ b/paddlespeech/audio/utils/numeric.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
+from typing import Union
 
+__all__ = [
+    "pcm16to32",
+    "depth_convert"
+]
 
 def pcm16to32(audio: np.ndarray) -> np.ndarray:
     """pcm int16 to float32
@@ -28,3 +33,76 @@ def pcm16to32(audio: np.ndarray) -> np.ndarray:
         bits = np.iinfo(np.int16).bits
         audio = audio / (2**(bits - 1))
     return audio
+
+
+def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
+    """Cast `y` to `dtype` after clipping it to the value range of `dtype`.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        dtype (Union[type, str]): Target dtype; also selects the clip limits.
+
+    Returns:
+        np.ndarray: `y` clipped and cast to `dtype`.
+    """
+    if 'float' in str(y.dtype):
+        return np.clip(y, np.finfo(dtype).min,
+                       np.finfo(dtype).max).astype(dtype)
+    else:
+        return np.clip(y, np.iinfo(dtype).min,
+                       np.iinfo(dtype).max).astype(dtype)
+
+
+def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
+    """Convert an audio array to the target dtype while preserving its range.
+
+    Float input is scaled by the integer target's maximum and clipped; integer
+    input is divided by its own maximum when converting to a float dtype.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        dtype (Union[type, str]): Target data type of the waveform.
+
+    Returns:
+        np.ndarray: `y` converted to `dtype` (returned as-is if unchanged).
+
+    Raises:
+        ParameterError: If `y.dtype` or `dtype` is not a supported dtype.
+    """
+    # Local import: this module does not import ParameterError at top level.
+    from .error import ParameterError
+
+    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
+    # NOTE(review): EPS is not defined in this module; 1e-8 assumed - confirm.
+    EPS = 1e-8
+    if y.dtype not in SUPPORT_DTYPE:
+        raise ParameterError(
+            'Unsupported audio dtype, '
+            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')
+
+    if dtype not in SUPPORT_DTYPE:
+        raise ParameterError(
+            'Unsupported audio dtype, '
+            f'target dtype  is {dtype}, supported dtypes are {SUPPORT_DTYPE}')
+
+    if dtype == y.dtype:
+        return y
+
+    if y.dtype in ['float32', 'float64'] and dtype in ['float32', 'float64']:
+        return _safe_cast(y, dtype)
+
+    if dtype == 'int16' or dtype == 'int8':
+        if y.dtype in ['float64', 'float32']:
+            factor = np.iinfo(dtype).max
+            y = np.clip(y * factor, np.iinfo(dtype).min,
+                        np.iinfo(dtype).max).astype(dtype)
+        elif dtype == 'int16':  # y.dtype == 'int8'
+            factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
+            y = (y.astype('float32') * factor).astype('int16')
+        else:  # dtype == 'int8' and y.dtype == 'int16'
+            y = y.astype('int32') * np.iinfo('int8').max / \
+                np.iinfo('int16').max
+            y = y.astype('int8')
+    else:  # integer source, float target: divide by the source's max value
+        y = y.astype(dtype) / np.iinfo(y.dtype).max
+    return y
\ No newline at end of file
diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py
index 4bc8e135a..f0eb3ae22 100644
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -27,7 +27,7 @@ from yacs.config import CfgNode
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import stats_wrapper
-from paddlespeech.audio.backends import load as load_audio
+from paddlespeech.audio import load as load_audio
 from paddlespeech.audio.compliance.librosa import melspectrogram
 from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
diff --git a/paddlespeech/cls/exps/panns/deploy/predict.py b/paddlespeech/cls/exps/panns/deploy/predict.py
index fe1c93fa8..3c58d61c4 100644
--- a/paddlespeech/cls/exps/panns/deploy/predict.py
+++ b/paddlespeech/cls/exps/panns/deploy/predict.py
@@ -18,7 +18,7 @@ import numpy as np
 from paddle import inference
 from scipy.special import softmax
 
-from paddlespeech.audio.backends import load as load_audio
+from paddlespeech.audio import load as load_audio
 from paddlespeech.audio.datasets import ESC50
 from paddlespeech.audio.features import melspectrogram
 
diff --git a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py
index 3c72f55d4..056833dfe 100644
--- a/paddlespeech/server/engine/vector/python/vector_engine.py
+++ b/paddlespeech/server/engine/vector/python/vector_engine.py
@@ -17,7 +17,7 @@ from collections import OrderedDict
 import numpy as np
 import paddle
 
-from paddlespeech.audio.backends import load as load_audio
+from paddlespeech.audio import load as load_audio
 from paddlespeech.audio.compliance.librosa import melspectrogram
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.vector.infer import VectorExecutor
diff --git a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
index cd4538bb5..2d01598cd 100644
--- a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
@@ -18,7 +18,7 @@ import time
 import paddle
 from yacs.config import CfgNode
 
-from paddlespeech.audio.backends import load as load_audio
+from paddlespeech.audio import load as load_audio
 from paddlespeech.audio.compliance.librosa import melspectrogram
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.batch import feature_normalize
diff --git a/tools/setup_helpers/extension.py b/tools/setup_helpers/extension.py
index ed76cec3c..bacc9af16 100644
--- a/tools/setup_helpers/extension.py
+++ b/tools/setup_helpers/extension.py
@@ -90,7 +90,7 @@ class CMakeBuild(build_ext):
             f"-DCMAKE_INSTALL_PREFIX={extdir}",
             "-DCMAKE_VERBOSE_MAKEFILE=ON",
             f"-DPython_INCLUDE_DIR={distutils.sysconfig.get_python_inc()}",
-            f"-DPYTHON_LIBRARY={distutils.sysconfig.get_config_var('LIBDIR')}",
+            f"-DPython_LIBRARY={distutils.sysconfig.get_config_var('LIBDIR')}",
             f"-DBUILD_SOX:BOOL={'ON' if _BUILD_SOX else 'OFF'}",
             f"-DBUILD_MAD:BOOL={'ON' if _BUILD_MAD else 'OFF'}",
             # f"-DBUILD_KALDI:BOOL={'ON' if _BUILD_KALDI else 'OFF'}",