merge audio

3 years ago · 38c55e44e8
parent cffe555c91 a2e8b76a15
commit 38c55e44e8
9 changed files with 73 additions and 58 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -57,7 +57,7 @@ include(openblas)

 # packages
 find_package(Python3 COMPONENTS Interpreter Development)
-find_package(pybind11 CONFIG)
+find_package(pybind11 CONFIG REQUIRED)


 # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O0 -Wall -g")
--- a/cmake/summary.cmake
+++ b/cmake/summary.cmake
@@ -37,5 +37,9 @@ function (onnx_print_configuration_summary)
  message(STATUS "    Python executable     : ${Python_EXECUTABLE}")
  message(STATUS "    Python includes       : ${Python_INCLUDE_DIR}")
  message(STATUS "    Python libraries      : ${Python_LIBRARY}")
-
+  message(STATUS "  PYBIND11                  : ${pybind11_FOUND}")
+  message(STATUS "    Pybind11 version        : ${pybind11_VERSION}")
+  message(STATUS "    Pybind11 include        : ${pybind11_INCLUDE_DIR}")
+  message(STATUS "    Pybind11 includes       : ${pybind11_INCLUDE_DIRS}")
+  message(STATUS "    Pybind11 libraries      : ${pybind11_LIBRARIES}")
 endfunction()
--- a/paddlespeech/audio/_internal/module_utils.py
+++ b/paddlespeech/audio/_internal/module_utils.py
@@ -5,6 +5,7 @@ from typing import Optional

 #code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py

+
 def is_module_available(*modules: str) -> bool:
    r"""Returns if a top-level module with :attr:`name` exists *without**
    importing it. This is generally safer than try-catch block around a
--- a/paddlespeech/audio/backends/no_backend.py
+++ b/paddlespeech/audio/backends/no_backend.py
@@ -8,19 +8,23 @@ from paddle import Tensor

 #code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/no_backend.py

+
 def load(
-    filepath: Union[str, Path],
-    out: Optional[Tensor] = None,
-    normalization: Union[bool, float, Callable] = True,
-    channels_first: bool = True,
-    num_frames: int = 0,
-    offset: int = 0,
-    filetype: Optional[str] = None,
-) -> Tuple[Tensor, int]:
+        filepath: Union[str, Path],
+        out: Optional[Tensor]=None,
+        normalization: Union[bool, float, Callable]=True,
+        channels_first: bool=True,
+        num_frames: int=0,
+        offset: int=0,
+        filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
    raise RuntimeError("No audio I/O backend is available.")


-def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None:
+def save(filepath: str,
+         src: Tensor,
+         sample_rate: int,
+         precision: int=16,
+         channels_first: bool=True) -> None:
    raise RuntimeError("No audio I/O backend is available.")


--- a/paddlespeech/audio/backends/sox_io_backend.py
+++ b/paddlespeech/audio/backends/sox_io_backend.py
@@ -1,4 +1,3 @@
-
 from pathlib import Path
 from typing import Callable
 from typing import Optional
@@ -43,17 +42,20 @@ _fallback_load = _fail_load
 _fallback_load_filebj = _fail_load_fileobj

 def load(
-    filepath: Union[str, Path],
-    out: Optional[Tensor] = None,
-    normalization: Union[bool, float, Callable] = True,
-    channels_first: bool = True,
-    num_frames: int = 0,
-    offset: int = 0,
-    filetype: Optional[str] = None,
-) -> Tuple[Tensor, int]:
+        filepath: Union[str, Path],
+        out: Optional[Tensor]=None,
+        normalization: Union[bool, float, Callable]=True,
+        channels_first: bool=True,
+        num_frames: int=0,
+        offset: int=0,
+        filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
    raise RuntimeError("No audio I/O backend is available.")

-def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None:
+def save(filepath: str, 
+         src: Tensor, 
+         sample_rate: int, 
+         precision: int = 16, 
+         channels_first: bool = True) -> None:
    raise RuntimeError("No audio I/O backend is available.")

@_mod_utils.requires_sox()
--- a/paddlespeech/audio/backends/utils.py
+++ b/paddlespeech/audio/backends/utils.py
@@ -40,7 +40,8 @@ def set_audio_backend(backend: Optional[str]):
            of the system. If ``None`` is provided the  current backend is unassigned.
    """
    if backend is not None and backend not in list_audio_backends():
-        raise RuntimeError(f'Backend "{backend}" is not one of ' f"available backends: {list_audio_backends()}.")
+        raise RuntimeError(f'Backend "{backend}" is not one of '
+                           f"available backends: {list_audio_backends()}.")

    if backend is None:
        module = no_backend
@@ -76,6 +77,7 @@ def _init_audio_backend():
        warnings.warn("No audio backend is available.")
        set_audio_backend(None)

+
 def get_audio_backend() -> Optional[str]:
    """Get the name of the current backend

--- a/paddlespeech/audio/kaldi/kaldi.py
+++ b/paddlespeech/audio/kaldi/kaldi.py
@@ -27,33 +27,34 @@ __all__ = [


@module_utils.requires_kaldi()
-def fbank(wav,
-          samp_freq: int=16000,
-          frame_shift_ms: float=10.0,
-          frame_length_ms: float=25.0,
-          dither: float=0.0,
-          preemph_coeff: float=0.97,
-          remove_dc_offset: bool=True,
-          window_type: str='povey',
-          round_to_power_of_two: bool=True,
-          blackman_coeff: float=0.42,
-          snip_edges: bool=True,
-          allow_downsample: bool=False,
-          allow_upsample: bool=False,
-          max_feature_vectors: int=-1,
-          num_bins: int=23,
-          low_freq: float=20,
-          high_freq: float=0,
-          vtln_low: float=100,
-          vtln_high: float=-500,
-          debug_mel: bool=False,
-          htk_mode: bool=False,
-          use_energy: bool=False, # fbank opts
-          energy_floor: float=0.0,
-          raw_energy: bool=True,
-          htk_compat: bool=False,
-          use_log_fbank: bool=True,
-          use_power: bool=True):
+def fbank(
+        wav,
+        samp_freq: int=16000,
+        frame_shift_ms: float=10.0,
+        frame_length_ms: float=25.0,
+        dither: float=0.0,
+        preemph_coeff: float=0.97,
+        remove_dc_offset: bool=True,
+        window_type: str='povey',
+        round_to_power_of_two: bool=True,
+        blackman_coeff: float=0.42,
+        snip_edges: bool=True,
+        allow_downsample: bool=False,
+        allow_upsample: bool=False,
+        max_feature_vectors: int=-1,
+        num_bins: int=23,
+        low_freq: float=20,
+        high_freq: float=0,
+        vtln_low: float=100,
+        vtln_high: float=-500,
+        debug_mel: bool=False,
+        htk_mode: bool=False,
+        use_energy: bool=False,  # fbank opts
+        energy_floor: float=0.0,
+        raw_energy: bool=True,
+        htk_compat: bool=False,
+        use_log_fbank: bool=True,
+        use_power: bool=True):
    frame_opts = FrameExtractionOptions()
    mel_opts = MelBanksOptions()
    fbank_opts = FbankOptions()
@@ -88,6 +89,7 @@ def fbank(wav,
    feat = ComputeFbank(frame_opts, mel_opts, fbank_opts, wav)
    return feat

+
@module_utils.requires_kaldi()
 def pitch(wav,
          samp_freq: int=16000,
--- a/paddlespeech/audio/src/CMakeLists.txt
+++ b/paddlespeech/audio/src/CMakeLists.txt
@@ -105,7 +105,7 @@ function(define_extension name sources include_dirs libraries definitions)
  add_library(${name} SHARED ${sources})
  target_compile_definitions(${name} PRIVATE "${definitions}")
  target_include_directories(
-    ${name} PRIVATE ${PROJECT_SOURCE_DIR} ${Python_INCLUDE_DIR} ${include_dirs})
+    ${name} PRIVATE ${PROJECT_SOURCE_DIR} ${Python_INCLUDE_DIR} ${pybind11_INCLUDE_DIR} ${include_dirs})
  target_link_libraries(
    ${name}
    ${libraries}
--- a/paddlespeech/audio/src/pybind/kaldi/feature_common.h
+++ b/paddlespeech/audio/src/pybind/kaldi/feature_common.h
@@ -14,8 +14,8 @@

 #pragma once

-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
+#include "pybind11/pybind11.h"
+#include "pybind11/numpy.h"
 #include "feat/feature-window.h"

 namespace paddleaudio {