diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d260dd3d..57d806e16 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,7 +57,7 @@ include(openblas) # packages find_package(Python3 COMPONENTS Interpreter Development) -find_package(pybind11 CONFIG) +find_package(pybind11 CONFIG REQUIRED) # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O0 -Wall -g") diff --git a/cmake/summary.cmake b/cmake/summary.cmake index 67e8be0a9..f04d44698 100644 --- a/cmake/summary.cmake +++ b/cmake/summary.cmake @@ -37,5 +37,9 @@ function (onnx_print_configuration_summary) message(STATUS " Python executable : ${Python_EXECUTABLE}") message(STATUS " Python includes : ${Python_INCLUDE_DIR}") message(STATUS " Python libraries : ${Python_LIBRARY}") - + message(STATUS " PYBIND11 : ${pybind11_FOUND}") + message(STATUS " Pybind11 version : ${pybind11_VERSION}") + message(STATUS " Pybind11 include : ${pybind11_INCLUDE_DIR}") + message(STATUS " Pybind11 includes : ${pybind11_INCLUDE_DIRS}") + message(STATUS " Pybind11 libraries : ${pybind11_LIBRARIES}") endfunction() \ No newline at end of file diff --git a/paddlespeech/audio/_internal/module_utils.py b/paddlespeech/audio/_internal/module_utils.py index 919ba852d..ca1ba4b84 100644 --- a/paddlespeech/audio/_internal/module_utils.py +++ b/paddlespeech/audio/_internal/module_utils.py @@ -5,6 +5,7 @@ from typing import Optional #code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py + def is_module_available(*modules: str) -> bool: r"""Returns if a top-level module with :attr:`name` exists *without** importing it. This is generally safer than try-catch block around a diff --git a/paddlespeech/audio/backends/no_backend.py b/paddlespeech/audio/backends/no_backend.py index d391f7aaf..157536f46 100644 --- a/paddlespeech/audio/backends/no_backend.py +++ b/paddlespeech/audio/backends/no_backend.py @@ -8,21 +8,25 @@ from paddle import Tensor #code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/no_backend.py + def load( - filepath: Union[str, Path], - out: Optional[Tensor] = None, - normalization: Union[bool, float, Callable] = True, - channels_first: bool = True, - num_frames: int = 0, - offset: int = 0, - filetype: Optional[str] = None, -) -> Tuple[Tensor, int]: + filepath: Union[str, Path], + out: Optional[Tensor]=None, + normalization: Union[bool, float, Callable]=True, + channels_first: bool=True, + num_frames: int=0, + offset: int=0, + filetype: Optional[str]=None, ) -> Tuple[Tensor, int]: raise RuntimeError("No audio I/O backend is available.") -def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None: +def save(filepath: str, + src: Tensor, + sample_rate: int, + precision: int=16, + channels_first: bool=True) -> None: raise RuntimeError("No audio I/O backend is available.") def info(filepath: str) -> None: - raise RuntimeError("No audio I/O backend is available.") \ No newline at end of file + raise RuntimeError("No audio I/O backend is available.") diff --git a/paddlespeech/audio/backends/sox_io_backend.py b/paddlespeech/audio/backends/sox_io_backend.py index 53c2ad0df..a91220042 100644 --- a/paddlespeech/audio/backends/sox_io_backend.py +++ b/paddlespeech/audio/backends/sox_io_backend.py @@ -1,4 +1,3 @@ - from pathlib import Path from typing import Callable from typing import Optional @@ -43,17 +42,20 @@ _fallback_load = _fail_load _fallback_load_filebj = _fail_load_fileobj def load( - filepath: Union[str, Path], - out: Optional[Tensor] = None, - normalization: Union[bool, float, Callable] = True, - channels_first: bool = True, - num_frames: int = 0, - offset: int = 0, - filetype: Optional[str] = None, -) -> Tuple[Tensor, int]: + filepath: Union[str, Path], + out: Optional[Tensor]=None, + normalization: Union[bool, float, Callable]=True, + channels_first: bool=True, + num_frames: int=0, + offset: int=0, + filetype: Optional[str]=None, ) -> Tuple[Tensor, int]: raise RuntimeError("No audio I/O backend is available.") -def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None: +def save(filepath: str, + src: Tensor, + sample_rate: int, + precision: int = 16, + channels_first: bool = True) -> None: raise RuntimeError("No audio I/O backend is available.") @_mod_utils.requires_sox() diff --git a/paddlespeech/audio/backends/utils.py b/paddlespeech/audio/backends/utils.py index 834fbca0a..9ea2eaca7 100644 --- a/paddlespeech/audio/backends/utils.py +++ b/paddlespeech/audio/backends/utils.py @@ -40,7 +40,8 @@ def set_audio_backend(backend: Optional[str]): of the system. If ``None`` is provided the current backend is unassigned. """ if backend is not None and backend not in list_audio_backends(): - raise RuntimeError(f'Backend "{backend}" is not one of ' f"available backends: {list_audio_backends()}.") + raise RuntimeError(f'Backend "{backend}" is not one of ' + f"available backends: {list_audio_backends()}.") if backend is None: module = no_backend @@ -76,6 +77,7 @@ def _init_audio_backend(): warnings.warn("No audio backend is available.") set_audio_backend(None) + def get_audio_backend() -> Optional[str]: """Get the name of the current backend @@ -88,4 +90,4 @@ def get_audio_backend() -> Optional[str]: return "sox_io" if paddlespeech.audio.load == soundfile_backend.load: return "soundfile" - raise ValueError("Unknown backend.") \ No newline at end of file + raise ValueError("Unknown backend.") diff --git a/paddlespeech/audio/kaldi/kaldi.py b/paddlespeech/audio/kaldi/kaldi.py index 4f1ad8475..40c6bda91 100644 --- a/paddlespeech/audio/kaldi/kaldi.py +++ b/paddlespeech/audio/kaldi/kaldi.py @@ -27,37 +27,38 @@ __all__ = [ @module_utils.requires_kaldi() -def fbank(wav, - samp_freq: int=16000, - frame_shift_ms: float=10.0, - frame_length_ms: float=25.0, - dither: float=0.0, - preemph_coeff: float=0.97, - remove_dc_offset: bool=True, - window_type: str='povey', - round_to_power_of_two: bool=True, - blackman_coeff: float=0.42, - snip_edges: bool=True, - allow_downsample: bool=False, - allow_upsample: bool=False, - max_feature_vectors: int=-1, - num_bins: int=23, - low_freq: float=20, - high_freq: float=0, - vtln_low: float=100, - vtln_high: float=-500, - debug_mel: bool=False, - htk_mode: bool=False, - use_energy: bool=False, # fbank opts - energy_floor: float=0.0, - raw_energy: bool=True, - htk_compat: bool=False, - use_log_fbank: bool=True, - use_power: bool=True): +def fbank( + wav, + samp_freq: int=16000, + frame_shift_ms: float=10.0, + frame_length_ms: float=25.0, + dither: float=0.0, + preemph_coeff: float=0.97, + remove_dc_offset: bool=True, + window_type: str='povey', + round_to_power_of_two: bool=True, + blackman_coeff: float=0.42, + snip_edges: bool=True, + allow_downsample: bool=False, + allow_upsample: bool=False, + max_feature_vectors: int=-1, + num_bins: int=23, + low_freq: float=20, + high_freq: float=0, + vtln_low: float=100, + vtln_high: float=-500, + debug_mel: bool=False, + htk_mode: bool=False, + use_energy: bool=False, # fbank opts + energy_floor: float=0.0, + raw_energy: bool=True, + htk_compat: bool=False, + use_log_fbank: bool=True, + use_power: bool=True): frame_opts = FrameExtractionOptions() mel_opts = MelBanksOptions() fbank_opts = FbankOptions() - frame_opts.samp_freq = samp_freq + frame_opts.samp_freq = samp_freq frame_opts.frame_shift_ms = frame_shift_ms frame_opts.frame_length_ms = frame_length_ms frame_opts.dither = dither @@ -71,7 +72,7 @@ def fbank(wav, frame_opts.allow_upsample = allow_upsample frame_opts.max_feature_vectors = max_feature_vectors - mel_opts.num_bins = num_bins + mel_opts.num_bins = num_bins mel_opts.low_freq = low_freq mel_opts.high_freq = high_freq mel_opts.vtln_low = vtln_low @@ -79,7 +80,7 @@ def fbank(wav, mel_opts.debug_mel = debug_mel mel_opts.htk_mode = htk_mode - fbank_opts.use_energy = use_energy + fbank_opts.use_energy = use_energy fbank_opts.energy_floor = energy_floor fbank_opts.raw_energy = raw_energy fbank_opts.htk_compat = htk_compat @@ -88,6 +89,7 @@ def fbank(wav, feat = ComputeFbank(frame_opts, mel_opts, fbank_opts, wav) return feat + @module_utils.requires_kaldi() def pitch(wav, samp_freq: int=16000, @@ -114,7 +116,7 @@ def pitch(wav, pitch_opts.samp_freq = samp_freq pitch_opts.frame_shift_ms = frame_shift_ms pitch_opts.frame_length_ms = frame_length_ms - pitch_opts.preemph_coeff = preemph_coeff + pitch_opts.preemph_coeff = preemph_coeff pitch_opts.min_f0 = min_f0 pitch_opts.max_f0 = max_f0 pitch_opts.soft_min_f0 = soft_min_f0 diff --git a/paddlespeech/audio/src/CMakeLists.txt b/paddlespeech/audio/src/CMakeLists.txt index 4249e04e2..eea07f637 100644 --- a/paddlespeech/audio/src/CMakeLists.txt +++ b/paddlespeech/audio/src/CMakeLists.txt @@ -105,7 +105,7 @@ function(define_extension name sources include_dirs libraries definitions) add_library(${name} SHARED ${sources}) target_compile_definitions(${name} PRIVATE "${definitions}") target_include_directories( - ${name} PRIVATE ${PROJECT_SOURCE_DIR} ${Python_INCLUDE_DIR} ${include_dirs}) + ${name} PRIVATE ${PROJECT_SOURCE_DIR} ${Python_INCLUDE_DIR} ${pybind11_INCLUDE_DIR} ${include_dirs}) target_link_libraries( ${name} ${libraries} diff --git a/paddlespeech/audio/src/pybind/kaldi/feature_common.h b/paddlespeech/audio/src/pybind/kaldi/feature_common.h index dbac4ceac..05522bb7e 100644 --- a/paddlespeech/audio/src/pybind/kaldi/feature_common.h +++ b/paddlespeech/audio/src/pybind/kaldi/feature_common.h @@ -14,8 +14,8 @@ #pragma once -#include -#include +#include "pybind11/pybind11.h" +#include "pybind11/numpy.h" #include "feat/feature-window.h" namespace paddleaudio {