diff --git a/paddlespeech/audio/__init__.py b/paddlespeech/audio/__init__.py
index 26da1991f..8a231ae5b 100644
--- a/paddlespeech/audio/__init__.py
+++ b/paddlespeech/audio/__init__.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from . import _extension
 from . import compliance
 from . import datasets
 from . import features
@@ -18,7 +20,6 @@ from . import functional
 from . import io
 from . import metric
 from . import utils
-from ._ops import ops
 from paddlespeech.audio.backends import get_audio_backend
 from paddlespeech.audio.backends import list_audio_backends
 from paddlespeech.audio.backends import set_audio_backend
@@ -30,7 +31,6 @@ __all__ = [
     "functional",
     "features",
     "utils",
-    'ops'
     "list_audio_backends",
     "get_audio_backend",
     "set_audio_backend",
diff --git a/paddlespeech/audio/_extension.py b/paddlespeech/audio/_extension.py
index 5629a2826..000fae131 100644
--- a/paddlespeech/audio/_extension.py
+++ b/paddlespeech/audio/_extension.py
@@ -4,8 +4,71 @@ from pathlib import Path
 
 from ._internal import module_utils as _mod_utils  # noqa: F401
 
-_LIB_DIR = Path(__file__) / "lib"
+import contextlib
+import ctypes
+import os
+import sys
+import types
+
+# Query `hasattr` only once.
+_SET_GLOBAL_FLAGS = hasattr(sys, 'getdlopenflags') and hasattr(sys,
+                                                              'setdlopenflags')
+
+
+@contextlib.contextmanager
+def dl_open_guard():
+    """
+    # https://manpages.debian.org/bullseye/manpages-dev/dlopen.3.en.html
+    Context manager to set the RTLD_GLOBAL dynamic linker flag while we open a
+    shared library to load custom operators.
+    """
+    if _SET_GLOBAL_FLAGS:
+        old_flags = sys.getdlopenflags()
+        sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
+    try:
+        yield
+    finally:
+        if _SET_GLOBAL_FLAGS:
+            sys.setdlopenflags(old_flags)
+
+
+def resolve_library_path(path: str) -> str:
+    return os.path.realpath(path)
+
+
+class _Ops(types.ModuleType):
+    #__file__ = '_ops.py'
+
+    def __init__(self):
+        super(_Ops, self).__init__('paddlespeech.ops')
+        self.loaded_libraries = set()
+
+    def load_library(self, path):
+        """
+        Loads a shared library from the given path into the current process.
+        This allows dynamically loading custom operators. For this,
+        you should compile your operator and
+        the static registration code into a shared library object, and then
+        call ``paddlespeech.ops.load_library('path/to/libcustom.so')`` to load the
+        shared object.
+        After the library is loaded, it is added to the
+        ``paddlespeech.ops.loaded_libraries`` attribute, a set that may be inspected
+        for the paths of all libraries loaded using this function.
+        Args:
+            path (str): A path to a shared library to load.
+        """
+        path = resolve_library_path(path)
+        with dl_open_guard():
+            # https://docs.python.org/3/library/ctypes.html?highlight=ctypes#loading-shared-libraries
+            # Import the shared library into the process, thus running its
+            # static (global) initialization code in order to register custom
+            # operators with the JIT.
+            ctypes.CDLL(path)
+        self.loaded_libraries.add(path)
+
+
+_LIB_DIR = Path(__file__).parent / "lib"
 
 
 def _get_lib_path(lib: str):
     suffix = "pyd" if os.name == "nt" else "so"
@@ -42,9 +105,12 @@ def _load_lib(lib: str) -> bool:
     If a dependency is missing, then users have to install it.
     """
     path = _get_lib_path(lib)
+    warnings.warn("lib path is :" + str(path))
     if not path.exists():
+        warnings.warn("lib path is not exists:" + str(path))
         return False
-    paddlespeech.audio.ops.load_library(path)
+    #paddlespeech.audio.ops.load_library(path)
+    ops.load_library(path)
     return True
 
 
@@ -56,7 +122,7 @@ def _init_ffmpeg():
     if _FFMPEG_INITIALIZED:
         return
 
-    if not paddlespeech.audio.ops.paddlleaudio.is_ffmpeg_available():
+    if not paddlespeech.audio._paddleaudio.is_ffmpeg_available():
         raise RuntimeError(
             "paddlleaudio is not compiled with FFmpeg integration. Please set USE_FFMPEG=1 when compiling paddlleaudio."
         )
@@ -69,15 +135,15 @@ def _init_ffmpeg():
 
     import paddllespeech.audio._paddlleaudio_ffmpeg  # noqa
 
-    paddlespeech.audio.ops.paddlleaudio.ffmpeg_init()
-    if paddlespeech.audio.ops.paddlleaudio.ffmpeg_get_log_level() > 8:
-        paddlespeech.audio.ops.paddlleaudio.ffmpeg_set_log_level(8)
+    paddlespeech.audio._paddleaudio.ffmpeg_init()
+    if paddlespeech.audio._paddleaudio.ffmpeg_get_log_level() > 8:
+        paddlespeech.audio._paddleaudio.ffmpeg_set_log_level(8)
 
     _FFMPEG_INITIALIZED = True
 
 
 def _init_extension():
-    if not _mod_utils.is_module_available("paddlespeech._paddleaudio"):
+    if not _mod_utils.is_module_available("paddlespeech.audio._paddleaudio"):
         warnings.warn("paddlespeech C++ extension is not available.")
         return
 
@@ -96,4 +162,6 @@ def _init_extension():
         pass
 
 
+ops = _Ops()
+
 _init_extension()
diff --git a/paddlespeech/audio/backends/common.py b/paddlespeech/audio/backends/common.py
new file mode 100644
index 000000000..7ccab1d33
--- /dev/null
+++ b/paddlespeech/audio/backends/common.py
@@ -0,0 +1,55 @@
+# code from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/common.py
+
+class AudioMetaData:
+    """Return type of the audio backend ``info`` function.
+
+    This class is used by :ref:`"sox_io" backend` and
+    :ref:`"soundfile" backend with the new interface`.
+
+    :ivar int sample_rate: Sample rate
+    :ivar int num_frames: The number of frames
+    :ivar int num_channels: The number of channels
+    :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
+        or when it cannot be accurately inferred.
+    :ivar str encoding: Audio encoding
+        The values encoding can take are one of the following:
+
+            * ``PCM_S``: Signed integer linear PCM
+            * ``PCM_U``: Unsigned integer linear PCM
+            * ``PCM_F``: Floating point linear PCM
+            * ``FLAC``: Flac, Free Lossless Audio Codec
+            * ``ULAW``: Mu-law
+            * ``ALAW``: A-law
+            * ``MP3`` : MP3, MPEG-1 Audio Layer III
+            * ``VORBIS``: OGG Vorbis
+            * ``AMR_WB``: Adaptive Multi-Rate Wideband
+            * ``AMR_NB``: Adaptive Multi-Rate Narrowband
+            * ``OPUS``: Opus
+            * ``HTK``: Single channel 16-bit PCM
+            * ``UNKNOWN`` : None of above
+    """
+
+    def __init__(
+        self,
+        sample_rate: int,
+        num_frames: int,
+        num_channels: int,
+        bits_per_sample: int,
+        encoding: str,
+    ):
+        self.sample_rate = sample_rate
+        self.num_frames = num_frames
+        self.num_channels = num_channels
+        self.bits_per_sample = bits_per_sample
+        self.encoding = encoding
+
+    def __str__(self):
+        return (
+            f"AudioMetaData("
+            f"sample_rate={self.sample_rate}, "
+            f"num_frames={self.num_frames}, "
+            f"num_channels={self.num_channels}, "
+            f"bits_per_sample={self.bits_per_sample}, "
+            f"encoding={self.encoding}"
+            f")"
+        )
diff --git a/paddlespeech/audio/backends/sox_io_backend.py b/paddlespeech/audio/backends/sox_io_backend.py
index f22222d66..a91220042 100644
--- a/paddlespeech/audio/backends/sox_io_backend.py
+++ b/paddlespeech/audio/backends/sox_io_backend.py
@@ -5,9 +5,41 @@ from typing import Tuple
 from typing import Union
 
 from paddle import Tensor
+from .common import AudioMetaData
+
+from paddlespeech.audio._internal import module_utils as _mod_utils
+from paddlespeech.audio._paddleaudio import get_info_file
+from paddlespeech.audio._paddleaudio import get_info_fileobj
 
 #https://github.com/pytorch/audio/blob/main/torchaudio/backend/sox_io_backend.py
 
+def _fail_info(filepath: str, format: Optional[str]) -> AudioMetaData:
+    raise RuntimeError("Failed to fetch metadata from {}".format(filepath))
+
+
+def _fail_info_fileobj(fileobj, format: Optional[str]) -> AudioMetaData:
+    raise RuntimeError("Failed to fetch metadata from {}".format(fileobj))
+
+
+# Note: need to comply TorchScript syntax -- need annotation and no f-string
+def _fail_load(
+    filepath: str,
+    frame_offset: int = 0,
+    num_frames: int = -1,
+    normalize: bool = True,
+    channels_first: bool = True,
+    format: Optional[str] = None,
+) -> Tuple[Tensor, int]:
+    raise RuntimeError("Failed to load audio from {}".format(filepath))
+
+
+def _fail_load_fileobj(fileobj, *args, **kwargs):
+    raise RuntimeError(f"Failed to load audio from {fileobj}")
+
+_fallback_info = _fail_info
+_fallback_info_fileobj = _fail_info_fileobj
+_fallback_load = _fail_load
+_fallback_load_fileobj = _fail_load_fileobj
 
 def load(
         filepath: Union[str, Path],
@@ -19,14 +51,16 @@ def load(
         filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
     raise RuntimeError("No audio I/O backend is available.")
 
-
-def save(filepath: str,
-         src: Tensor,
-         sample_rate: int,
-         precision: int=16,
-         channels_first: bool=True) -> None:
+def save(filepath: str,
+         src: Tensor,
+         sample_rate: int,
+         precision: int = 16,
+         channels_first: bool = True) -> None:
     raise RuntimeError("No audio I/O backend is available.")
 
-
-def info(filepath: str) -> None:
-    raise RuntimeError("No audio I/O backend is available.")
+@_mod_utils.requires_sox()
+def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
+    sinfo = get_info_file(filepath, format)
+    if sinfo is not None:
+        return AudioMetaData(*sinfo)
+    return _fallback_info(filepath, format)
diff --git a/paddlespeech/audio/kaldi/__init__.py b/paddlespeech/audio/kaldi/__init__.py
index 2b52ad23d..f951e280a 100644
--- a/paddlespeech/audio/kaldi/__init__.py
+++ b/paddlespeech/audio/kaldi/__init__.py
@@ -11,5 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from . import fbank
-from . import pitch
+from .kaldi import fbank
+from .kaldi import pitch
diff --git a/paddlespeech/audio/kaldi/kaldi.py b/paddlespeech/audio/kaldi/kaldi.py
index 69347ddbb..e8e5693c4 100644
--- a/paddlespeech/audio/kaldi/kaldi.py
+++ b/paddlespeech/audio/kaldi/kaldi.py
@@ -12,13 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddlespeech.audio._internal import module_utils
-import paddlespeech.audio.ops.paddleaudio.ComputeFbank as ComputeFbank
-import paddlespeech.audio.ops.paddleaudio.PitchExtractionOptions as PitchExtractionOptions
-import paddlespeech.audio.ops.paddleaudio.FrameExtractionOptions as FrameExtractionOptions
-import paddlespeech.audio.ops.paddleaudio.MelBanksOptions as MelBanksOptions
-import paddlespeech.audio.ops.paddleaudio.FbankOptions as FbankOptions
-import paddlespeech.audio.ops.paddleaudio.ComputeKaldiPitch as ComputeKaldiPitch
+import paddlespeech.audio._paddleaudio
+from paddlespeech.audio._internal import module_utils
 
 __all__ = [
     'fbank',
@@ -55,9 +50,9 @@ def fbank(
         htk_compat: bool=False,
         use_log_fbank: bool=True,
         use_power: bool=True):
-    frame_opts = FrameExtractionOptions()
-    mel_opts = MelBanksOptions()
-    fbank_opts = FbankOptions()
+    frame_opts = paddlespeech.audio._paddleaudio.FrameExtractionOptions()
+    mel_opts = paddlespeech.audio._paddleaudio.MelBanksOptions()
+    fbank_opts = paddlespeech.audio._paddleaudio.FbankOptions()
     frame_opts.samp_freq = samp_freq
     frame_opts.frame_shift_ms = frame_shift_ms
     frame_opts.frame_length_ms = frame_length_ms
@@ -86,7 +81,7 @@ def fbank(
     fbank_opts.htk_compat = htk_compat
     fbank_opts.use_log_fbank = use_log_fbank
     fbank_opts.use_power = use_power
-    feat = ComputeFbank(frame_opts, mel_opts, fbank_opts, wav)
+    feat = paddlespeech.audio._paddleaudio.ComputeFbank(frame_opts, mel_opts, fbank_opts, wav)
     return feat
 
 
@@ -112,7 +107,7 @@ def pitch(wav,
           recompute_frame: int=500,
           nccf_ballast_online: bool=False,
           snip_edges: bool=True):
-    pitch_opts = PitchExtractionOptions()
+    pitch_opts = paddlespeech.audio._paddleaudio.PitchExtractionOptions()
     pitch_opts.samp_freq = samp_freq
     pitch_opts.frame_shift_ms = frame_shift_ms
     pitch_opts.frame_length_ms = frame_length_ms
@@ -133,5 +128,5 @@ def pitch(wav,
     pitch_opts.recompute_frame = recompute_frame
     pitch_opts.nccf_ballast_online = nccf_ballast_online
     pitch_opts.snip_edges = snip_edges
-    pitch = ComputeKaldiPitch(pitch_opts, wav)
+    pitch = paddlespeech.audio._paddleaudio.ComputeKaldiPitch(pitch_opts, wav)
     return pitch
diff --git a/tests/unit/audio/features/test_kaldi_feat.py b/tests/unit/audio/features/test_kaldi_feat.py
index 031fdfac2..e0ca1fa1d 100644
--- a/tests/unit/audio/features/test_kaldi_feat.py
+++ b/tests/unit/audio/features/test_kaldi_feat.py
@@ -16,8 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddlespeech.audio.kaldi.fbank as fbank
-import paddlespeech.audio.kaldi.pitch as pitch
+from paddlespeech.audio.kaldi import fbank as fbank
+from paddlespeech.audio.kaldi import pitch as pitch
 from kaldiio import ReadHelper
 
 # the groundtruth feats computed in kaldi command below.