Merge pull request #2108 from zh794390558/api

[audio] audio backend
pull/2157/head
Hui Zhang 3 years ago committed by GitHub
commit 5d34b7f66f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -35,6 +35,7 @@ function (onnx_print_configuration_summary)
message(STATUS " BUILD_ONNX_PYTHON : ${BUILD_ONNX_PYTHON}") message(STATUS " BUILD_ONNX_PYTHON : ${BUILD_ONNX_PYTHON}")
message(STATUS " Python version : ${Python_VERSION}") message(STATUS " Python version : ${Python_VERSION}")
message(STATUS " Python executable : ${Python_EXECUTABLE}") message(STATUS " Python executable : ${Python_EXECUTABLE}")
message(STATUS " Python includes : ${Python_INCLUDE_DIRS}") message(STATUS " Python includes : ${Python_INCLUDE_DIR}")
message(STATUS " Python libraries : ${Python_LIBRARY}")
endfunction() endfunction()

@ -12,5 +12,12 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import _locale import _locale
_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8']) _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
from . import audio
# _init_audio_backend must be called after the audio import
audio.backends.utils._init_audio_backend()
__all__ = [
"audio"
]

@ -11,12 +11,28 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from . import compliance from . import compliance
from . import datasets from . import datasets
from . import features from . import features
from . import functional from . import functional
from . import io from . import io
from . import metric from . import metric
from . import sox_effects from . import utils
from .backends import load
from .backends import save from ._ops import ops
from paddlespeech.audio.backends import get_audio_backend, list_audio_backends, set_audio_backend
# Public names exported by the audio package.
# NOTE: every entry needs its own trailing comma; two adjacent string
# literals without a comma are silently concatenated into a single name.
__all__ = [
    "io",
    "compliance",
    "datasets",
    "functional",
    "features",
    "utils",
    'ops',
    "list_audio_backends",
    "get_audio_backend",
    "set_audio_backend",
]

@ -44,7 +44,7 @@ def _load_lib(lib: str) -> bool:
path = _get_lib_path(lib) path = _get_lib_path(lib)
if not path.exists(): if not path.exists():
return False return False
paddlespeech.ops.load_library(path) paddlespeech.audio.ops.load_library(path)
return True return True
@ -56,7 +56,7 @@ def _init_ffmpeg():
if _FFMPEG_INITIALIZED: if _FFMPEG_INITIALIZED:
return return
if not paddlespeech.ops.paddlleaudio.is_ffmpeg_available(): if not paddlespeech.audio.ops.paddlleaudio.is_ffmpeg_available():
raise RuntimeError( raise RuntimeError(
"paddlleaudio is not compiled with FFmpeg integration. Please set USE_FFMPEG=1 when compiling paddlleaudio." "paddlleaudio is not compiled with FFmpeg integration. Please set USE_FFMPEG=1 when compiling paddlleaudio."
) )
@ -67,11 +67,11 @@ def _init_ffmpeg():
raise ImportError( raise ImportError(
"FFmpeg libraries are not found. Please install FFmpeg.") from err "FFmpeg libraries are not found. Please install FFmpeg.") from err
import paddllespeech._paddlleaudio_ffmpeg # noqa import paddllespeech.audio._paddlleaudio_ffmpeg # noqa
paddlespeech.ops.paddlleaudio.ffmpeg_init() paddlespeech.audio.ops.paddlleaudio.ffmpeg_init()
if paddlespeech.ops.paddlleaudio.ffmpeg_get_log_level() > 8: if paddlespeech.audio.ops.paddlleaudio.ffmpeg_get_log_level() > 8:
paddlespeech.ops.paddlleaudio.ffmpeg_set_log_level(8) paddlespeech.audio.ops.paddlleaudio.ffmpeg_set_log_level(8)
_FFMPEG_INITIALIZED = True _FFMPEG_INITIALIZED = True
@ -84,7 +84,7 @@ def _init_extension():
_load_lib("libpaddleaudio") _load_lib("libpaddleaudio")
# This import is for initializing the methods registered via PyBind11 # This import is for initializing the methods registered via PyBind11
# This has to happen after the base library is loaded # This has to happen after the base library is loaded
from paddlespeech import _paddleaudio # noqa from paddlespeech.audio import _paddleaudio # noqa
# Because this part is executed as part of `import torchaudio`, we ignore the # Because this part is executed as part of `import torchaudio`, we ignore the
# initialization failure. # initialization failure.

@ -11,9 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from .soundfile_backend import depth_convert
from .soundfile_backend import load # flake8: noqa
from .soundfile_backend import normalize from . import utils
from .soundfile_backend import resample from .utils import get_audio_backend, list_audio_backends, set_audio_backend
from .soundfile_backend import save
from .soundfile_backend import to_mono

@ -23,11 +23,11 @@ import soundfile as sf
from scipy.io import wavfile from scipy.io import wavfile
from ..utils import ParameterError from ..utils import ParameterError
from ..utils import depth_convert
__all__ = [ __all__ = [
'resample', 'resample',
'to_mono', 'to_mono',
'depth_convert',
'normalize', 'normalize',
'save', 'save',
'load', 'load',
@ -117,78 +117,6 @@ def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
return y_out return y_out
def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
"""Data type casting in a safe way, i.e., prevent overflow or underflow.
Args:
y (np.ndarray): Input waveform array in 1D or 2D.
dtype (Union[type, str]): Data type of waveform.
Returns:
np.ndarray: `y` after safe casting.
"""
if 'float' in str(y.dtype):
return np.clip(y, np.finfo(dtype).min,
np.finfo(dtype).max).astype(dtype)
else:
return np.clip(y, np.iinfo(dtype).min,
np.iinfo(dtype).max).astype(dtype)
def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
    """Convert a waveform to another sample dtype while keeping its range.

    Supported dtypes on both sides are 'int16', 'int8', 'float32' and
    'float64'. Integer targets are scaled by the integer maximum, float
    targets are divided by the maximum of the source integer type.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        dtype (Union[type, str]): Target data type of the waveform.

    Returns:
        np.ndarray: The converted waveform; `y` itself when it already has
            the requested dtype.

    Raises:
        ParameterError: If `y.dtype` or `dtype` is not a supported dtype.
    """
    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
    float_names = ('float64', 'float32')

    if y.dtype not in SUPPORT_DTYPE:
        raise ParameterError(
            'Unsupported audio dtype, '
            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')
    if dtype not in SUPPORT_DTYPE:
        raise ParameterError(
            'Unsupported audio dtype, '
            f'target dtype is {dtype}, supported dtypes are {SUPPORT_DTYPE}')

    # Already in the requested format: hand the input back untouched.
    if dtype == y.dtype:
        return y

    # float <-> float only needs a range-checked cast.
    widen = dtype == 'float64' and y.dtype == 'float32'
    narrow = dtype == 'float32' and y.dtype == 'float64'
    if widen or narrow:
        return _safe_cast(y, dtype)

    if dtype in ('int16', 'int8'):
        if y.dtype in float_names:
            # float -> int: scale to the integer range, clip, then cast.
            target = np.iinfo(dtype)
            scaled = y * target.max
            return np.clip(scaled, target.min, target.max).astype(dtype)
        if dtype == 'int16' and y.dtype == 'int8':
            # int8 -> int16: widen through float32.
            factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
            return (y.astype('float32') * factor).astype('int16')
        # Remaining case, int16 -> int8: widen to int32 before scaling down.
        shrunk = y.astype('int32') * np.iinfo('int8').max / \
            np.iinfo('int16').max
        return shrunk.astype('int8')

    # Only int -> float is left: normalise by the source integer maximum.
    source_max = np.iinfo(y.dtype).max
    return y.astype(dtype) / source_max
def sound_file_load(file: os.PathLike, def sound_file_load(file: os.PathLike,
offset: Optional[float]=None, offset: Optional[float]=None,
dtype: str='int16', dtype: str='int16',
@ -323,3 +251,7 @@ def load(
y = depth_convert(y, dtype) y = depth_convert(y, dtype)
return y, r return y, r
def info(filepath: str) -> None:
    """Placeholder for an audio metadata query; it always fails.

    Args:
        filepath (str): Path of the audio file. It is not inspected.

    Raises:
        RuntimeError: Always, reporting that no audio I/O backend is
            available.
    """
    message = "No audio I/O backend is available."
    raise RuntimeError(message)

@ -1,13 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -22,7 +22,7 @@ import scipy
from numpy.lib.stride_tricks import as_strided from numpy.lib.stride_tricks import as_strided
from scipy import signal from scipy import signal
from ..backends import depth_convert from ..utils import depth_convert
from ..utils import ParameterError from ..utils import ParameterError
__all__ = [ __all__ = [

@ -16,7 +16,6 @@ from typing import List
import numpy as np import numpy as np
import paddle import paddle
from ..backends import load as load_audio
from ..compliance.kaldi import fbank as kaldi_fbank from ..compliance.kaldi import fbank as kaldi_fbank
from ..compliance.kaldi import mfcc as kaldi_mfcc from ..compliance.kaldi import mfcc as kaldi_mfcc
from ..compliance.librosa import melspectrogram from ..compliance.librosa import melspectrogram
@ -70,9 +69,9 @@ class AudioClassificationDataset(paddle.io.Dataset):
file, label = self.files[idx], self.labels[idx] file, label = self.files[idx], self.labels[idx]
if self.sample_rate is None: if self.sample_rate is None:
waveform, sample_rate = load_audio(file) waveform, sample_rate = paddlespeech.audio.load(file)
else: else:
waveform, sample_rate = load_audio(file, sr=self.sample_rate) waveform, sample_rate = paddlespeech.audio.load(file, sr=self.sample_rate)
feat_func = feat_funcs[self.feat_type] feat_func = feat_funcs[self.feat_type]

@ -20,8 +20,6 @@ from typing import List
from paddle.io import Dataset from paddle.io import Dataset
from tqdm import tqdm from tqdm import tqdm
from ..backends import load as load_audio
from ..backends import save as save_wav
from ..utils import DATA_HOME from ..utils import DATA_HOME
from ..utils.download import download_and_decompress from ..utils.download import download_and_decompress
from .dataset import feat_funcs from .dataset import feat_funcs
@ -105,7 +103,7 @@ class OpenRIRNoise(Dataset):
for field in type(sample)._fields: for field in type(sample)._fields:
record[field] = getattr(sample, field) record[field] = getattr(sample, field)
waveform, sr = load_audio(record['wav']) waveform, sr = paddlespeech.audio.load(record['wav'])
assert self.feat_type in feat_funcs.keys(), \ assert self.feat_type in feat_funcs.keys(), \
f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}" f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
@ -128,7 +126,7 @@ class OpenRIRNoise(Dataset):
def _get_audio_info(self, wav_file: str, def _get_audio_info(self, wav_file: str,
split_chunks: bool) -> List[List[str]]: split_chunks: bool) -> List[List[str]]:
waveform, sr = load_audio(wav_file) waveform, sr = paddlespeech.audio.load(wav_file)
audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0] audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0]
audio_duration = waveform.shape[0] / sr audio_duration = waveform.shape[0] / sr
@ -143,7 +141,7 @@ class OpenRIRNoise(Dataset):
end_sample = int(float(e) * sr) end_sample = int(float(e) * sr)
new_wav_file = os.path.join(self.base_path, new_wav_file = os.path.join(self.base_path,
audio_id + f'_chunk_{idx+1:02}.wav') audio_id + f'_chunk_{idx+1:02}.wav')
save_wav(waveform[start_sample:end_sample], sr, new_wav_file) paddlespeech.audio.save(waveform[start_sample:end_sample], sr, new_wav_file)
# id, duration, new_wav # id, duration, new_wav
ret.append([chunk, self.chunk_duration, new_wav_file]) ret.append([chunk, self.chunk_duration, new_wav_file])
else: # Keep whole audio. else: # Keep whole audio.

@ -23,7 +23,6 @@ from paddle.io import Dataset
from pathos.multiprocessing import Pool from pathos.multiprocessing import Pool
from tqdm import tqdm from tqdm import tqdm
from ..backends import load as load_audio
from ..utils import DATA_HOME from ..utils import DATA_HOME
from ..utils import decompress from ..utils import decompress
from ..utils.download import download_and_decompress from ..utils.download import download_and_decompress
@ -192,7 +191,7 @@ class VoxCeleb(Dataset):
for field in type(sample)._fields: for field in type(sample)._fields:
record[field] = getattr(sample, field) record[field] = getattr(sample, field)
waveform, sr = load_audio(record['wav']) waveform, sr = paddlespeech.audio.load(record['wav'])
# random select a chunk audio samples from the audio # random select a chunk audio samples from the audio
if self.random_chunk: if self.random_chunk:
@ -231,7 +230,7 @@ class VoxCeleb(Dataset):
def _get_audio_info(self, wav_file: str, def _get_audio_info(self, wav_file: str,
split_chunks: bool) -> List[List[str]]: split_chunks: bool) -> List[List[str]]:
waveform, sr = load_audio(wav_file) waveform, sr = paddlespeech.audio.load(wav_file)
spk_id, sess_id, utt_id = wav_file.split("/")[-3:] spk_id, sess_id, utt_id = wav_file.split("/")[-3:]
audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]]) audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]])
audio_duration = waveform.shape[0] / sr audio_duration = waveform.shape[0] / sr

@ -1,13 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -13,11 +13,18 @@
# limitations under the License. # limitations under the License.
from ...cli.utils import DATA_HOME from ...cli.utils import DATA_HOME
from ...cli.utils import MODEL_HOME from ...cli.utils import MODEL_HOME
from .download import decompress from .download import decompress
from .download import download_and_decompress from .download import download_and_decompress
from .download import load_state_dict_from_url from .download import load_state_dict_from_url
from .error import ParameterError from .error import ParameterError
from .log import Logger from .log import Logger
from .log import logger from .log import logger
from .time import seconds_to_hms from .time import seconds_to_hms
from .time import Timer from .time import Timer
from .numeric import pcm16to32
from .numeric import depth_convert

@ -12,7 +12,12 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import numpy as np import numpy as np
from typing import Union
__all__ = [
"pcm16to32",
"depth_convert"
]
def pcm16to32(audio: np.ndarray) -> np.ndarray: def pcm16to32(audio: np.ndarray) -> np.ndarray:
"""pcm int16 to float32 """pcm int16 to float32
@ -28,3 +33,76 @@ def pcm16to32(audio: np.ndarray) -> np.ndarray:
bits = np.iinfo(np.int16).bits bits = np.iinfo(np.int16).bits
audio = audio / (2**(bits - 1)) audio = audio / (2**(bits - 1))
return audio return audio
def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
"""Data type casting in a safe way, i.e., prevent overflow or underflow.
Args:
y (np.ndarray): Input waveform array in 1D or 2D.
dtype (Union[type, str]): Data type of waveform.
Returns:
np.ndarray: `y` after safe casting.
"""
if 'float' in str(y.dtype):
return np.clip(y, np.finfo(dtype).min,
np.finfo(dtype).max).astype(dtype)
else:
return np.clip(y, np.iinfo(dtype).min,
np.iinfo(dtype).max).astype(dtype)
def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
    """Convert audio array to target dtype safely.

    This function converts an audio waveform to a target dtype, with the
    additional steps of preventing overflow/underflow and preserving the
    audio range. Supported dtypes on both sides are 'int16', 'int8',
    'float32' and 'float64'.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        dtype (Union[type, str]): Target data type of the waveform.

    Returns:
        np.ndarray: The converted waveform; `y` itself when it already has
            the requested dtype.

    Raises:
        ParameterError: If `y.dtype` or `dtype` is not a supported dtype.
    """
    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
    float_names = ('float32', 'float64')

    problem = None
    if y.dtype not in SUPPORT_DTYPE:
        problem = f'y.dtype is {y.dtype}'
    elif dtype not in SUPPORT_DTYPE:
        problem = f'target dtype is {dtype}'
    if problem is not None:
        # Imported here because the module-level imports visible for this
        # file (numpy, typing.Union) do not bring ParameterError into scope.
        from .error import ParameterError
        raise ParameterError(
            'Unsupported audio dtype, '
            f'{problem}, supported dtypes are {SUPPORT_DTYPE}')

    # Already in the requested format: hand the input back untouched.
    if dtype == y.dtype:
        return y

    if dtype in float_names:
        if y.dtype in float_names:
            # float <-> float only needs a range-checked cast.
            return _safe_cast(y, dtype)
        # int -> float: normalise by the source integer maximum.
        return y.astype(dtype) / np.iinfo(y.dtype).max

    # From here on the target is 'int16' or 'int8'.
    target = np.iinfo(dtype)
    if y.dtype in float_names:
        # float -> int: scale to the integer range before clipping.
        scaled = y * target.max
    elif dtype == 'int16':
        # int8 -> int16: widen through float32. No epsilon is subtracted
        # from the factor (the previous code referenced an `EPS` name that
        # is not defined in the visible parts of this module); the clip
        # below bounds the result instead.
        scaled = y.astype('float32') * (
            np.iinfo('int16').max / np.iinfo('int8').max)
    else:
        # int16 -> int8: widen to int32 so the multiplication cannot wrap.
        scaled = y.astype('int32') * np.iinfo('int8').max / \
            np.iinfo('int16').max

    # Clip on every integer path: without it int8 -128 scales to about
    # -33025, which does not fit into int16.
    return np.clip(scaled, target.min, target.max).astype(dtype)

@ -27,7 +27,7 @@ from yacs.config import CfgNode
from ..executor import BaseExecutor from ..executor import BaseExecutor
from ..log import logger from ..log import logger
from ..utils import stats_wrapper from ..utils import stats_wrapper
from paddlespeech.audio.backends import load as load_audio from paddlespeech.audio import load as load_audio
from paddlespeech.audio.compliance.librosa import melspectrogram from paddlespeech.audio.compliance.librosa import melspectrogram
from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.io.batch import feature_normalize
from paddlespeech.vector.modules.sid_model import SpeakerIdetification from paddlespeech.vector.modules.sid_model import SpeakerIdetification

@ -18,7 +18,7 @@ import numpy as np
from paddle import inference from paddle import inference
from scipy.special import softmax from scipy.special import softmax
from paddlespeech.audio.backends import load as load_audio from paddlespeech.audio import load as load_audio
from paddlespeech.audio.datasets import ESC50 from paddlespeech.audio.datasets import ESC50
from paddlespeech.audio.features import melspectrogram from paddlespeech.audio.features import melspectrogram

@ -17,7 +17,7 @@ from collections import OrderedDict
import numpy as np import numpy as np
import paddle import paddle
from paddlespeech.audio.backends import load as load_audio from paddlespeech.audio import load as load_audio
from paddlespeech.audio.compliance.librosa import melspectrogram from paddlespeech.audio.compliance.librosa import melspectrogram
from paddlespeech.cli.log import logger from paddlespeech.cli.log import logger
from paddlespeech.cli.vector.infer import VectorExecutor from paddlespeech.cli.vector.infer import VectorExecutor

@ -18,7 +18,7 @@ import time
import paddle import paddle
from yacs.config import CfgNode from yacs.config import CfgNode
from paddlespeech.audio.backends import load as load_audio from paddlespeech.audio import load as load_audio
from paddlespeech.audio.compliance.librosa import melspectrogram from paddlespeech.audio.compliance.librosa import melspectrogram
from paddlespeech.s2t.utils.log import Log from paddlespeech.s2t.utils.log import Log
from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.io.batch import feature_normalize

@ -90,7 +90,7 @@ class CMakeBuild(build_ext):
f"-DCMAKE_INSTALL_PREFIX={extdir}", f"-DCMAKE_INSTALL_PREFIX={extdir}",
"-DCMAKE_VERBOSE_MAKEFILE=ON", "-DCMAKE_VERBOSE_MAKEFILE=ON",
f"-DPython_INCLUDE_DIR={distutils.sysconfig.get_python_inc()}", f"-DPython_INCLUDE_DIR={distutils.sysconfig.get_python_inc()}",
f"-DPYTHON_LIBRARY={distutils.sysconfig.get_config_var('LIBDIR')}", f"-DPython_LIBRARY={distutils.sysconfig.get_config_var('LIBDIR')}",
f"-DBUILD_SOX:BOOL={'ON' if _BUILD_SOX else 'OFF'}", f"-DBUILD_SOX:BOOL={'ON' if _BUILD_SOX else 'OFF'}",
f"-DBUILD_MAD:BOOL={'ON' if _BUILD_MAD else 'OFF'}", f"-DBUILD_MAD:BOOL={'ON' if _BUILD_MAD else 'OFF'}",
# f"-DBUILD_KALDI:BOOL={'ON' if _BUILD_KALDI else 'OFF'}", # f"-DBUILD_KALDI:BOOL={'ON' if _BUILD_KALDI else 'OFF'}",

Loading…
Cancel
Save