add paddleaudio test

3 years ago · 12339daddb
parent 1c4e4e7d9c
commit 12339daddb
8 changed files with 259 additions and 22 deletions
--- a/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
@ -2,24 +2,12 @@
 # compile kaldi without openfst
 add_definitions("-DCOMPILE_WITHOUT_OPENFST")

-# function (define_library name source include_dirs link_libraries compile_defs)
-#   add_library(${name} INTERFACE ${source})
-#   target_include_directories(${name} INTERFACE ${include_dirs})
-#   target_link_libraries(${name} INTERFACE ${link_libraries})
-#   target_compile_definitions(${name} INTERFACE ${compile_defs})
-#   set_target_properties(${name} PROPERTIES PREFIX "")
-#   if (MSVC)
-#     set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
-#   endif(MSVC)
-#   install(
-#     TARGETS ${name}
-#     LIBRARY DESTINATION lib
-#     RUNTIME DESTINATION lib  # For Windows
-#     )
-# endfunction()
-
-
-
+# Vendor the Kaldi sources from speechx into this directory (they used to be
+# symlinks). Each sub-directory is checked on its own so that an interrupted or
+# partial copy is completed on the next configure instead of being skipped
+# because `base` happens to exist already.
+set(_speechx_kaldi_dir ${CMAKE_CURRENT_LIST_DIR}/../../../../speechx/speechx/kaldi)
+foreach(_kaldi_module base feat matrix util)
+    if (NOT EXISTS ${CMAKE_CURRENT_LIST_DIR}/${_kaldi_module})
+        file(COPY ${_speechx_kaldi_dir}/${_kaldi_module} DESTINATION ${CMAKE_CURRENT_LIST_DIR})
+    endif()
+endforeach()

 # kaldi-base
 add_library(kaldi-base STATIC
--- a/audio/paddleaudio/third_party/kaldi/base
+++ b/audio/paddleaudio/third_party/kaldi/base
@ -1 +0,0 @@
-../../../../speechx/speechx/kaldi/base
--- a/audio/paddleaudio/third_party/kaldi/feat
+++ b/audio/paddleaudio/third_party/kaldi/feat
@ -1 +0,0 @@
-../../../../speechx/speechx/kaldi/feat
--- a/audio/paddleaudio/third_party/kaldi/matrix
+++ b/audio/paddleaudio/third_party/kaldi/matrix
@ -1 +0,0 @@
-../../../../speechx/speechx/kaldi/matrix
--- a/audio/paddleaudio/third_party/kaldi/util
+++ b/audio/paddleaudio/third_party/kaldi/util
@ -1 +0,0 @@
-../../../../speechx/speechx/kaldi/util
--- a/audio/tests/backends/soundfile/common_utils
+++ b/audio/tests/backends/soundfile/common_utils
@ -0,0 +1 @@
+../../common_utils
--- a/audio/tests/common_utils/data_utils.py
+++ b/audio/tests/common_utils/data_utils.py
@ -0,0 +1,136 @@
+import os.path
+from typing import Optional, Union
+
+import paddle
+import json
+
+from parameterized import param, parameterized
+# Adapted, with modifications, from https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/common_utils/data_utils.py
+
+_TEST_DIR_PATH = os.path.realpath(os.path.join(os.path.dirname(__file__), ".."))
+
+
+def get_asset_path(*paths):
+    """Return full path of a test asset"""
+    return os.path.join(_TEST_DIR_PATH, "assets", *paths)
+
+def load_params(*paths):
+    with open(get_asset_path(*paths), "r") as file:
+        return [param(json.loads(line)) for line in file]
+
+def load_effects_params(*paths):
+    params = []
+    with open(*paths, "r") as file:
+        for line in file:
+            data = json.loads(line)
+            for effect in data["effects"]:
+                for i, arg in enumerate(effect):
+                    if arg.startswith("<ASSET_DIR>"):
+                        effect[i] = arg.replace("<ASSET_DIR>", get_asset_path())
+            params.append(param(data))
+    return params
+
+def convert_tensor_encoding(
+    tensor: paddle.Tensor,
+    dtype: paddle.dtype,
+):
+    """Convert input tensor with values between -1 and 1 to integer encoding
+
+    Args:
+        tensor: input tensor, assumed between -1 and 1
+        dtype: desired output tensor dtype
+
+    Returns:
+        Tensor: the input converted to ``dtype``. Scaling is applied only for
+        int32, int16 and uint8; any other dtype is converted without scaling.
+
+    NOTE(review): the scaling uses augmented assignment (``*=`` / ``+=``), so
+    the caller's tensor may be modified if paddle performs these in place —
+    confirm before relying on the input being left untouched.
+    """
+    # Positive samples are scaled by the largest positive value of the target
+    # type and negative samples by the magnitude of its most negative value,
+    # so that -1 and 1 map onto the two ends of the integer range.
+    if dtype == paddle.int32:
+        tensor *= (tensor > 0) * 2147483647 + (tensor < 0) * 2147483648
+    if dtype == paddle.int16:
+        tensor *= (tensor > 0) * 32767 + (tensor < 0) * 32768
+    if dtype == paddle.uint8:
+        tensor *= (tensor > 0) * 127 + (tensor < 0) * 128
+        # Shift the signed range up by 128 so the values fit an unsigned byte.
+        tensor += 128
+    tensor = paddle.to_tensor(tensor, dtype)
+    return tensor
+
+
+#def get_whitenoise(
+    #*,
+    #sample_rate: int = 16000,
+    #duration: float = 1,  # seconds
+    #n_channels: int = 1,
+    #seed: int = 0,
+    #dtype: Union[str, paddle.dtype] = "float32",
+    #device: Union[str, paddle.device] = "cpu",
+    #channels_first=True,
+    #scale_factor: float = 1,
+#):
+    #"""Generate pseudo audio data with whitenoise
+    #Args:
+        #sample_rate: Sampling rate
+        #duration: Length of the resulting Tensor in seconds.
+        #n_channels: Number of channels
+        #seed: Seed value used for random number generation.
+            #Note that this function does not modify global random generator state.
+        #dtype: Torch dtype
+        #device: device
+        #channels_first: whether first dimension is n_channels
+        #scale_factor: scale the Tensor before clamping and quantization
+    #Returns:
+        #Tensor: shape of (n_channels, sample_rate * duration)
+    #"""
+    #if isinstance(dtype, str):
+        #dtype = getattr(paddle, dtype)
+    #if dtype not in [paddle.float64, paddle.float32, paddle.int32, paddle.int16, paddle.uint8]:
+        #raise NotImplementedError(f"dtype {dtype} is not supported.")
+    ## According to the doc, forking the RNG on all CUDA devices is slow when there are many CUDA devices,
+    ## so we only fork on the CPU, generate the values there and move the data to the given device.
+    #with paddle.random.fork_rng([]):
+        #paddle.random.manual_seed(seed)
+        #tensor = paddle.randn([n_channels, int(sample_rate * duration)], dtype=paddle.float32, device="cpu")
+    #tensor /= 2.0
+    #tensor *= scale_factor
+    #tensor.clamp_(-1.0, 1.0)
+    #if not channels_first:
+        #tensor = tensor.t()
+
+    #tensor = tensor.to(device)
+
+    #return convert_tensor_encoding(tensor, dtype)
+
+
+def get_sinusoid(
+    *,
+    frequency: float = 300,
+    sample_rate: int = 16000,
+    duration: float = 1,  # seconds
+    n_channels: int = 1,
+    dtype: str = "float32",
+    device: str = "cpu",
+    channels_first: bool = True,
+):
+    """Generate pseudo audio data with sine wave.
+
+    Args:
+        frequency: Frequency of sine wave
+        sample_rate: Sampling rate
+        duration: Length of the resulting Tensor in seconds.
+        n_channels: Number of channels
+        dtype: Output dtype, given as a paddle dtype or as the name of one
+            (a string is looked up as an attribute of ``paddle``).
+        device: Accepted for signature compatibility; not used by this
+            function body.
+        channels_first: When false, the result is transposed with
+            ``paddle.t`` before the dtype conversion.
+
+    Returns:
+        Tensor: shape of (n_channels, sample_rate * duration) when
+        ``channels_first`` is true — presumably the transpose of that
+        otherwise; confirm against callers.
+    """
+    if isinstance(dtype, str):
+        dtype = getattr(paddle, dtype)
+    pie2 = 2 * 3.141592653589793
+    # Phase reached at the end of the signal: 2*pi * frequency * duration.
+    end = pie2 * frequency * duration
+    num_frames = int(sample_rate * duration)
+    # Randomize the initial phase. (except the first channel)
+    theta0 = pie2 * paddle.randn([n_channels, 1], dtype=paddle.float32)
+    theta0[0, :] = 0
+    theta = paddle.linspace(0, end, num_frames, dtype=paddle.float32)
+    # Adds the per-channel phase offset to the shared phase ramp
+    # (presumably broadcast to one row per channel — TODO confirm).
+    theta = theta0 + theta
+    tensor = paddle.sin(theta)
+    if not channels_first:
+        tensor = paddle.t(tensor)
+    return convert_tensor_encoding(tensor, dtype)
--- a/audio/tests/common_utils/sox_utils.py
+++ b/audio/tests/common_utils/sox_utils.py
@ -0,0 +1,116 @@
+import subprocess
+import sys
+import warnings
+
+
+def get_encoding(dtype):
+    encodings = {
+        "float32": "floating-point",
+        "int32": "signed-integer",
+        "int16": "signed-integer",
+        "uint8": "unsigned-integer",
+    }
+    return encodings[dtype]
+
+
+def get_bit_depth(dtype):
+    bit_depths = {
+        "float32": 32,
+        "int32": 32,
+        "int16": 16,
+        "uint8": 8,
+    }
+    return bit_depths[dtype]
+
+
+def gen_audio_file(
+    path,
+    sample_rate,
+    num_channels,
+    *,
+    encoding=None,
+    bit_depth=None,
+    compression=None,
+    attenuation=None,
+    duration=1,
+    comment_file=None,
+):
+    """Generate synthetic audio file with `sox` command."""
+    if path.endswith(".wav"):
+        warnings.warn("Use get_wav_data and save_wav to generate wav file for accurate result.")
+    command = [
+        "sox",
+        "-V3",  # verbose
+        "--no-dither",  # disable automatic dithering
+        "-R",
+        # -R is supposed to be repeatable, though the implementation looks suspicious
+        # and not setting the seed to a fixed value.
+        # https://fossies.org/dox/sox-14.4.2/sox_8c_source.html
+        # search "sox_globals.repeatable"
+    ]
+    if bit_depth is not None:
+        command += ["--bits", str(bit_depth)]
+    command += [
+        "--rate",
+        str(sample_rate),
+        "--null",  # no input
+        "--channels",
+        str(num_channels),
+    ]
+    if compression is not None:
+        command += ["--compression", str(compression)]
+    if bit_depth is not None:
+        command += ["--bits", str(bit_depth)]
+    if encoding is not None:
+        command += ["--encoding", str(encoding)]
+    if comment_file is not None:
+        command += ["--comment-file", str(comment_file)]
+    command += [
+        str(path),
+        "synth",
+        str(duration),  # synthesizes for the given duration [sec]
+        "sawtooth",
+        "1",
+        # saw tooth covers the both ends of value range, which is a good property for test.
+        # similar to linspace(-1., 1.)
+        # this introduces bigger boundary effect than sine when converted to mp3
+    ]
+    if attenuation is not None:
+        command += ["vol", f"-{attenuation}dB"]
+    print(" ".join(command), file=sys.stderr)
+    subprocess.run(command, check=True)
+
+
+def convert_audio_file(src_path, dst_path, *, encoding=None, bit_depth=None, compression=None):
+    """Convert audio file with `sox` command."""
+    command = ["sox", "-V3", "--no-dither", "-R", str(src_path)]
+    if encoding is not None:
+        command += ["--encoding", str(encoding)]
+    if bit_depth is not None:
+        command += ["--bits", str(bit_depth)]
+    if compression is not None:
+        command += ["--compression", str(compression)]
+    command += [dst_path]
+    print(" ".join(command), file=sys.stderr)
+    subprocess.run(command, check=True)
+
+
+def _flattern(effects):
+    if not effects:
+        return effects
+    if isinstance(effects[0], str):
+        return effects
+    return [item for sublist in effects for item in sublist]
+
+
+def run_sox_effect(input_file, output_file, effect, *, output_sample_rate=None, output_bitdepth=None):
+    """Run sox effects"""
+    effect = _flattern(effect)
+    command = ["sox", "-V", "--no-dither", input_file]
+    if output_bitdepth:
+        command += ["--bits", str(output_bitdepth)]
+    command += [output_file] + effect
+    if output_sample_rate:
+        command += ["rate", str(output_sample_rate)]
+    print(" ".join(command))
+    subprocess.run(command, check=True)