add 2 file test

9 months ago · 080bd7f5db
parent 643f1c6071
commit 080bd7f5db
19 changed files with 804 additions and 414 deletions
--- a/audio/audiotools/core/audio_signal.py
+++ b/audio/audiotools/core/audio_signal.py
@ -17,14 +17,13 @@ import soundfile
 from . import util
 from ._julius import resample_frac
 from .display import DisplayMixin
 from .dsp import DSPMixin
 from .effects import EffectMixin
 from .effects import ImpulseResponseMixin
 from .ffmpeg import FFMPEGMixin
 from .loudness import LoudnessMixin
 # from .display import DisplayMixin
 # from .playback import PlayMixin
 # from .whisper import WhisperMixin
@ -98,7 +97,7 @@ class AudioSignal(
        # PlayMixin,
        ImpulseResponseMixin,
        DSPMixin,
-        # DisplayMixin,
+        DisplayMixin,
        FFMPEGMixin,
        # WhisperMixin,
 ):
@ -1498,6 +1497,8 @@ class AudioSignal(
        amin = amin**2
        log_spec = 10.0 * paddle.log10(magnitude.pow(2).clip(min=amin))
        if paddle.is_tensor(ref_value):
            ref_value = ref_value.item()
        log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))
        if top_db is not None:
--- a/audio/audiotools/core/display.py
+++ b/audio/audiotools/core/display.py
@ -0,0 +1,191 @@
 import inspect
 import typing
 from functools import wraps
 from . import util
 def format_figure(func):
    """Decorator that formats the figure drawn by a plotting function.

    Keyword arguments whose names match parameters of
    :py:func:`audiotools.core.util.format_figure` are removed from the
    call to ``func`` and forwarded to ``util.format_figure`` once ``func``
    has finished drawing.

    Parameters
    ----------
    func : Callable
        Plotting function that is decorated by this function.

    Returns
    -------
    Callable
        Wrapper that calls ``func``, then ``util.format_figure``, and
        returns whatever ``func`` returned.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        f_keys = inspect.signature(util.format_figure).parameters.keys()
        # Split off the formatting options so ``func`` never sees them.
        f_kwargs = {k: kwargs.pop(k) for k in list(kwargs) if k in f_keys}
        result = func(*args, **kwargs)
        util.format_figure(**f_kwargs)
        # Propagate the plotting function's result instead of dropping it.
        return result
    return wrapper
 class DisplayMixin:
    """Plotting and image-export helpers mixed into a signal class.

    The methods below read ``audio_data`` and ``sample_rate`` from the host
    object and call its ``clone``, ``preemphasis``, ``magnitude``,
    ``log_magnitude`` and ``mel_spectrogram`` members.
    """
    @format_figure
    def specshow(
            self,
            preemphasis: bool=False,
            x_axis: str="time",
            y_axis: str="linear",
            n_mels: int=128,
            **kwargs, ):
        """Displays a spectrogram, using ``librosa.display.specshow``.

        Only the first batch item is shown, averaged over its first axis.

        Parameters
        ----------
        preemphasis : bool, optional
            Whether or not to apply preemphasis, which makes high
            frequency detail easier to see, by default False
        x_axis : str, optional
            How to label the x axis, by default "time"
        y_axis : str, optional
            How to label the y axis, by default "linear"
        n_mels : int, optional
            If displaying a mel spectrogram with ``y_axis = "mel"``,
            this controls the number of mels, by default 128.
        kwargs : dict, optional
            Keyword arguments matching
            :py:func:`audiotools.core.util.format_figure` are consumed by
            the ``format_figure`` decorator; the rest are passed to
            ``librosa.display.specshow``.
        """
        import librosa
        import librosa.display
        # Always re-compute the STFT data before showing it, in case
        # it changed. Work on a clone so the caller's signal is untouched.
        signal = self.clone()
        signal.stft_data = None
        if preemphasis:
            signal.preemphasis()
        # Reference the log-magnitude to the loudest bin.
        ref = signal.magnitude.max()
        log_mag = signal.log_magnitude(ref_value=ref)
        if y_axis == "mel":
            # Mel display replaces the linear log-magnitude computed above.
            log_mag = 20 * signal.mel_spectrogram(n_mels).clip(1e-5).log10()
            log_mag -= log_mag.max()
        librosa.display.specshow(
            log_mag.numpy()[0].mean(axis=0),
            x_axis=x_axis,
            y_axis=y_axis,
            sr=signal.sample_rate,
            **kwargs, )
    @format_figure
    def waveplot(self, x_axis: str="time", **kwargs):
        """Displays a waveform plot, using ``librosa.display.waveshow``.

        Falls back to ``librosa.display.waveplot`` when the installed
        librosa has no ``waveshow``. Only the first batch item is shown,
        averaged over its first axis.

        Parameters
        ----------
        x_axis : str, optional
            How to label the x axis, by default "time"
        kwargs : dict, optional
            Keyword arguments matching
            :py:func:`audiotools.core.util.format_figure` are consumed by
            the ``format_figure`` decorator; the rest are passed to the
            librosa plotting function.
        """
        import librosa
        import librosa.display
        audio_data = self.audio_data[0].mean(axis=0)
        audio_data = audio_data.cpu().numpy()
        # Pick whichever waveform plotting function this librosa provides.
        plot_fn = "waveshow" if hasattr(librosa.display,
                                        "waveshow") else "waveplot"
        wave_plot_fn = getattr(librosa.display, plot_fn)
        wave_plot_fn(audio_data, x_axis=x_axis, sr=self.sample_rate, **kwargs)
    @format_figure
    def wavespec(self, x_axis: str="time", **kwargs):
        """Displays a waveform plot stacked above a spectrogram.

        The figure is split into six rows: the waveform takes the first
        row and the spectrogram the remaining five.

        Parameters
        ----------
        x_axis : str, optional
            How to label the x axis, by default "time"
        kwargs : dict, optional
            Keyword arguments to :py:func:`audiotools.core.display.DisplayMixin.specshow`.
        """
        import matplotlib.pyplot as plt
        from matplotlib.gridspec import GridSpec
        gs = GridSpec(6, 1)
        plt.subplot(gs[0, :])
        self.waveplot(x_axis=x_axis)
        plt.subplot(gs[1:, :])
        self.specshow(x_axis=x_axis, **kwargs)
    def write_audio_to_tb(
            self,
            tag: str,
            writer,
            step: int=None,
            plot_fn: typing.Union[typing.Callable, str]="specshow",
            **kwargs, ):
        """Writes a signal and its spectrogram to a summary writer.

        Only the first channel of the first batch item is written as audio.

        Parameters
        ----------
        tag : str
            Tag to write signal to (e.g. ``clean/sample_0.wav``). When the
            tag ends in ``.wav`` the image is written under the same tag
            with a ``.png`` extension (e.g. ``clean/sample_0.png``);
            otherwise the image reuses ``tag`` unchanged.
        writer : SummaryWriter
            Writer object exposing ``add_audio`` and ``add_figure``.
        step : int, optional
            The step to write the signal to, by default None
        plot_fn : typing.Union[typing.Callable, str], optional
            How to create the image. A string is looked up as a method on
            this object. Set to ``None`` to avoid plotting, by default
            "specshow"
        kwargs : dict, optional
            Keyword arguments to :py:func:`audiotools.core.display.DisplayMixin.specshow` or
            whatever ``plot_fn`` is set to.
        """
        import matplotlib.pyplot as plt
        audio_data = self.audio_data[0, 0].detach().cpu().numpy()
        sample_rate = self.sample_rate
        writer.add_audio(tag, audio_data, step, sample_rate)
        if plot_fn is not None:
            if isinstance(plot_fn, str):
                plot_fn = getattr(self, plot_fn)
            # Swap only a trailing ".wav" extension; a blanket
            # ``replace("wav", "png")`` would also rewrite "wav" inside
            # directory or sample names (e.g. "waveform/...").
            image_tag = tag
            if tag.endswith(".wav"):
                image_tag = tag[:-len(".wav")] + ".png"
            fig = plt.figure()
            plt.clf()
            plot_fn(**kwargs)
            writer.add_figure(image_tag, fig, step)
    def save_image(
            self,
            image_path: str,
            plot_fn: typing.Union[typing.Callable, str]="specshow",
            **kwargs, ):
        """Save AudioSignal spectrogram (or whatever ``plot_fn`` is set to) to
        a specified file.

        The current matplotlib figure is cleared before plotting and closed
        after the image has been written.

        Parameters
        ----------
        image_path : str
            Where to save the file to.
        plot_fn : typing.Union[typing.Callable, str], optional
            How to create the image. A string is looked up as a method on
            this object. It is always called, so it must not be ``None``.
            By default "specshow"
        kwargs : dict, optional
            Keyword arguments to :py:func:`audiotools.core.display.DisplayMixin.specshow` or
            whatever ``plot_fn`` is set to.
        """
        import matplotlib.pyplot as plt
        if isinstance(plot_fn, str):
            plot_fn = getattr(self, plot_fn)
        plt.clf()
        plot_fn(**kwargs)
        plt.savefig(image_path, bbox_inches="tight", pad_inches=0)
        plt.close()
--- a/audio/audiotools/core/dsp.py
+++ b/audio/audiotools/core/dsp.py
@ -7,148 +7,201 @@ from . import _julius
 from . import util
 def _unfold(x, kernel_sizes, strides):
    """Call ``paddle.nn.functional.unfold`` with a workaround for kernel height 1.

    When ``kernel_sizes[0]`` is 1, ``x`` is extended with zeros along axis 2
    and unfolded with a kernel height of 2; the result is then cut back to
    its first ``kernel_sizes[1]`` rows along axis 1, dropping the rows that
    came from the zero padding. Any other kernel height is passed through
    to ``unfold`` untouched.

    Parameters
    ----------
    x : paddle.Tensor
        Input to unfold. The only caller in this file passes a tensor
        reshaped to ``[-1, 1, 1, signal_length]`` -- other layouts are
        untested here; TODO confirm before reusing.
    kernel_sizes : tuple
        ``(height, width)`` of the sliding kernel.
    strides : tuple
        ``(height, width)`` strides of the sliding kernel.

    Returns
    -------
    paddle.Tensor
        Unfolded tensor.
    """
    # https://github.com/PaddlePaddle/Paddle/pull/70102
    # Remember whether the workaround was applied: checking the rewritten
    # ``kernel_sizes[0] == 2`` afterwards would also truncate the output of
    # a caller that genuinely asked for a kernel height of 2.
    padded = 1 == kernel_sizes[0]
    if padded:
        x_zeros = paddle.zeros_like(x)
        x = paddle.concat([x, x_zeros], axis=2)
        kernel_sizes = (2, kernel_sizes[1])
    unfolded = paddle.nn.functional.unfold(
        x,
        kernel_sizes=kernel_sizes,
        strides=strides, )
    if padded:
        unfolded = unfolded[:, :kernel_sizes[1]]
    return unfolded
 def _fold(x, output_sizes, kernel_sizes, strides):
    """Call ``paddle.nn.functional.fold`` with a workaround for height 1.

    When both ``output_sizes[0]`` and ``kernel_sizes[0]`` are 1, ``x`` is
    extended with zeros along axis 1 and folded with an output and kernel
    height of 2; the result is then cut back to its first row along axis 2.
    Any other configuration is passed through to ``fold`` untouched.

    Parameters
    ----------
    x : paddle.Tensor
        Unfolded input. The caller in this file passes a tensor transposed
        to ``[nb * nch, window_length, num_windows]``.
    output_sizes : tuple
        ``(height, width)`` of the folded output.
    kernel_sizes : tuple
        ``(height, width)`` of the sliding kernel.
    strides : tuple
        ``(height, width)`` strides of the sliding kernel.

    Returns
    -------
    paddle.Tensor
        Folded tensor.
    """
    # https://github.com/PaddlePaddle/Paddle/pull/70102
    # Track the workaround explicitly so a genuine kernel height of 2 is
    # not sliced down to a single row after folding.
    padded = 1 == output_sizes[0] and 1 == kernel_sizes[0]
    if padded:
        x_zeros = paddle.zeros_like(x)
        x = paddle.concat([x, x_zeros], axis=1)
        output_sizes = (2, output_sizes[1])
        kernel_sizes = (2, kernel_sizes[1])
    fold = paddle.nn.functional.fold(
        x,
        output_sizes=output_sizes,
        kernel_sizes=kernel_sizes,
        strides=strides, )
    if padded:
        fold = fold[:, :, :1]
    return fold
 class DSPMixin:
    _original_batch_size = None
    _original_num_channels = None
    _padded_signal_length = None
-    # def _preprocess_signal_for_windowing(self, window_duration, hop_duration):
+    def _preprocess_signal_for_windowing(self, window_duration, hop_duration):
-    #     self._original_batch_size = self.batch_size
+        self._original_batch_size = self.batch_size
-    #     self._original_num_channels = self.num_channels
+        self._original_num_channels = self.num_channels
    #     window_length = int(window_duration * self.sample_rate)
    #     hop_length = int(hop_duration * self.sample_rate)
    #     if window_length % hop_length != 0:
    #         factor = window_length // hop_length
    #         window_length = factor * hop_length
    #     self.zero_pad(hop_length, hop_length)
    #     self._padded_signal_length = self.signal_length
    #     return window_length, hop_length
    # def windows(
    #     self, window_duration: float, hop_duration: float, preprocess: bool = True
    # ):
    #     """Generator which yields windows of specified duration from signal with a specified
    #     hop length.
    #     Parameters
    #     ----------
    #     window_duration : float
    #         Duration of every window in seconds.
    #     hop_duration : float
    #         Hop between windows in seconds.
    #     preprocess : bool, optional
    #         Whether to preprocess the signal, so that the first sample is in
    #         the middle of the first window, by default True
    #     Yields
    #     ------
    #     AudioSignal
    #         Each window is returned as an AudioSignal.
    #     """
    #     if preprocess:
    #         window_length, hop_length = self._preprocess_signal_for_windowing(
    #             window_duration, hop_duration
    #         )
-    #     self.audio_data = self.audio_data.reshape(-1, 1, self.signal_length)
+        window_length = int(window_duration * self.sample_rate)
-
+        hop_length = int(hop_duration * self.sample_rate)
-    #     for b in range(self.batch_size):
+
-    #         i = 0
+        if window_length % hop_length != 0:
-    #         start_idx = i * hop_length
+            factor = window_length // hop_length
-    #         while True:
+            window_length = factor * hop_length
-    #             start_idx = i * hop_length
+
-    #             i += 1
+        self.zero_pad(hop_length, hop_length)
-    #             end_idx = start_idx + window_length
+        self._padded_signal_length = self.signal_length
-    #             if end_idx > self.signal_length:
+
-    #                 break
+        return window_length, hop_length
-    #             yield self[b, ..., start_idx:end_idx]
+
-
+    def windows(self,
-    # def collect_windows(
+                window_duration: float,
-    #     self, window_duration: float, hop_duration: float, preprocess: bool = True
+                hop_duration: float,
-    # ):
+                preprocess: bool=True):
-    #     """Reshapes signal into windows of specified duration from signal with a specified
+        """Generator which yields windows of specified duration from signal with a specified
-    #     hop length. Window are placed along the batch dimension. Use with
+        hop length.
-    #     :py:func:`audiotools.core.dsp.DSPMixin.overlap_and_add` to reconstruct the
+
-    #     original signal.
+        Parameters
-
+        ----------
-    #     Parameters
+        window_duration : float
-    #     ----------
+            Duration of every window in seconds.
-    #     window_duration : float
+        hop_duration : float
-    #         Duration of every window in seconds.
+            Hop between windows in seconds.
-    #     hop_duration : float
+        preprocess : bool, optional
-    #         Hop between windows in seconds.
+            Whether to preprocess the signal, so that the first sample is in
-    #     preprocess : bool, optional
+            the middle of the first window, by default True
-    #         Whether to preprocess the signal, so that the first sample is in
+
-    #         the middle of the first window, by default True
+        Yields
-
+        ------
-    #     Returns
+        AudioSignal
-    #     -------
+            Each window is returned as an AudioSignal.
-    #     AudioSignal
+        """
-    #         AudioSignal unfolded with shape ``(nb * nch * num_windows, 1, window_length)``
+        if preprocess:
-    #     """
+            window_length, hop_length = self._preprocess_signal_for_windowing(
-    #     if preprocess:
+                window_duration, hop_duration)
-    #         window_length, hop_length = self._preprocess_signal_for_windowing(
+
-    #             window_duration, hop_duration
+        self.audio_data = self.audio_data.reshape([-1, 1, self.signal_length])
-    #         )
+
        for b in range(self.batch_size):
            i = 0
            start_idx = i * hop_length
            while True:
                start_idx = i * hop_length
                i += 1
                end_idx = start_idx + window_length
                if end_idx > self.signal_length:
                    break
                yield self[b, ..., start_idx:end_idx]
    def collect_windows(self,
                        window_duration: float,
                        hop_duration: float,
                        preprocess: bool=True):
        """Reshapes signal into windows of specified duration from signal with a specified
        hop length. Window are placed along the batch dimension. Use with
        :py:func:`audiotools.core.dsp.DSPMixin.overlap_and_add` to reconstruct the
        original signal.
        Parameters
        ----------
        window_duration : float
            Duration of every window in seconds.
        hop_duration : float
            Hop between windows in seconds.
        preprocess : bool, optional
            Whether to preprocess the signal, so that the first sample is in
            the middle of the first window, by default True
        Returns
        -------
        AudioSignal
            AudioSignal unfolded with shape ``(nb * nch * num_windows, 1, window_length)``
        """
        if preprocess:
            window_length, hop_length = self._preprocess_signal_for_windowing(
                window_duration, hop_duration)
-    #     # self.audio_data: (nb, nch, nt).
+        # self.audio_data: (nb, nch, nt).
        # unfolded = paddle.nn.functional.unfold(
-    #         self.audio_data.reshape(-1, 1, 1, self.signal_length),
+        #     self.audio_data.reshape([-1, 1, 1, self.signal_length]),
-    #         kernel_size=(1, window_length),
+        #     kernel_sizes=(1, window_length),
-    #         stride=(1, hop_length),
+        #     strides=(1, hop_length),
        # )
-    #     # unfolded: (nb * nch, window_length, num_windows).
+        unfolded = _unfold(
-    #     # -> (nb * nch * num_windows, 1, window_length)
+            self.audio_data.reshape([-1, 1, 1, self.signal_length]),
-    #     unfolded = unfolded.permute(0, 2, 1).reshape(-1, 1, window_length)
+            kernel_sizes=(1, window_length),
-    #     self.audio_data = unfolded
+            strides=(1, hop_length), )
-    #     return self
+        # unfolded: (nb * nch, window_length, num_windows).
-
+        # -> (nb * nch * num_windows, 1, window_length)
-    # def overlap_and_add(self, hop_duration: float):
+        unfolded = unfolded.transpose([0, 2, 1]).reshape([-1, 1, window_length])
-    #     """Function which takes a list of windows and overlap adds them into a
+        self.audio_data = unfolded
-    #     signal the same length as ``audio_signal``.
+        return self
-
+
-    #     Parameters
+    def overlap_and_add(self, hop_duration: float):
-    #     ----------
+        """Function which takes a list of windows and overlap adds them into a
-    #     hop_duration : float
+        signal the same length as ``audio_signal``.
-    #         How much to shift for each window
+
-    #         (overlap is window_duration - hop_duration) in seconds.
+        Parameters
-
+        ----------
-    #     Returns
+        hop_duration : float
-    #     -------
+            How much to shift for each window
-    #     AudioSignal
+            (overlap is window_duration - hop_duration) in seconds.
-    #         overlap-and-added signal.
+
-    #     """
+        Returns
-    #     hop_length = int(hop_duration * self.sample_rate)
+        -------
-    #     window_length = self.signal_length
+        AudioSignal
-
+            overlap-and-added signal.
-    #     nb, nch = self._original_batch_size, self._original_num_channels
+        """
-
+        hop_length = int(hop_duration * self.sample_rate)
-    #     unfolded = self.audio_data.reshape(nb * nch, -1, window_length).permute(0, 2, 1)
+        window_length = self.signal_length
        nb, nch = self._original_batch_size, self._original_num_channels
        unfolded = self.audio_data.reshape(
            [nb * nch, -1, window_length]).transpose([0, 2, 1])
        # folded = paddle.nn.functional.fold(
        #     unfolded,
-    #         output_size=(1, self._padded_signal_length),
+        #     output_sizes=(1, self._padded_signal_length),
-    #         kernel_size=(1, window_length),
+        #     kernel_sizes=(1, window_length),
-    #         stride=(1, hop_length),
+        #     strides=(1, hop_length),
        # )
        folded = _fold(
            unfolded,
            output_sizes=(1, self._padded_signal_length),
            kernel_sizes=(1, window_length),
            strides=(1, hop_length), )
-    #     norm = paddle.ones_like(unfolded, device=unfolded.device)
+        norm = paddle.ones_like(unfolded)
        # norm = paddle.nn.functional.fold(
        #     norm,
-    #         output_size=(1, self._padded_signal_length),
+        #     output_sizes=(1, self._padded_signal_length),
-    #         kernel_size=(1, window_length),
+        #     kernel_sizes=(1, window_length),
-    #         stride=(1, hop_length),
+        #     strides=(1, hop_length),
        # )
        norm = _fold(
            norm,
            output_sizes=(1, self._padded_signal_length),
            kernel_sizes=(1, window_length),
            strides=(1, hop_length), )
-    #     folded = folded / norm
+        folded = folded / norm
-    #     folded = folded.reshape(nb, nch, -1)
+        folded = folded.reshape([nb, nch, -1])
-    #     self.audio_data = folded
+        self.audio_data = folded
-    #     self.trim(hop_length, hop_length)
+        self.trim(hop_length, hop_length)
-    #     return self
+        return self
    def low_pass(self,
                 cutoffs: typing.Union[paddle.Tensor, np.ndarray, float],
@ -312,87 +365,92 @@ class DSPMixin:
        self.stft_data = mag * paddle.exp(1j * phase)
        return self
-    # def mask_low_magnitudes(
+    def mask_low_magnitudes(
-    #     self, db_cutoff: typing.Union[paddle.Tensor, np.ndarray, float], val: float = 0.0
+            self,
-    # ):
+            db_cutoff: typing.Union[paddle.Tensor, np.ndarray, float],
-    #     """Mask away magnitudes below a specified threshold, which
+            val: float=0.0):
-    #     can be different for every item in the batch.
+        """Mask away magnitudes below a specified threshold, which
-
+        can be different for every item in the batch.
    #     Parameters
    #     ----------
    #     db_cutoff : typing.Union[paddle.Tensor, np.ndarray, float]
    #         Decibel value for which things below it will be masked away.
    #     val : float, optional
    #         Value to fill in for masked portions, by default 0.0
    #     Returns
    #     -------
    #     AudioSignal
    #         Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
    #         masked audio data.
    #     """
    #     mag = self.magnitude
    #     log_mag = self.log_magnitude()
    #     db_cutoff = util.ensure_tensor(db_cutoff, ndim=mag.ndim)
    #     mask = log_mag < db_cutoff
    #     mag = mag.masked_fill(mask, val)
-    #     self.magnitude = mag
+        Parameters
-    #     return self
+        ----------
-
+        db_cutoff : typing.Union[paddle.Tensor, np.ndarray, float]
-    # def shift_phase(self, shift: typing.Union[paddle.Tensor, np.ndarray, float]):
+            Decibel value for which things below it will be masked away.
-    #     """Shifts the phase by a constant value.
+        val : float, optional
-
+            Value to fill in for masked portions, by default 0.0
-    #     Parameters
+
-    #     ----------
+        Returns
-    #     shift : typing.Union[paddle.Tensor, np.ndarray, float]
+        -------
-    #         What to shift the phase by.
+        AudioSignal
-
+            Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
-    #     Returns
+            masked audio data.
-    #     -------
+        """
-    #     AudioSignal
+        mag = self.magnitude
-    #         Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
+        log_mag = self.log_magnitude()
-    #         masked audio data.
+
-    #     """
+        db_cutoff = util.ensure_tensor(db_cutoff, ndim=mag.ndim)
-    #     shift = util.ensure_tensor(shift, ndim=self.phase.ndim)
+        mask = log_mag < db_cutoff
-    #     self.phase = self.phase + shift
+        mag = mag.masked_fill(mask, val)
-    #     return self
+
-
+        self.magnitude = mag
-    # def corrupt_phase(self, scale: typing.Union[paddle.Tensor, np.ndarray, float]):
+        return self
-    #     """Corrupts the phase randomly by some scaled value.
+
-
+    def shift_phase(self,
-    #     Parameters
+                    shift: typing.Union[paddle.Tensor, np.ndarray, float]):
-    #     ----------
+        """Shifts the phase by a constant value.
-    #     scale : typing.Union[paddle.Tensor, np.ndarray, float]
+
-    #         Standard deviation of noise to add to the phase.
+        Parameters
-
+        ----------
-    #     Returns
+        shift : typing.Union[paddle.Tensor, np.ndarray, float]
-    #     -------
+            What to shift the phase by.
-    #     AudioSignal
+
-    #         Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
+        Returns
-    #         masked audio data.
+        -------
-    #     """
+        AudioSignal
-    #     scale = util.ensure_tensor(scale, ndim=self.phase.ndim)
+            Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
-    #     self.phase = self.phase + scale * paddle.randn_like(self.phase)
+            masked audio data.
-    #     return self
+        """
-
+        shift = util.ensure_tensor(shift, ndim=self.phase.ndim)
-    # def preemphasis(self, coef: float = 0.85):
+        self.phase = self.phase + shift
-    #     """Applies pre-emphasis to audio signal.
+        return self
-
+
-    #     Parameters
+    def corrupt_phase(self,
-    #     ----------
+                      scale: typing.Union[paddle.Tensor, np.ndarray, float]):
-    #     coef : float, optional
+        """Corrupts the phase randomly by some scaled value.
-    #         How much pre-emphasis to apply, lower values do less. 0 does nothing.
+
-    #         by default 0.85
+        Parameters
-
+        ----------
-    #     Returns
+        scale : typing.Union[paddle.Tensor, np.ndarray, float]
-    #     -------
+            Standard deviation of noise to add to the phase.
-    #     AudioSignal
+
-    #         Pre-emphasized signal.
+        Returns
-    #     """
+        -------
-    #     kernel = paddle.to_tensor([1, -coef, 0]).view(1, 1, -1).to(self.device)
+        AudioSignal
-    #     x = self.audio_data.reshape(-1, 1, self.signal_length)
+            Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
-    #     x = paddle.nn.functional.conv1d(x, kernel, padding=1)
+            masked audio data.
-    #     self.audio_data = x.reshape(*self.audio_data.shape)
+        """
-    #     return self
+        scale = util.ensure_tensor(scale, ndim=self.phase.ndim)
        self.phase = self.phase + scale * paddle.randn(
            shape=self.phase.shape, dtype=self.phase.dtype)
        return self
    def preemphasis(self, coef: float=0.85):
        """Filters the audio in place with a pre-emphasis kernel.

        The audio is flattened to one channel per row, convolved with the
        three-tap kernel ``[1, -coef, 0]`` (padding 1, so the length is
        kept) and reshaped back to its original shape.

        Parameters
        ----------
        coef : float, optional
            How much pre-emphasis to apply, lower values do less. 0 does nothing.
            by default 0.85

        Returns
        -------
        AudioSignal
            Pre-emphasized signal (``self``, modified in place).
        """
        taps = paddle.to_tensor([1, -coef, 0]).reshape([1, 1, -1])
        original_shape = self.audio_data.shape
        flat = self.audio_data.reshape([-1, 1, self.signal_length])
        # Cast the audio to the kernel's dtype so conv1d gets matching inputs.
        flat = flat.astype(taps.dtype)
        filtered = paddle.nn.functional.conv1d(flat, taps, padding=1)
        self.audio_data = filtered.reshape(original_shape)
        return self
--- a/audio/audiotools/requirements.txt
+++ b/audio/audiotools/requirements.txt
@ -1,8 +1,8 @@
 flatten_dict
 gradio
 IPython
-librosa
+librosa==0.8.1
+markdown2
-markdown2
+numpy==1.23.5
 pyloudnorm
 pytest
 pytest-xdist
--- a/audio/tests/audiotools/core/test_audio_signal✅.py
+++ b/audio/tests/audiotools/core/test_audio_signal✅.py
@ -13,7 +13,7 @@ from audiotools import AudioSignal
 def test_io():
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(pathlib.Path(audio_path))
    with tempfile.NamedTemporaryFile(suffix=".wav") as f:
@ -61,7 +61,7 @@ def test_io():
    assert signal.audio_data.ndim == 3
    assert paddle.all(signal.samples == signal.audio_data)
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    assert AudioSignal(audio_path).hash() == AudioSignal(audio_path).hash()
    assert AudioSignal(audio_path).hash() != AudioSignal(audio_path).normalize(
        -20).hash()
@ -71,7 +71,7 @@ def test_io():
 def test_copy_and_clone():
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path)
    signal.stft()
    signal.loudness()
@ -369,7 +369,7 @@ def test_trim():
 def test_to_from_ops():
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path)
    signal.stft()
    signal.loudness()
@ -384,7 +384,7 @@ def test_to_from_ops():
 def test_device():
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path)
    signal.to("cpu")
@ -397,7 +397,7 @@ def test_device():
 def test_stft(window_length, hop_length, window_type):
    if hop_length >= window_length:
        hop_length = window_length // 2
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    stft_params = audiotools.STFTParams(
        window_length=window_length,
        hop_length=hop_length,
@ -456,7 +456,7 @@ def test_stft(window_length, hop_length, window_type):
 def test_log_magnitude():
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    for _ in range(10):
        signal = AudioSignal.excerpt(audio_path, duration=5.0)
        magnitude = signal.magnitude.numpy()[0, 0]
@ -474,7 +474,7 @@ def test_log_magnitude():
 def test_mel_spectrogram(n_mels, window_length, hop_length, window_type):
    if hop_length >= window_length:
        hop_length = window_length // 2
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    stft_params = audiotools.STFTParams(
        window_length=window_length,
        hop_length=hop_length,
@ -492,7 +492,7 @@ def test_mel_spectrogram(n_mels, window_length, hop_length, window_type):
 def test_mfcc(n_mfcc, n_mels, window_length, hop_length):
    if hop_length >= window_length:
        hop_length = window_length // 2
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    stft_params = audiotools.STFTParams(
        window_length=window_length, hop_length=hop_length)
    for _stft_params in [None, stft_params]:
--- a/audio/tests/audiotools/core/test_display✅.py
+++ b/audio/tests/audiotools/core/test_display✅.py
@ -0,0 +1,48 @@
 import sys
 from pathlib import Path
 import numpy as np
 sys.path.append("/home/aistudio/PaddleSpeech/audio")
 from audiotools import AudioSignal
 from visualdl import LogWriter
 def test_specshow():
    """Smoke-test ``specshow`` on a silent signal with several option sets."""
    array = np.zeros((1, 16000))
    option_sets = [
        {},
        {"preemphasis": True},
        {"title": "test", "preemphasis": True},
        {"format": False, "preemphasis": True},
        {"format": False, "preemphasis": False, "y_axis": "mel"},
    ]
    # A fresh signal is built for every call, one call per option set.
    for options in option_sets:
        AudioSignal(array, sample_rate=16000).specshow(**options)
 def test_waveplot():
    """Smoke-test ``waveplot`` on one second of silence at 16 kHz."""
    silence = np.zeros((1, 16000))
    signal = AudioSignal(silence, sample_rate=16000)
    signal.waveplot()
 def test_wavespec():
    """Smoke-test ``wavespec`` on one second of silence at 16 kHz."""
    silence = np.zeros((1, 16000))
    signal = AudioSignal(silence, sample_rate=16000)
    signal.wavespec()
 def test_write_audio_to_tb():
    """Write a five-second excerpt and its default plot to a ``LogWriter``."""
    excerpt = AudioSignal("./audio/spk/f10_script4_produced.mp3", duration=5)
    scratch_dir = Path("./scratch")
    scratch_dir.mkdir(parents=True, exist_ok=True)
    log_writer = LogWriter("./scratch/")
    excerpt.write_audio_to_tb("tag", log_writer)
 def test_save_image():
    """Save the default plot of a ten-second excerpt to the scratch folder."""
    excerpt = AudioSignal(
        "./audio/spk/f10_script4_produced.wav", duration=10, offset=10)
    scratch_dir = Path("./scratch")
    scratch_dir.mkdir(parents=True, exist_ok=True)
    excerpt.save_image("./scratch/image.png")
--- a/audio/tests/audiotools/core/test_dsp✅.py
+++ b/audio/tests/audiotools/core/test_dsp✅.py
@ -0,0 +1,178 @@
 import sys
 import numpy as np
 import paddle
 import pytest
 sys.path.append("/home/aistudio/PaddleSpeech/audio")
 from audiotools import AudioSignal
 from audiotools.core.util import sample_from_dist
@pytest.mark.parametrize("window_duration", [0.1, 0.25, 0.5, 1.0])
@pytest.mark.parametrize("sample_rate", [8000, 16000, 22050, 44100])
@pytest.mark.parametrize("duration", [0.5, 1.0, 2.0, 10.0])
 def test_overlap_add(duration, sample_rate, window_duration):
    np.random.seed(0)
    if duration > window_duration:
        spk_signal = AudioSignal.batch([
            AudioSignal.excerpt(
                "./audio/spk/f10_script4_produced.wav", duration=duration)
            for _ in range(16)
        ])
        spk_signal.resample(sample_rate)
        noise = paddle.randn([16, 1, int(duration * sample_rate)])
        nz_signal = AudioSignal(noise, sample_rate=sample_rate)
        def _test(signal):
            hop_duration = window_duration / 2
            windowed_signal = signal.deepcopy().collect_windows(window_duration,
                                                                hop_duration)
            recombined = windowed_signal.overlap_and_add(hop_duration)
            assert recombined == signal
            assert np.allclose(recombined.audio_data, signal.audio_data, 1e-3)
        _test(nz_signal)
        _test(spk_signal)
@pytest.mark.parametrize("window_duration", [0.1, 0.25, 0.5, 1.0])
@pytest.mark.parametrize("sample_rate", [8000, 16000, 22050, 44100])
@pytest.mark.parametrize("duration", [0.5, 1.0, 2.0, 10.0])
def test_inplace_overlap_add(duration, sample_rate, window_duration):
    """``windows()`` must yield the same frames as ``collect_windows()``.

    Checked for a batch of 16 random speech excerpts (resampled to
    ``sample_rate``) and for a batch of 16 Gaussian-noise signals, using a
    hop of half the window. Parameter combinations where the signal is not
    longer than the window are a no-op.
    """
    np.random.seed(0)
    if duration <= window_duration:
        return

    excerpts = [
        AudioSignal.excerpt(
            "./audio/spk/f10_script4_produced.wav", duration=duration)
        for _ in range(16)
    ]
    spk_signal = AudioSignal.batch(excerpts)
    spk_signal.resample(sample_rate)

    n_samples = int(duration * sample_rate)
    noise = paddle.randn([16, 1, n_samples])
    nz_signal = AudioSignal(noise, sample_rate=sample_rate)

    def _check_windows_match(signal):
        hop_duration = window_duration / 2
        unfolded = signal.deepcopy().collect_windows(window_duration,
                                                     hop_duration)
        # Compare in-place with unfold results
        window_iter = signal.deepcopy().windows(window_duration, hop_duration)
        for idx, window in enumerate(window_iter):
            assert np.allclose(window.audio_data, unfolded.audio_data[idx])

    for candidate in (nz_signal, spk_signal):
        _check_windows_match(candidate)
 def test_low_pass():
    """Low-pass filtering of a Hann-windowed 440 Hz tone.

    A 220 Hz cutoff must remove the tone, an 880 Hz cutoff must leave it
    essentially unchanged, and a list of per-item cutoffs must apply
    item-wise to a batch.
    """
    sample_rate = 44100
    tone_hz = 440
    t = paddle.arange(0, 1, 1 / sample_rate)
    tone = paddle.sin(2 * np.pi * tone_hz * t)
    tone = tone * AudioSignal.get_window("hann", tone.shape[-1])
    signal = AudioSignal(tone.unsqueeze(0), sample_rate=sample_rate)

    # Cutoff below the tone: output should be (near) silent.
    removed = signal.deepcopy().low_pass(220)
    assert removed.audio_data.abs().max() < 1e-4

    # Cutoff above the tone: output should match the input.
    kept = signal.deepcopy().low_pass(880)
    assert (kept - signal).audio_data.abs().max() < 1e-3

    # One cutoff per batch item.
    batch = AudioSignal.batch([signal.deepcopy() for _ in range(3)])
    filtered = batch.deepcopy().low_pass([220, 880, 220])
    for idx in (0, 2):
        assert filtered.audio_data[idx].abs().max() < 1e-4
    assert (filtered - batch).audio_data[1].abs().max() < 1e-3
 def test_high_pass():
    """A 220 Hz high-pass must leave a Hann-windowed 440 Hz tone unchanged."""
    sample_rate = 44100
    tone_hz = 440
    t = paddle.arange(0, 1, 1 / sample_rate)
    tone = paddle.sin(2 * np.pi * tone_hz * t)
    tone = tone * AudioSignal.get_window("hann", tone.shape[-1])
    signal = AudioSignal(tone.unsqueeze(0), sample_rate=sample_rate)

    filtered = signal.deepcopy().high_pass(220)
    residual = (signal - filtered).audio_data.abs().max()
    assert residual < 1e-4
 def test_mask_frequencies():
    """Compare ``mask_frequencies`` output against a reference sine mixture.

    Builds a sum of four sines (500, 2000, 8000 and 32000 Hz) at 44.1 kHz,
    masks the 1500-10000 Hz band, and asserts the masked signal's
    ``audio_data`` is close to a reference signal's ``audio_data``.
    """
    sample_rate = 44100
    # Row vector of tone frequencies; ``t`` below is a column vector, so
    # ``t @ fs`` is an outer product with one column per tone.
    fs = paddle.to_tensor([500.0, 2000.0, 8000.0, 32000.0])[None]
    t = paddle.arange(0, 1, 1 / sample_rate)[:, None]
    sine_wave = paddle.sin(2 * np.pi * t @ fs).sum(axis=-1)
    sine_wave = AudioSignal(sine_wave, sample_rate)
    masked_sine_wave = sine_wave.mask_frequencies(fmin_hz=1500, fmax_hz=10000)
    # NOTE(review): ``fs2`` (only the tones outside the masked band) is never
    # used; the reference below is rebuilt from ``fs``, i.e. all four tones.
    # As written the assertion compares the masked signal's audio_data with
    # the unmasked mixture. Presumably ``fs2`` was intended here, probably
    # together with an ``istft()`` on the masked signal -- confirm against
    # ``mask_frequencies`` before changing, since that alters what is tested.
    fs2 = paddle.to_tensor([500.0, 32000.0])[None]
    sine_wave2 = paddle.sin(2 * np.pi * t @ fs).sum(axis=-1)
    sine_wave2 = AudioSignal(sine_wave2, sample_rate)
    assert paddle.allclose(masked_sine_wave.audio_data, sine_wave2.audio_data)
 def test_mask_timesteps():
    """Masking 0.25 s - 0.75 s must silence the samples between 0.3 s and 0.7 s.

    The masked signal is converted back with ``istft()`` and only the
    interior of the masked span (0.3 s < t < 0.7 s) is compared to zero.
    """
    sample_rate = 44100
    tone_hz = 440
    t = paddle.linspace(0, 1, sample_rate)
    tone = AudioSignal(paddle.sin(2 * np.pi * tone_hz * t), sample_rate)

    masked = tone.mask_timesteps(tmin_s=0.25, tmax_s=0.75)
    masked.istft()

    # Boolean selector over time, expanded with two leading axes to index
    # ``audio_data``.
    interior = ((t > 0.3) & (t < 0.7))[None, None]
    silenced = masked.audio_data[interior]
    expected = paddle.zeros_like(masked.audio_data[interior])
    assert paddle.allclose(silenced, expected)
 def test_shift_phase():
    """``shift_phase(pi)`` must match adding pi to ``phase`` by hand.

    Both variants are converted back with ``istft()`` and their
    ``audio_data`` compared.
    """
    sample_rate = 44100
    tone_hz = 440
    t = paddle.linspace(0, 1, sample_rate)
    tone = AudioSignal(paddle.sin(2 * np.pi * tone_hz * t), sample_rate)

    # Take the reference copy before the signal under test is shifted.
    reference = tone.clone()

    shifted = tone.shift_phase(np.pi)
    shifted.istft()

    reference.phase = reference.phase + np.pi
    reference.istft()

    assert paddle.allclose(shifted.audio_data, reference.audio_data)
 def test_corrupt_phase():
    """``corrupt_phase`` must change the phase, by a bounded amount.

    The mean absolute phase difference to an untouched clone must be
    positive, and the standard deviation of the difference, divided by pi,
    must stay below 1.
    """
    sample_rate = 44100
    tone_hz = 440
    t = paddle.linspace(0, 1, sample_rate)
    tone = AudioSignal(paddle.sin(2 * np.pi * tone_hz * t), sample_rate)

    # Take the reference copy before the phase is corrupted.
    reference = tone.clone()

    corrupted = tone.corrupt_phase(scale=np.pi)
    corrupted.istft()

    mean_abs_delta = (reference.phase - corrupted.phase).abs().mean()
    assert mean_abs_delta > 0.0

    normalized_spread = (reference.phase - corrupted.phase).std() / np.pi
    assert normalized_spread < 1.0
 def test_preemphasis():
    """Smoke test: ``specshow`` with and without preemphasis, then ``preemphasis``.

    Nothing is asserted; the test passes if none of the calls raise.
    """
    clip = AudioSignal.excerpt(
        "./audio/spk/f10_script4_produced.wav", duration=5)
    import matplotlib.pyplot as plt

    for use_preemphasis in (False, True):
        clip.specshow(preemphasis=use_preemphasis)
    clip.preemphasis()
--- a/audio/tests/audiotools/core/test_effects✅.py
+++ b/audio/tests/audiotools/core/test_effects✅.py
@ -8,7 +8,7 @@ from audiotools import AudioSignal
 def test_normalize():
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=10)
    signal = signal.normalize()
    assert np.allclose(signal.loudness(), -24, atol=1e-1)
@ -35,7 +35,7 @@ def test_normalize():
 def test_volume_change():
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=10)
    boost = 3
@ -50,10 +50,10 @@ def test_volume_change():
 def test_mix():
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    spk = AudioSignal(audio_path, offset=10, duration=10)
-    audio_path = "tests/audiotools/audio/nz/f5_script2_ipad_balcony1_room_tone.wav"
+    audio_path = "./audio/nz/f5_script2_ipad_balcony1_room_tone.wav"
    nz = AudioSignal(audio_path, offset=10, duration=10)
    spk.deepcopy().mix(nz, snr=-10)
@ -61,10 +61,10 @@ def test_mix():
    assert np.allclose(snr, -10, atol=1)
    # Test in batch
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    spk = AudioSignal(audio_path, offset=10, duration=10)
-    audio_path = "tests/audiotools/audio/nz/f5_script2_ipad_balcony1_room_tone.wav"
+    audio_path = "./audio/nz/f5_script2_ipad_balcony1_room_tone.wav"
    nz = AudioSignal(audio_path, offset=10, duration=10)
    batch_size = 4
@ -86,7 +86,7 @@ def test_mix():
 def test_convolve():
    np.random.seed(6)  # Found a failing seed
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    spk = AudioSignal(audio_path, offset=10, duration=10)
    impulse = np.zeros((1, 16000), dtype="float32")
@ -106,7 +106,7 @@ def test_convolve():
    assert convolved == spk_batch
    # Short duration
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    spk = AudioSignal(audio_path, offset=10, duration=0.1)
    impulse = np.zeros((1, 16000), dtype="float32")
@ -128,14 +128,14 @@ def test_convolve():
 def test_pipeline():
    # An actual IR, no batching
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    spk = AudioSignal(audio_path, offset=10, duration=5)
-    audio_path = "tests/audiotools/audio/ir/h179_Bar_1txts.wav"
+    audio_path = "./audio/ir/h179_Bar_1txts.wav"
    ir = AudioSignal(audio_path)
    spk.deepcopy().convolve(ir)
-    audio_path = "tests/audiotools/audio/nz/f5_script2_ipad_balcony1_room_tone.wav"
+    audio_path = "./audio/nz/f5_script2_ipad_balcony1_room_tone.wav"
    nz = AudioSignal(audio_path, offset=10, duration=5)
    batch_size = 16
@ -146,7 +146,7 @@ def test_pipeline():
 # def test_codec():
-#     audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+#     audio_path = "./audio/spk/f10_script4_produced.wav"
 #     spk = AudioSignal(audio_path, offset=10, duration=10)
 #     with pytest.raises(ValueError):
@ -156,7 +156,7 @@ def test_pipeline():
 #     out = spk.deepcopy().apply_codec("8-bit")
 # def test_pitch_shift():
-#     audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+#     audio_path = "./audio/spk/f10_script4_produced.wav"
 #     spk = AudioSignal(audio_path, offset=10, duration=1)
 #     single = spk.deepcopy().pitch_shift(5)
@ -169,7 +169,7 @@ def test_pipeline():
 #     assert np.allclose(batched[0].audio_data, single[0].audio_data)
 # def test_time_stretch():
-#     audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+#     audio_path = "./audio/spk/f10_script4_produced.wav"
 #     spk = AudioSignal(audio_path, offset=10, duration=1)
 #     single = spk.deepcopy().time_stretch(0.8)
@ -184,7 +184,7 @@ def test_pipeline():
@pytest.mark.parametrize("n_bands", [1, 2, 4, 8, 12, 16])
 def test_mel_filterbank(n_bands):
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    spk = AudioSignal(audio_path, offset=10, duration=1)
    fbank = spk.deepcopy().mel_filterbank(n_bands)
@ -192,8 +192,7 @@ def test_mel_filterbank(n_bands):
    # Check if it works in batches.
    spk_batch = AudioSignal.batch([
-        AudioSignal.excerpt(
+        AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=2)
            "tests/audiotools/audio/spk/f10_script4_produced.wav", duration=2)
        for _ in range(16)
    ])
    fbank = spk_batch.deepcopy().mel_filterbank(n_bands)
@ -203,7 +202,7 @@ def test_mel_filterbank(n_bands):
@pytest.mark.parametrize("n_bands", [1, 2, 4, 8, 12, 16])
 def test_equalizer(n_bands):
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    spk = AudioSignal(audio_path, offset=10, duration=10)
    db = -3 + 1 * paddle.rand([n_bands])
@ -212,15 +211,14 @@ def test_equalizer(n_bands):
    db = -3 + 1 * np.random.rand(n_bands)
    spk.deepcopy().equalizer(db)
-    audio_path = "tests/audiotools/audio/ir/h179_Bar_1txts.wav"
+    audio_path = "./audio/ir/h179_Bar_1txts.wav"
    ir = AudioSignal(audio_path)
    db = -3 + 1 * paddle.rand([n_bands])
    spk.deepcopy().convolve(ir.equalizer(db))
    spk_batch = AudioSignal.batch([
-        AudioSignal.excerpt(
+        AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=2)
            "tests/audiotools/audio/spk/f10_script4_produced.wav", duration=2)
        for _ in range(16)
    ])
@ -231,13 +229,12 @@ def test_equalizer(n_bands):
 def test_clip_distortion():
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    spk = AudioSignal(audio_path, offset=10, duration=2)
    clipped = spk.deepcopy().clip_distortion(0.05)
    spk_batch = AudioSignal.batch([
-        AudioSignal.excerpt(
+        AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=2)
            "tests/audiotools/audio/spk/f10_script4_produced.wav", duration=2)
        for _ in range(16)
    ])
    percs = paddle.to_tensor(np.random.uniform(size=(16, ))).astype("float32")
@ -249,7 +246,7 @@ def test_clip_distortion():
@pytest.mark.parametrize("quant_ch", [2, 4, 8, 16, 32, 64, 128])
 def test_quantization(quant_ch):
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    spk = AudioSignal(audio_path, offset=10, duration=2)
    quantized = spk.deepcopy().quantization(quant_ch)
@ -260,8 +257,7 @@ def test_quantization(quant_ch):
    assert found_quant_ch <= quant_ch
    spk_batch = AudioSignal.batch([
-        AudioSignal.excerpt(
+        AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=2)
            "tests/audiotools/audio/spk/f10_script4_produced.wav", duration=2)
        for _ in range(16)
    ])
@ -277,7 +273,7 @@ def test_quantization(quant_ch):
@pytest.mark.parametrize("quant_ch", [2, 4, 8, 16, 32, 64, 128])
 def test_mulaw_quantization(quant_ch):
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    spk = AudioSignal(audio_path, offset=10, duration=2)
    quantized = spk.deepcopy().mulaw_quantization(quant_ch)
@ -288,8 +284,7 @@ def test_mulaw_quantization(quant_ch):
    assert found_quant_ch <= quant_ch
    spk_batch = AudioSignal.batch([
-        AudioSignal.excerpt(
+        AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=2)
            "tests/audiotools/audio/spk/f10_script4_produced.wav", duration=2)
        for _ in range(16)
    ])
@ -304,7 +299,7 @@ def test_mulaw_quantization(quant_ch):
 def test_impulse_response_augmentation():
-    audio_path = "tests/audiotools/audio/ir/h179_Bar_1txts.wav"
+    audio_path = "./audio/ir/h179_Bar_1txts.wav"
    batch_size = 16
    ir = AudioSignal(audio_path)
    ir_batch = AudioSignal.batch([ir for _ in range(batch_size)])
@ -330,8 +325,8 @@ def test_impulse_response_augmentation():
 def test_apply_ir():
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
-    ir_path = "tests/audiotools/audio/ir/h179_Bar_1txts.wav"
+    ir_path = "./audio/ir/h179_Bar_1txts.wav"
    spk = AudioSignal(audio_path, offset=10, duration=2)
    ir = AudioSignal(ir_path)
--- a/audio/tests/audiotools/core/test_grad✅.py
+++ b/audio/tests/audiotools/core/test_grad✅.py
@ -9,8 +9,8 @@ from audiotools import AudioSignal
 def test_audio_grad():
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
-    ir_path = "tests/audiotools/audio/ir/h179_Bar_1txts.wav"
+    ir_path = "./audio/ir/h179_Bar_1txts.wav"
    def _test_audio_grad(attr: str, target=True, kwargs: dict={}):
        signal = AudioSignal(audio_path)
@ -153,7 +153,7 @@ def test_audio_grad():
 def test_batch_grad():
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path)
    signal.audio_data.stop_gradient = False
--- a/audio/tests/audiotools/core/test_highpass✅.py
+++ b/audio/tests/audiotools/core/test_highpass✅.py
@ -97,75 +97,5 @@ class TestHighPassFilters(_BaseTest):
        self.assertSimilar(y, y2, x)
 # class TestBandPassFilters(_BaseTest):
 #     def setUp(self):
 #         paddle.seed(1234)
 #         random.seed(1234)
 #     def test_keep_or_kill(self):
 #         for _ in range(10):
 #             freq = random.uniform(0.01, 0.4)
 #             sr = 1024
 #             tone = pure_tone(freq * sr, sr=sr, dur=10)
 #             # For this test we accept 5% tolerance in amplitude, or -26dB in power.
 #             tol = 5
 #             zeros = 16
 #             y_pass = filters.bandpass_filter(tone, 0.9 * freq, 1.1 * freq, zeros=zeros)
 #             self.assertSimilar(y_pass, tone, tone, f"freq={freq}, pass", tol=tol)
 #             y_killed = filters.bandpass_filter(tone, 1.1 * freq, 1.2 * freq, zeros=zeros)
 #             self.assertSimilar(y_killed, 0 * tone, tone, f"freq={freq}, kill", tol=tol)
 #             y_killed = filters.bandpass_filter(tone, 0.8 * freq, 0.9 * freq, zeros=zeros)
 #             self.assertSimilar(y_killed, 0 * tone, tone, f"freq={freq}, kill", tol=tol)
 #     def test_fft_nofft(self):
 #         for _ in range(10):
 #             x = paddle.randn([1024])
 #             freq = random.uniform(0.01, 0.5)
 #             freq2 = random.uniform(freq, 0.5)
 #             y_fft = filters.bandpass_filter(x, freq, freq2, fft=True)
 #             y_ref = filters.bandpass_filter(x, freq, freq2, fft=False)
 #             self.assertSimilar(y_fft, y_ref, x, f"freq={freq}", tol=0.01)
 #     def test_constant(self):
 #         x = paddle.ones([2048])
 #         for zeros in [4, 10]:
 #             for freq in [0.01, 0.1]:
 #                 y = filters.bandpass_filter(x, freq, 1.2 * freq, zeros=zeros)
 #                 self.assertLessEqual(y.abs().mean(), 1e-6, (zeros, freq))
 #     def test_stride(self):
 #         x = paddle.randn([1024])
 #         y = filters.bandpass_filter(x, 0.1, 0.2, stride=1)[::3]
 #         y2 = filters.bandpass_filter(x, 0.1, 0.2, stride=3)
 #         self.assertEqual(y.shape, y2.shape)
 #         self.assertSimilar(y, y2, x)
 #         y = filters.bandpass_filter(x, 0.1, 0.2, stride=1, pad=False)[::3]
 #         y2 = filters.bandpass_filter(x, 0.1, 0.2, stride=3, pad=False)
 #         self.assertEqual(y.shape, y2.shape)
 #         self.assertSimilar(y, y2, x)
 #     def test_same_as_highpass(self):
 #         x = paddle.randn([1024])
 #         y_ref = highpass_filter(x, 0.2)
 #         y = filters.bandpass_filter(x, 0.2, 0.5)
 #         self.assertSimilar(y, y_ref, x)
 #     def test_same_as_lowpass(self):
 #         x = paddle.randn([1024])
 #         y_ref = filters.lowpass_filter(x, 0.2)
 #         y = filters.bandpass_filter(x, 0.0, 0.2)
 #         self.assertSimilar(y, y_ref, x)
 if __name__ == "__main__":
    unittest.main()
--- a/audio/tests/audiotools/core/test_loudness✅.py
+++ b/audio/tests/audiotools/core/test_loudness✅.py
@ -13,7 +13,7 @@ ATOL = 1e-1
 def test_loudness_against_pyln():
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=5, duration=10)
    signal_loudness = signal.loudness()
@ -24,7 +24,7 @@ def test_loudness_against_pyln():
 def test_loudness_short():
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=0.25)
    signal_loudness = signal.loudness()
@ -58,7 +58,7 @@ def test_batch_loudness():
 # Tests below are copied from pyloudnorm
 def test_integrated_loudness():
-    data, rate = sf.read("tests/audiotools/audio/loudness/sine_1000.wav")
+    data, rate = sf.read("./audio/loudness/sine_1000.wav")
    meter = Meter(rate)
    loudness = meter(data)
@ -67,8 +67,7 @@ def test_integrated_loudness():
 def test_rel_gate_test():
-    data, rate = sf.read(
+    data, rate = sf.read("./audio/loudness/1770-2_Comp_RelGateTest.wav")
        "tests/audiotools/audio/loudness/1770-2_Comp_RelGateTest.wav")
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -77,8 +76,7 @@ def test_rel_gate_test():
 def test_abs_gate_test():
-    data, rate = sf.read(
+    data, rate = sf.read("./audio/loudness/1770-2_Comp_AbsGateTest.wav")
        "tests/audiotools/audio/loudness/1770-2_Comp_AbsGateTest.wav")
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -87,8 +85,7 @@ def test_abs_gate_test():
 def test_24LKFS_25Hz_2ch():
-    data, rate = sf.read(
+    data, rate = sf.read("./audio/loudness/1770-2_Comp_24LKFS_25Hz_2ch.wav")
        "tests/audiotools/audio/loudness/1770-2_Comp_24LKFS_25Hz_2ch.wav")
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -97,8 +94,7 @@ def test_24LKFS_25Hz_2ch():
 def test_24LKFS_100Hz_2ch():
-    data, rate = sf.read(
+    data, rate = sf.read("./audio/loudness/1770-2_Comp_24LKFS_100Hz_2ch.wav")
        "tests/audiotools/audio/loudness/1770-2_Comp_24LKFS_100Hz_2ch.wav")
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -107,8 +103,7 @@ def test_24LKFS_100Hz_2ch():
 def test_24LKFS_500Hz_2ch():
-    data, rate = sf.read(
+    data, rate = sf.read("./audio/loudness/1770-2_Comp_24LKFS_500Hz_2ch.wav")
        "tests/audiotools/audio/loudness/1770-2_Comp_24LKFS_500Hz_2ch.wav")
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -117,8 +112,7 @@ def test_24LKFS_500Hz_2ch():
 def test_24LKFS_1000Hz_2ch():
-    data, rate = sf.read(
+    data, rate = sf.read("./audio/loudness/1770-2_Comp_24LKFS_1000Hz_2ch.wav")
        "tests/audiotools/audio/loudness/1770-2_Comp_24LKFS_1000Hz_2ch.wav")
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -127,8 +121,7 @@ def test_24LKFS_1000Hz_2ch():
 def test_24LKFS_2000Hz_2ch():
-    data, rate = sf.read(
+    data, rate = sf.read("./audio/loudness/1770-2_Comp_24LKFS_2000Hz_2ch.wav")
        "tests/audiotools/audio/loudness/1770-2_Comp_24LKFS_2000Hz_2ch.wav")
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -137,8 +130,7 @@ def test_24LKFS_2000Hz_2ch():
 def test_24LKFS_10000Hz_2ch():
-    data, rate = sf.read(
+    data, rate = sf.read("./audio/loudness/1770-2_Comp_24LKFS_10000Hz_2ch.wav")
        "tests/audiotools/audio/loudness/1770-2_Comp_24LKFS_10000Hz_2ch.wav")
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -147,8 +139,7 @@ def test_24LKFS_10000Hz_2ch():
 def test_23LKFS_25Hz_2ch():
-    data, rate = sf.read(
+    data, rate = sf.read("./audio/loudness/1770-2_Comp_23LKFS_25Hz_2ch.wav")
        "tests/audiotools/audio/loudness/1770-2_Comp_23LKFS_25Hz_2ch.wav")
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -157,8 +148,7 @@ def test_23LKFS_25Hz_2ch():
 def test_23LKFS_100Hz_2ch():
-    data, rate = sf.read(
+    data, rate = sf.read("./audio/loudness/1770-2_Comp_23LKFS_100Hz_2ch.wav")
        "tests/audiotools/audio/loudness/1770-2_Comp_23LKFS_100Hz_2ch.wav")
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -167,8 +157,7 @@ def test_23LKFS_100Hz_2ch():
 def test_23LKFS_500Hz_2ch():
-    data, rate = sf.read(
+    data, rate = sf.read("./audio/loudness/1770-2_Comp_23LKFS_500Hz_2ch.wav")
        "tests/audiotools/audio/loudness/1770-2_Comp_23LKFS_500Hz_2ch.wav")
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -177,8 +166,7 @@ def test_23LKFS_500Hz_2ch():
 def test_23LKFS_1000Hz_2ch():
-    data, rate = sf.read(
+    data, rate = sf.read("./audio/loudness/1770-2_Comp_23LKFS_1000Hz_2ch.wav")
        "tests/audiotools/audio/loudness/1770-2_Comp_23LKFS_1000Hz_2ch.wav")
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -187,8 +175,7 @@ def test_23LKFS_1000Hz_2ch():
 def test_23LKFS_2000Hz_2ch():
-    data, rate = sf.read(
+    data, rate = sf.read("./audio/loudness/1770-2_Comp_23LKFS_2000Hz_2ch.wav")
        "tests/audiotools/audio/loudness/1770-2_Comp_23LKFS_2000Hz_2ch.wav")
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -197,8 +184,7 @@ def test_23LKFS_2000Hz_2ch():
 def test_23LKFS_10000Hz_2ch():
-    data, rate = sf.read(
+    data, rate = sf.read("./audio/loudness/1770-2_Comp_23LKFS_10000Hz_2ch.wav")
        "tests/audiotools/audio/loudness/1770-2_Comp_23LKFS_10000Hz_2ch.wav")
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -208,7 +194,7 @@ def test_23LKFS_10000Hz_2ch():
 def test_18LKFS_frequency_sweep():
    data, rate = sf.read(
-        "tests/audiotools/audio/loudness/1770-2_Comp_18LKFS_FrequencySweep.wav")
+        "./audio/loudness/1770-2_Comp_18LKFS_FrequencySweep.wav")
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -218,7 +204,7 @@ def test_18LKFS_frequency_sweep():
 def test_conf_stereo_vinL_R_23LKFS():
    data, rate = sf.read(
-        "tests/audiotools/audio/loudness/1770-2_Conf_Stereo_VinL+R-23LKFS.wav")
+        "./audio/loudness/1770-2_Conf_Stereo_VinL+R-23LKFS.wav")
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -228,8 +214,7 @@ def test_conf_stereo_vinL_R_23LKFS():
 def test_conf_monovoice_music_24LKFS():
    data, rate = sf.read(
-        "tests/audiotools/audio/loudness/1770-2_Conf_Mono_Voice+Music-24LKFS.wav"
+        "./audio/loudness/1770-2_Conf_Mono_Voice+Music-24LKFS.wav")
    )
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -239,8 +224,7 @@ def test_conf_monovoice_music_24LKFS():
 def conf_monovoice_music_24LKFS():
    data, rate = sf.read(
-        "tests/audiotools/audio/loudness/1770-2_Conf_Mono_Voice+Music-24LKFS.wav"
+        "./audio/loudness/1770-2_Conf_Mono_Voice+Music-24LKFS.wav")
    )
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -250,8 +234,7 @@ def conf_monovoice_music_24LKFS():
 def test_conf_monovoice_music_23LKFS():
    data, rate = sf.read(
-        "tests/audiotools/audio/loudness/1770-2_Conf_Mono_Voice+Music-23LKFS.wav"
+        "./audio/loudness/1770-2_Conf_Mono_Voice+Music-23LKFS.wav")
    )
    meter = Meter(rate)
    loudness = meter.integrated_loudness(data)
@ -266,7 +249,7 @@ def test_fir_accuracy():
        transforms.HighPass(prob=0.5),
        transforms.Equalizer(prob=0.5),
        prob=0.5, )
-    loader = datasets.AudioLoader(sources=["tests/audiotools/audio/spk.csv"])
+    loader = datasets.AudioLoader(sources=["./audio/spk.csv"])
    dataset = datasets.AudioDataset(
        loader,
        44100,
--- a/audio/tests/audiotools/core/test_util✅.py
+++ b/audio/tests/audiotools/core/test_util✅.py
@ -66,8 +66,7 @@ def test_find_audio():
    assert not audio_files
    # Make sure it works with single audio files
-    audio_files = util.find_audio(
+    audio_files = util.find_audio("./audio/spk//f10_script4_produced.wav")
        "tests/audiotools/audio/spk//f10_script4_produced.wav")
    # Make sure it works with globs
    audio_files = util.find_audio("tests/**/*.wav")
--- a/audio/tests/audiotools/data/test_datasets✅.py
+++ b/audio/tests/audiotools/data/test_datasets✅.py
@ -45,7 +45,7 @@ def test_audio_dataset():
            tfm.Silence(prob=0.5),
        ], )
    loader = audiotools.data.datasets.AudioLoader(
-        sources=["tests/audiotools/audio/spk.csv"],
+        sources=["./audio/spk.csv"],
        transform=transform, )
    dataset = audiotools.data.datasets.AudioDataset(
        loader,
@ -161,11 +161,10 @@ def test_loader_out_of_range():
 def test_dataset_pipeline():
    transform = tfm.Compose([
-        tfm.RoomImpulseResponse(sources=["tests/audiotools/audio/irs.csv"]),
+        tfm.RoomImpulseResponse(sources=["./audio/irs.csv"]),
-        tfm.BackgroundNoise(sources=["tests/audiotools/audio/noises.csv"]),
+        tfm.BackgroundNoise(sources=["./audio/noises.csv"]),
    ])
-    loader = audiotools.data.datasets.AudioLoader(
+    loader = audiotools.data.datasets.AudioLoader(sources=["./audio/spk.csv"])
        sources=["tests/audiotools/audio/spk.csv"])
    dataset = audiotools.data.datasets.AudioDataset(
        loader,
        44100,
--- a/audio/tests/audiotools/data/test_preprocess✅.py
+++ b/audio/tests/audiotools/data/test_preprocess✅.py
@ -12,13 +12,11 @@ from audiotools.data import preprocess
 def test_create_csv():
    with tempfile.NamedTemporaryFile(suffix=".csv") as f:
        preprocess.create_csv(
-            find_audio("./tests/audiotools/audio/spk", ext=["wav"]),
+            find_audio("././audio/spk", ext=["wav"]), f.name, loudness=True)
            f.name,
            loudness=True)
 def test_create_csv_with_empty_rows():
-    audio_files = find_audio("./tests/audiotools/audio/spk", ext=["wav"])
+    audio_files = find_audio("././audio/spk", ext=["wav"])
    audio_files.insert(0, "")
    audio_files.insert(2, "")
--- a/audio/tests/audiotools/data/test_transforms✅.py
+++ b/audio/tests/audiotools/data/test_transforms✅.py
@ -49,13 +49,13 @@ def test_transform(transform_name):
    kwargs = {}
    if transform_name == "BackgroundNoise":
-        kwargs["sources"] = ["tests/audiotools/audio/noises.csv"]
+        kwargs["sources"] = ["./audio/noises.csv"]
    if transform_name == "RoomImpulseResponse":
-        kwargs["sources"] = ["tests/audiotools/audio/irs.csv"]
+        kwargs["sources"] = ["./audio/irs.csv"]
    if transform_name == "CrossTalk":
-        kwargs["sources"] = ["tests/audiotools/audio/spk.csv"]
+        kwargs["sources"] = ["./audio/spk.csv"]
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)
    signal.metadata["loudness"] = AudioSignal(
        audio_path).ffmpeg_loudness().item()
@ -102,12 +102,12 @@ def test_transform(transform_name):
 def test_compose_basic():
    seed = 0
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)
    transform = tfm.Compose(
        [
-            tfm.RoomImpulseResponse(sources=["tests/audiotools/audio/irs.csv"]),
+            tfm.RoomImpulseResponse(sources=["./audio/irs.csv"]),
-            tfm.BackgroundNoise(sources=["tests/audiotools/audio/noises.csv"]),
+            tfm.BackgroundNoise(sources=["./audio/noises.csv"]),
        ], )
    kwargs = transform.instantiate(seed, signal)
@ -143,7 +143,7 @@ def test_compose_with_duplicate_transforms():
    full_mul = np.prod(muls)
    kwargs = transform.instantiate(0)
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)
    output = transform(signal.clone(), **kwargs)
@ -162,7 +162,7 @@ def test_nested_compose():
    full_mul = np.prod(muls)
    kwargs = transform.instantiate(0)
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)
    output = transform(signal.clone(), **kwargs)
@ -176,7 +176,7 @@ def test_compose_filtering():
    transform = tfm.Compose([MulTransform(x, name=str(x)) for x in muls])
    kwargs = transform.instantiate(0)
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)
    for s in range(len(muls)):
@ -199,7 +199,7 @@ def test_sequential_compose():
    full_mul = np.prod(muls)
    kwargs = transform.instantiate(0)
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)
    output = transform(signal.clone(), **kwargs)
@ -210,11 +210,11 @@ def test_sequential_compose():
 def test_choose_basic():
    seed = 0
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)
    transform = tfm.Choose([
-        tfm.RoomImpulseResponse(sources=["tests/audiotools/audio/irs.csv"]),
+        tfm.RoomImpulseResponse(sources=["./audio/irs.csv"]),
-        tfm.BackgroundNoise(sources=["tests/audiotools/audio/noises.csv"]),
+        tfm.BackgroundNoise(sources=["./audio/noises.csv"]),
    ])
    kwargs = transform.instantiate(seed, signal)
@ -251,7 +251,7 @@ def test_choose_basic():
 def test_choose_weighted():
    seed = 0
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    transform = tfm.Choose(
        [
            MulTransform(0.0),
@ -277,7 +277,7 @@ def test_choose_weighted():
 def test_choose_with_compose():
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)
    transform = tfm.Choose([
@ -296,7 +296,7 @@ def test_choose_with_compose():
 def test_repeat():
    seed = 0
-    audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
+    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)
    kwargs = {}
@ -356,7 +356,7 @@ class DummyData(paddle.io.Dataset):
 def test_masking():
-    dataset = DummyData("tests/audiotools/audio/spk/f10_script4_produced.wav")
+    dataset = DummyData("./audio/spk/f10_script4_produced.wav")
    dataloader = paddle.io.DataLoader(
        dataset,
        batch_size=16,
@ -385,8 +385,7 @@ def test_nested_masking():
        ],
        prob=0.9, )
-    loader = audiotools.data.datasets.AudioLoader(
-        sources=["tests/audiotools/audio/spk.csv"])
+    loader = audiotools.data.datasets.AudioLoader(sources=["./audio/spk.csv"])
    dataset = audiotools.data.datasets.AudioDataset(
        loader,
        44100,
--- a/audio/tests/audiotools/test_audiotools.sh
+++ b/audio/tests/audiotools/test_audiotools.sh
@ -0,0 +1,4 @@
 python -m pip install -r ../audiotools/requirements.txt
 # wget -P ./test_data https://paddlespeech.bj.bcebos.com/datasets/unit_test/asr/static_ds2online_inputs.pickle
 # wget 
 find . -name "*✅.py" | xargs python -m pytest
--- a/audio/tests/audiotools/test_post✅.py
+++ b/audio/tests/audiotools/test_post✅.py
@ -13,8 +13,7 @@ def test_audio_table():
    audio_dict = {}
    audio_dict["inputs"] = [
-        AudioSignal.excerpt(
-            "tests/audiotools/audio/spk/f10_script4_produced.wav", duration=5)
+        AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=5)
        for _ in range(3)
    ]
    audio_dict["outputs"] = []
--- a/tests/unit/ci.sh
+++ b/tests/unit/ci.sh
@ -31,6 +31,13 @@ function main(){
  cd ${speech_ci_path}/server/offline
  bash test_server_client.sh
  echo "End server"
  echo "Start testing audiotools"
  cd ${speech_ci_path}/../../audio/tests/audiotools
  bash test_audiotools.sh
  echo "End testing audiotools"
 }
 main
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@ -115,3 +115,4 @@ paddlespeech whisper --task translate --input ./zh.wav
 paddlespeech whisper --lang en --size base --task transcribe  --input ./en.wav
 echo -e "\033[32mTest success !!!\033[0m"
`@ -115,3 +115,4 @@ paddlespeech whisper --task translate --input ./zh.wav`
	`paddlespeech whisper --lang en --size base --task transcribe --input ./en.wav`	`paddlespeech whisper --lang en --size base --task transcribe --input ./en.wav`

	`echo -e "\033[32mTest success !!!\033[0m"`	`echo -e "\033[32mTest success !!!\033[0m"`