add AudioSignal && util

10 months ago · 9e7dca2bc5
parent afa9466c89
commit 9e7dca2bc5
4 changed files with 2694 additions and 0 deletions
--- a/audio/audiotools/audio_signal.py
+++ b/audio/audiotools/audio_signal.py
--- a/audio/audiotools/requirements.txt
+++ b/audio/audiotools/requirements.txt
@ -0,0 +1,5 @@
+soundfile
+librosa
+scipy
+rich
+flatten_dict
--- a/audio/audiotools/resample.py
+++ b/audio/audiotools/resample.py
@ -0,0 +1,240 @@
+import inspect
+from typing import Optional, Sequence
+import paddle
+import paddle.nn.functional as F
+import math
+
+
+def simple_repr(
+    obj, attrs: Optional[Sequence[str]] = None, overrides: Optional[dict] = None
+):
+    """
+    Return a simple representation string for `obj`.
+
+    Args:
+        obj: object to describe. The parameters of its class constructor, as
+            reported by `inspect.signature`, decide which attributes are shown.
+        attrs (sequence of str, optional): attribute names to include. When None,
+            every constructor parameter name is considered.
+        overrides (dict, optional): values that take precedence over the attributes
+            read from `obj`. It is only read, never mutated.
+
+    Returns:
+        str: `ClassName(attr=value,...)`. A name found neither in `overrides` nor on
+        `obj` is skipped, and so is a constructor parameter whose value equals its
+        declared default.
+    """
+    # A fresh dict per call avoids sharing one mutable default between calls.
+    if overrides is None:
+        overrides = {}
+    params = inspect.signature(obj.__class__).parameters
+    attrs_repr = []
+    if attrs is None:
+        attrs = list(params.keys())
+    for attr in attrs:
+        if attr in overrides:
+            value = overrides[attr]
+        elif hasattr(obj, attr):
+            value = getattr(obj, attr)
+        else:
+            continue
+        param = params.get(attr)
+        # Names that are not constructor parameters are always shown; constructor
+        # parameters are shown only when required or changed from their default.
+        if (
+            param is None
+            or param.default is inspect.Parameter.empty
+            or value != param.default
+        ):
+            attrs_repr.append(f"{attr}={value}")
+    return f"{obj.__class__.__name__}({','.join(attrs_repr)})"
+
+
+def sinc(x: paddle.Tensor):
+    """
+    Unnormalized sinc function, i.e. sin(x) / x, with the value 1 at x == 0.
+
+    __Warning__: the input is not multiplied by `pi`!
+    """
+    # Constant used where x is exactly zero, on the same dtype/place as the input.
+    one = paddle.to_tensor(1.0, dtype=x.dtype, place=x.place)
+    ratio = paddle.sin(x) / x
+    return paddle.where(x == 0, one, ratio)
+
+
+class ResampleFrac(paddle.nn.Layer):
+    """
+    Resampling from the sample rate `old_sr` to `new_sr`.
+    """
+
+    def __init__(
+        self, old_sr: int, new_sr: int, zeros: int = 24, rolloff: float = 0.945
+    ):
+        """
+        Args:
+            old_sr (int): sample rate of the input signal x.
+            new_sr (int): sample rate of the output.
+            zeros (int): number of zero crossing to keep in the sinc filter.
+            rolloff (float): use a lowpass filter that is `rolloff * new_sr / 2`,
+                to ensure sufficient margin due to the imperfection of the FIR filter used.
+                Lowering this value will reduce anti-aliasing, but will reduce some of the
+                highest frequencies.
+
+        Raises:
+            ValueError: if `old_sr` or `new_sr` is not an `int`.
+
+        Shape:
+
+            - Input: `[*, T]`
+            - Output: `[*, T']` with `T' = int(new_sr * T / old_sr)`
+
+
+        .. caution::
+            After dividing `old_sr` and `new_sr` by their GCD, both should be small
+            for this implementation to be fast.
+
+        >>> import paddle
+        >>> resample = ResampleFrac(4, 5)
+        >>> x = paddle.randn([1000])
+        >>> print(len(resample(x)))
+        1250
+        """
+        super().__init__()
+        if not isinstance(old_sr, int) or not isinstance(new_sr, int):
+            raise ValueError("old_sr and new_sr should be integers")
+        # Only the ratio matters, so both rates are stored reduced by their GCD.
+        gcd = math.gcd(old_sr, new_sr)
+        self.old_sr = old_sr // gcd
+        self.new_sr = new_sr // gcd
+        self.zeros = zeros
+        self.rolloff = rolloff
+
+        self._init_kernels()
+
+    def _init_kernels(self):
+        """Precompute the `new_sr` FIR kernels and store them as the `kernel` buffer.
+
+        Nothing is created when the reduced rates are equal, since `forward` is
+        then the identity.
+        """
+        if self.old_sr == self.new_sr:
+            return
+
+        kernels = []
+        sr = min(self.new_sr, self.old_sr)
+        # rolloff will perform antialiasing filtering by removing the highest frequencies.
+        # At first I thought I only needed this when downsampling, but when upsampling
+        # you will get edge artifacts without this, the edge is equivalent to zero padding,
+        # which will add high freq artifacts.
+        sr *= self.rolloff
+
+        # The key idea of the algorithm is that x(t) can be exactly reconstructed from x[i] (tensor)
+        # using the sinc interpolation formula:
+        #   x(t) = sum_i x[i] sinc(pi * old_sr * (i / old_sr - t))
+        # We can then sample the function x(t) with a different sample rate:
+        #    y[j] = x(j / new_sr)
+        # or,
+        #    y[j] = sum_i x[i] sinc(pi * old_sr * (i / old_sr - j / new_sr))
+
+        # We see here that y[j] is the convolution of x[i] with a specific filter, for which
+        # we take an FIR approximation, stopping when we see at least `zeros` zeros crossing.
+        # But y[j+1] is going to have a different set of weights and so on, until y[j + new_sr].
+        # Indeed:
+        # y[j + new_sr] = sum_i x[i] sinc(pi * old_sr * ((i / old_sr - (j + new_sr) / new_sr))
+        #               = sum_i x[i] sinc(pi * old_sr * ((i - old_sr) / old_sr - j / new_sr))
+        #               = sum_i x[i + old_sr] sinc(pi * old_sr * (i / old_sr - j / new_sr))
+        # so y[j+new_sr] uses the same filter as y[j], but on a shifted version of x by `old_sr`.
+        # This will explain the F.conv1d after, with a stride of old_sr.
+        self._width = math.ceil(self.zeros * self.old_sr / sr)
+        # If old_sr is still big after GCD reduction, most filters will be very unbalanced, i.e.,
+        # they will have a lot of almost zero values to the left or to the right...
+        # There is probably a way to evaluate those filters more efficiently, but this is kept for
+        # future work.
+        idx = paddle.arange(
+            -self._width, self._width + self.old_sr, dtype="float32"
+        )
+        for i in range(self.new_sr):
+            t = (-i / self.new_sr + idx / self.old_sr) * sr
+            t = paddle.clip(t, -self.zeros, self.zeros)
+            t *= math.pi
+            window = paddle.cos(t / self.zeros / 2) ** 2
+            kernel = sinc(t) * window
+            # Renormalize kernel to ensure a constant signal is preserved.
+            kernel = kernel / kernel.sum()
+            kernels.append(kernel)
+
+        _kernel = paddle.stack(kernels).reshape([self.new_sr, 1, -1])
+        # The filter bank is a fixed constant, not a learnable weight: keep it as a
+        # buffer so it follows the layer (device moves, state_dict) without being
+        # exposed to optimizers through `parameters()`.
+        self.register_buffer("kernel", _kernel)
+
+    def forward(
+        self,
+        x: paddle.Tensor,
+        output_length: Optional[int] = None,
+        full: bool = False,
+    ):
+        """
+        Resample x.
+        Args:
+            x (Tensor): signal to resample, time should be the last dimension
+            output_length (None or int): This can be set to the desired output length
+                (last dimension). Allowed values are between 0 and
+                ceil(length * new_sr / old_sr). When None (default) is specified, the
+                floored output length will be used. In order to select the largest possible
+                size, use the `full` argument.
+            full (bool): return the longest possible output from the input. This can be useful
+                if you chain resampling operations, and want to give the `output_length` only
+                for the last one, while passing `full=True` to all the other ones.
+
+        Returns:
+            Tensor: resampled signal with the leading dimensions of `x`; `x` itself
+            is returned untouched when the reduced rates are equal.
+
+        Raises:
+            ValueError: if `output_length` is out of range, or if it is given
+                together with `full=True`.
+        """
+        if self.old_sr == self.new_sr:
+            return x
+        shape = x.shape
+        length = x.shape[-1]
+        x = x.reshape([-1, length])
+        x = F.pad(
+            x.unsqueeze(1),
+            [self._width, self._width + self.old_sr],
+            mode="replicate",
+            data_format="NCL",
+        )
+        # One output channel per kernel phase; interleaving the channels along time
+        # (transpose + reshape) yields the resampled signal.
+        ys = F.conv1d(x, self.kernel, stride=self.old_sr, data_format="NCL")
+        y = ys.transpose([0, 2, 1]).reshape(list(shape[:-1]) + [-1])
+
+        # Exact integer floor / ceil of new_sr * length / old_sr. Going through a
+        # float32 tensor would round the length for long signals.
+        scaled_length = self.new_sr * length
+        default_output_length = scaled_length // self.old_sr
+        max_output_length = -(-scaled_length // self.old_sr)
+
+        if output_length is None:
+            applied_output_length = (
+                max_output_length if full else default_output_length
+            )
+        elif output_length < 0 or output_length > max_output_length:
+            raise ValueError(
+                f"output_length must be between 0 and {max_output_length}"
+            )
+        elif full:
+            raise ValueError("You cannot pass both full=True and output_length")
+        else:
+            applied_output_length = int(output_length)
+        return y[..., :applied_output_length]
+
+    def __repr__(self):
+        return simple_repr(self)
+
+
+def resample_frac(
+    x: paddle.Tensor,
+    old_sr: int,
+    new_sr: int,
+    zeros: int = 24,
+    rolloff: float = 0.945,
+    output_length: Optional[int] = None,
+    full: bool = False,
+):
+    """
+    Functional version of `ResampleFrac`, refer to its documentation for more information.
+
+    .. warning::
+        If you repeatedly call this function with the same sample rates, the
+        resampling kernel is recomputed every time. For best performance, create
+        and cache an instance of `ResampleFrac` instead.
+    """
+    # A throw-away layer is built for this single call.
+    resampler = ResampleFrac(old_sr, new_sr, zeros, rolloff)
+    return resampler(x, output_length, full)
+
+
+if __name__ == "__main__":
+    # Smoke test: resample 1000 random samples with a 4 -> 5 rate ratio and
+    # print the length of the result.
+    print(len(ResampleFrac(4, 5)(paddle.randn([1000]))))
--- a/audio/audiotools/util.py
+++ b/audio/audiotools/util.py
@ -0,0 +1,669 @@
+import csv
+import glob
+import math
+import numbers
+import os
+import random
+import typing
+import soundfile
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, Optional, List
+
+import numpy as np
+import paddle
+from flatten_dict import flatten
+from flatten_dict import unflatten
+
+
+@dataclass
+class Info:
+    """Basic metadata of an audio file: its sample rate and its number of frames."""
+
+    # Sample rate as reported for the file.
+    sample_rate: float
+    # Total number of frames in the file.
+    num_frames: int
+
+    @property
+    def duration(self) -> float:
+        """Number of frames divided by the sample rate (seconds when the rate is in Hz)."""
+        frames, rate = self.num_frames, self.sample_rate
+        return frames / rate
+
+
+def info(audio_path: str):
+    """Read the sample rate and frame count of an audio file.
+
+    Parameters
+    ----------
+    audio_path : str
+        Path to audio file.
+
+    Returns
+    -------
+    Info
+        Metadata built from what ``soundfile.info`` reports for the file.
+    """
+    meta = soundfile.info(str(audio_path))
+    return Info(sample_rate=meta.samplerate, num_frames=meta.frames)
+
+
+def ensure_tensor(
+    x: typing.Union[np.ndarray, paddle.Tensor, float, int],
+    ndim: int = None,
+    batch_size: int = None,
+):
+    """Convert ``x`` to a tensor, optionally padding its rank and
+    expanding its first dimension.
+
+    Parameters
+    ----------
+    x : typing.Union[np.ndarray, paddle.Tensor, float, int]
+        Data that will become a tensor on its way out.
+    ndim : int, optional
+        How many dimensions should be in the output, by default None.
+        Missing dimensions are appended at the end; ``x`` must not
+        already have more dimensions than this.
+    batch_size : int, optional
+        The batch size of the output, by default None. When the first
+        dimension differs, the tensor is expanded along it.
+
+    Returns
+    -------
+    paddle.Tensor
+        Modified version of ``x`` as a tensor.
+    """
+    tensor = x if paddle.is_tensor(x) else paddle.to_tensor(x)
+
+    if ndim is not None:
+        assert tensor.ndim <= ndim
+        # Append trailing singleton axes until the requested rank is reached.
+        for _ in range(ndim - tensor.ndim):
+            tensor = tensor.unsqueeze(-1)
+
+    if batch_size is not None and tensor.shape[0] != batch_size:
+        target_shape = list(tensor.shape)
+        target_shape[0] = batch_size
+        tensor = paddle.expand(tensor, target_shape)
+
+    return tensor
+
+
+def _get_value(other):
+    """Return ``other.audio_data`` for an ``AudioSignal``; any other value is returned as is."""
+    # Imported inside the function rather than at module level (presumably to
+    # avoid a circular import with audio_signal; confirm).
+    from audio_signal import AudioSignal
+
+    return other.audio_data if isinstance(other, AudioSignal) else other
+
+
+def hz_to_bin(hz: paddle.Tensor, n_fft: int, sample_rate: int):
+    """Closest frequency bin given a frequency, number
+    of bins, and a sampling rate.
+
+    Parameters
+    ----------
+    hz : paddle.Tensor
+       Tensor of frequencies in Hz. Values above ``sample_rate / 2`` are
+       treated as ``sample_rate / 2``; the input tensor is not modified.
+    n_fft : int
+        Number of FFT bins.
+    sample_rate : int
+        Sample rate of audio.
+
+    Returns
+    -------
+    paddle.Tensor
+        Closest bins to the data, with the same shape as ``hz``.
+    """
+    shape = hz.shape
+    nyquist = sample_rate / 2
+    freqs = paddle.linspace(0, nyquist, 2 + n_fft // 2)
+    # Clamp out of place instead of assigning through a boolean mask, so the
+    # caller's tensor is left untouched.
+    hz = paddle.clip(hz.flatten(), max=nyquist)
+
+    # distance[i, j] = |hz[j] - freqs[i]|; the row index of the column-wise
+    # minimum is the closest bin of each frequency.
+    distance = (hz[None, :] - freqs[:, None]).abs()
+    closest_bins = paddle.argmin(distance, axis=0)
+
+    return closest_bins.reshape(shape)
+
+
+def random_state(seed: typing.Union[int, np.random.RandomState]):
+    """
+    Coerce ``seed`` into a np.random.RandomState instance.
+
+    Parameters
+    ----------
+    seed : typing.Union[int, np.random.RandomState] or None
+        If seed is None (or the ``np.random`` module), return the RandomState
+        singleton used by np.random.
+        If seed is an int, return a new RandomState instance seeded with seed.
+        If seed is already a RandomState instance, return it.
+        Otherwise raise ValueError.
+
+    Returns
+    -------
+    np.random.RandomState
+        Random state object.
+
+    Raises
+    ------
+    ValueError
+        If seed is not valid, an error is thrown.
+    """
+    if seed is None or seed is np.random:
+        # Global generator backing the module-level np.random functions.
+        return np.random.mtrand._rand
+    if isinstance(seed, (numbers.Integral, np.integer, int)):
+        return np.random.RandomState(seed)
+    if isinstance(seed, np.random.RandomState):
+        return seed
+
+    template = "%r cannot be used to seed a numpy.random.RandomState instance"
+    raise ValueError(template % seed)
+
+
+def seed(random_seed, set_cudnn=False):
+    """
+    Seeds all random states with the same random seed
+    for reproducibility. Seeds ``numpy``, ``random`` and ``paddle``
+    random generators.
+    For full reproducibility, cuDNN must additionally be put into
+    deterministic mode. To do this, ``set_cudnn`` must be True. It
+    defaults to False, since setting it to True results in a
+    performance hit.
+
+    Args:
+        random_seed (int): integer corresponding to random seed to
+        use.
+        set_cudnn (bool): Whether or not to set cudnn into deterministic
+        mode and turn off exhaustive algorithm search. Defaults to False.
+    """
+
+    # paddle exposes its global seed through ``paddle.seed``.
+    paddle.seed(random_seed)
+    np.random.seed(random_seed)
+    random.seed(random_seed)
+
+    if set_cudnn:
+        # cuDNN behaviour is controlled through global flags in paddle.
+        paddle.set_flags(
+            {
+                "FLAGS_cudnn_deterministic": True,
+                "FLAGS_cudnn_exhaustive_search": False,
+            }
+        )
+
+
+@contextmanager
+def _close_temp_files(tmpfiles: list):
+    """Utility function for creating a context and closing all temporary files
+    once the context is exited. For correct functionality, all temporary file
+    handles created inside the context must be appended to the ```tmpfiles```
+    list.
+
+    Cleanup runs whether the context body finishes normally or raises; an
+    exception raised by the body is propagated after cleanup.
+
+    This function is taken wholesale from Scaper.
+
+    Parameters
+    ----------
+    tmpfiles : list
+        List of temporary file handles
+    """
+
+    def _close():
+        for t in tmpfiles:
+            # Best-effort cleanup: closing and unlinking are attempted
+            # independently, so a failing close() cannot leave the file behind,
+            # and one bad handle does not stop the others from being cleaned.
+            try:
+                t.close()
+            except Exception:
+                pass
+            try:
+                os.unlink(t.name)
+            except Exception:
+                pass
+
+    try:
+        yield
+    finally:
+        _close()
+
+
+AUDIO_EXTENSIONS = [".wav", ".flac", ".mp3", ".mp4"]
+
+
+def find_audio(folder: str, ext: List[str] = AUDIO_EXTENSIONS):
+    """Collects audio files below a directory, searching recursively.
+    Returns a list.
+
+    Parameters
+    ----------
+    folder : str
+        Folder to look for audio files in, recursively. A path that
+        itself ends with one of the extensions is treated as a single
+        file, or as a glob pattern when it contains ``*``.
+    ext : List[str], optional
+        Extensions to look for, including the leading ``.``, by default
+        ``['.wav', '.flac', '.mp3', '.mp4']``.
+    """
+    root = Path(folder)
+    as_text = str(root)
+
+    # The caller handed us an audio file (or a glob over audio files)
+    # rather than a directory.
+    if as_text.endswith(tuple(ext)):
+        if "*" not in as_text:
+            return [root]
+        # Expand the pattern; the results of this branch are plain strings.
+        return glob.glob(as_text, recursive="**" in as_text)
+
+    # Directory case: one recursive search per extension, in ``ext`` order.
+    return [match for suffix in ext for match in root.glob(f"**/*{suffix}")]
+
+
+def read_sources(
+    sources: List[str],
+    remove_empty: bool = True,
+    relative_path: str = "",
+    ext: List[str] = AUDIO_EXTENSIONS,
+):
+    """Reads audio sources that can either be folders
+    full of audio files, or CSV files that contain paths
+    to audio files. CSV files that adhere to the expected
+    format can be generated by
+    :py:func:`audiotools.data.preprocess.create_csv`.
+
+    Parameters
+    ----------
+    sources : List[str]
+        List of audio sources to be converted into a
+        list of lists of audio files.
+    remove_empty : bool, optional
+        Whether or not to remove rows with an empty "path"
+        from each CSV file, by default True.
+    relative_path : str, optional
+        Path joined in front of every non-empty path that is read,
+        by default "".
+    ext : List[str], optional
+        Extensions passed to :py:func:`find_audio` for sources that
+        are not CSV files, by default ``AUDIO_EXTENSIONS``.
+
+    Returns
+    -------
+    list
+        One list per source, each holding row dictionaries (with at
+        least a "path" key) sorted by "path".
+    """
+    files = []
+    relative_path = Path(relative_path)
+    for source in sources:
+        source = str(source)
+        _files = []
+        if source.endswith(".csv"):
+            # newline="" lets the csv module handle line endings itself, which
+            # it needs to parse quoted fields containing newlines correctly.
+            with open(source, "r", newline="") as f:
+                reader = csv.DictReader(f)
+                for x in reader:
+                    if remove_empty and x["path"] == "":
+                        continue
+                    # Empty paths that are kept stay empty rather than
+                    # turning into ``relative_path`` itself.
+                    if x["path"] != "":
+                        x["path"] = str(relative_path / x["path"])
+                    _files.append(x)
+        else:
+            for x in find_audio(source, ext=ext):
+                x = str(relative_path / x)
+                _files.append({"path": x})
+        files.append(sorted(_files, key=lambda x: x["path"]))
+    return files
+
+
+def choose_from_list_of_lists(
+    state: np.random.RandomState, list_of_lists: list, p: float = None
+):
+    """Pick one list at random, then one item from that list.
+
+    Parameters
+    ----------
+    state : np.random.RandomState
+        Random state to use when choosing an item.
+    list_of_lists : list
+        A list of lists from which items will be drawn.
+    p : float, optional
+        Probabilities of each list, forwarded to ``state.choice``,
+        by default None
+
+    Returns
+    -------
+    tuple
+        ``(item, source_idx, item_idx)``: the chosen item, the index of
+        the list it came from, and its index inside that list.
+    """
+    candidates = list(range(len(list_of_lists)))
+    source_idx = state.choice(candidates, p=p)
+    chosen_list = list_of_lists[source_idx]
+    item_idx = state.randint(len(chosen_list))
+    return chosen_list[item_idx], source_idx, item_idx
+
+
+@contextmanager
+def chdir(newdir: typing.Union[Path, str]):
+    """
+    Context manager that temporarily changes the working directory.
+    The directory that was current on entry is restored on exit,
+    whether or not the body raised. Useful for when you want to use
+    relative paths to different runs.
+
+    Parameters
+    ----------
+    newdir : typing.Union[Path, str]
+        Directory to switch to.
+    """
+    previous_dir = os.getcwd()
+    try:
+        os.chdir(newdir)
+        yield
+    finally:
+        # Always go back to where we started.
+        os.chdir(previous_dir)
+
+
+def _to_device(item, device: str):
+    """Return ``item`` moved to ``device`` when it has a callable ``to``; otherwise return it unchanged."""
+    mover = getattr(item, "to", None)
+    return mover(device) if callable(mover) else item
+
+
+def prepare_batch(
+    batch: typing.Union[dict, list, paddle.Tensor], device: str = "cpu"
+):
+    """Moves items in a batch (typically generated by a DataLoader as a list
+    or a dict) to the specified device. This works even if dictionaries
+    are nested.
+
+    Items without a ``to`` method (numbers, strings, ...) are left untouched.
+    Errors raised by an item's own ``to`` are propagated instead of being
+    silently ignored, so a failed transfer is not mistaken for success.
+
+    Parameters
+    ----------
+    batch : typing.Union[dict, list, paddle.Tensor]
+        Batch, typically generated by a dataloader, that will be moved to
+        the device. A list is updated in place; any other type of batch
+        is returned unchanged.
+    device : str, optional
+        Device to move batch to, by default "cpu"
+
+    Returns
+    -------
+    typing.Union[dict, list, paddle.Tensor]
+        Batch with all values moved to the specified device.
+    """
+    if isinstance(batch, dict):
+        # Flatten so nested dictionaries are handled without recursion, then
+        # rebuild the original nesting.
+        batch = flatten(batch)
+        for key, val in batch.items():
+            batch[key] = _to_device(val, device)
+        batch = unflatten(batch)
+    elif paddle.is_tensor(batch):
+        batch = batch.to(device)
+    elif isinstance(batch, list):
+        for i, item in enumerate(batch):
+            batch[i] = _to_device(item, device)
+    return batch
+
+
+def sample_from_dist(dist_tuple: tuple, state: np.random.RandomState = None):
+    """Draws a sample from a distribution described by a tuple. The first
+    item in the tuple is the distribution type, and the rest of the
+    items are arguments to that distribution. The distribution function
+    is looked up by name on the ``np.random.RandomState`` object; the
+    special type ``"const"`` simply returns its single argument.
+
+    Parameters
+    ----------
+    dist_tuple : tuple
+        Distribution tuple
+    state : np.random.RandomState, optional
+        Random state, or seed to use, by default None
+
+    Returns
+    -------
+    typing.Union[float, int, str]
+        Draw from the distribution.
+
+    Examples
+    --------
+    Sample from a uniform distribution:
+
+    >>> dist_tuple = ("uniform", 0, 1)
+    >>> sample_from_dist(dist_tuple)
+
+    Sample from a constant distribution:
+
+    >>> dist_tuple = ("const", 0)
+    >>> sample_from_dist(dist_tuple)
+
+    Sample from a normal distribution:
+
+    >>> dist_tuple = ("normal", 0, 0.5)
+    >>> sample_from_dist(dist_tuple)
+
+    """
+    kind = dist_tuple[0]
+    if kind == "const":
+        # No randomness involved: hand back the constant itself.
+        return dist_tuple[1]
+    rng = random_state(state)
+    sampler = getattr(rng, kind)
+    return sampler(*dist_tuple[1:])
+
+
+def collate(list_of_dicts: list, n_splits: int = None):
+    """Collates a list of dictionaries (e.g. as returned by a
+    dataloader) into a dictionary with batched values. This routine
+    uses the default paddle collate function for everything
+    except AudioSignal objects, which are handled by the
+    :py:func:`audiotools.core.audio_signal.AudioSignal.batch`
+    function.
+
+    This function takes n_splits to enable splitting a batch
+    into multiple sub-batches for the purposes of gradient accumulation,
+    etc.
+
+    Parameters
+    ----------
+    list_of_dicts : list
+        List of dictionaries to be collated.
+    n_splits : int
+        Number of splits to make when creating the batches (split into
+        sub-batches). Useful for things like gradient accumulation.
+
+    Returns
+    -------
+    dict
+        Dictionary containing batched data. When ``n_splits`` is given,
+        a list of such dictionaries (one per sub-batch) is returned
+        instead.
+    """
+
+    from . import AudioSignal
+
+    # paddle's default collate function is ``default_collate_fn``; its module
+    # moved between releases (assumed locations; confirm against the
+    # installed paddle version).
+    try:
+        from paddle.io.dataloader.collate import default_collate_fn
+    except ImportError:
+        from paddle.fluid.dataloader.collate import default_collate_fn
+
+    batches = []
+    list_len = len(list_of_dicts)
+
+    return_list = n_splits is not None
+    n_splits = 1 if n_splits is None else n_splits
+    n_items = int(math.ceil(list_len / n_splits))
+
+    for i in range(0, list_len, n_items):
+        # Flatten the dictionaries to avoid recursion.
+        list_of_dicts_ = [flatten(d) for d in list_of_dicts[i : i + n_items]]
+        # The keys of the first item define the keys of the whole sub-batch.
+        dict_of_lists = {
+            k: [dic[k] for dic in list_of_dicts_] for k in list_of_dicts_[0]
+        }
+
+        batch = {}
+        for k, v in dict_of_lists.items():
+            if all(isinstance(s, AudioSignal) for s in v):
+                batch[k] = AudioSignal.batch(v, pad_signals=True)
+            else:
+                batch[k] = default_collate_fn(v)
+        batches.append(unflatten(batch))
+
+    return batches if return_list else batches[0]
+
+
+BASE_SIZE = 864
+DEFAULT_FIG_SIZE = (9, 3)
+
+
+def format_figure(
+    fig_size: tuple = None,
+    title: str = None,
+    fig=None,
+    format_axes: bool = True,
+    format: bool = True,
+    font_color: str = "white",
+):
+    """Prettifies the spectrogram and waveform plots. A title
+    can be inset into the top right corner, and the axes can be
+    inset into the figure, allowing the data to take up the entire
+    image. Used in
+
+    - :py:func:`audiotools.core.display.DisplayMixin.specshow`
+    - :py:func:`audiotools.core.display.DisplayMixin.waveplot`
+    - :py:func:`audiotools.core.display.DisplayMixin.wavespec`
+
+    Parameters
+    ----------
+    fig_size : tuple, optional
+        Size of figure, by default (9, 3)
+    title : str, optional
+        Title to inset in top right, by default None
+    fig : matplotlib.figure.Figure, optional
+        Figure object, if None ``plt.gcf()`` will be used, by default None
+    format_axes : bool, optional
+        Format the axes to be inside the figure, by default True
+    format : bool, optional
+        This formatting can be skipped entirely by passing ``format=False``
+        to any of the plotting functions that use this formatter, by default True
+    font_color : str, optional
+        Color of font of axes, by default "white"
+    """
+    import matplotlib.pyplot as plt
+
+    if not format:
+        return
+    if fig_size is None:
+        fig_size = DEFAULT_FIG_SIZE
+    if fig is None:
+        fig = plt.gcf()
+    fig.set_size_inches(*fig_size)
+    axs = fig.axes
+
+    # Fonts scale with the figure width in pixels, relative to BASE_SIZE.
+    width_px = (fig.get_size_inches() * fig.dpi)[0]
+    font_scale = width_px / BASE_SIZE
+
+    if format_axes:
+        # Keyword arguments shared by every inset tick label.
+        tick_style = dict(
+            xycoords="data",
+            textcoords="offset points",
+            color=font_color,
+            fontsize=12 * font_scale,
+            alpha=0.75,
+        )
+
+        for ax in axs:
+            ymin, _ = ax.get_ylim()
+            xmin, _ = ax.get_xlim()
+
+            # NOTE(review): labels are always drawn on the first axes, even
+            # for ticks read from another one; confirm this is intended.
+            for tick in ax.get_yticks()[2:-1]:
+                axs[0].annotate(
+                    f"{(tick / 1000):2.1f}k",
+                    xy=(xmin, tick),
+                    xytext=(5, -5),
+                    ha="left",
+                    va="top",
+                    **tick_style,
+                )
+
+            for tick in ax.get_xticks()[2:-1]:
+                axs[0].annotate(
+                    f"{tick:2.1f}s",
+                    xy=(tick, ymin),
+                    xytext=(5, 5),
+                    ha="center",
+                    va="bottom",
+                    **tick_style,
+                )
+
+            # Hide the regular axis decorations so the data fills the image.
+            ax.margins(0, 0)
+            ax.set_axis_off()
+            ax.xaxis.set_major_locator(plt.NullLocator())
+            ax.yaxis.set_major_locator(plt.NullLocator())
+
+        plt.subplots_adjust(
+            top=1, bottom=0, right=1, left=0, hspace=0, wspace=0
+        )
+
+    if title is not None:
+        label = axs[0].annotate(
+            title,
+            xy=(1, 1),
+            xycoords="axes fraction",
+            fontsize=20 * font_scale,
+            xytext=(-5, -5),
+            textcoords="offset points",
+            ha="right",
+            va="top",
+            color="white",
+        )
+        label.set_bbox(dict(facecolor="black", alpha=0.5, edgecolor="black"))
+
+
+def generate_chord_dataset(
+    max_voices: int = 8,
+    sample_rate: int = 44100,
+    num_items: int = 5,
+    duration: float = 1.0,
+    min_note: str = "C2",
+    max_note: str = "C6",
+    output_dir: Path = "chords",
+):
+    """
+    Generates a toy multitrack dataset of chords, synthesized from sine waves.
+
+    Each track is written to ``output_dir / track_<idx>`` with one WAV file
+    per voice, and one CSV per voice name is created in ``output_dir`` via
+    ``create_csv``.
+
+    Parameters
+    ----------
+    max_voices : int, optional
+        Maximum number of voices in a chord, by default 8
+    sample_rate : int, optional
+        Sample rate of audio, by default 44100
+    num_items : int, optional
+        Number of items to generate, by default 5
+    duration : float, optional
+        Duration of each item, by default 1.0. Each voice lasts between
+        85% and 100% of this value.
+    min_note : str, optional
+        Minimum note in the dataset, by default "C2"
+    max_note : str, optional
+        Maximum note in the dataset, by default "C6"
+    output_dir : Path, optional
+        Directory to save the dataset, by default "chords". Missing parent
+        directories are created.
+
+    Returns
+    -------
+    Path
+        The output directory.
+    """
+    import librosa
+    from . import AudioSignal
+    from ..data.preprocess import create_csv
+
+    min_midi = librosa.note_to_midi(min_note)
+    max_midi = librosa.note_to_midi(max_note)
+
+    tracks = []
+    for _ in range(num_items):
+        track = {}
+        # figure out how many voices to put in this track
+        num_voices = random.randint(1, max_voices)
+        for voice_idx in range(num_voices):
+            # choose some random params
+            midinote = random.randint(min_midi, max_midi)
+            dur = random.uniform(0.85 * duration, duration)
+
+            sig = AudioSignal.wave(
+                frequency=librosa.midi_to_hz(midinote),
+                duration=dur,
+                sample_rate=sample_rate,
+                shape="sine",
+            )
+            track[f"voice_{voice_idx}"] = sig
+        tracks.append(track)
+
+    # save the tracks to disk
+    output_dir = Path(output_dir)
+    # parents=True so a nested output directory does not fail on a missing parent.
+    output_dir.mkdir(parents=True, exist_ok=True)
+    for idx, track in enumerate(tracks):
+        track_dir = output_dir / f"track_{idx}"
+        track_dir.mkdir(exist_ok=True)
+        for voice_name, sig in track.items():
+            sig.write(track_dir / f"{voice_name}.wav")
+
+    # Sorted so the CSV files are produced in a deterministic order.
+    all_voices = sorted({k for track in tracks for k in track})
+    # One path per track for every voice; tracks lacking a voice get "".
+    voice_lists = {
+        voice_name: [
+            track[voice_name].path_to_file if voice_name in track else ""
+            for track in tracks
+        ]
+        for voice_name in all_voices
+    }
+
+    for voice_name, paths in voice_lists.items():
+        create_csv(paths, output_dir / f"{voice_name}.csv", loudness=True)
+
+    return output_dir