Adapt to paddle3.0 && update readme

8 months ago · e16999b3c3
parent ff8e4bb4fe
commit e16999b3c3
6 changed files with 71 additions and 10 deletions
--- a/audio/audiotools/README.md
+++ b/audio/audiotools/README.md
@@ -2,12 +2,67 @@ Audiotools is a comprehensive toolkit designed for audio processing and analysis

 ### Directory Structure

- **core directory**: Contains the core class AudioSignal, which is responsible for the fundamental representation and manipulation of audio signals.
+```
+.
+├── audiotools
+│   ├── README.md
+│   ├── __init__.py
+│   ├── core
+│   │   ├── __init__.py
+│   │   ├── _julius.py
+│   │   ├── audio_signal.py
+│   │   ├── display.py
+│   │   ├── dsp.py
+│   │   ├── effects.py
+│   │   ├── ffmpeg.py
+│   │   ├── loudness.py
+│   │   └── util.py
+│   ├── data
+│   │   ├── __init__.py
+│   │   ├── datasets.py
+│   │   ├── preprocess.py
+│   │   └── transforms.py
+│   ├── metrics
+│   │   ├── __init__.py
+│   │   └── quality.py
+│   ├── ml
+│   │   ├── __init__.py
+│   │   ├── accelerator.py
+│   │   ├── basemodel.py
+│   │   └── decorators.py
+│   ├── requirements.txt
+│   └── post.py
+├── tests
+│   └── audiotools
+│       ├── core
+│       │   ├── test_audio_signal.py
+│       │   ├── test_bands.py
+│       │   ├── test_display.py
+│       │   ├── test_dsp.py
+│       │   ├── test_effects.py
+│       │   ├── test_fftconv.py
+│       │   ├── test_grad.py
+│       │   ├── test_highpass.py
+│       │   ├── test_loudness.py
+│       │   ├── test_lowpass.py
+│       │   └── test_util.py
+│       ├── data
+│       │   ├── test_datasets.py
+│       │   ├── test_preprocess.py
+│       │   └── test_transforms.py
+│       ├── ml
+│       │   ├── test_decorators.py
+│       │   └── test_model.py
+│       └── test_post.py

- **data directory**: Primarily dedicated to storing and processing datasets, including classes and functions for data preprocessing, ensuring efficient loading and transformation of audio data.
+```

- **metrics directory**: Implements functions for various audio evaluation metrics, enabling precise assessment of the performance of audio models and processing algorithms.
+- **core**: Contains the core class AudioSignal, which is responsible for the fundamental representation and manipulation of audio signals.

- **ml directory**: Comprises classes and methods related to model training, supporting the construction, training, and optimization of machine learning models in the context of audio.
+- **data**: Primarily dedicated to storing and processing datasets, including classes and functions for data preprocessing, ensuring efficient loading and transformation of audio data.
+
+- **metrics**: Implements functions for various audio evaluation metrics, enabling precise assessment of the performance of audio models and processing algorithms.
+
+- **ml**: Comprises classes and methods related to model training, supporting the construction, training, and optimization of machine learning models in the context of audio.

 This project aims to provide developers and researchers with an efficient and flexible framework to foster innovation and exploration across various domains of audio technology.
--- a/audio/audiotools/core/dsp.py
+++ b/audio/audiotools/core/dsp.py
@@ -349,6 +349,9 @@ class DSPMixin:
            nbins, )
        bins_hz = bins_hz[None, None, :, None].tile(
            [self.batch_size, 1, 1, mag.shape[-1]])
+
+        fmin_hz, fmax_hz = fmin_hz.astype(bins_hz.dtype), fmax_hz.astype(
+            bins_hz.dtype)
        mask = (fmin_hz <= bins_hz) & (bins_hz < fmax_hz)

        mag = paddle.where(mask, paddle.full_like(mag, val), mag)
@@ -429,6 +432,7 @@ class DSPMixin:
        log_mag = self.log_magnitude()

        db_cutoff = util.ensure_tensor(db_cutoff, ndim=mag.ndim)
+        db_cutoff = db_cutoff.astype(log_mag.dtype)
        mask = log_mag < db_cutoff
        # mag = mag.masked_fill(mask, val)
        mag = paddle.where(mask, mag, val * paddle.ones_like(mag))
@@ -452,6 +456,7 @@ class DSPMixin:
            masked audio data.
        """
        shift = util.ensure_tensor(shift, ndim=self.phase.ndim)
+        shift = shift.astype(self.phase.dtype)
        self.phase = self.phase + shift
        return self

--- a/audio/audiotools/core/effects.py
+++ b/audio/audiotools/core/effects.py
@@ -266,7 +266,7 @@ class EffectMixin:
        """
        db = util.ensure_tensor(db)
        ref_db = self.loudness()
-        gain = db - ref_db
+        gain = db.astype(ref_db.dtype) - ref_db
        gain = util.exp_compat(gain * self.GAIN_FACTOR)

        self.audio_data = self.audio_data * gain[:, None, None]
@@ -388,6 +388,7 @@ class EffectMixin:
            quantization_channels, ndim=3)

        x = self.audio_data
+        quantization_channels = quantization_channels.astype(x.dtype)
        x = (x + 1) / 2
        x = x * quantization_channels
        x = x.floor()
@@ -424,7 +425,7 @@ class EffectMixin:
        x = ((x + 1) / 2 * mu + 0.5).astype("int64")

        # unquantize
-        x = (x / mu) * 2 - 1.0
+        x = (x.astype(mu.dtype) / mu) * 2 - 1.0
        x = paddle.sign(x) * (
            util.exp_compat(paddle.abs(x) * paddle.log1p(mu)) - 1.0) / mu

--- a/audio/audiotools/core/loudness.py
+++ b/audio/audiotools/core/loudness.py
@@ -317,7 +317,7 @@ class Meter(paddle.nn.Layer):
        z_avg_gated = z
        z_avg_gated[l <= Gamma_a] = 0
        masked = l > Gamma_a
-        z_avg_gated = z_avg_gated.sum(2) / masked.sum(2)
+        z_avg_gated = z_avg_gated.sum(2) / masked.sum(2).astype("float32")

        # calculate the relative threshold value (see eq. 6)
        Gamma_r = -0.691 + 10.0 * paddle.log10(
--- a/audio/audiotools/core/util.py
+++ b/audio/audiotools/core/util.py
@@ -338,7 +338,7 @@ def _close_temp_files(tmpfiles: list):
    _close()


-AUDIO_EXTENSIONS = [".wav", ".flac", ".mp3", ".mp4"]
+AUDIO_EXTENSIONS = [".wav", ".flac", ".mp3"]


 def find_audio(folder: str, ext: List[str]=AUDIO_EXTENSIONS):
@@ -869,7 +869,7 @@ def hz_to_bin(hz: paddle.Tensor, n_fft: int, sample_rate: int):
    shape = hz.shape
    hz = hz.reshape([-1])
    freqs = paddle.linspace(0, sample_rate / 2, 2 + n_fft // 2)
-    hz = paddle.clip(hz, max=sample_rate / 2)
+    hz = paddle.clip(hz, max=sample_rate / 2).astype(freqs.dtype)

    closest = (hz[None, :] - freqs[:, None]).abs()
    closest_bins = closest.argmin(axis=0)
--- a/audio/tests/audiotools/core/test_util.py
+++ b/audio/tests/audiotools/core/test_util.py
@@ -88,7 +88,7 @@ def test_seed():


 def test_hz_to_bin():
-    hz = paddle.to_tensor(np.array([100, 200, 300]))
+    hz = paddle.to_tensor(np.array([100, 200, 300]), dtype="float32")
    sr = 1000
    n_fft = 2048