From ecbf8f61401a2607960c958221b15bb0fb9cef20 Mon Sep 17 00:00:00 2001
From: drryanhuang <zihaohuang@aliyun.com>
Date: Mon, 30 Dec 2024 08:24:02 +0000
Subject: [PATCH] Move unfold from _julius.py into loudness.py

---
 audio/audiotools/core/_julius.py  | 34 -------------------------
 audio/audiotools/core/loudness.py | 41 +++++++++++++++++++++++++++++--
 2 files changed, 39 insertions(+), 36 deletions(-)

diff --git a/audio/audiotools/core/_julius.py b/audio/audiotools/core/_julius.py
index e80731cb6..929efb6bc 100644
--- a/audio/audiotools/core/_julius.py
+++ b/audio/audiotools/core/_julius.py
@@ -247,40 +247,6 @@ def pure_tone(freq: float, sr: float=128, dur: float=4, device=None):
     return paddle.cos(2 * math.pi * freq * time)
 
 
-# def unfold(_input, kernel_size: int, stride: int):
-#     """1D only unfolding similar to the one from PyTorch.
-#     However PyTorch unfold is extremely slow.
-
-#     Given an _input tensor of size `[*, T]` this will return
-#     a tensor `[*, F, K]` with `K` the kernel size, and `F` the number
-#     of frames. The i-th frame is a view onto `i * stride: i * stride + kernel_size`.
-#     This will automatically pad the _input to cover at least once all entries in `_input`.
-
-#     Args:
-#         _input (Tensor): tensor for which to return the frames.
-#         kernel_size (int): size of each frame.
-#         stride (int): stride between each frame.
-
-#     Shape:
-
-#         - Inputs: `_input` is `[*, T]`
-#         - Output: `[*, F, kernel_size]` with `F = 1 + ceil((T - kernel_size) / stride)`
-
-#     ..Warning:: unlike PyTorch unfold, this will pad the _input
-#         so that any position in `_input` is covered by at least one frame.
-#     """
-#     shape = list(_input.shape)
-#     length = shape.pop(-1)
-#     n_frames = math.ceil((max(length, kernel_size) - kernel_size) / stride) + 1
-#     tgt_length = (n_frames - 1) * stride + kernel_size
-#     padded = F.pad(_input, (0, tgt_length - length), data_format="NCL")
-#     strides: typing.List[int] = []
-#     for dim in range(padded.dim()):
-#         strides.append(padded.strides[dim])
-#     assert strides.pop(-1) == 1, "data should be contiguous"
-#     strides = strides + [stride, 1]
-#     return padded.as_strided(shape + [n_frames, kernel_size], strides)
-
 # def _new_rfft(x: paddle.Tensor):
 #     z = paddle.fft.rfft(x, axis=-1)
 
diff --git a/audio/audiotools/core/loudness.py b/audio/audiotools/core/loudness.py
index 841f84d5c..4d85edf62 100644
--- a/audio/audiotools/core/loudness.py
+++ b/audio/audiotools/core/loudness.py
@@ -1,4 +1,6 @@
 import copy
+import math
+import typing
 
 import numpy as np
 import paddle
@@ -8,6 +10,41 @@ import scipy
 from . import _julius
 
 
+def unfold(_input, kernel_size: int, stride: int):
+    """1D-only unfolding, similar to the one from PyTorch.
+    However, PyTorch's unfold is extremely slow.
+
+    Given an `_input` tensor of size `[*, T]`, this will return
+    a tensor `[*, F, K]` with `K` the kernel size, and `F` the number
+    of frames. The i-th frame is a view onto `i * stride: i * stride + kernel_size`.
+    `_input` is padded at the end so that every entry is covered by at least one frame.
+
+    Args:
+        _input (Tensor): tensor for which to return the frames.
+        kernel_size (int): size of each frame.
+        stride (int): stride between each frame.
+
+    Shape:
+
+        - Inputs: `_input` is `[*, T]`
+        - Output: `[*, F, kernel_size]` with `F = 1 + ceil((max(T, kernel_size) - kernel_size) / stride)`
+
+    .. Warning:: unlike PyTorch unfold, this will pad the `_input`
+        so that any position in `_input` is covered by at least one frame.
+    """
+    shape = list(_input.shape)
+    length = shape.pop(-1)  # T; `shape` now holds only the leading dimensions
+    n_frames = math.ceil((max(length, kernel_size) - kernel_size) / stride) + 1  # always >= 1
+    tgt_length = (n_frames - 1) * stride + kernel_size  # length needed so the last frame is complete
+    padded = F.pad(_input, (0, tgt_length - length), data_format="NCL")  # NOTE(review): `F` is not imported in this patch -- confirm it is in module scope
+    strides: typing.List[int] = []
+    for dim in range(padded.dim()):
+        strides.append(padded.strides[dim])
+    assert strides.pop(-1) == 1, "data should be contiguous"
+    strides = strides + [stride, 1]  # frame axis steps by `stride`; elements inside a frame step by 1
+    return padded.as_strided(shape + [n_frames, kernel_size], strides)
+
+
 class Meter(paddle.nn.Layer):
     """Tensorized version of pyloudnorm.Meter. Works with batched audio tensors.
 
@@ -181,8 +218,8 @@ class Meter(paddle.nn.Layer):
 
         kernel_size = int(T_g * self.rate)
         stride = int(T_g * self.rate * step)
-        unfolded = _julius.unfold(
-            input_data.transpose([0, 2, 1]), kernel_size, stride)
+        # Frame the transposed signal: kernel_size samples per frame, one frame every stride samples.
+        unfolded = unfold(input_data.transpose([0, 2, 1]), kernel_size, stride)
         unfolded = unfolded.transpose([0, 1, 3, 2])
 
         return unfolded