refactor paddleaudio, test=doc

4 years ago · c437a7c5c1
parent a942226066
commit c437a7c5c1
30 changed files with 234 additions and 249 deletions
--- a/.gitignore
+++ b/.gitignore
@ -30,5 +30,6 @@ tools/OpenBLAS/
 tools/Miniconda3-latest-Linux-x86_64.sh
 tools/activate_python.sh
 tools/miniconda.sh
 tools/CRF++-0.58/
 *output/
--- a/paddleaudio/CHANGELOG.md
+++ b/paddleaudio/CHANGELOG.md
@ -1 +1,4 @@
 # Changelog
 Date: 2022-2-25, Author: Hui Zhang.
  - Refactor architecture.
--- a/paddleaudio/features/augment.py
+++ b/paddleaudio/features/augment.py
@ -1,170 +0,0 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import List
 import numpy as np
 from numpy import ndarray as array
 from ..backends import depth_convert
 from ..utils import ParameterError
 __all__ = [
    'depth_augment',
    'spect_augment',
    'random_crop1d',
    'random_crop2d',
    'adaptive_spect_augment',
 ]
 def randint(high: int) -> int:
     """Draw a single random integer from the half-open range [0, high).

     Helper for the random data-augmentation routines. The value is sampled
     with ``np.random.randint`` and converted to a built-in ``int``.
     """
     sample = np.random.randint(0, high=high)
     return int(sample)
 def rand() -> float:
     """Draw a single random float from the half-open range [0, 1).

     Helper for the random data-augmentation routines.

     Returns:
         A built-in ``float`` sampled with ``np.random.rand``.
     """
     # Ask NumPy for a scalar (no shape argument). Converting a 1-element
     # array with float() is deprecated since NumPy 1.25.
     return float(np.random.rand())
 def depth_augment(y: array,
                   choices: List=['int8', 'int16'],
                   probs: List[float]=[0.5, 0.5]) -> array:
     """Simulate quantization distortion by a bit-depth round trip.

     One entry of ``choices`` is picked at random (weighted by ``probs``);
     the signal is converted to that depth with ``depth_convert`` and then
     converted back to its original dtype.

     Args:
         y: Input signal; its ``dtype`` is the depth restored at the end.
         choices: Candidate intermediate depths.
         probs: Selection probability of each entry in ``choices``; must have
             the same length as ``choices``.

     Returns:
         The signal after the down-and-up conversion.
     """
     n_choices = len(choices)
     n_probs = len(probs)
     assert n_probs == n_choices, (
         'number of choices {} must be equal to size of probs {}'.format(
             n_choices, n_probs))

     target_depth = np.random.choice(choices, p=probs)
     original_depth = y.dtype
     # Round trip: reduce to the sampled depth, then restore the source depth.
     return depth_convert(depth_convert(y, target_depth), original_depth)
 def adaptive_spect_augment(spect: array, tempo_axis: int=0,
                            level: float=0.1) -> array:
     """Apply adaptive spectrogram augmentation by zeroing random bands.

     The strength is governed by ``level``, intended to range from 0 to 1,
     where 0 means no augmentation. ``int(10 * level)`` masks are applied
     along each axis; a time mask spans ``int(nt * level * 0.5)`` frames and
     a frequency mask spans ``int(nf * level * 0.5)`` bins, with ``nt`` and
     ``nf`` the sizes of the time and frequency axes.

     Args:
         spect: 2-D spectrogram. It is modified in place.
         tempo_axis: 0 if time runs along the first axis; any other value
             treats the second axis as time.
         level: Augmentation strength.

     Returns:
         The same ``spect`` object after masking.
     """
     assert spect.ndim == 2, 'only supports 2d tensor or numpy array'

     time_on_rows = tempo_axis == 0
     n_rows, n_cols = spect.shape
     nt, nf = (n_rows, n_cols) if time_on_rows else (n_cols, n_rows)

     time_mask_width = int(nt * level * 0.5)
     freq_mask_width = int(nf * level * 0.5)
     num_masks = int(10 * level)  # same count for both axes

     def _zero_band(on_rows, begin, width):
         # Blank a band of rows or of columns, starting at `begin`.
         if on_rows:
             spect[begin:begin + width, :] = 0
         else:
             spect[:, begin:begin + width] = 0

     # Time masks are drawn first, then frequency masks.
     for _ in range(num_masks):
         _zero_band(time_on_rows, randint(nt - time_mask_width),
                    time_mask_width)
     for _ in range(num_masks):
         _zero_band(not time_on_rows, randint(nf - freq_mask_width),
                    freq_mask_width)
     return spect
 def spect_augment(spect: array,
                   tempo_axis: int=0,
                   max_time_mask: int=3,
                   max_freq_mask: int=3,
                   max_time_mask_width: int=30,
                   max_freq_mask_width: int=20) -> array:
     """Zero out random bands of a spectrogram along time and frequency.

     The number of masks per axis is drawn from ``[0, max_*_mask)`` and one
     width per axis is drawn from ``[0, max_*_mask_width)``. Each mask starts
     at a random offset chosen so that it fits inside the axis.

     A sampled width that does not fit is clamped to one less than the axis
     length, and an empty axis is left untouched, so short spectrograms no
     longer make ``np.random.randint`` raise ``ValueError``.

     Args:
         spect: 2-D spectrogram. It is modified in place.
         tempo_axis: 0 if time runs along the first axis; any other value
             treats the second axis as time.
         max_time_mask: Exclusive upper bound on the number of time masks.
         max_freq_mask: Exclusive upper bound on the number of freq masks.
         max_time_mask_width: Exclusive upper bound on the time mask width.
         max_freq_mask_width: Exclusive upper bound on the freq mask width.

     Returns:
         The same ``spect`` object after masking.

     Raises:
         AssertionError: if ``spect`` is not 2-D.
     """
     assert spect.ndim == 2, 'only supports 2d tensor or numpy array'

     time_axis = 0 if tempo_axis == 0 else 1
     freq_axis = 1 - time_axis
     nt = spect.shape[time_axis]
     nf = spect.shape[freq_axis]

     # Draw order is fixed: counts first, then widths, then the offsets.
     num_time_mask = int(np.random.randint(0, high=max_time_mask))
     num_freq_mask = int(np.random.randint(0, high=max_freq_mask))
     time_mask_width = int(np.random.randint(0, high=max_time_mask_width))
     freq_mask_width = int(np.random.randint(0, high=max_freq_mask_width))

     plans = (
         (time_axis, nt, time_mask_width, num_time_mask),
         (freq_axis, nf, freq_mask_width, num_freq_mask),
     )
     for axis, size, width, count in plans:
         if size == 0:
             # Nothing to mask, and randint(0) would raise.
             continue
         # Keep at least one valid start offset on short axes.
         width = min(width, size - 1)
         for _ in range(count):
             start = int(np.random.randint(0, high=size - width))
             index = [slice(None), slice(None)]
             index[axis] = slice(start, start + width)
             spect[tuple(index)] = 0
     return spect
 def random_crop1d(y: array, crop_len: int) -> array:
     """Randomly crop ``crop_len`` consecutive samples from a 1-D signal.

     Args:
         y: 1-D input signal, typically a sound waveform.
         crop_len: Number of samples to keep.

     Returns:
         The slice ``y[idx:idx + crop_len]`` for a random start ``idx``.

     Raises:
         ParameterError: if ``y`` is not one-dimensional.
         ValueError: raised by NumPy when ``crop_len`` exceeds ``len(y)``.
     """
     if y.ndim != 1:
         # The original check was a bare string expression and did nothing.
         raise ParameterError('only accept 1d tensor or numpy array')
     n = len(y)
     # Valid starts are 0..n - crop_len inclusive, so the exclusive bound is
     # n - crop_len + 1. This also makes crop_len == n return the full signal.
     idx = int(np.random.randint(0, high=n - crop_len + 1))
     return y[idx:idx + crop_len]
 def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
     """Randomly crop an array along its temporal axis.

     Typically used on a spectrogram: ``crop_len`` consecutive positions are
     kept along ``tempo_axis`` and every other axis is kept whole.

     Args:
         s: Input array.
         crop_len: Number of positions to keep along ``tempo_axis``.
         tempo_axis: Index of the axis to crop.

     Returns:
         The cropped slice of ``s``.

     Raises:
         ParameterError: if ``tempo_axis`` is not smaller than ``s.ndim``.
         ValueError: raised by NumPy when ``crop_len`` exceeds the axis length.
     """
     if tempo_axis >= s.ndim:
         raise ParameterError('axis out of range')

     n = s.shape[tempo_axis]
     # Valid starts are 0..n - crop_len inclusive, so the exclusive bound is
     # n - crop_len + 1. This also makes crop_len == n return the full array.
     idx = int(np.random.randint(0, high=n - crop_len + 1))
     sli = [slice(None)] * s.ndim
     sli[tempo_axis] = slice(idx, idx + crop_len)
     return s[tuple(sli)]
--- a/paddleaudio/paddleaudio/init.py
+++ b/paddleaudio/paddleaudio/init.py
@ -11,4 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .audio import *
--- a/paddleaudio/paddleaudio/backends/init.py
+++ b/paddleaudio/paddleaudio/backends/init.py
@ -11,5 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .backends import *
 from .features import *
--- a/paddleaudio/paddleaudio/backends/soundfile_backend.py
+++ b/paddleaudio/paddleaudio/backends/soundfile_backend.py
--- a/paddleaudio/paddleaudio/backends/sox_backend.py
+++ b/paddleaudio/paddleaudio/backends/sox_backend.py
--- a/paddleaudio/paddleaudio/datasets/init.py
+++ b/paddleaudio/paddleaudio/datasets/init.py
--- a/paddleaudio/paddleaudio/datasets/dataset.py
+++ b/paddleaudio/paddleaudio/datasets/dataset.py
--- a/paddleaudio/paddleaudio/datasets/esc50.py
+++ b/paddleaudio/paddleaudio/datasets/esc50.py
--- a/paddleaudio/paddleaudio/datasets/gtzan.py
+++ b/paddleaudio/paddleaudio/datasets/gtzan.py
--- a/paddleaudio/paddleaudio/datasets/tess.py
+++ b/paddleaudio/paddleaudio/datasets/tess.py
--- a/paddleaudio/paddleaudio/datasets/urban_sound.py
+++ b/paddleaudio/paddleaudio/datasets/urban_sound.py
--- a/paddleaudio/paddleaudio/features/init.py
+++ b/paddleaudio/paddleaudio/features/init.py
@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .augment import *
+
-from .core import *
+from .librosa import Spectrogram
-from .spectrum import *
+from .librosa import MelSpectrogram
 from .librosa import LogMelSpectrogram
--- a/paddleaudio/paddleaudio/features/librosa.py
+++ b/paddleaudio/paddleaudio/features/librosa.py
@ -19,7 +19,7 @@ from typing import Union
 import paddle
 import paddle.nn as nn
-from .window import get_window
+from ..functional.window import get_window
 __all__ = [
    'Spectrogram',
--- a/paddleaudio/paddleaudio/functional/init.py
+++ b/paddleaudio/paddleaudio/functional/init.py
--- a/paddleaudio/paddleaudio/functional/functional.py
+++ b/paddleaudio/paddleaudio/functional/functional.py
@ -21,11 +21,14 @@ import numpy as np
 import scipy
 from numpy import ndarray as array
 from numpy.lib.stride_tricks import as_strided
-from scipy.signal import get_window
+from scipy import signal
 from ..utils import ParameterError
 from ..backends import depth_convert
 __all__ = [
    # dsp
    'stft',
    'mfcc',
    'hz_to_mel',
@ -38,6 +41,12 @@ __all__ = [
    'spectrogram',
    'mu_encode',
    'mu_decode',
    # augmentation
    'depth_augment',
    'spect_augment',
    'random_crop1d',
    'random_crop2d',
    'adaptive_spect_augment',
 ]
@ -303,7 +312,7 @@ def stft(x: array,
    if hop_length is None:
        hop_length = int(win_length // 4)
-    fft_window = get_window(window, win_length, fftbins=True)
+    fft_window = signal.get_window(window, win_length, fftbins=True)
    # Pad the window out to n_fft size
    fft_window = pad_center(fft_window, n_fft)
@ -576,3 +585,145 @@ def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array:
        y = y * 2 / mu - 1
    x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
    return x
 def randint(high: int) -> int:
     """Draw a single random integer from the half-open range [0, high).

     Helper for the random data-augmentation routines. The value is sampled
     with ``np.random.randint`` and converted to a built-in ``int``.
     """
     sample = np.random.randint(0, high=high)
     return int(sample)
 def rand() -> float:
     """Draw a single random float from the half-open range [0, 1).

     Helper for the random data-augmentation routines.

     Returns:
         A built-in ``float`` sampled with ``np.random.rand``.
     """
     # Ask NumPy for a scalar (no shape argument). Converting a 1-element
     # array with float() is deprecated since NumPy 1.25.
     return float(np.random.rand())
 def depth_augment(y: array,
                   choices: List=['int8', 'int16'],
                   probs: List[float]=[0.5, 0.5]) -> array:
     """Simulate quantization distortion by a bit-depth round trip.

     One entry of ``choices`` is picked at random (weighted by ``probs``);
     the signal is converted to that depth with ``depth_convert`` and then
     converted back to its original dtype.

     Args:
         y: Input signal; its ``dtype`` is the depth restored at the end.
         choices: Candidate intermediate depths.
         probs: Selection probability of each entry in ``choices``; must have
             the same length as ``choices``.

     Returns:
         The signal after the down-and-up conversion.
     """
     n_choices = len(choices)
     n_probs = len(probs)
     assert n_probs == n_choices, (
         'number of choices {} must be equal to size of probs {}'.format(
             n_choices, n_probs))

     target_depth = np.random.choice(choices, p=probs)
     original_depth = y.dtype
     # Round trip: reduce to the sampled depth, then restore the source depth.
     return depth_convert(depth_convert(y, target_depth), original_depth)
 def adaptive_spect_augment(spect: array, tempo_axis: int=0,
                            level: float=0.1) -> array:
     """Apply adaptive spectrogram augmentation by zeroing random bands.

     The strength is governed by ``level``, intended to range from 0 to 1,
     where 0 means no augmentation. ``int(10 * level)`` masks are applied
     along each axis; a time mask spans ``int(nt * level * 0.5)`` frames and
     a frequency mask spans ``int(nf * level * 0.5)`` bins, with ``nt`` and
     ``nf`` the sizes of the time and frequency axes.

     Args:
         spect: 2-D spectrogram. It is modified in place.
         tempo_axis: 0 if time runs along the first axis; any other value
             treats the second axis as time.
         level: Augmentation strength.

     Returns:
         The same ``spect`` object after masking.
     """
     assert spect.ndim == 2, 'only supports 2d tensor or numpy array'

     time_on_rows = tempo_axis == 0
     n_rows, n_cols = spect.shape
     nt, nf = (n_rows, n_cols) if time_on_rows else (n_cols, n_rows)

     time_mask_width = int(nt * level * 0.5)
     freq_mask_width = int(nf * level * 0.5)
     num_masks = int(10 * level)  # same count for both axes

     def _zero_band(on_rows, begin, width):
         # Blank a band of rows or of columns, starting at `begin`.
         if on_rows:
             spect[begin:begin + width, :] = 0
         else:
             spect[:, begin:begin + width] = 0

     # Time masks are drawn first, then frequency masks.
     for _ in range(num_masks):
         _zero_band(time_on_rows, randint(nt - time_mask_width),
                    time_mask_width)
     for _ in range(num_masks):
         _zero_band(not time_on_rows, randint(nf - freq_mask_width),
                    freq_mask_width)
     return spect
 def spect_augment(spect: array,
                   tempo_axis: int=0,
                   max_time_mask: int=3,
                   max_freq_mask: int=3,
                   max_time_mask_width: int=30,
                   max_freq_mask_width: int=20) -> array:
     """Zero out random bands of a spectrogram along time and frequency.

     The number of masks per axis is drawn from ``[0, max_*_mask)`` and one
     width per axis is drawn from ``[0, max_*_mask_width)``. Each mask starts
     at a random offset chosen so that it fits inside the axis.

     A sampled width that does not fit is clamped to one less than the axis
     length, and an empty axis is left untouched, so short spectrograms no
     longer make ``np.random.randint`` raise ``ValueError``.

     Args:
         spect: 2-D spectrogram. It is modified in place.
         tempo_axis: 0 if time runs along the first axis; any other value
             treats the second axis as time.
         max_time_mask: Exclusive upper bound on the number of time masks.
         max_freq_mask: Exclusive upper bound on the number of freq masks.
         max_time_mask_width: Exclusive upper bound on the time mask width.
         max_freq_mask_width: Exclusive upper bound on the freq mask width.

     Returns:
         The same ``spect`` object after masking.

     Raises:
         AssertionError: if ``spect`` is not 2-D.
     """
     assert spect.ndim == 2, 'only supports 2d tensor or numpy array'

     time_axis = 0 if tempo_axis == 0 else 1
     freq_axis = 1 - time_axis
     nt = spect.shape[time_axis]
     nf = spect.shape[freq_axis]

     # Draw order is fixed: counts first, then widths, then the offsets.
     num_time_mask = int(np.random.randint(0, high=max_time_mask))
     num_freq_mask = int(np.random.randint(0, high=max_freq_mask))
     time_mask_width = int(np.random.randint(0, high=max_time_mask_width))
     freq_mask_width = int(np.random.randint(0, high=max_freq_mask_width))

     plans = (
         (time_axis, nt, time_mask_width, num_time_mask),
         (freq_axis, nf, freq_mask_width, num_freq_mask),
     )
     for axis, size, width, count in plans:
         if size == 0:
             # Nothing to mask, and randint(0) would raise.
             continue
         # Keep at least one valid start offset on short axes.
         width = min(width, size - 1)
         for _ in range(count):
             start = int(np.random.randint(0, high=size - width))
             index = [slice(None), slice(None)]
             index[axis] = slice(start, start + width)
             spect[tuple(index)] = 0
     return spect
 def random_crop1d(y: array, crop_len: int) -> array:
     """Randomly crop ``crop_len`` consecutive samples from a 1-D signal.

     Args:
         y: 1-D input signal, typically a sound waveform.
         crop_len: Number of samples to keep.

     Returns:
         The slice ``y[idx:idx + crop_len]`` for a random start ``idx``.

     Raises:
         ParameterError: if ``y`` is not one-dimensional.
         ValueError: raised by NumPy when ``crop_len`` exceeds ``len(y)``.
     """
     if y.ndim != 1:
         # The original check was a bare string expression and did nothing.
         raise ParameterError('only accept 1d tensor or numpy array')
     n = len(y)
     # Valid starts are 0..n - crop_len inclusive, so the exclusive bound is
     # n - crop_len + 1. This also makes crop_len == n return the full signal.
     idx = int(np.random.randint(0, high=n - crop_len + 1))
     return y[idx:idx + crop_len]
 def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
     """Randomly crop an array along its temporal axis.

     Typically used on a spectrogram: ``crop_len`` consecutive positions are
     kept along ``tempo_axis`` and every other axis is kept whole.

     Args:
         s: Input array.
         crop_len: Number of positions to keep along ``tempo_axis``.
         tempo_axis: Index of the axis to crop.

     Returns:
         The cropped slice of ``s``.

     Raises:
         ParameterError: if ``tempo_axis`` is not smaller than ``s.ndim``.
         ValueError: raised by NumPy when ``crop_len`` exceeds the axis length.
     """
     if tempo_axis >= s.ndim:
         raise ParameterError('axis out of range')

     n = s.shape[tempo_axis]
     # Valid starts are 0..n - crop_len inclusive, so the exclusive bound is
     # n - crop_len + 1. This also makes crop_len == n return the full array.
     idx = int(np.random.randint(0, high=n - crop_len + 1))
     sli = [slice(None)] * s.ndim
     sli[tempo_axis] = slice(idx, idx + crop_len)
     return s[tuple(sli)]
--- a/paddleaudio/paddleaudio/functional/window.py
+++ b/paddleaudio/paddleaudio/functional/window.py
@ -20,6 +20,19 @@ from paddle import Tensor
 __all__ = [
    'get_window',
    # windows
    'taylor',
    'hamming',
    'hann',
    'tukey',
    'kaiser',
    'gaussian',
    'exponential',
    'triang',
    'bohman',
    'blackman',
    'cosine',
 ]
@ -73,6 +86,21 @@ def general_gaussian(M: int, p, sig, sym: bool=True,
    return _truncate(w, needs_trunc)
 def general_cosine(M: int, a: float, sym: bool=True,
                    dtype: str='float64') -> Tensor:
     """Build a window as a generic weighted sum of cosine terms.

     Intended to be consistent with scipy.signal.windows.general_cosine().

     Args:
         M: Requested window length.
         a: Cosine weights. Despite the ``float`` annotation it is indexed as
             a sequence: ``a[k]`` scales the term ``cos(k * x)``.
         sym: Forwarded to ``_extend`` together with ``M``.
         dtype: dtype of the tensors that are created.

     Returns:
         The accumulated window after ``_truncate`` has been applied.
     """
     if _len_guards(M):
         return paddle.ones((M, ), dtype=dtype)
     M, needs_trunc = _extend(M, sym)

     grid = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
     window = paddle.zeros((M, ), dtype=dtype)
     # Add one cosine harmonic per weight, in order.
     for order, weight in enumerate(a):
         window += weight * paddle.cos(order * grid)

     return _truncate(window, needs_trunc)
 def general_hamming(M: int, alpha: float, sym: bool=True,
                    dtype: str='float64') -> Tensor:
    """Compute a generalized Hamming window.
@ -143,21 +171,6 @@ def taylor(M: int,
    return _truncate(w, needs_trunc)
 def general_cosine(M: int, a: float, sym: bool=True,
                    dtype: str='float64') -> Tensor:
     """Build a window as a generic weighted sum of cosine terms.

     Intended to be consistent with scipy.signal.windows.general_cosine().

     Args:
         M: Requested window length.
         a: Cosine weights. Despite the ``float`` annotation it is indexed as
             a sequence: ``a[k]`` scales the term ``cos(k * x)``.
         sym: Forwarded to ``_extend`` together with ``M``.
         dtype: dtype of the tensors that are created.

     Returns:
         The accumulated window after ``_truncate`` has been applied.
     """
     if _len_guards(M):
         return paddle.ones((M, ), dtype=dtype)
     M, needs_trunc = _extend(M, sym)

     grid = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
     window = paddle.zeros((M, ), dtype=dtype)
     # Add one cosine harmonic per weight, in order.
     for order, weight in enumerate(a):
         window += weight * paddle.cos(order * grid)

     return _truncate(window, needs_trunc)
 def hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Hamming window.
    The Hamming window is a taper formed by using a raised cosine with
@ -375,6 +388,7 @@ def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _truncate(w, needs_trunc)
 ## factory function
 def get_window(window: Union[str, Tuple[str, float]],
               win_length: int,
               fftbins: bool=True,
--- a/paddleaudio/paddleaudio/io/init.py
+++ b/paddleaudio/paddleaudio/io/init.py
@ -0,0 +1,6 @@
 from .audio import save_wav
 from .audio import load
 from .audio import normalize
 from .audio import to_mono
 from .audio import resample
 from .audio import depth_convert
--- a/paddleaudio/paddleaudio/io/audio.py
+++ b/paddleaudio/paddleaudio/io/audio.py
--- a/paddleaudio/paddleaudio/kaldi/init.py
+++ b/paddleaudio/paddleaudio/kaldi/init.py
--- a/paddleaudio/paddleaudio/sox_effects/init.py
+++ b/paddleaudio/paddleaudio/sox_effects/init.py
--- a/paddleaudio/paddleaudio/utils/init.py
+++ b/paddleaudio/paddleaudio/utils/init.py
@ -11,8 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .download import *
+
-from .env import *
+from .env import USER_HOME
-from .error import *
+from .env import PPAUDIO_HOME
-from .log import *
+from .env import MODEL_HOME
-from .time import *
+from .env import DATA_HOME
 from .download import decompress
 from .download import download_and_decompress
 from .download import load_state_dict_from_url
 from .error import ParameterError
 from .log import logger
 from .log import Logger
 from .time import Timer
 from .time import seconds_to_hms
--- a/paddleaudio/paddleaudio/utils/download.py
+++ b/paddleaudio/paddleaudio/utils/download.py
@ -22,6 +22,11 @@ from .log import logger
 download.logger = logger
 __all__ = [
    'decompress',
    'download_and_decompress',
    'load_state_dict_from_url',
 ]
 def decompress(file: str):
    """
--- a/paddleaudio/paddleaudio/utils/env.py
+++ b/paddleaudio/paddleaudio/utils/env.py
@ -20,6 +20,12 @@ PPAUDIO_HOME     -->  the root directory for storing PaddleAudio related data. D
 '''
 import os
 __all__ = [
    'USER_HOME',
    'PPAUDIO_HOME',
 'MODEL_HOME' ,
 'DATA_HOME' ,
 ]
 def _get_user_home():
    return os.path.expanduser('~')
--- a/paddleaudio/paddleaudio/utils/error.py
+++ b/paddleaudio/paddleaudio/utils/error.py
--- a/paddleaudio/paddleaudio/utils/log.py
+++ b/paddleaudio/paddleaudio/utils/log.py
@ -19,7 +19,10 @@ import time
 import colorlog
-loggers = {}
+__all__ = [
    'Logger',
    'logger',
 ]
 log_config = {
    'DEBUG': {
--- a/paddleaudio/paddleaudio/utils/time.py
+++ b/paddleaudio/paddleaudio/utils/time.py
@ -14,6 +14,10 @@
 import math
 import time
 __all__ = [
    'Timer',
    'seconds_to_hms',
 ]
 class Timer(object):
    '''Calculate runing speed and estimated time of arrival(ETA)'''
--- a/paddleaudio/setup.py
+++ b/paddleaudio/setup.py
@ -14,7 +14,7 @@
 import setuptools
 # set the version here
-VERSION = '0.1.0'
+VERSION = '0.2.0'
 def write_version_py(filename='paddleaudio/__init__.py'):
--- a/requirements.txt
+++ b/requirements.txt
@ -1,48 +0,0 @@
 ConfigArgParse
 coverage
 editdistance
 g2p_en
 g2pM
 gpustat
 h5py
 inflect
 jieba
 jsonlines
 kaldiio
 librosa
 loguru
 matplotlib
 nara_wpe
 nltk
 paddleaudio
 paddlenlp
 paddlespeech_ctcdecoders
 paddlespeech_feat
 pandas
 phkit
 Pillow
 praatio==5.0.0
 pre-commit
 pybind11
 pypi-kenlm
 pypinyin
 python-dateutil
 pyworld
 resampy==0.2.2
 sacrebleu
 scipy
 sentencepiece~=0.1.96
 snakeviz
 soundfile~=0.10
 sox
 soxbindings
 textgrid
 timer
 tqdm
 typeguard
 unidecode
 visualdl
 webrtcvad
 yacs~=0.1.8
 yq
 zhon