* Fix

* Fix
pull/3985/head
co63oc 7 months ago committed by GitHub
parent 4e5181c949
commit f3a5df2049
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -183,7 +183,7 @@ def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
Args:
y (np.ndarray): Input waveform array in 1D or 2D.
sr (int): Sample rate.
file (os.PathLike): Path of auido file to save.
file (os.PathLike): Path of audio file to save.
"""
if not file.endswith('.wav'):
raise ParameterError(
@ -216,10 +216,10 @@ def soundfile_load(
duration: Optional[int]=None,
dtype: str='float32',
resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]:
"""Load audio file from disk. This function loads audio from disk using using audio beackend.
"""Load audio file from disk. This function loads audio from disk using using audio backend.
Args:
file (os.PathLike): Path of auido file to load.
file (os.PathLike): Path of audio file to load.
sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None.
mono (bool, optional): Return waveform with mono channel. Defaults to True.
merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'.
@ -250,14 +250,14 @@ def soundfile_load(
if normal:
y = normalize(y, norm_type, norm_mul_factor)
elif dtype in ['int8', 'int16']:
# still need to do normalization, before depth convertion
# still need to do normalization, before depth conversion
y = normalize(y, 'linear', 1.0)
y = depth_convert(y, dtype)
return y, r
#the code below token form: https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py with modificaion.
#The code below is taken from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py, with some modifications.
def _get_subtype_for_wav(dtype: paddle.dtype,
@ -382,7 +382,7 @@ def save(
channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
otherwise `[time, channel]`.
compression (float of None, optional): Not used.
It is here only for interface compatibility reson with "sox_io" backend.
It is here only for interface compatibility reason with "sox_io" backend.
format (str or None, optional): Override the audio format.
When ``filepath`` argument is path-like object, audio format is
inferred from file extension. If the file extension is missing or
@ -394,8 +394,8 @@ def save(
Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
``"flac"`` and ``"sph"``.
encoding (str or None, optional): Changes the encoding for supported formats.
This argument is effective only for supported formats, sush as
``"wav"``, ``""flac"`` and ``"sph"``. Valid values are;
This argument is effective only for supported formats, such as
``"wav"``, ``""flac"`` and ``"sph"``. Valid values are:
- ``"PCM_S"`` (signed integer Linear PCM)
- ``"PCM_U"`` (unsigned integer Linear PCM)

@ -626,7 +626,7 @@ def mu_decode(y: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
def _randint(high: int) -> int:
"""Generate one random integer in range [0 high)
This is a helper function for random data augmentaiton
This is a helper function for random data augmentation
"""
return int(np.random.randint(0, high=high))
@ -659,7 +659,7 @@ def depth_augment(y: np.ndarray,
def adaptive_spect_augment(spect: np.ndarray,
tempo_axis: int=0,
level: float=0.1) -> np.ndarray:
"""Do adpative spectrogram augmentation. The level of the augmentation is gowern by the paramter level, ranging from 0 to 1, with 0 represents no augmentation.
"""Do adaptive spectrogram augmentation. The level of the augmentation is govern by the parameter level, ranging from 0 to 1, with 0 represents no augmentation.
Args:
spect (np.ndarray): Input spectrogram.
@ -711,9 +711,9 @@ def spect_augment(spect: np.ndarray,
spect (np.ndarray): Input spectrogram.
tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
max_time_mask (int, optional): Maximum number of time masking. Defaults to 3.
max_freq_mask (int, optional): Maximum number of frenquence masking. Defaults to 3.
max_freq_mask (int, optional): Maximum number of frequency masking. Defaults to 3.
max_time_mask_width (int, optional): Maximum width of time masking. Defaults to 30.
max_freq_mask_width (int, optional): Maximum width of frenquence masking. Defaults to 20.
max_freq_mask_width (int, optional): Maximum width of frequency masking. Defaults to 20.
Returns:
np.ndarray: The augmented spectrogram.

@ -449,7 +449,7 @@ unsigned get_precision(const std::string filetype, py::dtype dtype) {
return SOX_UNSPEC;
if (filetype == "wav" || filetype == "amb") {
switch (dtype.num()) {
case 1: // byte in numpy dype num
case 1: // byte in numpy dtype num
return 8;
case 3: // short, in numpy dtype num
return 16;

@ -58,7 +58,7 @@ class MockedSaveTest(unittest.TestCase):
encoding=encoding,
bits_per_sample=bits_per_sample, )
# on +Py3.8 call_args.kwargs is more descreptive
# on +Py3.8 call_args.kwargs is more descriptive
args = mocked_write.call_args[1]
assert args["file"] == filepath
assert args["samplerate"] == sample_rate

@ -58,7 +58,7 @@ def download(url, md5sum, target_dir, filename=None):
if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
print("Downloading %s ..." % url)
wget.download(url, target_dir)
print("\nMD5 Chesksum %s ..." % filepath)
print("\nMD5 Checksum %s ..." % filepath)
if not md5file(filepath) == md5sum:
raise RuntimeError("MD5 checksum failed.")
else:

@ -108,7 +108,7 @@ def create_manifest(data_dir, manifest_path):
def prepare_dataset(url, md5sum, target_dir, manifest_path):
"""Download, unpack and create summmary manifest file.
"""Download, unpack and create summary manifest file.
"""
if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
# download

Loading…
Cancel
Save