parent
63b4494700
commit
59d82c0c65
@ -0,0 +1,32 @@
|
|||||||
|
|
||||||
|
def get_encoding(ext, dtype):
    """Return the encoding name expected for a file of the given kind.

    Args:
        ext: Container/format name such as ``"wav"``, ``"mp3"``, ``"flac"``
            or ``"vorbis"``.
        dtype: Sample dtype name; only consulted when ``ext`` is not one of
            ``"mp3"``, ``"flac"`` or ``"vorbis"``.

    Returns:
        ``ext`` upper-cased for ``"mp3"``, ``"flac"`` and ``"vorbis"``;
        otherwise the PCM encoding name mapped to ``dtype``.

    Raises:
        KeyError: If the PCM table is consulted and ``dtype`` is not one of
            ``"float32"``, ``"int32"``, ``"int16"`` or ``"uint8"``.
    """
    named_after_format = frozenset(("mp3", "flac", "vorbis"))
    if ext in named_after_format:
        return ext.upper()

    pcm_by_dtype = dict(
        float32="PCM_F",
        int32="PCM_S",
        int16="PCM_S",
        uint8="PCM_U",
    )
    return pcm_by_dtype[dtype]
|
||||||
|
|
||||||
|
|
||||||
|
def get_bit_depth(dtype):
    """Return the number of bits per sample for a dtype name.

    Args:
        dtype: One of ``"float32"``, ``"int32"``, ``"int16"`` or ``"uint8"``.

    Returns:
        32, 32, 16 or 8 respectively.

    Raises:
        KeyError: If ``dtype`` is not one of the names listed above.
    """
    dtype_names = ("float32", "int32", "int16", "uint8")
    widths = (32, 32, 16, 8)
    return dict(zip(dtype_names, widths))[dtype]
|
||||||
|
|
||||||
|
def get_bits_per_sample(ext, dtype):
    """Return the bits-per-sample value expected for a file of the given kind.

    Args:
        ext: Container/format name. ``"flac"``, ``"mp3"`` and ``"vorbis"``
            have a fixed expected value that does not depend on ``dtype``.
        dtype: Sample dtype name; only consulted for every other ``ext``.

    Returns:
        24 for ``"flac"``, 0 for ``"mp3"`` and ``"vorbis"``; otherwise
        ``get_bit_depth(dtype)``.

    Raises:
        KeyError: Propagated from ``get_bit_depth`` when ``ext`` has no fixed
            value and ``dtype`` is not a known dtype name.
    """
    fixed_bits = {
        "flac": 24,
        "mp3": 0,
        "vorbis": 0,
    }
    # Only fall back to the dtype table when it is actually needed, so a
    # format with a fixed value never fails because of an unmapped dtype.
    if ext in fixed_bits:
        return fixed_bits[ext]
    return get_bit_depth(dtype)
|
@ -0,0 +1,34 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
|
||||||
|
from paddlespeech.audio.backends import sox_io_backend
|
||||||
|
|
||||||
|
class TestInfo(unittest.TestCase):
    """Checks the metadata reported by ``sox_io_backend.get_info_file``."""

    def test_wav(self, dtype, sample_rate, num_channels, sample_size):
        """Check that the metadata of the reference wav file is correct.

        Args:
            dtype: Sample dtype name used to derive the expected bit depth
                and encoding.
            sample_rate: Expected sample rate.
            num_channels: Expected number of channels.
            sample_size: Expected number of frames.

        NOTE(review): this method requires arguments, but no parameterization
        is visible in this file, so a plain unittest runner would call it
        without them -- confirm how it is meant to be driven.
        NOTE(review): ``get_bit_depth`` and ``get_encoding`` are not imported
        in the visible part of this file -- confirm where they come from.
        """
        path = 'testdata/test.wav'
        info = sox_io_backend.get_info_file(path)

        # unittest assertions survive ``python -O`` and report both values
        # on failure, unlike bare ``assert`` statements.
        self.assertEqual(info.sample_rate, sample_rate)
        self.assertEqual(info.num_frames, sample_size)  # duration*sample_rate
        self.assertEqual(info.num_channels, num_channels)
        self.assertEqual(info.bits_per_sample, get_bit_depth(dtype))
        self.assertEqual(info.encoding, get_encoding('wav', dtype))
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
@ -0,0 +1,47 @@
|
|||||||
|
import unittest
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
from parameterized import parameterized
|
||||||
|
import numpy as np
|
||||||
|
from paddlespeech.audio._internal import module_utils as _mod_utils
|
||||||
|
from paddlespeech.audio.backends import sox_io_backend
|
||||||
|
|
||||||
|
from tests.unit.common_utils import (
|
||||||
|
get_wav_data,
|
||||||
|
load_wav,
|
||||||
|
save_wav,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Code is from: https://github.com/pytorch/audio/blob/main/torchaudio/test/torchaudio_unittest/backend/sox_io/load_test.py
|
||||||
|
|
||||||
|
class TestLoad(unittest.TestCase):
    """Compares ``sox_io_backend.load`` with a scipy-based reference loader."""

    def assert_wav(self, dtype, sample_rate, num_channels, normalize, duration):
        """`sox_io_backend.load` can load wav format correctly.

        Wav data loaded with sox_io backend should match those with scipy

        Args:
            dtype: Sample dtype name passed to ``get_wav_data``.
            sample_rate: Sample rate used when writing the reference file.
            num_channels: Number of channels to generate.
            normalize: Forwarded to the generator and to both loaders.
            duration: Multiplied by ``sample_rate`` to get the frame count.
        """
        import os
        import tempfile

        num_frames = duration * sample_rate
        # Write the scratch file to a temporary directory so the test does
        # not depend on the working directory and leaves nothing behind.
        with tempfile.TemporaryDirectory() as tmp_dir:
            path = os.path.join(tmp_dir, 'reference.wav')
            data = get_wav_data(
                dtype, num_channels, normalize=normalize, num_frames=num_frames)
            save_wav(path, data, sample_rate)
            expected = load_wav(path, normalize=normalize)[0]
            data, sr = sox_io_backend.load(path, normalize=normalize)

        self.assertEqual(sr, sample_rate)
        np.testing.assert_array_almost_equal(data, expected, decimal=4)

    @parameterized.expand(
        list(
            itertools.product(
                ["float64", "float32", "int32"],
                [8000, 16000],
                [1, 2],
                [False, True],
            )
        ),
    )
    def test_wav(self, dtype, sample_rate, num_channels, normalize):
        """`sox_io_backend.load` can load wav format correctly."""
        self.assert_wav(dtype, sample_rate, num_channels, normalize, duration=1)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
@ -0,0 +1,34 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
|
||||||
|
from paddlespeech.audio.backends import sox_io_backend
|
||||||
|
|
||||||
|
class TestInfo(unittest.TestCase):
    """Checks the metadata reported by ``sox_io_backend.get_info_file``."""

    def test_wav(self, dtype, sample_rate, num_channels, sample_size):
        """Check that the metadata of the reference wav file is correct.

        Args:
            dtype: Sample dtype name used to derive the expected bit depth
                and encoding.
            sample_rate: Expected sample rate.
            num_channels: Expected number of channels.
            sample_size: Expected number of frames.

        NOTE(review): this method requires arguments, but no parameterization
        is visible in this file, so a plain unittest runner would call it
        without them -- confirm how it is meant to be driven.
        NOTE(review): ``get_bit_depth`` and ``get_encoding`` are not imported
        in the visible part of this file -- confirm where they come from.
        """
        path = 'testdata/test.wav'
        info = sox_io_backend.get_info_file(path)

        # unittest assertions survive ``python -O`` and report both values
        # on failure, unlike bare ``assert`` statements.
        self.assertEqual(info.sample_rate, sample_rate)
        self.assertEqual(info.num_frames, sample_size)  # duration*sample_rate
        self.assertEqual(info.num_channels, num_channels)
        self.assertEqual(info.bits_per_sample, get_bit_depth(dtype))
        self.assertEqual(info.encoding, get_encoding('wav', dtype))
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
@ -0,0 +1 @@
|
|||||||
|
../../features/testdata
|
@ -0,0 +1,8 @@
|
|||||||
|
from .wav_utils import get_wav_data, load_wav, save_wav, normalize_wav
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"get_wav_data",
|
||||||
|
"load_wav",
|
||||||
|
"save_wav",
|
||||||
|
"normalize_wav"
|
||||||
|
]
|
@ -0,0 +1,92 @@
|
|||||||
|
from typing import Optional, Tuple

import paddle
import scipy.io.wavfile
|
||||||
|
|
||||||
|
def normalize_wav(tensor: paddle.Tensor) -> paddle.Tensor:
    """Convert integer sample data to float32, scaled by its full-scale value.

    int32, int16 and uint8 tensors are cast to float32 (uint8 is first
    shifted down by 128), then positive values are divided by the type's
    largest positive value and negative values by the magnitude of its most
    negative value. Tensors of any other dtype are returned unchanged.

    Args:
        tensor: Sample data to normalize.

    Returns:
        A float32 tensor for the integer dtypes above, otherwise ``tensor``
        itself.
    """
    # (dtype, zero offset, positive full scale, negative full scale)
    integer_formats = (
        (paddle.int32, 0, 2147483647.0, 2147483648.0),
        (paddle.int16, 0, 32767.0, 32768.0),
        (paddle.uint8, 128, 127.0, 128.0),
    )
    for int_dtype, zero_offset, pos_scale, neg_scale in integer_formats:
        if tensor.dtype == int_dtype:
            scaled = paddle.cast(tensor, paddle.float32)
            if zero_offset:
                scaled = scaled - zero_offset
            scaled[scaled > 0] /= pos_scale
            scaled[scaled < 0] /= neg_scale
            return scaled
    return tensor
|
||||||
|
|
||||||
|
|
||||||
|
def get_wav_data(
    dtype: str,
    num_channels: int,
    *,
    num_frames: Optional[int] = None,
    normalize: bool = True,
    channels_first: bool = True,
):
    """Generate a linear ramp signal of the given dtype and num_channels.

    Supported dtypes and the range the ramp spans:
        [-1.0, 1.0] for float32 and float64,
        [-2147483648, 2147483647] for int32.

    int16, int8 and uint8 are not supported because ``paddle.linspace`` does
    not support those dtypes.

    Args:
        dtype: Name of the sample dtype, e.g. ``"float32"``.
        num_channels: Number of times the ramp is repeated, one per channel.
        num_frames: Number of points in the ramp. Defaults to ``1 << 16``.
        normalize: If true, pass the result through ``normalize_wav``.
        channels_first: If false, transpose the result so the channel axis
            comes last.

    Returns:
        The generated tensor.

    Raises:
        NotImplementedError: If ``dtype`` is not a supported dtype name.
    """
    ramp_ranges = {
        "float32": (-1.0, 1.0),
        "float64": (-1.0, 1.0),
        "int32": (-2147483648, 2147483647),
    }
    # Validate before touching paddle so every unsupported name fails the
    # same way, instead of an AttributeError from ``getattr`` for some.
    if dtype not in ramp_ranges:
        raise NotImplementedError(f"Unsupported dtype {dtype}")

    if num_frames is None:
        num_frames = 1 << 16

    start, stop = ramp_ranges[dtype]
    base = paddle.linspace(start, stop, num_frames, dtype=getattr(paddle, dtype))

    data = base.tile([num_channels, 1])
    if not channels_first:
        data = data.transpose([1, 0])
    if normalize:
        data = normalize_wav(data)
    return data
|
||||||
|
|
||||||
|
|
||||||
|
def load_wav(path: str, normalize: bool = True,
             channels_first: bool = True) -> Tuple[paddle.Tensor, int]:
    """Load wav file without paddleaudio.

    The file is read with ``scipy.io.wavfile.read`` and converted to a paddle
    tensor.

    Args:
        path: Path of the wav file to read.
        normalize: If true, pass the samples through ``normalize_wav``.
        channels_first: If true, transpose the two axes of the loaded data
            before returning it.

    Returns:
        A ``(data, sample_rate)`` tuple.
    """
    sample_rate, samples = scipy.io.wavfile.read(path)
    # Hand paddle a copy of the array rather than the one scipy returned.
    data = paddle.to_tensor(samples.copy())
    if data.ndim == 1:
        # Add a second axis so one-dimensional data can be transposed below.
        data = data.unsqueeze(1)
    if normalize:
        data = normalize_wav(data)
    if channels_first:
        data = data.transpose([1, 0])
    return data, sample_rate
|
||||||
|
|
||||||
|
|
||||||
|
def save_wav(path, data, sample_rate, channels_first=True):
    """Write a tensor to a wav file with scipy instead of paddleaudio.

    Args:
        path: Destination passed to ``scipy.io.wavfile.write``.
        data: Tensor to save; must provide ``transpose`` and ``numpy``.
        sample_rate: Sample rate written to the file.
        channels_first: If true, the two axes of ``data`` are swapped before
            writing.
    """
    to_write = data.transpose([1, 0]) if channels_first else data
    scipy.io.wavfile.write(path, sample_rate, to_write.numpy())
|
Loading…
Reference in new issue