fix logfbank using PCM16

pull/1012/head
Hui Zhang 3 years ago
parent d62092ac28
commit 8b0e344c69

@ -23,7 +23,3 @@ process:
n_mask: 2
inplace: true
replace_with_zero: true

@ -25,6 +25,8 @@ import soxbindings as sox
from scipy import signal
from .utility import subfile_from_tar
from .utility import convert_samples_to_float32
from .utility import convert_samples_from_float32
class AudioSegment():
@ -689,15 +691,7 @@ class AudioSegment():
Audio sample type is usually integer or float-point.
Integers will be scaled to [-1, 1] in float32.
"""
float32_samples = samples.astype('float32')
if samples.dtype in np.sctypes['int']:
bits = np.iinfo(samples.dtype).bits
float32_samples *= (1. / 2**(bits - 1))
elif samples.dtype in np.sctypes['float']:
pass
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return float32_samples
return convert_samples_to_float32(samples)
def _convert_samples_from_float32(self, samples, dtype):
"""Convert sample type from float32 to dtype.
@ -708,20 +702,4 @@ class AudioSegment():
This is for writing a audio file.
"""
dtype = np.dtype(dtype)
output_samples = samples.copy()
if dtype in np.sctypes['int']:
bits = np.iinfo(dtype).bits
output_samples *= (2**(bits - 1) / 1.)
min_val = np.iinfo(dtype).min
max_val = np.iinfo(dtype).max
output_samples[output_samples > max_val] = max_val
output_samples[output_samples < min_val] = min_val
elif samples.dtype in np.sctypes['float']:
min_val = np.finfo(dtype).min
max_val = np.finfo(dtype).max
output_samples[output_samples > max_val] = max_val
output_samples[output_samples < min_val] = min_val
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return output_samples.astype(dtype)
return convert_samples_from_float32(samples, dtype)

@ -30,7 +30,8 @@ logger = Log(__name__).getlog()
__all__ = [
"load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
"max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
"EOS", "UNK", "BLANK", "MASKCTC", "SPACE"
"EOS", "UNK", "BLANK", "MASKCTC", "SPACE", "convert_samples_to_float32",
"convert_samples_from_float32"
]
IGNORE_ID = -1
@ -342,3 +343,51 @@ def load_cmvn(cmvn_file: str, filetype: str):
else:
raise ValueError(f"cmvn file type no support: {filetype}")
return cmvn[0], cmvn[1]
def convert_samples_to_float32(samples):
    """Convert audio samples to float32.

    Signed-integer samples are multiplied by ``1 / 2**(bits - 1)`` so that
    the full integer range maps into ``[-1, 1)`` (e.g. int16 PCM is divided
    by 32768). Floating-point samples are only cast to float32; their values
    are not rescaled.

    Args:
        samples (np.ndarray): Audio samples with a signed-integer or
            floating-point dtype.

    Returns:
        np.ndarray: A new float32 array; ``samples`` itself is not modified.

    Raises:
        TypeError: If ``samples.dtype`` is neither a signed integer nor a
            floating-point type (e.g. unsigned integer, bool, complex).
    """
    src_dtype = samples.dtype
    # np.issubdtype is used instead of the np.sctypes lookup table, which
    # was removed in NumPy 2.0.
    is_signed_int = np.issubdtype(src_dtype, np.signedinteger)
    if not is_signed_int and not np.issubdtype(src_dtype, np.floating):
        raise TypeError("Unsupported sample type: %s." % src_dtype)

    float32_samples = samples.astype(np.float32)
    if is_signed_int:
        bits = np.iinfo(src_dtype).bits
        # Power-of-two scale factor: exact in float32 for 8/16-bit PCM.
        float32_samples *= 1.0 / 2**(bits - 1)
    return float32_samples
def convert_samples_from_float32(samples, dtype):
    """Convert floating-point audio samples to ``dtype``.

    For a signed-integer target, samples are multiplied by ``2**(bits - 1)``
    (so ``[-1, 1]`` maps onto the integer range), clipped to the target's
    ``[min, max]`` and then cast. For a floating-point target, samples are
    clipped to the target's finite range and cast without rescaling.

    Args:
        samples (np.ndarray): Floating-point audio samples, nominally in
            ``[-1, 1]``.
        dtype: Target sample type; anything accepted by ``np.dtype``
            (e.g. ``'int16'``, ``np.float32``).

    Returns:
        np.ndarray: A new array of type ``dtype``; ``samples`` is not
        modified.

    Raises:
        TypeError: If ``dtype`` is neither a signed integer nor a
            floating-point type.
    """
    dtype = np.dtype(dtype)
    # The branch is chosen by the *target* dtype. np.issubdtype replaces the
    # np.sctypes lookup table, which was removed in NumPy 2.0.
    if np.issubdtype(dtype, np.signedinteger):
        int_info = np.iinfo(dtype)
        # Scale and clip in float64: the int32 bounds are not exactly
        # representable in float32, so a full-scale +1.0 would escape the
        # clip and overflow on the final cast.
        # NOTE(review): the int64 maximum is not representable in float64
        # either, so full-scale positive input to an int64 target may still
        # overflow - confirm whether int64 output is ever needed.
        scaled = samples.astype(np.float64) * float(2**(int_info.bits - 1))
        clipped = np.clip(scaled, int_info.min, int_info.max)
    elif np.issubdtype(dtype, np.floating):
        float_info = np.finfo(dtype)
        # Clip before casting so out-of-range values saturate instead of
        # becoming +/-inf in a narrower float type.
        clipped = np.clip(samples, float_info.min, float_info.max)
    else:
        raise TypeError("Unsupported sample type: %s." % dtype)
    return clipped.astype(dtype)

@ -307,6 +307,9 @@ class IStft():
center=self.center, )
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
class LogMelSpectrogramKaldi():
def __init__(
self,
@ -346,7 +349,7 @@ class LogMelSpectrogramKaldi():
def __repr__(self):
return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
"n_shift={n_shift}, win_length={win_length}, window={window}, "
"fmin={fmin}, fmax={fmax}, eps={eps}))".format(
"fmin={fmin}, fmax={fmax}, eps={eps}, preemph={preemph}, window={window}, dither={dither}))".format(
name=self.__class__.__name__,
fs=self.fs,
n_mels=self.n_mels,
@ -356,7 +359,10 @@ class LogMelSpectrogramKaldi():
window=self.window,
fmin=self.fmin,
fmax=self.fmax,
eps=self.eps, ))
eps=self.eps,
preemph=self.preemph,
window=self.window,
dither=self.dither))
def __call__(self, x):
"""
@ -372,9 +378,16 @@ class LogMelSpectrogramKaldi():
"""
if x.ndim != 1:
raise ValueError("Not support x: [Time, Channel]")
if x.dtype == np.int16:
x = x / 2**(16 - 1)
return logfbank(
logger.info(f"in {x}")
if x.dtype in np.sctypes['float']:
# PCM32 -> PCM16
bits = np.iinfo(np.int16).bits
x = x * 2**(bits - 1)
logger.info(f"b {x}")
# logfbank need PCM16 input
y = logfbank(
signal=x,
samplerate=self.fs,
winlen=self.win_length, # unit ms
@ -387,3 +400,7 @@ class LogMelSpectrogramKaldi():
remove_dc_offset=self.remove_dc_offset,
preemph=self.preemph,
wintype=self.window)
logger.info(f"a {y}")
return y

Loading…
Cancel
Save