diff --git a/examples/librispeech/s1/conf/preprocess.yaml b/examples/librispeech/s1/conf/preprocess.yaml
index bcbc7ad2..97ebf41d 100644
--- a/examples/librispeech/s1/conf/preprocess.yaml
+++ b/examples/librispeech/s1/conf/preprocess.yaml
@@ -23,7 +23,3 @@ process:
     n_mask: 2
     inplace: true
     replace_with_zero: true
-
-
-
-
diff --git a/paddlespeech/s2t/frontend/audio.py b/paddlespeech/s2t/frontend/audio.py
index 13dc3a44..4171f85b 100644
--- a/paddlespeech/s2t/frontend/audio.py
+++ b/paddlespeech/s2t/frontend/audio.py
@@ -25,6 +25,8 @@ import soxbindings as sox
 from scipy import signal
 
 from .utility import subfile_from_tar
+from .utility import convert_samples_to_float32
+from .utility import convert_samples_from_float32
 
 
 class AudioSegment():
@@ -689,15 +691,7 @@ class AudioSegment():
         Audio sample type is usually integer or float-point.
         Integers will be scaled to [-1, 1] in float32.
         """
-        float32_samples = samples.astype('float32')
-        if samples.dtype in np.sctypes['int']:
-            bits = np.iinfo(samples.dtype).bits
-            float32_samples *= (1. / 2**(bits - 1))
-        elif samples.dtype in np.sctypes['float']:
-            pass
-        else:
-            raise TypeError("Unsupported sample type: %s." % samples.dtype)
-        return float32_samples
+        return convert_samples_to_float32(samples)
 
     def _convert_samples_from_float32(self, samples, dtype):
         """Convert sample type from float32 to dtype.
@@ -708,20 +702,4 @@
 
         This is for writing a audio file.
         """
-        dtype = np.dtype(dtype)
-        output_samples = samples.copy()
-        if dtype in np.sctypes['int']:
-            bits = np.iinfo(dtype).bits
-            output_samples *= (2**(bits - 1) / 1.)
-            min_val = np.iinfo(dtype).min
-            max_val = np.iinfo(dtype).max
-            output_samples[output_samples > max_val] = max_val
-            output_samples[output_samples < min_val] = min_val
-        elif samples.dtype in np.sctypes['float']:
-            min_val = np.finfo(dtype).min
-            max_val = np.finfo(dtype).max
-            output_samples[output_samples > max_val] = max_val
-            output_samples[output_samples < min_val] = min_val
-        else:
-            raise TypeError("Unsupported sample type: %s." % samples.dtype)
-        return output_samples.astype(dtype)
+        return convert_samples_from_float32(samples, dtype)
diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py
index 089890d2..58e5b1b0 100644
--- a/paddlespeech/s2t/frontend/utility.py
+++ b/paddlespeech/s2t/frontend/utility.py
@@ -30,7 +30,8 @@ logger = Log(__name__).getlog()
 __all__ = [
     "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
     "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
-    "EOS", "UNK", "BLANK", "MASKCTC", "SPACE"
+    "EOS", "UNK", "BLANK", "MASKCTC", "SPACE", "convert_samples_to_float32",
+    "convert_samples_from_float32"
 ]
 
 IGNORE_ID = -1
@@ -342,3 +343,51 @@ def load_cmvn(cmvn_file: str, filetype: str):
     else:
         raise ValueError(f"cmvn file type no support: {filetype}")
     return cmvn[0], cmvn[1]
+
+
+def convert_samples_to_float32(samples):
+    """Convert sample type to float32.
+
+    Audio sample type is usually integer or float-point.
+    Integers will be scaled to [-1, 1] in float32.
+
+    PCM16 -> PCM32
+    """
+    float32_samples = samples.astype('float32')
+    if samples.dtype in np.sctypes['int']:
+        bits = np.iinfo(samples.dtype).bits
+        float32_samples *= (1. / 2**(bits - 1))
+    elif samples.dtype in np.sctypes['float']:
+        pass
+    else:
+        raise TypeError("Unsupported sample type: %s." % samples.dtype)
+    return float32_samples
+
+
+def convert_samples_from_float32(samples, dtype):
+    """Convert sample type from float32 to dtype.
+
+    Audio sample type is usually integer or float-point. For integer
+    type, float32 will be rescaled from [-1, 1] to the maximum range
+    supported by the integer type.
+
+    PCM32 -> PCM16
+    """
+    dtype = np.dtype(dtype)
+    output_samples = samples.copy()
+    if dtype in np.sctypes['int']:
+        bits = np.iinfo(dtype).bits
+        output_samples *= (2**(bits - 1) / 1.)
+        min_val = np.iinfo(dtype).min
+        max_val = np.iinfo(dtype).max
+        output_samples[output_samples > max_val] = max_val
+        output_samples[output_samples < min_val] = min_val
+    elif samples.dtype in np.sctypes['float']:
+        min_val = np.finfo(dtype).min
+        max_val = np.finfo(dtype).max
+        output_samples[output_samples > max_val] = max_val
+        output_samples[output_samples < min_val] = min_val
+    else:
+        raise TypeError("Unsupported sample type: %s." % samples.dtype)
+    return output_samples.astype(dtype)
+
diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py
index 6956b908..9e576d0d 100644
--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@@ -307,6 +307,9 @@ class IStft():
             center=self.center, )
 
 
+from paddlespeech.s2t.utils.log import Log
+logger = Log(__name__).getlog()
+
 class LogMelSpectrogramKaldi():
     def __init__(
             self,
@@ -346,7 +349,7 @@
     def __repr__(self):
         return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
                 "n_shift={n_shift}, win_length={win_length}, window={window}, "
-                "fmin={fmin}, fmax={fmax}, eps={eps}))".format(
+                "fmin={fmin}, fmax={fmax}, eps={eps}, preemph={preemph}, dither={dither}))".format(
                     name=self.__class__.__name__,
                     fs=self.fs,
                     n_mels=self.n_mels,
@@ -356,7 +359,10 @@
                     window=self.window,
                     fmin=self.fmin,
                     fmax=self.fmax,
-                    eps=self.eps, ))
+                    eps=self.eps,
+                    preemph=self.preemph,
+                    dither=self.dither,
+                    ))
 
     def __call__(self, x):
         """
@@ -372,9 +378,16 @@
         """
         if x.ndim != 1:
             raise ValueError("Not support x: [Time, Channel]")
-        if x.dtype == np.int16:
-            x = x / 2**(16 - 1)
-        return logfbank(
+
+        logger.info(f"logfbank input: {x}")
+        if x.dtype in np.sctypes['float']:
+            # float32 in [-1, 1] -> int16 (PCM16) value range
+            bits = np.iinfo(np.int16).bits
+            x = x * 2**(bits - 1)
+            logger.info(f"scaled to int16 range: {x}")
+
+        # logfbank needs int16-range (PCM16) input
+        y = logfbank(
             signal=x,
             samplerate=self.fs,
             winlen=self.win_length, # unit ms
@@ -387,3 +400,7 @@
             remove_dc_offset=self.remove_dc_offset,
             preemph=self.preemph,
             wintype=self.window)
+        logger.info(f"logfbank output: {y}")
+
+
+        return y
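
Note on the sample-format handling above: convert_samples_to_float32 keeps audio as float32 in [-1, 1], and LogMelSpectrogramKaldi.__call__ now scales such float input back up by 2**15 because logfbank works on int16-range (PCM16) values. Below is a minimal NumPy-only sketch of those two scalings, mirroring the patch; the helper names to_float32 and to_int16_range and the sample values are illustrative only and not part of the patch.

import numpy as np

def to_float32(samples):
    # Mirrors convert_samples_to_float32: integer PCM -> float32 in [-1, 1).
    out = samples.astype('float32')
    if np.issubdtype(samples.dtype, np.integer):
        bits = np.iinfo(samples.dtype).bits
        out *= 1.0 / 2**(bits - 1)
    return out

def to_int16_range(samples):
    # Mirrors the new scaling in LogMelSpectrogramKaldi.__call__:
    # float32 in [-1, 1] -> int16 (PCM16) value range, still stored as float.
    if np.issubdtype(samples.dtype, np.floating):
        samples = samples * 2**(np.iinfo(np.int16).bits - 1)
    return samples

pcm16 = np.array([-32768, -1, 0, 1, 32767], dtype=np.int16)
f32 = to_float32(pcm16)       # what the frontend now carries
back = to_int16_range(f32)    # what logfbank actually consumes
print(f32)   # [-1.0, ~-3.05e-05, 0.0, ~3.05e-05, ~0.99997]
print(back)  # approximately the original int16 values, still float32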