fix logfbank using PCM16

4 years ago · 8b0e344c69
parent d62092ac28
commit 8b0e344c69
4 changed files with 76 additions and 36 deletions
--- a/examples/librispeech/s1/conf/preprocess.yaml
+++ b/examples/librispeech/s1/conf/preprocess.yaml
@ -23,7 +23,3 @@ process:
    n_mask: 2
    inplace: true
    replace_with_zero: true
--- a/paddlespeech/s2t/frontend/audio.py
+++ b/paddlespeech/s2t/frontend/audio.py
@ -25,6 +25,8 @@ import soxbindings as sox
 from scipy import signal
 from .utility import subfile_from_tar
 from .utility import convert_samples_to_float32
 from .utility import convert_samples_from_float32
 class AudioSegment():
@ -689,15 +691,7 @@ class AudioSegment():
        Audio sample type is usually integer or float-point.
        Integers will be scaled to [-1, 1] in float32.
        """
-        float32_samples = samples.astype('float32')
+        return convert_samples_to_float32(samples)
        if samples.dtype in np.sctypes['int']:
            bits = np.iinfo(samples.dtype).bits
            float32_samples *= (1. / 2**(bits - 1))
        elif samples.dtype in np.sctypes['float']:
            pass
        else:
            raise TypeError("Unsupported sample type: %s." % samples.dtype)
        return float32_samples
    def _convert_samples_from_float32(self, samples, dtype):
        """Convert sample type from float32 to dtype.
@ -708,20 +702,4 @@ class AudioSegment():
        This is for writing a audio file.
        """
-        dtype = np.dtype(dtype)
+        return convert_samples_from_float32(samples, dtype)
        output_samples = samples.copy()
        if dtype in np.sctypes['int']:
            bits = np.iinfo(dtype).bits
            output_samples *= (2**(bits - 1) / 1.)
            min_val = np.iinfo(dtype).min
            max_val = np.iinfo(dtype).max
            output_samples[output_samples > max_val] = max_val
            output_samples[output_samples < min_val] = min_val
        elif samples.dtype in np.sctypes['float']:
            min_val = np.finfo(dtype).min
            max_val = np.finfo(dtype).max
            output_samples[output_samples > max_val] = max_val
            output_samples[output_samples < min_val] = min_val
        else:
            raise TypeError("Unsupported sample type: %s." % samples.dtype)
        return output_samples.astype(dtype)
--- a/paddlespeech/s2t/frontend/utility.py
+++ b/paddlespeech/s2t/frontend/utility.py
@ -30,7 +30,8 @@ logger = Log(__name__).getlog()
 __all__ = [
    "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
    "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
-    "EOS", "UNK", "BLANK", "MASKCTC", "SPACE"
+    "EOS", "UNK", "BLANK", "MASKCTC", "SPACE", "convert_samples_to_float32",
    "convert_samples_from_float32"
 ]
 IGNORE_ID = -1
@ -342,3 +343,51 @@ def load_cmvn(cmvn_file: str, filetype: str):
    else:
        raise ValueError(f"cmvn file type no support: {filetype}")
    return cmvn[0], cmvn[1]
 def convert_samples_to_float32(samples):
    """Convert audio samples to float32.

    Signed-integer samples are multiplied by ``1 / 2**(bits - 1)``, which
    maps the full integer range onto [-1, 1]. Floating-point samples are
    only cast to float32; they are not rescaled.

    Args:
        samples (np.ndarray): Samples with a signed-integer or
            floating-point dtype.

    Returns:
        np.ndarray: A new float32 array; ``samples`` is left untouched.

    Raises:
        TypeError: If the dtype of ``samples`` is neither a signed integer
            nor a floating-point type.
    """
    src_dtype = samples.dtype
    # np.issubdtype is used instead of the `np.sctypes` table, which was
    # removed in NumPy 2.0.
    is_signed_int = np.issubdtype(src_dtype, np.signedinteger)
    if not is_signed_int and not np.issubdtype(src_dtype, np.floating):
        # Validate before allocating the float32 copy.
        raise TypeError("Unsupported sample type: %s." % src_dtype)

    float32_samples = samples.astype('float32')
    if is_signed_int:
        bits = np.iinfo(src_dtype).bits
        float32_samples *= (1. / 2**(bits - 1))
    return float32_samples
 def convert_samples_from_float32(samples, dtype):
    """Convert floating-point samples to ``dtype``.

    For a signed-integer target the samples are multiplied by
    ``2**(bits - 1)``, saturated to the range of the integer type and then
    truncated by the cast. For a floating-point target the samples are
    clipped to the finite range of that type and cast.

    Args:
        samples (np.ndarray): Input samples; for integer targets they are
            expected to lie in [-1, 1].
        dtype: Target sample type, anything accepted by ``np.dtype``.

    Returns:
        np.ndarray: A new array of type ``dtype``; ``samples`` is left
        untouched.

    Raises:
        TypeError: If ``dtype`` is neither a signed integer nor a
            floating-point type.
    """
    dtype = np.dtype(dtype)
    # The branch is chosen from the *target* dtype; np.issubdtype replaces the
    # `np.sctypes` table, which was removed in NumPy 2.0.
    if np.issubdtype(dtype, np.signedinteger):
        info = np.iinfo(dtype)
        full_scale = 2.0**(info.bits - 1)
        # Scale in float64: float32 cannot represent the int32 maximum, so a
        # float32 comparison would let full-scale samples overflow on cast.
        scaled = np.asarray(samples, dtype=np.float64) * full_scale
        # The largest float below full scale truncates to the integer maximum
        # (or stays in range for int64) when cast.
        upper = np.nextafter(full_scale, 0.0)
        clipped = np.clip(scaled, float(info.min), upper)
    elif np.issubdtype(dtype, np.floating):
        info = np.finfo(dtype)
        clipped = np.clip(np.asarray(samples), info.min, info.max)
    else:
        raise TypeError("Unsupported sample type: %s." % dtype)
    return clipped.astype(dtype)
--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@ -307,6 +307,9 @@ class IStft():
            center=self.center, )
 from paddlespeech.s2t.utils.log import Log
 logger = Log(__name__).getlog()
 class LogMelSpectrogramKaldi():
    def __init__(
            self,
@ -346,7 +349,7 @@ class LogMelSpectrogramKaldi():
    def __repr__(self):
        return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
                "n_shift={n_shift}, win_length={win_length}, window={window}, "
-                "fmin={fmin}, fmax={fmax}, eps={eps}))".format(
+                "fmin={fmin}, fmax={fmax}, eps={eps}, preemph={preemph}, window={window}, dither={dither}))".format(
                    name=self.__class__.__name__,
                    fs=self.fs,
                    n_mels=self.n_mels,
@ -356,7 +359,10 @@ class LogMelSpectrogramKaldi():
                    window=self.window,
                    fmin=self.fmin,
                    fmax=self.fmax,
-                    eps=self.eps, ))
+                    eps=self.eps, 
                    preemph=self.preemph,
                    window=self.window,
                    dither=self.dither))
    def __call__(self, x):
        """
@ -372,9 +378,16 @@ class LogMelSpectrogramKaldi():
        """
        if x.ndim != 1:
            raise ValueError("Not support x: [Time, Channel]")
-        if x.dtype == np.int16:
+
-            x = x / 2**(16 - 1)
+        logger.info(f"in {x}")
-        return logfbank(
+        if x.dtype in np.sctypes['float']:
            # PCM32 -> PCM16
            bits = np.iinfo(np.int16).bits
            x = x * 2**(bits - 1)
        logger.info(f"b {x}")
        # logfbank needs PCM16 input
        y = logfbank(
            signal=x,
            samplerate=self.fs,
            winlen=self.win_length,  # unit ms
@ -387,3 +400,7 @@ class LogMelSpectrogramKaldi():
            remove_dc_offset=self.remove_dc_offset,
            preemph=self.preemph,
            wintype=self.window)
        logger.info(f"a {y}")
        return y