diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 66a2f169f..48772997a 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -225,19 +225,16 @@ class ASRExecutor(BaseExecutor): if self.change_format: if audio.shape[1] >= 2: - audio = audio.mean(axis=1) + audio = audio.mean(axis=1, dtype=np.int16) else: audio = audio[:, 0] # pcm16 -> pcm 32 - audio = audio.astype("float32") - bits = np.iinfo(np.int16).bits - audio = audio / (2**(bits - 1)) + audio = self._pcm16to32(audio) audio = librosa.resample(audio, audio_sample_rate, self.sample_rate) audio_sample_rate = self.sample_rate # pcm32 -> pcm 16 - audio = audio * (2**(bits - 1)) - audio = np.round(audio).astype("int16") + audio = self._pcm32to16(audio) else: audio = audio[:, 0] @@ -312,6 +309,20 @@ class ASRExecutor(BaseExecutor): """ return self._outputs["result"] + def _pcm16to32(self, audio): + assert(audio.dtype == np.int16) + audio = audio.astype("float32") + bits = np.iinfo(np.int16).bits + audio = audio / (2**(bits - 1)) + return audio + + def _pcm32to16(self, audio): + assert(audio.dtype == np.float32) + bits = np.iinfo(np.int16).bits + audio = audio * (2**(bits - 1)) + audio = np.round(audio).astype("int16") + return audio + def _check(self, audio_file: str, sample_rate: int): self.sample_rate = sample_rate if self.sample_rate != 16000 and self.sample_rate != 8000: