From 7ebe904e20b4f3147e8d82222649d105916360a3 Mon Sep 17 00:00:00 2001
From: WilliamZhang06
Date: Tue, 22 Feb 2022 17:51:46 +0800
Subject: [PATCH] fixed overload, test=doc

---
 paddlespeech/server/conf/asr/asr.yaml         |  2 +-
 paddlespeech/server/conf/asr/asr_pd.yaml      |  4 +-
 .../engine/asr/paddleinference/asr_engine.py  | 97 -------------------
 .../server/engine/asr/python/asr_engine.py    | 95 ------------------
 4 files changed, 3 insertions(+), 195 deletions(-)

diff --git a/paddlespeech/server/conf/asr/asr.yaml b/paddlespeech/server/conf/asr/asr.yaml
index c6f28bd6..50e55a3c 100644
--- a/paddlespeech/server/conf/asr/asr.yaml
+++ b/paddlespeech/server/conf/asr/asr.yaml
@@ -4,4 +4,4 @@ sample_rate: 16000
 cfg_path: # [optional]
 ckpt_path: # [optional]
 decode_method: 'attention_rescoring'
-force_yes: False
+force_yes: True

diff --git a/paddlespeech/server/conf/asr/asr_pd.yaml b/paddlespeech/server/conf/asr/asr_pd.yaml
index 2667a0f4..43a63f1b 100644
--- a/paddlespeech/server/conf/asr/asr_pd.yaml
+++ b/paddlespeech/server/conf/asr/asr_pd.yaml
@@ -6,13 +6,13 @@
 # am choices=['deepspeech2offline_aishell'] TODO
 ##################################################################
 model_type: 'deepspeech2offline_aishell'
-am_model: # the pdmodel file of am static model [optional]
+am_model: # the pdmodel file of am static model [optional]
 am_params: # the pdiparams file of am static model [optional]
 lang: 'zh'
 sample_rate: 16000
 cfg_path:
 decode_method:
-force_yes:
+force_yes: True
 
 am_predictor_conf:
     use_gpu: True

diff --git a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py
index 0a68e232..6d072322 100644
--- a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py
+++ b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py
@@ -60,108 +60,11 @@ pretrained_models = {
 }
 
 
-
 class ASRServerExecutor(ASRExecutor):
     def __init__(self):
         super().__init__()
         pass
 
-    def _check(self, audio_file: str, sample_rate: int, force_yes: bool):
-        self.sample_rate = sample_rate
-        if self.sample_rate != 16000 and self.sample_rate != 8000:
-            logger.error("please input --sr 8000 or --sr 16000")
-            return False
-
-        logger.info("checking the audio file format......")
-        try:
-            audio, audio_sample_rate = soundfile.read(
-                audio_file, dtype="int16", always_2d=True)
-        except Exception as e:
-            logger.exception(e)
-            logger.error(
-                "can not open the audio file, please check the audio file format is 'wav'. \n \
-                 you can try to use sox to change the file format.\n \
-                 For example: \n \
-                 sample rate: 16k \n \
-                 sox input_audio.xx --rate 16k --bits 16 --channels 1 output_audio.wav \n \
-                 sample rate: 8k \n \
-                 sox input_audio.xx --rate 8k --bits 16 --channels 1 output_audio.wav \n \
-                 ")
-
-        logger.info("The sample rate is %d" % audio_sample_rate)
-        if audio_sample_rate != self.sample_rate:
-            logger.warning("The sample rate of the input file is not {}.\n \
-                            The program will resample the wav file to {}.\n \
-                            If the result does not meet your expectations,\n \
-                            Please input the 16k 16 bit 1 channel wav file. \
-                            ".format(self.sample_rate, self.sample_rate))
-            self.change_format = True
-        else:
-            logger.info("The audio file format is right")
-            self.change_format = False
-
-        return True
-
-    def preprocess(self, model_type: str, input: Union[str, os.PathLike]):
-        """
-        Input preprocess and return paddle.Tensor stored in self.input.
-        Input content can be a text(tts), a file(asr, cls) or a streaming(not supported yet).
- """ - - audio_file = input - - # Get the object for feature extraction - if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: - audio, _ = self.collate_fn_test.process_utterance( - audio_file=audio_file, transcript=" ") - - audio_len = audio.shape[0] - audio = paddle.to_tensor(audio, dtype='float32') - audio_len = paddle.to_tensor(audio_len) - audio = paddle.unsqueeze(audio, axis=0) - - self._inputs["audio"] = audio - self._inputs["audio_len"] = audio_len - logger.info(f"audio feat shape: {audio.shape}") - - elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: - logger.info("get the preprocess conf") - preprocess_conf = self.config.preprocess_config - preprocess_args = {"train": False} - preprocessing = Transformation(preprocess_conf) - logger.info("read the audio file") - audio, audio_sample_rate = soundfile.read( - audio_file, dtype="int16", always_2d=True) - - if self.change_format: - if audio.shape[1] >= 2: - audio = audio.mean(axis=1, dtype=np.int16) - else: - audio = audio[:, 0] - # pcm16 -> pcm 32 - audio = self._pcm16to32(audio) - audio = librosa.resample(audio, audio_sample_rate, - self.sample_rate) - audio_sample_rate = self.sample_rate - # pcm32 -> pcm 16 - audio = self._pcm32to16(audio) - else: - audio = audio[:, 0] - - logger.info(f"audio shape: {audio.shape}") - # fbank - audio = preprocessing(audio, **preprocess_args) - - audio_len = paddle.to_tensor(audio.shape[0]) - audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0) - - self._inputs["audio"] = audio - self._inputs["audio_len"] = audio_len - logger.info(f"audio feat shape: {audio.shape}") - - else: - raise Exception("wrong type") - def _init_from_path(self, model_type: str='wenetspeech', am_model: Optional[os.PathLike]=None, diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py index 168ab237..fd67b029 100644 --- a/paddlespeech/server/engine/asr/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/python/asr_engine.py @@ -38,101 +38,6 @@ class ASRServerExecutor(ASRExecutor): super().__init__() pass - def _check(self, audio_file: str, sample_rate: int, force_yes: bool): - self.sample_rate = sample_rate - if self.sample_rate != 16000 and self.sample_rate != 8000: - logger.error("please input --sr 8000 or --sr 16000") - return False - - logger.info("checking the audio file format......") - try: - audio, audio_sample_rate = soundfile.read( - audio_file, dtype="int16", always_2d=True) - except Exception as e: - logger.exception(e) - logger.error( - "can not open the audio file, please check the audio file format is 'wav'. \n \ - you can try to use sox to change the file format.\n \ - For example: \n \ - sample rate: 16k \n \ - sox input_audio.xx --rate 16k --bits 16 --channels 1 output_audio.wav \n \ - sample rate: 8k \n \ - sox input_audio.xx --rate 8k --bits 16 --channels 1 output_audio.wav \n \ - ") - - logger.info("The sample rate is %d" % audio_sample_rate) - if audio_sample_rate != self.sample_rate: - logger.warning("The sample rate of the input file is not {}.\n \ - The program will resample the wav file to {}.\n \ - If the result does not meet your expectations,\n \ - Please input the 16k 16 bit 1 channel wav file. 
\ - ".format(self.sample_rate, self.sample_rate)) - self.change_format = True - else: - logger.info("The audio file format is right") - self.change_format = False - - return True - - def preprocess(self, model_type: str, input: Union[str, os.PathLike]): - """ - Input preprocess and return paddle.Tensor stored in self.input. - Input content can be a text(tts), a file(asr, cls) or a streaming(not supported yet). - """ - - audio_file = input - - # Get the object for feature extraction - if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: - audio, _ = self.collate_fn_test.process_utterance( - audio_file=audio_file, transcript=" ") - audio_len = audio.shape[0] - audio = paddle.to_tensor(audio, dtype='float32') - audio_len = paddle.to_tensor(audio_len) - audio = paddle.unsqueeze(audio, axis=0) - # vocab_list = collate_fn_test.vocab_list - self._inputs["audio"] = audio - self._inputs["audio_len"] = audio_len - logger.info(f"audio feat shape: {audio.shape}") - - elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: - logger.info("get the preprocess conf") - preprocess_conf = self.config.preprocess_config - preprocess_args = {"train": False} - preprocessing = Transformation(preprocess_conf) - logger.info("read the audio file") - audio, audio_sample_rate = soundfile.read( - audio_file, dtype="int16", always_2d=True) - - if self.change_format: - if audio.shape[1] >= 2: - audio = audio.mean(axis=1, dtype=np.int16) - else: - audio = audio[:, 0] - # pcm16 -> pcm 32 - audio = self._pcm16to32(audio) - audio = librosa.resample(audio, audio_sample_rate, - self.sample_rate) - audio_sample_rate = self.sample_rate - # pcm32 -> pcm 16 - audio = self._pcm32to16(audio) - else: - audio = audio[:, 0] - - logger.info(f"audio shape: {audio.shape}") - # fbank - audio = preprocessing(audio, **preprocess_args) - - audio_len = paddle.to_tensor(audio.shape[0]) - audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0) - - self._inputs["audio"] = audio - self._inputs["audio_len"] = audio_len - logger.info(f"audio feat shape: {audio.shape}") - - else: - raise Exception("wrong type") - class ASREngine(BaseEngine): """ASR server engine
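
With the duplicated _check/preprocess overrides deleted, ASRServerExecutor
resolves both methods to the implementations it inherits from the CLI's
ASRExecutor. A minimal sketch of that fallback (assuming ASRExecutor is
importable from paddlespeech.cli.asr.infer, as in the code this patch touches;
the asserts are illustrative and not part of the patch):

    from paddlespeech.cli.asr.infer import ASRExecutor
    from paddlespeech.server.engine.asr.python.asr_engine import ASRServerExecutor

    # With no local overrides left on the subclass, Python's method resolution
    # order falls through to the parent, so both names resolve to the very same
    # function objects as the CLI executor's.
    assert ASRServerExecutor._check is ASRExecutor._check
    assert ASRServerExecutor.preprocess is ASRExecutor.preprocess

Setting force_yes: True in both YAML configs complements the removal: when the
input sample rate differs from the configured one, the inherited _check only
resamples without asking for confirmation if force_yes is set, and a server
process has no interactive stdin to answer a prompt, which is presumably why
the defaults change here.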