diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index 07c676e5..ade8cdd6 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -382,7 +382,7 @@ class TTSExecutor(BaseExecutor):
                     text, merge_sentences=merge_sentences)
                 phone_ids = input_ids["phone_ids"]
             else:
-                print("lang should in {'zh', 'en'}!")
+                logger.error("lang should be in {'zh', 'en'}!")
         self.frontend_time = time.time() - frontend_st
 
         self.am_time = 0
diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py
index fb521b30..bd1186df 100644
--- a/paddlespeech/server/bin/paddlespeech_client.py
+++ b/paddlespeech/server/bin/paddlespeech_client.py
@@ -123,7 +123,6 @@ class TTSClientExecutor(BaseExecutor):
             time_end = time.time()
             time_consume = time_end - time_start
             response_dict = res.json()
-            logger.info(response_dict["message"])
             logger.info("Save synthesized audio successfully on %s." % (output))
             logger.info("Audio duration: %f s." %
                         (response_dict['result']['duration']))
@@ -702,7 +701,6 @@ class VectorClientExecutor(BaseExecutor):
                 test_audio=args.test,
                 task=task)
             time_end = time.time()
-            logger.info(f"The vector: {res}")
             logger.info("Response time %f s." % (time_end - time_start))
             return True
         except Exception as e:
diff --git a/paddlespeech/server/engine/acs/python/acs_engine.py b/paddlespeech/server/engine/acs/python/acs_engine.py
index 930101ac..63964a82 100644
--- a/paddlespeech/server/engine/acs/python/acs_engine.py
+++ b/paddlespeech/server/engine/acs/python/acs_engine.py
@@ -30,7 +30,7 @@ class ACSEngine(BaseEngine):
         """The ACSEngine Engine
         """
         super(ACSEngine, self).__init__()
-        logger.info("Create the ACSEngine Instance")
+        logger.debug("Create the ACSEngine Instance")
         self.word_list = []
 
     def init(self, config: dict):
@@ -42,7 +42,7 @@ class ACSEngine(BaseEngine):
         Returns:
             bool: The engine instance flag
         """
-        logger.info("Init the acs engine")
+        logger.debug("Init the acs engine")
         try:
             self.config = config
             self.device = self.config.get("device", paddle.get_device())
@@ -50,7 +50,7 @@ class ACSEngine(BaseEngine):
             # websocket default ping timeout is 20 seconds
             self.ping_timeout = self.config.get("ping_timeout", 20)
             paddle.set_device(self.device)
-            logger.info(f"ACS Engine set the device: {self.device}")
+            logger.debug(f"ACS Engine set the device: {self.device}")
 
         except BaseException as e:
             logger.error(
@@ -66,7 +66,9 @@ class ACSEngine(BaseEngine):
         self.url = "ws://" + self.config.asr_server_ip + ":" + str(
             self.config.asr_server_port) + "/paddlespeech/asr/streaming"
 
-        logger.info("Init the acs engine successfully")
+        logger.info("Initialize ACS server engine successfully on device: %s." %
+                    (self.device))
+
         return True
 
     def read_search_words(self):
@@ -95,12 +97,12 @@ class ACSEngine(BaseEngine):
         Returns:
             _type_: _description_
         """
-        logger.info("send a message to the server")
+        logger.debug("send a message to the server")
         if self.url is None:
             logger.error("No asr server, please input valid ip and port")
             return ""
         ws = websocket.WebSocket()
-        logger.info(f"set the ping timeout: {self.ping_timeout} seconds")
+        logger.debug(f"set the ping timeout: {self.ping_timeout} seconds")
         ws.connect(self.url, ping_timeout=self.ping_timeout)
         audio_info = json.dumps(
             {
@@ -123,7 +125,7 @@ class ACSEngine(BaseEngine):
         logger.info(f"audio result: {msg}")
 
         # 3. send chunk audio data to engine
-        logger.info("send the end signal")
+        logger.debug("send the end signal")
         audio_info = json.dumps(
             {
                 "name": "test.wav",
@@ -197,7 +199,7 @@ class ACSEngine(BaseEngine):
             start = max(time_stamp[m.start(0)]['bg'] - offset, 0)
 
             end = min(time_stamp[m.end(0) - 1]['ed'] + offset, max_ed)
-            logger.info(f'start: {start}, end: {end}')
+            logger.debug(f'start: {start}, end: {end}')
             acs_result.append({'w': w, 'bg': start, 'ed': end})
 
         return acs_result, asr_result
@@ -212,7 +214,7 @@ class ACSEngine(BaseEngine):
         Returns:
             acs_result, asr_result: the acs result and the asr result
         """
-        logger.info("start to process the audio content search")
+        logger.debug("start to process the audio content search")
         msg = self.get_asr_content(io.BytesIO(audio_data))
 
         acs_result, asr_result = self.get_macthed_word(msg)
diff --git a/paddlespeech/server/engine/asr/online/onnx/asr_engine.py b/paddlespeech/server/engine/asr/online/onnx/asr_engine.py
index 06793164..9ec7bb5a 100644
--- a/paddlespeech/server/engine/asr/online/onnx/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/onnx/asr_engine.py
@@ -44,7 +44,7 @@ class PaddleASRConnectionHanddler:
             asr_engine (ASREngine): the global asr engine
         """
         super().__init__()
-        logger.info(
+        logger.debug(
            "create an paddle asr connection handler to process the websocket connection"
        )
         self.config = asr_engine.config  # server config
@@ -152,12 +152,12 @@ class PaddleASRConnectionHanddler:
             self.output_reset()
 
     def extract_feat(self, samples: ByteString):
-        logger.info("Online ASR extract the feat")
+        logger.debug("Online ASR extract the feat")
         samples = np.frombuffer(samples, dtype=np.int16)
         assert samples.ndim == 1
 
         self.num_samples += samples.shape[0]
-        logger.info(
+        logger.debug(
             f"This package receive {samples.shape[0]} pcm data. Global samples:{self.num_samples}"
         )
@@ -168,7 +168,7 @@ class PaddleASRConnectionHanddler:
         else:
             assert self.remained_wav.ndim == 1  # (T,)
             self.remained_wav = np.concatenate([self.remained_wav, samples])
-        logger.info(
+        logger.debug(
             f"The concatenation of remain and now audio samples length is: {self.remained_wav.shape}"
         )
@@ -202,14 +202,14 @@ class PaddleASRConnectionHanddler:
         # update remained wav
         self.remained_wav = self.remained_wav[self.n_shift * num_frames:]
 
-        logger.info(
+        logger.debug(
             f"process the audio feature success, the cached feat shape: {self.cached_feat.shape}"
         )
-        logger.info(
+        logger.debug(
             f"After extract feat, the cached remain the audio samples: {self.remained_wav.shape}"
         )
-        logger.info(f"global samples: {self.num_samples}")
-        logger.info(f"global frames: {self.num_frames}")
+        logger.debug(f"global samples: {self.num_samples}")
+        logger.debug(f"global frames: {self.num_frames}")
 
     def decode(self, is_finished=False):
         """advance decoding
@@ -237,7 +237,7 @@ class PaddleASRConnectionHanddler:
                 return
 
             num_frames = self.cached_feat.shape[1]
-            logger.info(
+            logger.debug(
                 f"Required decoding window {decoding_window} frames, and the connection has {num_frames} frames"
             )
@@ -355,7 +355,7 @@ class ASRServerExecutor(ASRExecutor):
                 lm_url = self.task_resource.res_dict['lm_url']
                 lm_md5 = self.task_resource.res_dict['lm_md5']
-                logger.info(f"Start to load language model {lm_url}")
+                logger.debug(f"Start to load language model {lm_url}")
                 self.download_lm(
                     lm_url,
                     os.path.dirname(self.config.decode.lang_model_path), lm_md5)
@@ -367,7 +367,7 @@ class ASRServerExecutor(ASRExecutor):
     def init_model(self) -> None:
         if "deepspeech2" in self.model_type:
             # AM predictor
-            logger.info("ASR engine start to init the am predictor")
+            logger.debug("ASR engine start to init the am predictor")
             self.am_predictor = onnx_infer.get_sess(
                 model_path=self.am_model, sess_conf=self.am_predictor_conf)
         else:
@@ -400,7 +400,7 @@ class ASRServerExecutor(ASRExecutor):
         self.num_decoding_left_chunks = num_decoding_left_chunks
         # conf for paddleinference predictor or onnx
         self.am_predictor_conf = am_predictor_conf
-        logger.info(f"model_type: {self.model_type}")
+        logger.debug(f"model_type: {self.model_type}")
 
         sample_rate_str = '16k' if sample_rate == 16000 else '8k'
         tag = model_type + '-' + lang + '-' + sample_rate_str
@@ -422,12 +422,11 @@ class ASRServerExecutor(ASRExecutor):
         #     self.res_path, self.task_resource.res_dict[
         #         'params']) if am_params is None else os.path.abspath(am_params)
 
-        logger.info("Load the pretrained model:")
-        logger.info(f"  tag = {tag}")
-        logger.info(f"  res_path: {self.res_path}")
-        logger.info(f"  cfg path: {self.cfg_path}")
-        logger.info(f"  am_model path: {self.am_model}")
-        # logger.info(f"  am_params path: {self.am_params}")
+        logger.debug("Load the pretrained model:")
+        logger.debug(f"  tag = {tag}")
+        logger.debug(f"  res_path: {self.res_path}")
+        logger.debug(f"  cfg path: {self.cfg_path}")
+        logger.debug(f"  am_model path: {self.am_model}")
 
         #Init body.
         self.config = CfgNode(new_allowed=True)
@@ -436,7 +435,7 @@ class ASRServerExecutor(ASRExecutor):
         if self.config.spm_model_prefix:
             self.config.spm_model_prefix = os.path.join(
                 self.res_path, self.config.spm_model_prefix)
-            logger.info(f"spm model path: {self.config.spm_model_prefix}")
+            logger.debug(f"spm model path: {self.config.spm_model_prefix}")
 
         self.vocab = self.config.vocab_filepath
@@ -450,7 +449,7 @@ class ASRServerExecutor(ASRExecutor):
 
         # AM predictor
         self.init_model()
-        logger.info(f"create the {model_type} model success")
+        logger.debug(f"create the {model_type} model success")
 
         return True
@@ -501,7 +500,7 @@ class ASREngine(BaseEngine):
                 "If all GPU or XPU is used, you can set the server to 'cpu'")
             sys.exit(-1)
 
-        logger.info(f"paddlespeech_server set the device: {self.device}")
+        logger.debug(f"paddlespeech_server set the device: {self.device}")
 
         if not self.init_model():
             logger.error(
@@ -509,7 +508,8 @@ class ASREngine(BaseEngine):
             )
             return False
 
-        logger.info("Initialize ASR server engine successfully.")
+        logger.info("Initialize ASR server engine successfully on device: %s." %
+                    (self.device))
         return True
 
     def new_handler(self):
diff --git a/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py
index efb726aa..1f0a8b80 100644
--- a/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py
@@ -44,7 +44,7 @@ class PaddleASRConnectionHanddler:
             asr_engine (ASREngine): the global asr engine
         """
         super().__init__()
-        logger.info(
+        logger.debug(
            "create an paddle asr connection handler to process the websocket connection"
        )
         self.config = asr_engine.config  # server config
@@ -157,7 +157,7 @@ class PaddleASRConnectionHanddler:
         assert samples.ndim == 1
 
         self.num_samples += samples.shape[0]
-        logger.info(
+        logger.debug(
             f"This package receive {samples.shape[0]} pcm data. Global samples:{self.num_samples}"
         )
@@ -168,7 +168,7 @@ class PaddleASRConnectionHanddler:
         else:
             assert self.remained_wav.ndim == 1  # (T,)
             self.remained_wav = np.concatenate([self.remained_wav, samples])
-        logger.info(
+        logger.debug(
             f"The concatenation of remain and now audio samples length is: {self.remained_wav.shape}"
         )
@@ -202,14 +202,14 @@ class PaddleASRConnectionHanddler:
         # update remained wav
         self.remained_wav = self.remained_wav[self.n_shift * num_frames:]
 
-        logger.info(
+        logger.debug(
             f"process the audio feature success, the cached feat shape: {self.cached_feat.shape}"
         )
-        logger.info(
+        logger.debug(
             f"After extract feat, the cached remain the audio samples: {self.remained_wav.shape}"
         )
-        logger.info(f"global samples: {self.num_samples}")
-        logger.info(f"global frames: {self.num_frames}")
+        logger.debug(f"global samples: {self.num_samples}")
+        logger.debug(f"global frames: {self.num_frames}")
 
     def decode(self, is_finished=False):
         """advance decoding
@@ -237,13 +237,13 @@ class PaddleASRConnectionHanddler:
                 return
 
             num_frames = self.cached_feat.shape[1]
-            logger.info(
+            logger.debug(
                 f"Required decoding window {decoding_window} frames, and the connection has {num_frames} frames"
             )
 
             # the cached feat must be larger decoding_window
             if num_frames < decoding_window and not is_finished:
-                logger.info(
+                logger.debug(
                     f"frame feat num is less than {decoding_window}, please input more pcm data"
                 )
                 return None, None
@@ -294,7 +294,7 @@ class PaddleASRConnectionHanddler:
         Returns:
             logprob: poster probability.
""" - logger.info("start to decoce one chunk for deepspeech2") + logger.debug("start to decoce one chunk for deepspeech2") input_names = self.am_predictor.get_input_names() audio_handle = self.am_predictor.get_input_handle(input_names[0]) audio_len_handle = self.am_predictor.get_input_handle(input_names[1]) @@ -369,7 +369,7 @@ class ASRServerExecutor(ASRExecutor): lm_url = self.task_resource.res_dict['lm_url'] lm_md5 = self.task_resource.res_dict['lm_md5'] - logger.info(f"Start to load language model {lm_url}") + logger.debug(f"Start to load language model {lm_url}") self.download_lm( lm_url, os.path.dirname(self.config.decode.lang_model_path), lm_md5) @@ -381,7 +381,7 @@ class ASRServerExecutor(ASRExecutor): if "deepspeech2" in self.model_type: # AM predictor - logger.info("ASR engine start to init the am predictor") + logger.debug("ASR engine start to init the am predictor") self.am_predictor = init_predictor( model_file=self.am_model, params_file=self.am_params, @@ -415,7 +415,7 @@ class ASRServerExecutor(ASRExecutor): self.num_decoding_left_chunks = num_decoding_left_chunks # conf for paddleinference predictor or onnx self.am_predictor_conf = am_predictor_conf - logger.info(f"model_type: {self.model_type}") + logger.debug(f"model_type: {self.model_type}") sample_rate_str = '16k' if sample_rate == 16000 else '8k' tag = model_type + '-' + lang + '-' + sample_rate_str @@ -437,12 +437,12 @@ class ASRServerExecutor(ASRExecutor): self.res_path = os.path.dirname( os.path.dirname(os.path.abspath(self.cfg_path))) - logger.info("Load the pretrained model:") - logger.info(f" tag = {tag}") - logger.info(f" res_path: {self.res_path}") - logger.info(f" cfg path: {self.cfg_path}") - logger.info(f" am_model path: {self.am_model}") - logger.info(f" am_params path: {self.am_params}") + logger.debug("Load the pretrained model:") + logger.debug(f" tag = {tag}") + logger.debug(f" res_path: {self.res_path}") + logger.debug(f" cfg path: {self.cfg_path}") + logger.debug(f" am_model path: {self.am_model}") + logger.debug(f" am_params path: {self.am_params}") #Init body. self.config = CfgNode(new_allowed=True) @@ -451,7 +451,7 @@ class ASRServerExecutor(ASRExecutor): if self.config.spm_model_prefix: self.config.spm_model_prefix = os.path.join( self.res_path, self.config.spm_model_prefix) - logger.info(f"spm model path: {self.config.spm_model_prefix}") + logger.debug(f"spm model path: {self.config.spm_model_prefix}") self.vocab = self.config.vocab_filepath @@ -465,7 +465,7 @@ class ASRServerExecutor(ASRExecutor): # AM predictor self.init_model() - logger.info(f"create the {model_type} model success") + logger.debug(f"create the {model_type} model success") return True @@ -516,7 +516,7 @@ class ASREngine(BaseEngine): "If all GPU or XPU is used, you can set the server to 'cpu'") sys.exit(-1) - logger.info(f"paddlespeech_server set the device: {self.device}") + logger.debug(f"paddlespeech_server set the device: {self.device}") if not self.init_model(): logger.error( @@ -524,7 +524,9 @@ class ASREngine(BaseEngine): ) return False - logger.info("Initialize ASR server engine successfully.") + logger.info("Initialize ASR server engine successfully on device: %s." 
% + (self.device)) + return True def new_handler(self): diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index daa9fc50..b3472a83 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -49,7 +49,7 @@ class PaddleASRConnectionHanddler: asr_engine (ASREngine): the global asr engine """ super().__init__() - logger.info( + logger.debug( "create an paddle asr connection handler to process the websocket connection" ) self.config = asr_engine.config # server config @@ -107,7 +107,7 @@ class PaddleASRConnectionHanddler: # acoustic model self.model = self.asr_engine.executor.model self.continuous_decoding = self.config.continuous_decoding - logger.info(f"continue decoding: {self.continuous_decoding}") + logger.debug(f"continue decoding: {self.continuous_decoding}") # ctc decoding config self.ctc_decode_config = self.asr_engine.executor.config.decode @@ -207,7 +207,7 @@ class PaddleASRConnectionHanddler: assert samples.ndim == 1 self.num_samples += samples.shape[0] - logger.info( + logger.debug( f"This package receive {samples.shape[0]} pcm data. Global samples:{self.num_samples}" ) @@ -218,7 +218,7 @@ class PaddleASRConnectionHanddler: else: assert self.remained_wav.ndim == 1 # (T,) self.remained_wav = np.concatenate([self.remained_wav, samples]) - logger.info( + logger.debug( f"The concatenation of remain and now audio samples length is: {self.remained_wav.shape}" ) @@ -252,14 +252,14 @@ class PaddleASRConnectionHanddler: # update remained wav self.remained_wav = self.remained_wav[self.n_shift * num_frames:] - logger.info( + logger.debug( f"process the audio feature success, the cached feat shape: {self.cached_feat.shape}" ) - logger.info( + logger.debug( f"After extract feat, the cached remain the audio samples: {self.remained_wav.shape}" ) - logger.info(f"global samples: {self.num_samples}") - logger.info(f"global frames: {self.num_frames}") + logger.debug(f"global samples: {self.num_samples}") + logger.debug(f"global frames: {self.num_frames}") def decode(self, is_finished=False): """advance decoding @@ -283,24 +283,24 @@ class PaddleASRConnectionHanddler: stride = subsampling * decoding_chunk_size if self.cached_feat is None: - logger.info("no audio feat, please input more pcm data") + logger.debug("no audio feat, please input more pcm data") return num_frames = self.cached_feat.shape[1] - logger.info( + logger.debug( f"Required decoding window {decoding_window} frames, and the connection has {num_frames} frames" ) # the cached feat must be larger decoding_window if num_frames < decoding_window and not is_finished: - logger.info( + logger.debug( f"frame feat num is less than {decoding_window}, please input more pcm data" ) return None, None # if is_finished=True, we need at least context frames if num_frames < context: - logger.info( + logger.debug( "flast {num_frames} is less than context {context} frames, and we cannot do model forward" ) return None, None @@ -354,7 +354,7 @@ class PaddleASRConnectionHanddler: Returns: logprob: poster probability. 
""" - logger.info("start to decoce one chunk for deepspeech2") + logger.debug("start to decoce one chunk for deepspeech2") input_names = self.am_predictor.get_input_names() audio_handle = self.am_predictor.get_input_handle(input_names[0]) audio_len_handle = self.am_predictor.get_input_handle(input_names[1]) @@ -391,7 +391,7 @@ class PaddleASRConnectionHanddler: self.decoder.next(output_chunk_probs, output_chunk_lens) trans_best, trans_beam = self.decoder.decode() - logger.info(f"decode one best result for deepspeech2: {trans_best[0]}") + logger.debug(f"decode one best result for deepspeech2: {trans_best[0]}") return trans_best[0] @paddle.no_grad() @@ -402,7 +402,7 @@ class PaddleASRConnectionHanddler: # reset endpiont state self.endpoint_state = False - logger.info( + logger.debug( "Conformer/Transformer: start to decode with advanced_decoding method" ) cfg = self.ctc_decode_config @@ -427,25 +427,25 @@ class PaddleASRConnectionHanddler: stride = subsampling * decoding_chunk_size if self.cached_feat is None: - logger.info("no audio feat, please input more pcm data") + logger.debug("no audio feat, please input more pcm data") return # (B=1,T,D) num_frames = self.cached_feat.shape[1] - logger.info( + logger.debug( f"Required decoding window {decoding_window} frames, and the connection has {num_frames} frames" ) # the cached feat must be larger decoding_window if num_frames < decoding_window and not is_finished: - logger.info( + logger.debug( f"frame feat num is less than {decoding_window}, please input more pcm data" ) return None, None # if is_finished=True, we need at least context frames if num_frames < context: - logger.info( + logger.debug( "flast {num_frames} is less than context {context} frames, and we cannot do model forward" ) return None, None @@ -489,7 +489,7 @@ class PaddleASRConnectionHanddler: self.encoder_out = ys else: self.encoder_out = paddle.concat([self.encoder_out, ys], axis=1) - logger.info( + logger.debug( f"This connection handler encoder out shape: {self.encoder_out.shape}" ) @@ -513,7 +513,8 @@ class PaddleASRConnectionHanddler: if self.endpointer.endpoint_detected(ctc_probs.numpy(), decoding_something): self.endpoint_state = True - logger.info(f"Endpoint is detected at {self.num_frames} frame.") + logger.debug( + f"Endpoint is detected at {self.num_frames} frame.") # advance cache of feat assert self.cached_feat.shape[0] == 1 #(B=1,T,D) @@ -526,7 +527,7 @@ class PaddleASRConnectionHanddler: def update_result(self): """Conformer/Transformer hyps to result. """ - logger.info("update the final result") + logger.debug("update the final result") hyps = self.hyps # output results and tokenids @@ -560,16 +561,16 @@ class PaddleASRConnectionHanddler: only for conformer and transformer model. 
""" if "deepspeech2" in self.model_type: - logger.info("deepspeech2 not support rescoring decoding.") + logger.debug("deepspeech2 not support rescoring decoding.") return if "attention_rescoring" != self.ctc_decode_config.decoding_method: - logger.info( + logger.debug( f"decoding method not match: {self.ctc_decode_config.decoding_method}, need attention_rescoring" ) return - logger.info("rescoring the final result") + logger.debug("rescoring the final result") # last decoding for last audio self.searcher.finalize_search() @@ -685,7 +686,6 @@ class PaddleASRConnectionHanddler: "bg": global_offset_in_sec + start, "ed": global_offset_in_sec + end }) - # logger.info(f"{word_time_stamp[-1]}") self.word_time_stamp = word_time_stamp logger.info(f"word time stamp: {self.word_time_stamp}") @@ -707,13 +707,13 @@ class ASRServerExecutor(ASRExecutor): lm_url = self.task_resource.res_dict['lm_url'] lm_md5 = self.task_resource.res_dict['lm_md5'] - logger.info(f"Start to load language model {lm_url}") + logger.debug(f"Start to load language model {lm_url}") self.download_lm( lm_url, os.path.dirname(self.config.decode.lang_model_path), lm_md5) elif "conformer" in self.model_type or "transformer" in self.model_type: with UpdateConfig(self.config): - logger.info("start to create the stream conformer asr engine") + logger.debug("start to create the stream conformer asr engine") # update the decoding method if self.decode_method: self.config.decode.decoding_method = self.decode_method @@ -726,7 +726,7 @@ class ASRServerExecutor(ASRExecutor): if self.config.decode.decoding_method not in [ "ctc_prefix_beam_search", "attention_rescoring" ]: - logger.info( + logger.debug( "we set the decoding_method to attention_rescoring") self.config.decode.decoding_method = "attention_rescoring" @@ -739,7 +739,7 @@ class ASRServerExecutor(ASRExecutor): def init_model(self) -> None: if "deepspeech2" in self.model_type: # AM predictor - logger.info("ASR engine start to init the am predictor") + logger.debug("ASR engine start to init the am predictor") self.am_predictor = init_predictor( model_file=self.am_model, params_file=self.am_params, @@ -748,7 +748,7 @@ class ASRServerExecutor(ASRExecutor): # load model # model_type: {model_name}_{dataset} model_name = self.model_type[:self.model_type.rindex('_')] - logger.info(f"model name: {model_name}") + logger.debug(f"model name: {model_name}") model_class = self.task_resource.get_model_class(model_name) model = model_class.from_config(self.config) self.model = model @@ -782,7 +782,7 @@ class ASRServerExecutor(ASRExecutor): self.num_decoding_left_chunks = num_decoding_left_chunks # conf for paddleinference predictor or onnx self.am_predictor_conf = am_predictor_conf - logger.info(f"model_type: {self.model_type}") + logger.debug(f"model_type: {self.model_type}") sample_rate_str = '16k' if sample_rate == 16000 else '8k' tag = model_type + '-' + lang + '-' + sample_rate_str @@ -804,12 +804,12 @@ class ASRServerExecutor(ASRExecutor): self.res_path = os.path.dirname( os.path.dirname(os.path.abspath(self.cfg_path))) - logger.info("Load the pretrained model:") - logger.info(f" tag = {tag}") - logger.info(f" res_path: {self.res_path}") - logger.info(f" cfg path: {self.cfg_path}") - logger.info(f" am_model path: {self.am_model}") - logger.info(f" am_params path: {self.am_params}") + logger.debug("Load the pretrained model:") + logger.debug(f" tag = {tag}") + logger.debug(f" res_path: {self.res_path}") + logger.debug(f" cfg path: {self.cfg_path}") + logger.debug(f" am_model path: 
{self.am_model}") + logger.debug(f" am_params path: {self.am_params}") #Init body. self.config = CfgNode(new_allowed=True) @@ -818,7 +818,7 @@ class ASRServerExecutor(ASRExecutor): if self.config.spm_model_prefix: self.config.spm_model_prefix = os.path.join( self.res_path, self.config.spm_model_prefix) - logger.info(f"spm model path: {self.config.spm_model_prefix}") + logger.debug(f"spm model path: {self.config.spm_model_prefix}") self.vocab = self.config.vocab_filepath @@ -832,7 +832,7 @@ class ASRServerExecutor(ASRExecutor): # AM predictor self.init_model() - logger.info(f"create the {model_type} model success") + logger.debug(f"create the {model_type} model success") return True @@ -883,7 +883,7 @@ class ASREngine(BaseEngine): "If all GPU or XPU is used, you can set the server to 'cpu'") sys.exit(-1) - logger.info(f"paddlespeech_server set the device: {self.device}") + logger.debug(f"paddlespeech_server set the device: {self.device}") if not self.init_model(): logger.error( @@ -891,7 +891,9 @@ class ASREngine(BaseEngine): ) return False - logger.info("Initialize ASR server engine successfully.") + logger.info("Initialize ASR server engine successfully on device: %s." % + (self.device)) + return True def new_handler(self): diff --git a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py index 572004eb..6df666ce 100644 --- a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py +++ b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py @@ -65,10 +65,10 @@ class ASRServerExecutor(ASRExecutor): self.task_resource.res_dict['model']) self.am_params = os.path.join(self.res_path, self.task_resource.res_dict['params']) - logger.info(self.res_path) - logger.info(self.cfg_path) - logger.info(self.am_model) - logger.info(self.am_params) + logger.debug(self.res_path) + logger.debug(self.cfg_path) + logger.debug(self.am_model) + logger.debug(self.am_params) else: self.cfg_path = os.path.abspath(cfg_path) self.am_model = os.path.abspath(am_model) @@ -236,16 +236,16 @@ class PaddleASRConnectionHandler(ASRServerExecutor): if self._check( io.BytesIO(audio_data), self.asr_engine.config.sample_rate, self.asr_engine.config.force_yes): - logger.info("start running asr engine") + logger.debug("start running asr engine") self.preprocess(self.asr_engine.config.model_type, io.BytesIO(audio_data)) st = time.time() self.infer(self.asr_engine.config.model_type) infer_time = time.time() - st self.output = self.postprocess() # Retrieve result of asr. 
- logger.info("end inferring asr engine") + logger.debug("end inferring asr engine") else: - logger.info("file check failed!") + logger.error("file check failed!") self.output = None logger.info("inference time: {}".format(infer_time)) diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py index f9cc3a66..02c40fd1 100644 --- a/paddlespeech/server/engine/asr/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/python/asr_engine.py @@ -104,7 +104,7 @@ class PaddleASRConnectionHandler(ASRServerExecutor): if self._check( io.BytesIO(audio_data), self.asr_engine.config.sample_rate, self.asr_engine.config.force_yes): - logger.info("start run asr engine") + logger.debug("start run asr engine") self.preprocess(self.asr_engine.config.model, io.BytesIO(audio_data)) st = time.time() @@ -112,7 +112,7 @@ class PaddleASRConnectionHandler(ASRServerExecutor): infer_time = time.time() - st self.output = self.postprocess() # Retrieve result of asr. else: - logger.info("file check failed!") + logger.error("file check failed!") self.output = None logger.info("inference time: {}".format(infer_time)) diff --git a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py index 389d5605..fa62ba67 100644 --- a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py +++ b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py @@ -67,22 +67,22 @@ class CLSServerExecutor(CLSExecutor): self.params_path = os.path.abspath(params_path) self.label_file = os.path.abspath(label_file) - logger.info(self.cfg_path) - logger.info(self.model_path) - logger.info(self.params_path) - logger.info(self.label_file) + logger.debug(self.cfg_path) + logger.debug(self.model_path) + logger.debug(self.params_path) + logger.debug(self.label_file) # config with open(self.cfg_path, 'r') as f: self._conf = yaml.safe_load(f) - logger.info("Read cfg file successfully.") + logger.debug("Read cfg file successfully.") # labels self._label_list = [] with open(self.label_file, 'r') as f: for line in f: self._label_list.append(line.strip()) - logger.info("Read label file successfully.") + logger.debug("Read label file successfully.") # Create predictor self.predictor_conf = predictor_conf @@ -90,7 +90,7 @@ class CLSServerExecutor(CLSExecutor): model_file=self.model_path, params_file=self.params_path, predictor_conf=self.predictor_conf) - logger.info("Create predictor successfully.") + logger.debug("Create predictor successfully.") @paddle.no_grad() def infer(self): @@ -148,7 +148,8 @@ class CLSEngine(BaseEngine): logger.error(e) return False - logger.info("Initialize CLS server engine successfully.") + logger.info("Initialize CLS server engine successfully on device: %s." 
% + (self.device)) return True @@ -160,7 +161,7 @@ class PaddleCLSConnectionHandler(CLSServerExecutor): cls_engine (CLSEngine): The CLS engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleCLSConnectionHandler to process the cls request") self._inputs = OrderedDict() @@ -183,7 +184,7 @@ class PaddleCLSConnectionHandler(CLSServerExecutor): self.infer() infer_time = time.time() - st - logger.info("inference time: {}".format(infer_time)) + logger.debug("inference time: {}".format(infer_time)) logger.info("cls engine type: inference") def postprocess(self, topk: int): diff --git a/paddlespeech/server/engine/cls/python/cls_engine.py b/paddlespeech/server/engine/cls/python/cls_engine.py index f8d8f20e..210f4cbb 100644 --- a/paddlespeech/server/engine/cls/python/cls_engine.py +++ b/paddlespeech/server/engine/cls/python/cls_engine.py @@ -88,7 +88,7 @@ class PaddleCLSConnectionHandler(CLSServerExecutor): cls_engine (CLSEngine): The CLS engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleCLSConnectionHandler to process the cls request") self._inputs = OrderedDict() @@ -110,7 +110,7 @@ class PaddleCLSConnectionHandler(CLSServerExecutor): self.infer() infer_time = time.time() - st - logger.info("inference time: {}".format(infer_time)) + logger.debug("inference time: {}".format(infer_time)) logger.info("cls engine type: python") def postprocess(self, topk: int): diff --git a/paddlespeech/server/engine/engine_warmup.py b/paddlespeech/server/engine/engine_warmup.py index 5f548f71..12c760c6 100644 --- a/paddlespeech/server/engine/engine_warmup.py +++ b/paddlespeech/server/engine/engine_warmup.py @@ -45,7 +45,7 @@ def warm_up(engine_and_type: str, warm_up_time: int=3) -> bool: logger.error("Please check tte engine type.") try: - logger.info("Start to warm up tts engine.") + logger.debug("Start to warm up tts engine.") for i in range(warm_up_time): connection_handler = PaddleTTSConnectionHandler(tts_engine) if flag_online: @@ -53,7 +53,7 @@ def warm_up(engine_and_type: str, warm_up_time: int=3) -> bool: text=sentence, lang=tts_engine.lang, am=tts_engine.config.am): - logger.info( + logger.debug( f"The first response time of the {i} warm up: {connection_handler.first_response_time} s" ) break @@ -62,7 +62,7 @@ def warm_up(engine_and_type: str, warm_up_time: int=3) -> bool: st = time.time() connection_handler.infer(text=sentence) et = time.time() - logger.info( + logger.debug( f"The response time of the {i} warm up: {et - st} s") except Exception as e: logger.error("Failed to warm up on tts engine.") diff --git a/paddlespeech/server/engine/text/python/text_engine.py b/paddlespeech/server/engine/text/python/text_engine.py index 73cf8737..6167e778 100644 --- a/paddlespeech/server/engine/text/python/text_engine.py +++ b/paddlespeech/server/engine/text/python/text_engine.py @@ -28,7 +28,7 @@ class PaddleTextConnectionHandler: text_engine (TextEngine): The Text engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleTextConnectionHandler to process the text request") self.text_engine = text_engine self.task = self.text_engine.executor.task @@ -130,7 +130,7 @@ class TextEngine(BaseEngine): """The Text Engine """ super(TextEngine, self).__init__() - logger.info("Create the TextEngine Instance") + logger.debug("Create the TextEngine Instance") def init(self, config: dict): """Init the Text Engine @@ -141,7 +141,7 @@ class TextEngine(BaseEngine): Returns: bool: The engine instance flag """ - logger.info("Init the text engine") + logger.debug("Init the 
text engine") try: self.config = config if self.config.device: @@ -150,7 +150,7 @@ class TextEngine(BaseEngine): self.device = paddle.get_device() paddle.set_device(self.device) - logger.info(f"Text Engine set the device: {self.device}") + logger.debug(f"Text Engine set the device: {self.device}") except BaseException as e: logger.error( "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" @@ -168,5 +168,6 @@ class TextEngine(BaseEngine): ckpt_path=config.ckpt_path, vocab_file=config.vocab_file) - logger.info("Init the text engine successfully") + logger.info("Initialize Text server engine successfully on device: %s." + % (self.device)) return True diff --git a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py index f64287af..7b8e04e8 100644 --- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py @@ -62,7 +62,7 @@ class TTSServerExecutor(TTSExecutor): (hasattr(self, 'am_encoder_infer_sess') and hasattr(self, 'am_decoder_sess') and hasattr( self, 'am_postnet_sess'))) and hasattr(self, 'voc_inference'): - logger.info('Models had been initialized.') + logger.debug('Models had been initialized.') return # am am_tag = am + '-' + lang @@ -85,8 +85,7 @@ class TTSServerExecutor(TTSExecutor): else: self.am_ckpt = os.path.abspath(am_ckpt[0]) self.phones_dict = os.path.abspath(phones_dict) - self.am_res_path = os.path.dirname( - os.path.abspath(am_ckpt)) + self.am_res_path = os.path.dirname(os.path.abspath(am_ckpt)) # create am sess self.am_sess = get_sess(self.am_ckpt, am_sess_conf) @@ -119,8 +118,7 @@ class TTSServerExecutor(TTSExecutor): self.am_postnet = os.path.abspath(am_ckpt[2]) self.phones_dict = os.path.abspath(phones_dict) self.am_stat = os.path.abspath(am_stat) - self.am_res_path = os.path.dirname( - os.path.abspath(am_ckpt[0])) + self.am_res_path = os.path.dirname(os.path.abspath(am_ckpt[0])) # create am sess self.am_encoder_infer_sess = get_sess(self.am_encoder_infer, @@ -130,13 +128,13 @@ class TTSServerExecutor(TTSExecutor): self.am_mu, self.am_std = np.load(self.am_stat) - logger.info(f"self.phones_dict: {self.phones_dict}") - logger.info(f"am model dir: {self.am_res_path}") - logger.info("Create am sess successfully.") + logger.debug(f"self.phones_dict: {self.phones_dict}") + logger.debug(f"am model dir: {self.am_res_path}") + logger.debug("Create am sess successfully.") # voc model info voc_tag = voc + '-' + lang - + if voc_ckpt is None: self.task_resource.set_task_model( model_tag=voc_tag, @@ -149,16 +147,16 @@ class TTSServerExecutor(TTSExecutor): else: self.voc_ckpt = os.path.abspath(voc_ckpt) self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_ckpt)) - logger.info(self.voc_res_path) + logger.debug(self.voc_res_path) # create voc sess self.voc_sess = get_sess(self.voc_ckpt, voc_sess_conf) - logger.info("Create voc sess successfully.") + logger.debug("Create voc sess successfully.") with open(self.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] self.vocab_size = len(phn_id) - logger.info(f"vocab_size: {self.vocab_size}") + logger.debug(f"vocab_size: {self.vocab_size}") # frontend self.tones_dict = None @@ -169,7 +167,7 @@ class TTSServerExecutor(TTSExecutor): elif lang == 'en': self.frontend = English(phone_vocab_path=self.phones_dict) - logger.info("frontend done!") + logger.debug("frontend done!") class TTSEngine(BaseEngine): @@ -267,7 +265,7 @@ class 
PaddleTTSConnectionHandler: tts_engine (TTSEngine): The TTS engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleTTSConnectionHandler to process the tts request") self.tts_engine = tts_engine diff --git a/paddlespeech/server/engine/tts/online/python/tts_engine.py b/paddlespeech/server/engine/tts/online/python/tts_engine.py index 74364382..9bd95849 100644 --- a/paddlespeech/server/engine/tts/online/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/python/tts_engine.py @@ -102,7 +102,7 @@ class TTSServerExecutor(TTSExecutor): Init model and other resources from a specific path. """ if hasattr(self, 'am_inference') and hasattr(self, 'voc_inference'): - logger.info('Models had been initialized.') + logger.debug('Models had been initialized.') return # am model info if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: @@ -128,17 +128,15 @@ class TTSServerExecutor(TTSExecutor): # must have phones_dict in acoustic self.phones_dict = os.path.join( self.am_res_path, self.task_resource.res_dict['phones_dict']) - print("self.phones_dict:", self.phones_dict) - logger.info(self.am_res_path) - logger.info(self.am_config) - logger.info(self.am_ckpt) + logger.debug(self.am_res_path) + logger.debug(self.am_config) + logger.debug(self.am_ckpt) else: self.am_config = os.path.abspath(am_config) self.am_ckpt = os.path.abspath(am_ckpt) self.am_stat = os.path.abspath(am_stat) self.phones_dict = os.path.abspath(phones_dict) self.am_res_path = os.path.dirname(os.path.abspath(self.am_config)) - print("self.phones_dict:", self.phones_dict) self.tones_dict = None self.speaker_dict = None @@ -165,9 +163,9 @@ class TTSServerExecutor(TTSExecutor): self.voc_stat = os.path.join( self.voc_res_path, self.task_resource.voc_res_dict['speech_stats']) - logger.info(self.voc_res_path) - logger.info(self.voc_config) - logger.info(self.voc_ckpt) + logger.debug(self.voc_res_path) + logger.debug(self.voc_config) + logger.debug(self.voc_ckpt) else: self.voc_config = os.path.abspath(voc_config) self.voc_ckpt = os.path.abspath(voc_ckpt) @@ -184,7 +182,6 @@ class TTSServerExecutor(TTSExecutor): with open(self.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] self.vocab_size = len(phn_id) - print("vocab_size:", self.vocab_size) # frontend if lang == 'zh': @@ -194,7 +191,6 @@ class TTSServerExecutor(TTSExecutor): elif lang == 'en': self.frontend = English(phone_vocab_path=self.phones_dict) - print("frontend done!") # am infer info self.am_name = am[:am.rindex('_')] @@ -209,7 +205,6 @@ class TTSServerExecutor(TTSExecutor): self.am_name + '_inference') self.am_inference = am_inference_class(am_normalizer, am) self.am_inference.eval() - print("acoustic model done!") # voc infer info self.voc_name = voc[:voc.rindex('_')] @@ -220,7 +215,6 @@ class TTSServerExecutor(TTSExecutor): '_inference') self.voc_inference = voc_inference_class(voc_normalizer, voc) self.voc_inference.eval() - print("voc done!") class TTSEngine(BaseEngine): @@ -309,7 +303,7 @@ class PaddleTTSConnectionHandler: tts_engine (TTSEngine): The TTS engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleTTSConnectionHandler to process the tts request") self.tts_engine = tts_engine @@ -369,7 +363,7 @@ class PaddleTTSConnectionHandler: text, merge_sentences=merge_sentences) phone_ids = input_ids["phone_ids"] else: - print("lang should in {'zh', 'en'}!") + logger.error("lang should in {'zh', 'en'}!") frontend_et = time.time() self.frontend_time = frontend_et - 
frontend_st diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py index a5632487..43b0df40 100644 --- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py +++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -65,7 +65,7 @@ class TTSServerExecutor(TTSExecutor): Init model and other resources from a specific path. """ if hasattr(self, 'am_predictor') and hasattr(self, 'voc_predictor'): - logger.info('Models had been initialized.') + logger.debug('Models had been initialized.') return # am if am_model is None or am_params is None or phones_dict is None: @@ -91,16 +91,16 @@ class TTSServerExecutor(TTSExecutor): self.am_res_path, self.task_resource.res_dict['phones_dict']) self.am_sample_rate = self.task_resource.res_dict['sample_rate'] - logger.info(self.am_res_path) - logger.info(self.am_model) - logger.info(self.am_params) + logger.debug(self.am_res_path) + logger.debug(self.am_model) + logger.debug(self.am_params) else: self.am_model = os.path.abspath(am_model) self.am_params = os.path.abspath(am_params) self.phones_dict = os.path.abspath(phones_dict) self.am_sample_rate = am_sample_rate self.am_res_path = os.path.dirname(os.path.abspath(self.am_model)) - logger.info("self.phones_dict: {}".format(self.phones_dict)) + logger.debug("self.phones_dict: {}".format(self.phones_dict)) # for speedyspeech self.tones_dict = None @@ -139,9 +139,9 @@ class TTSServerExecutor(TTSExecutor): self.voc_res_path, self.task_resource.voc_res_dict['params']) self.voc_sample_rate = self.task_resource.voc_res_dict[ 'sample_rate'] - logger.info(self.voc_res_path) - logger.info(self.voc_model) - logger.info(self.voc_params) + logger.debug(self.voc_res_path) + logger.debug(self.voc_model) + logger.debug(self.voc_params) else: self.voc_model = os.path.abspath(voc_model) self.voc_params = os.path.abspath(voc_params) @@ -156,21 +156,21 @@ class TTSServerExecutor(TTSExecutor): with open(self.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) - logger.info("vocab_size: {}".format(vocab_size)) + logger.debug("vocab_size: {}".format(vocab_size)) tone_size = None if self.tones_dict: with open(self.tones_dict, "r") as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) - logger.info("tone_size: {}".format(tone_size)) + logger.debug("tone_size: {}".format(tone_size)) spk_num = None if self.speaker_dict: with open(self.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) - logger.info("spk_num: {}".format(spk_num)) + logger.debug("spk_num: {}".format(spk_num)) # frontend if lang == 'zh': @@ -180,7 +180,7 @@ class TTSServerExecutor(TTSExecutor): elif lang == 'en': self.frontend = English(phone_vocab_path=self.phones_dict) - logger.info("frontend done!") + logger.debug("frontend done!") # Create am predictor self.am_predictor_conf = am_predictor_conf @@ -188,7 +188,7 @@ class TTSServerExecutor(TTSExecutor): model_file=self.am_model, params_file=self.am_params, predictor_conf=self.am_predictor_conf) - logger.info("Create AM predictor successfully.") + logger.debug("Create AM predictor successfully.") # Create voc predictor self.voc_predictor_conf = voc_predictor_conf @@ -196,7 +196,7 @@ class TTSServerExecutor(TTSExecutor): model_file=self.voc_model, params_file=self.voc_params, predictor_conf=self.voc_predictor_conf) - logger.info("Create Vocoder predictor successfully.") + 
logger.debug("Create Vocoder predictor successfully.") @paddle.no_grad() def infer(self, @@ -328,7 +328,8 @@ class TTSEngine(BaseEngine): logger.error(e) return False - logger.info("Initialize TTS server engine successfully.") + logger.info("Initialize TTS server engine successfully on device: %s." % + (self.device)) return True @@ -340,7 +341,7 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): tts_engine (TTSEngine): The TTS engine """ super().__init__() - logger.info( + logger.debug( "Create PaddleTTSConnectionHandler to process the tts request") self.tts_engine = tts_engine @@ -378,23 +379,23 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): if target_fs == 0 or target_fs > original_fs: target_fs = original_fs wav_tar_fs = wav - logger.info( + logger.debug( "The sample rate of synthesized audio is the same as model, which is {}Hz". format(original_fs)) else: wav_tar_fs = librosa.resample( np.squeeze(wav), original_fs, target_fs) - logger.info( + logger.debug( "The sample rate of model is {}Hz and the target sample rate is {}Hz. Converting the sample rate of the synthesized audio successfully.". format(original_fs, target_fs)) # transform volume wav_vol = wav_tar_fs * volume - logger.info("Transform the volume of the audio successfully.") + logger.debug("Transform the volume of the audio successfully.") # transform speed try: # windows not support soxbindings wav_speed = change_speed(wav_vol, speed, target_fs) - logger.info("Transform the speed of the audio successfully.") + logger.debug("Transform the speed of the audio successfully.") except ServerBaseException: raise ServerBaseException( ErrorCode.SERVER_INTERNAL_ERR, @@ -411,7 +412,7 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): wavfile.write(buf, target_fs, wav_speed) base64_bytes = base64.b64encode(buf.read()) wav_base64 = base64_bytes.decode('utf-8') - logger.info("Audio to string successfully.") + logger.debug("Audio to string successfully.") # save audio if audio_path is not None: @@ -499,15 +500,15 @@ class PaddleTTSConnectionHandler(TTSServerExecutor): logger.error(e) sys.exit(-1) - logger.info("AM model: {}".format(self.config.am)) - logger.info("Vocoder model: {}".format(self.config.voc)) - logger.info("Language: {}".format(lang)) + logger.debug("AM model: {}".format(self.config.am)) + logger.debug("Vocoder model: {}".format(self.config.voc)) + logger.debug("Language: {}".format(lang)) logger.info("tts engine type: python") logger.info("audio duration: {}".format(duration)) - logger.info("frontend inference time: {}".format(self.frontend_time)) - logger.info("AM inference time: {}".format(self.am_time)) - logger.info("Vocoder inference time: {}".format(self.voc_time)) + logger.debug("frontend inference time: {}".format(self.frontend_time)) + logger.debug("AM inference time: {}".format(self.am_time)) + logger.debug("Vocoder inference time: {}".format(self.voc_time)) logger.info("total inference time: {}".format(infer_time)) logger.info( "postprocess (change speed, volume, target sample rate) time: {}". 
@@ -515,6 +516,6 @@ class PaddleTTSConnectionHandler(TTSServerExecutor):
         logger.info("total generate audio time: {}".format(infer_time +
                                                            postprocess_time))
         logger.info("RTF: {}".format(rtf))
-        logger.info("device: {}".format(self.tts_engine.device))
+        logger.debug("device: {}".format(self.tts_engine.device))
 
         return lang, target_sample_rate, duration, wav_base64
diff --git a/paddlespeech/server/engine/tts/python/tts_engine.py b/paddlespeech/server/engine/tts/python/tts_engine.py
index b048b01a..4d180100 100644
--- a/paddlespeech/server/engine/tts/python/tts_engine.py
+++ b/paddlespeech/server/engine/tts/python/tts_engine.py
@@ -105,7 +105,7 @@ class PaddleTTSConnectionHandler(TTSServerExecutor):
             tts_engine (TTSEngine): The TTS engine
         """
         super().__init__()
-        logger.info(
+        logger.debug(
             "Create PaddleTTSConnectionHandler to process the tts request")
 
         self.tts_engine = tts_engine
@@ -143,23 +143,23 @@ class PaddleTTSConnectionHandler(TTSServerExecutor):
         if target_fs == 0 or target_fs > original_fs:
             target_fs = original_fs
             wav_tar_fs = wav
-            logger.info(
+            logger.debug(
                 "The sample rate of synthesized audio is the same as model, which is {}Hz".
                 format(original_fs))
         else:
             wav_tar_fs = librosa.resample(
                 np.squeeze(wav), original_fs, target_fs)
-            logger.info(
+            logger.debug(
                 "The sample rate of model is {}Hz and the target sample rate is {}Hz. Converting the sample rate of the synthesized audio successfully.".
                 format(original_fs, target_fs))
         # transform volume
         wav_vol = wav_tar_fs * volume
-        logger.info("Transform the volume of the audio successfully.")
+        logger.debug("Transform the volume of the audio successfully.")
 
         # transform speed
         try:  # windows not support soxbindings
             wav_speed = change_speed(wav_vol, speed, target_fs)
-            logger.info("Transform the speed of the audio successfully.")
+            logger.debug("Transform the speed of the audio successfully.")
         except ServerBaseException:
             raise ServerBaseException(
                 ErrorCode.SERVER_INTERNAL_ERR,
@@ -176,7 +176,7 @@ class PaddleTTSConnectionHandler(TTSServerExecutor):
             wavfile.write(buf, target_fs, wav_speed)
             base64_bytes = base64.b64encode(buf.read())
             wav_base64 = base64_bytes.decode('utf-8')
-            logger.info("Audio to string successfully.")
+            logger.debug("Audio to string successfully.")
 
             # save audio
             if audio_path is not None:
@@ -264,15 +264,15 @@ class PaddleTTSConnectionHandler(TTSServerExecutor):
             logger.error(e)
             sys.exit(-1)
 
-        logger.info("AM model: {}".format(self.config.am))
-        logger.info("Vocoder model: {}".format(self.config.voc))
-        logger.info("Language: {}".format(lang))
+        logger.debug("AM model: {}".format(self.config.am))
+        logger.debug("Vocoder model: {}".format(self.config.voc))
+        logger.debug("Language: {}".format(lang))
         logger.info("tts engine type: python")
 
         logger.info("audio duration: {}".format(duration))
-        logger.info("frontend inference time: {}".format(self.frontend_time))
-        logger.info("AM inference time: {}".format(self.am_time))
-        logger.info("Vocoder inference time: {}".format(self.voc_time))
+        logger.debug("frontend inference time: {}".format(self.frontend_time))
+        logger.debug("AM inference time: {}".format(self.am_time))
+        logger.debug("Vocoder inference time: {}".format(self.voc_time))
         logger.info("total inference time: {}".format(infer_time))
         logger.info(
             "postprocess (change speed, volume, target sample rate) time: {}".
@@ -280,6 +280,6 @@ class PaddleTTSConnectionHandler(TTSServerExecutor):
         logger.info("total generate audio time: {}".format(infer_time +
                                                            postprocess_time))
         logger.info("RTF: {}".format(rtf))
-        logger.info("device: {}".format(self.tts_engine.device))
+        logger.debug("device: {}".format(self.tts_engine.device))
 
         return lang, target_sample_rate, duration, wav_base64
diff --git a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py
index 3c72f55d..f7d60648 100644
--- a/paddlespeech/server/engine/vector/python/vector_engine.py
+++ b/paddlespeech/server/engine/vector/python/vector_engine.py
@@ -33,7 +33,7 @@ class PaddleVectorConnectionHandler:
             vector_engine (VectorEngine): The Vector engine
         """
         super().__init__()
-        logger.info(
+        logger.debug(
             "Create PaddleVectorConnectionHandler to process the vector request")
         self.vector_engine = vector_engine
         self.executor = self.vector_engine.executor
@@ -54,7 +54,7 @@ class PaddleVectorConnectionHandler:
         Returns:
             str: the punctuation text
         """
-        logger.info(
+        logger.debug(
             f"start to extract the do vector {self.task} from the http request")
         if self.task == "spk" and task == "spk":
             embedding = self.extract_audio_embedding(audio_data)
@@ -81,17 +81,17 @@ class PaddleVectorConnectionHandler:
         Returns:
             float: the score between enroll and test audio
         """
-        logger.info("start to extract the enroll audio embedding")
+        logger.debug("start to extract the enroll audio embedding")
         enroll_emb = self.extract_audio_embedding(enroll_audio)
 
-        logger.info("start to extract the test audio embedding")
+        logger.debug("start to extract the test audio embedding")
         test_emb = self.extract_audio_embedding(test_audio)
 
-        logger.info(
+        logger.debug(
             "start to get the score between the enroll and test embedding")
         score = self.executor.get_embeddings_score(enroll_emb, test_emb)
 
-        logger.info(f"get the enroll vs test score: {score}")
+        logger.debug(f"get the enroll vs test score: {score}")
         return score
 
     @paddle.no_grad()
@@ -106,11 +106,12 @@ class PaddleVectorConnectionHandler:
         # because the soundfile will change the io.BytesIO(audio) to the end
         # thus we should convert the base64 string to io.BytesIO when we need the audio data
         if not self.executor._check(io.BytesIO(audio), sample_rate):
-            logger.info("check the audio sample rate occurs error")
+            logger.debug("check the audio sample rate occurs error")
             return np.array([0.0])
 
         waveform, sr = load_audio(io.BytesIO(audio))
-        logger.info(f"load the audio sample points, shape is: {waveform.shape}")
+        logger.debug(
+            f"load the audio sample points, shape is: {waveform.shape}")
 
         # stage 2: get the audio feat
         # Note: Now we only support fbank feature
@@ -121,9 +122,9 @@ class PaddleVectorConnectionHandler:
                 n_mels=self.config.n_mels,
                 window_size=self.config.window_size,
                 hop_length=self.config.hop_size)
-            logger.info(f"extract the audio feats, shape is: {feats.shape}")
+            logger.debug(f"extract the audio feats, shape is: {feats.shape}")
         except Exception as e:
-            logger.info(f"feats occurs exception {e}")
+            logger.error(f"feats occurs exception {e}")
             sys.exit(-1)
 
         feats = paddle.to_tensor(feats).unsqueeze(0)
@@ -159,7 +160,7 @@ class VectorEngine(BaseEngine):
         """The Vector Engine
         """
         super(VectorEngine, self).__init__()
-        logger.info("Create the VectorEngine Instance")
+        logger.debug("Create the VectorEngine Instance")
 
     def init(self, config: dict):
         """Init the Vector Engine
@@ -170,7 +171,7 @@ class VectorEngine(BaseEngine):
         Returns:
             bool: The engine instance flag
         """
-        logger.info("Init the vector engine")
+        logger.debug("Init the vector engine")
         try:
             self.config = config
             if self.config.device:
@@ -179,7 +180,7 @@ class VectorEngine(BaseEngine):
                 self.device = paddle.get_device()
 
             paddle.set_device(self.device)
-            logger.info(f"Vector Engine set the device: {self.device}")
+            logger.debug(f"Vector Engine set the device: {self.device}")
         except BaseException as e:
             logger.error(
                 "Set device failed, please check if device is already used and the parameter 'device' in the yaml file"
             )
@@ -196,5 +197,7 @@ class VectorEngine(BaseEngine):
             ckpt_path=config.ckpt_path,
             task=config.task)
 
-        logger.info("Init the Vector engine successfully")
+        logger.info(
+            "Initialize Vector server engine successfully on device: %s." %
+            (self.device))
         return True
diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py
index e3d90d46..d4540781 100644
--- a/paddlespeech/server/utils/audio_handler.py
+++ b/paddlespeech/server/utils/audio_handler.py
@@ -138,7 +138,7 @@ class ASRWsAudioHandler:
         Returns:
             str: the final asr result
         """
-        logging.info("send a message to the server")
+        logging.debug("send a message to the server")
 
         if self.url is None:
             logger.error("No asr server, please input valid ip and port")
@@ -160,7 +160,7 @@ class ASRWsAudioHandler:
             separators=(',', ': '))
         await ws.send(audio_info)
         msg = await ws.recv()
-        logger.info("client receive msg={}".format(msg))
+        logger.debug("client receive msg={}".format(msg))
 
         # 3. send chunk audio data to engine
         for chunk_data in self.read_wave(wavfile_path):
@@ -170,7 +170,7 @@ class ASRWsAudioHandler:
 
             if self.punc_server and len(msg["result"]) > 0:
                 msg["result"] = self.punc_server.run(msg["result"])
-            logger.info("client receive msg={}".format(msg))
+            logger.debug("client receive msg={}".format(msg))
 
         # 4. we must send finished signal to the server
         audio_info = json.dumps(
@@ -310,7 +310,7 @@ class TTSWsHandler:
         start_request = json.dumps({"task": "tts", "signal": "start"})
         await ws.send(start_request)
         msg = await ws.recv()
-        logger.info(f"client receive msg={msg}")
+        logger.debug(f"client receive msg={msg}")
         msg = json.loads(msg)
         session = msg["session"]
 
@@ -319,7 +319,7 @@ class TTSWsHandler:
         request = json.dumps({"text": text_base64})
         st = time.time()
         await ws.send(request)
-        logging.info("send a message to the server")
+        logging.debug("send a message to the server")
 
         # 4. Process the received response
         message = await ws.recv()
@@ -543,7 +543,6 @@ class VectorHttpHandler:
             "sample_rate": sample_rate,
         }
 
-        logger.info(self.url)
         res = requests.post(url=self.url, data=json.dumps(data))
 
         return res.json()
diff --git a/paddlespeech/server/utils/audio_process.py b/paddlespeech/server/utils/audio_process.py
index 416d77ac..ae538397 100644
--- a/paddlespeech/server/utils/audio_process.py
+++ b/paddlespeech/server/utils/audio_process.py
@@ -169,7 +169,7 @@ def save_audio(bytes_data, audio_path, sample_rate: int=24000) -> bool:
             sample_rate=sample_rate)
         os.remove("./tmp.pcm")
     else:
-        print("Only supports saved audio format is pcm or wav")
+        logger.error("Only pcm or wav audio formats are supported.")
         return False
 
     return True
diff --git a/paddlespeech/server/utils/onnx_infer.py b/paddlespeech/server/utils/onnx_infer.py
index 087eb9c0..23d83c73 100644
--- a/paddlespeech/server/utils/onnx_infer.py
+++ b/paddlespeech/server/utils/onnx_infer.py
@@ -20,7 +20,7 @@ from paddlespeech.cli.log import logger
 
 
 def get_sess(model_path: Optional[os.PathLike]=None, sess_conf: dict=None):
-    logger.info(f"ort sessconf: {sess_conf}")
+    logger.debug(f"ort sessconf: {sess_conf}")
     sess_options = ort.SessionOptions()
     sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
     if sess_conf.get('graph_optimization_level', 99) == 0:
@@ -34,7 +34,7 @@ def get_sess(model_path: Optional[os.PathLike]=None, sess_conf: dict=None):
     # fastspeech2/mb_melgan can't use trt now!
     if sess_conf.get("use_trt", 0):
         providers = ['TensorrtExecutionProvider']
-    logger.info(f"ort providers: {providers}")
+    logger.debug(f"ort providers: {providers}")
 
     if 'cpu_threads' in sess_conf:
         sess_options.intra_op_num_threads = sess_conf.get("cpu_threads", 0)
diff --git a/paddlespeech/server/utils/util.py b/paddlespeech/server/utils/util.py
index 061b213c..826d923e 100644
--- a/paddlespeech/server/utils/util.py
+++ b/paddlespeech/server/utils/util.py
@@ -13,6 +13,8 @@
 import base64
 import math
 
+from paddlespeech.cli.log import logger
+
 
 def wav2base64(wav_file: str):
     """
@@ -61,7 +63,7 @@ def get_chunks(data, block_size, pad_size, step):
     elif step == "voc":
         data_len = data.shape[0]
     else:
-        print("Please set correct type to get chunks, am or voc")
+        logger.error("Please set correct type to get chunks, am or voc")
 
     chunks = []
     n = math.ceil(data_len / block_size)
@@ -73,7 +75,7 @@ def get_chunks(data, block_size, pad_size, step):
         elif step == "voc":
             chunks.append(data[start:end, :])
         else:
-            print("Please set correct type to get chunks, am or voc")
+            logger.error("Please set correct type to get chunks, am or voc")
 
     return chunks