# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from typing import List

import numpy as np
from paddle import inference
from scipy.special import softmax

from paddlespeech.audio.backends import load as load_audio
from paddlespeech.audio.datasets import ESC50
from paddlespeech.audio.features import melspectrogram

# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, help="The directory of the exported static model.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to run inference on, defaults to gpu.")
parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
parser.add_argument("--batch_size", type=int, default=1, help="Batch size per GPU/CPU for inference.")
parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable TensorRT to speed up GPU inference.')
parser.add_argument("--precision", type=str, default="fp32", choices=["fp32", "fp16"], help='The TensorRT precision.')
parser.add_argument('--cpu_threads', type=int, default=10, help='Number of threads to predict when using cpu.')
parser.add_argument('--enable_mkldnn', type=eval, default=False, choices=[True, False], help='Enable MKL-DNN to speed up CPU inference.')
parser.add_argument("--log_dir", type=str, default="./log", help="The path to save logs.")
args = parser.parse_args()
# yapf: enable


def extract_features(files: List[str], **kwargs) -> np.ndarray:
    """Load a batch of wave files, zero-pad them to the length of the
    longest one, and compute a mel spectrogram for each."""
    waveforms = []
    srs = []
    max_length = float('-inf')
    for file in files:
        waveform, sr = load_audio(file, sr=None)
        max_length = max(max_length, len(waveform))
        waveforms.append(waveform)
        srs.append(sr)

    feats = []
    for i in range(len(waveforms)):
        # Zero-pad the tail so all waveforms in the batch share one length.
        if len(waveforms[i]) < max_length:
            pad_width = max_length - len(waveforms[i])
            waveforms[i] = np.pad(waveforms[i], pad_width=(0, pad_width))

        # Use each file's own sample rate rather than the last one loaded.
        feat = melspectrogram(waveforms[i], srs[i], **kwargs).transpose()
        feats.append(feat)

    return np.stack(feats, axis=0)


class Predictor(object):
    def __init__(self,
                 model_dir,
                 device="gpu",
                 batch_size=1,
                 use_tensorrt=False,
                 precision="fp32",
                 cpu_threads=10,
                 enable_mkldnn=False):
        self.batch_size = batch_size

        model_file = os.path.join(model_dir, "inference.pdmodel")
        params_file = os.path.join(model_dir, "inference.pdiparams")

        assert os.path.isfile(model_file) and os.path.isfile(
            params_file), 'Please check model and parameter files.'
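        # Build the Paddle inference config from the exported model files and
        # apply the device-specific options (GPU/TensorRT, CPU/MKL-DNN, XPU).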
        config = inference.Config(model_file, params_file)
        if device == "gpu":
            # Set GPU configs accordingly,
            # such as initializing the gpu memory and enabling tensorrt.
            config.enable_use_gpu(100, 0)
            precision_map = {
                "fp16": inference.PrecisionType.Half,
                "fp32": inference.PrecisionType.Float32,
            }
            precision_mode = precision_map[precision]

            if use_tensorrt:
                config.enable_tensorrt_engine(
                    max_batch_size=batch_size,
                    min_subgraph_size=30,
                    precision_mode=precision_mode)
        elif device == "cpu":
            # Set CPU configs accordingly,
            # such as enable_mkldnn and set_cpu_math_library_num_threads.
            config.disable_gpu()
            if enable_mkldnn:
                # Cache 10 different shapes for mkldnn to avoid memory leaks.
                config.set_mkldnn_cache_capacity(10)
                config.enable_mkldnn()
            config.set_cpu_math_library_num_threads(cpu_threads)
        elif device == "xpu":
            # Set XPU configs accordingly.
            config.enable_xpu(100)

        config.switch_use_feed_fetch_ops(False)
        self.predictor = inference.create_predictor(config)
        self.input_handles = [
            self.predictor.get_input_handle(name)
            for name in self.predictor.get_input_names()
        ]
        self.output_handle = self.predictor.get_output_handle(
            self.predictor.get_output_names()[0])

    def predict(self, wavs):
        feats = extract_features(wavs)

        self.input_handles[0].copy_from_cpu(feats)
        self.predictor.run()
        logits = self.output_handle.copy_to_cpu()
        probs = softmax(logits, axis=1)
        indices = np.argmax(probs, axis=1)

        return indices


if __name__ == "__main__":
    # Define predictor to do prediction.
    predictor = Predictor(args.model_dir, args.device, args.batch_size,
                          args.use_tensorrt, args.precision, args.cpu_threads,
                          args.enable_mkldnn)

    wavs = [args.wav]

    for i in range(len(wavs)):
        wavs[i] = os.path.abspath(os.path.expanduser(wavs[i]))
        assert os.path.isfile(
            wavs[i]), f'Please check input wave file: {wavs[i]}'

    results = predictor.predict(wavs)
    for idx, wav in enumerate(wavs):
        print(f'Wav: {wav} \t Label: {ESC50.label_list[results[idx]]}')
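
# Example invocation (a sketch; the script name and paths are assumptions.
# --model_dir must contain inference.pdmodel and inference.pdiparams from an
# exported static-graph ESC-50 sound classifier):
#
#   python predict.py --model_dir ./export --device cpu --wav ~/cat.wav
#
# Each input file prints one line of the form
#   Wav: <absolute path> \t Label: <ESC50 class name>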