PaddleSpeech/paddlespeech/cls/exps/panns/deploy/predict.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from typing import List

import numpy as np
import paddle
from paddle import inference
from paddle.audio.datasets import ESC50
from paddle.audio.features import LogMelSpectrogram
from scipy.special import softmax

import paddlespeech.utils
from paddlespeech.audio.backends import soundfile_load as load_audio

# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu', 'gcu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
parser.add_argument("--batch_size", type=int, default=1, help="Batch size per GPU/CPU for training.")
parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable to use tensorrt to speed up.')
parser.add_argument("--precision", type=str, default="fp32", choices=["fp32", "fp16"], help='The tensorrt precision.')
parser.add_argument('--cpu_threads', type=int, default=10, help='Number of threads to predict when using cpu.')
parser.add_argument('--enable_mkldnn', type=eval, default=False, choices=[True, False], help='Enable to use mkldnn to speed up when using cpu.')
parser.add_argument("--log_dir", type=str, default="./log", help="The path to save log.")
args = parser.parse_args()
# yapf: enable
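
# Example invocation (paths are illustrative):
#   python predict.py --model_dir=./export --device=gpu --wav=./dog.wav
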
def extract_features(files: List[str], **kwargs):
    """Load a batch of wav files, pad them to a common length and
    compute log-mel spectrogram features for each of them."""
    waveforms = []
    srs = []
    max_length = float('-inf')
    for file in files:
        waveform, sr = load_audio(file)
        max_length = max(max_length, len(waveform))
        waveforms.append(waveform)
        srs.append(sr)

    feats = []
    for i in range(len(waveforms)):
        # Zero-pad shorter waveforms up to the longest one in the batch.
        if len(waveforms[i]) < max_length:
            pad_width = max_length - len(waveforms[i])
            waveforms[i] = np.pad(waveforms[i], pad_width=(0, pad_width))

        # Build the extractor with the i-th file's sample rate rather than
        # the last-loaded `sr`, so mixed-rate batches are handled correctly.
        feature_extractor = LogMelSpectrogram(srs[i], **kwargs)
        feat = feature_extractor(paddle.to_tensor(waveforms[i]))
        feat = paddle.transpose(feat, perm=[1, 0]).unsqueeze(0)
        feats.append(feat)

    return np.stack(feats, axis=0)
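
# Example (hypothetical paths; extra kwargs are forwarded to LogMelSpectrogram):
#   feats = extract_features(['./dog.wav', './rain.wav'], n_mels=64)
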
class Predictor(object):
    def __init__(self,
                 model_dir,
                 device="gpu",
                 batch_size=1,
                 use_tensorrt=False,
                 precision="fp32",
                 cpu_threads=10,
                 enable_mkldnn=False):
        self.batch_size = batch_size

        if paddlespeech.utils.satisfy_paddle_version('3.0.0-beta'):
            config = inference.Config(model_dir, 'inference')
            config.disable_mkldnn()
        else:
            model_file = os.path.join(model_dir, 'inference.pdmodel')
            params_file = os.path.join(model_dir, "inference.pdiparams")
            assert os.path.isfile(model_file) and os.path.isfile(
                params_file), 'Please check model and parameter files.'
            config = inference.Config(model_file, params_file)

        if device == "gpu":
            # set GPU configs accordingly,
            # such as initialize the gpu memory, enable tensorrt
            config.enable_use_gpu(100, 0)
            precision_map = {
                "fp16": inference.PrecisionType.Half,
                "fp32": inference.PrecisionType.Float32,
            }
            precision_mode = precision_map[precision]

            if use_tensorrt:
                config.enable_tensorrt_engine(
                    max_batch_size=batch_size,
                    min_subgraph_size=30,
                    precision_mode=precision_mode)
        elif device == "cpu":
            # set CPU configs accordingly,
            # such as enable_mkldnn, set_cpu_math_library_num_threads
            config.disable_gpu()
            if enable_mkldnn:
                # cache 10 different shapes for mkldnn to avoid memory leak
                config.set_mkldnn_cache_capacity(10)
                config.enable_mkldnn()
            config.set_cpu_math_library_num_threads(cpu_threads)
        elif device == "xpu":
            # set XPU configs accordingly
            config.enable_xpu(100)

        config.switch_use_feed_fetch_ops(False)
        self.predictor = inference.create_predictor(config)
        self.input_handles = [
            self.predictor.get_input_handle(name)
            for name in self.predictor.get_input_names()
        ]
        self.output_handle = self.predictor.get_output_handle(
            self.predictor.get_output_names()[0])

    def predict(self, wavs):
        feats = extract_features(wavs)

        self.input_handles[0].copy_from_cpu(feats)
        self.predictor.run()
        logits = self.output_handle.copy_to_cpu()
        probs = softmax(logits, axis=1)
        indices = np.argmax(probs, axis=1)

        return indices
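
# Programmatic use (illustrative; assumes a model exported to ./export):
#   predictor = Predictor('./export', device='cpu')
#   indices = predictor.predict(['./cat.wav'])  # indices into ESC50.label_list
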
if __name__ == "__main__":
# Define predictor to do prediction.
predictor = Predictor(args.model_dir, args.device, args.batch_size,
args.use_tensorrt, args.precision, args.cpu_threads,
args.enable_mkldnn)
wavs = [args.wav]
for i in range(len(wavs)):
wavs[i] = os.path.abspath(os.path.expanduser(wavs[i]))
assert os.path.isfile(
wavs[i]), f'Please check input wave file: {wavs[i]}'
results = predictor.predict(wavs)
for idx, wav in enumerate(wavs):
print(f'Wav: {wav} \t Label: {ESC50.label_list[results[idx]]}')