diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py
index e0ae20bb1..56dd7838a 100644
--- a/paddlespeech/t2s/exps/inference.py
+++ b/paddlespeech/t2s/exps/inference.py
@@ -88,20 +88,27 @@ def parse_args():
     parser.add_argument("--output_dir", type=str, help="output dir")
     # inference
     parser.add_argument(
-        "--int8",
+        "--use_trt",
         type=str2bool,
         default=False,
-        help="Whether to use int8 inference.", )
+        help="whether to use TensorRT or not in GPU", )
     parser.add_argument(
-        "--fp16",
+        "--use_mkldnn",
         type=str2bool,
         default=False,
-        help="Whether to use float16 inference.", )
+        help="whether to use MKLDNN or not in CPU.", )
+    parser.add_argument(
+        "--precision",
+        type=str,
+        default='fp32',
+        choices=['fp32', 'fp16', 'bf16', 'int8'],
+        help="mode of running")
     parser.add_argument(
         "--device",
         default="gpu",
         choices=["gpu", "cpu"],
         help="Device selected for inference.", )
+    parser.add_argument('--cpu_threads', type=int, default=1)

     args, _ = parser.parse_known_args()
     return args
@@ -124,7 +131,11 @@ def main():
         model_dir=args.inference_dir,
         model_file=args.am + ".pdmodel",
         params_file=args.am + ".pdiparams",
-        device=args.device)
+        device=args.device,
+        use_trt=args.use_trt,
+        use_mkldnn=args.use_mkldnn,
+        cpu_threads=args.cpu_threads,
+        precision=args.precision)

     # model: {model_name}_{dataset}
     am_dataset = args.am[args.am.rindex('_') + 1:]
@@ -133,7 +144,11 @@ def main():
         model_dir=args.inference_dir,
         model_file=args.voc + ".pdmodel",
         params_file=args.voc + ".pdiparams",
-        device=args.device)
+        device=args.device,
+        use_trt=args.use_trt,
+        use_mkldnn=args.use_mkldnn,
+        cpu_threads=args.cpu_threads,
+        precision=args.precision)

     output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index 0ac799813..6b693440c 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -464,18 +464,97 @@ def voc_to_static(voc_inference,


 # inference
-def get_predictor(model_dir: Optional[os.PathLike]=None,
-                  model_file: Optional[os.PathLike]=None,
-                  params_file: Optional[os.PathLike]=None,
-                  device: str='cpu'):
+def get_predictor(
+        model_dir: Optional[os.PathLike]=None,
+        model_file: Optional[os.PathLike]=None,
+        params_file: Optional[os.PathLike]=None,
+        device: str='cpu',
+        # for gpu
+        use_trt: bool=False,
+        # for trt
+        use_dynamic_shape: bool=True,
+        min_subgraph_size: int=5,
+        # for cpu
+        cpu_threads: int=1,
+        use_mkldnn: bool=False,
+        # for trt or mkldnn
+        precision: str="fp32"):
+    """
+    Args:
+        model_dir (os.PathLike): root path of model.pdmodel and model.pdiparams.
+        model_file (os.PathLike): name of model_file.
+        params_file (os.PathLike): name of params_file.
+        device (str): Choose the device you want to run, it can be: cpu/gpu, default is cpu.
+        use_trt (bool): whether to use TensorRT or not in GPU.
+        use_dynamic_shape (bool): use dynamic shape or not in TensorRT.
+        use_mkldnn (bool): whether to use MKLDNN or not in CPU.
+        cpu_threads (int): num of thread when use CPU.
+        precision (str): mode of running (fp32/fp16/bf16/int8).
+    """
+    rerun_flag = False
+    if device != "gpu" and use_trt:
+        raise ValueError(
+            "Predict by TensorRT mode: {}, expect device=='gpu', but device == {}".
+            format(precision, device))
     config = inference.Config(
         str(Path(model_dir) / model_file), str(Path(model_dir) / params_file))
+    config.enable_memory_optim()
+    config.switch_ir_optim(True)
     if device == "gpu":
         config.enable_use_gpu(100, 0)
-    elif device == "cpu":
+    else:
         config.disable_gpu()
-    config.enable_memory_optim()
+        config.set_cpu_math_library_num_threads(cpu_threads)
+        if use_mkldnn:
+            # fp32
+            config.enable_mkldnn()
+            if precision == "int8":
+                config.enable_mkldnn_int8({
+                    "conv2d_transpose", "conv2d", "depthwise_conv2d", "pool2d",
+                    "transpose2", "elementwise_mul"
+                })
+                # config.enable_mkldnn_int8()
+            elif precision in {"fp16", "bf16"}:
+                config.enable_mkldnn_bfloat16()
+            print("MKLDNN with {}".format(precision))
+    if use_trt:
+        if precision == "bf16":
+            print("paddle trt does not support bf16, switching to fp16.")
+            precision = "fp16"
+        precision_map = {
+            "int8": inference.Config.Precision.Int8,
+            "fp32": inference.Config.Precision.Float32,
+            "fp16": inference.Config.Precision.Half,
+        }
+        assert precision in precision_map.keys()
+        pdtxt_name = model_file.split(".")[0] + "_" + precision + ".txt"
+        if use_dynamic_shape:
+            dynamic_shape_file = os.path.join(model_dir, pdtxt_name)
+            if os.path.exists(dynamic_shape_file):
+                config.enable_tuned_tensorrt_dynamic_shape(dynamic_shape_file,
+                                                           True)
+                # for fastspeech2
+                config.exp_disable_tensorrt_ops(["reshape2"])
+                print("trt set dynamic shape done!")
+            else:
+                # In order to avoid memory overflow when collecting dynamic shapes, it is changed to use CPU.
+                config.disable_gpu()
+                config.set_cpu_math_library_num_threads(10)
+                config.collect_shape_range_info(dynamic_shape_file)
+                print("Start collect dynamic shape...")
+                rerun_flag = True
+
+        if not rerun_flag:
+            print("Tensor RT with {}".format(precision))
+            config.enable_tensorrt_engine(
+                workspace_size=1 << 30,
+                max_batch_size=1,
+                min_subgraph_size=min_subgraph_size,
+                precision_mode=precision_map[precision],
+                use_static=True,
+                use_calib_mode=False, )
+
     predictor = inference.create_predictor(config)
     return predictor
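
A minimal usage sketch for the extended get_predictor signature on the CPU/MKLDNN path (the inference dir and model file names below are placeholders, not values fixed by this patch; any exported .pdmodel/.pdiparams pair works the same way):

    from paddlespeech.t2s.exps.syn_utils import get_predictor

    # CPU inference with MKLDNN bf16 and 4 math-library threads.
    # "exp/default/inference" and "fastspeech2_csmsc.*" are hypothetical paths.
    am_predictor = get_predictor(
        model_dir="exp/default/inference",
        model_file="fastspeech2_csmsc.pdmodel",
        params_file="fastspeech2_csmsc.pdiparams",
        device="cpu",
        use_mkldnn=True,
        cpu_threads=4,
        precision="bf16")
    # Standard Paddle Inference predictor API from here on.
    print(am_predictor.get_input_names())

The TensorRT path is two-pass by design: on the first run with use_trt=True and no dynamic-shape file, the function falls back to CPU, collects shape ranges into {model_file}_{precision}.txt, and sets rerun_flag; a later run finds that file and enables the tuned TensorRT engine.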