diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 85bb877b1..d7bee6d7f 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -175,7 +175,7 @@ class U2Trainer(Trainer): observation['batch_cost'] = observation[ 'reader_cost'] + observation['step_cost'] observation['samples'] = observation['batch_size'] - observation['ips,sent./sec'] = observation[ + observation['ips,samples/s'] = observation[ 'batch_size'] / observation['batch_cost'] for k, v in observation.items(): msg += f" {k.split(',')[0]}: " diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py index cac5e5704..de90c9ef8 100644 --- a/paddlespeech/s2t/training/trainer.py +++ b/paddlespeech/s2t/training/trainer.py @@ -252,8 +252,7 @@ class Trainer(): if self.args.benchmark_max_step and self.iteration > self.args.benchmark_max_step: logger.info( f"Reach benchmark-max-step: {self.args.benchmark_max_step}") - sys.exit( - f"Reach benchmark-max-step: {self.args.benchmark_max_step}") + sys.exit(0) def do_train(self): """The training process control by epoch.""" @@ -282,7 +281,7 @@ class Trainer(): observation['batch_cost'] = observation[ 'reader_cost'] + observation['step_cost'] observation['samples'] = observation['batch_size'] - observation['ips[sent./sec]'] = observation[ + observation['ips samples/s'] = observation[ 'batch_size'] / observation['batch_cost'] for k, v in observation.items(): msg += f" {k}: " diff --git a/tests/test_tipc/benchmark_train.sh b/tests/test_tipc/benchmark_train.sh new file mode 100644 index 000000000..cc61567e3 --- /dev/null +++ b/tests/test_tipc/benchmark_train.sh @@ -0,0 +1,258 @@ +#!/bin/bash +source test_tipc/common_func.sh + +# set env +python=python +export model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d"/" -f 3` +export model_commit=$(git log|head -n1|awk '{print $2}') +export str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`) +export frame_version=${str_tmp%%.post*} +export frame_commit=$(echo `${python} -c "import paddle;print(paddle.version.commit)"`) + +# run benchmark sh +# Usage: +# bash run_benchmark_train.sh config.txt params +# or +# bash run_benchmark_train.sh config.txt + +function func_parser_params(){ + strs=$1 + IFS="=" + array=(${strs}) + tmp=${array[1]} + echo ${tmp} +} + +function func_sed_params(){ + filename=$1 + line=$2 + param_value=$3 + params=`sed -n "${line}p" $filename` + IFS=":" + array=(${params}) + key=${array[0]} + value=${array[1]} + if [[ $value =~ 'benchmark_train' ]];then + IFS='=' + _val=(${value}) + param_value="${_val[0]}=${param_value}" + fi + new_params="${key}:${param_value}" + IFS=";" + cmd="sed -i '${line}s/.*/${new_params}/' '${filename}'" + eval $cmd +} + +function set_gpu_id(){ + string=$1 + _str=${string:1:6} + IFS="C" + arr=(${_str}) + M=${arr[0]} + P=${arr[1]} + gn=`expr $P - 1` + gpu_num=`expr $gn / $M` + seq=`seq -s "," 0 $gpu_num` + echo $seq +} + +function get_repo_name(){ + IFS=";" + cur_dir=$(pwd) + IFS="/" + arr=(${cur_dir}) + echo ${arr[-1]} +} + +FILENAME=$1 +# copy FILENAME as new +new_filename="./test_tipc/benchmark_train.txt" +cmd=`yes|cp $FILENAME $new_filename` +FILENAME=$new_filename +# MODE must be one of ['benchmark_train'] +MODE=$2 +PARAMS=$3 +# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamic_bs8_null_DP_N1C1 +IFS=$'\n' +# parser params from train_benchmark.txt +dataline=`cat $FILENAME` +# parser params +IFS=$'\n' +lines=(${dataline}) +model_name=$(func_parser_value "${lines[1]}") + +# 获取benchmark_params所在的行数 +line_num=`grep -n "train_benchmark_params" $FILENAME | cut -d ":" -f 1` +# for train log parser +batch_size=$(func_parser_value "${lines[line_num]}") +line_num=`expr $line_num + 1` +fp_items=$(func_parser_value "${lines[line_num]}") +line_num=`expr $line_num + 1` +epoch=$(func_parser_value "${lines[line_num]}") + +line_num=`expr $line_num + 1` +profile_option_key=$(func_parser_key "${lines[line_num]}") +profile_option_params=$(func_parser_value "${lines[line_num]}") +profile_option="${profile_option_key}:${profile_option_params}" + +line_num=`expr $line_num + 1` +flags_value=$(func_parser_value "${lines[line_num]}") +# set flags +IFS=";" +flags_list=(${flags_value}) +for _flag in ${flags_list[*]}; do + cmd="export ${_flag}" + eval $cmd +done + +# set log_name +repo_name=$(get_repo_name ) +SAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)} # */benchmark_log +mkdir -p "${SAVE_LOG}/benchmark_log/" +status_log="${SAVE_LOG}/benchmark_log/results.log" + +# The number of lines in which train params can be replaced. +line_python=3 +line_gpuid=4 +line_precision=6 +line_epoch=7 +line_batchsize=9 +line_profile=13 +line_eval_py=24 +line_export_py=30 + +func_sed_params "$FILENAME" "${line_eval_py}" "null" +func_sed_params "$FILENAME" "${line_export_py}" "null" +func_sed_params "$FILENAME" "${line_python}" "$python" + +# if params +if [ ! -n "$PARAMS" ] ;then + # PARAMS input is not a word. + IFS="|" + batch_size_list=(${batch_size}) + fp_items_list=(${fp_items}) + device_num_list=(N1C4) + run_mode="DP" +else + # parser params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num} + IFS="_" + params_list=(${PARAMS}) + model_type=${params_list[0]} + batch_size=${params_list[1]} + batch_size=`echo ${batch_size} | tr -cd "[0-9]" ` + precision=${params_list[2]} + # run_process_type=${params_list[3]} + run_mode=${params_list[3]} + device_num=${params_list[4]} + IFS=";" + + if [ ${precision} = "null" ];then + precision="fp32" + fi + + fp_items_list=($precision) + batch_size_list=($batch_size) + device_num_list=($device_num) +fi + +IFS="|" +for batch_size in ${batch_size_list[*]}; do + for precision in ${fp_items_list[*]}; do + for device_num in ${device_num_list[*]}; do + # sed batchsize and precision + func_sed_params "$FILENAME" "${line_precision}" "$precision" + func_sed_params "$FILENAME" "${line_batchsize}" "$MODE=$batch_size" + func_sed_params "$FILENAME" "${line_epoch}" "$MODE=$epoch" + gpu_id=$(set_gpu_id $device_num) + + if [ ${#gpu_id} -le 1 ];then + run_process_type="SingleP" + log_path="$SAVE_LOG/profiling_log" + mkdir -p $log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_profiling" + func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id + # set profile_option params + tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"` + + # run test_train_inference_python.sh + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + eval $cmd + eval "cat ${log_path}/${log_name}" + + # without profile + log_path="$SAVE_LOG/train_log" + speed_log_path="$SAVE_LOG/index" + mkdir -p $log_path + mkdir -p $speed_log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed" + func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + job_bt=`date '+%Y%m%d%H%M%S'` + eval $cmd + job_et=`date '+%Y%m%d%H%M%S'` + export model_run_time=$((${job_et}-${job_bt})) + eval "cat ${log_path}/${log_name}" + + # parser log + _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}" + cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ + --speed_log_file '${speed_log_path}/${speed_log_name}' \ + --model_name ${_model_name} \ + --base_batch_size ${batch_size} \ + --run_mode ${run_mode} \ + --run_process_type ${run_process_type} \ + --fp_item ${precision} \ + --keyword ips: \ + --skip_steps 2 \ + --device_num ${device_num} \ + --speed_unit samples/s \ + --convergence_key loss: " + echo $cmd + eval $cmd + last_status=${PIPESTATUS[0]} + status_check $last_status "${cmd}" "${status_log}" + else + IFS=";" + unset_env=`unset CUDA_VISIBLE_DEVICES` + run_process_type="MultiP" + log_path="$SAVE_LOG/train_log" + speed_log_path="$SAVE_LOG/index" + mkdir -p $log_path + mkdir -p $speed_log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed" + func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id + func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + job_bt=`date '+%Y%m%d%H%M%S'` + eval $cmd + job_et=`date '+%Y%m%d%H%M%S'` + export model_run_time=$((${job_et}-${job_bt})) + eval "cat ${log_path}/${log_name}" + # parser log + _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}" + + cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ + --speed_log_file '${speed_log_path}/${speed_log_name}' \ + --model_name ${_model_name} \ + --base_batch_size ${batch_size} \ + --run_mode ${run_mode} \ + --run_process_type ${run_process_type} \ + --fp_item ${precision} \ + --keyword ips: \ + --skip_steps 2 \ + --device_num ${device_num} \ + --speed_unit images/s \ + --convergence_key loss: " + echo $cmd + eval $cmd + last_status=${PIPESTATUS[0]} + status_check $last_status "${cmd}" "${status_log}" + fi + done + done +done \ No newline at end of file diff --git a/tests/test_tipc/configs/conformer/train_benchmark.txt b/tests/test_tipc/configs/conformer/train_benchmark.txt new file mode 100644 index 000000000..68f9e1ef6 --- /dev/null +++ b/tests/test_tipc/configs/conformer/train_benchmark.txt @@ -0,0 +1,57 @@ +===========================train_params=========================== +model_name:conformer +python:python3.7 +gpu_list:0|0,1 +null:null +null:null +--benchmark-max-step:50 +null:null +--benchmark-batch-size:16 +null:null +null:null +null:null +null:null +## +trainer:norm_train +norm_train: ../paddlespeech/s2t/exps/u2/bin/train.py --config test_tipc/conformer/benchmark_train/conf/conformer.yaml --output test_tipc/conformer/benchmark_train/outputs --seed 1024 +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +null:null +null:null +norm_export: null +quant_export:null +fpgm_export:null +distill_export:null +export1:null +export2:null +null:null +infer_model:null +infer_export:null +infer_quant:null +inference:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +===========================train_benchmark_params========================== +batch_size:16|30 +fp_items:fp32 +iteration:50 +--profiler-options:"batch_range=[10,35];state=GPU;tracer_option=Default;profile_path=model.profile" +flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096" diff --git a/tests/test_tipc/conformer/scripts/aishell_tiny.py b/tests/test_tipc/conformer/scripts/aishell_tiny.py new file mode 100644 index 000000000..14f09f17b --- /dev/null +++ b/tests/test_tipc/conformer/scripts/aishell_tiny.py @@ -0,0 +1,159 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare Aishell mandarin dataset + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +import argparse +import codecs +import json +import os +from pathlib import Path + +import soundfile + +from utils.utility import download +from utils.utility import unpack + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL_ROOT_TAG +DATA_URL = URL_ROOT + '/data_aishell_tiny.tgz' +MD5_DATA = '337b1b1ea016761d4fd3225c5b8799b4' +RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz' +MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5' + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/Aishell", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path_prefix): + print("Creating manifest %s ..." % manifest_path_prefix) + json_lines = [] + transcript_path = os.path.join(data_dir, 'transcript', + 'aishell_transcript_v0.8.txt') + transcript_dict = {} + for line in codecs.open(transcript_path, 'r', 'utf-8'): + line = line.strip() + if line == '': + continue + audio_id, text = line.split(' ', 1) + # remove withespace, charactor text + text = ''.join(text.split()) + transcript_dict[audio_id] = text + + data_types = ['train', 'dev', 'test'] + for dtype in data_types: + del json_lines[:] + total_sec = 0.0 + total_text = 0.0 + total_num = 0 + + audio_dir = os.path.join(data_dir, 'wav', dtype) + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + for fname in filelist: + audio_path = os.path.abspath(os.path.join(subfolder, fname)) + audio_id = os.path.basename(fname)[:-4] + # if no transcription for audio then skipped + if audio_id not in transcript_dict: + continue + + utt2spk = Path(audio_path).parent.name + audio_data, samplerate = soundfile.read(audio_path) + duration = float(len(audio_data) / samplerate) + text = transcript_dict[audio_id] + json_lines.append( + json.dumps( + { + 'utt': audio_id, + 'utt2spk': str(utt2spk), + 'feat': audio_path, + 'feat_shape': (duration, ), # second + 'text': text + }, + ensure_ascii=False)) + + total_sec += duration + total_text += len(text) + total_num += 1 + + manifest_path = manifest_path_prefix + '.' + dtype + with codecs.open(manifest_path, 'w', 'utf-8') as fout: + for line in json_lines: + fout.write(line + '\n') + + manifest_dir = os.path.dirname(manifest_path_prefix) + meta_path = os.path.join(manifest_dir, dtype) + '.meta' + with open(meta_path, 'w') as f: + print(f"{dtype}:", file=f) + print(f"{total_num} utts", file=f) + print(f"{total_sec / (60*60)} h", file=f) + print(f"{total_text} text", file=f) + print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_sec / total_num} sec/utt", file=f) + + +def prepare_dataset(url, md5sum, target_dir, manifest_path=None): + """Download, unpack and create manifest file.""" + data_dir = os.path.join(target_dir, 'data_aishell_tiny') + if not os.path.exists(data_dir): + filepath = download(url, md5sum, target_dir) + unpack(filepath, target_dir) + # unpack all audio tar files + audio_dir = os.path.join(data_dir, 'wav') + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + for ftar in filelist: + unpack(os.path.join(subfolder, ftar), subfolder, True) + else: + print("Skip downloading and unpacking. Data already exists in %s." % + target_dir) + + if manifest_path: + create_manifest(data_dir, manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset( + url=DATA_URL, + md5sum=MD5_DATA, + target_dir=args.target_dir, + manifest_path=args.manifest_prefix) + + prepare_dataset( + url=RESOURCE_URL, + md5sum=MD5_RESOURCE, + target_dir=args.target_dir, + manifest_path=None) + + print("Data download and manifest prepare done!") + + +if __name__ == '__main__': + main() diff --git a/tests/test_tipc/docs/benchmark_train.md b/tests/test_tipc/docs/benchmark_train.md new file mode 100644 index 000000000..af61f5971 --- /dev/null +++ b/tests/test_tipc/docs/benchmark_train.md @@ -0,0 +1,53 @@ +# TIPC Linux端Benchmark测试文档 + + 该文档为Benchmark测试说明,Benchmark预测功能测试的主程序为`benchmark_train.sh`,用于验证监控模型训练的性能。 + + + # 1. 测试流程 + ## 1.1 准备数据和环境安装 +请在 repo根目录/tests 下运行 +运行`test_tipc/prepare.sh`,完成训练数据准备和安装环境流程。 + + ```shell + # 运行格式:bash test_tipc/prepare.sh train_benchmark.txt mode + bash test_tipc/prepare.sh test_tipc/configs/conformer/train_benchmark.txt benchmark_train + ``` + + ## 1.2 功能测试 + 执行`test_tipc/benchmark_train.sh`,完成模型训练和日志解析 + + ```shell + # 运行格式:bash test_tipc/benchmark_train.sh train_benchmark.txt mode + bash test_tipc/benchmark_train.sh test_tipc/configs/conformer/train_benchmark.txt benchmark_train + ``` + + `test_tipc/benchmark_train.sh`支持根据传入的第三个参数实现只运行某一个训练配置,如下: + ```shell + # 运行格式:bash test_tipc/benchmark_train.sh train_benchmark.txt mode + bash test_tipc/benchmark_train.sh test_tipc/configs/conformer/train_benchmark.txt benchmark_train dynamic_bs16_fp32_DP_N1C1 + ``` + dynamic_bs16_fp32_DP_N1C1为test_tipc/benchmark_train.sh传入的参数,格式如下: + `${modeltype}_${batch_size}_${fp_item}_${run_mode}_${device_num}` + 包含的信息有:模型类型、batchsize大小、训练精度如fp32,fp16等、分布式运行模式以及分布式训练使用的机器信息如单机单卡(N1C1)。 + + + ## 2. 日志输出 + + 运行后将保存模型的训练日志和解析日志,使用 `test_tipc/configs/conformer/train_benchmark.txt` 参数文件的训练日志解析结果是: + + ``` + {"model_branch": "dygaph", "model_commit": "", "model_name": "conformer_bs16_fp32_SingleP_DP", "batch_size": 16, "fp_item": "fp32", "run_process_type": "SingleP", "run_mode": "DP", "convergence_value": "", "convergence_key": "loss:", "ips": , "speed_unit": "samples/s", "device_num": "N1C1", "model_run_time": "0", "frame_commit": "", "frame_version": "0.0.0"} + ``` + + 训练日志和日志解析结果保存在test目录下,文件组织格式如下: + ``` + test/ + ├── index + │   ├── tests_conformer_bs16_fp32_SingleP_DP_N1C1_speed + │   └── tests_conformer_bs16_fp32_SingleP_DP_N1C8_speed + ├── profiling_log + │   └── tests_conformer_bs16_fp32_SingleP_DP_N1C1_profiling + └── train_log + ├── tests_conformer_bs16_fp32_SingleP_DP_N1C1_log + └── tests_conformer_bs16_fp32_SingleP_DP_N1C8_log + ``` diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh new file mode 100644 index 000000000..ea34acb85 --- /dev/null +++ b/tests/test_tipc/prepare.sh @@ -0,0 +1,61 @@ +#!/bin/bash +source test_tipc/common_func.sh + +FILENAME=$1 + +# MODE be one of ['benchmark_train_lite_infer' 'benchmark_train_whole_infer' 'whole_train_whole_infer', +# 'whole_infer', 'klquant_whole_infer', +# 'cpp_infer', 'serving_infer', 'benchmark_train'] + + +MODE=$2 + +dataline=$(cat ${FILENAME}) + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# The training params +model_name=$(func_parser_value "${lines[1]}") + +echo "model_name:"${model_name} +trainer_list=$(func_parser_value "${lines[14]}") + +if [ ${MODE} = "benchmark_train" ];then + curPath=$(readlink -f "$(dirname "$0")") + echo "curPath:"${curPath} + cd ${curPath}/../.. + pip install . + cd - + if [ ${model_name} == "conformer" ]; then + # set the URL for aishell_tiny dataset + URL='None' + echo "URL:"${URL} + if [ ${URL} == 'None' ];then + echo "please contact author to get the URL.\n" + exit + fi + sed -i "s#^URL_ROOT_TAG#URL_ROOT = '${URL}'#g" ${curPath}/conformer/scripts/aishell_tiny.py + cp ${curPath}/conformer/scripts/aishell_tiny.py ${curPath}/../../dataset/aishell/ + cd ${curPath}/../../examples/aishell/asr1 + source path.sh + # download audio data + sed -i "s#aishell.py#aishell_tiny.py#g" ./local/data.sh + bash ./local/data.sh || exit -1 + if [ $? -ne 0 ]; then + exit 1 + fi + mkdir -p ${curPath}/conformer/benchmark_train/ + cp -rf conf ${curPath}/conformer/benchmark_train/ + cp -rf data ${curPath}/conformer/benchmark_train/ + cd ${curPath} + + sed -i "s#accum_grad: 2#accum_grad: 1#g" ${curPath}/conformer/benchmark_train/conf/conformer.yaml + sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/conformer.yaml + sed -i "s#conf/#test_tipc/conformer/benchmark_train/conf/#g" ${curPath}/conformer/benchmark_train/conf/conformer.yaml + sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/tuning/decode.yaml + sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/preprocess.yaml + + fi +fi diff --git a/tests/test_tipc/test_train_inference_python.sh b/tests/test_tipc/test_train_inference_python.sh new file mode 100644 index 000000000..2043e258d --- /dev/null +++ b/tests/test_tipc/test_train_inference_python.sh @@ -0,0 +1,385 @@ +#!/bin/bash +source test_tipc/common_func.sh + +FILENAME=$1 +# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer', 'whole_infer', 'klquant_whole_infer'] +MODE=$2 + +dataline=$(awk 'NR==1, NR==51{print}' $FILENAME) + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# The training params +model_name=$(func_parser_value "${lines[1]}") +python=$(func_parser_value "${lines[2]}") +gpu_list=$(func_parser_value "${lines[3]}") +train_use_gpu_key=$(func_parser_key "${lines[4]}") +train_use_gpu_value=$(func_parser_value "${lines[4]}") +autocast_list=$(func_parser_value "${lines[5]}") +autocast_key=$(func_parser_key "${lines[5]}") +epoch_key=$(func_parser_key "${lines[6]}") +epoch_num=$(func_parser_params "${lines[6]}") +save_model_key=$(func_parser_key "${lines[7]}") +train_batch_key=$(func_parser_key "${lines[8]}") +train_batch_value=$(func_parser_params "${lines[8]}") +pretrain_model_key=$(func_parser_key "${lines[9]}") +pretrain_model_value=$(func_parser_value "${lines[9]}") +train_model_name=$(func_parser_value "${lines[10]}") +train_infer_img_dir=$(func_parser_value "${lines[11]}") +train_param_key1=$(func_parser_key "${lines[12]}") +train_param_value1=$(func_parser_value "${lines[12]}") + +trainer_list=$(func_parser_value "${lines[14]}") +trainer_norm=$(func_parser_key "${lines[15]}") +norm_trainer=$(func_parser_value "${lines[15]}") +pact_key=$(func_parser_key "${lines[16]}") +pact_trainer=$(func_parser_value "${lines[16]}") +fpgm_key=$(func_parser_key "${lines[17]}") +fpgm_trainer=$(func_parser_value "${lines[17]}") +distill_key=$(func_parser_key "${lines[18]}") +distill_trainer=$(func_parser_value "${lines[18]}") +trainer_key1=$(func_parser_key "${lines[19]}") +trainer_value1=$(func_parser_value "${lines[19]}") +trainer_key2=$(func_parser_key "${lines[20]}") +trainer_value2=$(func_parser_value "${lines[20]}") + +eval_py=$(func_parser_value "${lines[23]}") +eval_key1=$(func_parser_key "${lines[24]}") +eval_value1=$(func_parser_value "${lines[24]}") + +save_infer_key=$(func_parser_key "${lines[27]}") +save_infer_value=$(func_parser_value "${lines[27]}") +export_weight=$(func_parser_key "${lines[28]}") +norm_export=$(func_parser_value "${lines[29]}") +pact_export=$(func_parser_value "${lines[30]}") +fpgm_export=$(func_parser_value "${lines[31]}") +distill_export=$(func_parser_value "${lines[32]}") +export_key1=$(func_parser_key "${lines[33]}") +export_value1=$(func_parser_value "${lines[33]}") +export_key2=$(func_parser_key "${lines[34]}") +export_value2=$(func_parser_value "${lines[34]}") +inference_dir=$(func_parser_value "${lines[35]}") + +# parser inference model +infer_model_dir_list=$(func_parser_value "${lines[36]}") +infer_export_list=$(func_parser_value "${lines[37]}") +infer_is_quant=$(func_parser_value "${lines[38]}") +# parser inference +inference_py=$(func_parser_value "${lines[39]}") +use_gpu_key=$(func_parser_key "${lines[40]}") +use_gpu_list=$(func_parser_value "${lines[40]}") +use_mkldnn_key=$(func_parser_key "${lines[41]}") +use_mkldnn_list=$(func_parser_value "${lines[41]}") +cpu_threads_key=$(func_parser_key "${lines[42]}") +cpu_threads_list=$(func_parser_value "${lines[42]}") +batch_size_key=$(func_parser_key "${lines[43]}") +batch_size_list=$(func_parser_value "${lines[43]}") +use_trt_key=$(func_parser_key "${lines[44]}") +use_trt_list=$(func_parser_value "${lines[44]}") +precision_key=$(func_parser_key "${lines[45]}") +precision_list=$(func_parser_value "${lines[45]}") +infer_model_key=$(func_parser_key "${lines[46]}") +image_dir_key=$(func_parser_key "${lines[47]}") +infer_img_dir=$(func_parser_value "${lines[47]}") +save_log_key=$(func_parser_key "${lines[48]}") +benchmark_key=$(func_parser_key "${lines[49]}") +benchmark_value=$(func_parser_value "${lines[49]}") +infer_key1=$(func_parser_key "${lines[50]}") +infer_value1=$(func_parser_value "${lines[50]}") + +# parser klquant_infer +if [ ${MODE} = "klquant_whole_infer" ]; then + dataline=$(awk 'NR==1 NR==17{print}' $FILENAME) + lines=(${dataline}) + model_name=$(func_parser_value "${lines[1]}") + python=$(func_parser_value "${lines[2]}") + # parser inference model + infer_model_dir_list=$(func_parser_value "${lines[3]}") + infer_export_list=$(func_parser_value "${lines[4]}") + infer_is_quant=$(func_parser_value "${lines[5]}") + # parser inference + inference_py=$(func_parser_value "${lines[6]}") + use_gpu_key=$(func_parser_key "${lines[7]}") + use_gpu_list=$(func_parser_value "${lines[7]}") + use_mkldnn_key=$(func_parser_key "${lines[8]}") + use_mkldnn_list=$(func_parser_value "${lines[8]}") + cpu_threads_key=$(func_parser_key "${lines[9]}") + cpu_threads_list=$(func_parser_value "${lines[9]}") + batch_size_key=$(func_parser_key "${lines[10]}") + batch_size_list=$(func_parser_value "${lines[10]}") + use_trt_key=$(func_parser_key "${lines[11]}") + use_trt_list=$(func_parser_value "${lines[11]}") + precision_key=$(func_parser_key "${lines[12]}") + precision_list=$(func_parser_value "${lines[12]}") + infer_model_key=$(func_parser_key "${lines[13]}") + image_dir_key=$(func_parser_key "${lines[14]}") + infer_img_dir=$(func_parser_value "${lines[14]}") + save_log_key=$(func_parser_key "${lines[15]}") + benchmark_key=$(func_parser_key "${lines[16]}") + benchmark_value=$(func_parser_value "${lines[16]}") + infer_key1=$(func_parser_key "${lines[17]}") + infer_value1=$(func_parser_value "${lines[17]}") +fi + +save_model_value=$(func_parser_value "${lines[7]}") +if [[ ${save_model_value} = " " ]] || [[ ${save_model_value} = "null" ]] || [[ ${save_model_value} = "" ]];then + LOG_PATH="./test_tipc/output" +else + LOG_PATH=${save_model_value} +fi +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_python.log" + + +function func_inference(){ + IFS='|' + _python=$1 + _script=$2 + _model_dir=$3 + _log_path=$4 + _img_dir=$5 + _flag_quant=$6 + # inference + for use_gpu in ${use_gpu_list[*]}; do + if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then + for use_mkldnn in ${use_mkldnn_list[*]}; do + if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then + continue + fi + for threads in ${cpu_threads_list[*]}; do + for batch_size in ${batch_size_list[*]}; do + for precision in ${precision_list[*]}; do + if [ ${use_mkldnn} = "False" ] && [ ${precision} = "fp16" ]; then + continue + fi # skip when enable fp16 but disable mkldnn + if [ ${_flag_quant} = "True" ] && [ ${precision} != "int8" ]; then + continue + fi # skip when quant model inference but precision is not int8 + set_precision=$(func_set_params "${precision_key}" "${precision}") + + _save_log_path="${_log_path}/python_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") + set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") + set_cpu_threads=$(func_set_params "${cpu_threads_key}" "${threads}") + set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}") + set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}") + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_precision} ${set_infer_params1} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" + done + done + done + done + elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then + for use_trt in ${use_trt_list[*]}; do + for precision in ${precision_list[*]}; do + if [[ ${_flag_quant} = "False" ]] && [[ ${precision} =~ "int8" ]]; then + continue + fi + if [[ ${precision} =~ "fp16" || ${precision} =~ "int8" ]] && [ ${use_trt} = "False" ]; then + continue + fi + if [[ ${use_trt} = "False" || ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then + continue + fi + for batch_size in ${batch_size_list[*]}; do + _save_log_path="${_log_path}/python_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") + set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") + set_tensorrt=$(func_set_params "${use_trt_key}" "${use_trt}") + set_precision=$(func_set_params "${precision_key}" "${precision}") + set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}") + set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}") + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" + + done + done + done + else + echo "Does not support hardware other than CPU and GPU Currently!" + fi + done +} + +if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then + GPUID=$3 + if [ ${#GPUID} -le 0 ];then + env=" " + else + env="export CUDA_VISIBLE_DEVICES=${GPUID}" + fi + # set CUDA_VISIBLE_DEVICES + eval $env + export Count=0 + IFS="|" + infer_run_exports=(${infer_export_list}) + infer_quant_flag=(${infer_is_quant}) + for infer_model in ${infer_model_dir_list[*]}; do + # run export + if [ ${infer_run_exports[Count]} != "null" ];then + save_infer_dir=$(dirname $infer_model) + set_export_weight=$(func_set_params "${export_weight}" "${infer_model}") + set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_dir}") + # For lac which needs the `data_dir` args + set_export1_key=$(func_set_params "${export_key1}" "${export_value1}") + export_cmd="${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key} ${set_export1_key}" + echo ${infer_run_exports[Count]} + echo $export_cmd + eval $export_cmd + status_export=$? + status_check $status_export "${export_cmd}" "${status_log}" + else + save_infer_dir=${infer_model} + fi + #run inference + is_quant=${infer_quant_flag[Count]} + if [ ${MODE} = "klquant_infer" ]; then + is_quant="True" + fi + func_inference "${python}" "${inference_py}" "${save_infer_dir}" "${LOG_PATH}" "${infer_img_dir}" ${is_quant} + Count=$(($Count + 1)) + done +else + IFS="|" + export Count=0 + USE_GPU_KEY=(${train_use_gpu_value}) + for gpu in ${gpu_list[*]}; do + train_use_gpu=${USE_GPU_KEY[Count]} + Count=$(($Count + 1)) + ips="" + if [ ${gpu} = "-1" ];then + env="" + elif [ ${#gpu} -le 1 ];then + env="export CUDA_VISIBLE_DEVICES=${gpu}" + eval ${env} + elif [ ${#gpu} -le 15 ];then + IFS="," + array=(${gpu}) + env="export CUDA_VISIBLE_DEVICES=${array[0]}" + IFS="|" + else + IFS=";" + array=(${gpu}) + ips=${array[0]} + gpu=${array[1]} + IFS="|" + env=" " + fi + for autocast in ${autocast_list[*]}; do + if [ ${autocast} = "amp" ]; then + set_amp_config="Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True" + else + set_amp_config=" " + fi + for trainer in ${trainer_list[*]}; do + flag_quant=False + if [ ${trainer} = ${pact_key} ]; then + run_train=${pact_trainer} + run_export=${pact_export} + flag_quant=True + elif [ ${trainer} = "${fpgm_key}" ]; then + run_train=${fpgm_trainer} + run_export=${fpgm_export} + elif [ ${trainer} = "${distill_key}" ]; then + run_train=${distill_trainer} + run_export=${distill_export} + elif [ ${trainer} = ${trainer_key1} ]; then + run_train=${trainer_value1} + run_export=${export_value1} + elif [[ ${trainer} = ${trainer_key2} ]]; then + run_train=${trainer_value2} + run_export=${export_value2} + else + run_train=${norm_trainer} + run_export=${norm_export} + fi + + if [ ${run_train} = "null" ]; then + continue + fi + set_autocast=$(func_set_params "${autocast_key}" "${autocast}") + set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}") + set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}") + set_batchsize=$(func_set_params "${train_batch_key}" "${train_batch_value}") + set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}") + set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${train_use_gpu}") + if [ ${#ips} -le 26 ];then + save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}" + nodes=1 + else + IFS="," + ips_array=(${ips}) + IFS="|" + nodes=${#ips_array[@]} + save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}" + fi + + # load pretrain from norm training if current trainer is pact or fpgm trainer + if ([ ${trainer} = ${pact_key} ] || [ ${trainer} = ${fpgm_key} ]) && [ ${nodes} -le 1 ]; then + set_pretrain="${load_norm_train_model}" + fi + + set_save_model=$(func_set_params "${save_model_key}" "${save_log}") + if [ ${#gpu} -le 2 ];then # train with cpu or single gpu + cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} " + elif [ ${#ips} -le 26 ];then # train with multi-gpu + cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" + else # train with multi-machine + cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" + fi + # run train + eval "unset CUDA_VISIBLE_DEVICES" + eval $cmd + status_check $? "${cmd}" "${status_log}" + + set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/${train_model_name}") + # save norm trained models to set pretrain for pact training and fpgm training + if [ ${trainer} = ${trainer_norm} ] && [ ${nodes} -le 1 ]; then + load_norm_train_model=${set_eval_pretrain} + fi + # run eval + if [ ${eval_py} != "null" ]; then + set_eval_params1=$(func_set_params "${eval_key1}" "${eval_value1}") + eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1}" + eval $eval_cmd + status_check $? "${eval_cmd}" "${status_log}" + fi + # run export model + if [ ${run_export} != "null" ]; then + # run export model + save_infer_path="${save_infer_value}" + set_export_weight=$(func_set_params "${export_weight}" "${save_log}/${train_model_name}") + set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_path}") + # For lac which needs the `data_dir` args + set_export1_key=$(func_set_params "${export_key1}" "${export_value1}") + export_cmd="${python} ${run_export} ${set_export_weight} ${set_save_infer_key} ${set_export1_key}" + eval $export_cmd + status_check $? "${export_cmd}" "${status_log}" + + #run inference + eval $env + save_infer_path="${save_infer_value}" + + if [ ${inference_dir} != "null" ] && [ ${inference_dir} != '##' ]; then + infer_model_dir="${save_infer_path}/${inference_dir}" + else + infer_model_dir=${save_infer_path} + fi + func_inference "${python}" "${inference_py}" "${infer_model_dir}" "${LOG_PATH}" "${train_infer_img_dir}" "${flag_quant}" + + eval "unset CUDA_VISIBLE_DEVICES" + fi + done # done with: for trainer in ${trainer_list[*]}; do + done # done with: for autocast in ${autocast_list[*]}; do + done # done with: for gpu in ${gpu_list[*]}; do +fi # end if [ ${MODE} = "infer" ]; then