import os
import warnings
import datetime

import numpy as np
import pandas as pd

from generate_feature import get_beta_target, add_last_next_time4fault, get_feature, \
    get_duration_minutes_fea, get_nearest_msg_fea, get_server_model_sn_fea_2, \
    get_server_model_fea, get_msg_text_fea_all, get_key_word_cross_fea, get_server_model_time_interval_stat_fea, \
    get_w2v_feats
from model import run_cbt, run_lgb
# NOTE: PSEUDO_FALG is spelled this way in utils, so the name is kept as-is.
from utils import RESULT_DIR, TRAIN_DIR, \
    TEST_A_DIR, KEY_WORDS, get_word_counter, search_weight, macro_f1, TIME_INTERVAL, PSEUDO_FALG, GENERATION_DIR

warnings.filterwarnings('ignore')


def get_label(PSEUDO_FALG):
    # The *_path variables are module-level globals defined further down,
    # before this function is called.
    preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
    preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)
    if PSEUDO_FALG:
        print('Loading pseudo-label data')
        pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
        label = pd.concat([preliminary_train_label_dataset, pseudo_labels, preliminary_train_label_dataset_s],
                          ignore_index=True, axis=0).sort_values(
            ['sn', 'fault_time']).reset_index(drop=True)
    else:
        print('Not using pseudo-label data')
        label = pd.concat([preliminary_train_label_dataset, preliminary_train_label_dataset_s],
                          ignore_index=True, axis=0).sort_values(
            ['sn', 'fault_time']).reset_index(drop=True)
    # Normalize fault_time to the canonical '%Y-%m-%d %H:%M:%S' string form.
    label['fault_time'] = label['fault_time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    label['fault_time'] = label['fault_time'].apply(lambda x: str(x))
    return label


def get_log_dataset(PSEUDO_FALG):
    preliminary_sel_log_dataset = pd.read_csv(preliminary_sel_log_dataset_path)
    preliminary_sel_log_dataset_a = pd.read_csv(preliminary_sel_log_dataset_a_path)
    if PSEUDO_FALG:
        print('Loading pseudo-label log data')
        pseudo_sel_log_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_sel_log_dataset.csv'))
        log_dataset = pd.concat([preliminary_sel_log_dataset, pseudo_sel_log_dataset, preliminary_sel_log_dataset_a],
                                ignore_index=True, axis=0).sort_values(
            ['sn', 'time', 'server_model']).reset_index(drop=True)
    else:
        print('Not using pseudo-label data')
        log_dataset = pd.concat([preliminary_sel_log_dataset, preliminary_sel_log_dataset_a],
                                ignore_index=True, axis=0).sort_values(
            ['sn', 'time', 'server_model']).reset_index(drop=True)
    log_dataset['time'] = log_dataset['time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    return log_dataset


def get_fea_distribute(feature_df, feature_importances, dataset_type, top=30):
    print('Profiling the value distribution of the top features, to verify that train and test are distributed consistently')
    fea_distribute_list = []
    for i in feature_importances[:top]['fea'].to_list():
        fea_distribute_tmp = (feature_df[i].value_counts() / len(feature_df)).reset_index().rename(
            columns={'index': 'value'})
        fea_distribute_list.append(fea_distribute_tmp)
    fea_distribute = fea_distribute_list[-1]
    for i in fea_distribute_list[:-1]:
        fea_distribute = fea_distribute.merge(i, on='value', how='left')
    fea_distribute['value'] = fea_distribute['value'].apply(lambda x: f'{dataset_type}_{int(x)}')
    return fea_distribute
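
# Usage sketch for get_fea_distribute (illustrative only, not executed in this
# script; the variable names below are hypothetical and assume the `use_cols`
# and `fea_imp_df` objects built further down):
#   train_dist = get_fea_distribute(train[use_cols], fea_imp_df, 'train')
#   test_dist = get_fea_distribute(test[use_cols], fea_imp_df, 'test')
#   print(pd.concat([train_dist, test_dist], ignore_index=True))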
def get_train_test(label, preliminary_submit_dataset_a, log_dataset):
    print('Building the train and test sets')
    train = label.merge(log_dataset, on='sn', how='left')
    test = preliminary_submit_dataset_a.merge(log_dataset, on='sn', how='left')
    # train['time_interval'] = (pd.to_datetime(train['fault_time']) - train['time']).apply(lambda x: x.total_seconds())
    # test['time_interval'] = (pd.to_datetime(test['fault_time']) - test['time']).apply(lambda x: x.total_seconds())
    # train = train.query('time_interval > 0')
    # test = test.query('time_interval > 0')
    print(f'train shape: {train.shape}, test shape: {test.shape}')
    train = train.drop_duplicates().reset_index(drop=True)
    test = test.drop_duplicates().reset_index(drop=True)
    train['time'] = pd.to_datetime(train['time'])
    test['time'] = pd.to_datetime(test['time'])
    return train, test


start_time = datetime.datetime.now()

# NOTE: the final round's test-B files are loaded via the *_a variable names
# kept over from the preliminary round.
additional_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'additional_sel_log_dataset.csv')
preliminary_train_label_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset.csv')
preliminary_train_label_dataset_s_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset_s.csv')
preliminary_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_sel_log_dataset.csv')
preliminary_submit_dataset_a_path = os.path.join(TEST_A_DIR, 'final_submit_dataset_b.csv')
preliminary_sel_log_dataset_a_path = os.path.join(TEST_A_DIR, 'final_sel_log_dataset_b.csv')
print(preliminary_submit_dataset_a_path, preliminary_sel_log_dataset_a_path)

preliminary_submit_dataset_a = pd.read_csv(preliminary_submit_dataset_a_path)
preliminary_submit_dataset_a.head()

log_dataset = get_log_dataset(PSEUDO_FALG)
label = get_label(PSEUDO_FALG)

next_time_list = [i / TIME_INTERVAL for i in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600]] + [1000000]
label, preliminary_submit_dataset_a = add_last_next_time4fault(label, preliminary_submit_dataset_a, TIME_INTERVAL,
                                                               next_time_list)
train, test = get_train_test(label, preliminary_submit_dataset_a, log_dataset)
train = train.drop_duplicates(['sn', 'fault_time', 'time', 'msg', 'server_model']).reset_index(drop=True)

# Seconds between the fault time and each log record (positive means the log
# precedes the fault).
train['time_interval'] = (pd.to_datetime(train['fault_time']) - pd.to_datetime(train['time'])).apply(
    lambda x: x.total_seconds())
test['time_interval'] = (pd.to_datetime(test['fault_time']) - pd.to_datetime(test['time'])).apply(
    lambda x: x.total_seconds())

all_data = pd.concat([train, test], axis=0, ignore_index=True)
all_data = all_data.sort_values(['sn', 'server_model', 'fault_time', 'time'])
w2v_feats = get_w2v_feats(all_data, f1_list=['sn'], f2_list=['msg_list', 'msg_0', 'msg_1', 'msg_2'])
# server_model time_interval statistics features
server_model_time_interval_stat_fea = get_server_model_time_interval_stat_fea(all_data)
msg_text_fea = get_msg_text_fea_all(all_data)
# duration (time-difference) features
duration_minutes_fea = get_duration_minutes_fea(train, test)
# server_model features
server_model_fea = get_server_model_fea(train, test)
counter = get_word_counter(train)
# nearest-msg features
nearest_msg_fea = get_nearest_msg_fea(train, test)
# server_model beta-target features
beta_target_fea = get_beta_target(train, test)

key = ['sn', 'fault_time', 'label', 'server_model']
fea_num = len(KEY_WORDS)
time_list = [i * TIME_INTERVAL for i in next_time_list]
train = get_feature(train, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'label', 'server_model'])
test = get_feature(test, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'server_model'])

print('Adding duration features')
train = train.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
test = test.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
print('Adding server_model features')
train = train.merge(server_model_fea, on=['sn', 'server_model'])
test = test.merge(server_model_fea, on=['sn', 'server_model'])
print('Adding w2v_feats')
train = train.merge(w2v_feats, on=['sn'])
test = test.merge(w2v_feats, on=['sn'])
print('Adding nearest_msg features')
train = train.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
test = test.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
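
# Sanity-check sketch (added assumption): each feature merge above is expected
# to be one-to-one on its keys, so row counts should stay stable. This print is
# illustrative and side-effect free; an assert on len(train) before and after
# each merge would catch accidental key duplication.
print(f'rows after feature merges so far - train: {len(train)}, test: {len(test)}')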
print('Adding beta_target features')
train = train.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])
test = test.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])

server_model_sn_fea_2 = get_server_model_sn_fea_2(train, test)
print('Adding server_model_sn_fea_2 features')
train = train.merge(server_model_sn_fea_2, on=['sn', 'server_model'])
test = test.merge(server_model_sn_fea_2, on=['sn', 'server_model'])

# crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea.csv'))
# print('Adding crashdump_venus_fea features')
# print(train.shape, test.shape, crashdump_venus_fea.shape)
# train = train.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# print(train.shape, test.shape)

crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea_v1.csv'))
print('Adding crashdump_venus_fea features')
print(train.shape, test.shape, crashdump_venus_fea.shape)
train = train.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
print(train.shape, test.shape)

test.to_csv(os.path.join(GENERATION_DIR, 'test.csv'), index=False)
train.to_csv(os.path.join(GENERATION_DIR, 'train.csv'), index=False)

# print('Adding msg_text_fea features')
# train = train.merge(msg_text_fea, on=['sn', 'fault_time'], how='left')
# test = test.merge(msg_text_fea, on=['sn', 'fault_time'], how='left')
# print('Adding keyword cross features')
# train, test = get_key_word_cross_fea(train, test)
# print('Adding server_model_time_interval_stat_fea features')
# train = train.merge(server_model_time_interval_stat_fea, on=['server_model'], how='left')
# test = test.merge(server_model_time_interval_stat_fea, on=['server_model'], how='left')

use_less_cols_1 = ['last_last_msg_cnt', 'last_first_msg_cnt', 'time_diff_1_min',
                   'last_msg_list_unique_LabelEnc', 'last_msg_0_unique_LabelEnc',
                   'last_msg_1_unique_LabelEnc', 'last_msg_2_unique_LabelEnc',
                   'last_msg_list_list_LabelEnc', 'last_msg_0_list_LabelEnc',
                   'last_msg_1_list_LabelEnc', 'last_msg_2_list_LabelEnc',
                   'last_msg_0_first_LabelEnc', 'last_msg_1_first_LabelEnc',
                   'last_msg_2_first_LabelEnc', 'last_msg_0_last_LabelEnc',
                   'last_msg_1_last_LabelEnc', 'last_msg_2_last_LabelEnc',
                   'last_msg_last_LabelEnc', 'last_msg_first_LabelEnc']
# Drop near-constant columns along with the hand-picked useless columns above.
use_less_col = [i for i in train.columns if train[i].nunique() < 2] + use_less_cols_1
print(f'use_less_col: {len(use_less_col)}')
use_cols = [i for i in train.columns if i not in ['sn', 'fault_time', 'label', 'server_model'] + use_less_col]
cat_cols = ['server_model_LabelEnc', 'msg_LabelEnc', 'msg_0_LabelEnc', 'msg_1_LabelEnc', 'msg_2_LabelEnc']
use_cols = sorted(use_cols)
print('number of features used:', len(use_cols))
# cat_cols = [i for i in use_cols if '_LabelEnc' in i]

oof_prob = np.zeros((train.shape[0], 4))
test_prob = np.zeros((test.shape[0], 4))
# seeds = [42, 4242, 40424, 1024, 2048]
seeds = [42]
for seed in seeds:
    # Average OOF/test probabilities across seeds; the per-seed outputs are kept
    # under separate names so they do not overwrite the accumulators.
    oof_prob_seed, test_prob_seed, fea_imp_df, model_list = run_lgb(train[use_cols], train[['label']],
                                                                    test[use_cols], k=5, seed=seed,
                                                                    cat_cols=cat_cols)
    oof_prob += oof_prob_seed / len(seeds)
    test_prob += test_prob_seed / len(seeds)

# Search per-class weights on the OOF probabilities to maximize macro F1, then
# apply the same weights to the test probabilities.
weight = search_weight(train, train[['label']], oof_prob, init_weight=[1.0], class_num=4, step=0.001)
oof_prob = oof_prob * np.array(weight)
test_prob = test_prob * np.array(weight)

target_df = train[['sn', 'fault_time', 'label']]
submit_df = train[['sn', 'fault_time']]
submit_df['label'] = oof_prob.argmax(axis=1)
score = macro_f1(target_df=target_df, submit_df=submit_df)
print(f'********************** BEST MACRO_F1 : {score} **********************')
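
# Reference sketch (added assumption): utils.macro_f1 is assumed to behave like
# a macro-averaged F1 over the 4 fault classes; the sklearn equivalent, for
# comparison only, would be:
#   from sklearn.metrics import f1_score
#   f1_score(target_df['label'], submit_df['label'], average='macro')
# If the competition metric weights the classes unequally, utils.macro_f1
# remains the source of truth.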
score = round(score, 5)

y_pred = test_prob.argmax(axis=1)
result = test[['sn', 'fault_time']]
result['label'] = y_pred
result = preliminary_submit_dataset_a.merge(result, on=['sn', 'fault_time'], how='left')[
    ['sn', 'fault_time', 'label']]
result['label'] = result['label'].fillna(0).astype(int)
result.to_csv(os.path.join(RESULT_DIR, 'lgb_result.csv'), index=False)

fea_imp_df = fea_imp_df.reset_index(drop=True)
fea_imp_df.to_csv(os.path.join(RESULT_DIR, f'lgb_fea_imp_{int(score * 100000)}.csv'), index=False)

# Persist the OOF and test probabilities for downstream blending.
train_result_prob = pd.DataFrame(oof_prob).add_prefix('lgb_class_')
test_result_prob = pd.DataFrame(test_prob).add_prefix('lgb_class_')
train_result_prob['label'] = train['label']
train_result_prob['sn'] = train['sn']
train_result_prob['fault_time'] = train['fault_time']
test_result_prob['sn'] = test['sn']
test_result_prob['fault_time'] = test['fault_time']
result_prob = pd.concat([train_result_prob, test_result_prob], ignore_index=True)
result_prob.to_csv(os.path.join(RESULT_DIR, 'lgb_prob_result.csv'), index=False)

end_time = datetime.datetime.now()
cost_time = end_time - start_time
print('****************** LIGHTGBM COST TIME : ', str(cost_time), ' ******************')

'''
v7   best version: offline 7356
v8   v7 + keyword cross features: offline 0.7357, online 7338
v8.1 v7 + keyword cross features fed to the model as categorical variables: 0.73361
v8.2 v7 + keyword cross features as categorical variables, TOP_KEY_WORDS removed: 7117
v8.3 v7 + keyword cross features as categorical variables, with TOP_KEY_WORDS_2: 7260
'''