import datetime
import os
import warnings

import numpy as np
import pandas as pd

from generate_feature import get_beta_target, add_last_next_time4fault, get_feature, \
    get_duration_minutes_fea, get_nearest_msg_fea, get_server_model_sn_fea_2, \
    get_server_model_fea, get_msg_text_fea_all, get_key_word_cross_fea, get_server_model_time_interval_stat_fea, \
    get_w2v_feats, get_key_for_top_fea, get_time_diff_feats_v2
from model import run_cbt
from utils import RESULT_DIR, TRAIN_DIR, \
    TEST_A_DIR, KEY_WORDS, get_word_counter, search_weight, macro_f1, TIME_INTERVAL, PSEUDO_FALG, GENERATION_DIR

warnings.filterwarnings('ignore')


def get_label(PSEUDO_FALG):
    """Load the fault labels, optionally extended with pseudo-labels."""
    preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
    preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)
    if PSEUDO_FALG:
        print('Loading pseudo labels')
        pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
        label = pd.concat([preliminary_train_label_dataset, pseudo_labels, preliminary_train_label_dataset_s],
                          ignore_index=True, axis=0).sort_values(['sn', 'fault_time']).reset_index(drop=True)
    else:
        print('Not using pseudo-label data')
        label = pd.concat([preliminary_train_label_dataset, preliminary_train_label_dataset_s],
                          ignore_index=True, axis=0).sort_values(['sn', 'fault_time']).reset_index(drop=True)
    # Round-trip through datetime to normalize fault_time to 'YYYY-MM-DD HH:MM:SS' strings.
    label['fault_time'] = label['fault_time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    label['fault_time'] = label['fault_time'].apply(lambda x: str(x))
    return label


def get_log_dataset(PSEUDO_FALG):
    """Load the sel_log datasets, optionally extended with pseudo-label logs."""
    preliminary_sel_log_dataset = pd.read_csv(preliminary_sel_log_dataset_path)
    preliminary_sel_log_dataset_a = pd.read_csv(preliminary_sel_log_dataset_a_path)
    if PSEUDO_FALG:
        print('Loading pseudo-label log data')
        pseudo_sel_log_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_sel_log_dataset.csv'))
        log_dataset = pd.concat([preliminary_sel_log_dataset, pseudo_sel_log_dataset, preliminary_sel_log_dataset_a],
                                ignore_index=True, axis=0).sort_values(
            ['sn', 'time', 'server_model']).reset_index(drop=True)
    else:
        print('Not using pseudo-label data')
        log_dataset = pd.concat([preliminary_sel_log_dataset, preliminary_sel_log_dataset_a],
                                ignore_index=True, axis=0).sort_values(
            ['sn', 'time', 'server_model']).reset_index(drop=True)
    log_dataset['time'] = log_dataset['time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    return log_dataset


def get_fea_distribute(feature_df, feature_importances, dataset_type, top=30):
    """Compute value distributions for the top-N most important features,
    used to check whether train and test are identically distributed."""
    print('Computing value distributions of the most important features to compare train/test')
    fea_distribute_list = []
    for i in feature_importances[:top]['fea'].to_list():
        fea_distribute_tmp = (feature_df[i].value_counts() / len(feature_df)).reset_index().rename(
            columns={'index': 'value'})
        fea_distribute_list.append(fea_distribute_tmp)
    fea_distribute = fea_distribute_list[-1]
    for i in fea_distribute_list[:-1]:
        fea_distribute = fea_distribute.merge(i, on='value', how='left')
    fea_distribute['value'] = fea_distribute['value'].apply(lambda x: f'{dataset_type}_{int(x)}')
    return fea_distribute


def get_train_test(label, preliminary_submit_dataset_a, log_dataset):
    """Join labels / submission stubs with the raw logs on sn."""
    print('Building train and test sets')
    train = label.merge(log_dataset, on='sn', how='left')
    test = preliminary_submit_dataset_a.merge(log_dataset, on='sn', how='left')
    # train['time_interval'] = (pd.to_datetime(train['fault_time']) - train['time']).apply(lambda x: x.total_seconds())
    # test['time_interval'] = (pd.to_datetime(test['fault_time']) - test['time']).apply(lambda x: x.total_seconds())
    # train = train.query('time_interval > 0')
    # test = test.query('time_interval > 0')
    print(f'Train shape: {train.shape}, test shape: {test.shape}')
    train = train.drop_duplicates().reset_index(drop=True)
    test = test.drop_duplicates().reset_index(drop=True)
    train['time'] = pd.to_datetime(train['time'])
    test['time'] = pd.to_datetime(test['time'])
    return train, test
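# ---------------------------------------------------------------------------
# Main pipeline: load logs and labels (optionally with pseudo-labels), build
# time-window / text / statistical features, merge them onto train and test,
# train 5-fold CatBoost, search per-class weights on the OOF probabilities,
# and write the submission plus probability files to RESULT_DIR.
# ---------------------------------------------------------------------------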
start_time = datetime.datetime.now()

additional_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'additional_sel_log_dataset.csv')
preliminary_train_label_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset.csv')
preliminary_train_label_dataset_s_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset_s.csv')
preliminary_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_sel_log_dataset.csv')
# Final round B files are read through the *_a variable names kept over from
# the preliminary round.
preliminary_submit_dataset_a_path = os.path.join(TEST_A_DIR, 'final_submit_dataset_b.csv')
preliminary_sel_log_dataset_a_path = os.path.join(TEST_A_DIR, 'final_sel_log_dataset_b.csv')
print(preliminary_submit_dataset_a_path, preliminary_sel_log_dataset_a_path)

preliminary_submit_dataset_a = pd.read_csv(preliminary_submit_dataset_a_path)

log_dataset = get_log_dataset(PSEUDO_FALG)
label = get_label(PSEUDO_FALG)

# Window lengths in minutes, expressed in TIME_INTERVAL units, plus a huge catch-all window.
next_time_list = [i / TIME_INTERVAL for i in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600]] + [1000000]
label, preliminary_submit_dataset_a = add_last_next_time4fault(label, preliminary_submit_dataset_a, TIME_INTERVAL,
                                                               next_time_list)
train, test = get_train_test(label, preliminary_submit_dataset_a, log_dataset)
train = train.drop_duplicates(['sn', 'fault_time', 'time', 'msg', 'server_model']).reset_index(drop=True)
# Seconds between the fault and each log line (positive = log precedes the fault).
train['time_interval'] = (pd.to_datetime(train['fault_time']) - pd.to_datetime(train['time'])).apply(
    lambda x: x.total_seconds())
test['time_interval'] = (pd.to_datetime(test['fault_time']) - pd.to_datetime(test['time'])).apply(
    lambda x: x.total_seconds())

all_data = pd.concat([train, test], axis=0, ignore_index=True)
all_data = all_data.sort_values(['sn', 'server_model', 'fault_time', 'time'])

# Word2vec features over the msg columns
w2v_feats = get_w2v_feats(all_data, f1_list=['sn'], f2_list=['msg_list', 'msg_0', 'msg_1', 'msg_2'])
# time_diff_feats_v2
time_diff_feats_v2 = get_time_diff_feats_v2(all_data)
# server_model_time_interval_stat_fea (only merged in the disabled experiment below)
server_model_time_interval_stat_fea = get_server_model_time_interval_stat_fea(all_data)
msg_text_fea = get_msg_text_fea_all(all_data)  # computed but not merged below
# Duration (time-difference) features
duration_minutes_fea = get_duration_minutes_fea(train, test)
# server_model features
server_model_fea = get_server_model_fea(train, test)
counter = get_word_counter(train)
# Nearest-msg features
nearest_msg_fea = get_nearest_msg_fea(train, test)
# server_model beta-target features
beta_target_fea = get_beta_target(train, test)

key = ['sn', 'fault_time', 'label', 'server_model']
fea_num = len(KEY_WORDS)
time_list = [i * TIME_INTERVAL for i in next_time_list]
train = get_feature(train, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'label', 'server_model'])
test = get_feature(test, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'server_model'])

print('Adding duration features')
train = train.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
test = test.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
print('Adding server_model features')
train = train.merge(server_model_fea, on=['sn', 'server_model'])
test = test.merge(server_model_fea, on=['sn', 'server_model'])
print('Adding w2v_feats')
train = train.merge(w2v_feats, on=['sn'])
test = test.merge(w2v_feats, on=['sn'])
print('Adding nearest_msg features')
train = train.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
test = test.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
print('Adding beta_target features')
train = train.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])
test = test.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])

server_model_sn_fea_2 = get_server_model_sn_fea_2(train, test)
print('Adding server_model_sn_fea_2 features')
train = train.merge(server_model_sn_fea_2, on=['sn', 'server_model'])
test = test.merge(server_model_sn_fea_2, on=['sn', 'server_model'])
print('Adding time_diff_feats_v2 features')
train = train.merge(time_diff_feats_v2, on=['sn', 'server_model', 'fault_time'])
test = test.merge(time_diff_feats_v2, on=['sn', 'server_model', 'fault_time'])
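# Sanity check: the merges above are inner joins by default, so any key
# mismatch would silently drop rows; print the shapes to verify.
print(f'After feature merges, train: {train.shape}, test: {test.shape}')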
# test.to_csv(os.path.join(GENERATION_DIR, 'test.csv'), index=False)
# train.to_csv(os.path.join(GENERATION_DIR, 'train.csv'), index=False)

# Disabled experiment: crashdump / venus features.
# crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea.csv'))
# print('Adding crashdump_venus_fea features')
# print(train.shape, test.shape, crashdump_venus_fea.shape)
# train = train.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# print(train.shape, test.shape)

# crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea_v1.csv'))
# print('Adding crashdump_venus_fea features')
# print(train.shape, test.shape, crashdump_venus_fea.shape)
# train = train.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# print(train.shape, test.shape)

# test.to_csv(os.path.join(GENERATION_DIR, 'test.csv'), index=False)
# train.to_csv(os.path.join(GENERATION_DIR, 'train.csv'), index=False)

# Disabled experiment: key_for_top features.
# print('Adding key_for_top_fea features')
# train, test = get_key_for_top_fea(train, test)

# Disabled experiment: w2v / tfidf / doc2vec features.
# print('Adding w2v_tfidf_doc2v_fea features')
# w2v_tfidf_fea = pd.read_csv(os.path.join(GENERATION_DIR, 'w2v_tfidf_fea.csv'))
# drop_cols = [i for i in w2v_tfidf_fea if 'doc2vec' in i] + [i for i in w2v_tfidf_fea if 'tfidf' in i]
# for col in drop_cols:
#     del w2v_tfidf_fea[col]
# train = train.merge(w2v_tfidf_fea, on=['sn'], how='left')
# test = test.merge(w2v_tfidf_fea, on=['sn'], how='left')

# Disabled experiment: keyword cross features (see the v8 note at the bottom).
# print('Adding keyword cross features')
# train, test = get_key_word_cross_fea(train, test)

# Disabled experiment: server_model time-interval statistics.
# print('Adding server_model_time_interval_stat_fea features')
# train = train.merge(server_model_time_interval_stat_fea, on=['server_model'], how='left')
# test = test.merge(server_model_time_interval_stat_fea, on=['server_model'], how='left')

# Hand-picked columns found to be useless for the model.
use_less_cols_1 = ['last_last_msg_cnt', 'last_first_msg_cnt', 'time_diff_1_min',
                   'last_msg_list_unique_LabelEnc', 'last_msg_0_unique_LabelEnc', 'last_msg_1_unique_LabelEnc',
                   'last_msg_2_unique_LabelEnc', 'last_msg_list_list_LabelEnc', 'last_msg_0_list_LabelEnc',
                   'last_msg_1_list_LabelEnc', 'last_msg_2_list_LabelEnc', 'last_msg_0_first_LabelEnc',
                   'last_msg_1_first_LabelEnc', 'last_msg_2_first_LabelEnc', 'last_msg_0_last_LabelEnc',
                   'last_msg_1_last_LabelEnc', 'last_msg_2_last_LabelEnc', 'last_msg_last_LabelEnc',
                   'last_msg_first_LabelEnc']
# Also drop near-constant columns (fewer than two distinct values).
use_less_col = [i for i in train.columns if train[i].nunique() < 2] + use_less_cols_1
print(f'use_less_col: {len(use_less_col)}')
use_cols = sorted(i for i in train.columns
                  if i not in ['sn', 'fault_time', 'label', 'server_model'] + use_less_col)
# Every label-encoded column is treated as categorical by CatBoost.
cat_cols = [i for i in use_cols if '_LabelEnc' in i]
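# ---------------------------------------------------------------------------
# Training: 5-fold CatBoost (run_cbt) with optional multi-seed averaging,
# followed by a per-class weight search on the OOF probabilities (macro-F1).
# ---------------------------------------------------------------------------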
print('Feature count:', len(use_cols), 'categorical feature count:', len(cat_cols))

# Disabled experiment: automatic feature selection with FeatureSelector.
# fs = FeatureSelector(data=train[use_cols], labels=train['label'])
# # Flag features whose missing-value ratio exceeds the threshold.
# fs.identify_missing(missing_threshold=0.9)
# # fs.ops['missing']
# # Without one-hot encoding (default False), flag features with correlation > 0.99.
# fs.identify_collinear(correlation_threshold=0.99, one_hot=False)
# # fs.ops['collinear']
# # Flag single-valued features.
# fs.identify_single_unique()
# # fs.ops['single_unique']
# train_removed = fs.remove(methods=['missing', 'single_unique', 'collinear'], keep_one_hot=False)
# use_cols = train_removed.columns
# print('Feature count after selection:', len(use_cols))

oof_prob = np.zeros((train.shape[0], 4))
test_prob = np.zeros((test.shape[0], 4))
# seeds = [42, 4242, 40424, 1024, 2048]
seeds = [42]
for seed in seeds:
    # Accumulate each seed's fold probabilities into the running average.
    oof_prob_seed, test_prob_seed, fea_imp_df, model_list = run_cbt(train[use_cols], train[['label']],
                                                                    test[use_cols], k=5, seed=seed,
                                                                    cat_cols=cat_cols)
    oof_prob += oof_prob_seed / len(seeds)
    test_prob += test_prob_seed / len(seeds)

# Search per-class weights that maximize macro-F1 on the OOF probabilities.
weight = search_weight(train, train[['label']], oof_prob, init_weight=[1.0], class_num=4, step=0.001)
oof_prob = oof_prob * np.array(weight)
test_prob = test_prob * np.array(weight)

target_df = train[['sn', 'fault_time', 'label']]
submit_df = train[['sn', 'fault_time']]
submit_df['label'] = oof_prob.argmax(axis=1)
score = macro_f1(target_df=target_df, submit_df=submit_df)
print(f'********************** BEST MACRO_F1 : {score} **********************')
score = round(score, 5)

y_pred = test_prob.argmax(axis=1)
result = test[['sn', 'fault_time']]
result['label'] = y_pred
result = preliminary_submit_dataset_a.merge(result, on=['sn', 'fault_time'], how='left')[
    ['sn', 'fault_time', 'label']]
result['label'] = result['label'].fillna(0).astype(int)
result.to_csv(os.path.join(RESULT_DIR, 'catboost_result.csv'), index=False)
print(result['label'].value_counts())

fea_imp_df = fea_imp_df.reset_index(drop=True)
fea_imp_df.to_csv(os.path.join(RESULT_DIR, f'cat_fea_imp_{int(score * 100000)}.csv'), index=False)

# Dump OOF and test class probabilities.
train_result_prob = pd.DataFrame(oof_prob).add_prefix('cat_class_')
test_result_prob = pd.DataFrame(test_prob).add_prefix('cat_class_')
train_result_prob['label'] = train['label']
train_result_prob['sn'] = train['sn']
train_result_prob['fault_time'] = train['fault_time']
test_result_prob['sn'] = test['sn']
test_result_prob['fault_time'] = test['fault_time']
result_prob = pd.concat([train_result_prob, test_result_prob], ignore_index=True)
result_prob.to_csv(os.path.join(RESULT_DIR, 'cat_prob_result.csv'), index=False)

end_time = datetime.datetime.now()
cost_time = end_time - start_time
print('****************** CATBOOST COST TIME : ', str(cost_time), ' ******************')

'''
v7: best so far, offline macro-F1 0.7303
v8: v7 + keyword cross features fed to the model as categorical variables, 0.73114
'''
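# Outputs written to RESULT_DIR:
#   catboost_result.csv       - submission (sn, fault_time, label)
#   cat_fea_imp_<score>.csv   - CatBoost feature importances
#   cat_prob_result.csv       - OOF and test class probabilities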