import datetime
import gc
import os
import warnings

import numpy as np
import pandas as pd

from generate_feature import add_w2v_feats, cat2num
from model import run_cbt
from utils import TRAIN_DIR, TEST_A_DIR, GENERATION_DIR, search_weight, macro_f1

warnings.filterwarnings('ignore')


def get_fault_code_list(x):
    """Split a fault_code string such as 'a.b,c' into a flat token list."""
    try:
        x = x.replace('.', ',').split(',')
    except AttributeError:  # NaN / non-string
        x = []
    return x


def get_module_cause_list(x):
    """Split a module_cause string into a deduplicated token list."""
    try:
        # Normalize both separators to '_' before splitting. The original
        # repeated .replace(',', '_') twice; the first call was presumably
        # meant for ':' (as in the module-level cleaning further below).
        x = x.replace(':', '_').replace(',', '_')
        x = list(set(x.split('_')))
    except AttributeError:  # NaN / non-string
        x = []
    return x


def get_label(PSEUDO_FALG):
    """Load training labels, optionally augmented with pseudo labels.

    The *_path globals are defined in the script body further below.
    """
    preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
    preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)
    if PSEUDO_FALG:
        print('Loading pseudo labels')
        pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
        label = pd.concat([preliminary_train_label_dataset, pseudo_labels,
                           preliminary_train_label_dataset_s],
                          ignore_index=True, axis=0)
    else:
        print('Not using pseudo-label data')
        label = pd.concat([preliminary_train_label_dataset,
                           preliminary_train_label_dataset_s],
                          ignore_index=True, axis=0)
    label = label.sort_values(['sn', 'fault_time']).reset_index(drop=True)
    # Round-trip through datetime to normalize the timestamp format.
    label['fault_time'] = label['fault_time'].apply(
        lambda x: str(datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')))
    return label


def get_module_cause_code(x, code_name):
    """Collect the tokens in x that contain code_name."""
    return [i for i in x if code_name in i]


def get_alertname_code(x, alertname):
    """Return the token that follows alertname in a comma-separated string."""
    x = x.split(',')
    try:
        return x[x.index(alertname) + 1]
    except (ValueError, IndexError):  # marker absent, or in last position
        return np.nan


def get_alertname_code_2(x, alertname):
    """Like get_alertname_code, but x is already a token list."""
    try:
        return x[x.index(alertname) + 1]
    except (ValueError, IndexError):
        return ' '


def get_last_msg_cnt(x):
    """Count how often the last message in the list occurs."""
    return x.count(x[-1])


def get_first_msg_cnt(x):
    """Count how often the first message in the list occurs."""
    return x.count(x[0])


def get_crashdump_venus_data():
    """Outer-join crashdump and venus logs for the final and preliminary rounds."""
    final_venus_dataset = pd.read_csv(os.path.join(TEST_A_DIR, 'final_venus_dataset_b.csv'))
    final_crashdump_dataset = pd.read_csv(os.path.join(TEST_A_DIR, 'final_crashdump_dataset_b.csv'))
    final_crashdump_venus = final_crashdump_dataset.merge(
        final_venus_dataset, on=['sn', 'fault_time'], how='outer')
    preliminary_venus_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'preliminary_venus_dataset.csv'))
    preliminary_crashdump_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'preliminary_crashdump_dataset.csv'))
    preliminary_crashdump_venus = preliminary_crashdump_dataset.merge(
        preliminary_venus_dataset, on=['sn', 'fault_time'], how='outer')
    crashdump_venus = pd.concat([final_crashdump_venus, preliminary_crashdump_venus],
                                ignore_index=True).drop_duplicates()
    crashdump_venus = crashdump_venus.sort_values(['sn', 'fault_time']).reset_index(drop=True)
    return crashdump_venus
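# Worked examples for the parsers above, hand-traced against their logic (the
# strings are hypothetical, not taken from the dataset):
#   get_alertname_code('module0,cod1,0x12', 'module0')  -> 'cod1'
#   get_alertname_code('module0,cod1,0x12', 'module9')  -> nan  (marker absent)
#   get_alertname_code_2(['cod1', '0xff'], 'cod1')      -> '0xff'
#   get_fault_code_list('cpu0.0x1,cpu1.0x2')            -> ['cpu0', '0x1', 'cpu1', '0x2']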
def get_crashdump_venus_fea(crashdump_venus):
    """Build label-encoded and count features from module_cause / fault_code.

    Defined for reference; the script body below builds its own feature set.
    """
    print('Generating crashdump_venus features')
    crashdump_venus['module_cause_list'] = crashdump_venus['module_cause'].apply(get_module_cause_list)
    crashdump_venus['fault_code_list'] = crashdump_venus['fault_code'].apply(get_fault_code_list)

    # For each marker: collect matching tokens, record how many there are,
    # then join the unique ones into a single string.
    for code_name in ['module', 'cod1', 'cod2', 'addr', 'port']:
        crashdump_venus[f'module_cause_{code_name}'] = crashdump_venus['module_cause_list'].apply(
            lambda x: get_module_cause_code(x, code_name))
        crashdump_venus[f'module_cause_{code_name}_len'] = crashdump_venus[f'module_cause_{code_name}'].apply(len)
        crashdump_venus[f'module_cause_{code_name}'] = crashdump_venus[f'module_cause_{code_name}'].apply(
            lambda x: '_'.join(set(x)))

    for code_name in ['cha', '0x', 'cod', 'core', 'cpu', 'm2m', 'pcu']:
        crashdump_venus[f'fault_{code_name}'] = crashdump_venus['fault_code_list'].apply(
            lambda x: get_module_cause_code(x, code_name))
        crashdump_venus[f'fault_{code_name}_len'] = crashdump_venus[f'fault_{code_name}'].apply(len)
        crashdump_venus[f'fault_{code_name}'] = crashdump_venus[f'fault_{code_name}'].apply(
            lambda x: '_'.join(set(x)))

    cols_tmp = ['module_cause', 'fault_code',
                'module_cause_module', 'module_cause_cod1', 'module_cause_cod2',
                'module_cause_addr', 'module_cause_port',
                'fault_cha', 'fault_0x', 'fault_cod', 'fault_core',
                'fault_cpu', 'fault_m2m', 'fault_pcu']
    # cat2num adds one '{name}_LabelEnc' column per input column.
    crashdump_venus = cat2num(crashdump_venus, cols_tmp)
    new_cat_cols = [f'{name}_LabelEnc' for name in cols_tmp]

    num_cols = ['fault_pcu_len', 'fault_m2m_len', 'fault_cpu_len', 'fault_0x_len',
                'fault_cod_len', 'module_cause_module_len', 'module_cause_cod1_len',
                'module_cause_cod2_len', 'module_cause_addr_len',
                'module_cause_port_len', 'fault_cha_len', 'fault_core_len']
    # fault_time is not used downstream, so it is not carried along.
    crashdump_venus = crashdump_venus[['sn'] + new_cat_cols + num_cols]
    print(f'crashdump_venus features done, shape {crashdump_venus.shape}')
    return crashdump_venus


def get_location_word(x, num):
    """Return the num-th token of x, or None when out of range."""
    try:
        return x[num]
    except (IndexError, TypeError):
        return None
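# cat2num (generate_feature) is used as a black box above and again below. A
# minimal sketch of the behaviour this script relies on -- one integer
# '<col>_LabelEnc' column per input column -- follows; this is inferred from
# how the columns are consumed here, not the project's actual implementation.
def _cat2num_sketch(df, cols):
    from sklearn.preprocessing import LabelEncoder
    for col in cols:
        # Cast to str so NaN and mixed types encode without errors.
        df[f'{col}_LabelEnc'] = LabelEncoder().fit_transform(df[col].astype(str))
    return df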
# Plain CPU-module markers and the remaining markers seen in module_cause.
module_list2 = ['module0', 'module1', 'module2', 'module3', 'module4', 'module5',
                'module7', 'module8', 'module9', 'module10', 'module11', 'module12',
                'module13', 'module14', 'module17', 'module18', 'module19']
other_module_list = ['in traffic control', 'irpp0', 'irpp1',
                     'pcie rootport 0:0.0', 'pcie rootport a2:0.0',
                     'pcie rootport 2b:3.0', 'port a', 'port c']
module_list = module_list2 + other_module_list
# Per-module content columns: 'module0_cod1', 'module0_cod2', 'module0_addr', ...
module_content_list = [f'{m}_{suffix}' for m in module_list2
                       for suffix in ('cod1', 'cod2', 'addr')]
fault_code_content_list = ['fault_code_cod1', 'fault_code_cod2',
                           'fault_code_cpu0', 'fault_code_cpu1']

crashdump_venus = get_crashdump_venus_data()

# Rewrite module_cause so that every module marker is delimited by commas
# while the tokens belonging to one module stay joined by '_'. Hand-traced on
# a hypothetical string 'module0:cod1_0x1,module1:addr_0xff':
#   replace(':', '_').replace(',', '_') -> 'module0_cod1_0x1_module1_addr_0xff'
#   '{module}_' -> '{module}:', '_{module}' -> ',{module}'
#                                       -> 'module0:cod1_0x1,module1:addr_0xff'
#   replace(':', ',')                   -> 'module0,cod1_0x1,module1,addr_0xff'
crashdump_venus['module_cause_list'] = crashdump_venus['module_cause'].fillna('_').apply(lambda x: x.split(','))
crashdump_venus['module_cause'] = crashdump_venus['module_cause'].fillna('_').apply(
    lambda x: x.replace(':', '_').replace(',', '_'))
for module in module_list:
    crashdump_venus['module_cause'] = crashdump_venus['module_cause'].fillna('_').apply(
        lambda x: x.replace(f'{module}_', f'{module}:').replace(f'_{module}', f',{module}'))
crashdump_venus['module_cause'] = crashdump_venus['module_cause'].apply(lambda x: x.replace(':', ','))

# For every module, pull out the token that follows its marker and split it
# into words, e.g. 'cod1_0x1' -> ['cod1', '0x1'].
for module in module_list:
    crashdump_venus[module] = crashdump_venus['module_cause'].apply(lambda x: get_alertname_code(x, module))
    crashdump_venus[module] = crashdump_venus[module].fillna(' ').apply(lambda x: x.replace('_', ' '))
    crashdump_venus[module] = crashdump_venus[module].apply(lambda x: x.split(' '))
crashdump_venus['module_cause_new'] = crashdump_venus.loc[:, module_list].sum(1)  # recomputed below

# For the plain modules, keep only the values following cod1/cod2/addr and
# drop the raw token column.
for module in module_list2:
    crashdump_venus[f'{module}_cod1'] = crashdump_venus[module].apply(lambda x: [get_alertname_code_2(x, 'cod1')])
    crashdump_venus[f'{module}_cod2'] = crashdump_venus[module].apply(lambda x: [get_alertname_code_2(x, 'cod2')])
    crashdump_venus[f'{module}_addr'] = crashdump_venus[module].apply(lambda x: [get_alertname_code_2(x, 'addr')])
    del crashdump_venus[module]
gc.collect()

crashdump_venus['fault_code_list'] = crashdump_venus['fault_code'].fillna(' ').apply(lambda x: x.split('.'))
for i in ['cod1', 'cod2', 'cpu0', 'cpu1']:
    crashdump_venus[f'fault_code_{i}'] = crashdump_venus['fault_code_list'].apply(
        lambda x: [get_alertname_code_2(x, i)])
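# add_w2v_feats (generate_feature) is used as a black box just below. A
# minimal sketch of the usual pattern behind such a helper -- train Word2Vec
# on the token lists in column f2, mean-pool each row's token vectors into
# emb_size numeric columns, and merge them onto base_df keyed by f1 -- is
# given here. The body is an assumption (gensim 4 API), not the project code.
def _add_w2v_feats_sketch(df, base_df, f1, f2, emb_size=10, window=5, min_count=5):
    from gensim.models import Word2Vec
    model = Word2Vec(df[f2].tolist(), vector_size=emb_size, window=window,
                     min_count=min_count, seed=42)

    def mean_vec(tokens):
        vecs = [model.wv[t] for t in tokens if t in model.wv]
        return np.mean(vecs, axis=0) if vecs else np.zeros(emb_size)

    emb = np.vstack(df[f2].apply(mean_vec).to_numpy())
    emb_df = pd.DataFrame(emb, columns=[f'{f1}_{f2}_w2v_{i}' for i in range(emb_size)])
    emb_df[f1] = df[f1].values
    # Average duplicate keys so there is one row per f1 value.
    emb_df = emb_df.groupby(f1, as_index=False).mean()
    return base_df.merge(emb_df, on=f1, how='left')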
# Concatenate the token lists into "sentences" for the w2v features.
crashdump_venus['other_module_list'] = crashdump_venus.loc[:, other_module_list].sum(1)
crashdump_venus['module_content_list'] = crashdump_venus.loc[:, module_content_list].sum(1)
crashdump_venus['module_cause_new'] = crashdump_venus.loc[:, other_module_list + module_content_list].sum(1)
crashdump_venus['fault_code_content_list'] = crashdump_venus.loc[:, fault_code_content_list].sum(1)
crashdump_venus['all_crashdump_venus'] = crashdump_venus.loc[
    :, other_module_list + module_content_list + fault_code_content_list].sum(1)

f1_list = ['sn']
f2_list = ['other_module_list', 'module_content_list', 'module_cause_new',
           'fault_code_content_list', 'all_crashdump_venus']
w2v_feats_df = crashdump_venus[f1_list].drop_duplicates()
w2v_feats_df_list = []
for f1 in f1_list:
    for f2 in f2_list:
        w2v_fea_tmp = add_w2v_feats(crashdump_venus, w2v_feats_df, f1, f2,
                                    emb_size=10, window=5, min_count=5)
        w2v_feats_df_list.append(w2v_fea_tmp)
w2v_feats_df = w2v_feats_df_list[0]
for i in w2v_feats_df_list[1:]:
    w2v_feats_df = w2v_feats_df.merge(i, on='sn', how='left')

# Label-encode the raw token columns, then drop them.
token_cols = other_module_list + module_content_list + fault_code_content_list
for i in token_cols:
    crashdump_venus[i] = crashdump_venus[i].astype(str)
crashdump_venus = cat2num(crashdump_venus, token_cols)
for i in token_cols:
    del crashdump_venus[i]
gc.collect()

crashdump_venus = crashdump_venus.merge(w2v_feats_df, on='sn', how='left').rename(
    columns={'fault_time': 'crashdump_venus_fault_time'})

preliminary_train_label_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset.csv')
preliminary_train_label_dataset_s_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset_s.csv')

test = pd.read_csv(os.path.join(TEST_A_DIR, 'final_submit_dataset_b.csv'))[['sn', 'fault_time']]
train = get_label(False)[['sn', 'fault_time', 'label']]

test_tmp = test[['sn', 'fault_time']]
test_tmp = test_tmp.merge(crashdump_venus, on='sn').drop_duplicates(['sn', 'fault_time']).reset_index(drop=True)
train_tmp = train[['sn', 'fault_time', 'label']]
train_tmp = train_tmp.merge(crashdump_venus, on='sn').drop_duplicates(['sn', 'fault_time']).reset_index(drop=True)

# Seconds between the labelled fault and the matched crashdump/venus record.
train_tmp['duration_fault_time'] = (pd.to_datetime(train_tmp['fault_time'])
                                    - pd.to_datetime(train_tmp['crashdump_venus_fault_time'])).dt.total_seconds()
test_tmp['duration_fault_time'] = (pd.to_datetime(test_tmp['fault_time'])
                                   - pd.to_datetime(test_tmp['crashdump_venus_fault_time'])).dt.total_seconds()

drop_cols = ['sn', 'fault_time', 'fault_code', 'module_cause', 'module',
             'crashdump_venus_fault_time', 'module_cause_list', 'module_cause_new',
             'fault_code_list', 'label', 'duration_fault_time',
             'other_module_list', 'module_content_list',
             'fault_code_content_list', 'all_crashdump_venus']
use_cols = [i for i in train_tmp.columns if i not in drop_cols]
cat_cols = [f'{i}_LabelEnc' for i in token_cols]

# Average CatBoost predictions over seeds; each seed's output is accumulated
# into separate arrays sized by the deduplicated train/test frames.
oof_prob = np.zeros((train_tmp.shape[0], 4))
test_prob = np.zeros((test_tmp.shape[0], 4))
seeds = [42]
# seeds = [42, 4242, 40424, 1024, 2048]
for seed in seeds:
    oof_prob_seed, test_prob_seed, fea_imp_df, model_list = run_cbt(
        train_tmp[use_cols], train_tmp[['label']], test_tmp[use_cols],
        k=5, seed=seed, cat_cols=cat_cols)
    oof_prob += oof_prob_seed / len(seeds)
    test_prob += test_prob_seed / len(seeds)

# Post-hoc per-class weight search on the OOF probabilities.
weight = search_weight(train_tmp, train_tmp[['label']], oof_prob,
                       init_weight=[1.0], class_num=4, step=0.001)
oof_prob = oof_prob * np.array(weight)
test_prob = test_prob * np.array(weight)
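# search_weight (utils) is used as a black box above. A common pattern behind
# this kind of helper is a greedy coordinate search over per-class multipliers
# that maximizes macro-F1 on the OOF probabilities. The sketch below follows
# that pattern under assumed semantics and a simplified signature; it is not
# the project's implementation.
def _search_weight_sketch(y_true, oof_prob, class_num=4, step=0.001, rounds=100):
    from sklearn.metrics import f1_score

    def score_of(w):
        return f1_score(y_true, (oof_prob * w).argmax(axis=1), average='macro')

    weight = np.ones(class_num)
    best = score_of(weight)
    for _ in range(rounds):
        improved = False
        for c in range(class_num):
            for delta in (step, -step):
                trial = weight.copy()
                trial[c] += delta
                score = score_of(trial)
                if score > best:
                    best, weight, improved = score, trial, True
        if not improved:  # local optimum reached
            break
    return weight
# Hypothetical usage: weight = _search_weight_sketch(train_tmp['label'].values, oof_prob)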
target_df = train_tmp[['sn', 'fault_time', 'label']].drop_duplicates(['sn', 'fault_time'])
submit_df = train_tmp[['sn', 'fault_time']].copy()
submit_df['label'] = oof_prob.argmax(axis=1)
submit_df = submit_df.drop_duplicates(['sn', 'fault_time'])
# submit_df = pd.read_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea1.csv')).rename(columns={'crashdump_venus_label': 'label'})
score = macro_f1(target_df=target_df, submit_df=submit_df)
print(f'********************** BEST MACRO_F1 : {score} **********************')
score = round(score, 5)
print(fea_imp_df[:20])

y_pred = test_prob.argmax(axis=1)
result = test_tmp[['sn', 'fault_time']].copy()
result['label'] = y_pred
result = result.drop_duplicates(['sn', 'fault_time'])

# Stack the OOF train predictions and the test predictions into one file that
# downstream models can join on as the 'crashdump_venus_label_v1' feature.
crashdump_venus_fea = pd.concat([submit_df, result], ignore_index=False, axis=0)
crashdump_venus_fea = crashdump_venus_fea.rename(columns={'label': 'crashdump_venus_label_v1'})
crashdump_venus_fea.to_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea_v1.csv'), index=False)
print(crashdump_venus_fea['crashdump_venus_label_v1'].value_counts())
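# macro_f1 (utils) is the competition scorer used above. A minimal equivalent,
# under the assumption that it aligns the two frames on ['sn', 'fault_time']
# and scores the label columns with scikit-learn's macro-averaged F1; the real
# competition metric may weight classes differently.
def _macro_f1_sketch(target_df, submit_df):
    from sklearn.metrics import f1_score
    merged = target_df.merge(submit_df, on=['sn', 'fault_time'],
                             suffixes=('_true', '_pred'))
    return f1_score(merged['label_true'], merged['label_pred'], average='macro')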