import datetime
import gc
import os
import warnings

import numpy as np
import pandas as pd

from generate_feature import (
    add_w2v_feats, cat2num, get_key, get_beta_target, add_last_next_time4fault,
    get_feature, get_duration_minutes_fea, get_nearest_msg_fea,
    get_server_model_sn_fea_2, get_server_model_fea, get_msg_text_fea_all,
    get_key_word_cross_fea, get_server_model_time_interval_stat_fea,
    get_w2v_feats, get_class_key_words_nunique)
from model import run_cbt, run_lgb
from utils import (
    RESULT_DIR, TRAIN_DIR, TEST_A_DIR, KEY_WORDS, TOP_KEY_WORDS,
    get_word_counter, search_weight, macro_f1, TIME_INTERVAL, PSEUDO_FALG,
    GENERATION_DIR)

warnings.filterwarnings('ignore')
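
# ---------------------------------------------------------------------------
# This script builds features from the crashdump / venus fault logs, trains a
# CatBoost model (run_cbt) on them, and writes the out-of-fold and test
# predictions to GENERATION_DIR as 'crashdump_venus_fea_v1.csv', where the
# 'crashdump_venus_label_v1' column serves as a meta-feature downstream.
# ---------------------------------------------------------------------------
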
def get_fault_code_list(x):
    """Split a fault_code string on '.' and ',' into a list of codes."""
    try:
        x = x.replace('.', ',').split(',')
    except AttributeError:  # x is NaN / not a string
        x = []
    return x

def get_module_cause_list(x):
    """Split a module_cause string into a deduplicated list of tokens."""
    try:
        # NOTE: the original called x.replace('', '_'), which inserts '_'
        # between every character; ':' is assumed to be the intended separator,
        # matching how module_cause is parsed at module level below.
        x = x.replace(',', '_').replace(':', '_')
        x = list(set(x.split('_')))
    except AttributeError:  # x is NaN / not a string
        x = []
    return x

def get_label(PSEUDO_FALG):
    """Load the training labels, optionally appending pseudo labels.

    NOTE: the flag name 'PSEUDO_FALG' (sic) matches the name exported by utils.
    """
    preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
    preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)
    if PSEUDO_FALG:
        print('Loading pseudo labels')
        pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
        label = pd.concat([preliminary_train_label_dataset,
                           pseudo_labels,
                           preliminary_train_label_dataset_s],
                          ignore_index=True,
                          axis=0).sort_values(['sn', 'fault_time']).reset_index(drop=True)
    else:
        print('Not using pseudo-label data')
        label = pd.concat([preliminary_train_label_dataset,
                           preliminary_train_label_dataset_s],
                          ignore_index=True,
                          axis=0).sort_values(['sn', 'fault_time']).reset_index(drop=True)
    # Normalize fault_time to the canonical 'YYYY-MM-DD HH:MM:SS' string form.
    label['fault_time'] = label['fault_time'].apply(
        lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    label['fault_time'] = label['fault_time'].apply(lambda x: str(x))
    return label

def get_module_cause_code(x, code_name):
    """Return the tokens in list `x` that contain the substring `code_name`."""
    code_list = []
    for i in x:
        if code_name in i:
            code_list.append(i)
    return code_list

def get_alertname_code(x, alertname):
    """Return the token following `alertname` in a comma-separated string, else NaN."""
    x = x.split(',')
    try:
        alertname_code = x[x.index(alertname) + 1]
    except (ValueError, IndexError):  # alertname absent, or the last token
        alertname_code = np.nan
    return alertname_code

def get_alertname_code_2(x, alertname):
    """Like get_alertname_code, but takes a token list and falls back to ' '."""
    try:
        alertname_code = x[x.index(alertname) + 1]
    except (ValueError, IndexError):
        alertname_code = ' '
    return alertname_code

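
# Illustrative examples (hypothetical inputs):
#   get_alertname_code('module0,cod1,0x1a', 'cod1')   -> '0x1a'
#   get_alertname_code_2(['cod1', '0x1a'], 'cod1')    -> '0x1a'
#   get_alertname_code_2(['cod1'], 'cod2')            -> ' '
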
def get_last_msg_cnt(x):
    """Count how often the last element of `x` occurs in `x`."""
    last_msg = x[-1]
    return x.count(last_msg)


def get_first_msg_cnt(x):
    """Count how often the first element of `x` occurs in `x`."""
    first_msg = x[0]
    return x.count(first_msg)

def get_crashdump_venus_data():
    """Load and outer-merge the crashdump and venus datasets from both the
    final (test B) and preliminary (train) phases, keyed on (sn, fault_time)."""
    final_venus_dataset = pd.read_csv(os.path.join(TEST_A_DIR, 'final_venus_dataset_b.csv'))
    final_crashdump_dataset = pd.read_csv(os.path.join(TEST_A_DIR, 'final_crashdump_dataset_b.csv'))
    final_crashdump_venus = final_crashdump_dataset.merge(
        final_venus_dataset, on=['sn', 'fault_time'], how='outer')
    preliminary_venus_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'preliminary_venus_dataset.csv'))
    preliminary_crashdump_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'preliminary_crashdump_dataset.csv'))
    preliminary_crashdump_venus = preliminary_crashdump_dataset.merge(
        preliminary_venus_dataset, on=['sn', 'fault_time'], how='outer')
    crashdump_venus = pd.concat([final_crashdump_venus, preliminary_crashdump_venus],
                                ignore_index=True).drop_duplicates()
    crashdump_venus = crashdump_venus.sort_values(['sn', 'fault_time']).reset_index(drop=True)
    return crashdump_venus

def get_crashdump_venus_fea(crashdump_venus):
    """Build list/length/label-encoded features from module_cause and fault_code."""
    print('Generating crashdump_venus features')
    crashdump_venus['module_cause_list'] = crashdump_venus['module_cause'].apply(get_module_cause_list)
    crashdump_venus['fault_code_list'] = crashdump_venus['fault_code'].apply(get_fault_code_list)
    code_name_list = ['module', 'cod1', 'cod2', 'addr', 'port']
    for code_name in code_name_list:
        crashdump_venus[f'module_cause_{code_name}'] = crashdump_venus['module_cause_list'].apply(
            lambda x: get_module_cause_code(x, code_name))
        crashdump_venus[f'module_cause_{code_name}_len'] = crashdump_venus[f'module_cause_{code_name}'].apply(len)
        crashdump_venus[f'module_cause_{code_name}'] = crashdump_venus[f'module_cause_{code_name}'].apply(
            lambda x: '_'.join(set(x)))
    code_name_list = ['cha', '0x', 'cod', 'core', 'cpu', 'm2m', 'pcu']
    for code_name in code_name_list:
        crashdump_venus[f'fault_{code_name}'] = crashdump_venus['fault_code_list'].apply(
            lambda x: get_module_cause_code(x, code_name))
        crashdump_venus[f'fault_{code_name}_len'] = crashdump_venus[f'fault_{code_name}'].apply(len)
        crashdump_venus[f'fault_{code_name}'] = crashdump_venus[f'fault_{code_name}'].apply(
            lambda x: '_'.join(set(x)))
    cols_tmp = ['module_cause', 'fault_code', 'module_cause_module',
                'module_cause_cod1', 'module_cause_cod2', 'module_cause_addr',
                'module_cause_port', 'fault_cha', 'fault_0x', 'fault_cod', 'fault_core',
                'fault_cpu', 'fault_m2m', 'fault_pcu']
    new_cat_cols = []
    # cat2num (from generate_feature) is expected to add the label-encoded
    # '{name}_LabelEnc' columns collected below; the commented-out LabelEncoder
    # loop did the same thing explicitly.
    crashdump_venus = cat2num(crashdump_venus, cols_tmp)
    for name in cols_tmp:
        # le = LabelEncoder()
        # crashdump_venus[f'{name}_LabelEnc'] = le.fit_transform(crashdump_venus[name])
        new_cat_cols.append(f'{name}_LabelEnc')
    num_cols = ['fault_pcu_len', 'fault_m2m_len',
                'fault_cpu_len', 'fault_0x_len', 'fault_cod_len',
                'module_cause_module_len', 'module_cause_cod1_len',
                'module_cause_cod2_len', 'module_cause_addr_len',
                'module_cause_port_len', 'fault_cha_len', 'fault_core_len']
    crashdump_venus = crashdump_venus[['sn', 'fault_time'] + new_cat_cols + num_cols]
    crashdump_venus = crashdump_venus.rename(columns={'fault_time': 'crashdump_fault_time'})
    # The timestamp is converted and then dropped: only sn-level features are kept.
    crashdump_venus['crashdump_fault_time'] = pd.to_datetime(crashdump_venus['crashdump_fault_time'])
    del crashdump_venus['crashdump_fault_time']
    print(f'Finished crashdump_venus features, shape {crashdump_venus.shape}')
    return crashdump_venus

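
# NOTE: get_crashdump_venus_fea() is kept for reference but is not called in
# this script; the module-level pipeline below derives the crashdump/venus
# features directly.
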
def get_location_word(x, num):
    """Return element `num` of `x`, or None if out of range / not indexable."""
    try:
        return x[num]
    except (IndexError, TypeError):
        return None

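# Token lists used to parse the module_cause and fault_code strings below.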
module_list = ['module0', 'module1', 'module2', 'module3', 'module4', 'module5',
               'module7', 'module8', 'module9', 'module10', 'module11',
               'module12', 'module13', 'module14', 'module17', 'module18',
               'module19',
               'in traffic control',
               'irpp0', 'irpp1',
               'pcie rootport 0:0.0', 'pcie rootport a2:0.0', 'pcie rootport 2b:3.0',
               'port a', 'port c']
module_list2 = ['module0', 'module1', 'module2', 'module3', 'module4', 'module5',
                'module7', 'module8', 'module9', 'module10', 'module11',
                'module12', 'module13', 'module14', 'module17', 'module18',
                'module19']
other_module_list = ['in traffic control', 'irpp0', 'irpp1', 'pcie rootport 0:0.0',
                     'pcie rootport a2:0.0', 'pcie rootport 2b:3.0', 'port a', 'port c']
# One cod1/cod2/addr column name per module (same order as the original
# hand-written list).
module_content_list = [f'{m}_{suffix}' for m in module_list2
                       for suffix in ('cod1', 'cod2', 'addr')]
fault_code_content_list = ['fault_code_cod1', 'fault_code_cod2',
                           'fault_code_cpu0', 'fault_code_cpu1']
crashdump_venus = get_crashdump_venus_data()
# Normalize module_cause so that every module name is followed by its token
# list: ':' and ',' are first collapsed to '_', then re-split per module.
crashdump_venus['module_cause_list'] = crashdump_venus['module_cause'].fillna('_').apply(lambda x: x.split(','))
crashdump_venus['module_cause'] = crashdump_venus['module_cause'].fillna('_').apply(
    lambda x: x.replace(':', '_').replace(',', '_'))
for module in module_list:
    crashdump_venus['module_cause'] = crashdump_venus['module_cause'].fillna('_').apply(
        lambda x: x.replace(f'{module}_', f'{module}:').replace(f'_{module}', f',{module}'))
crashdump_venus['module_cause'] = crashdump_venus['module_cause'].apply(lambda x: x.replace(':', ','))
# Extract each module's token list from the normalized string.
for module in module_list:
    crashdump_venus[module] = crashdump_venus['module_cause'].apply(lambda x: get_alertname_code(x, module))
    crashdump_venus[module] = crashdump_venus.loc[:, module].fillna(' ').apply(lambda x: x.replace('_', ' '))
    crashdump_venus[module] = crashdump_venus[module].apply(lambda x: x.split(' '))
# (Overwritten below once the per-module code lists have been parsed.)
crashdump_venus['module_cause_new'] = crashdump_venus.loc[:, module_list].sum(1)
# Per-module cod1/cod2/addr codes, each wrapped in a one-element list so the
# later column-wise sum concatenates them into a single token list.
for module in module_list2:
    crashdump_venus[f'{module}_cod1'] = crashdump_venus[module].apply(lambda x: [get_alertname_code_2(x, 'cod1')])
    crashdump_venus[f'{module}_cod2'] = crashdump_venus[module].apply(lambda x: [get_alertname_code_2(x, 'cod2')])
    crashdump_venus[f'{module}_addr'] = crashdump_venus[module].apply(lambda x: [get_alertname_code_2(x, 'addr')])
    del crashdump_venus[module]
    gc.collect()
crashdump_venus['fault_code_list'] = crashdump_venus['fault_code'].fillna(' ').apply(lambda x: x.split('.'))
for i in ['cod1', 'cod2', 'cpu0', 'cpu1']:
    crashdump_venus[f'fault_code_{i}'] = crashdump_venus['fault_code_list'].apply(
        lambda x: [get_alertname_code_2(x, i)])
# Concatenate the per-column token lists into combined "sentences" for w2v.
crashdump_venus['other_module_list'] = crashdump_venus.loc[:, other_module_list].sum(1)
crashdump_venus['module_content_list'] = crashdump_venus.loc[:, module_content_list].sum(1)
crashdump_venus['module_cause_new'] = crashdump_venus.loc[:, other_module_list + module_content_list].sum(1)
crashdump_venus['fault_code_content_list'] = crashdump_venus.loc[:, fault_code_content_list].sum(1)
crashdump_venus['all_crashdump_venus'] = crashdump_venus.loc[
    :, other_module_list + module_content_list + fault_code_content_list].sum(1)
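# Worked example (hypothetical value): module_cause = 'module0:cod1_0x12,module1:addr_0xff'
#   -> normalized to 'module0,cod1_0x12,module1,addr_0xff'
#   -> column 'module0' becomes ['cod1', '0x12'], so module0_cod1 == ['0x12']
#   -> column 'module1' becomes ['addr', '0xff'], so module1_addr == ['0xff']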
f1_list = ['sn']
f2_list = ['other_module_list', 'module_content_list', 'module_cause_new',
           'fault_code_content_list', 'all_crashdump_venus']
w2v_feats_df = crashdump_venus[f1_list].drop_duplicates()
w2v_feats_df_list = []
# One word2vec embedding block per (key, token-list) pair.
for f1 in f1_list:
    for f2 in f2_list:
        w2v_fea_tmp = add_w2v_feats(crashdump_venus, w2v_feats_df, f1, f2,
                                    emb_size=10, window=5, min_count=5)
        w2v_feats_df_list.append(w2v_fea_tmp)
w2v_feats_df = w2v_feats_df_list[0]
for i in w2v_feats_df_list[1:]:
    w2v_feats_df = w2v_feats_df.merge(i, on='sn', how='left')
# Label-encode the raw token-list columns, then drop them.
for i in other_module_list + module_content_list + fault_code_content_list:
    crashdump_venus[i] = crashdump_venus[i].astype(str)
crashdump_venus = cat2num(crashdump_venus,
                          other_module_list + module_content_list + fault_code_content_list)
for i in other_module_list + module_content_list + fault_code_content_list:
    del crashdump_venus[i]
gc.collect()
crashdump_venus = crashdump_venus.merge(w2v_feats_df, on='sn', how='left').rename(
    columns={'fault_time': 'crashdump_venus_fault_time'})
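

# add_w2v_feats comes from generate_feature and is not shown here. As a rough
# reference, the sketch below shows one common way such sn-level word2vec
# features are built (mean of token vectors per row, then mean per sn). This is
# an assumption about the technique, not the project's actual implementation,
# and the helper is never called in this script. Requires gensim >= 4.0.
def _w2v_group_mean_sketch(df, key_col, seq_col, emb_size=10, window=5, min_count=5):
    from gensim.models.word2vec import Word2Vec

    # Train word2vec on the token lists of `seq_col`.
    model = Word2Vec(sentences=df[seq_col].tolist(), vector_size=emb_size,
                     window=window, min_count=min_count, seed=42)

    def mean_vec(tokens):
        # Average the vectors of in-vocabulary tokens; zeros if none are known.
        vecs = [model.wv[t] for t in tokens if t in model.wv]
        return np.mean(vecs, axis=0) if vecs else np.zeros(emb_size)

    # One embedding per key: mean of the per-row mean vectors.
    agg = df.groupby(key_col)[seq_col].apply(
        lambda seqs: np.mean([mean_vec(t) for t in seqs], axis=0))
    return pd.DataFrame(agg.tolist(), index=agg.index,
                        columns=[f'{seq_col}_w2v_{i}' for i in range(emb_size)]).reset_index()

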
preliminary_train_label_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset.csv')
preliminary_train_label_dataset_s_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset_s.csv')
test = pd.read_csv(os.path.join(TEST_A_DIR, 'final_submit_dataset_b.csv'))[['sn', 'fault_time']]
train = get_label(False)[['sn', 'fault_time', 'label']]
test_tmp = test[['sn', 'fault_time']]
test_tmp = test_tmp.merge(crashdump_venus, on='sn').drop_duplicates(['sn', 'fault_time']).reset_index(drop=True)
train_tmp = train[['sn', 'fault_time', 'label']]
train_tmp = train_tmp.merge(crashdump_venus, on='sn').drop_duplicates(['sn', 'fault_time']).reset_index(drop=True)
# Seconds between the labeled fault and the matched crashdump/venus record.
train_tmp['duration_fault_time'] = (pd.to_datetime(train_tmp['fault_time'])
                                    - pd.to_datetime(train_tmp['crashdump_venus_fault_time']))
test_tmp['duration_fault_time'] = (pd.to_datetime(test_tmp['fault_time'])
                                   - pd.to_datetime(test_tmp['crashdump_venus_fault_time']))
train_tmp['duration_fault_time'] = train_tmp['duration_fault_time'].apply(lambda x: x.total_seconds())
test_tmp['duration_fault_time'] = test_tmp['duration_fault_time'].apply(lambda x: x.total_seconds())
drop_cols = ['sn', 'fault_time', 'fault_code', 'module_cause', 'module', 'crashdump_venus_fault_time',
             'module_cause_list', 'module_cause_new', 'fault_code_list', 'label', 'duration_fault_time',
             'other_module_list', 'module_content_list', 'fault_code_content_list',
             'all_crashdump_venus']
use_cols = [i for i in train_tmp.columns if i not in drop_cols]
cat_cols = [f'{i}_LabelEnc' for i in other_module_list + module_content_list + fault_code_content_list]
# Out-of-fold / test probability accumulators, sized to the merged frames so
# they match what run_cbt returns (the original sized them to train/test,
# whose row counts can differ after the merge).
oof_prob = np.zeros((train_tmp.shape[0], 4))
test_prob = np.zeros((test_tmp.shape[0], 4))
# seeds = [42, 4242, 40424, 1024, 2048]
seeds = [42]
for seed in seeds:
    oof_prob_tmp, test_prob_tmp, fea_imp_df, model_list = run_cbt(
        train_tmp[use_cols], train_tmp[['label']], test_tmp[use_cols],
        k=5, seed=seed, cat_cols=cat_cols)
    # Average predictions over seeds. The original overwrote oof_prob with the
    # run_cbt output and then added it to itself, double-counting the last seed.
    oof_prob += oof_prob_tmp / len(seeds)
    test_prob += test_prob_tmp / len(seeds)
weight = search_weight(train_tmp, train_tmp[['label']], oof_prob,
                       init_weight=[1.0], class_num=4, step=0.001)
oof_prob = oof_prob * np.array(weight)
test_prob = test_prob * np.array(weight)
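

# search_weight is imported from utils and not shown here. Below is a minimal
# sketch of the greedy per-class weight search such helpers typically perform
# (an assumption, not the project's implementation; never called here):
def _search_weight_sketch(y_true, prob, class_num=4, step=0.001, max_w=2.0):
    from sklearn.metrics import f1_score

    weight = [1.0] * class_num
    for c in range(class_num):
        best_w, best_f1 = weight[c], -1.0
        # Scan candidate multipliers for class c, keeping the best macro F1
        # of the reweighted argmax predictions.
        for w in np.arange(0.0, max_w + step, step):
            trial = weight.copy()
            trial[c] = w
            pred = (prob * np.array(trial)).argmax(axis=1)
            f1 = f1_score(y_true, pred, average='macro')
            if f1 > best_f1:
                best_f1, best_w = f1, w
        weight[c] = best_w
    return weight

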
target_df = train_tmp[['sn', 'fault_time', 'label']].drop_duplicates(['sn', 'fault_time'])
submit_df = train_tmp[['sn', 'fault_time']].copy()
submit_df['label'] = oof_prob.argmax(axis=1)
submit_df = submit_df.drop_duplicates(['sn', 'fault_time'])
# submit_df = pd.read_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea1.csv')).rename(columns={'crashdump_venus_label': 'label'})
score = macro_f1(target_df=target_df, submit_df=submit_df)
print(f'********************** BEST MACRO_F1 : {score} **********************')
score = round(score, 5)
print(fea_imp_df[:20])
y_pred = test_prob.argmax(axis=1)
result = test_tmp[['sn', 'fault_time']].copy()
result['label'] = y_pred
result = result.drop_duplicates(['sn', 'fault_time'])
# Stack the train OOF labels and test predictions into one meta-feature file.
crashdump_venus_fea = pd.concat([submit_df, result], ignore_index=False, axis=0)
crashdump_venus_fea = crashdump_venus_fea.rename(columns={'label': 'crashdump_venus_label_v1'})
crashdump_venus_fea.to_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea_v1.csv'), index=False)
print(crashdump_venus_fea['crashdump_venus_label_v1'].value_counts())