import os
import warnings
import datetime

import numpy as np
import pandas as pd

from generate_feature import (get_beta_target, add_last_next_time4fault, get_feature,
                              get_duration_minutes_fea, get_nearest_msg_fea, get_server_model_sn_fea_2,
                              get_server_model_fea, get_msg_text_fea_all, get_key_word_cross_fea,
                              get_server_model_time_interval_stat_fea, get_w2v_feats)
from model import run_cbt, run_lgb
from utils import (RESULT_DIR, TRAIN_DIR, TEST_A_DIR, KEY_WORDS, get_word_counter,
                   search_weight, macro_f1, TIME_INTERVAL, PSEUDO_FALG, GENERATION_DIR)

warnings.filterwarnings('ignore')
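
# Pipeline overview (LightGBM branch of the solution):
#   1. load fault labels and SEL logs, optionally augmented with pseudo-labelled data;
#   2. join logs to fault records and build window/text/statistical features;
#   3. train LightGBM with 5-fold CV, search per-class weights on the OOF
#      probabilities, and write the submission plus OOF/test probability files
#      for downstream blending.
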
def get_label(PSEUDO_FALG):
    """Load fault labels, optionally extended with pseudo-labelled samples."""
    preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
    preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)
    if PSEUDO_FALG:
        print('Loading pseudo labels')
        pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
        label = pd.concat([preliminary_train_label_dataset,
                           pseudo_labels,
                           preliminary_train_label_dataset_s],
                          ignore_index=True,
                          axis=0).sort_values(
            ['sn', 'fault_time']).reset_index(drop=True)
    else:
        print('Not using pseudo-labelled data')
        label = pd.concat([preliminary_train_label_dataset,
                           preliminary_train_label_dataset_s],
                          ignore_index=True,
                          axis=0).sort_values(
            ['sn', 'fault_time']).reset_index(drop=True)
    # Round-trip through datetime to normalise the fault_time string format.
    label['fault_time'] = label['fault_time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    label['fault_time'] = label['fault_time'].apply(lambda x: str(x))
    return label
def get_log_dataset(PSEUDO_FALG):
    """Load SEL log records, optionally extended with pseudo-label logs."""
    preliminary_sel_log_dataset = pd.read_csv(preliminary_sel_log_dataset_path)
    preliminary_sel_log_dataset_a = pd.read_csv(preliminary_sel_log_dataset_a_path)
    if PSEUDO_FALG:
        print('Loading pseudo-label log data')
        pseudo_sel_log_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_sel_log_dataset.csv'))
        log_dataset = pd.concat([preliminary_sel_log_dataset,
                                 pseudo_sel_log_dataset,
                                 preliminary_sel_log_dataset_a],
                                ignore_index=True,
                                axis=0).sort_values(
            ['sn', 'time', 'server_model']).reset_index(drop=True)
    else:
        print('Not using pseudo-labelled data')
        log_dataset = pd.concat([preliminary_sel_log_dataset,
                                 preliminary_sel_log_dataset_a],
                                ignore_index=True,
                                axis=0).sort_values(
            ['sn', 'time', 'server_model']).reset_index(drop=True)
    log_dataset['time'] = log_dataset['time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    return log_dataset
def get_fea_distribute(feature_df, feature_importances, dataset_type, top=30):
    print('Using top feature importances to compare value distributions between train and test')
    fea_distribute_list = []
    for i in feature_importances[:top]['fea'].to_list():
        fea_distribute_tmp = (feature_df[i].value_counts() / len(feature_df)).reset_index().rename(
            columns={'index': 'value'})
        fea_distribute_list.append(fea_distribute_tmp)
    fea_distribute = fea_distribute_list[-1]
    for i in fea_distribute_list[:-1]:
        fea_distribute = fea_distribute.merge(i, on='value', how='left')
    fea_distribute['value'] = fea_distribute['value'].apply(lambda x: f'{dataset_type}_{int(x)}')
    return fea_distribute
def get_train_test(label, preliminary_submit_dataset_a, log_dataset):
    print('Building train and test sets')
    train = label.merge(log_dataset, on='sn', how='left')
    test = preliminary_submit_dataset_a.merge(log_dataset, on='sn', how='left')
    # train['time_interval'] = (pd.to_datetime(train['fault_time']) - train['time']).apply(lambda x: x.total_seconds())
    # test['time_interval'] = (pd.to_datetime(test['fault_time']) - test['time']).apply(lambda x: x.total_seconds())
    # train = train.query('time_interval > 0')
    # test = test.query('time_interval > 0')
    print(f'train shape: {train.shape}, test shape: {test.shape}')
    train = train.drop_duplicates().reset_index(drop=True)
    test = test.drop_duplicates().reset_index(drop=True)
    train['time'] = pd.to_datetime(train['time'])
    test['time'] = pd.to_datetime(test['time'])
    return train, test
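
# ---- Main script: load data, build features, train LightGBM, write results ----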
start_time = datetime.datetime.now()
additional_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'additional_sel_log_dataset.csv')
preliminary_train_label_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset.csv')
preliminary_train_label_dataset_s_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset_s.csv')
preliminary_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_sel_log_dataset.csv')
# NOTE: the variable names still say "a", but the paths point at the final round-B files.
preliminary_submit_dataset_a_path = os.path.join(TEST_A_DIR, 'final_submit_dataset_b.csv')
preliminary_sel_log_dataset_a_path = os.path.join(TEST_A_DIR, 'final_sel_log_dataset_b.csv')
print(preliminary_submit_dataset_a_path, preliminary_sel_log_dataset_a_path)
preliminary_submit_dataset_a = pd.read_csv(preliminary_submit_dataset_a_path)
print(preliminary_submit_dataset_a.head())
log_dataset = get_log_dataset(PSEUDO_FALG)
label = get_label(PSEUDO_FALG)
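# Look-back window lengths (apparently minutes) expressed in TIME_INTERVAL units,
# plus a catch-all window; presumably consumed by add_last_next_time4fault and
# get_feature to aggregate log events in each window before a fault.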
next_time_list = [i / TIME_INTERVAL for i in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600]] + [1000000]
label, preliminary_submit_dataset_a = add_last_next_time4fault(label, preliminary_submit_dataset_a, TIME_INTERVAL,
next_time_list)
train, test = get_train_test(label, preliminary_submit_dataset_a, log_dataset)
train = train.drop_duplicates(['sn', 'fault_time', 'time', 'msg', 'server_model']).reset_index(drop=True)
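# time_interval: signed gap in seconds from each log line to its fault
# (positive when the log precedes the fault).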
train['time_interval'] = (pd.to_datetime(train['fault_time']) - pd.to_datetime(train['time'])).apply(
lambda x: x.total_seconds())
test['time_interval'] = (pd.to_datetime(test['fault_time']) - pd.to_datetime(test['time'])).apply(
lambda x: x.total_seconds())
all_data = pd.concat([train, test], axis=0, ignore_index=True)
all_data = all_data.sort_values(['sn', 'server_model', 'fault_time', 'time'])
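# Word2Vec-style embeddings of the per-sn message token sequences; the exact
# training setup lives in generate_feature.get_w2v_feats.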
w2v_feats = get_w2v_feats(all_data,
                          f1_list=['sn'],
                          f2_list=['msg_list', 'msg_0', 'msg_1', 'msg_2'])
# server_model_time_interval_stat_fea
server_model_time_interval_stat_fea = get_server_model_time_interval_stat_fea(all_data)
msg_text_fea = get_msg_text_fea_all(all_data)
# time-difference (duration) features
duration_minutes_fea = get_duration_minutes_fea(train, test)
# server_model features
server_model_fea = get_server_model_fea(train, test)
counter = get_word_counter(train)
# nearest_msg features
nearest_msg_fea = get_nearest_msg_fea(train, test)
# server_model beta_target features
beta_target_fea = get_beta_target(train, test)
key = ['sn', 'fault_time', 'label', 'server_model']
fea_num = len(KEY_WORDS)
time_list = [i * TIME_INTERVAL for i in next_time_list]  # convert back to the original window lengths
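# Per-(sn, fault_time) keyword features: for each look-back window in time_list,
# get_feature presumably counts KEY_WORDS occurrences in the matching log lines.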
train = get_feature(train, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'label', 'server_model'])
test = get_feature(test, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'server_model'])
print('Adding duration features')
train = train.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
test = test.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
print('Adding server_model features')
train = train.merge(server_model_fea, on=['sn', 'server_model'])
test = test.merge(server_model_fea, on=['sn', 'server_model'])
print('Adding w2v_feats')
train = train.merge(w2v_feats, on=['sn'])
test = test.merge(w2v_feats, on=['sn'])
print('Adding nearest_msg features')
train = train.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
test = test.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
print('Adding beta_target features')
train = train.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])
test = test.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])
server_model_sn_fea_2 = get_server_model_sn_fea_2(train, test)
print('Adding server_model_sn_fea_2 features')
train = train.merge(server_model_sn_fea_2, on=['sn', 'server_model'])
test = test.merge(server_model_sn_fea_2, on=['sn', 'server_model'])
# crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea.csv'))
# print('Adding crashdump_venus_fea features')
# print(train.shape, test.shape, crashdump_venus_fea.shape)
# train = train.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# print(train.shape, test.shape)
crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea_v1.csv'))
print('Adding crashdump_venus_fea features')
print(train.shape, test.shape, crashdump_venus_fea.shape)
# Left joins: presumably not every fault has crashdump/venus records, so NaNs are expected.
train = train.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
print(train.shape, test.shape)
test.to_csv(os.path.join(GENERATION_DIR, 'test.csv'), index=False)
train.to_csv(os.path.join(GENERATION_DIR, 'train.csv'), index=False)
# print('Adding msg_text_fea features')
# train = train.merge(msg_text_fea, on=['sn', 'fault_time'], how='left')
# test = test.merge(msg_text_fea, on=['sn', 'fault_time'], how='left')
# print('Adding keyword cross features')
# train, test = get_key_word_cross_fea(train, test)
# print('Adding server_model_time_interval_stat_fea features')
# train = train.merge(server_model_time_interval_stat_fea, on=['server_model'], how='left')
# test = test.merge(server_model_time_interval_stat_fea, on=['server_model'], how='left')
use_less_cols_1 = ['last_last_msg_cnt', 'last_first_msg_cnt', 'time_diff_1_min',
                   'last_msg_list_unique_LabelEnc', 'last_msg_0_unique_LabelEnc',
                   'last_msg_1_unique_LabelEnc', 'last_msg_2_unique_LabelEnc',
                   'last_msg_list_list_LabelEnc', 'last_msg_0_list_LabelEnc',
                   'last_msg_1_list_LabelEnc', 'last_msg_2_list_LabelEnc',
                   'last_msg_0_first_LabelEnc', 'last_msg_1_first_LabelEnc',
                   'last_msg_2_first_LabelEnc', 'last_msg_0_last_LabelEnc',
                   'last_msg_1_last_LabelEnc', 'last_msg_2_last_LabelEnc',
                   'last_msg_last_LabelEnc', 'last_msg_first_LabelEnc']
# Drop constant columns plus the hand-picked useless ones above.
use_less_col = [i for i in train.columns if train[i].nunique() < 2] + use_less_cols_1
print(f'use_less_col: {len(use_less_col)}')
use_cols = [i for i in train.columns if i not in ['sn', 'fault_time', 'label', 'server_model'] + use_less_col]
cat_cols = ['server_model_LabelEnc', 'msg_LabelEnc', 'msg_0_LabelEnc', 'msg_1_LabelEnc', 'msg_2_LabelEnc']
use_cols = sorted(use_cols)
print('Number of features used:', len(use_cols))
# cat_cols = []
# for i in use_cols:
#     if '_LabelEnc' in i:
#         cat_cols.append(i)
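# Multi-seed bagging: average OOF/test probabilities over several CV seeds.
# With the default single seed this reduces to one 5-fold run.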
oof_prob = np.zeros((train.shape[0], 4))
test_prob = np.zeros((test.shape[0], 4))
# seeds = [42, 4242, 40424, 1024, 2048]
seeds = [42]
for seed in seeds:
    oof_prob_seed, test_prob_seed, fea_imp_df, model_list = run_lgb(train[use_cols], train[['label']],
                                                                    test[use_cols], k=5, seed=seed,
                                                                    cat_cols=cat_cols)
    # Accumulate per-seed probabilities into the running average.
    oof_prob += oof_prob_seed / len(seeds)
    test_prob += test_prob_seed / len(seeds)
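# Post-hoc class-weight search on the OOF probabilities (grid step 0.001);
# presumably search_weight scales each class column to maximise macro-F1, and
# the same weights are then applied to the test probabilities.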
weight = search_weight(train, train[['label']], oof_prob, init_weight=[1.0], class_num=4, step=0.001)
oof_prob = oof_prob * np.array(weight)
test_prob = test_prob * np.array(weight)
target_df = train[['sn', 'fault_time', 'label']]
submit_df = train[['sn', 'fault_time']]
submit_df['label'] = oof_prob.argmax(axis=1)
score = macro_f1(target_df=target_df, submit_df=submit_df)
print(f'********************** BEST MACRO_F1 : {score} **********************')
score = round(score, 5)
y_pred = test_prob.argmax(axis=1)
result = test[['sn', 'fault_time']]
result['label'] = y_pred
result = preliminary_submit_dataset_a.merge(result, on=['sn', 'fault_time'], how='left')[['sn', 'fault_time', 'label']]
# Faults without a matching prediction fall back to class 0.
result['label'] = result['label'].fillna(0).astype(int)
result.to_csv(os.path.join(RESULT_DIR, 'lgb_result.csv'), index=False)
fea_imp_df = fea_imp_df.reset_index(drop=True)
fea_imp_df.to_csv(os.path.join(RESULT_DIR, f'lgb_fea_imp_{int(score * 100000)}.csv'), index=False)
train_result_prob = pd.DataFrame(oof_prob).add_prefix('lgb_class_')
test_result_prob = pd.DataFrame(test_prob).add_prefix('lgb_class_')
train_result_prob['label'] = train['label']
train_result_prob['sn'] = train['sn']
train_result_prob['fault_time'] = train['fault_time']
test_result_prob['sn'] = test['sn']
test_result_prob['fault_time'] = test['fault_time']
result_prob = pd.concat([train_result_prob, test_result_prob], ignore_index=True)
result_prob.to_csv(os.path.join(RESULT_DIR, 'lgb_prob_result.csv'), index=False)
end_time = datetime.datetime.now()
cost_time = end_time - start_time
print('****************** LIGHTGBM COST TIME : ', str(cost_time), ' ******************')
'''
v7:   best version so far, offline 0.7356
v8:   v7 + keyword cross features, offline 0.7357, online 0.7338
v8.1: v7 + keyword cross features, fed to the model as categorical variables, 0.73361
v8.2: v8.1 with TOP_KEY_WORDS removed, 0.7117
v8.3: v8.1 with TOP_KEY_WORDS_2 added, 0.7260
'''