import datetime
import os
import warnings
import numpy as np
import pandas as pd
from generate_feature import get_beta_target, add_last_next_time4fault, get_feature, \
    get_duration_minutes_fea, get_nearest_msg_fea, get_server_model_sn_fea_2, \
    get_server_model_fea, get_msg_text_fea_all, get_key_word_cross_fea, get_server_model_time_interval_stat_fea, \
    get_w2v_feats, get_key_for_top_fea, get_time_diff_feats_v2
from model import run_cbt
from utils import RESULT_DIR, TRAIN_DIR, \
    TEST_A_DIR, KEY_WORDS, get_word_counter, search_weight, macro_f1, TIME_INTERVAL, PSEUDO_FALG, GENERATION_DIR
warnings.filterwarnings('ignore')
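# Pipeline overview (as implemented below): load fault labels and SEL log data
# (optionally extended with pseudo-labelled samples), join the logs to each fault
# by serial number (sn), build time-window / text / statistical features, train a
# CatBoost model with 5-fold CV over one or more seeds, search per-class weights
# that maximise macro F1 on the out-of-fold predictions, and write the submission
# plus class probabilities to RESULT_DIR.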
def get_label(PSEUDO_FALG):
    preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
    preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)
    if PSEUDO_FALG:
        print('Loading pseudo-label data')
        pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
        label = pd.concat([preliminary_train_label_dataset,
                           pseudo_labels,
                           preliminary_train_label_dataset_s],
                          ignore_index=True,
                          axis=0).sort_values(
            ['sn', 'fault_time']).reset_index(drop=True)
    else:
        print('Not using pseudo-label data')
        label = pd.concat([preliminary_train_label_dataset,
                           preliminary_train_label_dataset_s],
                          ignore_index=True,
                          axis=0).sort_values(
            ['sn', 'fault_time']).reset_index(drop=True)
    label['fault_time'] = label['fault_time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    label['fault_time'] = label['fault_time'].apply(lambda x: str(x))
    return label
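# Like get_label above, the next helper concatenates the preliminary SEL log files
# (plus the pseudo-label logs when PSEUDO_FALG is set), sorts by sn / time /
# server_model, and parses the 'time' column to datetime.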
def get_log_dateset(PSEUDO_FALG):
    preliminary_sel_log_dataset = pd.read_csv(preliminary_sel_log_dataset_path)
    preliminary_sel_log_dataset_a = pd.read_csv(preliminary_sel_log_dataset_a_path)
    if PSEUDO_FALG:
        print('Loading pseudo-label log data')
        pseudo_sel_log_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_sel_log_dataset.csv'))
        log_dataset = pd.concat([preliminary_sel_log_dataset,
                                 pseudo_sel_log_dataset,
                                 preliminary_sel_log_dataset_a],
                                ignore_index=True,
                                axis=0).sort_values(
            ['sn', 'time', 'server_model']).reset_index(drop=True)
    else:
        print('Not using pseudo-label data')
        log_dataset = pd.concat([preliminary_sel_log_dataset,
                                 preliminary_sel_log_dataset_a],
                                ignore_index=True,
                                axis=0).sort_values(
            ['sn', 'time', 'server_model']).reset_index(drop=True)
    log_dataset['time'] = log_dataset['time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    return log_dataset
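# Diagnostic helper: for the top-N most important features it computes the
# normalised value counts per dataset, so train and test distributions can be
# compared side by side. It is defined here but not called in this script.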
def get_fea_distribute(feature_df, feature_importances, dataset_type, top=30):
    print('Check the value distribution of the top important features to verify that train and test are distributed consistently')
    fea_distribute_list = []
    for i in feature_importances[:top]['fea'].to_list():
        fea_distribute_tmp = (feature_df[i].value_counts() / len(feature_df)).reset_index().rename(
            columns={'index': 'value'})
        fea_distribute_list.append(fea_distribute_tmp)
    fea_distribute = fea_distribute_list[-1]
    for i in fea_distribute_list[:-1]:
        fea_distribute = fea_distribute.merge(i, on='value', how='left')
    fea_distribute['value'] = fea_distribute['value'].apply(lambda x: f'{dataset_type}_{int(x)}')
    return fea_distribute
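# Joins the label / submission tables with the log data on sn (left join), so each
# (sn, fault_time) pair gets all of its log rows; duplicates are then dropped and
# 'time' is parsed to datetime.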
def get_train_test(label, preliminary_submit_dataset_a, log_dataset):
    print('Building train and test sets')
    train = label.merge(log_dataset, on='sn', how='left')
    test = preliminary_submit_dataset_a.merge(log_dataset, on='sn', how='left')
    # train['time_interval'] = (pd.to_datetime(train['fault_time']) - train['time']).apply(lambda x: x.total_seconds())
    # test['time_interval'] = (pd.to_datetime(test['fault_time']) - test['time']).apply(lambda x: x.total_seconds())
    # train = train.query('time_interval > 0')
    # test = test.query('time_interval > 0')
    print(f'train shape: {train.shape}, test shape: {test.shape}')
    train = train.drop_duplicates().reset_index(drop=True)
    test = test.drop_duplicates().reset_index(drop=True)
    train['time'] = pd.to_datetime(train['time'])
    test['time'] = pd.to_datetime(test['time'])
    return train, test
start_time = datetime.datetime.now()
additional_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'additional_sel_log_dataset.csv')
preliminary_train_label_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset.csv')
preliminary_train_label_dataset_s_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset_s.csv')
preliminary_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_sel_log_dataset.csv')
preliminary_submit_dataset_a_path = os.path.join(TEST_A_DIR, 'final_submit_dataset_b.csv')
preliminary_sel_log_dataset_a_path = os.path.join(TEST_A_DIR, 'final_sel_log_dataset_b.csv')
print(preliminary_submit_dataset_a_path, preliminary_sel_log_dataset_a_path)
preliminary_submit_dataset_a = pd.read_csv(preliminary_submit_dataset_a_path)
preliminary_submit_dataset_a.head()
log_dataset = get_log_dateset(PSEUDO_FALG)
label = get_label(PSEUDO_FALG)
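# Time windows used for the windowed log features. The raw values (3, 5, ..., 600)
# are presumably minutes before the fault; dividing by TIME_INTERVAL turns them
# into interval counts, and the final 1000000 acts as an "all history" sentinel.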
next_time_list = [i / TIME_INTERVAL for i in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600]] + [1000000]
label, preliminary_submit_dataset_a = add_last_next_time4fault(label, preliminary_submit_dataset_a, TIME_INTERVAL,
next_time_list)
train, test = get_train_test(label, preliminary_submit_dataset_a, log_dataset)
train = train.drop_duplicates(['sn', 'fault_time', 'time', 'msg', 'server_model']).reset_index(drop=True)
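# time_interval: seconds between the reported fault_time and each log entry
# (positive when the log line precedes the fault).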
train['time_interval'] = (pd.to_datetime(train['fault_time']) - pd.to_datetime(train['time'])).apply(
lambda x: x.total_seconds())
test['time_interval'] = (pd.to_datetime(test['fault_time']) - pd.to_datetime(test['time'])).apply(
lambda x: x.total_seconds())
all_data = pd.concat([train, test], axis=0, ignore_index=True)
all_data = all_data.sort_values(['sn', 'server_model', 'fault_time', 'time'])
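# Word2vec-style embedding features are fitted on train and test together
# (all_data), keyed by sn; the exact behaviour lives in generate_feature.get_w2v_feats.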
w2v_feats = get_w2v_feats(all_data,
                          f1_list=['sn'],
                          f2_list=['msg_list', 'msg_0', 'msg_1', 'msg_2'])
# Build time_diff_feats_v2
time_diff_feats_v2 = get_time_diff_feats_v2(all_data)
# Build server_model_time_interval_stat_fea
server_model_time_interval_stat_fea = get_server_model_time_interval_stat_fea(all_data)
msg_text_fea = get_msg_text_fea_all(all_data)
# Build time-difference (duration) features
duration_minutes_fea = get_duration_minutes_fea(train, test)
# Build server_model features
server_model_fea = get_server_model_fea(train, test)
counter = get_word_counter(train)
# Build nearest_msg features
nearest_msg_fea = get_nearest_msg_fea(train, test)
# Build server_model beta_target features
beta_target_fea = get_beta_target(train, test)
key = ['sn', 'fault_time', 'label', 'server_model']
fea_num = len(KEY_WORDS)
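# time_list converts the interval counts back to the original time units;
# get_feature presumably uses these windows together with KEY_WORDS to build the
# per-fault aggregate features.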
time_list = [i * TIME_INTERVAL for i in next_time_list]
train = get_feature(train, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'label', 'server_model'])
test = get_feature(test, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'server_model'])
print('Adding duration (time-difference) features')
train = train.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
test = test.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
print('Adding server_model features')
train = train.merge(server_model_fea, on=['sn', 'server_model'])
test = test.merge(server_model_fea, on=['sn', 'server_model'])
print('Adding w2v_feats')
train = train.merge(w2v_feats, on=['sn'])
test = test.merge(w2v_feats, on=['sn'])
print('Adding nearest_msg features')
train = train.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
test = test.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
print('Adding beta_target features')
train = train.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])
test = test.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])
server_model_sn_fea_2 = get_server_model_sn_fea_2(train, test)
print('Adding server_model_sn_fea_2 features')
train = train.merge(server_model_sn_fea_2, on=['sn', 'server_model'])
test = test.merge(server_model_sn_fea_2, on=['sn', 'server_model'])
print('Adding time_diff_feats_v2 features')
train = train.merge(time_diff_feats_v2, on=['sn', 'server_model', 'fault_time'])
test = test.merge(time_diff_feats_v2, on=['sn', 'server_model', 'fault_time'])
# test.to_csv(os.path.join(GENERATION_DIR, 'test.csv'), index=False)
# train.to_csv(os.path.join(GENERATION_DIR, 'train.csv'), index=False)
# crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea.csv'))
# print('Adding crashdump_venus_fea features')
# print(train.shape, test.shape, crashdump_venus_fea.shape)
# train = train.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# print(train.shape, test.shape)
# crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea_v1.csv'))
# print('Adding crashdump_venus_fea features')
# print(train.shape, test.shape, crashdump_venus_fea.shape)
# train = train.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# print(train.shape, test.shape)
# test.to_csv(os.path.join(GENERATION_DIR, 'test.csv'), index=False)
# train.to_csv(os.path.join(GENERATION_DIR, 'train.csv'), index=False)
# print('Adding key_for_top_fea features')
# train, test = get_key_for_top_fea(train, test)
# print('Adding w2v_tfidf_doc2v_fea features')
# w2v_tfidf_fea = pd.read_csv(os.path.join(GENERATION_DIR, 'w2v_tfidf_fea.csv'))
# drop_cols = [i for i in w2v_tfidf_fea if 'doc2vec' in i] + [i for i in w2v_tfidf_fea if 'tfidf' in i]
# for col in drop_cols:
#     del w2v_tfidf_fea[col]
#
# train = train.merge(w2v_tfidf_fea, on=['sn'], how='left')
# test = test.merge(w2v_tfidf_fea, on=['sn'], how='left')
# print('Adding keyword cross features')
# train, test = get_key_word_cross_fea(train, test)
# print('Adding server_model_time_interval_stat_fea features')
# train = train.merge(server_model_time_interval_stat_fea, on=['server_model'], how='left')
# test = test.merge(server_model_time_interval_stat_fea, on=['server_model'], how='left')
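# Columns dropped before training: the hand-picked low-signal columns below plus
# (further down) any column with fewer than two distinct values.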
use_less_cols_1 = ['last_last_msg_cnt', 'last_first_msg_cnt', 'time_diff_1_min',
'last_msg_list_unique_LabelEnc', 'last_msg_0_unique_LabelEnc',
'last_msg_1_unique_LabelEnc', 'last_msg_2_unique_LabelEnc',
'last_msg_list_list_LabelEnc', 'last_msg_0_list_LabelEnc',
'last_msg_1_list_LabelEnc', 'last_msg_2_list_LabelEnc',
'last_msg_0_first_LabelEnc', 'last_msg_1_first_LabelEnc',
'last_msg_2_first_LabelEnc', 'last_msg_0_last_LabelEnc',
'last_msg_1_last_LabelEnc', 'last_msg_2_last_LabelEnc',
'last_msg_last_LabelEnc', 'last_msg_first_LabelEnc']
use_less_col = [i for i in train.columns if train[i].nunique() < 2] + use_less_cols_1
print(f'use_less_col:{len(use_less_col)}')
use_cols = [i for i in train.columns if i not in ['sn', 'fault_time', 'label', 'server_model'] + use_less_col]
cat_cols = ['server_model_LabelEnc', 'msg_LabelEnc', 'msg_0_LabelEnc', 'msg_1_LabelEnc', 'msg_2_LabelEnc', ]
use_cols = sorted(use_cols)
cat_cols = []
for i in use_cols:
    if '_LabelEnc' in i:
        cat_cols.append(i)
print('Number of features used:', len(use_cols), 'number of categorical features:', len(cat_cols))
# fs = FeatureSelector(data=train[use_cols], labels=train['label'])
#
# # Flag features whose missing-value ratio exceeds the threshold
# fs.identify_missing(missing_threshold=0.9)
#
# # # Inspect the selected features
# # fs.ops['missing']
# # Without one-hot encoding the features (one_hot defaults to False), flag features whose correlation exceeds the threshold
# fs.identify_collinear(correlation_threshold=0.99, one_hot=False)
#
# # # Inspect the selected features
# # fs.ops['collinear']
#
# # Flag features with only a single unique value
# fs.identify_single_unique()
#
# # # Inspect the selected features
# # fs.ops['single_unique']
#
# train_removed = fs.remove(methods=['missing', 'single_unique', 'collinear'], keep_one_hot=False)
# use_cols = train_removed.columns
# print('Number of features used after feature selection:', len(use_cols))
oof_prob = np.zeros((train.shape[0], 4))
test_prob = np.zeros((test.shape[0], 4))
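# Seed ensembling: fold probabilities are averaged over the seeds in `seeds`
# (a single seed here; the commented-out list shows the multi-seed variant).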
# seeds = [42,4242,40424,1024,2048]
seeds = [42]
for seed in seeds:
    oof_prob_seed, test_prob_seed, fea_imp_df, model_list = run_cbt(train[use_cols], train[['label']], test[use_cols], k=5,
                                                                    seed=seed, cat_cols=cat_cols)
    oof_prob += oof_prob_seed / len(seeds)
    test_prob += test_prob_seed / len(seeds)
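# search_weight presumably tunes per-class multiplicative weights on the
# out-of-fold probabilities to maximise macro F1; the same weights are then
# applied to the test probabilities before taking the argmax.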
weight = search_weight(train, train[['label']], oof_prob, init_weight=[1.0], class_num=4, step=0.001)
oof_prob = oof_prob * np.array(weight)
test_prob = test_prob * np.array(weight)
target_df = train[['sn', 'fault_time', 'label']]
submit_df = train[['sn', 'fault_time']]
submit_df['label'] = oof_prob.argmax(axis=1)
score = macro_f1(target_df=target_df, submit_df=submit_df)
print(f'********************** BEST MACRO_F1 : {score} **********************')
score = round(score, 5)
y_pred = test_prob.argmax(axis=1)
result = test[['sn', 'fault_time']]
result['label'] = y_pred
result = preliminary_submit_dataset_a.merge(result, on=['sn', 'fault_time'], how='left')[['sn', 'fault_time', 'label']]
result['label'] = result['label'].fillna(0).astype(int)
result.to_csv(os.path.join(RESULT_DIR, 'catboost_result.csv'), index=False)
print(result['label'].value_counts())
fea_imp_df = fea_imp_df.reset_index(drop=True)
fea_imp_df.to_csv(os.path.join(RESULT_DIR, f'cat_fea_imp_{int(score * 100000)}.csv'), index=False)
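# Save the out-of-fold and test class probabilities (prefixed cat_class_),
# presumably for stacking or blending with other models downstream.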
train_result_prob = pd.DataFrame(oof_prob).add_prefix('cat_class_')
test_result_prob = pd.DataFrame(test_prob).add_prefix('cat_class_')
train_result_prob['label'] = train['label']
train_result_prob['sn'] = train['sn']
train_result_prob['fault_time'] = train['fault_time']
test_result_prob['sn'] = test['sn']
test_result_prob['fault_time'] = test['fault_time']
result_prob = pd.concat([train_result_prob, test_result_prob], ignore_index=True)
result_prob.to_csv(os.path.join(RESULT_DIR, 'cat_prob_result.csv'), index=False)
end_time = datetime.datetime.now()
cost_time = end_time - start_time
print('****************** CATBOOST COST TIME : ',str(cost_time),' ******************')
'''
v7: best so far, offline (local CV) 0.7303
v8: v7 plus keyword cross features, fed to the model as categorical variables, 0.73114
'''