import datetime
import os
import pickle

from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm

from gensim.models import Doc2Vec, Word2Vec
from gensim.models.doc2vec import TaggedDocument
from scipy import stats
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

from utils import GENERATION_DIR
from utils import KEY_1, KEY_2, KEY_3, KEY_4
from utils import get_new_cols


def cat2num(df, cat_cols, Transfer2num=True):
    '''
    Label-encode categorical features. The value-to-code maps are persisted
    under GENERATION_DIR so that train and test share the same encoding.

    :param df: input DataFrame
    :param cat_cols: list of categorical feature columns
    :param Transfer2num: if True, map each category to an integer code;
        otherwise just cast the columns to the pandas 'category' dtype
    :return: df with the encoded columns added
    '''
    if Transfer2num:
        print('Transferring category features to numeric features')
        for col in cat_cols:
            map_path = os.path.join(GENERATION_DIR, f'{col}_map.pkl')
            if not os.path.exists(map_path):
                print(f'Transfer: {col}')
                tmp_map = dict(zip(df[col].unique(), range(df[col].nunique())))
                with open(map_path, 'wb') as f:
                    pickle.dump(tmp_map, f)
            else:
                with open(map_path, 'rb') as f:
                    tmp_map = pickle.load(f)
            # unseen categories fall back to -1
            df[f'{col}_LabelEnc'] = df[col].map(tmp_map).fillna(-1).astype(int)
    else:
        print('Casting category features to the category dtype')
        for col in cat_cols:
            df[col] = df[col].astype('category')
    print('Transferring category features to numeric features done...')
    return df

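# A minimal usage sketch (illustrative only; the 'color' column and values
# are hypothetical). The persisted map keeps codes stable across calls, and
# unseen categories map to -1:
# demo = pd.DataFrame({'color': ['red', 'blue', 'red']})
# demo = cat2num(demo, ['color'])   # writes color_map.pkl, adds color_LabelEnc
# new = pd.DataFrame({'color': ['blue', 'green']})
# new = cat2num(new, ['color'])     # reuses the map; 'green' -> -1
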
def add_minutes(x, minutes=5):
    # shift a '%Y-%m-%d %H:%M:%S' timestamp string by `minutes` minutes
    dt = datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    out_date = (dt + datetime.timedelta(minutes=minutes)
                ).strftime('%Y-%m-%d %H:%M:%S')
    return out_date


def time_process(df, time_cols, minutes_):
    # add a `time_{minutes_}` column holding the shifted timestamps
    df[f'time_{minutes_}'] = df[time_cols].apply(
        lambda x: add_minutes(str(x), minutes_))
    return df

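# Sketch of the shift behaviour (hypothetical values; negative offsets walk
# backwards in time):
# add_minutes('2022-01-01 12:00:00', 5)    # -> '2022-01-01 12:05:00'
# add_minutes('2022-01-01 12:00:00', -30)  # -> '2022-01-01 11:30:00'
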
def get_fea(x, fea):
    # binary flag: does the raw msg text contain the key `fea`?
    if fea in x:
        return 1
    else:
        return 0


def get_last_msg_cnt(x):
    # how often the last message of the sequence occurs in the sequence
    last_msg = x[-1]
    cnt = x.count(last_msg)
    return cnt


def get_first_msg_cnt(x):
    # how often the first message of the sequence occurs in the sequence
    first_msg = x[0]
    cnt = x.count(first_msg)
    return cnt

def add_last_next_time4fault(label, preliminary_submit_dataset_a,
                             time_interval, next_time_list):
    print(f'Adding custom time points {time_interval} min before/after each fault')
    for i in tqdm([-i for i in next_time_list] + next_time_list):
        label = time_process(label, 'fault_time', i * time_interval)
        preliminary_submit_dataset_a = time_process(
            preliminary_submit_dataset_a, 'fault_time', i * time_interval)

    return label, preliminary_submit_dataset_a

def get_msg_text_fea(df, msg_type='last'):
    print(f'Extracting msg text features ({msg_type})')

    # concatenate the message token lists per (sn, fault_time)
    df_fea = df.groupby(['sn', 'fault_time']).agg(
        {'msg_list': 'sum', 'msg_0': 'sum', 'msg_1': 'sum', 'msg_2': 'sum'}).reset_index()
    df_fea['msg_list_unique'] = df_fea['msg_list'].apply(lambda x: str(set(x)))
    df_fea['msg_0_unique'] = df_fea['msg_0'].apply(lambda x: str(set(x)))
    df_fea['msg_1_unique'] = df_fea['msg_1'].apply(lambda x: str(set(x)))
    df_fea['msg_2_unique'] = df_fea['msg_2'].apply(lambda x: str(set(x)))

    df_fea['msg_list_list'] = df_fea['msg_list'].apply(lambda x: str(x))
    df_fea['msg_0_list'] = df_fea['msg_0'].apply(lambda x: str(x))
    df_fea['msg_1_list'] = df_fea['msg_1'].apply(lambda x: str(x))
    df_fea['msg_2_list'] = df_fea['msg_2'].apply(lambda x: str(x))

    df_fea['msg_0_first'] = df_fea['msg_0'].apply(lambda x: x[0])
    df_fea['msg_1_first'] = df_fea['msg_1'].apply(lambda x: x[0])
    df_fea['msg_2_first'] = df_fea['msg_2'].apply(lambda x: x[0])

    df_fea['msg_0_last'] = df_fea['msg_0'].apply(lambda x: x[-1])
    df_fea['msg_1_last'] = df_fea['msg_1'].apply(lambda x: x[-1])
    df_fea['msg_2_last'] = df_fea['msg_2'].apply(lambda x: x[-1])

    df_fea['msg_last'] = df.groupby(['sn', 'fault_time']).apply(
        lambda x: x['msg'].to_list()[-1]).values
    df_fea['msg_first'] = df.groupby(['sn', 'fault_time']).apply(
        lambda x: x['msg'].to_list()[0]).values

    df_fea['last_msg_cnt'] = df_fea['msg_list'].apply(
        lambda x: get_last_msg_cnt(x))
    df_fea['first_msg_cnt'] = df_fea['msg_list'].apply(
        lambda x: get_first_msg_cnt(x))
    cat_cols = ['msg_list', 'msg_0', 'msg_1', 'msg_2',
                'msg_list_unique', 'msg_0_unique', 'msg_1_unique', 'msg_2_unique',
                'msg_list_list', 'msg_0_list', 'msg_1_list', 'msg_2_list',
                'msg_0_first', 'msg_1_first', 'msg_2_first', 'msg_0_last', 'msg_1_last',
                'msg_2_last', 'msg_last', 'msg_first']
    num_cols = ['last_msg_cnt', 'first_msg_cnt']
    id_cols = ['sn', 'fault_time']

    df_fea = df_fea.rename(
        columns={
            i: f'{msg_type}_{i}' for i in (cat_cols + num_cols)})
    cat_cols = [f'{msg_type}_{i}' for i in cat_cols]
    for cat_col in cat_cols:
        df_fea[cat_col] = df_fea[cat_col].astype(str)
    # label-encode the text columns, then drop the raw string versions
    df_fea = cat2num(df_fea, cat_cols, Transfer2num=True)
    for i in cat_cols:
        del df_fea[i]
    return df_fea

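# Note: get_msg_text_fea assumes the list columns msg_list / msg_0 / msg_1 /
# msg_2 already exist on df; get_msg_text_fea_all (further below) builds them
# from the raw 'msg' column before delegating here, e.g.:
# msg_text_fea = get_msg_text_fea(last_data, msg_type='last')
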
def add_w2v_feats(all_data, w2v_feats_df, f1, f2, emb_size=32, window=5, min_count=5):
    print(f'Generating {f1}_{f2}_w2v features')

    # one "sentence" of f2 tokens per f1 key
    df_fea = all_data.groupby(f1).agg({f2: 'sum'}).reset_index()
    df_emb = df_fea[[f1]]
    sentences = df_fea[f2].to_list()
    model_path = os.path.join(GENERATION_DIR, f'{f1}_{f2}_w2v_model.pkl')
    if not os.path.exists(model_path):
        print(f'{f1}_{f2}_w2v_model not found, training......')
        model = Word2Vec(sentences, vector_size=emb_size, window=window,
                         min_count=min_count, sg=0, hs=1, seed=42)
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
    else:
        print(f'{f1}_{f2}_w2v_model already exists, loading......')
        with open(model_path, 'rb') as f:
            model = pickle.load(f)

    # mean-pool the word vectors of each sentence
    emb_matrix_mean = []
    for sent in sentences:
        vec = [model.wv[w] for w in sent if w in model.wv]
        if len(vec) > 0:
            emb_matrix_mean.append(np.mean(vec, axis=0))
        else:
            emb_matrix_mean.append([0] * emb_size)
    df_emb_mean = pd.DataFrame(emb_matrix_mean).add_prefix(f'{f1}_{f2}_w2v_')

    df_emb = pd.concat([df_emb, df_emb_mean], axis=1)
    w2v_feats_df = w2v_feats_df.merge(df_emb, on=f1, how='left')
    return w2v_feats_df

def get_w2v_feats(all_data, f1_list, f2_list):
    all_data['msg_list'] = all_data['msg'].apply(lambda x: [i.strip() for i in x.split(' | ')])
    all_data['msg_0'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 0)])
    all_data['msg_1'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 1)])
    all_data['msg_2'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 2)])
    w2v_feats_df = all_data[f1_list].drop_duplicates()
    for f1 in f1_list:
        for f2 in f2_list:
            w2v_feats_df = add_w2v_feats(all_data, w2v_feats_df, f1, f2,
                                         emb_size=10, window=5, min_count=5)
    print(f'w2v_feats feature shape: {w2v_feats_df.shape}')
    return w2v_feats_df

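# A hedged usage sketch: group the log messages by serial number and embed
# the token columns built above (assumes all_data carries 'sn' and 'msg'):
# w2v_feats_df = get_w2v_feats(all_data, f1_list=['sn'],
#                              f2_list=['msg_list', 'msg_0', 'msg_1', 'msg_2'])
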
def get_time_diff_feats_v2(all_data):
    print('Generating time-difference features time_diff_feats_v2')
    all_data['duration_seconds'] = all_data['time_interval']
    all_data['duration_minutes'] = all_data['time_interval'] / 60
    df_merge_log = all_data[['sn', 'fault_time', 'label', 'time', 'msg',
                             'server_model', 'time_interval', 'duration_seconds',
                             'duration_minutes']]
    df_merge_log['fault_id'] = df_merge_log['sn'] + '_' + df_merge_log['fault_time'] + '_' + df_merge_log[
        'server_model']
    f1_list = ['fault_id', 'sn', 'server_model']
    f2_list = ['duration_minutes', 'duration_seconds']
    time_diff_feats_v2 = df_merge_log[['sn', 'fault_time', 'fault_id', 'server_model']].drop_duplicates().reset_index(
        drop=True)

    for f1 in f1_list:
        for f2 in f2_list:
            func_opt = ['count', 'nunique', 'min', 'max', 'median', 'sum']
            for opt in func_opt:
                tmp = df_merge_log.groupby([f1])[f2].agg([(f'{f2}_in_{f1}_' + opt, opt)]).reset_index()
                time_diff_feats_v2 = time_diff_feats_v2.merge(tmp, on=f1, how='left')

            temp = df_merge_log.groupby([f1])[f2].apply(lambda x: stats.mode(x)[0][0])
            time_diff_feats_v2[f'{f2}_in_{f1}_mode'] = time_diff_feats_v2[f1].map(temp)
            secs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
            for sec in secs:
                # round() keeps the quantile suffix clean (str(0.1 * 100)
                # would yield '10.000000000000002')
                temp = df_merge_log.groupby([f1])[f2].quantile(sec).reset_index(
                    name=f'log_{f2}_in_{f1}_quantile_' + str(round(sec * 100)))
                time_diff_feats_v2 = pd.merge(time_diff_feats_v2, temp, on=f1, how='left')
    del time_diff_feats_v2['fault_id']
    return time_diff_feats_v2

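# The agg([(name, opt)]) idiom above yields pre-named columns, e.g. with a
# hypothetical toy frame:
# pd.DataFrame({'g': ['a', 'a', 'b'], 'v': [1, 2, 3]}) \
#     .groupby(['g'])['v'].agg([('v_in_g_max', 'max')]).reset_index()
# ->   g  v_in_g_max
# 0    a           2
# 1    b           3
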
def get_feature(data, time_list, log_fea, fea_num, key):
    print(f'Current feature shape {data.shape}')
    fea_df_list = []
    fea_cnt_list = ['OEM record c2', 'Processor CPU_Core_Error', '001c4c', 'System Event Sys_Event', 'OEM CPU0 MCERR',
                    'OEM CPU0 CATERR', 'Reading 0 < Threshold 2 degrees C', '0203c0a80101',
                    'Unknown CPU0 MCERR', 'Unknown CPU0 CATERR', 'Memory', 'Correctable ECC logging limit reached',
                    'Memory MEM_CHE0_Status', 'Memory Memory_Status', 'Memory #0x87', 'Memory CPU0F0_DIMM_Stat',
                    'Drive Fault', 'NMI/Diag Interrupt', 'Failure detected', 'Power Supply AC lost', ]
    for time_tmp in tqdm(time_list):
        print(f'Aggregating the data from {time_tmp} min before/after the fault')
        tmp1 = data[(pd.to_datetime(data['time']) < pd.to_datetime(data[f'time_{time_tmp}'])) & (pd.to_datetime(data['time']) > pd.to_datetime(data[f'time_-{time_tmp}']))].sort_values(
            ['sn', 'fault_time'])
        tmp1 = tmp1.groupby(key).apply(
            lambda x: ' | '.join(x['msg'].to_list())).reset_index().rename(columns={0: 'msg'})
        tmp1['msg_len'] = tmp1['msg'].apply(lambda x: len(x.split(' | ')))
        # tmp1['msg_len_two'] = tmp1['msg'].apply(lambda x: len(x))
        # count of numbers in the msg text (disabled)
        # tmp1['msg_num_two'] = tmp1['msg'].apply(
        #     lambda x: len([int(s) for s in re.findall(r'\b\d+\b', x)]))
        print(f'Extracting {fea_num} sparse features from the logs within {time_tmp} min of the fault')
        feature = log_fea + ['msg_len']
        # iterate over log_fea only: 'msg_len' is already computed above, and
        # the count columns are appended to `feature` just for the rename below
        # (iterating over `feature` itself would overwrite msg_len and the
        # freshly added count columns with get_fea flags)
        for fea in log_fea:
            tmp1[fea] = tmp1['msg'].apply(lambda x: get_fea(x, fea))
            # token-level count features for selected keys
            if fea in fea_cnt_list:
                tmp1[f'{fea}_cnt'] = tmp1['msg'].apply(
                    lambda x: x.replace('|', ' ').replace('_', ' ').split(' ').count(fea))
                feature.append(f'{fea}_cnt')
        tmp1_new_col_map = {i: i + '_' + str(int(time_tmp)) for i in feature}
        tmp1 = tmp1.rename(columns=tmp1_new_col_map)
        del tmp1['msg']
        fea_df_list.append(tmp1)
    fea_df = fea_df_list[-1]
    print(fea_df.shape)
    for i in fea_df_list[:-1]:
        fea_df = fea_df.merge(i, on=key, how='left')
        print(fea_df.shape)
    return fea_df

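# A hedged end-to-end sketch of the windowed aggregation (variable names
# label / sub_a / data / log_fea are placeholders for the caller's frames;
# the time_{t} columns come from add_last_next_time4fault):
# label, sub_a = add_last_next_time4fault(label, sub_a, time_interval=1,
#                                         next_time_list=[3, 5, 10])
# fea_df = get_feature(data, time_list=[3, 5, 10], log_fea=log_fea,
#                      fea_num=len(log_fea), key=['sn', 'fault_time'])
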
def get_msg_location(x, num):
    # return the num-th ' | '-separated part of a message;
    # '其它' ("other") is the fallback for messages with fewer parts
    try:
        return x[num]
    except IndexError:
        return '其它'

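# e.g. get_msg_location('a | b'.split(' | '), 1) -> 'b'
#      get_msg_location('a | b'.split(' | '), 2) -> '其它'
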
def get_nearest_msg_fea(train, test):
    print('Generating nearest_msg features')
    df = pd.concat([train, test], axis=0, ignore_index=True)
    # note: despite the name, this is measured in seconds
    df['duration_minutes'] = (pd.to_datetime(df['fault_time']) - pd.to_datetime(df['time'])).apply(
        lambda x: x.total_seconds())
    df = df.sort_values(
        ['sn', 'server_model', 'fault_time', 'time']).reset_index(drop=True)
    df['duration_minutes_abs'] = np.abs(df['duration_minutes'])

    df['duration_minutes_abs_rank'] = df.groupby(['sn', 'server_model', 'fault_time'])['duration_minutes_abs'].rank(
        method='first', ascending=False)

    # keep a single message per fault: after the descending sort this retains
    # the row with the largest absolute time gap
    key = ['sn', 'server_model', 'fault_time', 'duration_minutes_abs']
    df = df.sort_values(key, ascending=False)
    df = df.drop_duplicates(
        ['sn', 'server_model', 'fault_time', ], keep='first')

    # last_or_next: 1 if the message precedes the fault, else 0
    df.loc[df['duration_minutes'] ==
           df['duration_minutes_abs'], 'last_or_next'] = 1
    df.loc[df['duration_minutes'] !=
           df['duration_minutes_abs'], 'last_or_next'] = 0
    df['msg_cnt'] = df['msg'].map(df['msg'].value_counts())
    df['msg_0'] = df['msg'].apply(
        lambda x: get_msg_location(x.split(' | '), 0))
    df['msg_0_cnt'] = df['msg_0'].map(df['msg_0'].value_counts())
    df['msg_1'] = df['msg'].apply(
        lambda x: get_msg_location(x.split(' | '), 1))
    df['msg_1_cnt'] = df['msg_1'].map(df['msg_1'].value_counts())
    df['msg_2'] = df['msg'].apply(
        lambda x: get_msg_location(x.split(' | '), 2))
    df['msg_2_cnt'] = df['msg_2'].map(df['msg_2'].value_counts())
    cat_feats = ['msg', 'msg_0', 'msg_1', 'msg_2']
    # for name in cat_feats:
    #     le = LabelEncoder()
    #     df[f'{name}_LabelEnc'] = le.fit_transform(df[name])
    df = cat2num(df, cat_feats)
    df = df.drop_duplicates().reset_index(drop=True)
    df = df[['sn', 'server_model', 'fault_time', 'msg_cnt',
             'msg_0_cnt', 'msg_1_cnt', 'msg_2_cnt',
             # 'duration_minutes_abs', 'duration_minutes', 'duration_minutes_abs_rank',
             'last_or_next', 'msg_LabelEnc', 'msg_0_LabelEnc', 'msg_1_LabelEnc', 'msg_2_LabelEnc']]
    print(f'Generated nearest_msg features, feature shape {df.shape}')
    return df

def get_server_model_time_interval_stat_fea(all_data):
    server_model_time_interval_stat_fea = all_data.groupby('server_model').agg(
        {'time_interval': ['min', 'max', 'mean', 'median']}).reset_index()
    server_model_time_interval_stat_fea = get_new_cols(
        server_model_time_interval_stat_fea, key=['server_model'])

    server_model_time_interval_stat_fea.columns = ['server_model', 'sm_time_interval_min', 'sm_time_interval_max',
                                                   'sm_time_interval_mean', 'sm_time_interval_median']
    return server_model_time_interval_stat_fea

def get_server_model_sn_fea_2(train, test):
    df = pd.concat([train[['sn', 'server_model']],
                    test[['sn', 'server_model']]], ignore_index=True)
    df['server_model_count_sn_2'] = df.groupby(
        ['server_model'])['sn'].transform('count')
    df['server_model_nunique_sn_2'] = df.groupby(
        ['server_model'])['sn'].transform('nunique')
    df['sn_cnt_2'] = df['sn'].map(df['sn'].value_counts())
    return df.drop_duplicates().reset_index(drop=True)

def get_4_time_stat_fea(df):
    print(' Generating time statistics features')
    time_stat_fea_df = df.groupby(['sn', 'fault_time', 'server_model']).agg(
        {'duration_minutes': ['min', 'max', 'mean', 'median', 'skew', 'sum', 'std', 'count'],
         'log_duration_minutes': ['min', 'max', 'mean', 'median', 'skew', 'sum', 'std'],
         'time_diff_1': ['min', 'max', 'mean', 'median', 'skew', 'sum', 'std'],
         'log_time_diff_1': ['min', 'max', 'median'],
         }).reset_index()
    new_time_stat_cols = []
    for i in time_stat_fea_df.columns:
        if i[0] in ['sn', 'fault_time', 'server_model']:
            new_time_stat_cols.append(i[0])
        else:
            new_time_stat_cols.append(f'{i[0]}_{i[1]}')
            # cap +/-inf values (e.g. log(0) -> -inf) at sentinel values
            time_stat_fea_df.loc[time_stat_fea_df[i[0]]
                                 [i[1]] == -np.inf, (i[0], i[1])] = -20
            time_stat_fea_df.loc[time_stat_fea_df[i[0]]
                                 [i[1]] == np.inf, (i[0], i[1])] = 30
    time_stat_fea_df.columns = new_time_stat_cols
    time_stat_fea_df['duration_minutes_range'] = time_stat_fea_df['duration_minutes_max'] - time_stat_fea_df[
        'duration_minutes_min']
    time_stat_fea_df['log_duration_minutes_range'] = time_stat_fea_df['log_duration_minutes_max'] - time_stat_fea_df[
        'log_duration_minutes_min']
    time_stat_fea_df['time_diff_1_range'] = time_stat_fea_df['time_diff_1_max'] - \
        time_stat_fea_df['time_diff_1_min']
    time_stat_fea_df['log_time_diff_1_range'] = time_stat_fea_df['log_time_diff_1_max'] - time_stat_fea_df[
        'log_time_diff_1_min']
    time_stat_fea_df['duration_minutes_freq'] = time_stat_fea_df['duration_minutes_range'] / time_stat_fea_df[
        'duration_minutes_count']
    print(f' Generated time statistics features, feature shape: {time_stat_fea_df.shape}')
    return time_stat_fea_df

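# The flattening above turns the MultiIndex agg columns into flat names,
# e.g. ('duration_minutes', 'max') -> 'duration_minutes_max'. Toy sketch
# with a hypothetical frame:
# t = pd.DataFrame({'g': ['a', 'a'], 'v': [1, 2]}).groupby('g').agg({'v': ['min', 'max']}).reset_index()
# t.columns = [c[0] if c[0] == 'g' else f'{c[0]}_{c[1]}' for c in t.columns]
# -> columns: ['g', 'v_min', 'v_max']
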
def get_time_std_fea(train, test):
    print('Generating time-std features')
    df = pd.concat([train, test], axis=0, ignore_index=True)
    # df['year'] = df['time'].dt.year
    # df['month'] = df['time'].dt.month
    df['hour'] = df['time'].dt.hour
    # df['week'] = df['time'].dt.week
    df['minute'] = df['time'].dt.minute
    time_std = df.groupby(['sn', 'server_model']).agg(
        {'hour': 'std', 'minute': 'std'}).reset_index()
    time_std = time_std.rename(
        columns={
            'hour': 'hour_std',
            'minute': 'minute_std'})
    return time_std

def get_key(all_data):
    # for each class, count message tokens and assign every token to the
    # class in which it occurs most often; return one keyword list per class
    all_data['msg_list'] = all_data['msg'].apply(lambda x: [i.strip() for i in x.split(' | ')])
    class_fea_cnt_list = []
    for label in [0, 1, 2, 3]:
        class_df = all_data.query(f'label == {label}')
        counter = Counter()
        for i in class_df['msg_list']:
            counter.update(i)
        class_fea_cnt = pd.DataFrame({i[0]: i[1] for i in counter.most_common()},
                                     index=[f'fea_cnt_{label}']).T.reset_index().rename(columns={'index': 'fea'})
        class_fea_cnt_list.append(class_fea_cnt)

    fea_cnt_df = class_fea_cnt_list[0]
    for tmp in class_fea_cnt_list[1:]:
        fea_cnt_df = fea_cnt_df.merge(tmp, on='fea')

    fea_cnt_df['fea_cnt_sum'] = fea_cnt_df.loc[:, ['fea_cnt_0', 'fea_cnt_1', 'fea_cnt_2', 'fea_cnt_3']].sum(1)

    all_fea_cnt = fea_cnt_df['fea_cnt_sum'].sum()

    for i in ['fea_cnt_0', 'fea_cnt_1', 'fea_cnt_2', 'fea_cnt_3']:
        fea_cnt_df[f'{i}_ratio'] = fea_cnt_df[i] / fea_cnt_df['fea_cnt_sum']
        fea_cnt_df[f'{i}_all_ratio'] = fea_cnt_df[i] / all_fea_cnt

    fea_cnt_df['fea_cnt_ratio_std'] = fea_cnt_df.loc[:, ['fea_cnt_0_ratio', 'fea_cnt_1_ratio', 'fea_cnt_2_ratio', 'fea_cnt_3_ratio', ]].std(1)
    fea_cnt_df['fea_cnt_std'] = fea_cnt_df.loc[:, ['fea_cnt_0', 'fea_cnt_1', 'fea_cnt_2', 'fea_cnt_3', ]].std(1)

    fea_cnt_df['fea_cnt_all_ratio_std'] = fea_cnt_df.loc[:, ['fea_cnt_0_all_ratio', 'fea_cnt_1_all_ratio',
                                                             'fea_cnt_2_all_ratio', 'fea_cnt_3_all_ratio', ]].std(1)

    fea_cnt_df = fea_cnt_df[~fea_cnt_df['fea_cnt_ratio_std'].isnull()].sort_values('fea_cnt_ratio_std', ascending=False)

    fea_cnt_df['fea_max'] = np.argmax(fea_cnt_df.loc[:, ['fea_cnt_0', 'fea_cnt_1', 'fea_cnt_2', 'fea_cnt_3', ]].values, axis=1)
    key_0 = fea_cnt_df.query('fea_max == 0')['fea'].to_list()
    key_1 = fea_cnt_df.query('fea_max == 1')['fea'].to_list()
    key_2 = fea_cnt_df.query('fea_max == 2')['fea'].to_list()
    key_3 = fea_cnt_df.query('fea_max == 3')['fea'].to_list()
    # key_1 = ['OEM record c2','Processor CPU_Core_Error','001c4c','System Event Sys_Event','Power Supply PS0_Status','Temperature CPU0_Margin_Temp','Reading 51 > Threshold 85 degrees C','Lower Non-critical going low','Temperature CPU1_Margin_Temp','System ACPI Power State #0x7d','Lower Critical going low']
    # key_2 = ['OEM CPU0 MCERR','OEM CPU0 CATERR','Reading 0 < Threshold 2 degrees C','0203c0a80101','Unknown CPU0 MCERR','Unknown CPU0 CATERR','Microcontroller #0x3b','System Boot Initiated','Processor #0xfa','Power Unit Pwr Unit Status','Hard reset','Power off/down','System Event #0xff','Memory CPU1A1_DIMM_Stat','000000','Power cycle','OEM record c3','Memory CPU1C0_DIMM_Stat','Reading 0 < Threshold 1 degrees C','IERR']
    # key_3 = ['Memory','Correctable ECC logging limit reached','Memory MEM_CHE0_Status','Memory Memory_Status','Memory #0x87','Memory CPU0F0_DIMM_Stat','Memory Device Disabled','Memory #0xe2','OS Stop/Shutdown OS Status','System Boot Initiated System Restart','OS Boot BIOS_Boot_Up','System Boot Initiated BIOS_Boot_UP','Memory DIMM101','OS graceful shutdown','OS Critical Stop OS Status','Memory #0xf9','Memory CPU0C0_DIMM_Stat','Memory DIMM111','Memory DIMM021',]
    # key_4 = ['Drive Fault','NMI/Diag Interrupt','Failure detected','Power Supply AC lost','Power Supply PSU0_Supply','AC out-of-range, but present','Predictive failure','Drive Present','Temperature Temp_DIMM_KLM','Temperature Temp_DIMM_DEF','Power Supply PS1_Status','Identify Status','Power Supply PS2_Status','Temperature DIMMG1_Temp','Upper Non-critical going high','Temperature DIMMG0_Temp','Upper Critical going high','Power Button pressed','System Boot Initiated #0xb8','Deasserted']
    return key_0, key_1, key_2, key_3

def get_class_key_words_nunique(all_data):
    print('Extracting class_key_words_nunique features')

    key_0, key_1, key_2, key_3 = get_key(all_data)

    df = all_data[['sn', 'fault_time', 'msg_list']]
    df_tmp = df.groupby(['sn']).agg({'msg_list': 'sum'}).reset_index()
    # per sn: how many distinct keywords of each class appear in its logs
    df_tmp['class_0_key_words_nunique'] = df_tmp['msg_list'].apply(lambda x: len(set(x) & set(key_0)))
    df_tmp['class_1_key_words_nunique'] = df_tmp['msg_list'].apply(lambda x: len(set(x) & set(key_1)))
    df_tmp['class_2_key_words_nunique'] = df_tmp['msg_list'].apply(lambda x: len(set(x) & set(key_2)))
    df_tmp['class_3_key_words_nunique'] = df_tmp['msg_list'].apply(lambda x: len(set(x) & set(key_3)))
    del df_tmp['msg_list']
    return df_tmp

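# Hedged usage sketch (assumes all_data holds labelled rows with 'sn',
# 'fault_time' and 'msg'): the keyword lists are derived from the training
# labels inside get_key, then intersected with each server's message set:
# key_word_nunique_df = get_class_key_words_nunique(all_data)
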
def get_key_for_top_fea(train, test):
    KEY_FOR_TOP_COLS = []
    print('Adding key_for_top_fea features')
    for TIME in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600, 60000000]:
        for i in range(10):
            # cross the i-th keyword flag of each class within a time window
            train[f'KEY_FOR_TOP_{i}_{TIME}'] = train[f'{KEY_1[i]}_{TIME}'].astype(str) + '_' + train[f'{KEY_2[i]}_{TIME}'].astype(str) + '_' + train[f'{KEY_3[i]}_{TIME}'].astype(str) + '_' + train[f'{KEY_4[i]}_{TIME}'].astype(str)
            test[f'KEY_FOR_TOP_{i}_{TIME}'] = test[f'{KEY_1[i]}_{TIME}'].astype(str) + '_' + test[f'{KEY_2[i]}_{TIME}'].astype(str) + '_' + test[f'{KEY_3[i]}_{TIME}'].astype(str) + '_' + test[f'{KEY_4[i]}_{TIME}'].astype(str)
            KEY_FOR_TOP_COLS.append(f'KEY_FOR_TOP_{i}_{TIME}')
    train = cat2num(train, KEY_FOR_TOP_COLS)
    test = cat2num(test, KEY_FOR_TOP_COLS)
    for KEY_FOR_TOP_COL in KEY_FOR_TOP_COLS:
        del train[KEY_FOR_TOP_COL]
        del test[KEY_FOR_TOP_COL]
    return train, test

def get_key_word_cross_fea(train, test):
    print('Extracting keyword cross features......')
    KEY_WORDS_MAP = {'CPU0': KEY_1, 'CPU1': KEY_2, 'CPU2': KEY_3, 'CPU3': KEY_4}
    KEY_WORDS_CROSS_COLS = []
    for KEY_WORDS in KEY_WORDS_MAP:
        for i in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600, 60000000]:
            KEY_WORDS_COLS = [f'{col}_{i}' for col in KEY_WORDS_MAP[KEY_WORDS]]
            # concatenate the flags of one keyword group within a window
            train[f'{KEY_WORDS}_WORDS_{i}'] = train[KEY_WORDS_COLS].astype(str).sum(1)
            test[f'{KEY_WORDS}_WORDS_{i}'] = test[KEY_WORDS_COLS].astype(str).sum(1)
            KEY_WORDS_CROSS_COLS.append(f'{KEY_WORDS}_WORDS_{i}')
    train = cat2num(train, KEY_WORDS_CROSS_COLS)
    test = cat2num(test, KEY_WORDS_CROSS_COLS)

    for COLS in KEY_WORDS_CROSS_COLS:
        del train[COLS]
        del test[COLS]
    print('Extracting keyword cross features done......')
    return train, test

def get_time_quantile_fea(df):
    print(' Generating time quantile features')
    secs = [0.2, 0.4, 0.6, 0.8]
    time_fea_list = []
    for sec in tqdm(secs):
        for time_fea_type in [
                'duration_minutes', 'log_duration_minutes', 'time_diff_1', 'log_time_diff_1']:
            # round() keeps the quantile suffix clean (floating-point noise
            # would otherwise leak into the column name)
            temp = df.groupby(['sn', 'server_model', 'fault_time'])[time_fea_type].quantile(sec).reset_index(
                name=f'{time_fea_type}_' + str(round(sec * 100)))

            time_fea_list.append(temp)
    time_fea_df = time_fea_list[0]
    for time_fea in time_fea_list[1:]:
        time_fea_df = time_fea_df.merge(
            time_fea, how='left', on=[
                'sn', 'server_model', 'fault_time'])
    print(f' Generated time quantile features, feature shape: {time_fea_df.shape}')
    return time_fea_df

def get_server_model_fea(train, test):
    print('Generating server_model features')
    df = pd.concat([train, test], axis=0, ignore_index=True)
    df['server_model_count_sn'] = df.groupby(
        ['server_model'])['sn'].transform('count')
    df['server_model_nunique_sn'] = df.groupby(
        ['server_model'])['sn'].transform('nunique')
    # df['server_model_count'] = df.groupby('server_model')['server_model'].transform('count')
    # df['server_model_cnt_quantile'] = df['server_model'].map(
    #     df['server_model'].value_counts().rank() / len(df['server_model'].unique()))
    # df['server_model_cnt_rank'] = df['server_model_cnt_quantile'].rank(method='min')

    df['sn_cnt'] = df['sn'].map(df['sn'].value_counts())
    df['sn_freq'] = df['sn'].map(df['sn'].value_counts() / len(df))
    df['server_model_cnt'] = df['server_model'].map(
        df['server_model'].value_counts())
    df['server_model_freq'] = df['server_model'].map(
        df['server_model'].value_counts() / len(df))
    select_cols = ['sn', 'server_model',
                   'server_model_count_sn', 'server_model_nunique_sn',
                   'sn_cnt', 'sn_freq', 'server_model_cnt', 'server_model_freq'
                   # 'server_model_count', 'server_model_cnt_quantile', 'server_model_cnt_rank'
                   ]
    server_model_fea = df[select_cols]

    cat_feats = ['server_model']
    # for name in cat_feats:
    #     le = LabelEncoder()
    #     server_model_fea[f'{name}_LabelEnc'] = le.fit_transform(
    #         server_model_fea[name])
    server_model_fea = cat2num(server_model_fea, cat_feats, Transfer2num=True)
    server_model_fea = server_model_fea.drop_duplicates().reset_index(drop=True)
    print(f'Generated server_model features, feature shape: {server_model_fea.shape}')

    return server_model_fea

def get_time_type_msg_unique_fea(df):
    df['msg_list'] = df['msg'].apply(
        lambda x: [i.strip() for i in x.split(' | ')])

    df['msg_0'] = df['msg'].apply(
        lambda x: [get_msg_location(x.split(' | '), 0)])
    df['msg_1'] = df['msg'].apply(
        lambda x: [get_msg_location(x.split(' | '), 1)])
    df['msg_2'] = df['msg'].apply(
        lambda x: [get_msg_location(x.split(' | '), 2)])

    df = df.groupby(['sn', 'fault_time']).agg(
        {'msg_list': 'sum', 'msg_0': 'sum', 'msg_1': 'sum', 'msg_2': 'sum'}).reset_index()

    # deduplicated message sets, serialised as '|'-joined strings
    df['msg_set'] = df['msg_list'].apply(lambda x: '|'.join(list(set(x))))

    df['msg_0_set'] = df['msg_0'].apply(lambda x: '|'.join(list(set(x))))
    df['msg_1_set'] = df['msg_1'].apply(lambda x: '|'.join(list(set(x))))
    df['msg_2_set'] = df['msg_2'].apply(lambda x: '|'.join(list(set(x))))
    df = df[['sn', 'fault_time', 'msg_set',
             'msg_0_set', 'msg_1_set', 'msg_2_set']]
    return df

def get_msg_unique_fea(train, test, time_type='last'):
    print('Generating msg_unique features')
    common_cols = ['msg_set', 'msg_0_set', 'msg_1_set', 'msg_2_set']
    df = pd.concat([train, test], axis=0, ignore_index=True)
    # seconds from log time to fault time; positive means the log precedes the fault
    df['time_interval'] = (
        pd.to_datetime(df['fault_time']) - df['time']).apply(
        lambda x: x.total_seconds())

    last_fea = get_time_type_msg_unique_fea(df.query('time_interval > 0'))
    last_fea = last_fea.rename(columns={i: f'last_{i}' for i in common_cols})
    next_fea = get_time_type_msg_unique_fea(df.query('time_interval < 0'))
    next_fea = next_fea.rename(columns={i: f'next_{i}' for i in common_cols})
    all_fea = get_time_type_msg_unique_fea(df)
    all_fea = all_fea.rename(columns={i: f'all_{i}' for i in common_cols})
    msg_unique_fea = all_fea.merge(
        last_fea, on=['sn', 'fault_time'], how='outer')
    msg_unique_fea = msg_unique_fea.merge(
        next_fea, on=['sn', 'fault_time'], how='outer')
    return msg_unique_fea

def get_duration_minutes_fea(train, test):
    print('Generating duration_minutes features')
    df = pd.concat([train, test], axis=0, ignore_index=True)
    # seconds from log time to fault time (the name is kept for compatibility)
    df['duration_minutes'] = (pd.to_datetime(df['fault_time']) - pd.to_datetime(df['time'])).apply(
        lambda x: x.total_seconds())
    # log of zero/negative durations yields -inf/NaN; get_4_time_stat_fea caps these
    df['log_duration_minutes'] = np.log(df['duration_minutes'])

    df = df.sort_values(['sn', 'label', 'server_model',
                         'fault_time', 'time']).reset_index(drop=True)
    df['time_diff_1'] = (df.groupby(['sn', 'server_model', 'fault_time'])['time'].diff(1)).apply(
        lambda x: x.total_seconds())
    df['time_diff_1'] = df['time_diff_1'].fillna(0)
    df['log_time_diff_1'] = np.log(df['time_diff_1'])

    # time_quantile_fea_df = get_time_quantile_fea(df)
    # time_stat_fea_df = get_4_time_stat_fea(df)
    # df_tmp = time_quantile_fea_df.merge(time_stat_fea_df, on=['sn', 'server_model', 'fault_time'], how='left')
    time_stat_fea_df = get_4_time_stat_fea(df)
    df_tmp = time_stat_fea_df
    print(f'Generated duration_minutes features, feature shape {df_tmp.shape}')
    return df_tmp

def get_msg_text_fea_all(all_data):
    all_data['label'] = all_data['label'].fillna(-1)
    all_data['msg_list'] = all_data['msg'].apply(lambda x: [i.strip() for i in x.split(' | ')])
    all_data['msg_0'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 0)])
    all_data['msg_1'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 1)])
    all_data['msg_2'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 2)])

    all_data = all_data.sort_values(['sn', 'fault_time', 'time']).reset_index(drop=True)
    del all_data['label']
    # logs before the fault ("last") and at/after it ("next")
    last_data = all_data.query('time_interval > 0')
    next_data = all_data.query('time_interval <= 0')

    # id_cols = ['sn', 'fault_time', 'label']

    # all_msg_text_fea = get_msg_text_fea(all_data, msg_type='all')
    last_msg_text_fea = get_msg_text_fea(last_data, msg_type='last')
    # next_msg_text_fea = get_msg_text_fea(next_data, msg_type='next')
    msg_text_fea = last_msg_text_fea
    return msg_text_fea

def get_test_key_words(train, test):

    df = pd.concat([train[['sn', 'fault_time', 'label', 'msg']],
                    test[['sn', 'fault_time', 'msg']]],
                   ignore_index=True).drop_duplicates(['sn', 'fault_time', 'msg'])
    # test rows get the pseudo-label 5
    df['label'] = df['label'].fillna(5)
    df['msg_list'] = df['msg'].apply(lambda x: [i.strip() for i in x.split(' | ')])
    words_cnt_df_list = []
    for label in df['label'].unique():
        label = int(label)
        df_tmp = df.query(f'label == {label}')
        counter = Counter()
        for words in df_tmp['msg_list']:
            words = [i.replace('_', ' ') for i in words]
            # word_list = []
            # for i in words:
            #     word_list += i.split(' ')
            # words = word_list
            counter.update(words)
        words_cnt_df = pd.DataFrame(counter, index=[0]).T.reset_index().rename(
            columns={'index': 'word', 0: f'cnt_{label}'})
        words_cnt_df_list.append(words_cnt_df)
    words_cnt_df = words_cnt_df_list[0]
    for i in words_cnt_df_list[1:]:
        words_cnt_df = words_cnt_df.merge(i, on='word', how='outer')

    words_cnt_df = words_cnt_df.fillna(-1)
    # keep words that are frequent in every class and in the test set
    words_cnt_df1 = words_cnt_df.query('cnt_0 > 10 and cnt_2 > 10 and cnt_1 > 10 and cnt_3 > 10 and cnt_5 > 10')
    cnt_class = ['cnt_0', 'cnt_1', 'cnt_2', 'cnt_3', 'cnt_5']
    words_cnt_df1['word_cnt_sum'] = words_cnt_df1.loc[:, cnt_class].sum(1)
    for i in cnt_class:
        words_cnt_df1[f'{i}_ratio'] = words_cnt_df1[i] / words_cnt_df1['word_cnt_sum']
    words_cnt_df1['word_cnt_ratio_std'] = words_cnt_df1.loc[:, ['cnt_0_ratio', 'cnt_1_ratio', 'cnt_2_ratio', 'cnt_3_ratio']].std(1)
    words_cnt_df1['cnt_1_0_diff'] = (words_cnt_df1['cnt_1_ratio'] - words_cnt_df1['cnt_0_ratio'])
    # take the words ranked 5..39 by test-set frequency
    test_key_words = words_cnt_df1.sort_values('cnt_5', ascending=False)['word'].to_list()[5:40]
    return test_key_words

def get_w2v_mean(w2v_model, sentences):
    # mean-pool the word vectors of a whitespace-tokenised string;
    # returns a single-element list holding the pooled vector
    emb_matrix = list()
    vec = list()
    for w in sentences.split():
        if w in w2v_model.wv:
            vec.append(w2v_model.wv[w])
    if len(vec) > 0:
        emb_matrix.append(np.mean(vec, axis=0))
    else:
        emb_matrix.append([0] * w2v_model.vector_size)
    return emb_matrix


def get_tfidf_svd(tfv, svd, sentences):
    # transform tokens with a fitted TfidfVectorizer + TruncatedSVD,
    # then mean-pool the resulting rows
    X_tfidf = tfv.transform(sentences)
    X_svd = svd.transform(X_tfidf)
    return np.mean(X_svd, axis=0)

def get_w2v_tfidf_fea(all_data):
    print('w2v encoding')
    df = all_data
    df['msg_list'] = df['msg'].apply(lambda x: [i.strip().lower().replace(' ', '_') for i in x.split(' | ')])
    df = df.groupby(['sn']).agg({'msg_list': 'sum'}).reset_index()
    df['text'] = df['msg_list'].apply(lambda x: ' '.join(x))

    sentences_list = df['text'].values.tolist()
    sentences = []
    for s in sentences_list:
        sentences.append([w for w in s.split()])
    w2v_model = Word2Vec(sentences, vector_size=10, window=3, min_count=5, sg=0, hs=1, seed=2022)
    df['text_w2v'] = df['text'].apply(lambda x: get_w2v_mean(w2v_model, x)[0])

    print('tfidf encoding')
    X = df['text'].to_list()
    tfv = TfidfVectorizer(ngram_range=(1, 3), min_df=5, max_features=50000)
    tfv.fit(X)
    X_tfidf = tfv.transform(X)
    svd = TruncatedSVD(n_components=16)  # dimensionality reduction
    svd.fit(X_tfidf)
    df['text_tfidf'] = df['text'].apply(lambda x: get_tfidf_svd(tfv, svd, x.split()))

    print('doc2vec encoding')
    texts = df['text'].tolist()
    # tokenise each document: TaggedDocument expects a list of words
    documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(texts)]
    model = Doc2Vec(documents, vector_size=100, window=5, min_count=3, workers=4)
    docvecs = model.dv  # gensim 4 name for the document vectors
    df['doc2vec'] = [docvecs[i] for i in range(len(docvecs))]

    # unpack the embedding vectors into flat feature columns
    # (use vector_size rather than a hard-coded 32, which would overrun
    # the 10-dimensional w2v vectors)
    for i in range(w2v_model.vector_size):
        df[f'msg_w2v_{i}'] = df['text_w2v'].apply(lambda x: x[i])
    for i in range(16):
        df[f'msg_tfv_{i}'] = df['text_tfidf'].apply(lambda x: x[i])
    for i in range(100):
        df[f'msg_doc2vec_{i}'] = df['doc2vec'].apply(lambda x: x[i])

    save_cols = [i for i in df.columns if i not in ['msg_list', 'text', 'text_w2v', 'text_tfidf', 'doc2vec']]
    return df[save_cols]


# w2v_tfidf_fea = get_w2v_tfidf_fea(all_data)

class BetaEncoder(object):
    '''Target encoder that smooths per-group statistics with a Beta prior.'''

    def __init__(self, group):

        self.group = group
        self.stats = None

    # get counts from df
    def fit(self, df, target_col):
        # prior mean of the target
        self.prior_mean = np.mean(df[target_col])
        stats = df[[target_col, self.group]].groupby(self.group)
        # per-group sum (n) and count (N)
        stats = stats.agg(['sum', 'count'])[target_col]
        stats.rename(columns={'sum': 'n', 'count': 'N'}, inplace=True)
        stats.reset_index(level=0, inplace=True)
        self.stats = stats

    # extract posterior statistics
    def transform(self, df, stat_type, N_min=1):

        df_stats = pd.merge(df[[self.group]], self.stats, how='left')
        n = df_stats['n'].copy()
        N = df_stats['N'].copy()

        # fill in missing groups with a single prior pseudo-observation
        nan_indexs = np.isnan(n)
        n[nan_indexs] = self.prior_mean
        N[nan_indexs] = 1.0

        # prior parameters
        N_prior = np.maximum(N_min - N, 0)
        alpha_prior = self.prior_mean * N_prior
        beta_prior = (1 - self.prior_mean) * N_prior

        # posterior parameters
        alpha = alpha_prior + n
        beta = beta_prior + N - n

        # calculate statistics
        if stat_type == 'mean':
            num = alpha
            dem = alpha + beta

        elif stat_type == 'mode':
            num = alpha - 1
            dem = alpha + beta - 2

        elif stat_type == 'median':
            num = alpha - 1 / 3
            dem = alpha + beta - 2 / 3

        elif stat_type == 'var':
            num = alpha * beta
            dem = (alpha + beta) ** 2 * (alpha + beta + 1)

        elif stat_type == 'skewness':
            num = 2 * (beta - alpha) * np.sqrt(alpha + beta + 1)
            dem = (alpha + beta + 2) * np.sqrt(alpha * beta)

        elif stat_type == 'kurtosis':
            num = 6 * (alpha - beta) ** 2 * (alpha + beta + 1) - \
                alpha * beta * (alpha + beta + 2)
            dem = alpha * beta * (alpha + beta + 2) * (alpha + beta + 3)

        # replace missing
        value = num / dem
        value[np.isnan(value)] = np.nanmedian(value)
        return value

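# A hedged usage sketch of the encoder (toy frame; the 'city'/'y' names are
# hypothetical). N_min controls the strength of the Beta prior: groups with
# fewer than N_min rows are pulled towards the global mean:
# toy = pd.DataFrame({'city': ['a', 'a', 'b'], 'y': [1, 0, 1]})
# be = BetaEncoder('city')
# be.fit(toy, 'y')
# toy['city_mean'] = be.transform(toy, 'mean', N_min=2)
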
def get_beta_target(train, test):
    N_min = 1000
    feature_cols = []

    # encode variables
    for c in ['server_model']:
        # fit encoder
        be = BetaEncoder(c)
        be.fit(train, 'label')

        # mean
        feature_name = f'{c}_mean'
        train[feature_name] = be.transform(train, 'mean', N_min)
        test[feature_name] = be.transform(test, 'mean', N_min)
        feature_cols.append(feature_name)

        # mode
        feature_name = f'{c}_mode'
        train[feature_name] = be.transform(train, 'mode', N_min)
        test[feature_name] = be.transform(test, 'mode', N_min)
        feature_cols.append(feature_name)

        # median
        feature_name = f'{c}_median'
        train[feature_name] = be.transform(train, 'median', N_min)
        test[feature_name] = be.transform(test, 'median', N_min)
        feature_cols.append(feature_name)

        # var
        feature_name = f'{c}_var'
        train[feature_name] = be.transform(train, 'var', N_min)
        test[feature_name] = be.transform(test, 'var', N_min)
        feature_cols.append(feature_name)

        # # skewness
        # feature_name = f'{c}_skewness'
        # train[feature_name] = be.transform(train, 'skewness', N_min)
        # test[feature_name] = be.transform(test, 'skewness', N_min)
        # feature_cols.append(feature_name)

        # kurtosis
        feature_name = f'{c}_kurtosis'
        train[feature_name] = be.transform(train, 'kurtosis', N_min)
        test[feature_name] = be.transform(test, 'kurtosis', N_min)
        feature_cols.append(feature_name)

    # DataFrame.append was removed in pandas 2.0; use concat instead
    df = pd.concat([train, test]).reset_index(drop=True)
    df = df[['sn', 'fault_time', 'server_model', 'server_model_mean',
             'server_model_mode', 'server_model_median', 'server_model_var',
             'server_model_kurtosis']].drop_duplicates().reset_index(drop=True)
    return df