|
|
import os
|
|
|
import sys
|
|
|
from log import Logger
|
|
|
from collections import Counter
|
|
|
from tqdm import tqdm
|
|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Project directory layout.
# ROOT_DIR is resolved relative to sys.path[0] (the first entry of the import
# search path), one level up.
# ---------------------------------------------------------------------------
ROOT_DIR = os.path.join(sys.path[0], '../')


def _under_root(relative):
    """Join ``relative`` onto ROOT_DIR exactly as ``os.path.join`` would."""
    return os.path.join(ROOT_DIR, relative)


LOG_DIR = _under_root('log')

# Input data.
DATA_DIR = _under_root('data')
TRAIN_DIR = os.path.join(DATA_DIR, 'preliminary_train')

# The paths below need to be enabled / swapped when submitting via docker.
MODEL_PATH = _under_root('./model/deberta-base')
MODEL_1_PATH = _under_root('./model')
TEST_A_DIR = _under_root('./tcdata')
# TEST_A_DIR = os.path.join(ROOT_DIR, './tcdata_test')
PSEUDO_FALG = True
TEST_B_DIR = _under_root('tcdata')

# Prediction output.
RESULT_DIR = _under_root('prediction_result')

# Feature artifacts.
FEATURE_DIR = _under_root('feature')
GENERATION_DIR = os.path.join(FEATURE_DIR, 'generation')
CORRELATION_DIR = os.path.join(FEATURE_DIR, 'correlation')

# User data (model files and temporary files).
USER_DATA_DIR = _under_root('user_data')
USER_MODEL_DIR = os.path.join(USER_DATA_DIR, 'model_data')
TMP_DIR = os.path.join(USER_DATA_DIR, 'tmp_data')

# Numeric settings; they are not used inside this module.
# NOTE(review): presumably boosting rounds and a time window - confirm units
# against the callers.
N_ROUNDS = 10000
TIME_INTERVAL = 60
|
|
|
|
|
|
# Keyword groups matched against log messages. The strings are data and must
# stay byte-identical.
KEY_1 = ['OEM record c2', 'Processor CPU_Core_Error', '001c4c', 'System Event Sys_Event', 'Power Supply PS0_Status',
         'Temperature CPU0_Margin_Temp', 'Reading 51 > Threshold 85 degrees C', 'Lower Non-critical going low',
         'Temperature CPU1_Margin_Temp', 'System ACPI Power State #0x7d', 'Lower Critical going low']

KEY_2 = ['OEM CPU0 MCERR', 'OEM CPU0 CATERR', 'Reading 0 < Threshold 2 degrees C', '0203c0a80101',
         'Unknown CPU0 MCERR', 'Unknown CPU0 CATERR', 'Microcontroller #0x3b', 'System Boot Initiated',
         'Processor #0xfa', 'Power Unit Pwr Unit Status', 'Hard reset', 'Power off/down', 'System Event #0xff',
         'Memory CPU1A1_DIMM_Stat', '000000', 'Power cycle', 'OEM record c3', 'Memory CPU1C0_DIMM_Stat',
         'Reading 0 < Threshold 1 degrees C', 'IERR']

KEY_3 = ['Memory', 'Correctable ECC logging limit reached', 'Memory MEM_CHE0_Status', 'Memory Memory_Status',
         'Memory #0x87', 'Memory CPU0F0_DIMM_Stat', 'Memory Device Disabled', 'Memory #0xe2',
         'OS Stop/Shutdown OS Status', 'System Boot Initiated System Restart', 'OS Boot BIOS_Boot_Up',
         'System Boot Initiated BIOS_Boot_UP', 'Memory DIMM101', 'OS graceful shutdown', 'OS Critical Stop OS Status',
         'Memory #0xf9', 'Memory CPU0C0_DIMM_Stat', 'Memory DIMM111', 'Memory DIMM021', ]

KEY_4 = ['Drive Fault', 'NMI/Diag Interrupt', 'Failure detected', 'Power Supply AC lost', 'Power Supply PSU0_Supply',
         'AC out-of-range, but present', 'Predictive failure', 'Drive Present', 'Temperature Temp_DIMM_KLM',
         'Temperature Temp_DIMM_DEF', 'Power Supply PS1_Status', 'Identify Status', 'Power Supply PS2_Status',
         'Temperature DIMMG1_Temp', 'Upper Non-critical going high', 'Temperature DIMMG0_Temp',
         'Upper Critical going high', 'Power Button pressed', 'System Boot Initiated #0xb8', 'Deasserted']

TOP_KEY_WORDS = ['0203c0a80101', 'Configuration Error', 'Correctable ECC', 'Deasserted', 'Device Enabled', 'Drive Present',
                 'Event Logging Disabled SEL', 'Failure detected', 'IERR', 'Initiated by hard reset', 'Initiated by power up',
                 'Initiated by warm reset', 'Log area reset/cleared', 'Memory', 'Memory #0xe2', 'Memory CPU0C0',
                 'Microcontroller/Coprocessor BMC', 'OEM CPU0 CATERR', 'OEM CPU0 MCERR', 'OS Boot BIOS',
                 'OS Critical Stop OS Status', 'Power Supply PS1', 'Power Supply PS2', 'Presence detected', 'Processor', 'Processor CPU', 'Processor CPU0',
                 'Processor CPU1', 'S0/G0: working', 'S4/S5: soft-off', 'Slot / Connector PCIE', 'State Asserted', 'State Deasserted',
                 'System ACPI Power State ACPI', 'System Boot Initiated', 'System Boot Initiated #0xe0', 'System Boot Initiated BIOS',
                 'System Event', 'System Event #0x10', 'System Event #0xff', 'Timestamp Clock Sync', 'Transition to Running', 'Uncorrectable ECC',
                 'Uncorrectable machine check exception', 'Unknown CPU0 CATERR', 'Unknown CPU0 MCERR', 'Unknown Chassis', 'Watchdog2 IPMI',
                 ]

# Defined here but not part of KEY_WORDS below.
TOP_KEY_WORDS_2 = ['Processor CPU0 Status', 'System Boot Initiated BIOS Boot Up', 'Uncorrectable ECC', 'Initiated by power up',
                   'Configuration Error', 'Processor CPU CATERR', 'Processor CPU1 Status', 'Memory #0xe2', 'IERR', 'Initiated by warm reset',
                   'State Asserted', 'S4/S5: soft-off', 'Memory #0xf9', 'S0/G0: working', 'boot completed - device not specified', 'Timestamp Clock Sync',
                   'Presence detected', 'System Boot Initiated #0xe0', 'Drive Fault', 'Power Supply PS1 Status', 'Power off/down', 'OS Boot #0xe9',
                   'Failure detected', 'Uncorrectable machine check exception', 'Transition to Running', 'Power Supply PS2 Status',
                   'Memory Device Disabled', 'System Restart', 'System Event #0x10', 'Sensor access degraded or unavailable', 'Unknown #0x17',
                   'Drive Present', 'Management Subsys Health System Health', 'Power Supply AC lost', 'Microcontroller #0x16']

CHARATERS = ['#', '&', ]

# Union of the keyword groups with duplicates removed.
# dict.fromkeys keeps the first occurrence of each keyword in definition
# order. list(set(...)) would give an order that changes between interpreter
# runs (string hash randomization), which makes anything indexed by KEY_WORDS
# non-reproducible.
KEY_WORDS = list(dict.fromkeys(
    KEY_1 + KEY_2 + KEY_3 + KEY_4 + CHARATERS + TOP_KEY_WORDS))
|
|
|
# cnt_1_0_diff_key_words = ['State Asserted','Processor CPU_CATERR','Unknown #0x17','Microcontroller #0x16','Transition to Running','State Deasserted','Processor #0xfa','Temperature CPU1_Margin_Temp','Temperature CPU0_Margin_Temp','Power cycle','Management Subsys Health System_Health','Sensor access degraded or unavailable','Power off/down','System ACPI Power State #0x7d']
|
|
|
# key_words_0 = ['Temperature CPU0_Margin_Temp','Lower Critical going low','System ACPI Power State #0x7d','Temperature CPU1_Margin_Temp','Lower Non-critical going low','Uncorrectable machine check exception','Reading 0 < Threshold 1 degrees C','000000','Unknown #0x19','Temperature DIMMG1_Temp','Reading 0 < Threshold 0 degrees C','001c4c','IERR','Upper Critical going high','Unknown Chassis_control','Temperature DIMMG0_Temp','Upper Non-critical going high','Temperature Temp_DIMM_DEF','Power cycle','Processor CPU0_Status','Temperature Temp_DIMM_KLM','Processor CPU1_Status','Management Subsys Health System_Health']
|
|
|
# key_words_1 = ['Processor #0xfa','State Deasserted','Power off/down','Power cycle','IERR','Unknown #0x17','Management Subsys Health System_Health','Processor CPU_CATERR','Reading 0 < Threshold 1 degrees C','','Sensor access degraded or unavailable','Transition to Running','State Asserted','Microcontroller #0x16','Processor CPU0_Status','Processor CPU1_Status','Slot / Connector PCIE_Status','Fault Status','System ACPI Power State ACPI_PWR_Status','Management Subsystem Health System_Health','Configuration Error','Uncorrectable machine check exception','Timestamp Clock Sync']
|
|
|
# key_words_2 = ['Memory #0xe2','Memory Device Disabled','Memory #0x87','Memory #0xf9','Correctable ECC','Memory CPU0D0_DIMM_Stat','Uncorrectable ECC','Memory CPU1B0_DIMM_Stat','System Boot Initiated BIOS_Boot_UP','System Restart','Presence Detected','Temperature CPU0_Temp','boot completed - device not specified','Log almost full','Device Present','Legacy OFF state','System Boot Initiated #0xe0','System Event #0x10','Legacy ON state','OS Boot #0xe0','Unknown #0xc5','System Boot Initiated #0xb8','Event Logging Disabled SEL_Status']
|
|
|
# key_words_3 = ['Drive Fault','Failure detected','Drive Present','Temperature Temp_DIMM_KLM','Temperature Temp_DIMM_DEF','Power Supply PS4_Status','Upper Non-critical going high','Temperature DIMMG0_Temp','Temperature DIMMG1_Temp','Power Supply PS3_Status','Upper Critical going high','Predictive failure','Power Supply AC lost','Unknown #0x19','Power Unit Power Unit','AC out-of-range, but present','Power Supply PS1_Status','Power Supply PS2_Status','Log area reset/cleared','Microcontroller/Coprocessor BMC_Boot_Up','System Boot Initiated #0xb8','Power Button pressed','Device Present']
|
|
|
# top_key_words = [ 'Configuration Error','Uncorrectable ECC','Processor CPU0_Status','Initiated by power up','','Presence Detected','Processor CPU1_Status','S0/G0: working','Processor CPU_CATERR','Presence detected','S4/S5: soft-off','Upper Critical going high','Memory #0xe2','IERR','Initiated by warm reset','State Asserted','Upper Non-critical going high','boot completed - device not specified','Memory Device Disabled','Timestamp Clock Sync','Lower Critical going low','Transition to Running','Memory #0xf9','Power Supply PS1_Status']
|
|
|
# key_words_1_desc = ['#0xfa', '#0x','#0xff','CATERR','cycle','Unit','IERR','IPMI','#0x17', 'Running','#0x7c','Unknown','CPU', 'Sensor','CPU0','CPU1','Subsys']
|
|
|
#
|
|
|
# key_words = cnt_1_0_diff_key_words +key_words_0+key_words_1+key_words_2+key_words_3+top_key_words+key_words_1_desc
|
|
|
# key_words = list(set(key_words))
|
|
|
# KEY_WORDS = key_words+CHARATERS
|
|
|
|
|
|
|
|
|
def create_dir(dir):
    """
    Create a directory if it does not exist yet and report the outcome.

    :param dir: path of the directory to create (the name shadows the builtin
        but is kept so keyword callers keep working)
    :return: None
    """
    if os.path.exists(dir):
        print(f'{dir}目录已存在.')
        return

    # makedirs also creates missing parent directories, and exist_ok=True
    # tolerates another process creating the directory between the check
    # above and this call.
    os.makedirs(dir, exist_ok=True)
    print(f'{dir}目录不存在,创建{dir}目录成功.')
|
|
|
|
|
|
|
|
|
def create_all_dir():
    """
    Create every directory the project needs.

    The directories are created in the order listed below.
    MODEL_DIR and TEST_B_DIR are intentionally not created here.

    :return: None
    """
    required_dirs = (
        ROOT_DIR,
        LOG_DIR,
        RESULT_DIR,
        FEATURE_DIR,
        GENERATION_DIR,
        CORRELATION_DIR,
        DATA_DIR,
        TRAIN_DIR,
        TEST_A_DIR,
        USER_DATA_DIR,
        USER_MODEL_DIR,
        TMP_DIR,
    )
    for path in required_dirs:
        create_dir(path)
|
|
|
|
|
|
|
|
|
def clean_str(string):
    """
    Return the input unchanged.

    Currently a no-op: no cleaning is applied to the text.

    :param string: text to clean
    :return: the same object that was passed in
    """
    cleaned = string
    return cleaned
|
|
|
|
|
|
|
|
|
def my_tokenizer(s):
|
|
|
return s.split(' | ')
|
|
|
|
|
|
|
|
|
def get_word_counter(data):
    """
    Count how often each token occurs across all messages in ``data['msg']``.

    Each message is stripped of surrounding whitespace, passed through
    ``clean_str`` and split with ``my_tokenizer`` before counting.

    :param data: object whose ``'msg'`` entry is an iterable of strings
    :return: ``collections.Counter`` mapping token -> occurrence count
    """
    print('获取异常日志计数字典')

    token_counts = Counter()
    for raw_msg in tqdm(data['msg']):
        tokens = my_tokenizer(clean_str(raw_msg.strip()))
        token_counts.update(tokens)
    return token_counts
|
|
|
|
|
|
|
|
|
def macro_f1(target_df: pd.DataFrame, submit_df: pd.DataFrame, weights=None):
    """
    Compute the weighted sum of per-class F1 scores.

    The ground truth is left-joined with the submission on
    ``['sn', 'fault_time']``; ground-truth rows without a matching prediction
    get the label -1, which never equals a class index and therefore counts
    as a miss for the true class.

    :param target_df: ground truth with columns [sn, fault_time, label]
    :param submit_df: predictions with columns [sn, fault_time, label]
    :param weights: optional per-class weights; class ``i`` uses
        ``weights[i]``. Defaults to ``[5/11, 4/11, 1/11, 1/11]``.
    :return: sum over classes of ``weights[i] * F1_i`` as a float
    """
    if weights is None:
        weights = [5 / 11, 4 / 11, 1 / 11, 1 / 11]

    overall_df = target_df.merge(
        submit_df, how='left', on=['sn', 'fault_time'],
        suffixes=('_gt', '_pr'))
    # fillna returns a new object, so the result has to be assigned;
    # discarding it leaves the missing predictions as NaN.
    overall_df = overall_df.fillna({'label_pr': -1})

    label_gt = overall_df['label_gt']
    label_pr = overall_df['label_pr']

    macro_F1 = 0.
    for i, class_weight in enumerate(weights):
        is_gt = label_gt == i
        is_pr = label_pr == i
        TP = int((is_gt & is_pr).sum())
        FP = int((~is_gt & is_pr).sum())
        FN = int((is_gt & ~is_pr).sum())

        # Guard every ratio so classes that are absent contribute 0, not NaN.
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        if (precision + recall) > 0:
            F1 = 2 * precision * recall / (precision + recall)
        else:
            F1 = 0
        macro_F1 += class_weight * F1
    return macro_F1
|
|
|
|
|
|
|
|
|
def search_weight(train, valid_y, raw_prob, init_weight=(1.0,), class_num=4, step=0.001):
    """
    Greedily search per-class multipliers for ``raw_prob`` that maximise
    ``macro_f1`` on ``train``.

    Starting from ``init_weight`` repeated ``class_num`` times, each class
    weight is tried on the grid ``n_w * step`` for ``n_w`` in
    ``range(0, 2000, 10)``. A candidate is kept whenever it strictly improves
    the score. Passes over all classes repeat until a full pass brings no
    improvement.

    :param train: DataFrame with columns [sn, fault_time, label]
    :param valid_y: unused; kept so existing callers keep working
    :param raw_prob: class scores supporting ``.argmax(axis=1)`` and
        multiplication by a numpy array
        (presumably an ndarray of shape (len(train), class_num) - TODO confirm)
    :param init_weight: sequence repeated ``class_num`` times to form the
        starting weights; it is never mutated
    :param class_num: number of classes whose weight is searched
    :param step: grid resolution multiplier
    :return: list with the best weights found
    """
    # list() accepts both lists and tuples and never aliases the argument.
    weight = list(init_weight) * class_num

    target = train[['sn', 'fault_time', 'label']]
    # Copy before adding a column: assigning into a slice of ``train``
    # triggers SettingWithCopyWarning.
    oof = train[['sn', 'fault_time']].copy()
    oof['label'] = raw_prob.argmax(axis=1)
    f_best = macro_f1(target, oof)
    print("Inint Score:", f_best)

    flag_score = 0
    round_num = 1
    while flag_score != f_best:
        print("round: ", round_num)
        round_num += 1
        flag_score = f_best
        for c in range(class_num):
            for n_w in range(0, 2000, 10):
                num = n_w * step
                new_weight = weight.copy()
                new_weight[c] = num

                # The product is a fresh array, so raw_prob is never modified.
                weighted_prob = raw_prob * np.array(new_weight)
                oof['label'] = weighted_prob.argmax(axis=1)
                f = macro_f1(target, oof)
                if f > f_best:
                    weight = new_weight.copy()
                    f_best = f
                    print(f"class:{c}, new_weight:{num}, f1 score: {f}")

    print(
        f'********************** SEARCH BEST WEIGHT : {weight} **********************')
    return weight
|
|
|
|
|
|
|
|
|
def get_new_cols(df, key=('sn', 'fault_time')):
    """
    Flatten two-level (tuple) column names of ``df`` in place.

    A column whose first level is in ``key`` keeps just that first level;
    every other column becomes ``'<level0>_<level1>'``. If the first column
    name is not a tuple (or there are no columns at all), a notice is printed
    and ``df`` is returned untouched.

    :param df: DataFrame to rename; its ``columns`` attribute is reassigned
    :param key: first-level names that are kept without a suffix
    :return: the same ``df`` object
    """
    # Check for columns first: indexing columns[0] on a frame without columns
    # raises IndexError.
    if len(df.columns) == 0 or not isinstance(df.columns[0], tuple):
        print('当前的DataFrame没有二级列名,请检查。')
        return df

    df.columns = [
        col[0] if col[0] in key else f'{col[0]}_{col[1]}'
        for col in df.columns
    ]
    return df
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # create_all_dir()

    # Name the logger after this script: everything before the first '.py'
    # in the file's base name.
    script_name = os.path.basename(__file__).split('.py')[0]
    logger = Logger(name=script_name, log_path=LOG_DIR, mode="w").get_log

    print(len(KEY_WORDS))
|