You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

233 lines
13 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import os
import sys
from log import Logger
from collections import Counter
from tqdm import tqdm
import numpy as np
import pandas as pd
ROOT_DIR = os.path.join(sys.path[0], '../')
LOG_DIR = os.path.join(ROOT_DIR, 'log')
DATA_DIR = os.path.join(ROOT_DIR, 'data')
TRAIN_DIR = os.path.join(DATA_DIR, 'preliminary_train')
# 提交docker时 需要打开更换
MODEL_PATH = os.path.join(ROOT_DIR, './model/deberta-base')
MODEL_1_PATH = os.path.join(ROOT_DIR, './model')
TEST_A_DIR = os.path.join(ROOT_DIR, './tcdata')
# TEST_A_DIR = os.path.join(ROOT_DIR, './tcdata_test')
PSEUDO_FALG = True
TEST_B_DIR = os.path.join(ROOT_DIR, 'tcdata')
RESULT_DIR = os.path.join(ROOT_DIR, 'prediction_result')
FEATURE_DIR = os.path.join(ROOT_DIR, 'feature')
GENERATION_DIR = os.path.join(FEATURE_DIR, 'generation')
CORRELATION_DIR = os.path.join(FEATURE_DIR, 'correlation')
USER_DATA_DIR = os.path.join(ROOT_DIR, 'user_data')
USER_MODEL_DIR = os.path.join(USER_DATA_DIR, 'model_data')
TMP_DIR = os.path.join(USER_DATA_DIR, 'tmp_data')
N_ROUNDS = 10000
TIME_INTERVAL = 60
KEY_1 = ['OEM record c2', 'Processor CPU_Core_Error', '001c4c', 'System Event Sys_Event', 'Power Supply PS0_Status',
'Temperature CPU0_Margin_Temp', 'Reading 51 > Threshold 85 degrees C', 'Lower Non-critical going low',
'Temperature CPU1_Margin_Temp', 'System ACPI Power State #0x7d', 'Lower Critical going low']
KEY_2 = ['OEM CPU0 MCERR', 'OEM CPU0 CATERR', 'Reading 0 < Threshold 2 degrees C', '0203c0a80101',
'Unknown CPU0 MCERR', 'Unknown CPU0 CATERR', 'Microcontroller #0x3b', 'System Boot Initiated',
'Processor #0xfa', 'Power Unit Pwr Unit Status', 'Hard reset', 'Power off/down', 'System Event #0xff',
'Memory CPU1A1_DIMM_Stat', '000000', 'Power cycle', 'OEM record c3', 'Memory CPU1C0_DIMM_Stat',
'Reading 0 < Threshold 1 degrees C', 'IERR']
KEY_3 = ['Memory', 'Correctable ECC logging limit reached', 'Memory MEM_CHE0_Status', 'Memory Memory_Status',
'Memory #0x87', 'Memory CPU0F0_DIMM_Stat', 'Memory Device Disabled', 'Memory #0xe2',
'OS Stop/Shutdown OS Status', 'System Boot Initiated System Restart', 'OS Boot BIOS_Boot_Up',
'System Boot Initiated BIOS_Boot_UP', 'Memory DIMM101', 'OS graceful shutdown', 'OS Critical Stop OS Status',
'Memory #0xf9', 'Memory CPU0C0_DIMM_Stat', 'Memory DIMM111', 'Memory DIMM021', ]
KEY_4 = ['Drive Fault', 'NMI/Diag Interrupt', 'Failure detected', 'Power Supply AC lost', 'Power Supply PSU0_Supply',
'AC out-of-range, but present', 'Predictive failure', 'Drive Present', 'Temperature Temp_DIMM_KLM',
'Temperature Temp_DIMM_DEF', 'Power Supply PS1_Status', 'Identify Status', 'Power Supply PS2_Status',
'Temperature DIMMG1_Temp', 'Upper Non-critical going high', 'Temperature DIMMG0_Temp',
'Upper Critical going high', 'Power Button pressed', 'System Boot Initiated #0xb8', 'Deasserted']
TOP_KEY_WORDS = ['0203c0a80101', 'Configuration Error', 'Correctable ECC', 'Deasserted', 'Device Enabled', 'Drive Present',
'Event Logging Disabled SEL', 'Failure detected', 'IERR', 'Initiated by hard reset', 'Initiated by power up',
'Initiated by warm reset', 'Log area reset/cleared', 'Memory', 'Memory #0xe2', 'Memory CPU0C0',
'Microcontroller/Coprocessor BMC', 'OEM CPU0 CATERR', 'OEM CPU0 MCERR', 'OS Boot BIOS',
'OS Critical Stop OS Status', 'Power Supply PS1', 'Power Supply PS2', 'Presence detected', 'Processor', 'Processor CPU', 'Processor CPU0',
'Processor CPU1', 'S0/G0: working', 'S4/S5: soft-off', 'Slot / Connector PCIE', 'State Asserted', 'State Deasserted',
'System ACPI Power State ACPI', 'System Boot Initiated', 'System Boot Initiated #0xe0', 'System Boot Initiated BIOS',
'System Event', 'System Event #0x10', 'System Event #0xff', 'Timestamp Clock Sync', 'Transition to Running', 'Uncorrectable ECC',
'Uncorrectable machine check exception', 'Unknown CPU0 CATERR', 'Unknown CPU0 MCERR', 'Unknown Chassis', 'Watchdog2 IPMI',
]
TOP_KEY_WORDS_2 = ['Processor CPU0 Status', 'System Boot Initiated BIOS Boot Up', 'Uncorrectable ECC', 'Initiated by power up',
'Configuration Error', 'Processor CPU CATERR', 'Processor CPU1 Status', 'Memory #0xe2', 'IERR', 'Initiated by warm reset',
'State Asserted', 'S4/S5: soft-off', 'Memory #0xf9', 'S0/G0: working', 'boot completed - device not specified', 'Timestamp Clock Sync',
'Presence detected', 'System Boot Initiated #0xe0', 'Drive Fault', 'Power Supply PS1 Status', 'Power off/down', 'OS Boot #0xe9',
'Failure detected', 'Uncorrectable machine check exception', 'Transition to Running', 'Power Supply PS2 Status',
'Memory Device Disabled', 'System Restart', 'System Event #0x10', 'Sensor access degraded or unavailable', 'Unknown #0x17',
'Drive Present', 'Management Subsys Health System Health', 'Power Supply AC lost', 'Microcontroller #0x16']
CHARATERS = ['#', '&', ]
# KEY_WORDS = KEY_1 + KEY_2 + KEY_3 + KEY_4 + CHARATERS
KEY_WORDS = KEY_1 + KEY_2 + KEY_3 + KEY_4 + CHARATERS + TOP_KEY_WORDS
KEY_WORDS = list(set(KEY_WORDS))
# cnt_1_0_diff_key_words = ['State Asserted','Processor CPU_CATERR','Unknown #0x17','Microcontroller #0x16','Transition to Running','State Deasserted','Processor #0xfa','Temperature CPU1_Margin_Temp','Temperature CPU0_Margin_Temp','Power cycle','Management Subsys Health System_Health','Sensor access degraded or unavailable','Power off/down','System ACPI Power State #0x7d']
# key_words_0 = ['Temperature CPU0_Margin_Temp','Lower Critical going low','System ACPI Power State #0x7d','Temperature CPU1_Margin_Temp','Lower Non-critical going low','Uncorrectable machine check exception','Reading 0 < Threshold 1 degrees C','000000','Unknown #0x19','Temperature DIMMG1_Temp','Reading 0 < Threshold 0 degrees C','001c4c','IERR','Upper Critical going high','Unknown Chassis_control','Temperature DIMMG0_Temp','Upper Non-critical going high','Temperature Temp_DIMM_DEF','Power cycle','Processor CPU0_Status','Temperature Temp_DIMM_KLM','Processor CPU1_Status','Management Subsys Health System_Health']
# key_words_1 = ['Processor #0xfa','State Deasserted','Power off/down','Power cycle','IERR','Unknown #0x17','Management Subsys Health System_Health','Processor CPU_CATERR','Reading 0 < Threshold 1 degrees C','','Sensor access degraded or unavailable','Transition to Running','State Asserted','Microcontroller #0x16','Processor CPU0_Status','Processor CPU1_Status','Slot / Connector PCIE_Status','Fault Status','System ACPI Power State ACPI_PWR_Status','Management Subsystem Health System_Health','Configuration Error','Uncorrectable machine check exception','Timestamp Clock Sync']
# key_words_2 = ['Memory #0xe2','Memory Device Disabled','Memory #0x87','Memory #0xf9','Correctable ECC','Memory CPU0D0_DIMM_Stat','Uncorrectable ECC','Memory CPU1B0_DIMM_Stat','System Boot Initiated BIOS_Boot_UP','System Restart','Presence Detected','Temperature CPU0_Temp','boot completed - device not specified','Log almost full','Device Present','Legacy OFF state','System Boot Initiated #0xe0','System Event #0x10','Legacy ON state','OS Boot #0xe0','Unknown #0xc5','System Boot Initiated #0xb8','Event Logging Disabled SEL_Status']
# key_words_3 = ['Drive Fault','Failure detected','Drive Present','Temperature Temp_DIMM_KLM','Temperature Temp_DIMM_DEF','Power Supply PS4_Status','Upper Non-critical going high','Temperature DIMMG0_Temp','Temperature DIMMG1_Temp','Power Supply PS3_Status','Upper Critical going high','Predictive failure','Power Supply AC lost','Unknown #0x19','Power Unit Power Unit','AC out-of-range, but present','Power Supply PS1_Status','Power Supply PS2_Status','Log area reset/cleared','Microcontroller/Coprocessor BMC_Boot_Up','System Boot Initiated #0xb8','Power Button pressed','Device Present']
# top_key_words = [ 'Configuration Error','Uncorrectable ECC','Processor CPU0_Status','Initiated by power up','','Presence Detected','Processor CPU1_Status','S0/G0: working','Processor CPU_CATERR','Presence detected','S4/S5: soft-off','Upper Critical going high','Memory #0xe2','IERR','Initiated by warm reset','State Asserted','Upper Non-critical going high','boot completed - device not specified','Memory Device Disabled','Timestamp Clock Sync','Lower Critical going low','Transition to Running','Memory #0xf9','Power Supply PS1_Status']
# key_words_1_desc = ['#0xfa', '#0x','#0xff','CATERR','cycle','Unit','IERR','IPMI','#0x17', 'Running','#0x7c','Unknown','CPU', 'Sensor','CPU0','CPU1','Subsys']
#
# key_words = cnt_1_0_diff_key_words +key_words_0+key_words_1+key_words_2+key_words_3+top_key_words+key_words_1_desc
# key_words = list(set(key_words))
# KEY_WORDS = key_words+CHARATERS
def create_dir(dir):
"""
创建目录
:param dir: 目录名
:return:
"""
if not os.path.exists(dir):
os.mkdir(dir)
print(f'{dir}目录不存在,创建{dir}目录成功.')
else:
print(f'{dir}目录已存在.')
def create_all_dir():
"""
创建所有需要的目录
:return:
"""
create_dir(ROOT_DIR)
create_dir(LOG_DIR)
# create_dir(MODEL_DIR)
create_dir(RESULT_DIR)
create_dir(FEATURE_DIR)
create_dir(GENERATION_DIR)
create_dir(CORRELATION_DIR)
create_dir(DATA_DIR)
create_dir(TRAIN_DIR)
create_dir(TEST_A_DIR)
# create_dir(TEST_B_DIR)
create_dir(USER_DATA_DIR)
create_dir(USER_MODEL_DIR)
create_dir(TMP_DIR)
def clean_str(string):
return string
def my_tokenizer(s):
return s.split(' | ')
def get_word_counter(data):
print('获取异常日志计数字典')
counter = Counter()
for string_ in tqdm(data['msg']):
string_ = string_.strip()
counter.update(my_tokenizer(clean_str(string_)))
return counter
def macro_f1(target_df: pd.DataFrame, submit_df: pd.DataFrame):
"""
计算得分
:param target_df: [sn,fault_time,label]
:param submit_df: [sn,fault_time,label]
:return:
"""
weights = [5 / 11, 4 / 11, 1 / 11, 1 / 11]
# weights = [3 / 7, 2 / 7, 1 / 7, 1 / 7]
overall_df = target_df.merge(
submit_df, how='left', on=[
'sn', 'fault_time'], suffixes=[
'_gt', '_pr'])
overall_df.fillna(-1)
macro_F1 = 0.
for i in range(len(weights)):
TP = len(overall_df[(overall_df['label_gt'] == i)
& (overall_df['label_pr'] == i)])
FP = len(overall_df[(overall_df['label_gt'] != i)
& (overall_df['label_pr'] == i)])
FN = len(overall_df[(overall_df['label_gt'] == i)
& (overall_df['label_pr'] != i)])
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
F1 = 2 * precision * recall / \
(precision + recall) if (precision + recall) > 0 else 0
macro_F1 += weights[i] * F1
return macro_F1
def search_weight(train, valid_y, raw_prob, init_weight=[
1.0], class_num=4, step=0.001):
weight = init_weight.copy() * class_num
oof = train[['sn', 'fault_time']]
oof['label'] = raw_prob.argmax(axis=1)
f_best = macro_f1(train[['sn', 'fault_time', 'label']], oof)
print("Inint Score:", f_best)
# f_best = f1_score(y_true=valid_y, y_pred=raw_prob.argmax(axis=1),average='macro')
flag_score = 0
round_num = 1
while (flag_score != f_best):
print("round: ", round_num)
round_num += 1
flag_score = f_best
for c in range(class_num):
for n_w in range(0, 2000, 10):
num = n_w * step
new_weight = weight.copy()
new_weight[c] = num
prob_df = raw_prob.copy()
prob_df = prob_df * np.array(new_weight)
oof['label'] = prob_df.argmax(axis=1)
f = macro_f1(train[['sn', 'fault_time', 'label']], oof)
# f = f1_score(y_true=valid_y, y_pred=prob_df.argmax(axis=1),average='macro')
if f > f_best:
weight = new_weight.copy()
f_best = f
print(f"class:{c}, new_weight:{num}, f1 score: {f}")
print(
f'********************** SEARCH BEST WEIGHT : {weight} **********************')
return weight
def get_new_cols(df, key=['sn', 'fault_time']):
if isinstance(df.columns[0], tuple):
new_cols = []
for i in df.columns:
if i[0] in key:
new_cols.append(i[0])
else:
new_cols.append(f'{i[0]}_{i[1]}')
df.columns = new_cols
return df
else:
print('当前的DataFrame没有二级列名请检查。')
return df
if __name__ == '__main__':
# create_all_dir()
logger = Logger(name=os.path.basename(__file__).split(
'.py')[0], log_path=LOG_DIR, mode="w").get_log
print(len(KEY_WORDS))