import datetime
import gc
import os
import warnings

import numpy as np
import pandas as pd

from generate_feature import (
    add_w2v_feats, cat2num, get_key, get_beta_target, add_last_next_time4fault,
    get_feature, get_duration_minutes_fea, get_nearest_msg_fea,
    get_server_model_sn_fea_2, get_server_model_fea, get_msg_text_fea_all,
    get_key_word_cross_fea, get_server_model_time_interval_stat_fea,
    get_w2v_feats, get_class_key_words_nunique)
from model import run_cbt, run_lgb
from utils import (
    RESULT_DIR, TRAIN_DIR, TEST_A_DIR, KEY_WORDS, TOP_KEY_WORDS,
    get_word_counter, search_weight, macro_f1, TIME_INTERVAL, PSEUDO_FALG,
    GENERATION_DIR)

warnings.filterwarnings('ignore')
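
# ---------------------------------------------------------------------------
# This script builds features from the crashdump / venus fault logs, trains a
# CatBoost model (run_cbt) on them, and writes the out-of-fold and test
# predictions to GENERATION_DIR as 'crashdump_venus_fea_v1.csv', where the
# 'crashdump_venus_label_v1' column serves as a meta-feature downstream.
# ---------------------------------------------------------------------------
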
def get_fault_code_list(x):
    """Split a fault_code string on '.' and ',' into a list of codes."""
    try:
        x = x.replace('.', ',').split(',')
    except AttributeError:  # x is NaN / not a string
        x = []
    return x

def get_module_cause_list(x):
    """Split a module_cause string into a deduplicated list of tokens."""
    try:
        # NOTE: the original called x.replace('', '_'), which inserts '_'
        # between every character; ':' is assumed to be the intended separator,
        # matching how module_cause is parsed at module level below.
        x = x.replace(',', '_').replace(':', '_')
        x = list(set(x.split('_')))
    except AttributeError:  # x is NaN / not a string
        x = []
    return x

def get_label(PSEUDO_FALG):
    """Load the training labels, optionally appending pseudo labels.

    NOTE: the flag name 'PSEUDO_FALG' (sic) matches the name exported by utils.
    """
    preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
    preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)
    if PSEUDO_FALG:
        print('Loading pseudo labels')
        pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
        label = pd.concat([preliminary_train_label_dataset,
                           pseudo_labels,
                           preliminary_train_label_dataset_s],
                          ignore_index=True,
                          axis=0).sort_values(['sn', 'fault_time']).reset_index(drop=True)
    else:
        print('Not using pseudo-label data')
        label = pd.concat([preliminary_train_label_dataset,
                           preliminary_train_label_dataset_s],
                          ignore_index=True,
                          axis=0).sort_values(['sn', 'fault_time']).reset_index(drop=True)
    # Normalize fault_time to the canonical 'YYYY-MM-DD HH:MM:SS' string form.
    label['fault_time'] = label['fault_time'].apply(
        lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    label['fault_time'] = label['fault_time'].apply(lambda x: str(x))
    return label

def get_module_cause_code(x, code_name):
    """Return the tokens in list `x` that contain the substring `code_name`."""
    code_list = []
    for i in x:
        if code_name in i:
            code_list.append(i)
    return code_list

def get_alertname_code(x, alertname):
    """Return the token following `alertname` in a comma-separated string, else NaN."""
    x = x.split(',')
    try:
        alertname_code = x[x.index(alertname) + 1]
    except (ValueError, IndexError):  # alertname absent, or the last token
        alertname_code = np.nan
    return alertname_code

def get_alertname_code_2(x, alertname):
    """Like get_alertname_code, but takes a token list and falls back to ' '."""
    try:
        alertname_code = x[x.index(alertname) + 1]
    except (ValueError, IndexError):
        alertname_code = ' '
    return alertname_code

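
# Illustrative examples (hypothetical inputs):
#   get_alertname_code('module0,cod1,0x1a', 'cod1')   -> '0x1a'
#   get_alertname_code_2(['cod1', '0x1a'], 'cod1')    -> '0x1a'
#   get_alertname_code_2(['cod1'], 'cod2')            -> ' '
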
def get_last_msg_cnt(x):
    """Count how often the last element of `x` occurs in `x`."""
    last_msg = x[-1]
    return x.count(last_msg)


def get_first_msg_cnt(x):
    """Count how often the first element of `x` occurs in `x`."""
    first_msg = x[0]
    return x.count(first_msg)

def get_crashdump_venus_data():
    """Load and outer-merge the crashdump and venus datasets from both the
    final (test B) and preliminary (train) phases, keyed on (sn, fault_time)."""
    final_venus_dataset = pd.read_csv(os.path.join(TEST_A_DIR, 'final_venus_dataset_b.csv'))
    final_crashdump_dataset = pd.read_csv(os.path.join(TEST_A_DIR, 'final_crashdump_dataset_b.csv'))
    final_crashdump_venus = final_crashdump_dataset.merge(
        final_venus_dataset, on=['sn', 'fault_time'], how='outer')
    preliminary_venus_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'preliminary_venus_dataset.csv'))
    preliminary_crashdump_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'preliminary_crashdump_dataset.csv'))
    preliminary_crashdump_venus = preliminary_crashdump_dataset.merge(
        preliminary_venus_dataset, on=['sn', 'fault_time'], how='outer')
    crashdump_venus = pd.concat([final_crashdump_venus, preliminary_crashdump_venus],
                                ignore_index=True).drop_duplicates()
    crashdump_venus = crashdump_venus.sort_values(['sn', 'fault_time']).reset_index(drop=True)
    return crashdump_venus

def get_crashdump_venus_fea(crashdump_venus):
    """Build list/length/label-encoded features from module_cause and fault_code."""
    print('Generating crashdump_venus features')
    crashdump_venus['module_cause_list'] = crashdump_venus['module_cause'].apply(get_module_cause_list)
    crashdump_venus['fault_code_list'] = crashdump_venus['fault_code'].apply(get_fault_code_list)
    code_name_list = ['module', 'cod1', 'cod2', 'addr', 'port']
    for code_name in code_name_list:
        crashdump_venus[f'module_cause_{code_name}'] = crashdump_venus['module_cause_list'].apply(
            lambda x: get_module_cause_code(x, code_name))
        crashdump_venus[f'module_cause_{code_name}_len'] = crashdump_venus[f'module_cause_{code_name}'].apply(len)
        crashdump_venus[f'module_cause_{code_name}'] = crashdump_venus[f'module_cause_{code_name}'].apply(
            lambda x: '_'.join(set(x)))
    code_name_list = ['cha', '0x', 'cod', 'core', 'cpu', 'm2m', 'pcu']
    for code_name in code_name_list:
        crashdump_venus[f'fault_{code_name}'] = crashdump_venus['fault_code_list'].apply(
            lambda x: get_module_cause_code(x, code_name))
        crashdump_venus[f'fault_{code_name}_len'] = crashdump_venus[f'fault_{code_name}'].apply(len)
        crashdump_venus[f'fault_{code_name}'] = crashdump_venus[f'fault_{code_name}'].apply(
            lambda x: '_'.join(set(x)))
    cols_tmp = ['module_cause', 'fault_code', 'module_cause_module',
                'module_cause_cod1', 'module_cause_cod2', 'module_cause_addr',
                'module_cause_port', 'fault_cha', 'fault_0x', 'fault_cod', 'fault_core',
                'fault_cpu', 'fault_m2m', 'fault_pcu']
    new_cat_cols = []
    # cat2num (from generate_feature) is expected to add the label-encoded
    # '{name}_LabelEnc' columns collected below; the commented-out LabelEncoder
    # loop did the same thing explicitly.
    crashdump_venus = cat2num(crashdump_venus, cols_tmp)
    for name in cols_tmp:
        # le = LabelEncoder()
        # crashdump_venus[f'{name}_LabelEnc'] = le.fit_transform(crashdump_venus[name])
        new_cat_cols.append(f'{name}_LabelEnc')
    num_cols = ['fault_pcu_len', 'fault_m2m_len',
                'fault_cpu_len', 'fault_0x_len', 'fault_cod_len',
                'module_cause_module_len', 'module_cause_cod1_len',
                'module_cause_cod2_len', 'module_cause_addr_len',
                'module_cause_port_len', 'fault_cha_len', 'fault_core_len']
    crashdump_venus = crashdump_venus[['sn', 'fault_time'] + new_cat_cols + num_cols]
    crashdump_venus = crashdump_venus.rename(columns={'fault_time': 'crashdump_fault_time'})
    # The timestamp is converted and then dropped: only sn-level features are kept.
    crashdump_venus['crashdump_fault_time'] = pd.to_datetime(crashdump_venus['crashdump_fault_time'])
    del crashdump_venus['crashdump_fault_time']
    print(f'Finished crashdump_venus features, shape {crashdump_venus.shape}')
    return crashdump_venus

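
# NOTE: get_crashdump_venus_fea() is kept for reference but is not called in
# this script; the module-level pipeline below derives the crashdump/venus
# features directly.
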
def get_location_word(x, num):
    """Return element `num` of `x`, or None if out of range / not indexable."""
    try:
        return x[num]
    except (IndexError, TypeError):
        return None

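# Token lists used to parse the module_cause and fault_code strings below.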
module_list = ['module0', 'module1', 'module2', 'module3', 'module4', 'module5',
               'module7', 'module8', 'module9', 'module10', 'module11',
               'module12', 'module13', 'module14', 'module17', 'module18',
               'module19',
               'in traffic control',
               'irpp0', 'irpp1',
               'pcie rootport 0:0.0', 'pcie rootport a2:0.0', 'pcie rootport 2b:3.0',
               'port a', 'port c']
module_list2 = ['module0', 'module1', 'module2', 'module3', 'module4', 'module5',
                'module7', 'module8', 'module9', 'module10', 'module11',
                'module12', 'module13', 'module14', 'module17', 'module18',
                'module19']
other_module_list = ['in traffic control', 'irpp0', 'irpp1', 'pcie rootport 0:0.0',
                     'pcie rootport a2:0.0', 'pcie rootport 2b:3.0', 'port a', 'port c']
# One cod1/cod2/addr column name per module (same order as the original
# hand-written list).
module_content_list = [f'{m}_{suffix}' for m in module_list2
                       for suffix in ('cod1', 'cod2', 'addr')]
fault_code_content_list = ['fault_code_cod1', 'fault_code_cod2',
                           'fault_code_cpu0', 'fault_code_cpu1']
crashdump_venus = get_crashdump_venus_data()
# Normalize module_cause so that every module name is followed by its token
# list: ':' and ',' are first collapsed to '_', then re-split per module.
crashdump_venus['module_cause_list'] = crashdump_venus['module_cause'].fillna('_').apply(lambda x: x.split(','))
crashdump_venus['module_cause'] = crashdump_venus['module_cause'].fillna('_').apply(
    lambda x: x.replace(':', '_').replace(',', '_'))
for module in module_list:
    crashdump_venus['module_cause'] = crashdump_venus['module_cause'].fillna('_').apply(
        lambda x: x.replace(f'{module}_', f'{module}:').replace(f'_{module}', f',{module}'))
crashdump_venus['module_cause'] = crashdump_venus['module_cause'].apply(lambda x: x.replace(':', ','))
# Extract each module's token list from the normalized string.
for module in module_list:
    crashdump_venus[module] = crashdump_venus['module_cause'].apply(lambda x: get_alertname_code(x, module))
    crashdump_venus[module] = crashdump_venus.loc[:, module].fillna(' ').apply(lambda x: x.replace('_', ' '))
    crashdump_venus[module] = crashdump_venus[module].apply(lambda x: x.split(' '))
# (Overwritten below once the per-module code lists have been parsed.)
crashdump_venus['module_cause_new'] = crashdump_venus.loc[:, module_list].sum(1)
# Per-module cod1/cod2/addr codes, each wrapped in a one-element list so the
# later column-wise sum concatenates them into a single token list.
for module in module_list2:
    crashdump_venus[f'{module}_cod1'] = crashdump_venus[module].apply(lambda x: [get_alertname_code_2(x, 'cod1')])
    crashdump_venus[f'{module}_cod2'] = crashdump_venus[module].apply(lambda x: [get_alertname_code_2(x, 'cod2')])
    crashdump_venus[f'{module}_addr'] = crashdump_venus[module].apply(lambda x: [get_alertname_code_2(x, 'addr')])
    del crashdump_venus[module]
    gc.collect()
crashdump_venus['fault_code_list'] = crashdump_venus['fault_code'].fillna(' ').apply(lambda x: x.split('.'))
for i in ['cod1', 'cod2', 'cpu0', 'cpu1']:
    crashdump_venus[f'fault_code_{i}'] = crashdump_venus['fault_code_list'].apply(
        lambda x: [get_alertname_code_2(x, i)])
# Concatenate the per-column token lists into combined "sentences" for w2v.
crashdump_venus['other_module_list'] = crashdump_venus.loc[:, other_module_list].sum(1)
crashdump_venus['module_content_list'] = crashdump_venus.loc[:, module_content_list].sum(1)
crashdump_venus['module_cause_new'] = crashdump_venus.loc[:, other_module_list + module_content_list].sum(1)
crashdump_venus['fault_code_content_list'] = crashdump_venus.loc[:, fault_code_content_list].sum(1)
crashdump_venus['all_crashdump_venus'] = crashdump_venus.loc[
    :, other_module_list + module_content_list + fault_code_content_list].sum(1)
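# Worked example (hypothetical value): module_cause = 'module0:cod1_0x12,module1:addr_0xff'
#   -> normalized to 'module0,cod1_0x12,module1,addr_0xff'
#   -> column 'module0' becomes ['cod1', '0x12'], so module0_cod1 == ['0x12']
#   -> column 'module1' becomes ['addr', '0xff'], so module1_addr == ['0xff']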
f1_list = ['sn']
f2_list = ['other_module_list', 'module_content_list', 'module_cause_new',
           'fault_code_content_list', 'all_crashdump_venus']
w2v_feats_df = crashdump_venus[f1_list].drop_duplicates()
w2v_feats_df_list = []
# One word2vec embedding block per (key, token-list) pair.
for f1 in f1_list:
    for f2 in f2_list:
        w2v_fea_tmp = add_w2v_feats(crashdump_venus, w2v_feats_df, f1, f2,
                                    emb_size=10, window=5, min_count=5)
        w2v_feats_df_list.append(w2v_fea_tmp)
w2v_feats_df = w2v_feats_df_list[0]
for i in w2v_feats_df_list[1:]:
    w2v_feats_df = w2v_feats_df.merge(i, on='sn', how='left')
# Label-encode the raw token-list columns, then drop them.
for i in other_module_list + module_content_list + fault_code_content_list:
    crashdump_venus[i] = crashdump_venus[i].astype(str)
crashdump_venus = cat2num(crashdump_venus,
                          other_module_list + module_content_list + fault_code_content_list)
for i in other_module_list + module_content_list + fault_code_content_list:
    del crashdump_venus[i]
gc.collect()
crashdump_venus = crashdump_venus.merge(w2v_feats_df, on='sn', how='left').rename(
    columns={'fault_time': 'crashdump_venus_fault_time'})
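

# add_w2v_feats comes from generate_feature and is not shown here. As a rough
# reference, the sketch below shows one common way such sn-level word2vec
# features are built (mean of token vectors per row, then mean per sn). This is
# an assumption about the technique, not the project's actual implementation,
# and the helper is never called in this script. Requires gensim >= 4.0.
def _w2v_group_mean_sketch(df, key_col, seq_col, emb_size=10, window=5, min_count=5):
    from gensim.models.word2vec import Word2Vec

    # Train word2vec on the token lists of `seq_col`.
    model = Word2Vec(sentences=df[seq_col].tolist(), vector_size=emb_size,
                     window=window, min_count=min_count, seed=42)

    def mean_vec(tokens):
        # Average the vectors of in-vocabulary tokens; zeros if none are known.
        vecs = [model.wv[t] for t in tokens if t in model.wv]
        return np.mean(vecs, axis=0) if vecs else np.zeros(emb_size)

    # One embedding per key: mean of the per-row mean vectors.
    agg = df.groupby(key_col)[seq_col].apply(
        lambda seqs: np.mean([mean_vec(t) for t in seqs], axis=0))
    return pd.DataFrame(agg.tolist(), index=agg.index,
                        columns=[f'{seq_col}_w2v_{i}' for i in range(emb_size)]).reset_index()

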
preliminary_train_label_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset.csv')
preliminary_train_label_dataset_s_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset_s.csv')
test = pd.read_csv(os.path.join(TEST_A_DIR, 'final_submit_dataset_b.csv'))[['sn', 'fault_time']]
train = get_label(False)[['sn', 'fault_time', 'label']]
test_tmp = test[['sn', 'fault_time']]
test_tmp = test_tmp.merge(crashdump_venus, on='sn').drop_duplicates(['sn', 'fault_time']).reset_index(drop=True)
train_tmp = train[['sn', 'fault_time', 'label']]
train_tmp = train_tmp.merge(crashdump_venus, on='sn').drop_duplicates(['sn', 'fault_time']).reset_index(drop=True)
# Seconds between the labeled fault and the matched crashdump/venus record.
train_tmp['duration_fault_time'] = (pd.to_datetime(train_tmp['fault_time'])
                                    - pd.to_datetime(train_tmp['crashdump_venus_fault_time']))
test_tmp['duration_fault_time'] = (pd.to_datetime(test_tmp['fault_time'])
                                   - pd.to_datetime(test_tmp['crashdump_venus_fault_time']))
train_tmp['duration_fault_time'] = train_tmp['duration_fault_time'].apply(lambda x: x.total_seconds())
test_tmp['duration_fault_time'] = test_tmp['duration_fault_time'].apply(lambda x: x.total_seconds())
drop_cols = ['sn', 'fault_time', 'fault_code', 'module_cause', 'module', 'crashdump_venus_fault_time',
             'module_cause_list', 'module_cause_new', 'fault_code_list', 'label', 'duration_fault_time',
             'other_module_list', 'module_content_list', 'fault_code_content_list',
             'all_crashdump_venus']
use_cols = [i for i in train_tmp.columns if i not in drop_cols]
cat_cols = [f'{i}_LabelEnc' for i in other_module_list + module_content_list + fault_code_content_list]
# Out-of-fold / test probability accumulators, sized to the merged frames so
# they match what run_cbt returns (the original sized them to train/test,
# whose row counts can differ after the merge).
oof_prob = np.zeros((train_tmp.shape[0], 4))
test_prob = np.zeros((test_tmp.shape[0], 4))
# seeds = [42, 4242, 40424, 1024, 2048]
seeds = [42]
for seed in seeds:
    oof_prob_tmp, test_prob_tmp, fea_imp_df, model_list = run_cbt(
        train_tmp[use_cols], train_tmp[['label']], test_tmp[use_cols],
        k=5, seed=seed, cat_cols=cat_cols)
    # Average predictions over seeds. The original overwrote oof_prob with the
    # run_cbt output and then added it to itself, double-counting the last seed.
    oof_prob += oof_prob_tmp / len(seeds)
    test_prob += test_prob_tmp / len(seeds)
weight = search_weight(train_tmp, train_tmp[['label']], oof_prob,
                       init_weight=[1.0], class_num=4, step=0.001)
oof_prob = oof_prob * np.array(weight)
test_prob = test_prob * np.array(weight)
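

# search_weight is imported from utils and not shown here. Below is a minimal
# sketch of the greedy per-class weight search such helpers typically perform
# (an assumption, not the project's implementation; never called here):
def _search_weight_sketch(y_true, prob, class_num=4, step=0.001, max_w=2.0):
    from sklearn.metrics import f1_score

    weight = [1.0] * class_num
    for c in range(class_num):
        best_w, best_f1 = weight[c], -1.0
        # Scan candidate multipliers for class c, keeping the best macro F1
        # of the reweighted argmax predictions.
        for w in np.arange(0.0, max_w + step, step):
            trial = weight.copy()
            trial[c] = w
            pred = (prob * np.array(trial)).argmax(axis=1)
            f1 = f1_score(y_true, pred, average='macro')
            if f1 > best_f1:
                best_f1, best_w = f1, w
        weight[c] = best_w
    return weight

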
target_df = train_tmp[['sn', 'fault_time', 'label']].drop_duplicates(['sn', 'fault_time'])
submit_df = train_tmp[['sn', 'fault_time']].copy()
submit_df['label'] = oof_prob.argmax(axis=1)
submit_df = submit_df.drop_duplicates(['sn', 'fault_time'])
# submit_df = pd.read_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea1.csv')).rename(columns={'crashdump_venus_label': 'label'})
score = macro_f1(target_df=target_df, submit_df=submit_df)
print(f'********************** BEST MACRO_F1 : {score} **********************')
score = round(score, 5)
print(fea_imp_df[:20])
y_pred = test_prob.argmax(axis=1)
result = test_tmp[['sn', 'fault_time']].copy()
result['label'] = y_pred
result = result.drop_duplicates(['sn', 'fault_time'])
# Stack the train OOF labels and test predictions into one meta-feature file.
crashdump_venus_fea = pd.concat([submit_df, result], ignore_index=False, axis=0)
crashdump_venus_fea = crashdump_venus_fea.rename(columns={'label': 'crashdump_venus_label_v1'})
crashdump_venus_fea.to_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea_v1.csv'), index=False)
print(crashdump_venus_fea['crashdump_venus_label_v1'].value_counts())