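"""K-fold training wrappers for multi-class CatBoost and LightGBM models.

Each runner returns out-of-fold probabilities, fold-averaged test
probabilities, mean feature importances across folds, and the trained
per-fold models.
"""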

import datetime
import os
import pickle
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold

from utils import N_ROUNDS

warnings.filterwarnings('ignore')

def get_model_feature_importances(model):
    """Return a CatBoost model's feature importances, sorted descending."""
    feature_importances = pd.DataFrame()
    feature_importances['fea'] = model.feature_names_
    feature_importances['importances'] = model.feature_importances_
    feature_importances = feature_importances.sort_values('importances', ascending=False).reset_index(drop=True)
    return feature_importances

def run_cbt(train, target, test, k, seed, NUM_CLASS=4, cat_cols=[]):
    """K-fold CatBoost: return OOF probs, averaged test probs, mean feature importances, and fold models."""
    print('********************** RUN CATBOOST MODEL **********************')
    print(f'****************** CURRENT SEED {seed} ******************')
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    oof_prob = np.zeros((train.shape[0], NUM_CLASS))
    test_prob = np.zeros((test.shape[0], NUM_CLASS))
    feature_importance_df = []
    offline_score = []
    model_list = []
    # K-fold training
    for fold, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
        print("FOLD {} IS RUNNING...".format(fold + 1))
        trn_x, trn_y = train.loc[trn_idx], target.loc[trn_idx]
        val_x, val_y = train.loc[val_idx], target.loc[val_idx]
        catboost_model = CatBoostClassifier(
            iterations=N_ROUNDS,
            od_type='Iter',
            od_wait=120,
            max_depth=8,
            learning_rate=0.05,
            l2_leaf_reg=9,
            random_seed=seed,
            fold_len_multiplier=1.1,
            loss_function='MultiClass',
            logging_level='Verbose',
            # task_type="GPU"
        )
        start_time = datetime.datetime.now()
        catboost_model.fit(trn_x,
                           trn_y,
                           eval_set=(val_x, val_y),
                           use_best_model=True,
                           verbose=800,
                           early_stopping_rounds=100,
                           cat_features=cat_cols,
                           )
        end_time = datetime.datetime.now()
        model_train_cost_time = end_time - start_time
        print('****************** MODEL TRAINING COST TIME :', str(model_train_cost_time), '******************')
        start_time = datetime.datetime.now()
        # Out-of-fold predictions for this fold's validation rows.
        oof_prob[val_idx] = catboost_model.predict_proba(val_x)
        end_time = datetime.datetime.now()
        model_pred_cost_time = end_time - start_time
        print('****************** MODEL PREDICTION COST TIME :', str(model_pred_cost_time), '******************')
        # Average test predictions over all folds.
        test_prob += catboost_model.predict_proba(test) / folds.n_splits
        print(catboost_model.get_best_score())
        offline_score.append(catboost_model.get_best_score()['validation']['MultiClass'])
        feature_importance_df.append(get_model_feature_importances(catboost_model))
        model_list.append(catboost_model)
        # Persist each fold's model for later inference.
        with open(os.path.join('../model', f'cat_model_fold_{fold}.pkl'), 'wb') as f:
            pickle.dump(catboost_model, f)
    print('\nOOF-MEAN-ERROR score:%.6f, OOF-STD:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    # Mean importance of each feature across folds.
    fea_imp_df = pd.concat(feature_importance_df, ignore_index=True).groupby('fea').agg(
        {'importances': 'mean'}).reset_index().sort_values('importances', ascending=False).reset_index(drop=True)
    return oof_prob, test_prob, fea_imp_df, model_list

def run_lgb(train, target, test, k, seed=42, NUM_CLASS=4, cat_cols=[]):
    """K-fold LightGBM: return OOF probs, averaged test probs, mean feature importances, and fold models."""
    print('********************** RUN LGBM MODEL **********************')
    print(f'****************** CURRENT SEED {seed} ******************')
    # LightGBM is picky about special characters in column names, so
    # rename every column to its integer position for training.
    cols_map = {j: i for i, j in enumerate(train.columns)}
    cat_cols = [cols_map[i] for i in cat_cols]
    train = train.rename(columns=cols_map)
    test = test.rename(columns=cols_map)
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    oof_prob = np.zeros((train.shape[0], NUM_CLASS))
    test_prob = np.zeros((test.shape[0], NUM_CLASS))
    fea_imp_df_list = []
    offline_score = []
    model_list = []
    params = {
        "objective": "multiclass",
        "num_class": NUM_CLASS,
        "learning_rate": 0.01,
        "max_depth": -1,
        "num_leaves": 32,
        "verbose": -1,
        "bagging_fraction": 0.8,
        "feature_fraction": 0.8,
        "seed": seed,
        'metric': 'multi_error'
    }
    # K-fold training
    for fold, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
        print("FOLD {} IS RUNNING...".format(fold + 1))
        trn_data = lgb.Dataset(train.loc[trn_idx], label=target.loc[trn_idx])
        val_data = lgb.Dataset(train.loc[val_idx], label=target.loc[val_idx])
        lgb_model = lgb.train(
            params,
            trn_data,
            num_boost_round=N_ROUNDS,
            valid_sets=[trn_data, val_data],
            categorical_feature=cat_cols,
            # The early_stopping_rounds/verbose_eval kwargs were removed in
            # LightGBM 4.0; the equivalent callbacks also work on 3.3+.
            callbacks=[lgb.early_stopping(stopping_rounds=100),
                       lgb.log_evaluation(period=200)],
        )
        # Out-of-fold and test predictions at the best iteration.
        oof_prob[val_idx] = lgb_model.predict(train.loc[val_idx], num_iteration=lgb_model.best_iteration)
        test_prob += lgb_model.predict(test, num_iteration=lgb_model.best_iteration) / folds.n_splits
        offline_score.append(lgb_model.best_score['valid_1']['multi_error'])
        fea_imp = pd.DataFrame()
        fea_imp['feature_name'] = lgb_model.feature_name()
        fea_imp['importance'] = lgb_model.feature_importance()
        # Map the integer aliases back to the original column names.
        fea_imp['feature_name'] = fea_imp['feature_name'].map({str(cols_map[i]): i for i in cols_map})
        fea_imp = fea_imp.sort_values('importance', ascending=False)
        fea_imp_df_list.append(fea_imp)
        model_list.append(lgb_model)
    print('\nOOF-MEAN-ERROR score:%.6f, OOF-STD:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    # Mean importance of each feature across folds.
    fea_imp_df = pd.concat(fea_imp_df_list, ignore_index=True).groupby('feature_name').agg(
        {'importance': 'mean'}).reset_index().sort_values('importance', ascending=False)
    return oof_prob, test_prob, fea_imp_df, model_list
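

# --- Usage sketch (illustrative, not part of the original pipeline) ---
# A minimal example of how run_cbt/run_lgb might be driven. The data paths,
# the 'label' column name, and the simple probability blend are assumptions
# for illustration; only the two runner signatures come from this module.
if __name__ == '__main__':
    train_df = pd.read_pickle('../data/train.pkl')  # hypothetical path
    test_df = pd.read_pickle('../data/test.pkl')    # hypothetical path
    y = train_df.pop('label')                       # assumed target column
    cat_features = []  # fill in categorical column names, if any
    cbt_oof, cbt_test, cbt_imp, _ = run_cbt(train_df, y, test_df, k=5, seed=2020, cat_cols=cat_features)
    lgb_oof, lgb_test, lgb_imp, _ = run_lgb(train_df, y, test_df, k=5, seed=2020, cat_cols=cat_features)
    # Simple average blend of the two models' class probabilities.
    blend_test = (cbt_test + lgb_test) / 2
    print('blend test prob shape:', blend_test.shape)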