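"""Stratified k-fold training wrappers for CatBoost (run_cbt) and LightGBM (run_lgb).

Each returns out-of-fold class probabilities, fold-averaged test probabilities,
mean per-fold feature importances, and the list of fitted models.
"""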
import datetime
import os
import pickle
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold

from utils import N_ROUNDS

warnings.filterwarnings('ignore')


def get_model_feature_importances(model):
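    """Collect a fitted CatBoost model's feature importances into a DataFrame, sorted descending."""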
    feature_importances = pd.DataFrame()
    feature_importances['fea'] = model.feature_names_
    feature_importances['importances'] = model.feature_importances_
    feature_importances = feature_importances.sort_values('importances', ascending=False).reset_index(drop=True)
    return feature_importances


def run_cbt(train, target, test, k, seed, NUM_CLASS=4, cat_cols=None):
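    """Train CatBoostClassifier with stratified k-fold CV.

    Returns out-of-fold class probabilities, fold-averaged test probabilities,
    mean per-fold feature importances, and the list of fitted models.
    """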
    cat_cols = cat_cols or []  # avoid a mutable default argument
    print('********************** RUN CATBOOST MODEL **********************')
    print(f'****************** CURRENT SEED {seed} ********************** ')
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    oof_prob = np.zeros((train.shape[0], NUM_CLASS))
    test_prob = np.zeros((test.shape[0], NUM_CLASS))
    feature_importance_df = []
    offline_score = []
    model_list = []

    ## K-Fold
    for fold, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
        print("FOLD {} IS RUNNING...".format(fold + 1))
        # StratifiedKFold yields positional indices, so index with iloc
        trn_x, trn_y = train.iloc[trn_idx], target.iloc[trn_idx]
        val_x, val_y = train.iloc[val_idx], target.iloc[val_idx]
        catboost_model = CatBoostClassifier(
            iterations=N_ROUNDS,
            od_type='Iter',
            od_wait=120,
            max_depth=8,
            learning_rate=0.05,
            l2_leaf_reg=9,
            random_seed=seed,
            fold_len_multiplier=1.1,
            loss_function='MultiClass',
            # logging_level is left unset: CatBoost rejects it when the
            # `verbose` argument is also passed to fit() below
            # task_type="GPU"
        )

        start_time = datetime.datetime.now()

        catboost_model.fit(trn_x,
                           trn_y,
                           eval_set=(val_x, val_y),
                           use_best_model=True,
                           verbose=800,
                           # configures the same Iter overfitting detector
                           # as od_type/od_wait above
                           early_stopping_rounds=100,
                           cat_features=cat_cols,
                           )
        end_time = datetime.datetime.now()
        model_train_cost_time = end_time - start_time
        print('****************** MODEL TRAIN COST TIME : ', str(model_train_cost_time), ' ******************')

        start_time = datetime.datetime.now()
        oof_prob[val_idx] = catboost_model.predict_proba(val_x)
        end_time = datetime.datetime.now()
        model_pred_cost_time = end_time - start_time
        print('****************** MODEL PREDICT COST TIME : ', str(model_pred_cost_time), ' ******************')
        # catboost_model = catboost_model.get_best_iteration()
        test_prob += catboost_model.predict_proba(test) / folds.n_splits
        print(catboost_model.get_best_score())
        offline_score.append(catboost_model.get_best_score()['validation']['MultiClass'])

        feature_importance_df.append(get_model_feature_importances(catboost_model))
        model_list.append(catboost_model)
        os.makedirs('../model', exist_ok=True)  # make sure the output directory exists
        with open(os.path.join('../model', f'cat_model_fold_{fold}.pkl'), 'wb') as f:
            pickle.dump(catboost_model, f)
    print('\nOOF-MEAN-LOSS score:%.6f, OOF-STD:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    fea_imp_df = pd.concat(feature_importance_df, ignore_index=True).groupby('fea').agg(
        {'importances': 'mean'}).reset_index().sort_values('importances', ascending=False).reset_index(drop=True)

    return oof_prob, test_prob, fea_imp_df, model_list


def run_lgb(train, target, test, k, seed=42, NUM_CLASS=4, cat_cols=None):
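    """Train LightGBM with stratified k-fold CV.

    Returns out-of-fold class probabilities, fold-averaged test probabilities,
    mean per-fold feature importances, and the list of fitted boosters.
    """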
    cat_cols = cat_cols or []  # avoid a mutable default argument
    # feats = [f for f in train.columns if f not in ['cust_no', 'label', 'I7', 'I9', 'B6']]
    # print('Current num of features:', len(feats))
    print('********************** RUN LGBM MODEL **********************')
    print(f'****************** CURRENT SEED {seed} ********************** ')
    # map column names to integer indices: LightGBM rejects special characters
    # in feature names, so train on positional names and map back afterwards
    cols_map = {j: i for i, j in enumerate(train.columns)}
    cat_cols = [cols_map[i] for i in cat_cols]
    train = train.rename(columns=cols_map)
    test = test.rename(columns=cols_map)
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    oof_prob = np.zeros((train.shape[0], NUM_CLASS))
    test_prob = np.zeros((test.shape[0], NUM_CLASS))
    fea_imp_df_list = []
    offline_score = []
    model_list = []
    ## K-Fold
    for fold, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
        params = {
            "objective": "multiclass",
            "num_class": NUM_CLASS,
            "learning_rate": 0.01,
            "max_depth": -1,
            "num_leaves": 32,
            "verbose": -1,
            "bagging_fraction": 0.8,
            "feature_fraction": 0.8,
            "seed": seed,
            "metric": "multi_error",
        }
        print("FOLD {} IS RUNNING...".format(fold + 1))
        # positional indexing to match the fold indices; categorical features are
        # declared on the Dataset (lgb.train() dropped the kwarg in LightGBM 4.x)
        trn_data = lgb.Dataset(train.iloc[trn_idx], label=target.iloc[trn_idx], categorical_feature=cat_cols)
        val_data = lgb.Dataset(train.iloc[val_idx], label=target.iloc[val_idx], categorical_feature=cat_cols)

        # train
        params['seed'] = seed
        lgb_model = lgb.train(
            params,
            trn_data,
            num_boost_round=N_ROUNDS,
            valid_sets=[trn_data, val_data],
            # callbacks replace the early_stopping_rounds / verbose_eval keyword
            # arguments removed from lgb.train() in LightGBM 4.x
            callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)],
        )
        # predict
        oof_prob[val_idx] = lgb_model.predict(train.iloc[val_idx], num_iteration=lgb_model.best_iteration)
        test_prob += lgb_model.predict(test, num_iteration=lgb_model.best_iteration) / folds.n_splits
        offline_score.append(lgb_model.best_score['valid_1']['multi_error'])
        fea_imp = pd.DataFrame()
        fea_imp['feature_name'] = lgb_model.feature_name()
        fea_imp['importance'] = lgb_model.feature_importance()
        # map the integer column names back to the original feature names
        fea_imp['feature_name'] = fea_imp['feature_name'].map({str(cols_map[i]): i for i in cols_map})
        fea_imp = fea_imp.sort_values('importance', ascending=False)
        fea_imp_df_list.append(fea_imp)

        model_list.append(lgb_model)
    print('\nOOF-MEAN-ERROR score:%.6f, OOF-STD:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    fea_imp_df = pd.concat(fea_imp_df_list, ignore_index=True).groupby('feature_name').agg(
        {'importance': 'mean'}).reset_index().sort_values('importance', ascending=False)
    return oof_prob, test_prob, fea_imp_df, model_list
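

# Illustrative usage sketch (an assumption, not part of the original pipeline):
# runs both CV wrappers on a small synthetic 4-class problem. The data shapes
# and names below are hypothetical; real callers supply their own train/target/
# test frames, and training length is governed by utils.N_ROUNDS.
if __name__ == '__main__':
    rng = np.random.default_rng(42)
    X = pd.DataFrame(rng.normal(size=(400, 10)), columns=[f'f{i}' for i in range(10)])
    y = pd.Series(rng.integers(0, 4, size=400))  # 4 classes, matching NUM_CLASS=4
    X_test = pd.DataFrame(rng.normal(size=(100, 10)), columns=X.columns)

    oof_lgb, test_lgb, imp_lgb, _ = run_lgb(X, y, X_test, k=5, seed=42)
    oof_cbt, test_cbt, imp_cbt, _ = run_cbt(X, y, X_test, k=5, seed=42)
    print(imp_lgb.head())
    print(imp_cbt.head())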