In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from featexp import univariate_plotter # pip install featexp
from featexp import get_univariate_plots
from featexp import get_trend_stats

In [2]:
# Load the raw training table and preview its first rows.
application_raw = pd.read_csv('data/application_train.csv') # bank loan data; the task is to predict default (0/1)
application_raw.head()

KeyboardInterrupt: 

In [None]:
# 数据预处理
def get_nonull_dummy_data(application_train_raw, dummy_drop=['ORGANIZATION_TYPE']):
    """Mean-impute numeric gaps and one-hot encode the object columns.

    The input frame is left untouched; every step works on a new frame.

    Parameters
    ----------
    application_train_raw : pandas.DataFrame
        Raw application table.
    dummy_drop : label or list of labels
        Columns removed before encoding. The default list is only read,
        never mutated.

    Returns
    -------
    pandas.DataFrame
        The table without ``dummy_drop``, where every non-object column that
        contained missing values is filled with its column mean, and every
        object column is replaced by indicator columns (``dummy_na=True``
        adds a ``<column>_nan`` indicator as well).

    Raises
    ------
    KeyError
        If a label in ``dummy_drop`` is not a column of the input.
    """
    # drop() hands back a new frame, so the caller's DataFrame keeps all of
    # its columns and its missing values.
    frame = application_train_raw.drop(columns=dummy_drop)

    # Missing-value imputation. Every non-object column with at least one
    # null gets the same treatment (fill with the column mean), so one pass
    # over the columns is enough.
    null_counts = frame.isnull().sum()
    fill_cols = [
        col for col, dtype in frame.dtypes.items()
        if dtype != 'O' and null_counts[col] != 0
    ]
    if fill_cols:
        # fillna with a Series fills each listed column with its own value
        # and leaves every other column as it is.
        frame = frame.fillna(frame[fill_cols].mean())

    # Feature encoding: one indicator column per category of each object column.
    cat_cols = [col for col, dtype in frame.dtypes.items() if dtype == 'O']
    return pd.get_dummies(frame, columns=cat_cols, dummy_na=True)


def import_and_create_train_test_data(test_size=0.33, random_state=42,
                                      data_path='data/application_train.csv'):
    """Load the labelled applications and split them into train/validation parts.

    Parameters
    ----------
    test_size : float
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int
        Passed to ``train_test_split`` as ``random_state``.
    data_path : str
        CSV file to read; defaults to the path this notebook has always used.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, train_users, test_users)``.
        The two ``*_users`` frames hold ``SK_ID_CURR`` and ``TARGET`` with a
        fresh 0..n-1 index.
    """
    # Build the training and validation sets
    application_raw = pd.read_csv(data_path)
    application = get_nonull_dummy_data(application_raw, dummy_drop=['ORGANIZATION_TYPE'])

    X = application.drop(['TARGET'], axis=1)
    y = application['TARGET']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # assign() returns a new frame instead of writing into a slice of X_train /
    # X_test, which avoids pandas' SettingWithCopyWarning. The labels are
    # aligned on the shared row index before that index is reset.
    train_users = X_train[['SK_ID_CURR']].assign(TARGET=y_train).reset_index(drop=True)
    test_users = X_test[['SK_ID_CURR']].assign(TARGET=y_test).reset_index(drop=True)

    return (X_train, X_test, y_train, y_test, train_users, test_users)


def import_and_create_TEST_data(data_path='data/application_test.csv'):
    """Load and preprocess the unlabelled (online) test applications.

    Parameters
    ----------
    data_path : str
        CSV file to read; defaults to the path this notebook has always used.

    Returns
    -------
    tuple
        ``(X, users)``: the preprocessed feature frame, and a one-column
        frame with ``SK_ID_CURR`` re-indexed from 0.
    """
    # Online test set
    application_raw = pd.read_csv(data_path)
    X = get_nonull_dummy_data(application_raw, dummy_drop=['ORGANIZATION_TYPE'])

    # Chained reset_index returns a new frame rather than modifying a slice of X in place.
    users = X[['SK_ID_CURR']].reset_index(drop=True)

    return (X, users)


def get_imp_df(xgb_model):
    """Return the model's feature scores as a table, highest score first.

    Parameters
    ----------
    xgb_model : object exposing ``get_fscore()``
        ``get_fscore()`` must return a mapping of feature name -> score.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``importance``, sorted by ``importance`` in
        descending order with a fresh 0..n-1 index. An empty mapping yields
        an empty frame with the same two columns.
    """
    # Get the feature importances
    scores = xgb_model.get_fscore()
    imp = pd.DataFrame({
        'Feature': list(scores.keys()),
        # The scores are the mapping's values; using the keys here would put
        # the feature names into the importance column.
        'importance': list(scores.values()),
    })
    imp = imp.sort_values(by=['importance'], ascending=False)
    imp = imp.reset_index(drop=True)

    return (imp)

In [None]:
# NOTE(review): none of the cells above assign X_train; it is first created by the
# import_and_create_train_test_data() cell further down, so on a fresh kernel this
# cell raises NameError unless that cell has been run first.
X_train.columns.values

### 训练集和验证集

In [None]:
# Build the train/validation split and the unlabelled TEST table.
X_train, X_test, y_train, y_test, train_users, test_users = import_and_create_train_test_data()
X_TEST, TEST_users = import_and_create_TEST_data()

# Columns removed from the model inputs. Every name needs its own comma:
# adjacent string literals are silently concatenated, which previously turned
# the first two names into one column name that does not exist.
drop = ['CODE_GENDER_XNA', 'NAME_INCOME_TYPE_Maternity leave',
        'NAME_FAMILY_STATUS_Unknown', 'SK_ID_CURR']
X_train = X_train.drop(drop, axis=1)
X_test = X_test.drop(drop, axis=1)

In [None]:
# Features and labels are both re-indexed from 0 so they line up row for row,
# then the label is attached as the last column, `target`.
data_train = X_train.reset_index(drop=True).assign(
    target=y_train.reset_index(drop=True))
data_test = X_test.reset_index(drop=True).assign(
    target=y_test.reset_index(drop=True))

In [None]:
# Feature trend analysis on the training and validation sets:
# run featexp's get_univariate_plots for the first 10 columns of data_train
# against `target`, with data_test passed as the comparison data.


get_univariate_plots(data=data_train,target_col='target',
 features_list=data_train.columns[0:10],data_test=data_test)


In [None]:
# Same analysis for the single feature AMT_INCOME_TOTAL. The call returns a pair,
# unpacked here as grouped_train / grouped_test
# (presumably the per-bin tables for data_train and data_test — confirm in the featexp docs).
grouped_train,grouped_test = univariate_plotter(data=data_train,
 target_col='target',
 feature='AMT_INCOME_TOTAL',
 data_test=data_test)

In [None]:
grouped_train # show the per-bin data summary

### 训练模型,使用全部特征

In [None]:
# dtrain holds the training split and dtest the held-out validation split.
# (They were previously swapped, so the model was fitted on the validation
# rows and "validated" on the training rows.)
dtrain = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
dtest = xgb.DMatrix(X_test, label=y_test, missing=np.nan)

params = {'max_depth':8,'learning_rate':0.1,'silent':0,
          'objective':'binary:logistic','min_child_weight':500,
          'eval_metric':'auc','nthread':8}
# Up to 400 boosting rounds. 'test' is listed last in evals because xgboost
# uses the last evaluation set for early stopping (25 rounds without improvement).
xgb_model = xgb.train(params, dtrain, 400,
                      evals=[(dtrain, 'train'), (dtest, 'test')],
                      early_stopping_rounds=25)

In [None]:
# Submission: score the unlabelled TEST table using the same columns (and
# column order) as X_test, then write one row per SK_ID_CURR with its predicted TARGET.
dTEST = xgb.DMatrix(X_TEST[X_test.columns], missing=np.nan)
y_TEST_pred = xgb_model.predict(dTEST)
submission_all_feats = TEST_users[['SK_ID_CURR']].assign(TARGET=y_TEST_pred)
submission_all_feats.to_csv('data/submission_all_feats_1.csv', index=False)

### 计算训练集和验证集中特征的趋势

In [None]:
# Run featexp's get_trend_stats on data_train against `target`, with data_test as the
# comparison data (presumably one row of trend statistics per feature — confirm in the featexp docs).
stats = get_trend_stats(data=data_train,target_col='target',data_test=data_test)

In [None]:
# Display the (rows, columns) shape of the submission frame as a quick sanity check.
submission_all_feats.shape