{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import xgboost as xgb\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import roc_auc_score\n",
"from featexp import univariate_plotter # pip install featexp\n",
"from featexp import get_univariate_plots\n",
"from featexp import get_trend_stats"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-2-6bab406005e5>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mapplication_raw\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'data/application_train.csv'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# 银行贷款数据预测违约可能性0/1\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mapplication_raw\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)\u001b[0m\n\u001b[0;32m 700\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[0;32m 701\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 702\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 703\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 704\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 433\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 434\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 435\u001b[1;33m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 436\u001b[0m \u001b[1;32mfinally\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 437\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, nrows)\u001b[0m\n\u001b[0;32m 1137\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1138\u001b[0m \u001b[0mnrows\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_validate_integer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'nrows'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1139\u001b[1;33m \u001b[0mret\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1140\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1141\u001b[0m \u001b[1;31m# May alter columns / col_dict\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, nrows)\u001b[0m\n\u001b[0;32m 1993\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1994\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1995\u001b[1;33m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1996\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1997\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_first_chunk\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read\u001b[1;34m()\u001b[0m\n",
"\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_low_memory\u001b[1;34m()\u001b[0m\n",
"\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[1;34m()\u001b[0m\n",
"\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_column_data\u001b[1;34m()\u001b[0m\n",
"\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_tokens\u001b[1;34m()\u001b[0m\n",
"\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_with_dtype\u001b[1;34m()\u001b[0m\n",
"\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\core\\dtypes\\common.py\u001b[0m in \u001b[0;36mis_categorical_dtype\u001b[1;34m(arr_or_dtype)\u001b[0m\n\u001b[0;32m 570\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 571\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 572\u001b[1;33m \u001b[1;32mdef\u001b[0m \u001b[0mis_categorical_dtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marr_or_dtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 573\u001b[0m \"\"\"\n\u001b[0;32m 574\u001b[0m \u001b[0mCheck\u001b[0m \u001b[0mwhether\u001b[0m \u001b[0man\u001b[0m \u001b[0marray\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mlike\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mdtype\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mCategorical\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"application_raw = pd.read_csv('data/application_train.csv') # 银行贷款数据预测违约可能性0/1\n",
"application_raw.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 数据预处理\n",
"def get_nonull_dummy_data(application_train_raw, dummy_drop=['ORGANIZATION_TYPE']):\n",
" # 缺失值填充\n",
" nulls = pd.isnull(application_train_raw).sum()\n",
" less_nulls = nulls[(nulls<3075)&(nulls!=0)].index\n",
" less_nulls_float = []\n",
" for i in range(len(less_nulls)):\n",
" if application_train_raw[less_nulls[i]].dtype != 'O':\n",
" less_nulls_float.append(less_nulls[i])\n",
" \n",
" application_train_raw[less_nulls_float] = application_train_raw[less_nulls_float].fillna(application_train_raw[less_nulls_float].mean())\n",
" \n",
" # 缺失值填充\n",
" more_nulls = nulls[(nulls >= 3075)].index\n",
" more_nulls_float = []\n",
" for i in range(len(more_nulls)):\n",
" if application_train_raw[more_nulls[i]].dtype != 'O':\n",
" more_nulls_float.append(more_nulls[i])\n",
" \n",
" application_train_raw[more_nulls_float] = application_train_raw[more_nulls_float].fillna(application_train_raw[more_nulls_float].mean())\n",
" \n",
" # 特征编码\n",
" application_train_raw.drop(columns=dummy_drop, axis=1, inplace=True)\n",
" \n",
" all_cols = application_train_raw.columns\n",
" cat_cols = []\n",
" for col in all_cols:\n",
" if application_train_raw[col].dtype == 'O':\n",
" cat_cols.append(col)\n",
" \n",
" application_train_raw = pd.get_dummies(application_train_raw,columns=cat_cols, dummy_na=True)\n",
" \n",
" return application_train_raw\n",
"\n",
"\n",
"def import_and_create_train_test_data(test_size=0.33, random_state=42):\n",
" # 训练和验证集制作\n",
" application_raw = pd.read_csv('data/application_train.csv') \n",
" application = get_nonull_dummy_data(application_raw, dummy_drop=['ORGANIZATION_TYPE'])\n",
" \n",
" X = application.drop(['TARGET'],axis=1)\n",
" y = application['TARGET']\n",
" X_train, X_test,y_train,y_test = train_test_split(X, y, test_size=test_size,random_state=random_state)\n",
" \n",
" train_users = X_train[['SK_ID_CURR']]\n",
" train_users['TARGET'] = y_train\n",
" test_users = X_test[['SK_ID_CURR']]\n",
" test_users['TARGET'] = y_test\n",
" train_users.reset_index(drop=True, inplace=True)\n",
" test_users.reset_index(drop=True, inplace=True)\n",
" \n",
" return(X_train, X_test,y_train,y_test,train_users,test_users)\n",
"\n",
"\n",
"def import_and_create_TEST_data():\n",
" # 线上测试集\n",
" application_raw = pd.read_csv('data/application_test.csv') \n",
" application = get_nonull_dummy_data(application_raw, dummy_drop=['ORGANIZATION_TYPE'])\n",
" \n",
" X = application\n",
" users = X[['SK_ID_CURR']]\n",
" users.reset_index(drop=True, inplace=True)\n",
" \n",
" return(X, users)\n",
"\n",
"\n",
"def get_imp_df(xgb_model):\n",
"# 获取特征重要性\n",
" imp = pd.DataFrame(np.asarray(list(xgb_model.get_fscore().keys())))\n",
" imp.columns = ['Feature']\n",
" imp['importance'] = np.asarray(list(xgb_model.get_fscore().keys()))\n",
" imp = imp.sort_values(by=['importance'], ascending=False)\n",
" imp = imp.reset_index(drop=True)\n",
" \n",
" return (imp)"
]
},
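{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal usage sketch of `get_nonull_dummy_data` on a tiny, hypothetical frame (the column names `AMT` and `CONTRACT` below are made up, not from the Home Credit data): numeric NaNs are replaced by the column mean and object columns are one-hot encoded with an explicit NaN level."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical toy frame, only to illustrate get_nonull_dummy_data (not part of the dataset)\n",
"toy = pd.DataFrame({'AMT': [100.0, np.nan, 300.0],\n",
"                    'ORGANIZATION_TYPE': ['a', 'b', 'a'],\n",
"                    'CONTRACT': ['Cash', None, 'Revolving']})\n",
"# Expected: the AMT NaN becomes 200.0 (column mean); CONTRACT is one-hot encoded, including a NaN column\n",
"get_nonull_dummy_data(toy.copy(), dummy_drop=['ORGANIZATION_TYPE'])"
]
},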
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_train.columns.values"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 训练集和验证集"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train,y_test,train_users,test_users = import_and_create_train_test_data()\n",
"X_TEST,TEST_users = import_and_create_TEST_data()\n",
"\n",
"drop=['CODE_GENDER_XNA''NAME_INCOME_TYPE_Maternity leave',\n",
" 'NAME_FAMILY_STATUS_Unknown','SK_ID_CURR']\n",
"X_train = X_train.drop(drop, axis=1)\n",
"X_test = X_test.drop(drop, axis=1)"
]
},
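{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small added sanity check (a sketch, not part of the original flow): look at the split shapes and confirm that, after the drops above, no training column is missing from the submission set."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Shapes of the train split, validation split and submission set\n",
"print(X_train.shape, X_test.shape, X_TEST.shape)\n",
"# Training columns absent from the submission set; expected to be empty after the drops above\n",
"print(set(X_test.columns) - set(X_TEST.columns))"
]
},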
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_train = X_train.reset_index(drop=True)\n",
"data_train['target'] = y_train.reset_index(drop=True)\n",
"data_test = X_test.reset_index(drop=True)\n",
"data_test['target'] = y_test.reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 训练集与验证集特征趋势分析\n",
"\n",
"\n",
"get_univariate_plots(data=data_train,target_col='target',\n",
" features_list=data_train.columns[0:10],data_test=data_test)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"grouped_train,grouped_test = univariate_plotter(data=data_train,\n",
" target_col='target',\n",
" feature='AMT_INCOME_TOTAL',\n",
" data_test=data_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"grouped_train # 展示bin中数据信息"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 训练模型,使用全部特征"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dtrain = xgb.DMatrix(X_test,label=y_test,missing=np.nan)\n",
"dtest = xgb.DMatrix(X_train,label=y_train,missing=np.nan)\n",
"\n",
"params = {'max_depth':8,'learning_rate':0.1,'silent':0,\n",
" 'objective':'binary:logistic','min_child_weight':500,\n",
" 'eval_metric':'auc','nthread':8}\n",
"xgb_model = xgb.train(params, dtrain, 400, evals=[(dtrain,'train'),\n",
" (dtest,'test')],\n",
" early_stopping_rounds=25)"
]
},
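{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick hold-out AUC check, sketched with the already-imported `roc_auc_score`; this cell is an addition and assumes the training cell above has run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hold-out AUC of the all-features model\n",
"y_test_pred = xgb_model.predict(dtest)\n",
"print('validation AUC:', roc_auc_score(y_test, y_test_pred))"
]
},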
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 提交结果\n",
"dTEST = xgb.DMatrix(X_TEST[X_test.columns], missing=np.nan)\n",
"y_TEST_pred = xgb_model.predict(dTEST)\n",
"submission_all_feats = pd.DataFrame({'SK_ID_CURR':TEST_users['SK_ID_CURR'],\n",
" 'TARGET':y_TEST_pred})\n",
"submission_all_feats.to_csv('data/submission_all_feats_1.csv',index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 计算训练集和验证集中特征的趋势"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"stats = get_trend_stats(data=data_train,target_col='target',data_test=data_test)"
]
},
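{
"cell_type": "markdown",
"metadata": {},
"source": [
"A possible next step, sketched only: keep the features whose train/validation trend correlation is high and retrain on that subset. This assumes `get_trend_stats` returns a DataFrame with `Feature` and `Trend_correlation` columns, as described in the featexp documentation; the 0.9 threshold is an arbitrary choice."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: keep features with a stable trend between train and validation (threshold is arbitrary)\n",
"stable_feats = stats[stats['Trend_correlation'] > 0.9]['Feature'].tolist()\n",
"stable_feats = [f for f in stable_feats if f != 'target']\n",
"\n",
"dtrain_sel = xgb.DMatrix(X_train[stable_feats], label=y_train, missing=np.nan)\n",
"dtest_sel = xgb.DMatrix(X_test[stable_feats], label=y_test, missing=np.nan)\n",
"xgb_model_sel = xgb.train(params, dtrain_sel, 400,\n",
"                          evals=[(dtrain_sel, 'train'), (dtest_sel, 'test')],\n",
"                          early_stopping_rounds=25)"
]
},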
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"submission_all_feats.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}