{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import xgboost as xgb\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import roc_auc_score\n", "from featexp import univariate_plotter # pip install featexp\n", "from featexp import get_univariate_plots\n", "from featexp import get_trend_stats" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mapplication_raw\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'data/application_train.csv'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# 银行贷款数据,预测违约可能性0/1\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mapplication_raw\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)\u001b[0m\n\u001b[0;32m 700\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[0;32m 701\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 702\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 703\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 704\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 433\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 434\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 435\u001b[1;33m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 436\u001b[0m \u001b[1;32mfinally\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 437\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, nrows)\u001b[0m\n\u001b[0;32m 1137\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1138\u001b[0m \u001b[0mnrows\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_validate_integer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'nrows'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1139\u001b[1;33m \u001b[0mret\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1140\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1141\u001b[0m \u001b[1;31m# May alter columns / col_dict\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, nrows)\u001b[0m\n\u001b[0;32m 1993\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1994\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1995\u001b[1;33m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1996\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1997\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_first_chunk\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read\u001b[1;34m()\u001b[0m\n", "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_low_memory\u001b[1;34m()\u001b[0m\n", "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[1;34m()\u001b[0m\n", "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_column_data\u001b[1;34m()\u001b[0m\n", "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_tokens\u001b[1;34m()\u001b[0m\n", "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_with_dtype\u001b[1;34m()\u001b[0m\n", "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\core\\dtypes\\common.py\u001b[0m in \u001b[0;36mis_categorical_dtype\u001b[1;34m(arr_or_dtype)\u001b[0m\n\u001b[0;32m 570\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 571\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 572\u001b[1;33m \u001b[1;32mdef\u001b[0m \u001b[0mis_categorical_dtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marr_or_dtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 573\u001b[0m \"\"\"\n\u001b[0;32m 574\u001b[0m \u001b[0mCheck\u001b[0m \u001b[0mwhether\u001b[0m \u001b[0man\u001b[0m \u001b[0marray\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mlike\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mdtype\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mCategorical\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "application_raw = pd.read_csv('data/application_train.csv') # 银行贷款数据,预测违约可能性0/1\n", "application_raw.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 数据预处理\n", "def get_nonull_dummy_data(application_train_raw, dummy_drop=['ORGANIZATION_TYPE']):\n", " # 缺失值填充\n", " nulls = pd.isnull(application_train_raw).sum()\n", " less_nulls = nulls[(nulls<3075)&(nulls!=0)].index\n", " less_nulls_float = []\n", " for i in range(len(less_nulls)):\n", " if application_train_raw[less_nulls[i]].dtype != 'O':\n", " less_nulls_float.append(less_nulls[i])\n", " \n", " application_train_raw[less_nulls_float] = application_train_raw[less_nulls_float].fillna(application_train_raw[less_nulls_float].mean())\n", " \n", " # 缺失值填充\n", " more_nulls = nulls[(nulls >= 3075)].index\n", " more_nulls_float = []\n", " for i in range(len(more_nulls)):\n", " if application_train_raw[more_nulls[i]].dtype != 'O':\n", " more_nulls_float.append(more_nulls[i])\n", " \n", " application_train_raw[more_nulls_float] = application_train_raw[more_nulls_float].fillna(application_train_raw[more_nulls_float].mean())\n", " \n", " # 特征编码\n", " application_train_raw.drop(columns=dummy_drop, axis=1, inplace=True)\n", " \n", " all_cols = application_train_raw.columns\n", " cat_cols = []\n", " for col in all_cols:\n", " if application_train_raw[col].dtype == 'O':\n", " cat_cols.append(col)\n", " \n", " application_train_raw = pd.get_dummies(application_train_raw,columns=cat_cols, dummy_na=True)\n", " \n", " return application_train_raw\n", "\n", "\n", "def import_and_create_train_test_data(test_size=0.33, random_state=42):\n", " # 训练和验证集制作\n", " application_raw = pd.read_csv('data/application_train.csv') \n", " application = get_nonull_dummy_data(application_raw, dummy_drop=['ORGANIZATION_TYPE'])\n", " \n", " X = application.drop(['TARGET'],axis=1)\n", " y = application['TARGET']\n", " X_train, X_test,y_train,y_test = train_test_split(X, y, test_size=test_size,random_state=random_state)\n", " \n", " train_users = X_train[['SK_ID_CURR']]\n", " train_users['TARGET'] = y_train\n", " test_users = X_test[['SK_ID_CURR']]\n", " test_users['TARGET'] = y_test\n", " train_users.reset_index(drop=True, inplace=True)\n", " test_users.reset_index(drop=True, inplace=True)\n", " \n", " return(X_train, X_test,y_train,y_test,train_users,test_users)\n", "\n", "\n", "def import_and_create_TEST_data():\n", " # 线上测试集\n", " application_raw = pd.read_csv('data/application_test.csv') \n", " application = get_nonull_dummy_data(application_raw, dummy_drop=['ORGANIZATION_TYPE'])\n", " \n", " X = application\n", " users = X[['SK_ID_CURR']]\n", " users.reset_index(drop=True, inplace=True)\n", " \n", " return(X, users)\n", "\n", "\n", "def get_imp_df(xgb_model):\n", "# 获取特征重要性\n", " imp = pd.DataFrame(np.asarray(list(xgb_model.get_fscore().keys())))\n", " imp.columns = ['Feature']\n", " imp['importance'] = np.asarray(list(xgb_model.get_fscore().keys()))\n", " imp = imp.sort_values(by=['importance'], ascending=False)\n", " imp = imp.reset_index(drop=True)\n", " \n", " return (imp)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train.columns.values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 训练集和验证集" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train,y_test,train_users,test_users = import_and_create_train_test_data()\n", "X_TEST,TEST_users = import_and_create_TEST_data()\n", "\n", "drop=['CODE_GENDER_XNA''NAME_INCOME_TYPE_Maternity leave',\n", " 'NAME_FAMILY_STATUS_Unknown','SK_ID_CURR']\n", "X_train = X_train.drop(drop, axis=1)\n", "X_test = X_test.drop(drop, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data_train = X_train.reset_index(drop=True)\n", "data_train['target'] = y_train.reset_index(drop=True)\n", "data_test = X_test.reset_index(drop=True)\n", "data_test['target'] = y_test.reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 训练集与验证集特征趋势分析\n", "\n", "\n", "get_univariate_plots(data=data_train,target_col='target',\n", " features_list=data_train.columns[0:10],data_test=data_test)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "grouped_train,grouped_test = univariate_plotter(data=data_train,\n", " target_col='target',\n", " feature='AMT_INCOME_TOTAL',\n", " data_test=data_test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "grouped_train # 展示bin中数据信息" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 训练模型,使用全部特征" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dtrain = xgb.DMatrix(X_test,label=y_test,missing=np.nan)\n", "dtest = xgb.DMatrix(X_train,label=y_train,missing=np.nan)\n", "\n", "params = {'max_depth':8,'learning_rate':0.1,'silent':0,\n", " 'objective':'binary:logistic','min_child_weight':500,\n", " 'eval_metric':'auc','nthread':8}\n", "xgb_model = xgb.train(params, dtrain, 400, evals=[(dtrain,'train'),\n", " (dtest,'test')],\n", " early_stopping_rounds=25)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 提交结果\n", "dTEST = xgb.DMatrix(X_TEST[X_test.columns], missing=np.nan)\n", "y_TEST_pred = xgb_model.predict(dTEST)\n", "submission_all_feats = pd.DataFrame({'SK_ID_CURR':TEST_users['SK_ID_CURR'],\n", " 'TARGET':y_TEST_pred})\n", "submission_all_feats.to_csv('data/submission_all_feats_1.csv',index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 计算训练集和验证集中特征的趋势" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "stats = get_trend_stats(data=data_train,target_col='target',data_test=data_test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "submission_all_feats.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }