diff --git a/机器学习竞赛实战_优胜解决方案/特征工程建模/特征工程建模.ipynb b/机器学习竞赛实战_优胜解决方案/特征工程建模/特征工程建模.ipynb new file mode 100644 index 0000000..7d7b3c0 --- /dev/null +++ b/机器学习竞赛实战_优胜解决方案/特征工程建模/特征工程建模.ipynb @@ -0,0 +1,908 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import xgboost as xgb\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import roc_auc_score\n", + "from featexp import univariate_plotter # pip install featexp\n", + "from featexp import get_univariate_plots\n", + "from featexp import get_trend_stats" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SK_ID_CURRTARGETNAME_CONTRACT_TYPECODE_GENDERFLAG_OWN_CARFLAG_OWN_REALTYCNT_CHILDRENAMT_INCOME_TOTALAMT_CREDITAMT_ANNUITYAMT_GOODS_PRICENAME_TYPE_SUITENAME_INCOME_TYPENAME_EDUCATION_TYPENAME_FAMILY_STATUSNAME_HOUSING_TYPEREGION_POPULATION_RELATIVEDAYS_BIRTHDAYS_EMPLOYEDDAYS_REGISTRATIONDAYS_ID_PUBLISHOWN_CAR_AGEFLAG_MOBILFLAG_EMP_PHONEFLAG_WORK_PHONEFLAG_CONT_MOBILEFLAG_PHONEFLAG_EMAILOCCUPATION_TYPECNT_FAM_MEMBERSREGION_RATING_CLIENTREGION_RATING_CLIENT_W_CITYWEEKDAY_APPR_PROCESS_STARTHOUR_APPR_PROCESS_STARTREG_REGION_NOT_LIVE_REGIONREG_REGION_NOT_WORK_REGIONLIVE_REGION_NOT_WORK_REGIONREG_CITY_NOT_LIVE_CITYREG_CITY_NOT_WORK_CITYLIVE_CITY_NOT_WORK_CITYORGANIZATION_TYPEEXT_SOURCE_1EXT_SOURCE_2EXT_SOURCE_3APARTMENTS_AVGBASEMENTAREA_AVGYEARS_BEGINEXPLUATATION_AVGYEARS_BUILD_AVGCOMMONAREA_AVGELEVATORS_AVGENTRANCES_AVGFLOORSMAX_AVGFLOORSMIN_AVGLANDAREA_AVGLIVINGAPARTMENTS_AVGLIVINGAREA_AVGNONLIVINGAPARTMENTS_AVGNONLIVINGAREA_AVGAPARTMENTS_MODEBASEMENTAREA_MODE...ELEVATORS_MODEENTRANCES_MODEFLOORSMAX_MODEFLOORSMIN_MODELANDAREA_MODELIVINGAPARTMENTS_MODELIVINGAREA_MODENONLIVINGAPARTMENTS_MODENONLIVINGAREA_MODEAPARTMENTS_MEDIBASEMENTAREA_MEDIYEARS_BEGINEXPLUATATION_MEDIYEARS_BUILD_MEDICOMMONAREA_MEDIELEVATORS_MEDIENTRANCES_MEDIFLOORSMAX_MEDIFLOORSMIN_MEDILANDAREA_MEDILIVINGAPARTMENTS_MEDILIVINGAREA_MEDINONLIVINGAPARTMENTS_MEDINONLIVINGAREA_MEDIFONDKAPREMONT_MODEHOUSETYPE_MODETOTALAREA_MODEWALLSMATERIAL_MODEEMERGENCYSTATE_MODEOBS_30_CNT_SOCIAL_CIRCLEDEF_30_CNT_SOCIAL_CIRCLEOBS_60_CNT_SOCIAL_CIRCLEDEF_60_CNT_SOCIAL_CIRCLEDAYS_LAST_PHONE_CHANGEFLAG_DOCUMENT_2FLAG_DOCUMENT_3FLAG_DOCUMENT_4FLAG_DOCUMENT_5FLAG_DOCUMENT_6FLAG_DOCUMENT_7FLAG_DOCUMENT_8FLAG_DOCUMENT_9FLAG_DOCUMENT_10FLAG_DOCUMENT_11FLAG_DOCUMENT_12FLAG_DOCUMENT_13FLAG_DOCUMENT_14FLAG_DOCUMENT_15FLAG_DOCUMENT_16FLAG_DOCUMENT_17FLAG_DOCUMENT_18FLAG_DOCUMENT_19FLAG_DOCUMENT_20FLAG_DOCUMENT_21AMT_REQ_CREDIT_BUREAU_HOURAMT_REQ_CREDIT_BUREAU_DAYAMT_REQ_CREDIT_BUREAU_WEEKAMT_REQ_CREDIT_BUREAU_MONAMT_REQ_CREDIT_BUREAU_QRTAMT_REQ_CREDIT_BUREAU_YEARDATE
01454570Cash loansMYN0288000.0242595.010813.5202500.0UnaccompaniedPensionerSecondary / secondary specialMarriedMunicipal apartment0.046220-22230365243-7689.0-40965.0100100NaN2.011FRIDAY13000000XNANaN0.7355940.4135970.61130.32950.98710.82320.47610.960.41380.45830.50.1420.4530.63850.20850.44230.62290.342...0.96670.41380.45830.50.14520.49490.66520.21010.46820.61730.32950.98710.82560.47910.960.41380.45830.50.14440.46090.650.20960.4516org spec accountblock of flats0.8750Stone, brickNo0.00.00.00.0-1347.0000010000000000000000.00.00.00.00.02.02018-05
11289790Cash loansFNN094500.0646920.020866.5540000.0UnaccompaniedPensionerSecondary / secondary specialMarriedHouse / apartment0.018850-20599365243-784.0-2393NaN100100NaN2.022SUNDAY15000000XNANaN0.4619440.326475NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0.00.00.00.0-1273.0010000000000000000000.00.00.00.00.03.02018-05
21454480Cash loansMYY1225000.0183294.014823.0153000.0UnaccompaniedWorkingSecondary / secondary specialMarriedHouse / apartment0.020713-11070-1345-3463.0-361819.0110100Drivers3.033SATURDAY9000111Self-employedNaN0.374592NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN12.00.011.00.0-1127.0000000100000000000000.00.00.00.00.05.02018-03
32944750Cash loansMYN0180000.0260640.020169.0225000.0FamilyWorkingSecondary / secondary specialMarriedHouse / apartment0.026392-15901-130-7799.0-444912.0111100Security staff2.022THURSDAY18011011Business Entity Type 3NaN0.712657NaN0.1031NaN0.9856NaNNaN0.000.20690.1667NaNNaNNaNNaNNaNNaN0.1050NaN...0.00000.20690.1667NaNNaNNaNNaNNaNNaN0.1041NaN0.9856NaNNaN0.000.20690.1667NaNNaNNaNNaNNaNNaNNaNblock of flats0.0696Stone, brickNo0.00.00.00.0-1768.0010000000000000000000.00.00.01.00.01.02018-04
42166090Revolving loansMYY1112500.0180000.09000.0180000.0UnaccompaniedState servantHigher educationMarriedHouse / apartment0.007020-10234-1993-4040.0-29136.0110100NaN3.022WEDNESDAY11000000Emergency0.4050510.5288790.604113NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN1.00.01.00.0-429.0000000000000000000000.00.01.00.00.00.02018-03
\n", + "

5 rows × 123 columns

\n", + "
" + ], + "text/plain": [ + " SK_ID_CURR TARGET NAME_CONTRACT_TYPE ... AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR DATE\n", + "0 145457 0 Cash loans ... 0.0 2.0 2018-05\n", + "1 128979 0 Cash loans ... 0.0 3.0 2018-05\n", + "2 145448 0 Cash loans ... 0.0 5.0 2018-03\n", + "3 294475 0 Cash loans ... 0.0 1.0 2018-04\n", + "4 216609 0 Revolving loans ... 0.0 0.0 2018-03\n", + "\n", + "[5 rows x 123 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "application_raw = pd.read_csv('application_train.csv') # 银行贷款数据,预测违约可能性0/1\n", + "application_raw.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 数据预处理\n", + "def get_nonull_dummy_data(application_train_raw, dummy_drop=['ORGANIZATION_TYPE']):\n", + " # 缺失值填充\n", + " nulls = pd.isnull(application_train_raw),sum()\n", + " less_nulls = nulls[(nulls<3075)&(nulls!=0)].index\n", + " less_nulls_float = []\n", + " for i in range(len(less_nulls)):\n", + " if application_train_raw[less_nulls[i]].dtype!='0':\n", + " less_nulls_float.append(less_nulls[i])\n", + " \n", + " application_train_raw[less_nulls_float] = application_train_raw[less_nulls_float].fillna(application_train_raw[less_nulls_float].mean())\n", + " \n", + " # 缺失值填充\n", + " more_nulls = nulls[(nulls >= 3075)].index\n", + " more_nulls_float = []\n", + " for i in range(len(more_nulls)):\n", + " if application_train_raw[more_nulls[i]].dtype!='0':\n", + " more_nulls_float.append(more_nulls[i])\n", + " \n", + " application_train_raw[more_nulls_float] = application_train_raw[more_nulls_float].fillna(application_train_raw[more_nulls_float].mean())\n", + " \n", + " # 特征编码\n", + " application_train_raw.drop(columns=dummy_drop, axis=1, inplace=True)\n", + " \n", + " all_cols = application_train_raw.columns\n", + " cat_cols = []\n", + " for col in all_cols:\n", + " if application_train_raw[col].dtype!='0':\n", + " more_nulls_float.append(col)\n", + " \n", + " application_train_raw = pd.get_dummies(application_train_raw,columns=cat_cols, axis=1, dummy_na=True)\n", + " \n", + " return application_train_raw\n", + "\n", + "\n", + "def import_and_create_train_test_data(test_size=0.33, random_state=42):\n", + " application_raw = pd.read_csv('application_train.csv') \n", + " application = get_nonull_dummy_data(application_raw, dummy_drop=['ORGANIZATION_TYPE'])\n", + " \n", + " X = application.drop(['TARGET'],axis=1)\n", + " y = application['TARGET']\n", + " X_train, X_test,y_train,y_test = train_test_split(X, y, test_size=test_size,random_state=random_state)\n", + " \n", + " train_users = X_train[['SK_ID_CURR']]\n", + " train_users['TARGET'] = y_train\n", + " test_users = X_test[['SK_ID_CURR']]\n", + " test_users['TARGET'] = y_test\n", + " train_users.reset_index(drop=True, inplace=True)\n", + " test_users.reset_index(drop=True, inplace=True)\n", + " \n", + " return(X_train, X_test,y_train,y_test,train_users,test_users)\n", + "\n", + "\n", + "def import_and_create_TEST_data():\n", + " application_raw = pd.read_csv('test_data.csv') \n", + " application = get_nonull_dummy_data(application_raw, dummy_drop=['ORGANIZATION_TYPE'])\n", + " \n", + " X = application\n", + " users = X[['SK_ID_CURR']]\n", + " \n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}