diff --git a/机器学习竞赛实战_优胜解决方案/特征工程建模/.ipynb_checkpoints/特征工程建模-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/特征工程建模/.ipynb_checkpoints/特征工程建模-checkpoint.ipynb
index 7d7b3c0..6452c75 100644
--- a/机器学习竞赛实战_优胜解决方案/特征工程建模/.ipynb_checkpoints/特征工程建模-checkpoint.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/特征工程建模/.ipynb_checkpoints/特征工程建模-checkpoint.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -20,797 +20,34 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " SK_ID_CURR | \n",
- " TARGET | \n",
- " NAME_CONTRACT_TYPE | \n",
- " CODE_GENDER | \n",
- " FLAG_OWN_CAR | \n",
- " FLAG_OWN_REALTY | \n",
- " CNT_CHILDREN | \n",
- " AMT_INCOME_TOTAL | \n",
- " AMT_CREDIT | \n",
- " AMT_ANNUITY | \n",
- " AMT_GOODS_PRICE | \n",
- " NAME_TYPE_SUITE | \n",
- " NAME_INCOME_TYPE | \n",
- " NAME_EDUCATION_TYPE | \n",
- " NAME_FAMILY_STATUS | \n",
- " NAME_HOUSING_TYPE | \n",
- " REGION_POPULATION_RELATIVE | \n",
- " DAYS_BIRTH | \n",
- " DAYS_EMPLOYED | \n",
- " DAYS_REGISTRATION | \n",
- " DAYS_ID_PUBLISH | \n",
- " OWN_CAR_AGE | \n",
- " FLAG_MOBIL | \n",
- " FLAG_EMP_PHONE | \n",
- " FLAG_WORK_PHONE | \n",
- " FLAG_CONT_MOBILE | \n",
- " FLAG_PHONE | \n",
- " FLAG_EMAIL | \n",
- " OCCUPATION_TYPE | \n",
- " CNT_FAM_MEMBERS | \n",
- " REGION_RATING_CLIENT | \n",
- " REGION_RATING_CLIENT_W_CITY | \n",
- " WEEKDAY_APPR_PROCESS_START | \n",
- " HOUR_APPR_PROCESS_START | \n",
- " REG_REGION_NOT_LIVE_REGION | \n",
- " REG_REGION_NOT_WORK_REGION | \n",
- " LIVE_REGION_NOT_WORK_REGION | \n",
- " REG_CITY_NOT_LIVE_CITY | \n",
- " REG_CITY_NOT_WORK_CITY | \n",
- " LIVE_CITY_NOT_WORK_CITY | \n",
- " ORGANIZATION_TYPE | \n",
- " EXT_SOURCE_1 | \n",
- " EXT_SOURCE_2 | \n",
- " EXT_SOURCE_3 | \n",
- " APARTMENTS_AVG | \n",
- " BASEMENTAREA_AVG | \n",
- " YEARS_BEGINEXPLUATATION_AVG | \n",
- " YEARS_BUILD_AVG | \n",
- " COMMONAREA_AVG | \n",
- " ELEVATORS_AVG | \n",
- " ENTRANCES_AVG | \n",
- " FLOORSMAX_AVG | \n",
- " FLOORSMIN_AVG | \n",
- " LANDAREA_AVG | \n",
- " LIVINGAPARTMENTS_AVG | \n",
- " LIVINGAREA_AVG | \n",
- " NONLIVINGAPARTMENTS_AVG | \n",
- " NONLIVINGAREA_AVG | \n",
- " APARTMENTS_MODE | \n",
- " BASEMENTAREA_MODE | \n",
- " ... | \n",
- " ELEVATORS_MODE | \n",
- " ENTRANCES_MODE | \n",
- " FLOORSMAX_MODE | \n",
- " FLOORSMIN_MODE | \n",
- " LANDAREA_MODE | \n",
- " LIVINGAPARTMENTS_MODE | \n",
- " LIVINGAREA_MODE | \n",
- " NONLIVINGAPARTMENTS_MODE | \n",
- " NONLIVINGAREA_MODE | \n",
- " APARTMENTS_MEDI | \n",
- " BASEMENTAREA_MEDI | \n",
- " YEARS_BEGINEXPLUATATION_MEDI | \n",
- " YEARS_BUILD_MEDI | \n",
- " COMMONAREA_MEDI | \n",
- " ELEVATORS_MEDI | \n",
- " ENTRANCES_MEDI | \n",
- " FLOORSMAX_MEDI | \n",
- " FLOORSMIN_MEDI | \n",
- " LANDAREA_MEDI | \n",
- " LIVINGAPARTMENTS_MEDI | \n",
- " LIVINGAREA_MEDI | \n",
- " NONLIVINGAPARTMENTS_MEDI | \n",
- " NONLIVINGAREA_MEDI | \n",
- " FONDKAPREMONT_MODE | \n",
- " HOUSETYPE_MODE | \n",
- " TOTALAREA_MODE | \n",
- " WALLSMATERIAL_MODE | \n",
- " EMERGENCYSTATE_MODE | \n",
- " OBS_30_CNT_SOCIAL_CIRCLE | \n",
- " DEF_30_CNT_SOCIAL_CIRCLE | \n",
- " OBS_60_CNT_SOCIAL_CIRCLE | \n",
- " DEF_60_CNT_SOCIAL_CIRCLE | \n",
- " DAYS_LAST_PHONE_CHANGE | \n",
- " FLAG_DOCUMENT_2 | \n",
- " FLAG_DOCUMENT_3 | \n",
- " FLAG_DOCUMENT_4 | \n",
- " FLAG_DOCUMENT_5 | \n",
- " FLAG_DOCUMENT_6 | \n",
- " FLAG_DOCUMENT_7 | \n",
- " FLAG_DOCUMENT_8 | \n",
- " FLAG_DOCUMENT_9 | \n",
- " FLAG_DOCUMENT_10 | \n",
- " FLAG_DOCUMENT_11 | \n",
- " FLAG_DOCUMENT_12 | \n",
- " FLAG_DOCUMENT_13 | \n",
- " FLAG_DOCUMENT_14 | \n",
- " FLAG_DOCUMENT_15 | \n",
- " FLAG_DOCUMENT_16 | \n",
- " FLAG_DOCUMENT_17 | \n",
- " FLAG_DOCUMENT_18 | \n",
- " FLAG_DOCUMENT_19 | \n",
- " FLAG_DOCUMENT_20 | \n",
- " FLAG_DOCUMENT_21 | \n",
- " AMT_REQ_CREDIT_BUREAU_HOUR | \n",
- " AMT_REQ_CREDIT_BUREAU_DAY | \n",
- " AMT_REQ_CREDIT_BUREAU_WEEK | \n",
- " AMT_REQ_CREDIT_BUREAU_MON | \n",
- " AMT_REQ_CREDIT_BUREAU_QRT | \n",
- " AMT_REQ_CREDIT_BUREAU_YEAR | \n",
- " DATE | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 145457 | \n",
- " 0 | \n",
- " Cash loans | \n",
- " M | \n",
- " Y | \n",
- " N | \n",
- " 0 | \n",
- " 288000.0 | \n",
- " 242595.0 | \n",
- " 10813.5 | \n",
- " 202500.0 | \n",
- " Unaccompanied | \n",
- " Pensioner | \n",
- " Secondary / secondary special | \n",
- " Married | \n",
- " Municipal apartment | \n",
- " 0.046220 | \n",
- " -22230 | \n",
- " 365243 | \n",
- " -7689.0 | \n",
- " -4096 | \n",
- " 5.0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " NaN | \n",
- " 2.0 | \n",
- " 1 | \n",
- " 1 | \n",
- " FRIDAY | \n",
- " 13 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " XNA | \n",
- " NaN | \n",
- " 0.735594 | \n",
- " 0.413597 | \n",
- " 0.6113 | \n",
- " 0.3295 | \n",
- " 0.9871 | \n",
- " 0.8232 | \n",
- " 0.4761 | \n",
- " 0.96 | \n",
- " 0.4138 | \n",
- " 0.4583 | \n",
- " 0.5 | \n",
- " 0.142 | \n",
- " 0.453 | \n",
- " 0.6385 | \n",
- " 0.2085 | \n",
- " 0.4423 | \n",
- " 0.6229 | \n",
- " 0.342 | \n",
- " ... | \n",
- " 0.9667 | \n",
- " 0.4138 | \n",
- " 0.4583 | \n",
- " 0.5 | \n",
- " 0.1452 | \n",
- " 0.4949 | \n",
- " 0.6652 | \n",
- " 0.2101 | \n",
- " 0.4682 | \n",
- " 0.6173 | \n",
- " 0.3295 | \n",
- " 0.9871 | \n",
- " 0.8256 | \n",
- " 0.4791 | \n",
- " 0.96 | \n",
- " 0.4138 | \n",
- " 0.4583 | \n",
- " 0.5 | \n",
- " 0.1444 | \n",
- " 0.4609 | \n",
- " 0.65 | \n",
- " 0.2096 | \n",
- " 0.4516 | \n",
- " org spec account | \n",
- " block of flats | \n",
- " 0.8750 | \n",
- " Stone, brick | \n",
- " No | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " -1347.0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 2.0 | \n",
- " 2018-05 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 128979 | \n",
- " 0 | \n",
- " Cash loans | \n",
- " F | \n",
- " N | \n",
- " N | \n",
- " 0 | \n",
- " 94500.0 | \n",
- " 646920.0 | \n",
- " 20866.5 | \n",
- " 540000.0 | \n",
- " Unaccompanied | \n",
- " Pensioner | \n",
- " Secondary / secondary special | \n",
- " Married | \n",
- " House / apartment | \n",
- " 0.018850 | \n",
- " -20599 | \n",
- " 365243 | \n",
- " -784.0 | \n",
- " -2393 | \n",
- " NaN | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " NaN | \n",
- " 2.0 | \n",
- " 2 | \n",
- " 2 | \n",
- " SUNDAY | \n",
- " 15 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " XNA | \n",
- " NaN | \n",
- " 0.461944 | \n",
- " 0.326475 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " -1273.0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 3.0 | \n",
- " 2018-05 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 145448 | \n",
- " 0 | \n",
- " Cash loans | \n",
- " M | \n",
- " Y | \n",
- " Y | \n",
- " 1 | \n",
- " 225000.0 | \n",
- " 183294.0 | \n",
- " 14823.0 | \n",
- " 153000.0 | \n",
- " Unaccompanied | \n",
- " Working | \n",
- " Secondary / secondary special | \n",
- " Married | \n",
- " House / apartment | \n",
- " 0.020713 | \n",
- " -11070 | \n",
- " -1345 | \n",
- " -3463.0 | \n",
- " -3618 | \n",
- " 19.0 | \n",
- " 1 | \n",
- " 1 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " Drivers | \n",
- " 3.0 | \n",
- " 3 | \n",
- " 3 | \n",
- " SATURDAY | \n",
- " 9 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " Self-employed | \n",
- " NaN | \n",
- " 0.374592 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 12.0 | \n",
- " 0.0 | \n",
- " 11.0 | \n",
- " 0.0 | \n",
- " -1127.0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 5.0 | \n",
- " 2018-03 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 294475 | \n",
- " 0 | \n",
- " Cash loans | \n",
- " M | \n",
- " Y | \n",
- " N | \n",
- " 0 | \n",
- " 180000.0 | \n",
- " 260640.0 | \n",
- " 20169.0 | \n",
- " 225000.0 | \n",
- " Family | \n",
- " Working | \n",
- " Secondary / secondary special | \n",
- " Married | \n",
- " House / apartment | \n",
- " 0.026392 | \n",
- " -15901 | \n",
- " -130 | \n",
- " -7799.0 | \n",
- " -4449 | \n",
- " 12.0 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " Security staff | \n",
- " 2.0 | \n",
- " 2 | \n",
- " 2 | \n",
- " THURSDAY | \n",
- " 18 | \n",
- " 0 | \n",
- " 1 | \n",
- " 1 | \n",
- " 0 | \n",
- " 1 | \n",
- " 1 | \n",
- " Business Entity Type 3 | \n",
- " NaN | \n",
- " 0.712657 | \n",
- " NaN | \n",
- " 0.1031 | \n",
- " NaN | \n",
- " 0.9856 | \n",
- " NaN | \n",
- " NaN | \n",
- " 0.00 | \n",
- " 0.2069 | \n",
- " 0.1667 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 0.1050 | \n",
- " NaN | \n",
- " ... | \n",
- " 0.0000 | \n",
- " 0.2069 | \n",
- " 0.1667 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 0.1041 | \n",
- " NaN | \n",
- " 0.9856 | \n",
- " NaN | \n",
- " NaN | \n",
- " 0.00 | \n",
- " 0.2069 | \n",
- " 0.1667 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " block of flats | \n",
- " 0.0696 | \n",
- " Stone, brick | \n",
- " No | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " -1768.0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 2018-04 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 216609 | \n",
- " 0 | \n",
- " Revolving loans | \n",
- " M | \n",
- " Y | \n",
- " Y | \n",
- " 1 | \n",
- " 112500.0 | \n",
- " 180000.0 | \n",
- " 9000.0 | \n",
- " 180000.0 | \n",
- " Unaccompanied | \n",
- " State servant | \n",
- " Higher education | \n",
- " Married | \n",
- " House / apartment | \n",
- " 0.007020 | \n",
- " -10234 | \n",
- " -1993 | \n",
- " -4040.0 | \n",
- " -2913 | \n",
- " 6.0 | \n",
- " 1 | \n",
- " 1 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " NaN | \n",
- " 3.0 | \n",
- " 2 | \n",
- " 2 | \n",
- " WEDNESDAY | \n",
- " 11 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " Emergency | \n",
- " 0.405051 | \n",
- " 0.528879 | \n",
- " 0.604113 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " -429.0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 2018-03 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 123 columns
\n",
- "
"
- ],
- "text/plain": [
- " SK_ID_CURR TARGET NAME_CONTRACT_TYPE ... AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR DATE\n",
- "0 145457 0 Cash loans ... 0.0 2.0 2018-05\n",
- "1 128979 0 Cash loans ... 0.0 3.0 2018-05\n",
- "2 145448 0 Cash loans ... 0.0 5.0 2018-03\n",
- "3 294475 0 Cash loans ... 0.0 1.0 2018-04\n",
- "4 216609 0 Revolving loans ... 0.0 0.0 2018-03\n",
- "\n",
- "[5 rows x 123 columns]"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mapplication_raw\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'data/application_train.csv'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# 银行贷款数据,预测违约可能性0/1\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mapplication_raw\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)\u001b[0m\n\u001b[0;32m 700\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[0;32m 701\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 702\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 703\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 704\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 433\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 434\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 435\u001b[1;33m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 436\u001b[0m \u001b[1;32mfinally\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 437\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, nrows)\u001b[0m\n\u001b[0;32m 1137\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1138\u001b[0m \u001b[0mnrows\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_validate_integer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'nrows'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1139\u001b[1;33m \u001b[0mret\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1140\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1141\u001b[0m \u001b[1;31m# May alter columns / col_dict\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, nrows)\u001b[0m\n\u001b[0;32m 1993\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1994\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1995\u001b[1;33m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1996\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1997\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_first_chunk\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read\u001b[1;34m()\u001b[0m\n",
+ "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_low_memory\u001b[1;34m()\u001b[0m\n",
+ "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[1;34m()\u001b[0m\n",
+ "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_column_data\u001b[1;34m()\u001b[0m\n",
+ "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_tokens\u001b[1;34m()\u001b[0m\n",
+ "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_with_dtype\u001b[1;34m()\u001b[0m\n",
+ "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\core\\dtypes\\common.py\u001b[0m in \u001b[0;36mis_categorical_dtype\u001b[1;34m(arr_or_dtype)\u001b[0m\n\u001b[0;32m 570\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 571\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 572\u001b[1;33m \u001b[1;32mdef\u001b[0m \u001b[0mis_categorical_dtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marr_or_dtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 573\u001b[0m \"\"\"\n\u001b[0;32m 574\u001b[0m \u001b[0mCheck\u001b[0m \u001b[0mwhether\u001b[0m \u001b[0man\u001b[0m \u001b[0marray\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mlike\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mdtype\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mCategorical\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
+ ]
}
],
"source": [
- "application_raw = pd.read_csv('application_train.csv') # 银行贷款数据,预测违约可能性0/1\n",
+ "application_raw = pd.read_csv('data/application_train.csv') # 银行贷款数据,预测违约可能性0/1\n",
"application_raw.head()"
]
},
@@ -823,11 +60,11 @@
"# 数据预处理\n",
"def get_nonull_dummy_data(application_train_raw, dummy_drop=['ORGANIZATION_TYPE']):\n",
" # 缺失值填充\n",
- " nulls = pd.isnull(application_train_raw),sum()\n",
+ " nulls = pd.isnull(application_train_raw).sum()\n",
" less_nulls = nulls[(nulls<3075)&(nulls!=0)].index\n",
" less_nulls_float = []\n",
" for i in range(len(less_nulls)):\n",
- " if application_train_raw[less_nulls[i]].dtype!='0':\n",
+ " if application_train_raw[less_nulls[i]].dtype != 'O':\n",
" less_nulls_float.append(less_nulls[i])\n",
" \n",
" application_train_raw[less_nulls_float] = application_train_raw[less_nulls_float].fillna(application_train_raw[less_nulls_float].mean())\n",
@@ -836,7 +73,7 @@
" more_nulls = nulls[(nulls >= 3075)].index\n",
" more_nulls_float = []\n",
" for i in range(len(more_nulls)):\n",
- " if application_train_raw[more_nulls[i]].dtype!='0':\n",
+ " if application_train_raw[more_nulls[i]].dtype != 'O':\n",
" more_nulls_float.append(more_nulls[i])\n",
" \n",
" application_train_raw[more_nulls_float] = application_train_raw[more_nulls_float].fillna(application_train_raw[more_nulls_float].mean())\n",
@@ -847,16 +84,17 @@
" all_cols = application_train_raw.columns\n",
" cat_cols = []\n",
" for col in all_cols:\n",
- " if application_train_raw[col].dtype!='0':\n",
- " more_nulls_float.append(col)\n",
+ " if application_train_raw[col].dtype == 'O':\n",
+ " cat_cols.append(col)\n",
" \n",
- " application_train_raw = pd.get_dummies(application_train_raw,columns=cat_cols, axis=1, dummy_na=True)\n",
+ " application_train_raw = pd.get_dummies(application_train_raw,columns=cat_cols, dummy_na=True)\n",
" \n",
" return application_train_raw\n",
"\n",
"\n",
"def import_and_create_train_test_data(test_size=0.33, random_state=42):\n",
- " application_raw = pd.read_csv('application_train.csv') \n",
+ " # 训练和验证集制作\n",
+ " application_raw = pd.read_csv('data/application_train.csv') \n",
" application = get_nonull_dummy_data(application_raw, dummy_drop=['ORGANIZATION_TYPE'])\n",
" \n",
" X = application.drop(['TARGET'],axis=1)\n",
@@ -874,14 +112,174 @@
"\n",
"\n",
"def import_and_create_TEST_data():\n",
- " application_raw = pd.read_csv('test_data.csv') \n",
+ " # 线上测试集\n",
+ " application_raw = pd.read_csv('data/application_test.csv') \n",
" application = get_nonull_dummy_data(application_raw, dummy_drop=['ORGANIZATION_TYPE'])\n",
" \n",
" X = application\n",
" users = X[['SK_ID_CURR']]\n",
+ " users.reset_index(drop=True, inplace=True)\n",
" \n",
- " "
+ " return(X, users)\n",
+ "\n",
+ "\n",
+ "def get_imp_df(xgb_model):\n",
+ "    # Rank features by the score xgb_model.get_fscore() reports for each of them, highest first.\n",
+ "    # xgb_model: object exposing get_fscore() -> {feature name: score}. Returns a DataFrame with columns Feature, importance.\n",
+ "    fscore = xgb_model.get_fscore()  # queried once so feature names and scores stay paired\n",
+ "    # importance must come from values(); reading keys() a second time would only repeat the feature names\n",
+ "    imp = pd.DataFrame({'Feature': list(fscore.keys()), 'importance': list(fscore.values())})\n",
+ "    imp = imp.sort_values(by=['importance'], ascending=False).reset_index(drop=True)\n",
+ "    \n",
+ "    return (imp)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train.columns.values"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 训练集和验证集"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train, X_test, y_train, y_test, train_users, test_users = import_and_create_train_test_data()  # local train/validation split\n",
+ "X_TEST, TEST_users = import_and_create_TEST_data()  # features and SK_ID_CURR ids of the online test set\n",
+ "# Columns removed from both splits. Every name needs its own comma: adjacent string literals fuse into one bogus column name.\n",
+ "drop = ['CODE_GENDER_XNA', 'NAME_INCOME_TYPE_Maternity leave',\n",
+ "        'NAME_FAMILY_STATUS_Unknown', 'SK_ID_CURR']\n",
+ "X_train = X_train.drop(drop, axis=1)\n",
+ "X_test = X_test.drop(drop, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Re-index features and labels from 0 so rows pair up by position, then attach the label as column 'target'\n",
+ "data_train = X_train.reset_index(drop=True).assign(target=y_train.reset_index(drop=True))\n",
+ "# Same construction for the validation split\n",
+ "data_test = X_test.reset_index(drop=True).assign(target=y_test.reset_index(drop=True))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 训练集与验证集特征趋势分析\n",
+ "\n",
+ "\n",
+ "get_univariate_plots(data=data_train,target_col='target',\n",
+ " features_list=data_train.columns[0:10],data_test=data_test)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "grouped_train,grouped_test = univariate_plotter(data=data_train,\n",
+ " target_col='target',\n",
+ " feature='AMT_INCOME_TOTAL',\n",
+ " data_test=data_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "grouped_train # 展示bin中数据信息"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 训练模型,使用全部特征"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dtrain = xgb.DMatrix(X_train, label=y_train, missing=np.nan)  # fit on the training split (was built from X_test/y_test)\n",
+ "dtest = xgb.DMatrix(X_test, label=y_test, missing=np.nan)  # held-out validation split (was built from X_train/y_train)\n",
+ "# At most 400 boosting rounds; early stopping watches the last evals entry, so the held-out 'test' pair must stay last\n",
+ "params = {'max_depth':8,'learning_rate':0.1,'silent':0,\n",
+ "          'objective':'binary:logistic','min_child_weight':500,\n",
+ "          'eval_metric':'auc','nthread':8}\n",
+ "xgb_model = xgb.train(params, dtrain, 400, evals=[(dtrain,'train'),\n",
+ "                                                  (dtest,'test')],\n",
+ "                      early_stopping_rounds=25)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Submission: score the online test set on the same feature columns as the validation split\n",
+ "dTEST = xgb.DMatrix(X_TEST.loc[:, X_test.columns], missing=np.nan)\n",
+ "y_TEST_pred = xgb_model.predict(dTEST)\n",
+ "# One row per SK_ID_CURR with the model output stored as TARGET\n",
+ "submission_all_feats = TEST_users[['SK_ID_CURR']].assign(TARGET=y_TEST_pred)\n",
+ "submission_all_feats.to_csv('data/submission_all_feats_1.csv', index=False)"
]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 计算训练集和验证集中特征的趋势"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "stats = get_trend_stats(data=data_train,target_col='target',data_test=data_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "submission_all_feats.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {