diff --git a/机器学习竞赛实战_优胜解决方案/特征工程建模/特征工程建模.ipynb b/机器学习竞赛实战_优胜解决方案/特征工程建模/特征工程建模.ipynb new file mode 100644 index 0000000..7d7b3c0 --- /dev/null +++ b/机器学习竞赛实战_优胜解决方案/特征工程建模/特征工程建模.ipynb @@ -0,0 +1,908 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import xgboost as xgb\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import roc_auc_score\n", + "from featexp import univariate_plotter # pip install featexp\n", + "from featexp import get_univariate_plots\n", + "from featexp import get_trend_stats" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | SK_ID_CURR | \n", + "TARGET | \n", + "NAME_CONTRACT_TYPE | \n", + "CODE_GENDER | \n", + "FLAG_OWN_CAR | \n", + "FLAG_OWN_REALTY | \n", + "CNT_CHILDREN | \n", + "AMT_INCOME_TOTAL | \n", + "AMT_CREDIT | \n", + "AMT_ANNUITY | \n", + "AMT_GOODS_PRICE | \n", + "NAME_TYPE_SUITE | \n", + "NAME_INCOME_TYPE | \n", + "NAME_EDUCATION_TYPE | \n", + "NAME_FAMILY_STATUS | \n", + "NAME_HOUSING_TYPE | \n", + "REGION_POPULATION_RELATIVE | \n", + "DAYS_BIRTH | \n", + "DAYS_EMPLOYED | \n", + "DAYS_REGISTRATION | \n", + "DAYS_ID_PUBLISH | \n", + "OWN_CAR_AGE | \n", + "FLAG_MOBIL | \n", + "FLAG_EMP_PHONE | \n", + "FLAG_WORK_PHONE | \n", + "FLAG_CONT_MOBILE | \n", + "FLAG_PHONE | \n", + "FLAG_EMAIL | \n", + "OCCUPATION_TYPE | \n", + "CNT_FAM_MEMBERS | \n", + "REGION_RATING_CLIENT | \n", + "REGION_RATING_CLIENT_W_CITY | \n", + "WEEKDAY_APPR_PROCESS_START | \n", + "HOUR_APPR_PROCESS_START | \n", + "REG_REGION_NOT_LIVE_REGION | \n", + "REG_REGION_NOT_WORK_REGION | \n", + "LIVE_REGION_NOT_WORK_REGION | \n", + "REG_CITY_NOT_LIVE_CITY | \n", + "REG_CITY_NOT_WORK_CITY | \n", + "LIVE_CITY_NOT_WORK_CITY | \n", + "ORGANIZATION_TYPE | \n", + "EXT_SOURCE_1 | \n", + "EXT_SOURCE_2 | \n", + "EXT_SOURCE_3 | \n", + "APARTMENTS_AVG | \n", + "BASEMENTAREA_AVG | \n", + "YEARS_BEGINEXPLUATATION_AVG | \n", + "YEARS_BUILD_AVG | \n", + "COMMONAREA_AVG | \n", + "ELEVATORS_AVG | \n", + "ENTRANCES_AVG | \n", + "FLOORSMAX_AVG | \n", + "FLOORSMIN_AVG | \n", + "LANDAREA_AVG | \n", + "LIVINGAPARTMENTS_AVG | \n", + "LIVINGAREA_AVG | \n", + "NONLIVINGAPARTMENTS_AVG | \n", + "NONLIVINGAREA_AVG | \n", + "APARTMENTS_MODE | \n", + "BASEMENTAREA_MODE | \n", + "... | \n", + "ELEVATORS_MODE | \n", + "ENTRANCES_MODE | \n", + "FLOORSMAX_MODE | \n", + "FLOORSMIN_MODE | \n", + "LANDAREA_MODE | \n", + "LIVINGAPARTMENTS_MODE | \n", + "LIVINGAREA_MODE | \n", + "NONLIVINGAPARTMENTS_MODE | \n", + "NONLIVINGAREA_MODE | \n", + "APARTMENTS_MEDI | \n", + "BASEMENTAREA_MEDI | \n", + "YEARS_BEGINEXPLUATATION_MEDI | \n", + "YEARS_BUILD_MEDI | \n", + "COMMONAREA_MEDI | \n", + "ELEVATORS_MEDI | \n", + "ENTRANCES_MEDI | \n", + "FLOORSMAX_MEDI | \n", + "FLOORSMIN_MEDI | \n", + "LANDAREA_MEDI | \n", + "LIVINGAPARTMENTS_MEDI | \n", + "LIVINGAREA_MEDI | \n", + "NONLIVINGAPARTMENTS_MEDI | \n", + "NONLIVINGAREA_MEDI | \n", + "FONDKAPREMONT_MODE | \n", + "HOUSETYPE_MODE | \n", + "TOTALAREA_MODE | \n", + "WALLSMATERIAL_MODE | \n", + "EMERGENCYSTATE_MODE | \n", + "OBS_30_CNT_SOCIAL_CIRCLE | \n", + "DEF_30_CNT_SOCIAL_CIRCLE | \n", + "OBS_60_CNT_SOCIAL_CIRCLE | \n", + "DEF_60_CNT_SOCIAL_CIRCLE | \n", + "DAYS_LAST_PHONE_CHANGE | \n", + "FLAG_DOCUMENT_2 | \n", + "FLAG_DOCUMENT_3 | \n", + "FLAG_DOCUMENT_4 | \n", + "FLAG_DOCUMENT_5 | \n", + "FLAG_DOCUMENT_6 | \n", + "FLAG_DOCUMENT_7 | \n", + "FLAG_DOCUMENT_8 | \n", + "FLAG_DOCUMENT_9 | \n", + "FLAG_DOCUMENT_10 | \n", + "FLAG_DOCUMENT_11 | \n", + "FLAG_DOCUMENT_12 | \n", + "FLAG_DOCUMENT_13 | \n", + "FLAG_DOCUMENT_14 | \n", + "FLAG_DOCUMENT_15 | \n", + "FLAG_DOCUMENT_16 | \n", + "FLAG_DOCUMENT_17 | \n", + "FLAG_DOCUMENT_18 | \n", + "FLAG_DOCUMENT_19 | \n", + "FLAG_DOCUMENT_20 | \n", + "FLAG_DOCUMENT_21 | \n", + "AMT_REQ_CREDIT_BUREAU_HOUR | \n", + "AMT_REQ_CREDIT_BUREAU_DAY | \n", + "AMT_REQ_CREDIT_BUREAU_WEEK | \n", + "AMT_REQ_CREDIT_BUREAU_MON | \n", + "AMT_REQ_CREDIT_BUREAU_QRT | \n", + "AMT_REQ_CREDIT_BUREAU_YEAR | \n", + "DATE | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "145457 | \n", + "0 | \n", + "Cash loans | \n", + "M | \n", + "Y | \n", + "N | \n", + "0 | \n", + "288000.0 | \n", + "242595.0 | \n", + "10813.5 | \n", + "202500.0 | \n", + "Unaccompanied | \n", + "Pensioner | \n", + "Secondary / secondary special | \n", + "Married | \n", + "Municipal apartment | \n", + "0.046220 | \n", + "-22230 | \n", + "365243 | \n", + "-7689.0 | \n", + "-4096 | \n", + "5.0 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "NaN | \n", + "2.0 | \n", + "1 | \n", + "1 | \n", + "FRIDAY | \n", + "13 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "XNA | \n", + "NaN | \n", + "0.735594 | \n", + "0.413597 | \n", + "0.6113 | \n", + "0.3295 | \n", + "0.9871 | \n", + "0.8232 | \n", + "0.4761 | \n", + "0.96 | \n", + "0.4138 | \n", + "0.4583 | \n", + "0.5 | \n", + "0.142 | \n", + "0.453 | \n", + "0.6385 | \n", + "0.2085 | \n", + "0.4423 | \n", + "0.6229 | \n", + "0.342 | \n", + "... | \n", + "0.9667 | \n", + "0.4138 | \n", + "0.4583 | \n", + "0.5 | \n", + "0.1452 | \n", + "0.4949 | \n", + "0.6652 | \n", + "0.2101 | \n", + "0.4682 | \n", + "0.6173 | \n", + "0.3295 | \n", + "0.9871 | \n", + "0.8256 | \n", + "0.4791 | \n", + "0.96 | \n", + "0.4138 | \n", + "0.4583 | \n", + "0.5 | \n", + "0.1444 | \n", + "0.4609 | \n", + "0.65 | \n", + "0.2096 | \n", + "0.4516 | \n", + "org spec account | \n", + "block of flats | \n", + "0.8750 | \n", + "Stone, brick | \n", + "No | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "-1347.0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "2.0 | \n", + "2018-05 | \n", + "
1 | \n", + "128979 | \n", + "0 | \n", + "Cash loans | \n", + "F | \n", + "N | \n", + "N | \n", + "0 | \n", + "94500.0 | \n", + "646920.0 | \n", + "20866.5 | \n", + "540000.0 | \n", + "Unaccompanied | \n", + "Pensioner | \n", + "Secondary / secondary special | \n", + "Married | \n", + "House / apartment | \n", + "0.018850 | \n", + "-20599 | \n", + "365243 | \n", + "-784.0 | \n", + "-2393 | \n", + "NaN | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "NaN | \n", + "2.0 | \n", + "2 | \n", + "2 | \n", + "SUNDAY | \n", + "15 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "XNA | \n", + "NaN | \n", + "0.461944 | \n", + "0.326475 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "-1273.0 | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "3.0 | \n", + "2018-05 | \n", + "
2 | \n", + "145448 | \n", + "0 | \n", + "Cash loans | \n", + "M | \n", + "Y | \n", + "Y | \n", + "1 | \n", + "225000.0 | \n", + "183294.0 | \n", + "14823.0 | \n", + "153000.0 | \n", + "Unaccompanied | \n", + "Working | \n", + "Secondary / secondary special | \n", + "Married | \n", + "House / apartment | \n", + "0.020713 | \n", + "-11070 | \n", + "-1345 | \n", + "-3463.0 | \n", + "-3618 | \n", + "19.0 | \n", + "1 | \n", + "1 | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "Drivers | \n", + "3.0 | \n", + "3 | \n", + "3 | \n", + "SATURDAY | \n", + "9 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "Self-employed | \n", + "NaN | \n", + "0.374592 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "12.0 | \n", + "0.0 | \n", + "11.0 | \n", + "0.0 | \n", + "-1127.0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "5.0 | \n", + "2018-03 | \n", + "
3 | \n", + "294475 | \n", + "0 | \n", + "Cash loans | \n", + "M | \n", + "Y | \n", + "N | \n", + "0 | \n", + "180000.0 | \n", + "260640.0 | \n", + "20169.0 | \n", + "225000.0 | \n", + "Family | \n", + "Working | \n", + "Secondary / secondary special | \n", + "Married | \n", + "House / apartment | \n", + "0.026392 | \n", + "-15901 | \n", + "-130 | \n", + "-7799.0 | \n", + "-4449 | \n", + "12.0 | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "Security staff | \n", + "2.0 | \n", + "2 | \n", + "2 | \n", + "THURSDAY | \n", + "18 | \n", + "0 | \n", + "1 | \n", + "1 | \n", + "0 | \n", + "1 | \n", + "1 | \n", + "Business Entity Type 3 | \n", + "NaN | \n", + "0.712657 | \n", + "NaN | \n", + "0.1031 | \n", + "NaN | \n", + "0.9856 | \n", + "NaN | \n", + "NaN | \n", + "0.00 | \n", + "0.2069 | \n", + "0.1667 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "0.1050 | \n", + "NaN | \n", + "... | \n", + "0.0000 | \n", + "0.2069 | \n", + "0.1667 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "0.1041 | \n", + "NaN | \n", + "0.9856 | \n", + "NaN | \n", + "NaN | \n", + "0.00 | \n", + "0.2069 | \n", + "0.1667 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "block of flats | \n", + "0.0696 | \n", + "Stone, brick | \n", + "No | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "-1768.0 | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "0.0 | \n", + "1.0 | \n", + "2018-04 | \n", + "
4 | \n", + "216609 | \n", + "0 | \n", + "Revolving loans | \n", + "M | \n", + "Y | \n", + "Y | \n", + "1 | \n", + "112500.0 | \n", + "180000.0 | \n", + "9000.0 | \n", + "180000.0 | \n", + "Unaccompanied | \n", + "State servant | \n", + "Higher education | \n", + "Married | \n", + "House / apartment | \n", + "0.007020 | \n", + "-10234 | \n", + "-1993 | \n", + "-4040.0 | \n", + "-2913 | \n", + "6.0 | \n", + "1 | \n", + "1 | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "NaN | \n", + "3.0 | \n", + "2 | \n", + "2 | \n", + "WEDNESDAY | \n", + "11 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "Emergency | \n", + "0.405051 | \n", + "0.528879 | \n", + "0.604113 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "1.0 | \n", + "0.0 | \n", + "1.0 | \n", + "0.0 | \n", + "-429.0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "2018-03 | \n", + "
5 rows × 123 columns
\n", + "