diff --git a/机器学习竞赛实战_优胜解决方案/特征工程建模/.ipynb_checkpoints/特征工程建模-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/特征工程建模/.ipynb_checkpoints/特征工程建模-checkpoint.ipynb index 7d7b3c0..6452c75 100644 --- a/机器学习竞赛实战_优胜解决方案/特征工程建模/.ipynb_checkpoints/特征工程建模-checkpoint.ipynb +++ b/机器学习竞赛实战_优胜解决方案/特征工程建模/.ipynb_checkpoints/特征工程建模-checkpoint.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -20,797 +20,34 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SK_ID_CURRTARGETNAME_CONTRACT_TYPECODE_GENDERFLAG_OWN_CARFLAG_OWN_REALTYCNT_CHILDRENAMT_INCOME_TOTALAMT_CREDITAMT_ANNUITYAMT_GOODS_PRICENAME_TYPE_SUITENAME_INCOME_TYPENAME_EDUCATION_TYPENAME_FAMILY_STATUSNAME_HOUSING_TYPEREGION_POPULATION_RELATIVEDAYS_BIRTHDAYS_EMPLOYEDDAYS_REGISTRATIONDAYS_ID_PUBLISHOWN_CAR_AGEFLAG_MOBILFLAG_EMP_PHONEFLAG_WORK_PHONEFLAG_CONT_MOBILEFLAG_PHONEFLAG_EMAILOCCUPATION_TYPECNT_FAM_MEMBERSREGION_RATING_CLIENTREGION_RATING_CLIENT_W_CITYWEEKDAY_APPR_PROCESS_STARTHOUR_APPR_PROCESS_STARTREG_REGION_NOT_LIVE_REGIONREG_REGION_NOT_WORK_REGIONLIVE_REGION_NOT_WORK_REGIONREG_CITY_NOT_LIVE_CITYREG_CITY_NOT_WORK_CITYLIVE_CITY_NOT_WORK_CITYORGANIZATION_TYPEEXT_SOURCE_1EXT_SOURCE_2EXT_SOURCE_3APARTMENTS_AVGBASEMENTAREA_AVGYEARS_BEGINEXPLUATATION_AVGYEARS_BUILD_AVGCOMMONAREA_AVGELEVATORS_AVGENTRANCES_AVGFLOORSMAX_AVGFLOORSMIN_AVGLANDAREA_AVGLIVINGAPARTMENTS_AVGLIVINGAREA_AVGNONLIVINGAPARTMENTS_AVGNONLIVINGAREA_AVGAPARTMENTS_MODEBASEMENTAREA_MODE...ELEVATORS_MODEENTRANCES_MODEFLOORSMAX_MODEFLOORSMIN_MODELANDAREA_MODELIVINGAPARTMENTS_MODELIVINGAREA_MODENONLIVINGAPARTMENTS_MODENONLIVINGAREA_MODEAPARTMENTS_MEDIBASEMENTAREA_MEDIYEARS_BEGINEXPLUATATION_MEDIYEARS_BUILD_MEDICOMMONAREA_MEDIELEVATORS_MEDIENTRANCES_MEDIFLOORSMAX_MEDIFLOORSMIN_MEDILANDAREA_MEDILIVINGAPARTMENTS_MEDILIVINGAREA_MEDINONLIVINGAPARTMENTS_MEDINONLIVINGAREA_MEDIFONDKAPREMONT_MODEHOUSETYPE_MODETOTALAREA_MODEWALLSMATERIAL_MODEEMERGENCYSTATE_MODEOBS_30_CNT_SOCIAL_CIRCLEDEF_30_CNT_SOCIAL_CIRCLEOBS_60_CNT_SOCIAL_CIRCLEDEF_60_CNT_SOCIAL_CIRCLEDAYS_LAST_PHONE_CHANGEFLAG_DOCUMENT_2FLAG_DOCUMENT_3FLAG_DOCUMENT_4FLAG_DOCUMENT_5FLAG_DOCUMENT_6FLAG_DOCUMENT_7FLAG_DOCUMENT_8FLAG_DOCUMENT_9FLAG_DOCUMENT_10FLAG_DOCUMENT_11FLAG_DOCUMENT_12FLAG_DOCUMENT_13FLAG_DOCUMENT_14FLAG_DOCUMENT_15FLAG_DOCUMENT_16FLAG_DOCUMENT_17FLAG_DOCUMENT_18FLAG_DOCUMENT_19FLAG_DOCUMENT_20FLAG_DOCUMENT_21AMT_REQ_CREDIT_BUREAU_HOURAMT_REQ_CREDIT_BUREAU_DAYAMT_REQ_CREDIT_BUREAU_WEEKAMT_REQ_CREDIT_BUREAU_MONAMT_REQ_CREDIT_BUREAU_QRTAMT_REQ_CREDIT_BUREAU_YEARDATE
01454570Cash loansMYN0288000.0242595.010813.5202500.0UnaccompaniedPensionerSecondary / secondary specialMarriedMunicipal apartment0.046220-22230365243-7689.0-40965.0100100NaN2.011FRIDAY13000000XNANaN0.7355940.4135970.61130.32950.98710.82320.47610.960.41380.45830.50.1420.4530.63850.20850.44230.62290.342...0.96670.41380.45830.50.14520.49490.66520.21010.46820.61730.32950.98710.82560.47910.960.41380.45830.50.14440.46090.650.20960.4516org spec accountblock of flats0.8750Stone, brickNo0.00.00.00.0-1347.0000010000000000000000.00.00.00.00.02.02018-05
11289790Cash loansFNN094500.0646920.020866.5540000.0UnaccompaniedPensionerSecondary / secondary specialMarriedHouse / apartment0.018850-20599365243-784.0-2393NaN100100NaN2.022SUNDAY15000000XNANaN0.4619440.326475NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0.00.00.00.0-1273.0010000000000000000000.00.00.00.00.03.02018-05
21454480Cash loansMYY1225000.0183294.014823.0153000.0UnaccompaniedWorkingSecondary / secondary specialMarriedHouse / apartment0.020713-11070-1345-3463.0-361819.0110100Drivers3.033SATURDAY9000111Self-employedNaN0.374592NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN12.00.011.00.0-1127.0000000100000000000000.00.00.00.00.05.02018-03
32944750Cash loansMYN0180000.0260640.020169.0225000.0FamilyWorkingSecondary / secondary specialMarriedHouse / apartment0.026392-15901-130-7799.0-444912.0111100Security staff2.022THURSDAY18011011Business Entity Type 3NaN0.712657NaN0.1031NaN0.9856NaNNaN0.000.20690.1667NaNNaNNaNNaNNaNNaN0.1050NaN...0.00000.20690.1667NaNNaNNaNNaNNaNNaN0.1041NaN0.9856NaNNaN0.000.20690.1667NaNNaNNaNNaNNaNNaNNaNblock of flats0.0696Stone, brickNo0.00.00.00.0-1768.0010000000000000000000.00.00.01.00.01.02018-04
42166090Revolving loansMYY1112500.0180000.09000.0180000.0UnaccompaniedState servantHigher educationMarriedHouse / apartment0.007020-10234-1993-4040.0-29136.0110100NaN3.022WEDNESDAY11000000Emergency0.4050510.5288790.604113NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN1.00.01.00.0-429.0000000000000000000000.00.01.00.00.00.02018-03
\n", - "

5 rows × 123 columns

\n", - "
" - ], - "text/plain": [ - " SK_ID_CURR TARGET NAME_CONTRACT_TYPE ... AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR DATE\n", - "0 145457 0 Cash loans ... 0.0 2.0 2018-05\n", - "1 128979 0 Cash loans ... 0.0 3.0 2018-05\n", - "2 145448 0 Cash loans ... 0.0 5.0 2018-03\n", - "3 294475 0 Cash loans ... 0.0 1.0 2018-04\n", - "4 216609 0 Revolving loans ... 0.0 0.0 2018-03\n", - "\n", - "[5 rows x 123 columns]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mapplication_raw\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'data/application_train.csv'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# 银行贷款数据,预测违约可能性0/1\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mapplication_raw\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)\u001b[0m\n\u001b[0;32m 700\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[0;32m 701\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 702\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 703\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 704\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 433\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 434\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 435\u001b[1;33m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 436\u001b[0m \u001b[1;32mfinally\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 437\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, nrows)\u001b[0m\n\u001b[0;32m 1137\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1138\u001b[0m \u001b[0mnrows\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_validate_integer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'nrows'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1139\u001b[1;33m \u001b[0mret\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1140\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1141\u001b[0m \u001b[1;31m# May alter columns / col_dict\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, nrows)\u001b[0m\n\u001b[0;32m 1993\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1994\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1995\u001b[1;33m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1996\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1997\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_first_chunk\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read\u001b[1;34m()\u001b[0m\n", + "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_low_memory\u001b[1;34m()\u001b[0m\n", + "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[1;34m()\u001b[0m\n", + "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_column_data\u001b[1;34m()\u001b[0m\n", + "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_tokens\u001b[1;34m()\u001b[0m\n", + "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_with_dtype\u001b[1;34m()\u001b[0m\n", + "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\core\\dtypes\\common.py\u001b[0m in \u001b[0;36mis_categorical_dtype\u001b[1;34m(arr_or_dtype)\u001b[0m\n\u001b[0;32m 570\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 571\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 572\u001b[1;33m \u001b[1;32mdef\u001b[0m \u001b[0mis_categorical_dtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marr_or_dtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 573\u001b[0m \"\"\"\n\u001b[0;32m 574\u001b[0m \u001b[0mCheck\u001b[0m \u001b[0mwhether\u001b[0m \u001b[0man\u001b[0m \u001b[0marray\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mlike\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mdtype\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mCategorical\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] } ], "source": [ - "application_raw = pd.read_csv('application_train.csv') # 银行贷款数据,预测违约可能性0/1\n", + "application_raw = pd.read_csv('data/application_train.csv') # 银行贷款数据,预测违约可能性0/1\n", "application_raw.head()" ] }, @@ -823,11 +60,11 @@ "# 数据预处理\n", "def get_nonull_dummy_data(application_train_raw, dummy_drop=['ORGANIZATION_TYPE']):\n", " # 缺失值填充\n", - " nulls = pd.isnull(application_train_raw),sum()\n", + " nulls = pd.isnull(application_train_raw).sum()\n", " less_nulls = nulls[(nulls<3075)&(nulls!=0)].index\n", " less_nulls_float = []\n", " for i in range(len(less_nulls)):\n", - " if application_train_raw[less_nulls[i]].dtype!='0':\n", + " if application_train_raw[less_nulls[i]].dtype != 'O':\n", " less_nulls_float.append(less_nulls[i])\n", " \n", " application_train_raw[less_nulls_float] = application_train_raw[less_nulls_float].fillna(application_train_raw[less_nulls_float].mean())\n", @@ -836,7 +73,7 @@ " more_nulls = nulls[(nulls >= 3075)].index\n", " more_nulls_float = []\n", " for i in range(len(more_nulls)):\n", - " if application_train_raw[more_nulls[i]].dtype!='0':\n", + " if application_train_raw[more_nulls[i]].dtype != 'O':\n", " more_nulls_float.append(more_nulls[i])\n", " \n", " application_train_raw[more_nulls_float] = application_train_raw[more_nulls_float].fillna(application_train_raw[more_nulls_float].mean())\n", @@ -847,16 +84,17 @@ " all_cols = application_train_raw.columns\n", " cat_cols = []\n", " for col in all_cols:\n", - " if application_train_raw[col].dtype!='0':\n", - " more_nulls_float.append(col)\n", + " if application_train_raw[col].dtype == 'O':\n", + " cat_cols.append(col)\n", " \n", - " application_train_raw = pd.get_dummies(application_train_raw,columns=cat_cols, axis=1, dummy_na=True)\n", + " application_train_raw = pd.get_dummies(application_train_raw,columns=cat_cols, dummy_na=True)\n", " \n", " return application_train_raw\n", "\n", "\n", "def import_and_create_train_test_data(test_size=0.33, random_state=42):\n", - " application_raw = pd.read_csv('application_train.csv') \n", + " # 训练和验证集制作\n", + " application_raw = pd.read_csv('data/application_train.csv') \n", " application = get_nonull_dummy_data(application_raw, dummy_drop=['ORGANIZATION_TYPE'])\n", " \n", " X = application.drop(['TARGET'],axis=1)\n", @@ -874,14 +112,174 @@ "\n", "\n", "def import_and_create_TEST_data():\n", - " application_raw = pd.read_csv('test_data.csv') \n", + " # 线上测试集\n", + " application_raw = pd.read_csv('data/application_test.csv') \n", " application = get_nonull_dummy_data(application_raw, dummy_drop=['ORGANIZATION_TYPE'])\n", " \n", " X = application\n", " users = X[['SK_ID_CURR']]\n", + " users.reset_index(drop=True, inplace=True)\n", " \n", - " " + " return(X, users)\n", + "\n", + "\n", + "def get_imp_df(xgb_model):\n", + "# 获取特征重要性\n", + " imp = pd.DataFrame(np.asarray(list(xgb_model.get_fscore().keys())))\n", + " imp.columns = ['Feature']\n", + " imp['importance'] = np.asarray(list(xgb_model.get_fscore().keys()))\n", + " imp = imp.sort_values(by=['importance'], ascending=False)\n", + " imp = imp.reset_index(drop=True)\n", + " \n", + " return (imp)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train.columns.values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 训练集和验证集" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train,y_test,train_users,test_users = import_and_create_train_test_data()\n", + "X_TEST,TEST_users = import_and_create_TEST_data()\n", + "\n", + "drop=['CODE_GENDER_XNA''NAME_INCOME_TYPE_Maternity leave',\n", + " 'NAME_FAMILY_STATUS_Unknown','SK_ID_CURR']\n", + "X_train = X_train.drop(drop, axis=1)\n", + "X_test = X_test.drop(drop, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_train = X_train.reset_index(drop=True)\n", + "data_train['target'] = y_train.reset_index(drop=True)\n", + "data_test = X_test.reset_index(drop=True)\n", + "data_test['target'] = y_test.reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 训练集与验证集特征趋势分析\n", + "\n", + "\n", + "get_univariate_plots(data=data_train,target_col='target',\n", + " features_list=data_train.columns[0:10],data_test=data_test)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grouped_train,grouped_test = univariate_plotter(data=data_train,\n", + " target_col='target',\n", + " feature='AMT_INCOME_TOTAL',\n", + " data_test=data_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grouped_train # 展示bin中数据信息" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 训练模型,使用全部特征" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dtrain = xgb.DMatrix(X_test,label=y_test,missing=np.nan)\n", + "dtest = xgb.DMatrix(X_train,label=y_train,missing=np.nan)\n", + "\n", + "params = {'max_depth':8,'learning_rate':0.1,'silent':0,\n", + " 'objective':'binary:logistic','min_child_weight':500,\n", + " 'eval_metric':'auc','nthread':8}\n", + "xgb_model = xgb.train(params, dtrain, 400, evals=[(dtrain,'train'),\n", + " (dtest,'test')],\n", + " early_stopping_rounds=25)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 提交结果\n", + "dTEST = xgb.DMatrix(X_TEST[X_test.columns], missing=np.nan)\n", + "y_TEST_pred = xgb_model.predict(dTEST)\n", + "submission_all_feats = pd.DataFrame({'SK_ID_CURR':TEST_users['SK_ID_CURR'],\n", + " 'TARGET':y_TEST_pred})\n", + "submission_all_feats.to_csv('data/submission_all_feats_1.csv',index=False)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 计算训练集和验证集中特征的趋势" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stats = get_trend_stats(data=data_train,target_col='target',data_test=data_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "submission_all_feats.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {