From fba1e1357cf0c7172b9e996fc5f35522ab6e1762 Mon Sep 17 00:00:00 2001
From: benjas <909336740@qq.com>
Date: Wed, 1 Sep 2021 15:39:43 +0800
Subject: [PATCH] Add training and testing of the simple model
---
...mated feature engineering-checkpoint.ipynb | 1169 +++++++++++------
.../Automated feature engineering.ipynb | 1095 ++++++++++-----
2 files changed, 1590 insertions(+), 674 deletions(-)
diff --git a/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb b/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb
index 8428481..488f3e0 100644
--- a/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb
+++ b/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb
@@ -13,6 +13,7 @@
"id": "66dfb30d",
"metadata": {},
"source": [
+ "### 结论:效果一般\n",
"搬运参考:https://www.kaggle.com/liananapalkova/automated-feature-engineering-for-titanic-dataset"
]
},
@@ -99,13 +100,14 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 19,
"id": "43cc9a46",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import time\n",
+ "import gc\n",
"\n",
"import featuretools as ft\n",
"from featuretools.primitives import *\n",
@@ -115,9 +117,6 @@
"# 导入相关模型,没有的pip install xxx 即可\n",
"\n",
"from sklearn.ensemble import RandomForestClassifier\n",
- "from sklearn.linear_model import LogisticRegression\n",
- "import xgboost as xgb \n",
- "import lightgbm as lgb \n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score\n",
@@ -127,7 +126,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 2,
"id": "4c17c0bc",
"metadata": {},
"outputs": [],
@@ -138,7 +137,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 3,
"id": "bcce5a3d",
"metadata": {},
"outputs": [
@@ -168,7 +167,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 4,
"id": "4afeeca5",
"metadata": {},
"outputs": [
@@ -292,7 +291,7 @@
"1 0.0 0.0 0.0 "
]
},
- "execution_count": 17,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -306,7 +305,70 @@
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 5,
+ "id": "af6722f2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " Cover_Type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index Cover_Type\n",
+ "0 0 4.0\n",
+ "1 1 4.0"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y = pd.DataFrame(y, columns=data.target_names)\n",
+ "y = y.reset_index()\n",
+ "y.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
"id": "2d34ab5c",
"metadata": {},
"outputs": [
@@ -350,7 +412,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 7,
"id": "1551c241",
"metadata": {},
"outputs": [
@@ -407,10 +469,18 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 8,
"id": "06f24545",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Object `es.entity_from_dataframe` not found.\n"
+ ]
+ }
+ ],
"source": [
"es.entity_from_dataframe?"
]
@@ -433,7 +503,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 9,
"id": "f2c69a94",
"metadata": {},
"outputs": [
@@ -447,7 +517,7 @@
" No relationships"
]
},
- "execution_count": 18,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -479,7 +549,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 10,
"id": "770130bc",
"metadata": {
"scrolled": false
@@ -512,7 +582,7 @@
" X.Soil_Type_4 -> Soil_Type_4.Soil_Type_4"
]
},
- "execution_count": 19,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -532,7 +602,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 11,
"id": "352fa085",
"metadata": {
"scrolled": true
@@ -571,126 +641,116 @@
" \n",
" \n",
" 0 | \n",
- " all | \n",
+ " sum | \n",
" aggregation | \n",
" True | \n",
- " False | \n",
- " Calculates if all values are 'True' in a list. | \n",
- " Boolean | \n",
- " Boolean | \n",
+ " True | \n",
+ " Calculates the total addition, ignoring `NaN`. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 1 | \n",
- " skew | \n",
+ " first | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Computes the extent to which a distribution differs from a normal distribution. | \n",
- " Numeric | \n",
- " Numeric | \n",
+ " Determines the first value in a list. | \n",
+ " Variable | \n",
+ " None | \n",
"
\n",
" \n",
" 2 | \n",
- " percent_true | \n",
+ " last | \n",
" aggregation | \n",
- " True | \n",
" False | \n",
- " Determines the percent of `True` values. | \n",
- " Boolean | \n",
- " Numeric | \n",
+ " False | \n",
+ " Determines the last value in a list. | \n",
+ " Variable | \n",
+ " None | \n",
"
\n",
" \n",
" 3 | \n",
- " count | \n",
+ " trend | \n",
" aggregation | \n",
- " True | \n",
- " True | \n",
- " Determines the total number of values, excluding `NaN`. | \n",
- " Index | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the trend of a variable over time. | \n",
+ " DatetimeTimeIndex, Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
" 4 | \n",
- " num_unique | \n",
+ " n_most_common | \n",
" aggregation | \n",
- " True | \n",
- " True | \n",
- " Determines the number of distinct values, ignoring `NaN` values. | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the `n` most common elements. | \n",
+ " Discrete | \n",
" Discrete | \n",
- " Numeric | \n",
"
\n",
" \n",
" 5 | \n",
- " first | \n",
+ " time_since_last | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Determines the first value in a list. | \n",
- " Variable | \n",
- " None | \n",
+ " Calculates the time elapsed since the last datetime (default in seconds). | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 6 | \n",
- " mode | \n",
+ " std | \n",
" aggregation | \n",
- " False | \n",
- " False | \n",
- " Determines the most commonly repeated value. | \n",
- " Discrete | \n",
- " None | \n",
+ " True | \n",
+ " True | \n",
+ " Computes the dispersion relative to the mean value, ignoring `NaN`. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 7 | \n",
- " entropy | \n",
+ " median | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Calculates the entropy for a categorical variable | \n",
- " Categorical | \n",
+ " Determines the middlemost number in a list of values. | \n",
+ " Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
" 8 | \n",
- " time_since_last | \n",
+ " count | \n",
" aggregation | \n",
- " False | \n",
- " False | \n",
- " Calculates the time elapsed since the last datetime (default in seconds). | \n",
- " DatetimeTimeIndex | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the total number of values, excluding `NaN`. | \n",
+ " Index | \n",
" Numeric | \n",
"
\n",
" \n",
" 9 | \n",
- " any | \n",
+ " percent_true | \n",
" aggregation | \n",
" True | \n",
" False | \n",
- " Determines if any value is 'True' in a list. | \n",
- " Boolean | \n",
+ " Determines the percent of `True` values. | \n",
" Boolean | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 10 | \n",
- " last | \n",
- " aggregation | \n",
- " False | \n",
- " False | \n",
- " Determines the last value in a list. | \n",
- " Variable | \n",
- " None | \n",
- "
\n",
- " \n",
- " 11 | \n",
- " avg_time_between | \n",
+ " time_since_first | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Computes the average number of seconds between consecutive events. | \n",
+ " Calculates the time elapsed since the first datetime (in seconds). | \n",
" DatetimeTimeIndex | \n",
" Numeric | \n",
"
\n",
" \n",
- " 12 | \n",
+ " 11 | \n",
" max | \n",
" aggregation | \n",
" True | \n",
@@ -700,93 +760,103 @@
" Numeric | \n",
"
\n",
" \n",
+ " 12 | \n",
+ " any | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Determines if any value is 'True' in a list. | \n",
+ " Boolean | \n",
+ " Boolean | \n",
+ "
\n",
+ " \n",
" 13 | \n",
- " median | \n",
+ " mode | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Determines the middlemost number in a list of values. | \n",
- " Numeric | \n",
- " Numeric | \n",
+ " Determines the most commonly repeated value. | \n",
+ " Discrete | \n",
+ " None | \n",
"
\n",
" \n",
" 14 | \n",
- " mean | \n",
+ " entropy | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the entropy for a categorical variable | \n",
+ " Categorical | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " min | \n",
" aggregation | \n",
" True | \n",
" True | \n",
- " Computes the average for a list of values. | \n",
+ " Calculates the smallest value, ignoring `NaN` values. | \n",
" Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
- " 15 | \n",
- " num_true | \n",
+ " 16 | \n",
+ " all | \n",
" aggregation | \n",
" True | \n",
" False | \n",
- " Counts the number of `True` values. | \n",
+ " Calculates if all values are 'True' in a list. | \n",
+ " Boolean | \n",
" Boolean | \n",
- " Numeric | \n",
"
\n",
" \n",
- " 16 | \n",
- " min | \n",
+ " 17 | \n",
+ " skew | \n",
" aggregation | \n",
- " True | \n",
- " True | \n",
- " Calculates the smallest value, ignoring `NaN` values. | \n",
+ " False | \n",
+ " False | \n",
+ " Computes the extent to which a distribution differs from a normal distribution. | \n",
" Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
- " 17 | \n",
- " sum | \n",
+ " 18 | \n",
+ " mean | \n",
" aggregation | \n",
" True | \n",
" True | \n",
- " Calculates the total addition, ignoring `NaN`. | \n",
+ " Computes the average for a list of values. | \n",
" Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
- " 18 | \n",
- " trend | \n",
+ " 19 | \n",
+ " avg_time_between | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Calculates the trend of a variable over time. | \n",
- " Numeric, DatetimeTimeIndex | \n",
+ " Computes the average number of seconds between consecutive events. | \n",
+ " DatetimeTimeIndex | \n",
" Numeric | \n",
"
\n",
" \n",
- " 19 | \n",
- " n_most_common | \n",
- " aggregation | \n",
- " False | \n",
- " False | \n",
- " Determines the `n` most common elements. | \n",
- " Discrete | \n",
- " Discrete | \n",
- "
\n",
- " \n",
" 20 | \n",
- " time_since_first | \n",
+ " num_unique | \n",
" aggregation | \n",
- " False | \n",
- " False | \n",
- " Calculates the time elapsed since the first datetime (in seconds). | \n",
- " DatetimeTimeIndex | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the number of distinct values, ignoring `NaN` values. | \n",
+ " Discrete | \n",
" Numeric | \n",
"
\n",
" \n",
" 21 | \n",
- " std | \n",
+ " num_true | \n",
" aggregation | \n",
" True | \n",
- " True | \n",
- " Computes the dispersion relative to the mean value, ignoring `NaN`. | \n",
- " Numeric | \n",
+ " False | \n",
+ " Counts the number of `True` values. | \n",
+ " Boolean | \n",
" Numeric | \n",
"
\n",
" \n",
@@ -795,79 +865,79 @@
],
"text/plain": [
" name type dask_compatible koalas_compatible \\\n",
- "0 all aggregation True False \n",
- "1 skew aggregation False False \n",
- "2 percent_true aggregation True False \n",
- "3 count aggregation True True \n",
- "4 num_unique aggregation True True \n",
- "5 first aggregation False False \n",
- "6 mode aggregation False False \n",
- "7 entropy aggregation False False \n",
- "8 time_since_last aggregation False False \n",
- "9 any aggregation True False \n",
- "10 last aggregation False False \n",
- "11 avg_time_between aggregation False False \n",
- "12 max aggregation True True \n",
- "13 median aggregation False False \n",
- "14 mean aggregation True True \n",
- "15 num_true aggregation True False \n",
- "16 min aggregation True True \n",
- "17 sum aggregation True True \n",
- "18 trend aggregation False False \n",
- "19 n_most_common aggregation False False \n",
- "20 time_since_first aggregation False False \n",
- "21 std aggregation True True \n",
+ "0 sum aggregation True True \n",
+ "1 first aggregation False False \n",
+ "2 last aggregation False False \n",
+ "3 trend aggregation False False \n",
+ "4 n_most_common aggregation False False \n",
+ "5 time_since_last aggregation False False \n",
+ "6 std aggregation True True \n",
+ "7 median aggregation False False \n",
+ "8 count aggregation True True \n",
+ "9 percent_true aggregation True False \n",
+ "10 time_since_first aggregation False False \n",
+ "11 max aggregation True True \n",
+ "12 any aggregation True False \n",
+ "13 mode aggregation False False \n",
+ "14 entropy aggregation False False \n",
+ "15 min aggregation True True \n",
+ "16 all aggregation True False \n",
+ "17 skew aggregation False False \n",
+ "18 mean aggregation True True \n",
+ "19 avg_time_between aggregation False False \n",
+ "20 num_unique aggregation True True \n",
+ "21 num_true aggregation True False \n",
"\n",
" description \\\n",
- "0 Calculates if all values are 'True' in a list. \n",
- "1 Computes the extent to which a distribution differs from a normal distribution. \n",
- "2 Determines the percent of `True` values. \n",
- "3 Determines the total number of values, excluding `NaN`. \n",
- "4 Determines the number of distinct values, ignoring `NaN` values. \n",
- "5 Determines the first value in a list. \n",
- "6 Determines the most commonly repeated value. \n",
- "7 Calculates the entropy for a categorical variable \n",
- "8 Calculates the time elapsed since the last datetime (default in seconds). \n",
- "9 Determines if any value is 'True' in a list. \n",
- "10 Determines the last value in a list. \n",
- "11 Computes the average number of seconds between consecutive events. \n",
- "12 Calculates the highest value, ignoring `NaN` values. \n",
- "13 Determines the middlemost number in a list of values. \n",
- "14 Computes the average for a list of values. \n",
- "15 Counts the number of `True` values. \n",
- "16 Calculates the smallest value, ignoring `NaN` values. \n",
- "17 Calculates the total addition, ignoring `NaN`. \n",
- "18 Calculates the trend of a variable over time. \n",
- "19 Determines the `n` most common elements. \n",
- "20 Calculates the time elapsed since the first datetime (in seconds). \n",
- "21 Computes the dispersion relative to the mean value, ignoring `NaN`. \n",
+ "0 Calculates the total addition, ignoring `NaN`. \n",
+ "1 Determines the first value in a list. \n",
+ "2 Determines the last value in a list. \n",
+ "3 Calculates the trend of a variable over time. \n",
+ "4 Determines the `n` most common elements. \n",
+ "5 Calculates the time elapsed since the last datetime (default in seconds). \n",
+ "6 Computes the dispersion relative to the mean value, ignoring `NaN`. \n",
+ "7 Determines the middlemost number in a list of values. \n",
+ "8 Determines the total number of values, excluding `NaN`. \n",
+ "9 Determines the percent of `True` values. \n",
+ "10 Calculates the time elapsed since the first datetime (in seconds). \n",
+ "11 Calculates the highest value, ignoring `NaN` values. \n",
+ "12 Determines if any value is 'True' in a list. \n",
+ "13 Determines the most commonly repeated value. \n",
+ "14 Calculates the entropy for a categorical variable \n",
+ "15 Calculates the smallest value, ignoring `NaN` values. \n",
+ "16 Calculates if all values are 'True' in a list. \n",
+ "17 Computes the extent to which a distribution differs from a normal distribution. \n",
+ "18 Computes the average for a list of values. \n",
+ "19 Computes the average number of seconds between consecutive events. \n",
+ "20 Determines the number of distinct values, ignoring `NaN` values. \n",
+ "21 Counts the number of `True` values. \n",
"\n",
" valid_inputs return_type \n",
- "0 Boolean Boolean \n",
- "1 Numeric Numeric \n",
- "2 Boolean Numeric \n",
- "3 Index Numeric \n",
- "4 Discrete Numeric \n",
- "5 Variable None \n",
- "6 Discrete None \n",
- "7 Categorical Numeric \n",
- "8 DatetimeTimeIndex Numeric \n",
- "9 Boolean Boolean \n",
- "10 Variable None \n",
- "11 DatetimeTimeIndex Numeric \n",
- "12 Numeric Numeric \n",
- "13 Numeric Numeric \n",
- "14 Numeric Numeric \n",
- "15 Boolean Numeric \n",
- "16 Numeric Numeric \n",
+ "0 Numeric Numeric \n",
+ "1 Variable None \n",
+ "2 Variable None \n",
+ "3 DatetimeTimeIndex, Numeric Numeric \n",
+ "4 Discrete Discrete \n",
+ "5 DatetimeTimeIndex Numeric \n",
+ "6 Numeric Numeric \n",
+ "7 Numeric Numeric \n",
+ "8 Index Numeric \n",
+ "9 Boolean Numeric \n",
+ "10 DatetimeTimeIndex Numeric \n",
+ "11 Numeric Numeric \n",
+ "12 Boolean Boolean \n",
+ "13 Discrete None \n",
+ "14 Categorical Numeric \n",
+ "15 Numeric Numeric \n",
+ "16 Boolean Boolean \n",
"17 Numeric Numeric \n",
- "18 Numeric, DatetimeTimeIndex Numeric \n",
- "19 Discrete Discrete \n",
- "20 DatetimeTimeIndex Numeric \n",
- "21 Numeric Numeric "
+ "18 Numeric Numeric \n",
+ "19 DatetimeTimeIndex Numeric \n",
+ "20 Discrete Numeric \n",
+ "21 Boolean Numeric "
]
},
- "execution_count": 20,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -880,7 +950,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 12,
"id": "7762885f",
"metadata": {},
"outputs": [
@@ -917,52 +987,52 @@
" \n",
" \n",
" 22 | \n",
- " url_to_domain | \n",
+ " greater_than | \n",
" transform | \n",
+ " True | \n",
" False | \n",
- " False | \n",
- " Determines the domain of a url. | \n",
- " URL | \n",
- " Categorical | \n",
+ " Determines if values in one list are greater than another list. | \n",
+ " Ordinal, Datetime, Numeric | \n",
+ " Boolean | \n",
"
\n",
" \n",
" 23 | \n",
- " cum_mean | \n",
+ " less_than | \n",
" transform | \n",
- " False | \n",
- " False | \n",
- " Calculates the cumulative mean. | \n",
- " Numeric | \n",
- " Numeric | \n",
+ " True | \n",
+ " True | \n",
+ " Determines if values in one list are less than another list. | \n",
+ " Ordinal, Datetime, Numeric | \n",
+ " Boolean | \n",
"
\n",
" \n",
" 24 | \n",
- " minute | \n",
+ " and | \n",
" transform | \n",
" True | \n",
" True | \n",
- " Determines the minutes value of a datetime. | \n",
- " Datetime | \n",
- " Numeric | \n",
+ " Element-wise logical AND of two lists. | \n",
+ " Boolean | \n",
+ " Boolean | \n",
"
\n",
" \n",
" 25 | \n",
- " cum_max | \n",
+ " less_than_scalar | \n",
" transform | \n",
- " False | \n",
- " False | \n",
- " Calculates the cumulative maximum. | \n",
- " Numeric | \n",
- " Numeric | \n",
+ " True | \n",
+ " True | \n",
+ " Determines if values are less than a given scalar. | \n",
+ " Ordinal, Datetime, Numeric | \n",
+ " Boolean | \n",
"
\n",
" \n",
" 26 | \n",
- " age | \n",
+ " modulo_numeric | \n",
" transform | \n",
" True | \n",
- " False | \n",
- " Calculates the age in years as a floating point number given a | \n",
- " DateOfBirth | \n",
+ " True | \n",
+ " Element-wise modulo of two lists. | \n",
+ " Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
@@ -977,52 +1047,52 @@
"
\n",
" \n",
" 79 | \n",
- " greater_than_scalar | \n",
+ " is_weekend | \n",
" transform | \n",
" True | \n",
" True | \n",
- " Determines if values are greater than a given scalar. | \n",
- " Numeric, Datetime, Ordinal | \n",
+ " Determines if a date falls on a weekend. | \n",
+ " Datetime | \n",
" Boolean | \n",
"
\n",
" \n",
" 80 | \n",
- " url_to_protocol | \n",
+ " num_characters | \n",
" transform | \n",
- " False | \n",
- " False | \n",
- " Determines the protocol (http or https) of a url. | \n",
- " URL | \n",
- " Categorical | \n",
+ " True | \n",
+ " True | \n",
+ " Calculates the number of characters in a string. | \n",
+ " NaturalLanguage | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 81 | \n",
- " month | \n",
+ " latitude | \n",
" transform | \n",
- " True | \n",
- " True | \n",
- " Determines the month value of a datetime. | \n",
- " Datetime | \n",
- " Ordinal | \n",
+ " False | \n",
+ " False | \n",
+ " Returns the first tuple value in a list of LatLong tuples. | \n",
+ " LatLong | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 82 | \n",
- " divide_numeric_scalar | \n",
+ " cum_sum | \n",
" transform | \n",
- " True | \n",
- " True | \n",
- " Divide each element in the list by a scalar. | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the cumulative sum. | \n",
" Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
" 83 | \n",
- " time_since_previous | \n",
+ " subtract_numeric_scalar | \n",
" transform | \n",
- " False | \n",
- " False | \n",
- " Compute the time since the previous entry in a list. | \n",
- " DatetimeTimeIndex | \n",
+ " True | \n",
+ " True | \n",
+ " Subtract a scalar from each element in the list. | \n",
+ " Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
@@ -1031,49 +1101,49 @@
""
],
"text/plain": [
- " name type dask_compatible koalas_compatible \\\n",
- "22 url_to_domain transform False False \n",
- "23 cum_mean transform False False \n",
- "24 minute transform True True \n",
- "25 cum_max transform False False \n",
- "26 age transform True False \n",
- ".. ... ... ... ... \n",
- "79 greater_than_scalar transform True True \n",
- "80 url_to_protocol transform False False \n",
- "81 month transform True True \n",
- "82 divide_numeric_scalar transform True True \n",
- "83 time_since_previous transform False False \n",
+ " name type dask_compatible koalas_compatible \\\n",
+ "22 greater_than transform True False \n",
+ "23 less_than transform True True \n",
+ "24 and transform True True \n",
+ "25 less_than_scalar transform True True \n",
+ "26 modulo_numeric transform True True \n",
+ ".. ... ... ... ... \n",
+ "79 is_weekend transform True True \n",
+ "80 num_characters transform True True \n",
+ "81 latitude transform False False \n",
+ "82 cum_sum transform False False \n",
+ "83 subtract_numeric_scalar transform True True \n",
"\n",
- " description \\\n",
- "22 Determines the domain of a url. \n",
- "23 Calculates the cumulative mean. \n",
- "24 Determines the minutes value of a datetime. \n",
- "25 Calculates the cumulative maximum. \n",
- "26 Calculates the age in years as a floating point number given a \n",
- ".. ... \n",
- "79 Determines if values are greater than a given scalar. \n",
- "80 Determines the protocol (http or https) of a url. \n",
- "81 Determines the month value of a datetime. \n",
- "82 Divide each element in the list by a scalar. \n",
- "83 Compute the time since the previous entry in a list. \n",
+ " description \\\n",
+ "22 Determines if values in one list are greater than another list. \n",
+ "23 Determines if values in one list are less than another list. \n",
+ "24 Element-wise logical AND of two lists. \n",
+ "25 Determines if values are less than a given scalar. \n",
+ "26 Element-wise modulo of two lists. \n",
+ ".. ... \n",
+ "79 Determines if a date falls on a weekend. \n",
+ "80 Calculates the number of characters in a string. \n",
+ "81 Returns the first tuple value in a list of LatLong tuples. \n",
+ "82 Calculates the cumulative sum. \n",
+ "83 Subtract a scalar from each element in the list. \n",
"\n",
- " valid_inputs return_type \n",
- "22 URL Categorical \n",
- "23 Numeric Numeric \n",
- "24 Datetime Numeric \n",
- "25 Numeric Numeric \n",
- "26 DateOfBirth Numeric \n",
- ".. ... ... \n",
- "79 Numeric, Datetime, Ordinal Boolean \n",
- "80 URL Categorical \n",
- "81 Datetime Ordinal \n",
- "82 Numeric Numeric \n",
- "83 DatetimeTimeIndex Numeric \n",
+ " valid_inputs return_type \n",
+ "22 Ordinal, Datetime, Numeric Boolean \n",
+ "23 Ordinal, Datetime, Numeric Boolean \n",
+ "24 Boolean Boolean \n",
+ "25 Ordinal, Datetime, Numeric Boolean \n",
+ "26 Numeric Numeric \n",
+ ".. ... ... \n",
+ "79 Datetime Boolean \n",
+ "80 NaturalLanguage Numeric \n",
+ "81 LatLong Numeric \n",
+ "82 Numeric Numeric \n",
+ "83 Numeric Numeric \n",
"\n",
"[62 rows x 7 columns]"
]
},
- "execution_count": 21,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -1092,11 +1162,20 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 14,
"id": "6d3df2f7",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Wall time: 1min 3s\n"
+ ]
+ }
+ ],
"source": [
+ "%%time\n",
"features, feature_names = ft.dfs(entityset = es, \n",
" target_entity = 'X', \n",
" max_depth = 2)"
@@ -1112,7 +1191,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 15,
"id": "9a44a98a",
"metadata": {},
"outputs": [
@@ -1653,7 +1732,7 @@
" ]"
]
},
- "execution_count": 27,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -1664,7 +1743,7 @@
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 16,
"id": "d5036e65",
"metadata": {},
"outputs": [
@@ -1703,31 +1782,31 @@
" \n",
" \n",
" 0 | \n",
- " 3000.267286 | \n",
+ " 3000.267334 | \n",
" 2596.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 561 | \n",
- " 3000.267286 | \n",
+ " 3000.267334 | \n",
" 2596.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2062 | \n",
- " 2926.053180 | \n",
+ " 2926.053223 | \n",
" 2596.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 6946 | \n",
- " 2926.053180 | \n",
+ " 2926.053223 | \n",
" 2596.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 6976 | \n",
- " 2926.053180 | \n",
+ " 2926.053223 | \n",
" 2596.0 | \n",
" 0.0 | \n",
"
\n",
@@ -1738,14 +1817,14 @@
"text/plain": [
" Wilderness_Area_0.MEAN(X.Elevation) Elevation Wilderness_Area_0\n",
"index \n",
- "0 3000.267286 2596.0 1.0\n",
- "561 3000.267286 2596.0 1.0\n",
- "2062 2926.053180 2596.0 0.0\n",
- "6946 2926.053180 2596.0 0.0\n",
- "6976 2926.053180 2596.0 0.0"
+ "0 3000.267334 2596.0 1.0\n",
+ "561 3000.267334 2596.0 1.0\n",
+ "2062 2926.053223 2596.0 0.0\n",
+ "6946 2926.053223 2596.0 0.0\n",
+ "6976 2926.053223 2596.0 0.0"
]
},
- "execution_count": 35,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -1756,7 +1835,7 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 17,
"id": "ec8b7ccd",
"metadata": {},
"outputs": [
@@ -1766,7 +1845,7 @@
"(581012, 532)"
]
},
- "execution_count": 36,
+ "execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -1798,7 +1877,7 @@
"id": "75b7cc64",
"metadata": {},
"source": [
- "为了解决“维数灾难”,有必要应用特征简化和选择,这意味着从数据中去除低值特征。但请记住,特征选择可能会影响ML模型的性能。棘手的是,ML模型的设计包含一个艺术元素。这绝对不是一个具有严格规则的确定性过程,要想取得成功就必须遵循这些规则。为了得到一个精确的模型,有必要应用、组合和比较几十种方法。在本notebook中,我不会解释所有可能的方法来处理“维度灾难”。我将集中讨论以下方法:\n",
+ "为了解决“维数灾难”,有必要应用特征约简和选择,这意味着从数据中去除低值特征。但请记住,特征选择可能会影响ML模型的性能。棘手的是,ML模型的设计包含一个艺术元素。这绝对不是一个具有严格规则的确定性过程,要想取得成功就必须遵循这些规则。为了得到一个精确的模型,有必要应用、组合和比较几十种方法。在本notebook中,我不会解释所有可能的方法来处理“维度灾难”。我将集中讨论以下方法:\n",
"\n",
"* 确定共线特征\n",
"\n",
@@ -4248,69 +4327,6 @@
"注意,正常情况下我们是不知道测试集的标签,所以这里先做分割,切分训练和预测集合"
]
},
- {
- "cell_type": "code",
- "execution_count": 43,
- "id": "1e65ffe8",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " index | \n",
- " Cover_Type | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0 | \n",
- " 4.0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 1 | \n",
- " 4.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " index Cover_Type\n",
- "0 0 4.0\n",
- "1 1 4.0"
- ]
- },
- "execution_count": 43,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "y = pd.DataFrame(y, columns=data.target_names)\n",
- "y = y.reset_index()\n",
- "y.head(2)"
- ]
- },
{
"cell_type": "code",
"execution_count": 46,
@@ -4968,11 +4984,12 @@
}
],
"source": [
- "import gc\n",
+ "\"\"\"\n",
"del features_filtered\n",
"del features_positive\n",
"del fetch_covtype\n",
"del df, X,y, X_selected_df,train,test,train_df,test_df,train_X,train_y\n",
+ "\"\"\"\n",
"gc.collect()"
]
},
@@ -4986,7 +5003,7 @@
},
{
"cell_type": "code",
- "execution_count": 65,
+ "execution_count": 8,
"id": "b7241552",
"metadata": {},
"outputs": [
@@ -5110,7 +5127,7 @@
"246788 0.0 0.0 0.0 0.0 "
]
},
- "execution_count": 65,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -5128,24 +5145,16 @@
},
{
"cell_type": "code",
- "execution_count": 68,
+ "execution_count": 9,
"id": "db3d3b92",
"metadata": {},
"outputs": [
{
- "ename": "MemoryError",
- "evalue": "Unable to allocate 8.55 MiB for an array with shape (160080, 1, 7) and data type float64",
- "output_type": "error",
- "traceback": [
- "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[1;31mMemoryError\u001b[0m Traceback (most recent call last)",
- "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n",
- "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\_forest.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 397\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 398\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moob_score\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 399\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_set_oob_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 400\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 401\u001b[0m \u001b[1;31m# Decapsulate classes_ attributes\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\_forest.py\u001b[0m in \u001b[0;36m_set_oob_score\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 528\u001b[0m unsampled_indices = _generate_unsampled_indices(\n\u001b[0;32m 529\u001b[0m estimator.random_state, n_samples, n_samples_bootstrap)\n\u001b[1;32m--> 530\u001b[1;33m p_estimator = estimator.predict_proba(X[unsampled_indices, :],\n\u001b[0m\u001b[0;32m 531\u001b[0m check_input=False)\n\u001b[0;32m 532\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\tree\\_classes.py\u001b[0m in \u001b[0;36mpredict_proba\u001b[1;34m(self, X, check_input)\u001b[0m\n\u001b[0;32m 929\u001b[0m \u001b[0mcheck_is_fitted\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 930\u001b[0m \u001b[0mX\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_X_predict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcheck_input\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 931\u001b[1;33m \u001b[0mproba\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtree_\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 932\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 933\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mn_outputs_\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32msklearn\\tree\\_tree.pyx\u001b[0m in \u001b[0;36msklearn.tree._tree.Tree.predict\u001b[1;34m()\u001b[0m\n",
- "\u001b[1;32msklearn\\tree\\_tree.pyx\u001b[0m in \u001b[0;36msklearn.tree._tree.Tree.predict\u001b[1;34m()\u001b[0m\n",
- "\u001b[1;31mMemoryError\u001b[0m: Unable to allocate 8.55 MiB for an array with shape (160080, 1, 7) and data type float64"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.9673328605949619\n",
+ "Wall time: 14min 30s\n"
]
}
],
@@ -5157,13 +5166,429 @@
"print(accuracy_score(pred_org_test_y,org_test_y)) # RF"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "50b5f988",
+ "metadata": {},
+ "source": [
+ "### 5.2 使用未约简与选择的特征的分数"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": null,
- "id": "52e36341",
+ "execution_count": 18,
+ "id": "0dc54e8c",
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " Elevation | \n",
+ " Horizontal_Distance_To_Hydrology | \n",
+ " Vertical_Distance_To_Hydrology | \n",
+ " Horizontal_Distance_To_Roadways | \n",
+ " Horizontal_Distance_To_Fire_Points | \n",
+ " Aspect | \n",
+ " Slope | \n",
+ " Hillshade_9am | \n",
+ " Hillshade_Noon | \n",
+ " ... | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) | \n",
+ " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.SUM(X.Elevation) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) | \n",
+ " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) | \n",
+ " Cover_Type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2596.0 | \n",
+ " 258.0 | \n",
+ " 0.0 | \n",
+ " 510.0 | \n",
+ " 6279.0 | \n",
+ " 51.0 | \n",
+ " 3.0 | \n",
+ " 221.0 | \n",
+ " 232.0 | \n",
+ " ... | \n",
+ " 1324.050751 | \n",
+ " 212.689925 | \n",
+ " 1558.361956 | \n",
+ " 58.279989 | \n",
+ " 1.715981e+09 | \n",
+ " 1.149499e+09 | \n",
+ " 156171328.0 | \n",
+ " 1.364632e+09 | \n",
+ " 26848308.0 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2590.0 | \n",
+ " 212.0 | \n",
+ " -6.0 | \n",
+ " 390.0 | \n",
+ " 6225.0 | \n",
+ " 56.0 | \n",
+ " 2.0 | \n",
+ " 220.0 | \n",
+ " 235.0 | \n",
+ " ... | \n",
+ " 1324.050751 | \n",
+ " 212.689925 | \n",
+ " 1558.361956 | \n",
+ " 58.279989 | \n",
+ " 1.715981e+09 | \n",
+ " 1.149499e+09 | \n",
+ " 156171328.0 | \n",
+ " 1.364632e+09 | \n",
+ " 26848308.0 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 534 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index Elevation Horizontal_Distance_To_Hydrology \\\n",
+ "0 0 2596.0 258.0 \n",
+ "1 1 2590.0 212.0 \n",
+ "\n",
+ " Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways \\\n",
+ "0 0.0 510.0 \n",
+ "1 -6.0 390.0 \n",
+ "\n",
+ " Horizontal_Distance_To_Fire_Points Aspect Slope Hillshade_9am \\\n",
+ "0 6279.0 51.0 3.0 221.0 \n",
+ "1 6225.0 56.0 2.0 220.0 \n",
+ "\n",
+ " Hillshade_Noon ... Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) \\\n",
+ "0 232.0 ... 1324.050751 \n",
+ "1 235.0 ... 1324.050751 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) \\\n",
+ "0 212.689925 \n",
+ "1 212.689925 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) \\\n",
+ "0 1558.361956 \n",
+ "1 1558.361956 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) \\\n",
+ "0 58.279989 \n",
+ "1 58.279989 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Elevation) \\\n",
+ "0 1.715981e+09 \n",
+ "1 1.715981e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) \\\n",
+ "0 1.149499e+09 \n",
+ "1 1.149499e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) \\\n",
+ "0 156171328.0 \n",
+ "1 156171328.0 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) \\\n",
+ "0 1.364632e+09 \n",
+ "1 1.364632e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) Cover_Type \n",
+ "0 26848308.0 4.0 \n",
+ "1 26848308.0 4.0 \n",
+ "\n",
+ "[2 rows x 534 columns]"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.merge(features, y, on=['index'])\n",
+ "df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "637b3a7e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3256"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "del features, X\n",
+ "gc.collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "4ac537b8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " Elevation | \n",
+ " Horizontal_Distance_To_Hydrology | \n",
+ " Vertical_Distance_To_Hydrology | \n",
+ " Horizontal_Distance_To_Roadways | \n",
+ " Horizontal_Distance_To_Fire_Points | \n",
+ " Aspect | \n",
+ " Slope | \n",
+ " Hillshade_9am | \n",
+ " Hillshade_Noon | \n",
+ " ... | \n",
+ " Soil_Type_4.STD(X.Elevation) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) | \n",
+ " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.SUM(X.Elevation) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) | \n",
+ " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 250728 | \n",
+ " 250728 | \n",
+ " 3351.0 | \n",
+ " 726.0 | \n",
+ " 124.0 | \n",
+ " 3813.0 | \n",
+ " 2271.0 | \n",
+ " 206.0 | \n",
+ " 27.0 | \n",
+ " 192.0 | \n",
+ " 252.0 | \n",
+ " ... | \n",
+ " 277.045517 | \n",
+ " 1324.050751 | \n",
+ " 212.689925 | \n",
+ " 1558.361956 | \n",
+ " 58.279989 | \n",
+ " 1.715981e+09 | \n",
+ " 1.149499e+09 | \n",
+ " 156171328.0 | \n",
+ " 1.364632e+09 | \n",
+ " 26848308.0 | \n",
+ "
\n",
+ " \n",
+ " 246788 | \n",
+ " 246788 | \n",
+ " 2732.0 | \n",
+ " 212.0 | \n",
+ " 1.0 | \n",
+ " 1082.0 | \n",
+ " 912.0 | \n",
+ " 129.0 | \n",
+ " 7.0 | \n",
+ " 231.0 | \n",
+ " 236.0 | \n",
+ " ... | \n",
+ " 277.045517 | \n",
+ " 1324.050751 | \n",
+ " 212.689925 | \n",
+ " 1558.361956 | \n",
+ " 58.279989 | \n",
+ " 1.715981e+09 | \n",
+ " 1.149499e+09 | \n",
+ " 156171328.0 | \n",
+ " 1.364632e+09 | \n",
+ " 26848308.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 533 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index Elevation Horizontal_Distance_To_Hydrology \\\n",
+ "250728 250728 3351.0 726.0 \n",
+ "246788 246788 2732.0 212.0 \n",
+ "\n",
+ " Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways \\\n",
+ "250728 124.0 3813.0 \n",
+ "246788 1.0 1082.0 \n",
+ "\n",
+ " Horizontal_Distance_To_Fire_Points Aspect Slope Hillshade_9am \\\n",
+ "250728 2271.0 206.0 27.0 192.0 \n",
+ "246788 912.0 129.0 7.0 231.0 \n",
+ "\n",
+ " Hillshade_Noon ... Soil_Type_4.STD(X.Elevation) \\\n",
+ "250728 252.0 ... 277.045517 \n",
+ "246788 236.0 ... 277.045517 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) \\\n",
+ "250728 1324.050751 \n",
+ "246788 1324.050751 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) \\\n",
+ "250728 212.689925 \n",
+ "246788 212.689925 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) \\\n",
+ "250728 1558.361956 \n",
+ "246788 1558.361956 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) \\\n",
+ "250728 58.279989 \n",
+ "246788 58.279989 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Elevation) \\\n",
+ "250728 1.715981e+09 \n",
+ "246788 1.715981e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) \\\n",
+ "250728 1.149499e+09 \n",
+ "246788 1.149499e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) \\\n",
+ "250728 156171328.0 \n",
+ "246788 156171328.0 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) \\\n",
+ "250728 1.364632e+09 \n",
+ "246788 1.364632e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) \n",
+ "250728 26848308.0 \n",
+ "246788 26848308.0 \n",
+ "\n",
+ "[2 rows x 533 columns]"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_df, test_df = train_test_split(df,random_state=42)\n",
+ "train_X = train_df.drop('Cover_Type',1)\n",
+ "train_y = train_df['Cover_Type']\n",
+ "\n",
+ "test_X = test_df.drop('Cover_Type',1)\n",
+ "test_y = test_df['Cover_Type']\n",
+ "test_X.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "24c7b22f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "45"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "del df, train_df, test_df\n",
+ "gc.collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "869777ba",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.9442352309418738\n",
+ "Wall time: 30min 31s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "random_forest = RandomForestClassifier(n_estimators=500,oob_score=True)\n",
+ "random_forest.fit(train_X, train_y)\n",
+ "pred_y = random_forest.predict(test_X)\n",
+ "print(accuracy_score(pred_y,test_y)) # RF"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3739a43c",
+ "metadata": {},
+ "source": [
+ "从结果来看,在这个数据集上,不管是增加的特征,还是增加后过滤的特征,效果都比原始特征差。我也咨询了一些朋友他们试了效果都一般,但是kaggle上很多人点赞,如果你们在哪个数据集上试了效果上涨,请联系我。"
+ ]
}
],
"metadata": {
diff --git a/竞赛优胜技巧/Automated feature engineering.ipynb b/竞赛优胜技巧/Automated feature engineering.ipynb
index 839f48c..488f3e0 100644
--- a/竞赛优胜技巧/Automated feature engineering.ipynb
+++ b/竞赛优胜技巧/Automated feature engineering.ipynb
@@ -13,6 +13,7 @@
"id": "66dfb30d",
"metadata": {},
"source": [
+ "### 结论:效果一般\n",
"搬运参考:https://www.kaggle.com/liananapalkova/automated-feature-engineering-for-titanic-dataset"
]
},
@@ -99,13 +100,14 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 19,
"id": "43cc9a46",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import time\n",
+ "import gc\n",
"\n",
"import featuretools as ft\n",
"from featuretools.primitives import *\n",
@@ -304,6 +306,69 @@
{
"cell_type": "code",
"execution_count": 5,
+ "id": "af6722f2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " Cover_Type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index Cover_Type\n",
+ "0 0 4.0\n",
+ "1 1 4.0"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y = pd.DataFrame(y, columns=data.target_names)\n",
+ "y = y.reset_index()\n",
+ "y.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
"id": "2d34ab5c",
"metadata": {},
"outputs": [
@@ -347,7 +412,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
"id": "1551c241",
"metadata": {},
"outputs": [
@@ -404,10 +469,18 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 8,
"id": "06f24545",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Object `es.entity_from_dataframe` not found.\n"
+ ]
+ }
+ ],
"source": [
"es.entity_from_dataframe?"
]
@@ -430,7 +503,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 9,
"id": "f2c69a94",
"metadata": {},
"outputs": [
@@ -444,7 +517,7 @@
" No relationships"
]
},
- "execution_count": 18,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -476,7 +549,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 10,
"id": "770130bc",
"metadata": {
"scrolled": false
@@ -509,7 +582,7 @@
" X.Soil_Type_4 -> Soil_Type_4.Soil_Type_4"
]
},
- "execution_count": 19,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -529,7 +602,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 11,
"id": "352fa085",
"metadata": {
"scrolled": true
@@ -568,126 +641,116 @@
" \n",
" \n",
" 0 | \n",
- " all | \n",
+ " sum | \n",
" aggregation | \n",
" True | \n",
- " False | \n",
- " Calculates if all values are 'True' in a list. | \n",
- " Boolean | \n",
- " Boolean | \n",
+ " True | \n",
+ " Calculates the total addition, ignoring `NaN`. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 1 | \n",
- " skew | \n",
+ " first | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Computes the extent to which a distribution differs from a normal distribution. | \n",
- " Numeric | \n",
- " Numeric | \n",
+ " Determines the first value in a list. | \n",
+ " Variable | \n",
+ " None | \n",
"
\n",
" \n",
" 2 | \n",
- " percent_true | \n",
+ " last | \n",
" aggregation | \n",
- " True | \n",
" False | \n",
- " Determines the percent of `True` values. | \n",
- " Boolean | \n",
- " Numeric | \n",
+ " False | \n",
+ " Determines the last value in a list. | \n",
+ " Variable | \n",
+ " None | \n",
"
\n",
" \n",
" 3 | \n",
- " count | \n",
+ " trend | \n",
" aggregation | \n",
- " True | \n",
- " True | \n",
- " Determines the total number of values, excluding `NaN`. | \n",
- " Index | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the trend of a variable over time. | \n",
+ " DatetimeTimeIndex, Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
" 4 | \n",
- " num_unique | \n",
+ " n_most_common | \n",
" aggregation | \n",
- " True | \n",
- " True | \n",
- " Determines the number of distinct values, ignoring `NaN` values. | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the `n` most common elements. | \n",
+ " Discrete | \n",
" Discrete | \n",
- " Numeric | \n",
"
\n",
" \n",
" 5 | \n",
- " first | \n",
+ " time_since_last | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Determines the first value in a list. | \n",
- " Variable | \n",
- " None | \n",
+ " Calculates the time elapsed since the last datetime (default in seconds). | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 6 | \n",
- " mode | \n",
+ " std | \n",
" aggregation | \n",
- " False | \n",
- " False | \n",
- " Determines the most commonly repeated value. | \n",
- " Discrete | \n",
- " None | \n",
+ " True | \n",
+ " True | \n",
+ " Computes the dispersion relative to the mean value, ignoring `NaN`. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 7 | \n",
- " entropy | \n",
+ " median | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Calculates the entropy for a categorical variable | \n",
- " Categorical | \n",
+ " Determines the middlemost number in a list of values. | \n",
+ " Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
" 8 | \n",
- " time_since_last | \n",
+ " count | \n",
" aggregation | \n",
- " False | \n",
- " False | \n",
- " Calculates the time elapsed since the last datetime (default in seconds). | \n",
- " DatetimeTimeIndex | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the total number of values, excluding `NaN`. | \n",
+ " Index | \n",
" Numeric | \n",
"
\n",
" \n",
" 9 | \n",
- " any | \n",
+ " percent_true | \n",
" aggregation | \n",
" True | \n",
" False | \n",
- " Determines if any value is 'True' in a list. | \n",
- " Boolean | \n",
+ " Determines the percent of `True` values. | \n",
" Boolean | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 10 | \n",
- " last | \n",
- " aggregation | \n",
- " False | \n",
- " False | \n",
- " Determines the last value in a list. | \n",
- " Variable | \n",
- " None | \n",
- "
\n",
- " \n",
- " 11 | \n",
- " avg_time_between | \n",
+ " time_since_first | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Computes the average number of seconds between consecutive events. | \n",
+ " Calculates the time elapsed since the first datetime (in seconds). | \n",
" DatetimeTimeIndex | \n",
" Numeric | \n",
"
\n",
" \n",
- " 12 | \n",
+ " 11 | \n",
" max | \n",
" aggregation | \n",
" True | \n",
@@ -697,93 +760,103 @@
" Numeric | \n",
"
\n",
" \n",
+ " 12 | \n",
+ " any | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Determines if any value is 'True' in a list. | \n",
+ " Boolean | \n",
+ " Boolean | \n",
+ "
\n",
+ " \n",
" 13 | \n",
- " median | \n",
+ " mode | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Determines the middlemost number in a list of values. | \n",
- " Numeric | \n",
- " Numeric | \n",
+ " Determines the most commonly repeated value. | \n",
+ " Discrete | \n",
+ " None | \n",
"
\n",
" \n",
" 14 | \n",
- " mean | \n",
+ " entropy | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the entropy for a categorical variable | \n",
+ " Categorical | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " min | \n",
" aggregation | \n",
" True | \n",
" True | \n",
- " Computes the average for a list of values. | \n",
+ " Calculates the smallest value, ignoring `NaN` values. | \n",
" Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
- " 15 | \n",
- " num_true | \n",
+ " 16 | \n",
+ " all | \n",
" aggregation | \n",
" True | \n",
" False | \n",
- " Counts the number of `True` values. | \n",
+ " Calculates if all values are 'True' in a list. | \n",
+ " Boolean | \n",
" Boolean | \n",
- " Numeric | \n",
"
\n",
" \n",
- " 16 | \n",
- " min | \n",
+ " 17 | \n",
+ " skew | \n",
" aggregation | \n",
- " True | \n",
- " True | \n",
- " Calculates the smallest value, ignoring `NaN` values. | \n",
+ " False | \n",
+ " False | \n",
+ " Computes the extent to which a distribution differs from a normal distribution. | \n",
" Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
- " 17 | \n",
- " sum | \n",
+ " 18 | \n",
+ " mean | \n",
" aggregation | \n",
" True | \n",
" True | \n",
- " Calculates the total addition, ignoring `NaN`. | \n",
+ " Computes the average for a list of values. | \n",
" Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
- " 18 | \n",
- " trend | \n",
+ " 19 | \n",
+ " avg_time_between | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Calculates the trend of a variable over time. | \n",
- " Numeric, DatetimeTimeIndex | \n",
+ " Computes the average number of seconds between consecutive events. | \n",
+ " DatetimeTimeIndex | \n",
" Numeric | \n",
"
\n",
" \n",
- " 19 | \n",
- " n_most_common | \n",
+ " 20 | \n",
+ " num_unique | \n",
" aggregation | \n",
- " False | \n",
- " False | \n",
- " Determines the `n` most common elements. | \n",
- " Discrete | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the number of distinct values, ignoring `NaN` values. | \n",
" Discrete | \n",
- "
\n",
- " \n",
- " 20 | \n",
- " time_since_first | \n",
- " aggregation | \n",
- " False | \n",
- " False | \n",
- " Calculates the time elapsed since the first datetime (in seconds). | \n",
- " DatetimeTimeIndex | \n",
- " Numeric | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 21 | \n",
- " std | \n",
+ " num_true | \n",
" aggregation | \n",
" True | \n",
- " True | \n",
- " Computes the dispersion relative to the mean value, ignoring `NaN`. | \n",
- " Numeric | \n",
+ " False | \n",
+ " Counts the number of `True` values. | \n",
+ " Boolean | \n",
" Numeric | \n",
"
\n",
" \n",
@@ -792,79 +865,79 @@
],
"text/plain": [
" name type dask_compatible koalas_compatible \\\n",
- "0 all aggregation True False \n",
- "1 skew aggregation False False \n",
- "2 percent_true aggregation True False \n",
- "3 count aggregation True True \n",
- "4 num_unique aggregation True True \n",
- "5 first aggregation False False \n",
- "6 mode aggregation False False \n",
- "7 entropy aggregation False False \n",
- "8 time_since_last aggregation False False \n",
- "9 any aggregation True False \n",
- "10 last aggregation False False \n",
- "11 avg_time_between aggregation False False \n",
- "12 max aggregation True True \n",
- "13 median aggregation False False \n",
- "14 mean aggregation True True \n",
- "15 num_true aggregation True False \n",
- "16 min aggregation True True \n",
- "17 sum aggregation True True \n",
- "18 trend aggregation False False \n",
- "19 n_most_common aggregation False False \n",
- "20 time_since_first aggregation False False \n",
- "21 std aggregation True True \n",
+ "0 sum aggregation True True \n",
+ "1 first aggregation False False \n",
+ "2 last aggregation False False \n",
+ "3 trend aggregation False False \n",
+ "4 n_most_common aggregation False False \n",
+ "5 time_since_last aggregation False False \n",
+ "6 std aggregation True True \n",
+ "7 median aggregation False False \n",
+ "8 count aggregation True True \n",
+ "9 percent_true aggregation True False \n",
+ "10 time_since_first aggregation False False \n",
+ "11 max aggregation True True \n",
+ "12 any aggregation True False \n",
+ "13 mode aggregation False False \n",
+ "14 entropy aggregation False False \n",
+ "15 min aggregation True True \n",
+ "16 all aggregation True False \n",
+ "17 skew aggregation False False \n",
+ "18 mean aggregation True True \n",
+ "19 avg_time_between aggregation False False \n",
+ "20 num_unique aggregation True True \n",
+ "21 num_true aggregation True False \n",
"\n",
" description \\\n",
- "0 Calculates if all values are 'True' in a list. \n",
- "1 Computes the extent to which a distribution differs from a normal distribution. \n",
- "2 Determines the percent of `True` values. \n",
- "3 Determines the total number of values, excluding `NaN`. \n",
- "4 Determines the number of distinct values, ignoring `NaN` values. \n",
- "5 Determines the first value in a list. \n",
- "6 Determines the most commonly repeated value. \n",
- "7 Calculates the entropy for a categorical variable \n",
- "8 Calculates the time elapsed since the last datetime (default in seconds). \n",
- "9 Determines if any value is 'True' in a list. \n",
- "10 Determines the last value in a list. \n",
- "11 Computes the average number of seconds between consecutive events. \n",
- "12 Calculates the highest value, ignoring `NaN` values. \n",
- "13 Determines the middlemost number in a list of values. \n",
- "14 Computes the average for a list of values. \n",
- "15 Counts the number of `True` values. \n",
- "16 Calculates the smallest value, ignoring `NaN` values. \n",
- "17 Calculates the total addition, ignoring `NaN`. \n",
- "18 Calculates the trend of a variable over time. \n",
- "19 Determines the `n` most common elements. \n",
- "20 Calculates the time elapsed since the first datetime (in seconds). \n",
- "21 Computes the dispersion relative to the mean value, ignoring `NaN`. \n",
+ "0 Calculates the total addition, ignoring `NaN`. \n",
+ "1 Determines the first value in a list. \n",
+ "2 Determines the last value in a list. \n",
+ "3 Calculates the trend of a variable over time. \n",
+ "4 Determines the `n` most common elements. \n",
+ "5 Calculates the time elapsed since the last datetime (default in seconds). \n",
+ "6 Computes the dispersion relative to the mean value, ignoring `NaN`. \n",
+ "7 Determines the middlemost number in a list of values. \n",
+ "8 Determines the total number of values, excluding `NaN`. \n",
+ "9 Determines the percent of `True` values. \n",
+ "10 Calculates the time elapsed since the first datetime (in seconds). \n",
+ "11 Calculates the highest value, ignoring `NaN` values. \n",
+ "12 Determines if any value is 'True' in a list. \n",
+ "13 Determines the most commonly repeated value. \n",
+ "14 Calculates the entropy for a categorical variable \n",
+ "15 Calculates the smallest value, ignoring `NaN` values. \n",
+ "16 Calculates if all values are 'True' in a list. \n",
+ "17 Computes the extent to which a distribution differs from a normal distribution. \n",
+ "18 Computes the average for a list of values. \n",
+ "19 Computes the average number of seconds between consecutive events. \n",
+ "20 Determines the number of distinct values, ignoring `NaN` values. \n",
+ "21 Counts the number of `True` values. \n",
"\n",
" valid_inputs return_type \n",
- "0 Boolean Boolean \n",
- "1 Numeric Numeric \n",
- "2 Boolean Numeric \n",
- "3 Index Numeric \n",
- "4 Discrete Numeric \n",
- "5 Variable None \n",
- "6 Discrete None \n",
- "7 Categorical Numeric \n",
- "8 DatetimeTimeIndex Numeric \n",
- "9 Boolean Boolean \n",
- "10 Variable None \n",
- "11 DatetimeTimeIndex Numeric \n",
- "12 Numeric Numeric \n",
- "13 Numeric Numeric \n",
- "14 Numeric Numeric \n",
- "15 Boolean Numeric \n",
- "16 Numeric Numeric \n",
+ "0 Numeric Numeric \n",
+ "1 Variable None \n",
+ "2 Variable None \n",
+ "3 DatetimeTimeIndex, Numeric Numeric \n",
+ "4 Discrete Discrete \n",
+ "5 DatetimeTimeIndex Numeric \n",
+ "6 Numeric Numeric \n",
+ "7 Numeric Numeric \n",
+ "8 Index Numeric \n",
+ "9 Boolean Numeric \n",
+ "10 DatetimeTimeIndex Numeric \n",
+ "11 Numeric Numeric \n",
+ "12 Boolean Boolean \n",
+ "13 Discrete None \n",
+ "14 Categorical Numeric \n",
+ "15 Numeric Numeric \n",
+ "16 Boolean Boolean \n",
"17 Numeric Numeric \n",
- "18 Numeric, DatetimeTimeIndex Numeric \n",
- "19 Discrete Discrete \n",
- "20 DatetimeTimeIndex Numeric \n",
- "21 Numeric Numeric "
+ "18 Numeric Numeric \n",
+ "19 DatetimeTimeIndex Numeric \n",
+ "20 Discrete Numeric \n",
+ "21 Boolean Numeric "
]
},
- "execution_count": 20,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -877,7 +950,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 12,
"id": "7762885f",
"metadata": {},
"outputs": [
    [two hunks (@@ -914,52 +987,52 @@ and @@ -974,52 +1047,52 @@) of the text/html rendering of the primitives table omitted: the HTML markup was garbled in extraction; the same rows (22-26 and 79-83) and the same old/new changes appear in the text/plain hunk below]
@@ -1028,49 +1101,49 @@
""
],
"text/plain": [
- " name type dask_compatible koalas_compatible \\\n",
- "22 url_to_domain transform False False \n",
- "23 cum_mean transform False False \n",
- "24 minute transform True True \n",
- "25 cum_max transform False False \n",
- "26 age transform True False \n",
- ".. ... ... ... ... \n",
- "79 greater_than_scalar transform True True \n",
- "80 url_to_protocol transform False False \n",
- "81 month transform True True \n",
- "82 divide_numeric_scalar transform True True \n",
- "83 time_since_previous transform False False \n",
+ " name type dask_compatible koalas_compatible \\\n",
+ "22 greater_than transform True False \n",
+ "23 less_than transform True True \n",
+ "24 and transform True True \n",
+ "25 less_than_scalar transform True True \n",
+ "26 modulo_numeric transform True True \n",
+ ".. ... ... ... ... \n",
+ "79 is_weekend transform True True \n",
+ "80 num_characters transform True True \n",
+ "81 latitude transform False False \n",
+ "82 cum_sum transform False False \n",
+ "83 subtract_numeric_scalar transform True True \n",
"\n",
- " description \\\n",
- "22 Determines the domain of a url. \n",
- "23 Calculates the cumulative mean. \n",
- "24 Determines the minutes value of a datetime. \n",
- "25 Calculates the cumulative maximum. \n",
- "26 Calculates the age in years as a floating point number given a \n",
- ".. ... \n",
- "79 Determines if values are greater than a given scalar. \n",
- "80 Determines the protocol (http or https) of a url. \n",
- "81 Determines the month value of a datetime. \n",
- "82 Divide each element in the list by a scalar. \n",
- "83 Compute the time since the previous entry in a list. \n",
+ " description \\\n",
+ "22 Determines if values in one list are greater than another list. \n",
+ "23 Determines if values in one list are less than another list. \n",
+ "24 Element-wise logical AND of two lists. \n",
+ "25 Determines if values are less than a given scalar. \n",
+ "26 Element-wise modulo of two lists. \n",
+ ".. ... \n",
+ "79 Determines if a date falls on a weekend. \n",
+ "80 Calculates the number of characters in a string. \n",
+ "81 Returns the first tuple value in a list of LatLong tuples. \n",
+ "82 Calculates the cumulative sum. \n",
+ "83 Subtract a scalar from each element in the list. \n",
"\n",
- " valid_inputs return_type \n",
- "22 URL Categorical \n",
- "23 Numeric Numeric \n",
- "24 Datetime Numeric \n",
- "25 Numeric Numeric \n",
- "26 DateOfBirth Numeric \n",
- ".. ... ... \n",
- "79 Numeric, Datetime, Ordinal Boolean \n",
- "80 URL Categorical \n",
- "81 Datetime Ordinal \n",
- "82 Numeric Numeric \n",
- "83 DatetimeTimeIndex Numeric \n",
+ " valid_inputs return_type \n",
+ "22 Ordinal, Datetime, Numeric Boolean \n",
+ "23 Ordinal, Datetime, Numeric Boolean \n",
+ "24 Boolean Boolean \n",
+ "25 Ordinal, Datetime, Numeric Boolean \n",
+ "26 Numeric Numeric \n",
+ ".. ... ... \n",
+ "79 Datetime Boolean \n",
+ "80 NaturalLanguage Numeric \n",
+ "81 LatLong Numeric \n",
+ "82 Numeric Numeric \n",
+ "83 Numeric Numeric \n",
"\n",
"[62 rows x 7 columns]"
]
},
- "execution_count": 21,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -1089,11 +1162,20 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 14,
"id": "6d3df2f7",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Wall time: 1min 3s\n"
+ ]
+ }
+ ],
"source": [
+ "%%time\n",
"features, feature_names = ft.dfs(entityset = es, \n",
" target_entity = 'X', \n",
" max_depth = 2)"
@@ -1109,7 +1191,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 15,
"id": "9a44a98a",
"metadata": {},
"outputs": [
@@ -1650,7 +1732,7 @@
" ]"
]
},
- "execution_count": 27,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -1661,7 +1743,7 @@
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 16,
"id": "d5036e65",
"metadata": {},
"outputs": [
    [hunk @@ -1700,31 +1782,31 @@ (text/html preview of the merged feature columns) omitted: the HTML markup was garbled in extraction; the same value changes (3000.267286 -> 3000.267334, 2926.053180 -> 2926.053223) appear in the text/plain hunk below]
@@ -1735,14 +1817,14 @@
"text/plain": [
" Wilderness_Area_0.MEAN(X.Elevation) Elevation Wilderness_Area_0\n",
"index \n",
- "0 3000.267286 2596.0 1.0\n",
- "561 3000.267286 2596.0 1.0\n",
- "2062 2926.053180 2596.0 0.0\n",
- "6946 2926.053180 2596.0 0.0\n",
- "6976 2926.053180 2596.0 0.0"
+ "0 3000.267334 2596.0 1.0\n",
+ "561 3000.267334 2596.0 1.0\n",
+ "2062 2926.053223 2596.0 0.0\n",
+ "6946 2926.053223 2596.0 0.0\n",
+ "6976 2926.053223 2596.0 0.0"
]
},
- "execution_count": 35,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -1753,7 +1835,7 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 17,
"id": "ec8b7ccd",
"metadata": {},
"outputs": [
@@ -1763,7 +1845,7 @@
"(581012, 532)"
]
},
- "execution_count": 36,
+ "execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -1795,7 +1877,7 @@
"id": "75b7cc64",
"metadata": {},
"source": [
- "为了解决“维数灾难”,有必要应用特征简化和选择,这意味着从数据中去除低值特征。但请记住,特征选择可能会影响ML模型的性能。棘手的是,ML模型的设计包含一个艺术元素。这绝对不是一个具有严格规则的确定性过程,要想取得成功就必须遵循这些规则。为了得到一个精确的模型,有必要应用、组合和比较几十种方法。在本notebook中,我不会解释所有可能的方法来处理“维度灾难”。我将集中讨论以下方法:\n",
+ "为了解决“维数灾难”,有必要应用特征约简和选择,这意味着从数据中去除低值特征。但请记住,特征选择可能会影响ML模型的性能。棘手的是,ML模型的设计包含一个艺术元素。这绝对不是一个具有严格规则的确定性过程,要想取得成功就必须遵循这些规则。为了得到一个精确的模型,有必要应用、组合和比较几十种方法。在本notebook中,我不会解释所有可能的方法来处理“维度灾难”。我将集中讨论以下方法:\n",
"\n",
"* 确定共线特征\n",
"\n",
@@ -4902,11 +4984,12 @@
}
],
"source": [
- "import gc\n",
+ "\"\"\"\n",
"del features_filtered\n",
"del features_positive\n",
"del fetch_covtype\n",
"del df, X,y, X_selected_df,train,test,train_df,test_df,train_X,train_y\n",
+ "\"\"\"\n",
"gc.collect()"
]
},
@@ -4920,7 +5003,7 @@
},
{
"cell_type": "code",
- "execution_count": 65,
+ "execution_count": 8,
"id": "b7241552",
"metadata": {},
"outputs": [
@@ -5044,7 +5127,7 @@
"246788 0.0 0.0 0.0 0.0 "
]
},
- "execution_count": 65,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -5062,24 +5145,16 @@
},
{
"cell_type": "code",
- "execution_count": 68,
+ "execution_count": 9,
"id": "db3d3b92",
"metadata": {},
"outputs": [
{
- "ename": "MemoryError",
- "evalue": "Unable to allocate 8.55 MiB for an array with shape (160080, 1, 7) and data type float64",
- "output_type": "error",
- "traceback": [
- "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[1;31mMemoryError\u001b[0m Traceback (most recent call last)",
- "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n",
- "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\_forest.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 397\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 398\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moob_score\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 399\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_set_oob_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 400\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 401\u001b[0m \u001b[1;31m# Decapsulate classes_ attributes\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\_forest.py\u001b[0m in \u001b[0;36m_set_oob_score\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 528\u001b[0m unsampled_indices = _generate_unsampled_indices(\n\u001b[0;32m 529\u001b[0m estimator.random_state, n_samples, n_samples_bootstrap)\n\u001b[1;32m--> 530\u001b[1;33m p_estimator = estimator.predict_proba(X[unsampled_indices, :],\n\u001b[0m\u001b[0;32m 531\u001b[0m check_input=False)\n\u001b[0;32m 532\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\tree\\_classes.py\u001b[0m in \u001b[0;36mpredict_proba\u001b[1;34m(self, X, check_input)\u001b[0m\n\u001b[0;32m 929\u001b[0m \u001b[0mcheck_is_fitted\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 930\u001b[0m \u001b[0mX\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_X_predict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcheck_input\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 931\u001b[1;33m \u001b[0mproba\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtree_\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 932\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 933\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mn_outputs_\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32msklearn\\tree\\_tree.pyx\u001b[0m in \u001b[0;36msklearn.tree._tree.Tree.predict\u001b[1;34m()\u001b[0m\n",
- "\u001b[1;32msklearn\\tree\\_tree.pyx\u001b[0m in \u001b[0;36msklearn.tree._tree.Tree.predict\u001b[1;34m()\u001b[0m\n",
- "\u001b[1;31mMemoryError\u001b[0m: Unable to allocate 8.55 MiB for an array with shape (160080, 1, 7) and data type float64"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.9673328605949619\n",
+ "Wall time: 14min 30s\n"
]
}
],
@@ -5091,13 +5166,429 @@
"print(accuracy_score(pred_org_test_y,org_test_y)) # RF"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "50b5f988",
+ "metadata": {},
+ "source": [
+ "### 5.2 使用未约简与选择的特征的分数"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": null,
- "id": "52e36341",
+ "execution_count": 18,
+ "id": "0dc54e8c",
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " Elevation | \n",
+ " Horizontal_Distance_To_Hydrology | \n",
+ " Vertical_Distance_To_Hydrology | \n",
+ " Horizontal_Distance_To_Roadways | \n",
+ " Horizontal_Distance_To_Fire_Points | \n",
+ " Aspect | \n",
+ " Slope | \n",
+ " Hillshade_9am | \n",
+ " Hillshade_Noon | \n",
+ " ... | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) | \n",
+ " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.SUM(X.Elevation) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) | \n",
+ " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) | \n",
+ " Cover_Type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2596.0 | \n",
+ " 258.0 | \n",
+ " 0.0 | \n",
+ " 510.0 | \n",
+ " 6279.0 | \n",
+ " 51.0 | \n",
+ " 3.0 | \n",
+ " 221.0 | \n",
+ " 232.0 | \n",
+ " ... | \n",
+ " 1324.050751 | \n",
+ " 212.689925 | \n",
+ " 1558.361956 | \n",
+ " 58.279989 | \n",
+ " 1.715981e+09 | \n",
+ " 1.149499e+09 | \n",
+ " 156171328.0 | \n",
+ " 1.364632e+09 | \n",
+ " 26848308.0 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2590.0 | \n",
+ " 212.0 | \n",
+ " -6.0 | \n",
+ " 390.0 | \n",
+ " 6225.0 | \n",
+ " 56.0 | \n",
+ " 2.0 | \n",
+ " 220.0 | \n",
+ " 235.0 | \n",
+ " ... | \n",
+ " 1324.050751 | \n",
+ " 212.689925 | \n",
+ " 1558.361956 | \n",
+ " 58.279989 | \n",
+ " 1.715981e+09 | \n",
+ " 1.149499e+09 | \n",
+ " 156171328.0 | \n",
+ " 1.364632e+09 | \n",
+ " 26848308.0 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 534 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index Elevation Horizontal_Distance_To_Hydrology \\\n",
+ "0 0 2596.0 258.0 \n",
+ "1 1 2590.0 212.0 \n",
+ "\n",
+ " Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways \\\n",
+ "0 0.0 510.0 \n",
+ "1 -6.0 390.0 \n",
+ "\n",
+ " Horizontal_Distance_To_Fire_Points Aspect Slope Hillshade_9am \\\n",
+ "0 6279.0 51.0 3.0 221.0 \n",
+ "1 6225.0 56.0 2.0 220.0 \n",
+ "\n",
+ " Hillshade_Noon ... Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) \\\n",
+ "0 232.0 ... 1324.050751 \n",
+ "1 235.0 ... 1324.050751 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) \\\n",
+ "0 212.689925 \n",
+ "1 212.689925 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) \\\n",
+ "0 1558.361956 \n",
+ "1 1558.361956 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) \\\n",
+ "0 58.279989 \n",
+ "1 58.279989 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Elevation) \\\n",
+ "0 1.715981e+09 \n",
+ "1 1.715981e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) \\\n",
+ "0 1.149499e+09 \n",
+ "1 1.149499e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) \\\n",
+ "0 156171328.0 \n",
+ "1 156171328.0 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) \\\n",
+ "0 1.364632e+09 \n",
+ "1 1.364632e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) Cover_Type \n",
+ "0 26848308.0 4.0 \n",
+ "1 26848308.0 4.0 \n",
+ "\n",
+ "[2 rows x 534 columns]"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.merge(features, y, on=['index'])\n",
+ "df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "637b3a7e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3256"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "del features, X\n",
+ "gc.collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "4ac537b8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " Elevation | \n",
+ " Horizontal_Distance_To_Hydrology | \n",
+ " Vertical_Distance_To_Hydrology | \n",
+ " Horizontal_Distance_To_Roadways | \n",
+ " Horizontal_Distance_To_Fire_Points | \n",
+ " Aspect | \n",
+ " Slope | \n",
+ " Hillshade_9am | \n",
+ " Hillshade_Noon | \n",
+ " ... | \n",
+ " Soil_Type_4.STD(X.Elevation) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) | \n",
+ " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.SUM(X.Elevation) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) | \n",
+ " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 250728 | \n",
+ " 250728 | \n",
+ " 3351.0 | \n",
+ " 726.0 | \n",
+ " 124.0 | \n",
+ " 3813.0 | \n",
+ " 2271.0 | \n",
+ " 206.0 | \n",
+ " 27.0 | \n",
+ " 192.0 | \n",
+ " 252.0 | \n",
+ " ... | \n",
+ " 277.045517 | \n",
+ " 1324.050751 | \n",
+ " 212.689925 | \n",
+ " 1558.361956 | \n",
+ " 58.279989 | \n",
+ " 1.715981e+09 | \n",
+ " 1.149499e+09 | \n",
+ " 156171328.0 | \n",
+ " 1.364632e+09 | \n",
+ " 26848308.0 | \n",
+ "
\n",
+ " \n",
+ " 246788 | \n",
+ " 246788 | \n",
+ " 2732.0 | \n",
+ " 212.0 | \n",
+ " 1.0 | \n",
+ " 1082.0 | \n",
+ " 912.0 | \n",
+ " 129.0 | \n",
+ " 7.0 | \n",
+ " 231.0 | \n",
+ " 236.0 | \n",
+ " ... | \n",
+ " 277.045517 | \n",
+ " 1324.050751 | \n",
+ " 212.689925 | \n",
+ " 1558.361956 | \n",
+ " 58.279989 | \n",
+ " 1.715981e+09 | \n",
+ " 1.149499e+09 | \n",
+ " 156171328.0 | \n",
+ " 1.364632e+09 | \n",
+ " 26848308.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 533 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index Elevation Horizontal_Distance_To_Hydrology \\\n",
+ "250728 250728 3351.0 726.0 \n",
+ "246788 246788 2732.0 212.0 \n",
+ "\n",
+ " Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways \\\n",
+ "250728 124.0 3813.0 \n",
+ "246788 1.0 1082.0 \n",
+ "\n",
+ " Horizontal_Distance_To_Fire_Points Aspect Slope Hillshade_9am \\\n",
+ "250728 2271.0 206.0 27.0 192.0 \n",
+ "246788 912.0 129.0 7.0 231.0 \n",
+ "\n",
+ " Hillshade_Noon ... Soil_Type_4.STD(X.Elevation) \\\n",
+ "250728 252.0 ... 277.045517 \n",
+ "246788 236.0 ... 277.045517 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) \\\n",
+ "250728 1324.050751 \n",
+ "246788 1324.050751 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) \\\n",
+ "250728 212.689925 \n",
+ "246788 212.689925 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) \\\n",
+ "250728 1558.361956 \n",
+ "246788 1558.361956 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) \\\n",
+ "250728 58.279989 \n",
+ "246788 58.279989 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Elevation) \\\n",
+ "250728 1.715981e+09 \n",
+ "246788 1.715981e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) \\\n",
+ "250728 1.149499e+09 \n",
+ "246788 1.149499e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) \\\n",
+ "250728 156171328.0 \n",
+ "246788 156171328.0 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) \\\n",
+ "250728 1.364632e+09 \n",
+ "246788 1.364632e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) \n",
+ "250728 26848308.0 \n",
+ "246788 26848308.0 \n",
+ "\n",
+ "[2 rows x 533 columns]"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_df, test_df = train_test_split(df,random_state=42)\n",
+ "train_X = train_df.drop('Cover_Type',1)\n",
+ "train_y = train_df['Cover_Type']\n",
+ "\n",
+ "test_X = test_df.drop('Cover_Type',1)\n",
+ "test_y = test_df['Cover_Type']\n",
+ "test_X.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "24c7b22f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "45"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "del df, train_df, test_df\n",
+ "gc.collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "869777ba",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.9442352309418738\n",
+ "Wall time: 30min 31s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "random_forest = RandomForestClassifier(n_estimators=500,oob_score=True)\n",
+ "random_forest.fit(train_X, train_y)\n",
+ "pred_y = random_forest.predict(test_X)\n",
+ "print(accuracy_score(pred_y,test_y)) # RF"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3739a43c",
+ "metadata": {},
+ "source": [
+ "从结果来看,在这个数据集上,不管是增加的特征,还是增加后过滤的特征,效果都比原始特征差。我也咨询了一些朋友他们试了效果都一般,但是kaggle上很多人点赞,如果你们在哪个数据集上试了效果上涨,请联系我。"
+ ]
}
],
"metadata": {