From fba1e1357cf0c7172b9e996fc5f35522ab6e1762 Mon Sep 17 00:00:00 2001
From: benjas <909336740@qq.com>
Date: Wed, 1 Sep 2021 15:39:43 +0800
Subject: [PATCH] Add training and testing of the simple model
---
...mated feature engineering-checkpoint.ipynb | 1169 +++++++++++------
.../Automated feature engineering.ipynb | 1095 ++++++++++-----
2 files changed, 1590 insertions(+), 674 deletions(-)
diff --git a/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb b/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb
index 8428481..488f3e0 100644
--- a/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb
+++ b/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb
@@ -13,6 +13,7 @@
"id": "66dfb30d",
"metadata": {},
"source": [
+ "### 结论:效果一般\n",
"搬运参考:https://www.kaggle.com/liananapalkova/automated-feature-engineering-for-titanic-dataset"
]
},
@@ -99,13 +100,14 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 19,
"id": "43cc9a46",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import time\n",
+ "import gc\n",
"\n",
"import featuretools as ft\n",
"from featuretools.primitives import *\n",
@@ -115,9 +117,6 @@
"# 导入相关模型,没有的pip install xxx 即可\n",
"\n",
"from sklearn.ensemble import RandomForestClassifier\n",
- "from sklearn.linear_model import LogisticRegression\n",
- "import xgboost as xgb \n",
- "import lightgbm as lgb \n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score\n",
@@ -127,7 +126,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 2,
"id": "4c17c0bc",
"metadata": {},
"outputs": [],
@@ -138,7 +137,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 3,
"id": "bcce5a3d",
"metadata": {},
"outputs": [
@@ -168,7 +167,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 4,
"id": "4afeeca5",
"metadata": {},
"outputs": [
@@ -292,7 +291,7 @@
"1 0.0 0.0 0.0 "
]
},
- "execution_count": 17,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -306,7 +305,70 @@
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 5,
+ "id": "af6722f2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " Cover_Type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index Cover_Type\n",
+ "0 0 4.0\n",
+ "1 1 4.0"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y = pd.DataFrame(y, columns=data.target_names)\n",
+ "y = y.reset_index()\n",
+ "y.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
"id": "2d34ab5c",
"metadata": {},
"outputs": [
@@ -350,7 +412,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 7,
"id": "1551c241",
"metadata": {},
"outputs": [
@@ -407,10 +469,18 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 8,
"id": "06f24545",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Object `es.entity_from_dataframe` not found.\n"
+ ]
+ }
+ ],
"source": [
"es.entity_from_dataframe?"
]
@@ -433,7 +503,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 9,
"id": "f2c69a94",
"metadata": {},
"outputs": [
@@ -447,7 +517,7 @@
" No relationships"
]
},
- "execution_count": 18,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -479,7 +549,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 10,
"id": "770130bc",
"metadata": {
"scrolled": false
@@ -512,7 +582,7 @@
" X.Soil_Type_4 -> Soil_Type_4.Soil_Type_4"
]
},
- "execution_count": 19,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -532,7 +602,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 11,
"id": "352fa085",
"metadata": {
"scrolled": true
@@ -571,126 +641,116 @@
" \n",
" \n",
" 0 | \n",
- " all | \n",
+ " sum | \n",
" aggregation | \n",
" True | \n",
- " False | \n",
- " Calculates if all values are 'True' in a list. | \n",
- " Boolean | \n",
- " Boolean | \n",
+ " True | \n",
+ " Calculates the total addition, ignoring `NaN`. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 1 | \n",
- " skew | \n",
+ " first | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Computes the extent to which a distribution differs from a normal distribution. | \n",
- " Numeric | \n",
- " Numeric | \n",
+ " Determines the first value in a list. | \n",
+ " Variable | \n",
+ " None | \n",
"
\n",
" \n",
" 2 | \n",
- " percent_true | \n",
+ " last | \n",
" aggregation | \n",
- " True | \n",
" False | \n",
- " Determines the percent of `True` values. | \n",
- " Boolean | \n",
- " Numeric | \n",
+ " False | \n",
+ " Determines the last value in a list. | \n",
+ " Variable | \n",
+ " None | \n",
"
\n",
" \n",
" 3 | \n",
- " count | \n",
+ " trend | \n",
" aggregation | \n",
- " True | \n",
- " True | \n",
- " Determines the total number of values, excluding `NaN`. | \n",
- " Index | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the trend of a variable over time. | \n",
+ " DatetimeTimeIndex, Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
" 4 | \n",
- " num_unique | \n",
+ " n_most_common | \n",
" aggregation | \n",
- " True | \n",
- " True | \n",
- " Determines the number of distinct values, ignoring `NaN` values. | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the `n` most common elements. | \n",
+ " Discrete | \n",
" Discrete | \n",
- " Numeric | \n",
"
\n",
" \n",
" 5 | \n",
- " first | \n",
+ " time_since_last | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Determines the first value in a list. | \n",
- " Variable | \n",
- " None | \n",
+ " Calculates the time elapsed since the last datetime (default in seconds). | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 6 | \n",
- " mode | \n",
+ " std | \n",
" aggregation | \n",
- " False | \n",
- " False | \n",
- " Determines the most commonly repeated value. | \n",
- " Discrete | \n",
- " None | \n",
+ " True | \n",
+ " True | \n",
+ " Computes the dispersion relative to the mean value, ignoring `NaN`. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 7 | \n",
- " entropy | \n",
+ " median | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Calculates the entropy for a categorical variable | \n",
- " Categorical | \n",
+ " Determines the middlemost number in a list of values. | \n",
+ " Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
" 8 | \n",
- " time_since_last | \n",
+ " count | \n",
" aggregation | \n",
- " False | \n",
- " False | \n",
- " Calculates the time elapsed since the last datetime (default in seconds). | \n",
- " DatetimeTimeIndex | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the total number of values, excluding `NaN`. | \n",
+ " Index | \n",
" Numeric | \n",
"
\n",
" \n",
" 9 | \n",
- " any | \n",
+ " percent_true | \n",
" aggregation | \n",
" True | \n",
" False | \n",
- " Determines if any value is 'True' in a list. | \n",
- " Boolean | \n",
+ " Determines the percent of `True` values. | \n",
" Boolean | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 10 | \n",
- " last | \n",
- " aggregation | \n",
- " False | \n",
- " False | \n",
- " Determines the last value in a list. | \n",
- " Variable | \n",
- " None | \n",
- "
\n",
- " \n",
- " 11 | \n",
- " avg_time_between | \n",
+ " time_since_first | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Computes the average number of seconds between consecutive events. | \n",
+ " Calculates the time elapsed since the first datetime (in seconds). | \n",
" DatetimeTimeIndex | \n",
" Numeric | \n",
"
\n",
" \n",
- " 12 | \n",
+ " 11 | \n",
" max | \n",
" aggregation | \n",
" True | \n",
@@ -700,93 +760,103 @@
" Numeric | \n",
"
\n",
" \n",
+ " 12 | \n",
+ " any | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Determines if any value is 'True' in a list. | \n",
+ " Boolean | \n",
+ " Boolean | \n",
+ "
\n",
+ " \n",
" 13 | \n",
- " median | \n",
+ " mode | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Determines the middlemost number in a list of values. | \n",
- " Numeric | \n",
- " Numeric | \n",
+ " Determines the most commonly repeated value. | \n",
+ " Discrete | \n",
+ " None | \n",
"
\n",
" \n",
" 14 | \n",
- " mean | \n",
+ " entropy | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the entropy for a categorical variable | \n",
+ " Categorical | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " min | \n",
" aggregation | \n",
" True | \n",
" True | \n",
- " Computes the average for a list of values. | \n",
+ " Calculates the smallest value, ignoring `NaN` values. | \n",
" Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
- " 15 | \n",
- " num_true | \n",
+ " 16 | \n",
+ " all | \n",
" aggregation | \n",
" True | \n",
" False | \n",
- " Counts the number of `True` values. | \n",
+ " Calculates if all values are 'True' in a list. | \n",
+ " Boolean | \n",
" Boolean | \n",
- " Numeric | \n",
"
\n",
" \n",
- " 16 | \n",
- " min | \n",
+ " 17 | \n",
+ " skew | \n",
" aggregation | \n",
- " True | \n",
- " True | \n",
- " Calculates the smallest value, ignoring `NaN` values. | \n",
+ " False | \n",
+ " False | \n",
+ " Computes the extent to which a distribution differs from a normal distribution. | \n",
" Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
- " 17 | \n",
- " sum | \n",
+ " 18 | \n",
+ " mean | \n",
" aggregation | \n",
" True | \n",
" True | \n",
- " Calculates the total addition, ignoring `NaN`. | \n",
+ " Computes the average for a list of values. | \n",
" Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
- " 18 | \n",
- " trend | \n",
+ " 19 | \n",
+ " avg_time_between | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Calculates the trend of a variable over time. | \n",
- " Numeric, DatetimeTimeIndex | \n",
+ " Computes the average number of seconds between consecutive events. | \n",
+ " DatetimeTimeIndex | \n",
" Numeric | \n",
"
\n",
" \n",
- " 19 | \n",
- " n_most_common | \n",
- " aggregation | \n",
- " False | \n",
- " False | \n",
- " Determines the `n` most common elements. | \n",
- " Discrete | \n",
- " Discrete | \n",
- "
\n",
- " \n",
" 20 | \n",
- " time_since_first | \n",
+ " num_unique | \n",
" aggregation | \n",
- " False | \n",
- " False | \n",
- " Calculates the time elapsed since the first datetime (in seconds). | \n",
- " DatetimeTimeIndex | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the number of distinct values, ignoring `NaN` values. | \n",
+ " Discrete | \n",
" Numeric | \n",
"
\n",
" \n",
" 21 | \n",
- " std | \n",
+ " num_true | \n",
" aggregation | \n",
" True | \n",
- " True | \n",
- " Computes the dispersion relative to the mean value, ignoring `NaN`. | \n",
- " Numeric | \n",
+ " False | \n",
+ " Counts the number of `True` values. | \n",
+ " Boolean | \n",
" Numeric | \n",
"
\n",
" \n",
@@ -795,79 +865,79 @@
],
"text/plain": [
" name type dask_compatible koalas_compatible \\\n",
- "0 all aggregation True False \n",
- "1 skew aggregation False False \n",
- "2 percent_true aggregation True False \n",
- "3 count aggregation True True \n",
- "4 num_unique aggregation True True \n",
- "5 first aggregation False False \n",
- "6 mode aggregation False False \n",
- "7 entropy aggregation False False \n",
- "8 time_since_last aggregation False False \n",
- "9 any aggregation True False \n",
- "10 last aggregation False False \n",
- "11 avg_time_between aggregation False False \n",
- "12 max aggregation True True \n",
- "13 median aggregation False False \n",
- "14 mean aggregation True True \n",
- "15 num_true aggregation True False \n",
- "16 min aggregation True True \n",
- "17 sum aggregation True True \n",
- "18 trend aggregation False False \n",
- "19 n_most_common aggregation False False \n",
- "20 time_since_first aggregation False False \n",
- "21 std aggregation True True \n",
+ "0 sum aggregation True True \n",
+ "1 first aggregation False False \n",
+ "2 last aggregation False False \n",
+ "3 trend aggregation False False \n",
+ "4 n_most_common aggregation False False \n",
+ "5 time_since_last aggregation False False \n",
+ "6 std aggregation True True \n",
+ "7 median aggregation False False \n",
+ "8 count aggregation True True \n",
+ "9 percent_true aggregation True False \n",
+ "10 time_since_first aggregation False False \n",
+ "11 max aggregation True True \n",
+ "12 any aggregation True False \n",
+ "13 mode aggregation False False \n",
+ "14 entropy aggregation False False \n",
+ "15 min aggregation True True \n",
+ "16 all aggregation True False \n",
+ "17 skew aggregation False False \n",
+ "18 mean aggregation True True \n",
+ "19 avg_time_between aggregation False False \n",
+ "20 num_unique aggregation True True \n",
+ "21 num_true aggregation True False \n",
"\n",
" description \\\n",
- "0 Calculates if all values are 'True' in a list. \n",
- "1 Computes the extent to which a distribution differs from a normal distribution. \n",
- "2 Determines the percent of `True` values. \n",
- "3 Determines the total number of values, excluding `NaN`. \n",
- "4 Determines the number of distinct values, ignoring `NaN` values. \n",
- "5 Determines the first value in a list. \n",
- "6 Determines the most commonly repeated value. \n",
- "7 Calculates the entropy for a categorical variable \n",
- "8 Calculates the time elapsed since the last datetime (default in seconds). \n",
- "9 Determines if any value is 'True' in a list. \n",
- "10 Determines the last value in a list. \n",
- "11 Computes the average number of seconds between consecutive events. \n",
- "12 Calculates the highest value, ignoring `NaN` values. \n",
- "13 Determines the middlemost number in a list of values. \n",
- "14 Computes the average for a list of values. \n",
- "15 Counts the number of `True` values. \n",
- "16 Calculates the smallest value, ignoring `NaN` values. \n",
- "17 Calculates the total addition, ignoring `NaN`. \n",
- "18 Calculates the trend of a variable over time. \n",
- "19 Determines the `n` most common elements. \n",
- "20 Calculates the time elapsed since the first datetime (in seconds). \n",
- "21 Computes the dispersion relative to the mean value, ignoring `NaN`. \n",
+ "0 Calculates the total addition, ignoring `NaN`. \n",
+ "1 Determines the first value in a list. \n",
+ "2 Determines the last value in a list. \n",
+ "3 Calculates the trend of a variable over time. \n",
+ "4 Determines the `n` most common elements. \n",
+ "5 Calculates the time elapsed since the last datetime (default in seconds). \n",
+ "6 Computes the dispersion relative to the mean value, ignoring `NaN`. \n",
+ "7 Determines the middlemost number in a list of values. \n",
+ "8 Determines the total number of values, excluding `NaN`. \n",
+ "9 Determines the percent of `True` values. \n",
+ "10 Calculates the time elapsed since the first datetime (in seconds). \n",
+ "11 Calculates the highest value, ignoring `NaN` values. \n",
+ "12 Determines if any value is 'True' in a list. \n",
+ "13 Determines the most commonly repeated value. \n",
+ "14 Calculates the entropy for a categorical variable \n",
+ "15 Calculates the smallest value, ignoring `NaN` values. \n",
+ "16 Calculates if all values are 'True' in a list. \n",
+ "17 Computes the extent to which a distribution differs from a normal distribution. \n",
+ "18 Computes the average for a list of values. \n",
+ "19 Computes the average number of seconds between consecutive events. \n",
+ "20 Determines the number of distinct values, ignoring `NaN` values. \n",
+ "21 Counts the number of `True` values. \n",
"\n",
" valid_inputs return_type \n",
- "0 Boolean Boolean \n",
- "1 Numeric Numeric \n",
- "2 Boolean Numeric \n",
- "3 Index Numeric \n",
- "4 Discrete Numeric \n",
- "5 Variable None \n",
- "6 Discrete None \n",
- "7 Categorical Numeric \n",
- "8 DatetimeTimeIndex Numeric \n",
- "9 Boolean Boolean \n",
- "10 Variable None \n",
- "11 DatetimeTimeIndex Numeric \n",
- "12 Numeric Numeric \n",
- "13 Numeric Numeric \n",
- "14 Numeric Numeric \n",
- "15 Boolean Numeric \n",
- "16 Numeric Numeric \n",
+ "0 Numeric Numeric \n",
+ "1 Variable None \n",
+ "2 Variable None \n",
+ "3 DatetimeTimeIndex, Numeric Numeric \n",
+ "4 Discrete Discrete \n",
+ "5 DatetimeTimeIndex Numeric \n",
+ "6 Numeric Numeric \n",
+ "7 Numeric Numeric \n",
+ "8 Index Numeric \n",
+ "9 Boolean Numeric \n",
+ "10 DatetimeTimeIndex Numeric \n",
+ "11 Numeric Numeric \n",
+ "12 Boolean Boolean \n",
+ "13 Discrete None \n",
+ "14 Categorical Numeric \n",
+ "15 Numeric Numeric \n",
+ "16 Boolean Boolean \n",
"17 Numeric Numeric \n",
- "18 Numeric, DatetimeTimeIndex Numeric \n",
- "19 Discrete Discrete \n",
- "20 DatetimeTimeIndex Numeric \n",
- "21 Numeric Numeric "
+ "18 Numeric Numeric \n",
+ "19 DatetimeTimeIndex Numeric \n",
+ "20 Discrete Numeric \n",
+ "21 Boolean Numeric "
]
},
- "execution_count": 20,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -880,7 +950,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 12,
"id": "7762885f",
"metadata": {},
"outputs": [
@@ -917,52 +987,52 @@
" \n",
" \n",
" 22 | \n",
- " url_to_domain | \n",
+ " greater_than | \n",
" transform | \n",
+ " True | \n",
" False | \n",
- " False | \n",
- " Determines the domain of a url. | \n",
- " URL | \n",
- " Categorical | \n",
+ " Determines if values in one list are greater than another list. | \n",
+ " Ordinal, Datetime, Numeric | \n",
+ " Boolean | \n",
"
\n",
" \n",
" 23 | \n",
- " cum_mean | \n",
+ " less_than | \n",
" transform | \n",
- " False | \n",
- " False | \n",
- " Calculates the cumulative mean. | \n",
- " Numeric | \n",
- " Numeric | \n",
+ " True | \n",
+ " True | \n",
+ " Determines if values in one list are less than another list. | \n",
+ " Ordinal, Datetime, Numeric | \n",
+ " Boolean | \n",
"
\n",
" \n",
" 24 | \n",
- " minute | \n",
+ " and | \n",
" transform | \n",
" True | \n",
" True | \n",
- " Determines the minutes value of a datetime. | \n",
- " Datetime | \n",
- " Numeric | \n",
+ " Element-wise logical AND of two lists. | \n",
+ " Boolean | \n",
+ " Boolean | \n",
"
\n",
" \n",
" 25 | \n",
- " cum_max | \n",
+ " less_than_scalar | \n",
" transform | \n",
- " False | \n",
- " False | \n",
- " Calculates the cumulative maximum. | \n",
- " Numeric | \n",
- " Numeric | \n",
+ " True | \n",
+ " True | \n",
+ " Determines if values are less than a given scalar. | \n",
+ " Ordinal, Datetime, Numeric | \n",
+ " Boolean | \n",
"
\n",
" \n",
" 26 | \n",
- " age | \n",
+ " modulo_numeric | \n",
" transform | \n",
" True | \n",
- " False | \n",
- " Calculates the age in years as a floating point number given a | \n",
- " DateOfBirth | \n",
+ " True | \n",
+ " Element-wise modulo of two lists. | \n",
+ " Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
@@ -977,52 +1047,52 @@
"
\n",
" \n",
" 79 | \n",
- " greater_than_scalar | \n",
+ " is_weekend | \n",
" transform | \n",
" True | \n",
" True | \n",
- " Determines if values are greater than a given scalar. | \n",
- " Numeric, Datetime, Ordinal | \n",
+ " Determines if a date falls on a weekend. | \n",
+ " Datetime | \n",
" Boolean | \n",
"
\n",
" \n",
" 80 | \n",
- " url_to_protocol | \n",
+ " num_characters | \n",
" transform | \n",
- " False | \n",
- " False | \n",
- " Determines the protocol (http or https) of a url. | \n",
- " URL | \n",
- " Categorical | \n",
+ " True | \n",
+ " True | \n",
+ " Calculates the number of characters in a string. | \n",
+ " NaturalLanguage | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 81 | \n",
- " month | \n",
+ " latitude | \n",
" transform | \n",
- " True | \n",
- " True | \n",
- " Determines the month value of a datetime. | \n",
- " Datetime | \n",
- " Ordinal | \n",
+ " False | \n",
+ " False | \n",
+ " Returns the first tuple value in a list of LatLong tuples. | \n",
+ " LatLong | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 82 | \n",
- " divide_numeric_scalar | \n",
+ " cum_sum | \n",
" transform | \n",
- " True | \n",
- " True | \n",
- " Divide each element in the list by a scalar. | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the cumulative sum. | \n",
" Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
" 83 | \n",
- " time_since_previous | \n",
+ " subtract_numeric_scalar | \n",
" transform | \n",
- " False | \n",
- " False | \n",
- " Compute the time since the previous entry in a list. | \n",
- " DatetimeTimeIndex | \n",
+ " True | \n",
+ " True | \n",
+ " Subtract a scalar from each element in the list. | \n",
+ " Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
@@ -1031,49 +1101,49 @@
""
],
"text/plain": [
- " name type dask_compatible koalas_compatible \\\n",
- "22 url_to_domain transform False False \n",
- "23 cum_mean transform False False \n",
- "24 minute transform True True \n",
- "25 cum_max transform False False \n",
- "26 age transform True False \n",
- ".. ... ... ... ... \n",
- "79 greater_than_scalar transform True True \n",
- "80 url_to_protocol transform False False \n",
- "81 month transform True True \n",
- "82 divide_numeric_scalar transform True True \n",
- "83 time_since_previous transform False False \n",
+ " name type dask_compatible koalas_compatible \\\n",
+ "22 greater_than transform True False \n",
+ "23 less_than transform True True \n",
+ "24 and transform True True \n",
+ "25 less_than_scalar transform True True \n",
+ "26 modulo_numeric transform True True \n",
+ ".. ... ... ... ... \n",
+ "79 is_weekend transform True True \n",
+ "80 num_characters transform True True \n",
+ "81 latitude transform False False \n",
+ "82 cum_sum transform False False \n",
+ "83 subtract_numeric_scalar transform True True \n",
"\n",
- " description \\\n",
- "22 Determines the domain of a url. \n",
- "23 Calculates the cumulative mean. \n",
- "24 Determines the minutes value of a datetime. \n",
- "25 Calculates the cumulative maximum. \n",
- "26 Calculates the age in years as a floating point number given a \n",
- ".. ... \n",
- "79 Determines if values are greater than a given scalar. \n",
- "80 Determines the protocol (http or https) of a url. \n",
- "81 Determines the month value of a datetime. \n",
- "82 Divide each element in the list by a scalar. \n",
- "83 Compute the time since the previous entry in a list. \n",
+ " description \\\n",
+ "22 Determines if values in one list are greater than another list. \n",
+ "23 Determines if values in one list are less than another list. \n",
+ "24 Element-wise logical AND of two lists. \n",
+ "25 Determines if values are less than a given scalar. \n",
+ "26 Element-wise modulo of two lists. \n",
+ ".. ... \n",
+ "79 Determines if a date falls on a weekend. \n",
+ "80 Calculates the number of characters in a string. \n",
+ "81 Returns the first tuple value in a list of LatLong tuples. \n",
+ "82 Calculates the cumulative sum. \n",
+ "83 Subtract a scalar from each element in the list. \n",
"\n",
- " valid_inputs return_type \n",
- "22 URL Categorical \n",
- "23 Numeric Numeric \n",
- "24 Datetime Numeric \n",
- "25 Numeric Numeric \n",
- "26 DateOfBirth Numeric \n",
- ".. ... ... \n",
- "79 Numeric, Datetime, Ordinal Boolean \n",
- "80 URL Categorical \n",
- "81 Datetime Ordinal \n",
- "82 Numeric Numeric \n",
- "83 DatetimeTimeIndex Numeric \n",
+ " valid_inputs return_type \n",
+ "22 Ordinal, Datetime, Numeric Boolean \n",
+ "23 Ordinal, Datetime, Numeric Boolean \n",
+ "24 Boolean Boolean \n",
+ "25 Ordinal, Datetime, Numeric Boolean \n",
+ "26 Numeric Numeric \n",
+ ".. ... ... \n",
+ "79 Datetime Boolean \n",
+ "80 NaturalLanguage Numeric \n",
+ "81 LatLong Numeric \n",
+ "82 Numeric Numeric \n",
+ "83 Numeric Numeric \n",
"\n",
"[62 rows x 7 columns]"
]
},
- "execution_count": 21,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -1092,11 +1162,20 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 14,
"id": "6d3df2f7",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Wall time: 1min 3s\n"
+ ]
+ }
+ ],
"source": [
+ "%%time\n",
"features, feature_names = ft.dfs(entityset = es, \n",
" target_entity = 'X', \n",
" max_depth = 2)"
@@ -1112,7 +1191,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 15,
"id": "9a44a98a",
"metadata": {},
"outputs": [
@@ -1653,7 +1732,7 @@
" ]"
]
},
- "execution_count": 27,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -1664,7 +1743,7 @@
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 16,
"id": "d5036e65",
"metadata": {},
"outputs": [
@@ -1703,31 +1782,31 @@
" \n",
" \n",
" 0 | \n",
- " 3000.267286 | \n",
+ " 3000.267334 | \n",
" 2596.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 561 | \n",
- " 3000.267286 | \n",
+ " 3000.267334 | \n",
" 2596.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2062 | \n",
- " 2926.053180 | \n",
+ " 2926.053223 | \n",
" 2596.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 6946 | \n",
- " 2926.053180 | \n",
+ " 2926.053223 | \n",
" 2596.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 6976 | \n",
- " 2926.053180 | \n",
+ " 2926.053223 | \n",
" 2596.0 | \n",
" 0.0 | \n",
"
\n",
@@ -1738,14 +1817,14 @@
"text/plain": [
" Wilderness_Area_0.MEAN(X.Elevation) Elevation Wilderness_Area_0\n",
"index \n",
- "0 3000.267286 2596.0 1.0\n",
- "561 3000.267286 2596.0 1.0\n",
- "2062 2926.053180 2596.0 0.0\n",
- "6946 2926.053180 2596.0 0.0\n",
- "6976 2926.053180 2596.0 0.0"
+ "0 3000.267334 2596.0 1.0\n",
+ "561 3000.267334 2596.0 1.0\n",
+ "2062 2926.053223 2596.0 0.0\n",
+ "6946 2926.053223 2596.0 0.0\n",
+ "6976 2926.053223 2596.0 0.0"
]
},
- "execution_count": 35,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -1756,7 +1835,7 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 17,
"id": "ec8b7ccd",
"metadata": {},
"outputs": [
@@ -1766,7 +1845,7 @@
"(581012, 532)"
]
},
- "execution_count": 36,
+ "execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -1798,7 +1877,7 @@
"id": "75b7cc64",
"metadata": {},
"source": [
- "为了解决“维数灾难”,有必要应用特征简化和选择,这意味着从数据中去除低值特征。但请记住,特征选择可能会影响ML模型的性能。棘手的是,ML模型的设计包含一个艺术元素。这绝对不是一个具有严格规则的确定性过程,要想取得成功就必须遵循这些规则。为了得到一个精确的模型,有必要应用、组合和比较几十种方法。在本notebook中,我不会解释所有可能的方法来处理“维度灾难”。我将集中讨论以下方法:\n",
+ "为了解决“维数灾难”,有必要应用特征约简和选择,这意味着从数据中去除低值特征。但请记住,特征选择可能会影响ML模型的性能。棘手的是,ML模型的设计包含一个艺术元素。这绝对不是一个具有严格规则的确定性过程,要想取得成功就必须遵循这些规则。为了得到一个精确的模型,有必要应用、组合和比较几十种方法。在本notebook中,我不会解释所有可能的方法来处理“维度灾难”。我将集中讨论以下方法:\n",
"\n",
"* 确定共线特征\n",
"\n",
@@ -4248,69 +4327,6 @@
"注意,正常情况下我们是不知道测试集的标签,所以这里先做分割,切分训练和预测集合"
]
},
- {
- "cell_type": "code",
- "execution_count": 43,
- "id": "1e65ffe8",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " index | \n",
- " Cover_Type | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0 | \n",
- " 4.0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 1 | \n",
- " 4.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " index Cover_Type\n",
- "0 0 4.0\n",
- "1 1 4.0"
- ]
- },
- "execution_count": 43,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "y = pd.DataFrame(y, columns=data.target_names)\n",
- "y = y.reset_index()\n",
- "y.head(2)"
- ]
- },
{
"cell_type": "code",
"execution_count": 46,
@@ -4968,11 +4984,12 @@
}
],
"source": [
- "import gc\n",
+ "\"\"\"\n",
"del features_filtered\n",
"del features_positive\n",
"del fetch_covtype\n",
"del df, X,y, X_selected_df,train,test,train_df,test_df,train_X,train_y\n",
+ "\"\"\"\n",
"gc.collect()"
]
},
@@ -4986,7 +5003,7 @@
},
{
"cell_type": "code",
- "execution_count": 65,
+ "execution_count": 8,
"id": "b7241552",
"metadata": {},
"outputs": [
@@ -5110,7 +5127,7 @@
"246788 0.0 0.0 0.0 0.0 "
]
},
- "execution_count": 65,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -5128,24 +5145,16 @@
},
{
"cell_type": "code",
- "execution_count": 68,
+ "execution_count": 9,
"id": "db3d3b92",
"metadata": {},
"outputs": [
{
- "ename": "MemoryError",
- "evalue": "Unable to allocate 8.55 MiB for an array with shape (160080, 1, 7) and data type float64",
- "output_type": "error",
- "traceback": [
- "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[1;31mMemoryError\u001b[0m Traceback (most recent call last)",
- "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n",
- "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\_forest.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 397\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 398\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moob_score\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 399\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_set_oob_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 400\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 401\u001b[0m \u001b[1;31m# Decapsulate classes_ attributes\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\_forest.py\u001b[0m in \u001b[0;36m_set_oob_score\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 528\u001b[0m unsampled_indices = _generate_unsampled_indices(\n\u001b[0;32m 529\u001b[0m estimator.random_state, n_samples, n_samples_bootstrap)\n\u001b[1;32m--> 530\u001b[1;33m p_estimator = estimator.predict_proba(X[unsampled_indices, :],\n\u001b[0m\u001b[0;32m 531\u001b[0m check_input=False)\n\u001b[0;32m 532\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\tree\\_classes.py\u001b[0m in \u001b[0;36mpredict_proba\u001b[1;34m(self, X, check_input)\u001b[0m\n\u001b[0;32m 929\u001b[0m \u001b[0mcheck_is_fitted\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 930\u001b[0m \u001b[0mX\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_X_predict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcheck_input\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 931\u001b[1;33m \u001b[0mproba\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtree_\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 932\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 933\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mn_outputs_\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32msklearn\\tree\\_tree.pyx\u001b[0m in \u001b[0;36msklearn.tree._tree.Tree.predict\u001b[1;34m()\u001b[0m\n",
- "\u001b[1;32msklearn\\tree\\_tree.pyx\u001b[0m in \u001b[0;36msklearn.tree._tree.Tree.predict\u001b[1;34m()\u001b[0m\n",
- "\u001b[1;31mMemoryError\u001b[0m: Unable to allocate 8.55 MiB for an array with shape (160080, 1, 7) and data type float64"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.9673328605949619\n",
+ "Wall time: 14min 30s\n"
]
}
],
@@ -5157,13 +5166,429 @@
"print(accuracy_score(pred_org_test_y,org_test_y)) # RF"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "50b5f988",
+ "metadata": {},
+ "source": [
+ "### 5.2 使用未约简与选择的特征的分数"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": null,
- "id": "52e36341",
+ "execution_count": 18,
+ "id": "0dc54e8c",
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " Elevation | \n",
+ " Horizontal_Distance_To_Hydrology | \n",
+ " Vertical_Distance_To_Hydrology | \n",
+ " Horizontal_Distance_To_Roadways | \n",
+ " Horizontal_Distance_To_Fire_Points | \n",
+ " Aspect | \n",
+ " Slope | \n",
+ " Hillshade_9am | \n",
+ " Hillshade_Noon | \n",
+ " ... | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) | \n",
+ " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.SUM(X.Elevation) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) | \n",
+ " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) | \n",
+ " Cover_Type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2596.0 | \n",
+ " 258.0 | \n",
+ " 0.0 | \n",
+ " 510.0 | \n",
+ " 6279.0 | \n",
+ " 51.0 | \n",
+ " 3.0 | \n",
+ " 221.0 | \n",
+ " 232.0 | \n",
+ " ... | \n",
+ " 1324.050751 | \n",
+ " 212.689925 | \n",
+ " 1558.361956 | \n",
+ " 58.279989 | \n",
+ " 1.715981e+09 | \n",
+ " 1.149499e+09 | \n",
+ " 156171328.0 | \n",
+ " 1.364632e+09 | \n",
+ " 26848308.0 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2590.0 | \n",
+ " 212.0 | \n",
+ " -6.0 | \n",
+ " 390.0 | \n",
+ " 6225.0 | \n",
+ " 56.0 | \n",
+ " 2.0 | \n",
+ " 220.0 | \n",
+ " 235.0 | \n",
+ " ... | \n",
+ " 1324.050751 | \n",
+ " 212.689925 | \n",
+ " 1558.361956 | \n",
+ " 58.279989 | \n",
+ " 1.715981e+09 | \n",
+ " 1.149499e+09 | \n",
+ " 156171328.0 | \n",
+ " 1.364632e+09 | \n",
+ " 26848308.0 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 534 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index Elevation Horizontal_Distance_To_Hydrology \\\n",
+ "0 0 2596.0 258.0 \n",
+ "1 1 2590.0 212.0 \n",
+ "\n",
+ " Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways \\\n",
+ "0 0.0 510.0 \n",
+ "1 -6.0 390.0 \n",
+ "\n",
+ " Horizontal_Distance_To_Fire_Points Aspect Slope Hillshade_9am \\\n",
+ "0 6279.0 51.0 3.0 221.0 \n",
+ "1 6225.0 56.0 2.0 220.0 \n",
+ "\n",
+ " Hillshade_Noon ... Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) \\\n",
+ "0 232.0 ... 1324.050751 \n",
+ "1 235.0 ... 1324.050751 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) \\\n",
+ "0 212.689925 \n",
+ "1 212.689925 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) \\\n",
+ "0 1558.361956 \n",
+ "1 1558.361956 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) \\\n",
+ "0 58.279989 \n",
+ "1 58.279989 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Elevation) \\\n",
+ "0 1.715981e+09 \n",
+ "1 1.715981e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) \\\n",
+ "0 1.149499e+09 \n",
+ "1 1.149499e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) \\\n",
+ "0 156171328.0 \n",
+ "1 156171328.0 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) \\\n",
+ "0 1.364632e+09 \n",
+ "1 1.364632e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) Cover_Type \n",
+ "0 26848308.0 4.0 \n",
+ "1 26848308.0 4.0 \n",
+ "\n",
+ "[2 rows x 534 columns]"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.merge(features, y, on=['index'])\n",
+ "df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "637b3a7e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3256"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "del features, X\n",
+ "gc.collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "4ac537b8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " Elevation | \n",
+ " Horizontal_Distance_To_Hydrology | \n",
+ " Vertical_Distance_To_Hydrology | \n",
+ " Horizontal_Distance_To_Roadways | \n",
+ " Horizontal_Distance_To_Fire_Points | \n",
+ " Aspect | \n",
+ " Slope | \n",
+ " Hillshade_9am | \n",
+ " Hillshade_Noon | \n",
+ " ... | \n",
+ " Soil_Type_4.STD(X.Elevation) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) | \n",
+ " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.SUM(X.Elevation) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) | \n",
+ " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 250728 | \n",
+ " 250728 | \n",
+ " 3351.0 | \n",
+ " 726.0 | \n",
+ " 124.0 | \n",
+ " 3813.0 | \n",
+ " 2271.0 | \n",
+ " 206.0 | \n",
+ " 27.0 | \n",
+ " 192.0 | \n",
+ " 252.0 | \n",
+ " ... | \n",
+ " 277.045517 | \n",
+ " 1324.050751 | \n",
+ " 212.689925 | \n",
+ " 1558.361956 | \n",
+ " 58.279989 | \n",
+ " 1.715981e+09 | \n",
+ " 1.149499e+09 | \n",
+ " 156171328.0 | \n",
+ " 1.364632e+09 | \n",
+ " 26848308.0 | \n",
+ "
\n",
+ " \n",
+ " 246788 | \n",
+ " 246788 | \n",
+ " 2732.0 | \n",
+ " 212.0 | \n",
+ " 1.0 | \n",
+ " 1082.0 | \n",
+ " 912.0 | \n",
+ " 129.0 | \n",
+ " 7.0 | \n",
+ " 231.0 | \n",
+ " 236.0 | \n",
+ " ... | \n",
+ " 277.045517 | \n",
+ " 1324.050751 | \n",
+ " 212.689925 | \n",
+ " 1558.361956 | \n",
+ " 58.279989 | \n",
+ " 1.715981e+09 | \n",
+ " 1.149499e+09 | \n",
+ " 156171328.0 | \n",
+ " 1.364632e+09 | \n",
+ " 26848308.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 533 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index Elevation Horizontal_Distance_To_Hydrology \\\n",
+ "250728 250728 3351.0 726.0 \n",
+ "246788 246788 2732.0 212.0 \n",
+ "\n",
+ " Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways \\\n",
+ "250728 124.0 3813.0 \n",
+ "246788 1.0 1082.0 \n",
+ "\n",
+ " Horizontal_Distance_To_Fire_Points Aspect Slope Hillshade_9am \\\n",
+ "250728 2271.0 206.0 27.0 192.0 \n",
+ "246788 912.0 129.0 7.0 231.0 \n",
+ "\n",
+ " Hillshade_Noon ... Soil_Type_4.STD(X.Elevation) \\\n",
+ "250728 252.0 ... 277.045517 \n",
+ "246788 236.0 ... 277.045517 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) \\\n",
+ "250728 1324.050751 \n",
+ "246788 1324.050751 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) \\\n",
+ "250728 212.689925 \n",
+ "246788 212.689925 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) \\\n",
+ "250728 1558.361956 \n",
+ "246788 1558.361956 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) \\\n",
+ "250728 58.279989 \n",
+ "246788 58.279989 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Elevation) \\\n",
+ "250728 1.715981e+09 \n",
+ "246788 1.715981e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) \\\n",
+ "250728 1.149499e+09 \n",
+ "246788 1.149499e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) \\\n",
+ "250728 156171328.0 \n",
+ "246788 156171328.0 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) \\\n",
+ "250728 1.364632e+09 \n",
+ "246788 1.364632e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) \n",
+ "250728 26848308.0 \n",
+ "246788 26848308.0 \n",
+ "\n",
+ "[2 rows x 533 columns]"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_df, test_df = train_test_split(df,random_state=42)\n",
+ "train_X = train_df.drop('Cover_Type',1)\n",
+ "train_y = train_df['Cover_Type']\n",
+ "\n",
+ "test_X = test_df.drop('Cover_Type',1)\n",
+ "test_y = test_df['Cover_Type']\n",
+ "test_X.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "24c7b22f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "45"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "del df, train_df, test_df\n",
+ "gc.collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "869777ba",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.9442352309418738\n",
+ "Wall time: 30min 31s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "random_forest = RandomForestClassifier(n_estimators=500,oob_score=True)\n",
+ "random_forest.fit(train_X, train_y)\n",
+ "pred_y = random_forest.predict(test_X)\n",
+ "print(accuracy_score(pred_y,test_y)) # RF"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3739a43c",
+ "metadata": {},
+ "source": [
+ "从结果来看,在这个数据集上,不管是增加的特征,还是增加后过滤的特征,效果都比原始特征差。我也咨询了一些朋友他们试了效果都一般,但是kaggle上很多人点赞,如果你们在哪个数据集上试了效果上涨,请联系我。"
+ ]
}
],
"metadata": {
diff --git a/竞赛优胜技巧/Automated feature engineering.ipynb b/竞赛优胜技巧/Automated feature engineering.ipynb
index 839f48c..488f3e0 100644
--- a/竞赛优胜技巧/Automated feature engineering.ipynb
+++ b/竞赛优胜技巧/Automated feature engineering.ipynb
@@ -13,6 +13,7 @@
"id": "66dfb30d",
"metadata": {},
"source": [
+ "### 结论:效果一般\n",
"搬运参考:https://www.kaggle.com/liananapalkova/automated-feature-engineering-for-titanic-dataset"
]
},
@@ -99,13 +100,14 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 19,
"id": "43cc9a46",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import time\n",
+ "import gc\n",
"\n",
"import featuretools as ft\n",
"from featuretools.primitives import *\n",
@@ -304,6 +306,69 @@
{
"cell_type": "code",
"execution_count": 5,
+ "id": "af6722f2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " Cover_Type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index Cover_Type\n",
+ "0 0 4.0\n",
+ "1 1 4.0"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y = pd.DataFrame(y, columns=data.target_names)\n",
+ "y = y.reset_index()\n",
+ "y.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
"id": "2d34ab5c",
"metadata": {},
"outputs": [
@@ -347,7 +412,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
"id": "1551c241",
"metadata": {},
"outputs": [
@@ -404,10 +469,18 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 8,
"id": "06f24545",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Object `es.entity_from_dataframe` not found.\n"
+ ]
+ }
+ ],
"source": [
"es.entity_from_dataframe?"
]
@@ -430,7 +503,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 9,
"id": "f2c69a94",
"metadata": {},
"outputs": [
@@ -444,7 +517,7 @@
" No relationships"
]
},
- "execution_count": 18,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -476,7 +549,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 10,
"id": "770130bc",
"metadata": {
"scrolled": false
@@ -509,7 +582,7 @@
" X.Soil_Type_4 -> Soil_Type_4.Soil_Type_4"
]
},
- "execution_count": 19,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -529,7 +602,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 11,
"id": "352fa085",
"metadata": {
"scrolled": true
@@ -568,126 +641,116 @@
" \n",
" \n",
" 0 | \n",
- " all | \n",
+ " sum | \n",
" aggregation | \n",
" True | \n",
- " False | \n",
- " Calculates if all values are 'True' in a list. | \n",
- " Boolean | \n",
- " Boolean | \n",
+ " True | \n",
+ " Calculates the total addition, ignoring `NaN`. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 1 | \n",
- " skew | \n",
+ " first | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Computes the extent to which a distribution differs from a normal distribution. | \n",
- " Numeric | \n",
- " Numeric | \n",
+ " Determines the first value in a list. | \n",
+ " Variable | \n",
+ " None | \n",
"
\n",
" \n",
" 2 | \n",
- " percent_true | \n",
+ " last | \n",
" aggregation | \n",
- " True | \n",
" False | \n",
- " Determines the percent of `True` values. | \n",
- " Boolean | \n",
- " Numeric | \n",
+ " False | \n",
+ " Determines the last value in a list. | \n",
+ " Variable | \n",
+ " None | \n",
"
\n",
" \n",
" 3 | \n",
- " count | \n",
+ " trend | \n",
" aggregation | \n",
- " True | \n",
- " True | \n",
- " Determines the total number of values, excluding `NaN`. | \n",
- " Index | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the trend of a variable over time. | \n",
+ " DatetimeTimeIndex, Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
" 4 | \n",
- " num_unique | \n",
+ " n_most_common | \n",
" aggregation | \n",
- " True | \n",
- " True | \n",
- " Determines the number of distinct values, ignoring `NaN` values. | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the `n` most common elements. | \n",
+ " Discrete | \n",
" Discrete | \n",
- " Numeric | \n",
"
\n",
" \n",
" 5 | \n",
- " first | \n",
+ " time_since_last | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Determines the first value in a list. | \n",
- " Variable | \n",
- " None | \n",
+ " Calculates the time elapsed since the last datetime (default in seconds). | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 6 | \n",
- " mode | \n",
+ " std | \n",
" aggregation | \n",
- " False | \n",
- " False | \n",
- " Determines the most commonly repeated value. | \n",
- " Discrete | \n",
- " None | \n",
+ " True | \n",
+ " True | \n",
+ " Computes the dispersion relative to the mean value, ignoring `NaN`. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 7 | \n",
- " entropy | \n",
+ " median | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Calculates the entropy for a categorical variable | \n",
- " Categorical | \n",
+ " Determines the middlemost number in a list of values. | \n",
+ " Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
" 8 | \n",
- " time_since_last | \n",
+ " count | \n",
" aggregation | \n",
- " False | \n",
- " False | \n",
- " Calculates the time elapsed since the last datetime (default in seconds). | \n",
- " DatetimeTimeIndex | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the total number of values, excluding `NaN`. | \n",
+ " Index | \n",
" Numeric | \n",
"
\n",
" \n",
" 9 | \n",
- " any | \n",
+ " percent_true | \n",
" aggregation | \n",
" True | \n",
" False | \n",
- " Determines if any value is 'True' in a list. | \n",
- " Boolean | \n",
+ " Determines the percent of `True` values. | \n",
" Boolean | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 10 | \n",
- " last | \n",
- " aggregation | \n",
- " False | \n",
- " False | \n",
- " Determines the last value in a list. | \n",
- " Variable | \n",
- " None | \n",
- "
\n",
- " \n",
- " 11 | \n",
- " avg_time_between | \n",
+ " time_since_first | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Computes the average number of seconds between consecutive events. | \n",
+ " Calculates the time elapsed since the first datetime (in seconds). | \n",
" DatetimeTimeIndex | \n",
" Numeric | \n",
"
\n",
" \n",
- " 12 | \n",
+ " 11 | \n",
" max | \n",
" aggregation | \n",
" True | \n",
@@ -697,93 +760,103 @@
" Numeric | \n",
"
\n",
" \n",
+ " 12 | \n",
+ " any | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Determines if any value is 'True' in a list. | \n",
+ " Boolean | \n",
+ " Boolean | \n",
+ "
\n",
+ " \n",
" 13 | \n",
- " median | \n",
+ " mode | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Determines the middlemost number in a list of values. | \n",
- " Numeric | \n",
- " Numeric | \n",
+ " Determines the most commonly repeated value. | \n",
+ " Discrete | \n",
+ " None | \n",
"
\n",
" \n",
" 14 | \n",
- " mean | \n",
+ " entropy | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the entropy for a categorical variable | \n",
+ " Categorical | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " min | \n",
" aggregation | \n",
" True | \n",
" True | \n",
- " Computes the average for a list of values. | \n",
+ " Calculates the smallest value, ignoring `NaN` values. | \n",
" Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
- " 15 | \n",
- " num_true | \n",
+ " 16 | \n",
+ " all | \n",
" aggregation | \n",
" True | \n",
" False | \n",
- " Counts the number of `True` values. | \n",
+ " Calculates if all values are 'True' in a list. | \n",
+ " Boolean | \n",
" Boolean | \n",
- " Numeric | \n",
"
\n",
" \n",
- " 16 | \n",
- " min | \n",
+ " 17 | \n",
+ " skew | \n",
" aggregation | \n",
- " True | \n",
- " True | \n",
- " Calculates the smallest value, ignoring `NaN` values. | \n",
+ " False | \n",
+ " False | \n",
+ " Computes the extent to which a distribution differs from a normal distribution. | \n",
" Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
- " 17 | \n",
- " sum | \n",
+ " 18 | \n",
+ " mean | \n",
" aggregation | \n",
" True | \n",
" True | \n",
- " Calculates the total addition, ignoring `NaN`. | \n",
+ " Computes the average for a list of values. | \n",
" Numeric | \n",
" Numeric | \n",
"
\n",
" \n",
- " 18 | \n",
- " trend | \n",
+ " 19 | \n",
+ " avg_time_between | \n",
" aggregation | \n",
" False | \n",
" False | \n",
- " Calculates the trend of a variable over time. | \n",
- " Numeric, DatetimeTimeIndex | \n",
+ " Computes the average number of seconds between consecutive events. | \n",
+ " DatetimeTimeIndex | \n",
" Numeric | \n",
"
\n",
" \n",
- " 19 | \n",
- " n_most_common | \n",
+ " 20 | \n",
+ " num_unique | \n",
" aggregation | \n",
- " False | \n",
- " False | \n",
- " Determines the `n` most common elements. | \n",
- " Discrete | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the number of distinct values, ignoring `NaN` values. | \n",
" Discrete | \n",
- "
\n",
- " \n",
- " 20 | \n",
- " time_since_first | \n",
- " aggregation | \n",
- " False | \n",
- " False | \n",
- " Calculates the time elapsed since the first datetime (in seconds). | \n",
- " DatetimeTimeIndex | \n",
- " Numeric | \n",
+ " Numeric | \n",
"
\n",
" \n",
" 21 | \n",
- " std | \n",
+ " num_true | \n",
" aggregation | \n",
" True | \n",
- " True | \n",
- " Computes the dispersion relative to the mean value, ignoring `NaN`. | \n",
- " Numeric | \n",
+ " False | \n",
+ " Counts the number of `True` values. | \n",
+ " Boolean | \n",
" Numeric | \n",
"
\n",
" \n",
@@ -792,79 +865,79 @@
],
"text/plain": [
" name type dask_compatible koalas_compatible \\\n",
- "0 all aggregation True False \n",
- "1 skew aggregation False False \n",
- "2 percent_true aggregation True False \n",
- "3 count aggregation True True \n",
- "4 num_unique aggregation True True \n",
- "5 first aggregation False False \n",
- "6 mode aggregation False False \n",
- "7 entropy aggregation False False \n",
- "8 time_since_last aggregation False False \n",
- "9 any aggregation True False \n",
- "10 last aggregation False False \n",
- "11 avg_time_between aggregation False False \n",
- "12 max aggregation True True \n",
- "13 median aggregation False False \n",
- "14 mean aggregation True True \n",
- "15 num_true aggregation True False \n",
- "16 min aggregation True True \n",
- "17 sum aggregation True True \n",
- "18 trend aggregation False False \n",
- "19 n_most_common aggregation False False \n",
- "20 time_since_first aggregation False False \n",
- "21 std aggregation True True \n",
+ "0 sum aggregation True True \n",
+ "1 first aggregation False False \n",
+ "2 last aggregation False False \n",
+ "3 trend aggregation False False \n",
+ "4 n_most_common aggregation False False \n",
+ "5 time_since_last aggregation False False \n",
+ "6 std aggregation True True \n",
+ "7 median aggregation False False \n",
+ "8 count aggregation True True \n",
+ "9 percent_true aggregation True False \n",
+ "10 time_since_first aggregation False False \n",
+ "11 max aggregation True True \n",
+ "12 any aggregation True False \n",
+ "13 mode aggregation False False \n",
+ "14 entropy aggregation False False \n",
+ "15 min aggregation True True \n",
+ "16 all aggregation True False \n",
+ "17 skew aggregation False False \n",
+ "18 mean aggregation True True \n",
+ "19 avg_time_between aggregation False False \n",
+ "20 num_unique aggregation True True \n",
+ "21 num_true aggregation True False \n",
"\n",
" description \\\n",
- "0 Calculates if all values are 'True' in a list. \n",
- "1 Computes the extent to which a distribution differs from a normal distribution. \n",
- "2 Determines the percent of `True` values. \n",
- "3 Determines the total number of values, excluding `NaN`. \n",
- "4 Determines the number of distinct values, ignoring `NaN` values. \n",
- "5 Determines the first value in a list. \n",
- "6 Determines the most commonly repeated value. \n",
- "7 Calculates the entropy for a categorical variable \n",
- "8 Calculates the time elapsed since the last datetime (default in seconds). \n",
- "9 Determines if any value is 'True' in a list. \n",
- "10 Determines the last value in a list. \n",
- "11 Computes the average number of seconds between consecutive events. \n",
- "12 Calculates the highest value, ignoring `NaN` values. \n",
- "13 Determines the middlemost number in a list of values. \n",
- "14 Computes the average for a list of values. \n",
- "15 Counts the number of `True` values. \n",
- "16 Calculates the smallest value, ignoring `NaN` values. \n",
- "17 Calculates the total addition, ignoring `NaN`. \n",
- "18 Calculates the trend of a variable over time. \n",
- "19 Determines the `n` most common elements. \n",
- "20 Calculates the time elapsed since the first datetime (in seconds). \n",
- "21 Computes the dispersion relative to the mean value, ignoring `NaN`. \n",
+ "0 Calculates the total addition, ignoring `NaN`. \n",
+ "1 Determines the first value in a list. \n",
+ "2 Determines the last value in a list. \n",
+ "3 Calculates the trend of a variable over time. \n",
+ "4 Determines the `n` most common elements. \n",
+ "5 Calculates the time elapsed since the last datetime (default in seconds). \n",
+ "6 Computes the dispersion relative to the mean value, ignoring `NaN`. \n",
+ "7 Determines the middlemost number in a list of values. \n",
+ "8 Determines the total number of values, excluding `NaN`. \n",
+ "9 Determines the percent of `True` values. \n",
+ "10 Calculates the time elapsed since the first datetime (in seconds). \n",
+ "11 Calculates the highest value, ignoring `NaN` values. \n",
+ "12 Determines if any value is 'True' in a list. \n",
+ "13 Determines the most commonly repeated value. \n",
+ "14 Calculates the entropy for a categorical variable \n",
+ "15 Calculates the smallest value, ignoring `NaN` values. \n",
+ "16 Calculates if all values are 'True' in a list. \n",
+ "17 Computes the extent to which a distribution differs from a normal distribution. \n",
+ "18 Computes the average for a list of values. \n",
+ "19 Computes the average number of seconds between consecutive events. \n",
+ "20 Determines the number of distinct values, ignoring `NaN` values. \n",
+ "21 Counts the number of `True` values. \n",
"\n",
" valid_inputs return_type \n",
- "0 Boolean Boolean \n",
- "1 Numeric Numeric \n",
- "2 Boolean Numeric \n",
- "3 Index Numeric \n",
- "4 Discrete Numeric \n",
- "5 Variable None \n",
- "6 Discrete None \n",
- "7 Categorical Numeric \n",
- "8 DatetimeTimeIndex Numeric \n",
- "9 Boolean Boolean \n",
- "10 Variable None \n",
- "11 DatetimeTimeIndex Numeric \n",
- "12 Numeric Numeric \n",
- "13 Numeric Numeric \n",
- "14 Numeric Numeric \n",
- "15 Boolean Numeric \n",
- "16 Numeric Numeric \n",
+ "0 Numeric Numeric \n",
+ "1 Variable None \n",
+ "2 Variable None \n",
+ "3 DatetimeTimeIndex, Numeric Numeric \n",
+ "4 Discrete Discrete \n",
+ "5 DatetimeTimeIndex Numeric \n",
+ "6 Numeric Numeric \n",
+ "7 Numeric Numeric \n",
+ "8 Index Numeric \n",
+ "9 Boolean Numeric \n",
+ "10 DatetimeTimeIndex Numeric \n",
+ "11 Numeric Numeric \n",
+ "12 Boolean Boolean \n",
+ "13 Discrete None \n",
+ "14 Categorical Numeric \n",
+ "15 Numeric Numeric \n",
+ "16 Boolean Boolean \n",
"17 Numeric Numeric \n",
- "18 Numeric, DatetimeTimeIndex Numeric \n",
- "19 Discrete Discrete \n",
- "20 DatetimeTimeIndex Numeric \n",
- "21 Numeric Numeric "
+ "18 Numeric Numeric \n",
+ "19 DatetimeTimeIndex Numeric \n",
+ "20 Discrete Numeric \n",
+ "21 Boolean Numeric "
]
},
- "execution_count": 20,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -877,7 +950,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 12,
"id": "7762885f",
"metadata": {},
"outputs": [
    [two hunks (@@ -914,52 +987,52 @@ and @@ -974,52 +1047,52 @@) of the text/html rendering of the primitives table omitted: the HTML markup was garbled in extraction; the same rows (22-26 and 79-83) and the same old/new changes appear in the text/plain hunk below]
@@ -1028,49 +1101,49 @@
""
],
"text/plain": [
- " name type dask_compatible koalas_compatible \\\n",
- "22 url_to_domain transform False False \n",
- "23 cum_mean transform False False \n",
- "24 minute transform True True \n",
- "25 cum_max transform False False \n",
- "26 age transform True False \n",
- ".. ... ... ... ... \n",
- "79 greater_than_scalar transform True True \n",
- "80 url_to_protocol transform False False \n",
- "81 month transform True True \n",
- "82 divide_numeric_scalar transform True True \n",
- "83 time_since_previous transform False False \n",
+ " name type dask_compatible koalas_compatible \\\n",
+ "22 greater_than transform True False \n",
+ "23 less_than transform True True \n",
+ "24 and transform True True \n",
+ "25 less_than_scalar transform True True \n",
+ "26 modulo_numeric transform True True \n",
+ ".. ... ... ... ... \n",
+ "79 is_weekend transform True True \n",
+ "80 num_characters transform True True \n",
+ "81 latitude transform False False \n",
+ "82 cum_sum transform False False \n",
+ "83 subtract_numeric_scalar transform True True \n",
"\n",
- " description \\\n",
- "22 Determines the domain of a url. \n",
- "23 Calculates the cumulative mean. \n",
- "24 Determines the minutes value of a datetime. \n",
- "25 Calculates the cumulative maximum. \n",
- "26 Calculates the age in years as a floating point number given a \n",
- ".. ... \n",
- "79 Determines if values are greater than a given scalar. \n",
- "80 Determines the protocol (http or https) of a url. \n",
- "81 Determines the month value of a datetime. \n",
- "82 Divide each element in the list by a scalar. \n",
- "83 Compute the time since the previous entry in a list. \n",
+ " description \\\n",
+ "22 Determines if values in one list are greater than another list. \n",
+ "23 Determines if values in one list are less than another list. \n",
+ "24 Element-wise logical AND of two lists. \n",
+ "25 Determines if values are less than a given scalar. \n",
+ "26 Element-wise modulo of two lists. \n",
+ ".. ... \n",
+ "79 Determines if a date falls on a weekend. \n",
+ "80 Calculates the number of characters in a string. \n",
+ "81 Returns the first tuple value in a list of LatLong tuples. \n",
+ "82 Calculates the cumulative sum. \n",
+ "83 Subtract a scalar from each element in the list. \n",
"\n",
- " valid_inputs return_type \n",
- "22 URL Categorical \n",
- "23 Numeric Numeric \n",
- "24 Datetime Numeric \n",
- "25 Numeric Numeric \n",
- "26 DateOfBirth Numeric \n",
- ".. ... ... \n",
- "79 Numeric, Datetime, Ordinal Boolean \n",
- "80 URL Categorical \n",
- "81 Datetime Ordinal \n",
- "82 Numeric Numeric \n",
- "83 DatetimeTimeIndex Numeric \n",
+ " valid_inputs return_type \n",
+ "22 Ordinal, Datetime, Numeric Boolean \n",
+ "23 Ordinal, Datetime, Numeric Boolean \n",
+ "24 Boolean Boolean \n",
+ "25 Ordinal, Datetime, Numeric Boolean \n",
+ "26 Numeric Numeric \n",
+ ".. ... ... \n",
+ "79 Datetime Boolean \n",
+ "80 NaturalLanguage Numeric \n",
+ "81 LatLong Numeric \n",
+ "82 Numeric Numeric \n",
+ "83 Numeric Numeric \n",
"\n",
"[62 rows x 7 columns]"
]
},
- "execution_count": 21,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -1089,11 +1162,20 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 14,
"id": "6d3df2f7",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Wall time: 1min 3s\n"
+ ]
+ }
+ ],
"source": [
+ "%%time\n",
"features, feature_names = ft.dfs(entityset = es, \n",
" target_entity = 'X', \n",
" max_depth = 2)"
@@ -1109,7 +1191,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 15,
"id": "9a44a98a",
"metadata": {},
"outputs": [
@@ -1650,7 +1732,7 @@
" ]"
]
},
- "execution_count": 27,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -1661,7 +1743,7 @@
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 16,
"id": "d5036e65",
"metadata": {},
"outputs": [
    [hunk @@ -1700,31 +1782,31 @@ (text/html preview of the merged feature columns) omitted: the HTML markup was garbled in extraction; the same value changes (3000.267286 -> 3000.267334, 2926.053180 -> 2926.053223) appear in the text/plain hunk below]
@@ -1735,14 +1817,14 @@
"text/plain": [
" Wilderness_Area_0.MEAN(X.Elevation) Elevation Wilderness_Area_0\n",
"index \n",
- "0 3000.267286 2596.0 1.0\n",
- "561 3000.267286 2596.0 1.0\n",
- "2062 2926.053180 2596.0 0.0\n",
- "6946 2926.053180 2596.0 0.0\n",
- "6976 2926.053180 2596.0 0.0"
+ "0 3000.267334 2596.0 1.0\n",
+ "561 3000.267334 2596.0 1.0\n",
+ "2062 2926.053223 2596.0 0.0\n",
+ "6946 2926.053223 2596.0 0.0\n",
+ "6976 2926.053223 2596.0 0.0"
]
},
- "execution_count": 35,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -1753,7 +1835,7 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 17,
"id": "ec8b7ccd",
"metadata": {},
"outputs": [
@@ -1763,7 +1845,7 @@
"(581012, 532)"
]
},
- "execution_count": 36,
+ "execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -1795,7 +1877,7 @@
"id": "75b7cc64",
"metadata": {},
"source": [
- "为了解决“维数灾难”,有必要应用特征简化和选择,这意味着从数据中去除低值特征。但请记住,特征选择可能会影响ML模型的性能。棘手的是,ML模型的设计包含一个艺术元素。这绝对不是一个具有严格规则的确定性过程,要想取得成功就必须遵循这些规则。为了得到一个精确的模型,有必要应用、组合和比较几十种方法。在本notebook中,我不会解释所有可能的方法来处理“维度灾难”。我将集中讨论以下方法:\n",
+ "为了解决“维数灾难”,有必要应用特征约简和选择,这意味着从数据中去除低值特征。但请记住,特征选择可能会影响ML模型的性能。棘手的是,ML模型的设计包含一个艺术元素。这绝对不是一个具有严格规则的确定性过程,要想取得成功就必须遵循这些规则。为了得到一个精确的模型,有必要应用、组合和比较几十种方法。在本notebook中,我不会解释所有可能的方法来处理“维度灾难”。我将集中讨论以下方法:\n",
"\n",
"* 确定共线特征\n",
"\n",
@@ -4902,11 +4984,12 @@
}
],
"source": [
- "import gc\n",
+ "\"\"\"\n",
"del features_filtered\n",
"del features_positive\n",
"del fetch_covtype\n",
"del df, X,y, X_selected_df,train,test,train_df,test_df,train_X,train_y\n",
+ "\"\"\"\n",
"gc.collect()"
]
},
@@ -4920,7 +5003,7 @@
},
{
"cell_type": "code",
- "execution_count": 65,
+ "execution_count": 8,
"id": "b7241552",
"metadata": {},
"outputs": [
@@ -5044,7 +5127,7 @@
"246788 0.0 0.0 0.0 0.0 "
]
},
- "execution_count": 65,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -5062,24 +5145,16 @@
},
{
"cell_type": "code",
- "execution_count": 68,
+ "execution_count": 9,
"id": "db3d3b92",
"metadata": {},
"outputs": [
{
- "ename": "MemoryError",
- "evalue": "Unable to allocate 8.55 MiB for an array with shape (160080, 1, 7) and data type float64",
- "output_type": "error",
- "traceback": [
- "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[1;31mMemoryError\u001b[0m Traceback (most recent call last)",
- "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n",
- "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\_forest.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 397\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 398\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moob_score\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 399\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_set_oob_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 400\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 401\u001b[0m \u001b[1;31m# Decapsulate classes_ attributes\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\_forest.py\u001b[0m in \u001b[0;36m_set_oob_score\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 528\u001b[0m unsampled_indices = _generate_unsampled_indices(\n\u001b[0;32m 529\u001b[0m estimator.random_state, n_samples, n_samples_bootstrap)\n\u001b[1;32m--> 530\u001b[1;33m p_estimator = estimator.predict_proba(X[unsampled_indices, :],\n\u001b[0m\u001b[0;32m 531\u001b[0m check_input=False)\n\u001b[0;32m 532\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\tree\\_classes.py\u001b[0m in \u001b[0;36mpredict_proba\u001b[1;34m(self, X, check_input)\u001b[0m\n\u001b[0;32m 929\u001b[0m \u001b[0mcheck_is_fitted\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 930\u001b[0m \u001b[0mX\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_X_predict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcheck_input\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 931\u001b[1;33m \u001b[0mproba\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtree_\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 932\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 933\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mn_outputs_\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32msklearn\\tree\\_tree.pyx\u001b[0m in \u001b[0;36msklearn.tree._tree.Tree.predict\u001b[1;34m()\u001b[0m\n",
- "\u001b[1;32msklearn\\tree\\_tree.pyx\u001b[0m in \u001b[0;36msklearn.tree._tree.Tree.predict\u001b[1;34m()\u001b[0m\n",
- "\u001b[1;31mMemoryError\u001b[0m: Unable to allocate 8.55 MiB for an array with shape (160080, 1, 7) and data type float64"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.9673328605949619\n",
+ "Wall time: 14min 30s\n"
]
}
],
@@ -5091,13 +5166,429 @@
"print(accuracy_score(pred_org_test_y,org_test_y)) # RF"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "50b5f988",
+ "metadata": {},
+ "source": [
+ "### 5.2 使用未约简与选择的特征的分数"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": null,
- "id": "52e36341",
+ "execution_count": 18,
+ "id": "0dc54e8c",
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " Elevation | \n",
+ " Horizontal_Distance_To_Hydrology | \n",
+ " Vertical_Distance_To_Hydrology | \n",
+ " Horizontal_Distance_To_Roadways | \n",
+ " Horizontal_Distance_To_Fire_Points | \n",
+ " Aspect | \n",
+ " Slope | \n",
+ " Hillshade_9am | \n",
+ " Hillshade_Noon | \n",
+ " ... | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) | \n",
+ " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.SUM(X.Elevation) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) | \n",
+ " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) | \n",
+ " Cover_Type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2596.0 | \n",
+ " 258.0 | \n",
+ " 0.0 | \n",
+ " 510.0 | \n",
+ " 6279.0 | \n",
+ " 51.0 | \n",
+ " 3.0 | \n",
+ " 221.0 | \n",
+ " 232.0 | \n",
+ " ... | \n",
+ " 1324.050751 | \n",
+ " 212.689925 | \n",
+ " 1558.361956 | \n",
+ " 58.279989 | \n",
+ " 1.715981e+09 | \n",
+ " 1.149499e+09 | \n",
+ " 156171328.0 | \n",
+ " 1.364632e+09 | \n",
+ " 26848308.0 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2590.0 | \n",
+ " 212.0 | \n",
+ " -6.0 | \n",
+ " 390.0 | \n",
+ " 6225.0 | \n",
+ " 56.0 | \n",
+ " 2.0 | \n",
+ " 220.0 | \n",
+ " 235.0 | \n",
+ " ... | \n",
+ " 1324.050751 | \n",
+ " 212.689925 | \n",
+ " 1558.361956 | \n",
+ " 58.279989 | \n",
+ " 1.715981e+09 | \n",
+ " 1.149499e+09 | \n",
+ " 156171328.0 | \n",
+ " 1.364632e+09 | \n",
+ " 26848308.0 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 534 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index Elevation Horizontal_Distance_To_Hydrology \\\n",
+ "0 0 2596.0 258.0 \n",
+ "1 1 2590.0 212.0 \n",
+ "\n",
+ " Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways \\\n",
+ "0 0.0 510.0 \n",
+ "1 -6.0 390.0 \n",
+ "\n",
+ " Horizontal_Distance_To_Fire_Points Aspect Slope Hillshade_9am \\\n",
+ "0 6279.0 51.0 3.0 221.0 \n",
+ "1 6225.0 56.0 2.0 220.0 \n",
+ "\n",
+ " Hillshade_Noon ... Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) \\\n",
+ "0 232.0 ... 1324.050751 \n",
+ "1 235.0 ... 1324.050751 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) \\\n",
+ "0 212.689925 \n",
+ "1 212.689925 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) \\\n",
+ "0 1558.361956 \n",
+ "1 1558.361956 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) \\\n",
+ "0 58.279989 \n",
+ "1 58.279989 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Elevation) \\\n",
+ "0 1.715981e+09 \n",
+ "1 1.715981e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) \\\n",
+ "0 1.149499e+09 \n",
+ "1 1.149499e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) \\\n",
+ "0 156171328.0 \n",
+ "1 156171328.0 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) \\\n",
+ "0 1.364632e+09 \n",
+ "1 1.364632e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) Cover_Type \n",
+ "0 26848308.0 4.0 \n",
+ "1 26848308.0 4.0 \n",
+ "\n",
+ "[2 rows x 534 columns]"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.merge(features, y, on=['index'])\n",
+ "df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "637b3a7e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3256"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "del features, X\n",
+ "gc.collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "4ac537b8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " Elevation | \n",
+ " Horizontal_Distance_To_Hydrology | \n",
+ " Vertical_Distance_To_Hydrology | \n",
+ " Horizontal_Distance_To_Roadways | \n",
+ " Horizontal_Distance_To_Fire_Points | \n",
+ " Aspect | \n",
+ " Slope | \n",
+ " Hillshade_9am | \n",
+ " Hillshade_Noon | \n",
+ " ... | \n",
+ " Soil_Type_4.STD(X.Elevation) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) | \n",
+ " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.SUM(X.Elevation) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) | \n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) | \n",
+ " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 250728 | \n",
+ " 250728 | \n",
+ " 3351.0 | \n",
+ " 726.0 | \n",
+ " 124.0 | \n",
+ " 3813.0 | \n",
+ " 2271.0 | \n",
+ " 206.0 | \n",
+ " 27.0 | \n",
+ " 192.0 | \n",
+ " 252.0 | \n",
+ " ... | \n",
+ " 277.045517 | \n",
+ " 1324.050751 | \n",
+ " 212.689925 | \n",
+ " 1558.361956 | \n",
+ " 58.279989 | \n",
+ " 1.715981e+09 | \n",
+ " 1.149499e+09 | \n",
+ " 156171328.0 | \n",
+ " 1.364632e+09 | \n",
+ " 26848308.0 | \n",
+ "
\n",
+ " \n",
+ " 246788 | \n",
+ " 246788 | \n",
+ " 2732.0 | \n",
+ " 212.0 | \n",
+ " 1.0 | \n",
+ " 1082.0 | \n",
+ " 912.0 | \n",
+ " 129.0 | \n",
+ " 7.0 | \n",
+ " 231.0 | \n",
+ " 236.0 | \n",
+ " ... | \n",
+ " 277.045517 | \n",
+ " 1324.050751 | \n",
+ " 212.689925 | \n",
+ " 1558.361956 | \n",
+ " 58.279989 | \n",
+ " 1.715981e+09 | \n",
+ " 1.149499e+09 | \n",
+ " 156171328.0 | \n",
+ " 1.364632e+09 | \n",
+ " 26848308.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 533 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index Elevation Horizontal_Distance_To_Hydrology \\\n",
+ "250728 250728 3351.0 726.0 \n",
+ "246788 246788 2732.0 212.0 \n",
+ "\n",
+ " Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways \\\n",
+ "250728 124.0 3813.0 \n",
+ "246788 1.0 1082.0 \n",
+ "\n",
+ " Horizontal_Distance_To_Fire_Points Aspect Slope Hillshade_9am \\\n",
+ "250728 2271.0 206.0 27.0 192.0 \n",
+ "246788 912.0 129.0 7.0 231.0 \n",
+ "\n",
+ " Hillshade_Noon ... Soil_Type_4.STD(X.Elevation) \\\n",
+ "250728 252.0 ... 277.045517 \n",
+ "246788 236.0 ... 277.045517 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) \\\n",
+ "250728 1324.050751 \n",
+ "246788 1324.050751 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) \\\n",
+ "250728 212.689925 \n",
+ "246788 212.689925 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) \\\n",
+ "250728 1558.361956 \n",
+ "246788 1558.361956 \n",
+ "\n",
+ " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) \\\n",
+ "250728 58.279989 \n",
+ "246788 58.279989 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Elevation) \\\n",
+ "250728 1.715981e+09 \n",
+ "246788 1.715981e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) \\\n",
+ "250728 1.149499e+09 \n",
+ "246788 1.149499e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) \\\n",
+ "250728 156171328.0 \n",
+ "246788 156171328.0 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) \\\n",
+ "250728 1.364632e+09 \n",
+ "246788 1.364632e+09 \n",
+ "\n",
+ " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) \n",
+ "250728 26848308.0 \n",
+ "246788 26848308.0 \n",
+ "\n",
+ "[2 rows x 533 columns]"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_df, test_df = train_test_split(df,random_state=42)\n",
+ "train_X = train_df.drop('Cover_Type',1)\n",
+ "train_y = train_df['Cover_Type']\n",
+ "\n",
+ "test_X = test_df.drop('Cover_Type',1)\n",
+ "test_y = test_df['Cover_Type']\n",
+ "test_X.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "24c7b22f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "45"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "del df, train_df, test_df\n",
+ "gc.collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "869777ba",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.9442352309418738\n",
+ "Wall time: 30min 31s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "random_forest = RandomForestClassifier(n_estimators=500,oob_score=True)\n",
+ "random_forest.fit(train_X, train_y)\n",
+ "pred_y = random_forest.predict(test_X)\n",
+ "print(accuracy_score(pred_y,test_y)) # RF"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3739a43c",
+ "metadata": {},
+ "source": [
+ "从结果来看,在这个数据集上,不管是增加的特征,还是增加后过滤的特征,效果都比原始特征差。我也咨询了一些朋友他们试了效果都一般,但是kaggle上很多人点赞,如果你们在哪个数据集上试了效果上涨,请联系我。"
+ ]
}
],
"metadata": {