From 6cd891a27804ecca43596e75a7715c8930ce7b65 Mon Sep 17 00:00:00 2001
From: benjas <909336740@qq.com>
Date: Wed, 1 Sep 2021 10:16:05 +0800
Subject: [PATCH] Create Automated feature engineering.ipynb
---
...mated feature engineering-checkpoint.ipynb | 1148 +++++++++++++++++
.../Automated feature engineering.ipynb | 1148 +++++++++++++++++
2 files changed, 2296 insertions(+)
create mode 100644 竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb
create mode 100644 竞赛优胜技巧/Automated feature engineering.ipynb
diff --git a/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb b/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb
new file mode 100644
index 0000000..fea5395
--- /dev/null
+++ b/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb
@@ -0,0 +1,1148 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "33127151",
+ "metadata": {},
+ "source": [
+ "# 自动化特征工程"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "66dfb30d",
+ "metadata": {},
+ "source": [
+ "搬运参考:https://www.kaggle.com/liananapalkova/automated-feature-engineering-for-titanic-dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "91896713",
+ "metadata": {},
+ "source": [
+ "### 1.介绍\n",
+ "如果您曾经为您的ML项目手动创建过数百个特性(我相信您做到了),那么您将乐于了解名为“featuretools”的Python包如何帮助完成这项任务。好消息是这个软件包很容易使用。它的目标是自动化特征工程。当然,人类的专业知识是无法替代的,但是“featuretools”可以自动化大量的日常工作。出于探索目的,这里使用fetch_covtype数据集。\n",
+ "\n",
+ "本笔记本的主要内容包括:\n",
+ "\n",
+ "首先,使用自动特征工程(“featuretools”包),从54个特征总数增加到N个。\n",
+ "\n",
+ "其次,应用特征约简和选择方法,从N个特征中选择X个最相关的特征。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "522eb443",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import sys\n",
+ "print(sys.version) # 版本信息"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "51e62bae",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simpleNote: you may need to restart the kernel to use updated packages.\n",
+ "Collecting featuretools\n",
+ " Downloading https://pypi.tuna.tsinghua.edu.cn/packages/8f/32/b5d02df152aff86f720524540ae516a8e15d7a8c53bd4ee06e2b1ed0c263/featuretools-0.26.2-py3-none-any.whl (327 kB)\n",
+ "Requirement already satisfied: numpy>=1.16.6 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (1.19.5)\n",
+ "Requirement already satisfied: dask[dataframe]>=2.12.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (2021.4.0)\n",
+ "Requirement already satisfied: pyyaml>=5.4 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (5.4.1)\n",
+ "Requirement already satisfied: tqdm>=4.32.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (4.59.0)\n",
+ "Requirement already satisfied: scipy>=1.3.2 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (1.6.2)\n",
+ "Requirement already satisfied: click>=7.0.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (7.1.2)\n",
+ "Requirement already satisfied: pandas<2.0.0,>=1.2.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (1.2.4)\n",
+ "Requirement already satisfied: psutil>=5.6.6 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (5.8.0)\n",
+ "Requirement already satisfied: distributed>=2.12.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (2021.4.0)\n",
+ "Requirement already satisfied: cloudpickle>=0.4.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (1.6.0)\n",
+ "Requirement already satisfied: partd>=0.3.10 in d:\\programdata\\anaconda3\\lib\\site-packages (from dask[dataframe]>=2.12.0->featuretools) (1.2.0)\n",
+ "Requirement already satisfied: fsspec>=0.6.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from dask[dataframe]>=2.12.0->featuretools) (0.9.0)\n",
+ "Requirement already satisfied: toolz>=0.8.2 in d:\\programdata\\anaconda3\\lib\\site-packages (from dask[dataframe]>=2.12.0->featuretools) (0.11.1)\n",
+ "Requirement already satisfied: tblib>=1.6.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (1.7.0)\n",
+ "Requirement already satisfied: zict>=0.1.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (2.0.0)\n",
+ "Requirement already satisfied: sortedcontainers!=2.0.0,!=2.0.1 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (2.3.0)\n",
+ "Requirement already satisfied: tornado>=6.0.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (6.1)\n",
+ "Requirement already satisfied: msgpack>=0.6.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (1.0.2)\n",
+ "Requirement already satisfied: setuptools in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (52.0.0.post20210125)\n",
+ "Requirement already satisfied: python-dateutil>=2.7.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas<2.0.0,>=1.2.0->featuretools) (2.8.1)\n",
+ "Requirement already satisfied: pytz>=2017.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas<2.0.0,>=1.2.0->featuretools) (2021.1)\n",
+ "Requirement already satisfied: locket in d:\\programdata\\anaconda3\\lib\\site-packages\\locket-0.2.1-py3.8.egg (from partd>=0.3.10->dask[dataframe]>=2.12.0->featuretools) (0.2.1)\n",
+ "Requirement already satisfied: six>=1.5 in d:\\programdata\\anaconda3\\lib\\site-packages (from python-dateutil>=2.7.3->pandas<2.0.0,>=1.2.0->featuretools) (1.15.0)\n",
+ "Requirement already satisfied: heapdict in d:\\programdata\\anaconda3\\lib\\site-packages (from zict>=0.1.3->distributed>=2.12.0->featuretools) (1.0.1)\n",
+ "Installing collected packages: featuretools\n",
+ "Successfully installed featuretools-0.26.2\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "pip install featuretools"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "43cc9a46",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import time\n",
+ "import pandas as pd\n",
+ "import featuretools as ft\n",
+ "from featuretools.primitives import *\n",
+ "from featuretools.variable_types import Numeric\n",
+ "from sklearn.svm import LinearSVC\n",
+ "from sklearn.feature_selection import SelectFromModel\n",
+ "# Import the required models; `pip install xxx` for any missing package\n",
+ "\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "import xgboost as xgb \n",
+ "import lightgbm as lgb \n",
+ "\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from sklearn.preprocessing import OrdinalEncoder\n",
+ "from sklearn.metrics import log_loss"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "4c17c0bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.datasets import fetch_covtype\n",
+ "data = fetch_covtype()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "bcce5a3d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "七分类任务,处理前: [1 2 3 4 5 6 7]\n",
+ "[5 5 2 ... 3 3 3]\n",
+ "七分类任务,处理后: [0. 1. 2. 3. 4. 5. 6.]\n",
+ "[4. 4. 1. ... 2. 2. 2.]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Preprocessing\n",
+ "X, y = data['data'], data['target']\n",
+ "# Model labels must start at 0, so shift every class value down by 1\n",
+ "print('七分类任务,处理前:',np.unique(y))\n",
+ "print(y)\n",
+ "encoder = OrdinalEncoder()  # renamed: `ord` shadows the Python built-in ord()\n",
+ "y = encoder.fit_transform(y.reshape(-1, 1))\n",
+ "y = y.reshape(-1, )\n",
+ "print('七分类任务,处理后:',np.unique(y))\n",
+ "print(y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "4afeeca5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " Elevation | \n",
+ " Aspect | \n",
+ " Slope | \n",
+ " Horizontal_Distance_To_Hydrology | \n",
+ " Vertical_Distance_To_Hydrology | \n",
+ " Horizontal_Distance_To_Roadways | \n",
+ " Hillshade_9am | \n",
+ " Hillshade_Noon | \n",
+ " Hillshade_3pm | \n",
+ " ... | \n",
+ " Soil_Type_30 | \n",
+ " Soil_Type_31 | \n",
+ " Soil_Type_32 | \n",
+ " Soil_Type_33 | \n",
+ " Soil_Type_34 | \n",
+ " Soil_Type_35 | \n",
+ " Soil_Type_36 | \n",
+ " Soil_Type_37 | \n",
+ " Soil_Type_38 | \n",
+ " Soil_Type_39 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2596.0 | \n",
+ " 51.0 | \n",
+ " 3.0 | \n",
+ " 258.0 | \n",
+ " 0.0 | \n",
+ " 510.0 | \n",
+ " 221.0 | \n",
+ " 232.0 | \n",
+ " 148.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2590.0 | \n",
+ " 56.0 | \n",
+ " 2.0 | \n",
+ " 212.0 | \n",
+ " -6.0 | \n",
+ " 390.0 | \n",
+ " 220.0 | \n",
+ " 235.0 | \n",
+ " 151.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 55 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index Elevation Aspect Slope Horizontal_Distance_To_Hydrology \\\n",
+ "0 0 2596.0 51.0 3.0 258.0 \n",
+ "1 1 2590.0 56.0 2.0 212.0 \n",
+ "\n",
+ " Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways \\\n",
+ "0 0.0 510.0 \n",
+ "1 -6.0 390.0 \n",
+ "\n",
+ " Hillshade_9am Hillshade_Noon Hillshade_3pm ... Soil_Type_30 \\\n",
+ "0 221.0 232.0 148.0 ... 0.0 \n",
+ "1 220.0 235.0 151.0 ... 0.0 \n",
+ "\n",
+ " Soil_Type_31 Soil_Type_32 Soil_Type_33 Soil_Type_34 Soil_Type_35 \\\n",
+ "0 0.0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Soil_Type_36 Soil_Type_37 Soil_Type_38 Soil_Type_39 \n",
+ "0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ "[2 rows x 55 columns]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X = pd.DataFrame(X,columns=data.feature_names)\n",
+ "X = X.reset_index()\n",
+ "X.head(2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f68429bf",
+ "metadata": {},
+ "source": [
+ "### 2.执行自动化特征工程\n",
+ "需要先确认是否有NaN值"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "06f24545",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ft.EntitySet.entity_from_dataframe?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "f2c69a94",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Entityset: fetch_covtype_data\n",
+ " Entities:\n",
+ " X [Rows: 581012, Columns: 55]\n",
+ " Relationships:\n",
+ " No relationships"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "es = ft.EntitySet(id = 'fetch_covtype_data')\n",
+ "es = es.entity_from_dataframe(entity_id = 'X', dataframe = X, \n",
+ " variable_types = \n",
+ " {\n",
+ " 'Aspect': ft.variable_types.Categorical,\n",
+ " 'Slope': ft.variable_types.Categorical,\n",
+ " 'Hillshade_9am': ft.variable_types.Categorical,\n",
+ " 'Hillshade_Noon': ft.variable_types.Categorical,\n",
+ " 'Hillshade_3pm': ft.variable_types.Categorical,\n",
+ " 'Wilderness_Area_0': ft.variable_types.Boolean,\n",
+ " 'Wilderness_Area_1': ft.variable_types.Boolean,\n",
+ " 'Wilderness_Area_2': ft.variable_types.Boolean,\n",
+ " 'Wilderness_Area_3': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_0': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_1': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_2': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_3': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_4': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_5': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_6': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_7': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_8': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_9': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_10': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_11': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_12': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_13': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_14': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_15': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_16': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_17': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_18': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_19': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_20': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_21': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_22': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_23': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_24': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_25': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_26': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_27': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_28': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_29': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_30': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_31': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_32': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_33': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_34': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_35': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_36': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_37': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_38': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_39': ft.variable_types.Boolean\n",
+ " },\n",
+ " index = 'index')\n",
+ "\n",
+ "es"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "770130bc",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Entityset: fetch_covtype_data\n",
+ " Entities:\n",
+ " X [Rows: 581012, Columns: 55]\n",
+ " Wilderness_Area_0 [Rows: 2, Columns: 1]\n",
+ " Wilderness_Area_1 [Rows: 2, Columns: 1]\n",
+ " Wilderness_Area_2 [Rows: 2, Columns: 1]\n",
+ " Wilderness_Area_3 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_0 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_1 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_2 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_3 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_4 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_5 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_6 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_7 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_8 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_9 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_10 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_11 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_12 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_13 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_14 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_15 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_16 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_17 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_18 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_19 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_20 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_21 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_22 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_23 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_24 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_25 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_26 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_27 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_28 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_29 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_30 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_31 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_32 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_33 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_34 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_35 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_36 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_37 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_38 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_39 [Rows: 2, Columns: 1]\n",
+ " Relationships:\n",
+ " X.Wilderness_Area_0 -> Wilderness_Area_0.Wilderness_Area_0\n",
+ " X.Wilderness_Area_1 -> Wilderness_Area_1.Wilderness_Area_1\n",
+ " X.Wilderness_Area_2 -> Wilderness_Area_2.Wilderness_Area_2\n",
+ " X.Wilderness_Area_3 -> Wilderness_Area_3.Wilderness_Area_3\n",
+ " X.Soil_Type_0 -> Soil_Type_0.Soil_Type_0\n",
+ " X.Soil_Type_1 -> Soil_Type_1.Soil_Type_1\n",
+ " X.Soil_Type_2 -> Soil_Type_2.Soil_Type_2\n",
+ " X.Soil_Type_3 -> Soil_Type_3.Soil_Type_3\n",
+ " X.Soil_Type_4 -> Soil_Type_4.Soil_Type_4\n",
+ " X.Soil_Type_5 -> Soil_Type_5.Soil_Type_5\n",
+ " X.Soil_Type_6 -> Soil_Type_6.Soil_Type_6\n",
+ " X.Soil_Type_7 -> Soil_Type_7.Soil_Type_7\n",
+ " X.Soil_Type_8 -> Soil_Type_8.Soil_Type_8\n",
+ " X.Soil_Type_9 -> Soil_Type_9.Soil_Type_9\n",
+ " X.Soil_Type_10 -> Soil_Type_10.Soil_Type_10\n",
+ " X.Soil_Type_11 -> Soil_Type_11.Soil_Type_11\n",
+ " X.Soil_Type_12 -> Soil_Type_12.Soil_Type_12\n",
+ " X.Soil_Type_13 -> Soil_Type_13.Soil_Type_13\n",
+ " X.Soil_Type_14 -> Soil_Type_14.Soil_Type_14\n",
+ " X.Soil_Type_15 -> Soil_Type_15.Soil_Type_15\n",
+ " X.Soil_Type_16 -> Soil_Type_16.Soil_Type_16\n",
+ " X.Soil_Type_17 -> Soil_Type_17.Soil_Type_17\n",
+ " X.Soil_Type_18 -> Soil_Type_18.Soil_Type_18\n",
+ " X.Soil_Type_19 -> Soil_Type_19.Soil_Type_19\n",
+ " X.Soil_Type_20 -> Soil_Type_20.Soil_Type_20\n",
+ " X.Soil_Type_21 -> Soil_Type_21.Soil_Type_21\n",
+ " X.Soil_Type_22 -> Soil_Type_22.Soil_Type_22\n",
+ " X.Soil_Type_23 -> Soil_Type_23.Soil_Type_23\n",
+ " X.Soil_Type_24 -> Soil_Type_24.Soil_Type_24\n",
+ " X.Soil_Type_25 -> Soil_Type_25.Soil_Type_25\n",
+ " X.Soil_Type_26 -> Soil_Type_26.Soil_Type_26\n",
+ " X.Soil_Type_27 -> Soil_Type_27.Soil_Type_27\n",
+ " X.Soil_Type_28 -> Soil_Type_28.Soil_Type_28\n",
+ " X.Soil_Type_29 -> Soil_Type_29.Soil_Type_29\n",
+ " X.Soil_Type_30 -> Soil_Type_30.Soil_Type_30\n",
+ " X.Soil_Type_31 -> Soil_Type_31.Soil_Type_31\n",
+ " X.Soil_Type_32 -> Soil_Type_32.Soil_Type_32\n",
+ " X.Soil_Type_33 -> Soil_Type_33.Soil_Type_33\n",
+ " X.Soil_Type_34 -> Soil_Type_34.Soil_Type_34\n",
+ " X.Soil_Type_35 -> Soil_Type_35.Soil_Type_35\n",
+ " X.Soil_Type_36 -> Soil_Type_36.Soil_Type_36\n",
+ " X.Soil_Type_37 -> Soil_Type_37.Soil_Type_37\n",
+ " X.Soil_Type_38 -> Soil_Type_38.Soil_Type_38\n",
+ " X.Soil_Type_39 -> Soil_Type_39.Soil_Type_39"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Wilderness_Area_0', index='Wilderness_Area_0')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Wilderness_Area_1', index='Wilderness_Area_1')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Wilderness_Area_2', index='Wilderness_Area_2')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Wilderness_Area_3', index='Wilderness_Area_3')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_0', index='Soil_Type_0')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_1', index='Soil_Type_1')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_2', index='Soil_Type_2')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_3', index='Soil_Type_3')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_4', index='Soil_Type_4')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_5', index='Soil_Type_5')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_6', index='Soil_Type_6')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_7', index='Soil_Type_7')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_8', index='Soil_Type_8')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_9', index='Soil_Type_9')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_10', index='Soil_Type_10')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_11', index='Soil_Type_11')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_12', index='Soil_Type_12')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_13', index='Soil_Type_13')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_14', index='Soil_Type_14')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_15', index='Soil_Type_15')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_16', index='Soil_Type_16')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_17', index='Soil_Type_17')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_18', index='Soil_Type_18')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_19', index='Soil_Type_19')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_20', index='Soil_Type_20')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_21', index='Soil_Type_21')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_22', index='Soil_Type_22')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_23', index='Soil_Type_23')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_24', index='Soil_Type_24')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_25', index='Soil_Type_25')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_26', index='Soil_Type_26')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_27', index='Soil_Type_27')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_28', index='Soil_Type_28')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_29', index='Soil_Type_29')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_30', index='Soil_Type_30')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_31', index='Soil_Type_31')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_32', index='Soil_Type_32')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_33', index='Soil_Type_33')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_34', index='Soil_Type_34')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_35', index='Soil_Type_35')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_36', index='Soil_Type_36')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_37', index='Soil_Type_37')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_38', index='Soil_Type_38')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_39', index='Soil_Type_39')\n",
+ "es"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "352fa085",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ " type | \n",
+ " dask_compatible | \n",
+ " koalas_compatible | \n",
+ " description | \n",
+ " valid_inputs | \n",
+ " return_type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " all | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Calculates if all values are 'True' in a list. | \n",
+ " Boolean | \n",
+ " Boolean | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " skew | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Computes the extent to which a distribution differs from a normal distribution. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " percent_true | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Determines the percent of `True` values. | \n",
+ " Boolean | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " count | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the total number of values, excluding `NaN`. | \n",
+ " Index | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " num_unique | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the number of distinct values, ignoring `NaN` values. | \n",
+ " Discrete | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " first | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the first value in a list. | \n",
+ " Variable | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " mode | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the most commonly repeated value. | \n",
+ " Discrete | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " entropy | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the entropy for a categorical variable | \n",
+ " Categorical | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " time_since_last | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the time elapsed since the last datetime (default in seconds). | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " any | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Determines if any value is 'True' in a list. | \n",
+ " Boolean | \n",
+ " Boolean | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " last | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the last value in a list. | \n",
+ " Variable | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " avg_time_between | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Computes the average number of seconds between consecutive events. | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " max | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Calculates the highest value, ignoring `NaN` values. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " median | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the middlemost number in a list of values. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " mean | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Computes the average for a list of values. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " num_true | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Counts the number of `True` values. | \n",
+ " Boolean | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " min | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Calculates the smallest value, ignoring `NaN` values. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " sum | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Calculates the total addition, ignoring `NaN`. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " trend | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the trend of a variable over time. | \n",
+ " Numeric, DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " n_most_common | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the `n` most common elements. | \n",
+ " Discrete | \n",
+ " Discrete | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " time_since_first | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the time elapsed since the first datetime (in seconds). | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " std | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Computes the dispersion relative to the mean value, ignoring `NaN`. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name type dask_compatible koalas_compatible \\\n",
+ "0 all aggregation True False \n",
+ "1 skew aggregation False False \n",
+ "2 percent_true aggregation True False \n",
+ "3 count aggregation True True \n",
+ "4 num_unique aggregation True True \n",
+ "5 first aggregation False False \n",
+ "6 mode aggregation False False \n",
+ "7 entropy aggregation False False \n",
+ "8 time_since_last aggregation False False \n",
+ "9 any aggregation True False \n",
+ "10 last aggregation False False \n",
+ "11 avg_time_between aggregation False False \n",
+ "12 max aggregation True True \n",
+ "13 median aggregation False False \n",
+ "14 mean aggregation True True \n",
+ "15 num_true aggregation True False \n",
+ "16 min aggregation True True \n",
+ "17 sum aggregation True True \n",
+ "18 trend aggregation False False \n",
+ "19 n_most_common aggregation False False \n",
+ "20 time_since_first aggregation False False \n",
+ "21 std aggregation True True \n",
+ "\n",
+ " description \\\n",
+ "0 Calculates if all values are 'True' in a list. \n",
+ "1 Computes the extent to which a distribution differs from a normal distribution. \n",
+ "2 Determines the percent of `True` values. \n",
+ "3 Determines the total number of values, excluding `NaN`. \n",
+ "4 Determines the number of distinct values, ignoring `NaN` values. \n",
+ "5 Determines the first value in a list. \n",
+ "6 Determines the most commonly repeated value. \n",
+ "7 Calculates the entropy for a categorical variable \n",
+ "8 Calculates the time elapsed since the last datetime (default in seconds). \n",
+ "9 Determines if any value is 'True' in a list. \n",
+ "10 Determines the last value in a list. \n",
+ "11 Computes the average number of seconds between consecutive events. \n",
+ "12 Calculates the highest value, ignoring `NaN` values. \n",
+ "13 Determines the middlemost number in a list of values. \n",
+ "14 Computes the average for a list of values. \n",
+ "15 Counts the number of `True` values. \n",
+ "16 Calculates the smallest value, ignoring `NaN` values. \n",
+ "17 Calculates the total addition, ignoring `NaN`. \n",
+ "18 Calculates the trend of a variable over time. \n",
+ "19 Determines the `n` most common elements. \n",
+ "20 Calculates the time elapsed since the first datetime (in seconds). \n",
+ "21 Computes the dispersion relative to the mean value, ignoring `NaN`. \n",
+ "\n",
+ " valid_inputs return_type \n",
+ "0 Boolean Boolean \n",
+ "1 Numeric Numeric \n",
+ "2 Boolean Numeric \n",
+ "3 Index Numeric \n",
+ "4 Discrete Numeric \n",
+ "5 Variable None \n",
+ "6 Discrete None \n",
+ "7 Categorical Numeric \n",
+ "8 DatetimeTimeIndex Numeric \n",
+ "9 Boolean Boolean \n",
+ "10 Variable None \n",
+ "11 DatetimeTimeIndex Numeric \n",
+ "12 Numeric Numeric \n",
+ "13 Numeric Numeric \n",
+ "14 Numeric Numeric \n",
+ "15 Boolean Numeric \n",
+ "16 Numeric Numeric \n",
+ "17 Numeric Numeric \n",
+ "18 Numeric, DatetimeTimeIndex Numeric \n",
+ "19 Discrete Discrete \n",
+ "20 DatetimeTimeIndex Numeric \n",
+ "21 Numeric Numeric "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "primitives = ft.list_primitives()\n",
+ "pd.options.display.max_colwidth = 100\n",
+ "primitives[primitives['type'] == 'aggregation'].head(primitives[primitives['type'] == 'aggregation'].shape[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "7762885f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ " type | \n",
+ " dask_compatible | \n",
+ " koalas_compatible | \n",
+ " description | \n",
+ " valid_inputs | \n",
+ " return_type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 22 | \n",
+ " url_to_domain | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the domain of a url. | \n",
+ " URL | \n",
+ " Categorical | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " cum_mean | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the cumulative mean. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " minute | \n",
+ " transform | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the minutes value of a datetime. | \n",
+ " Datetime | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " cum_max | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the cumulative maximum. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " age | \n",
+ " transform | \n",
+ " True | \n",
+ " False | \n",
+ " Calculates the age in years as a floating point number given a | \n",
+ " DateOfBirth | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 79 | \n",
+ " greater_than_scalar | \n",
+ " transform | \n",
+ " True | \n",
+ " True | \n",
+ " Determines if values are greater than a given scalar. | \n",
+ " Numeric, Datetime, Ordinal | \n",
+ " Boolean | \n",
+ "
\n",
+ " \n",
+ " 80 | \n",
+ " url_to_protocol | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the protocol (http or https) of a url. | \n",
+ " URL | \n",
+ " Categorical | \n",
+ "
\n",
+ " \n",
+ " 81 | \n",
+ " month | \n",
+ " transform | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the month value of a datetime. | \n",
+ " Datetime | \n",
+ " Ordinal | \n",
+ "
\n",
+ " \n",
+ " 82 | \n",
+ " divide_numeric_scalar | \n",
+ " transform | \n",
+ " True | \n",
+ " True | \n",
+ " Divide each element in the list by a scalar. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 83 | \n",
+ " time_since_previous | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Compute the time since the previous entry in a list. | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
62 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name type dask_compatible koalas_compatible \\\n",
+ "22 url_to_domain transform False False \n",
+ "23 cum_mean transform False False \n",
+ "24 minute transform True True \n",
+ "25 cum_max transform False False \n",
+ "26 age transform True False \n",
+ ".. ... ... ... ... \n",
+ "79 greater_than_scalar transform True True \n",
+ "80 url_to_protocol transform False False \n",
+ "81 month transform True True \n",
+ "82 divide_numeric_scalar transform True True \n",
+ "83 time_since_previous transform False False \n",
+ "\n",
+ " description \\\n",
+ "22 Determines the domain of a url. \n",
+ "23 Calculates the cumulative mean. \n",
+ "24 Determines the minutes value of a datetime. \n",
+ "25 Calculates the cumulative maximum. \n",
+ "26 Calculates the age in years as a floating point number given a \n",
+ ".. ... \n",
+ "79 Determines if values are greater than a given scalar. \n",
+ "80 Determines the protocol (http or https) of a url. \n",
+ "81 Determines the month value of a datetime. \n",
+ "82 Divide each element in the list by a scalar. \n",
+ "83 Compute the time since the previous entry in a list. \n",
+ "\n",
+ " valid_inputs return_type \n",
+ "22 URL Categorical \n",
+ "23 Numeric Numeric \n",
+ "24 Datetime Numeric \n",
+ "25 Numeric Numeric \n",
+ "26 DateOfBirth Numeric \n",
+ ".. ... ... \n",
+ "79 Numeric, Datetime, Ordinal Boolean \n",
+ "80 URL Categorical \n",
+ "81 Datetime Ordinal \n",
+ "82 Numeric Numeric \n",
+ "83 DatetimeTimeIndex Numeric \n",
+ "\n",
+ "[62 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "primitives[primitives['type'] == 'transform'].head(primitives[primitives['type'] == 'transform'].shape[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2a568eb4",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/竞赛优胜技巧/Automated feature engineering.ipynb b/竞赛优胜技巧/Automated feature engineering.ipynb
new file mode 100644
index 0000000..fea5395
--- /dev/null
+++ b/竞赛优胜技巧/Automated feature engineering.ipynb
@@ -0,0 +1,1148 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "33127151",
+ "metadata": {},
+ "source": [
+ "# 自动化特征工程"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "66dfb30d",
+ "metadata": {},
+ "source": [
+ "搬运参考:https://www.kaggle.com/liananapalkova/automated-feature-engineering-for-titanic-dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "91896713",
+ "metadata": {},
+ "source": [
+ "### 1.介绍\n",
+ "如果您曾经为您的ML项目手动创建过数百个特性(我相信您做到了),那么您将乐于了解名为“featuretools”的Python包如何帮助完成这项任务。好消息是这个软件包很容易使用。它的目标是自动化特征工程。当然,人类的专业知识是无法替代的,但是“featuretools”可以自动化大量的日常工作。出于探索目的,这里使用fetch_covtype数据集。\n",
+ "\n",
+ "本笔记本的主要内容包括:\n",
+ "\n",
+ "首先,使用自动特征工程(“featuretools”包),从54个特征总数增加到N个。\n",
+ "\n",
+ "其次,应用特征约简和选择方法,从N个特征中选择X个最相关的特征。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "522eb443",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import sys\n",
+ "print(sys.version) # 版本信息"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "51e62bae",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simpleNote: you may need to restart the kernel to use updated packages.\n",
+ "Collecting featuretools\n",
+ " Downloading https://pypi.tuna.tsinghua.edu.cn/packages/8f/32/b5d02df152aff86f720524540ae516a8e15d7a8c53bd4ee06e2b1ed0c263/featuretools-0.26.2-py3-none-any.whl (327 kB)\n",
+ "Requirement already satisfied: numpy>=1.16.6 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (1.19.5)\n",
+ "Requirement already satisfied: dask[dataframe]>=2.12.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (2021.4.0)\n",
+ "Requirement already satisfied: pyyaml>=5.4 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (5.4.1)\n",
+ "Requirement already satisfied: tqdm>=4.32.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (4.59.0)\n",
+ "Requirement already satisfied: scipy>=1.3.2 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (1.6.2)\n",
+ "Requirement already satisfied: click>=7.0.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (7.1.2)\n",
+ "Requirement already satisfied: pandas<2.0.0,>=1.2.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (1.2.4)\n",
+ "Requirement already satisfied: psutil>=5.6.6 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (5.8.0)\n",
+ "Requirement already satisfied: distributed>=2.12.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (2021.4.0)\n",
+ "Requirement already satisfied: cloudpickle>=0.4.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (1.6.0)\n",
+ "Requirement already satisfied: partd>=0.3.10 in d:\\programdata\\anaconda3\\lib\\site-packages (from dask[dataframe]>=2.12.0->featuretools) (1.2.0)\n",
+ "Requirement already satisfied: fsspec>=0.6.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from dask[dataframe]>=2.12.0->featuretools) (0.9.0)\n",
+ "Requirement already satisfied: toolz>=0.8.2 in d:\\programdata\\anaconda3\\lib\\site-packages (from dask[dataframe]>=2.12.0->featuretools) (0.11.1)\n",
+ "Requirement already satisfied: tblib>=1.6.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (1.7.0)\n",
+ "Requirement already satisfied: zict>=0.1.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (2.0.0)\n",
+ "Requirement already satisfied: sortedcontainers!=2.0.0,!=2.0.1 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (2.3.0)\n",
+ "Requirement already satisfied: tornado>=6.0.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (6.1)\n",
+ "Requirement already satisfied: msgpack>=0.6.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (1.0.2)\n",
+ "Requirement already satisfied: setuptools in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (52.0.0.post20210125)\n",
+ "Requirement already satisfied: python-dateutil>=2.7.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas<2.0.0,>=1.2.0->featuretools) (2.8.1)\n",
+ "Requirement already satisfied: pytz>=2017.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas<2.0.0,>=1.2.0->featuretools) (2021.1)\n",
+ "Requirement already satisfied: locket in d:\\programdata\\anaconda3\\lib\\site-packages\\locket-0.2.1-py3.8.egg (from partd>=0.3.10->dask[dataframe]>=2.12.0->featuretools) (0.2.1)\n",
+ "Requirement already satisfied: six>=1.5 in d:\\programdata\\anaconda3\\lib\\site-packages (from python-dateutil>=2.7.3->pandas<2.0.0,>=1.2.0->featuretools) (1.15.0)\n",
+ "Requirement already satisfied: heapdict in d:\\programdata\\anaconda3\\lib\\site-packages (from zict>=0.1.3->distributed>=2.12.0->featuretools) (1.0.1)\n",
+ "Installing collected packages: featuretools\n",
+ "Successfully installed featuretools-0.26.2\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "pip install featuretools"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "43cc9a46",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import time\n",
+ "\n",
+ "import featuretools as ft\n",
+ "from featuretools.primitives import *\n",
+ "from featuretools.variable_types import Numeric\n",
+ "from sklearn.svm import LinearSVC\n",
+ "from sklearn.feature_selection import SelectFromModel\n",
+ "# 导入相关模型,没有的pip install xxx 即可\n",
+ "\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "import xgboost as xgb \n",
+ "import lightgbm as lgb \n",
+ "\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from sklearn.preprocessing import OrdinalEncoder\n",
+ "from sklearn.metrics import log_loss"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "4c17c0bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.datasets import fetch_covtype\n",
+ "data = fetch_covtype()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "bcce5a3d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "七分类任务,处理前: [1 2 3 4 5 6 7]\n",
+ "[5 5 2 ... 3 3 3]\n",
+ "七分类任务,处理后: [0. 1. 2. 3. 4. 5. 6.]\n",
+ "[4. 4. 1. ... 2. 2. 2.]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 预处理\n",
+ "X, y = data['data'], data['target']\n",
+ "# 由于模型标签需要从0开始,所以数字需要全部减1\n",
+ "print('七分类任务,处理前:',np.unique(y))\n",
+ "print(y)\n",
+ "ord = OrdinalEncoder()\n",
+ "y = ord.fit_transform(y.reshape(-1, 1))\n",
+ "y = y.reshape(-1, )\n",
+ "print('七分类任务,处理后:',np.unique(y))\n",
+ "print(y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "4afeeca5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " Elevation | \n",
+ " Aspect | \n",
+ " Slope | \n",
+ " Horizontal_Distance_To_Hydrology | \n",
+ " Vertical_Distance_To_Hydrology | \n",
+ " Horizontal_Distance_To_Roadways | \n",
+ " Hillshade_9am | \n",
+ " Hillshade_Noon | \n",
+ " Hillshade_3pm | \n",
+ " ... | \n",
+ " Soil_Type_30 | \n",
+ " Soil_Type_31 | \n",
+ " Soil_Type_32 | \n",
+ " Soil_Type_33 | \n",
+ " Soil_Type_34 | \n",
+ " Soil_Type_35 | \n",
+ " Soil_Type_36 | \n",
+ " Soil_Type_37 | \n",
+ " Soil_Type_38 | \n",
+ " Soil_Type_39 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2596.0 | \n",
+ " 51.0 | \n",
+ " 3.0 | \n",
+ " 258.0 | \n",
+ " 0.0 | \n",
+ " 510.0 | \n",
+ " 221.0 | \n",
+ " 232.0 | \n",
+ " 148.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2590.0 | \n",
+ " 56.0 | \n",
+ " 2.0 | \n",
+ " 212.0 | \n",
+ " -6.0 | \n",
+ " 390.0 | \n",
+ " 220.0 | \n",
+ " 235.0 | \n",
+ " 151.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 55 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index Elevation Aspect Slope Horizontal_Distance_To_Hydrology \\\n",
+ "0 0 2596.0 51.0 3.0 258.0 \n",
+ "1 1 2590.0 56.0 2.0 212.0 \n",
+ "\n",
+ " Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways \\\n",
+ "0 0.0 510.0 \n",
+ "1 -6.0 390.0 \n",
+ "\n",
+ " Hillshade_9am Hillshade_Noon Hillshade_3pm ... Soil_Type_30 \\\n",
+ "0 221.0 232.0 148.0 ... 0.0 \n",
+ "1 220.0 235.0 151.0 ... 0.0 \n",
+ "\n",
+ " Soil_Type_31 Soil_Type_32 Soil_Type_33 Soil_Type_34 Soil_Type_35 \\\n",
+ "0 0.0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Soil_Type_36 Soil_Type_37 Soil_Type_38 Soil_Type_39 \n",
+ "0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ "[2 rows x 55 columns]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X = pd.DataFrame(X,columns=data.feature_names)\n",
+ "X = X.reset_index()\n",
+ "X.head(2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f68429bf",
+ "metadata": {},
+ "source": [
+ "### 2.执行自动化特征工程\n",
+ "需要先确认是否有NaN值"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "06f24545",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "es.entity_from_dataframe?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "f2c69a94",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Entityset: fetch_covtype_data\n",
+ " Entities:\n",
+ " X [Rows: 581012, Columns: 55]\n",
+ " Relationships:\n",
+ " No relationships"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "es = ft.EntitySet(id = 'fetch_covtype_data')\n",
+ "es = es.entity_from_dataframe(entity_id = 'X', dataframe = X, \n",
+ " variable_types = \n",
+ " {\n",
+ " 'Aspect': ft.variable_types.Categorical,\n",
+ " 'Slope': ft.variable_types.Categorical,\n",
+ " 'Hillshade_9am': ft.variable_types.Categorical,\n",
+ " 'Hillshade_Noon': ft.variable_types.Categorical,\n",
+ " 'Hillshade_3pm': ft.variable_types.Categorical,\n",
+ " 'Wilderness_Area_0': ft.variable_types.Boolean,\n",
+ " 'Wilderness_Area_1': ft.variable_types.Boolean,\n",
+ " 'Wilderness_Area_2': ft.variable_types.Boolean,\n",
+ " 'Wilderness_Area_3': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_0': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_1': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_2': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_3': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_4': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_5': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_6': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_7': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_8': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_9': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_10': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_11': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_12': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_13': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_14': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_15': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_16': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_17': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_18': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_19': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_20': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_21': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_22': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_23': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_24': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_25': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_26': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_27': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_28': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_29': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_30': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_31': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_32': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_33': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_34': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_35': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_36': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_37': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_38': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_39': ft.variable_types.Boolean\n",
+ " },\n",
+ " index = 'index')\n",
+ "\n",
+ "es"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "770130bc",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Entityset: fetch_covtype_data\n",
+ " Entities:\n",
+ " X [Rows: 581012, Columns: 55]\n",
+ " Wilderness_Area_0 [Rows: 2, Columns: 1]\n",
+ " Wilderness_Area_1 [Rows: 2, Columns: 1]\n",
+ " Wilderness_Area_2 [Rows: 2, Columns: 1]\n",
+ " Wilderness_Area_3 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_0 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_1 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_2 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_3 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_4 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_5 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_6 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_7 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_8 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_9 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_10 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_11 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_12 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_13 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_14 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_15 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_16 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_17 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_18 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_19 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_20 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_21 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_22 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_23 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_24 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_25 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_26 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_27 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_28 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_29 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_30 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_31 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_32 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_33 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_34 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_35 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_36 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_37 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_38 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_39 [Rows: 2, Columns: 1]\n",
+ " Relationships:\n",
+ " X.Wilderness_Area_0 -> Wilderness_Area_0.Wilderness_Area_0\n",
+ " X.Wilderness_Area_1 -> Wilderness_Area_1.Wilderness_Area_1\n",
+ " X.Wilderness_Area_2 -> Wilderness_Area_2.Wilderness_Area_2\n",
+ " X.Wilderness_Area_3 -> Wilderness_Area_3.Wilderness_Area_3\n",
+ " X.Soil_Type_0 -> Soil_Type_0.Soil_Type_0\n",
+ " X.Soil_Type_1 -> Soil_Type_1.Soil_Type_1\n",
+ " X.Soil_Type_2 -> Soil_Type_2.Soil_Type_2\n",
+ " X.Soil_Type_3 -> Soil_Type_3.Soil_Type_3\n",
+ " X.Soil_Type_4 -> Soil_Type_4.Soil_Type_4\n",
+ " X.Soil_Type_5 -> Soil_Type_5.Soil_Type_5\n",
+ " X.Soil_Type_6 -> Soil_Type_6.Soil_Type_6\n",
+ " X.Soil_Type_7 -> Soil_Type_7.Soil_Type_7\n",
+ " X.Soil_Type_8 -> Soil_Type_8.Soil_Type_8\n",
+ " X.Soil_Type_9 -> Soil_Type_9.Soil_Type_9\n",
+ " X.Soil_Type_10 -> Soil_Type_10.Soil_Type_10\n",
+ " X.Soil_Type_11 -> Soil_Type_11.Soil_Type_11\n",
+ " X.Soil_Type_12 -> Soil_Type_12.Soil_Type_12\n",
+ " X.Soil_Type_13 -> Soil_Type_13.Soil_Type_13\n",
+ " X.Soil_Type_14 -> Soil_Type_14.Soil_Type_14\n",
+ " X.Soil_Type_15 -> Soil_Type_15.Soil_Type_15\n",
+ " X.Soil_Type_16 -> Soil_Type_16.Soil_Type_16\n",
+ " X.Soil_Type_17 -> Soil_Type_17.Soil_Type_17\n",
+ " X.Soil_Type_18 -> Soil_Type_18.Soil_Type_18\n",
+ " X.Soil_Type_19 -> Soil_Type_19.Soil_Type_19\n",
+ " X.Soil_Type_20 -> Soil_Type_20.Soil_Type_20\n",
+ " X.Soil_Type_21 -> Soil_Type_21.Soil_Type_21\n",
+ " X.Soil_Type_22 -> Soil_Type_22.Soil_Type_22\n",
+ " X.Soil_Type_23 -> Soil_Type_23.Soil_Type_23\n",
+ " X.Soil_Type_24 -> Soil_Type_24.Soil_Type_24\n",
+ " X.Soil_Type_25 -> Soil_Type_25.Soil_Type_25\n",
+ " X.Soil_Type_26 -> Soil_Type_26.Soil_Type_26\n",
+ " X.Soil_Type_27 -> Soil_Type_27.Soil_Type_27\n",
+ " X.Soil_Type_28 -> Soil_Type_28.Soil_Type_28\n",
+ " X.Soil_Type_29 -> Soil_Type_29.Soil_Type_29\n",
+ " X.Soil_Type_30 -> Soil_Type_30.Soil_Type_30\n",
+ " X.Soil_Type_31 -> Soil_Type_31.Soil_Type_31\n",
+ " X.Soil_Type_32 -> Soil_Type_32.Soil_Type_32\n",
+ " X.Soil_Type_33 -> Soil_Type_33.Soil_Type_33\n",
+ " X.Soil_Type_34 -> Soil_Type_34.Soil_Type_34\n",
+ " X.Soil_Type_35 -> Soil_Type_35.Soil_Type_35\n",
+ " X.Soil_Type_36 -> Soil_Type_36.Soil_Type_36\n",
+ " X.Soil_Type_37 -> Soil_Type_37.Soil_Type_37\n",
+ " X.Soil_Type_38 -> Soil_Type_38.Soil_Type_38\n",
+ " X.Soil_Type_39 -> Soil_Type_39.Soil_Type_39"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Wilderness_Area_0', index='Wilderness_Area_0')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Wilderness_Area_1', index='Wilderness_Area_1')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Wilderness_Area_2', index='Wilderness_Area_2')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Wilderness_Area_3', index='Wilderness_Area_3')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_0', index='Soil_Type_0')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_1', index='Soil_Type_1')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_2', index='Soil_Type_2')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_3', index='Soil_Type_3')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_4', index='Soil_Type_4')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_5', index='Soil_Type_5')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_6', index='Soil_Type_6')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_7', index='Soil_Type_7')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_8', index='Soil_Type_8')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_9', index='Soil_Type_9')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_10', index='Soil_Type_10')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_11', index='Soil_Type_11')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_12', index='Soil_Type_12')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_13', index='Soil_Type_13')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_14', index='Soil_Type_14')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_15', index='Soil_Type_15')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_16', index='Soil_Type_16')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_17', index='Soil_Type_17')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_18', index='Soil_Type_18')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_19', index='Soil_Type_19')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_20', index='Soil_Type_20')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_21', index='Soil_Type_21')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_22', index='Soil_Type_22')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_23', index='Soil_Type_23')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_24', index='Soil_Type_24')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_25', index='Soil_Type_25')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_26', index='Soil_Type_26')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_27', index='Soil_Type_27')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_28', index='Soil_Type_28')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_29', index='Soil_Type_29')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_30', index='Soil_Type_30')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_31', index='Soil_Type_31')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_32', index='Soil_Type_32')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_33', index='Soil_Type_33')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_34', index='Soil_Type_34')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_35', index='Soil_Type_35')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_36', index='Soil_Type_36')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_37', index='Soil_Type_37')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_38', index='Soil_Type_38')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_39', index='Soil_Type_39')\n",
+ "es"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "352fa085",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ " type | \n",
+ " dask_compatible | \n",
+ " koalas_compatible | \n",
+ " description | \n",
+ " valid_inputs | \n",
+ " return_type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " all | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Calculates if all values are 'True' in a list. | \n",
+ " Boolean | \n",
+ " Boolean | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " skew | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Computes the extent to which a distribution differs from a normal distribution. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " percent_true | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Determines the percent of `True` values. | \n",
+ " Boolean | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " count | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the total number of values, excluding `NaN`. | \n",
+ " Index | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " num_unique | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the number of distinct values, ignoring `NaN` values. | \n",
+ " Discrete | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " first | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the first value in a list. | \n",
+ " Variable | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " mode | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the most commonly repeated value. | \n",
+ " Discrete | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " entropy | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the entropy for a categorical variable | \n",
+ " Categorical | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " time_since_last | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the time elapsed since the last datetime (default in seconds). | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " any | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Determines if any value is 'True' in a list. | \n",
+ " Boolean | \n",
+ " Boolean | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " last | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the last value in a list. | \n",
+ " Variable | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " avg_time_between | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Computes the average number of seconds between consecutive events. | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " max | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Calculates the highest value, ignoring `NaN` values. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " median | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the middlemost number in a list of values. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " mean | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Computes the average for a list of values. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " num_true | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Counts the number of `True` values. | \n",
+ " Boolean | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " min | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Calculates the smallest value, ignoring `NaN` values. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " sum | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Calculates the total addition, ignoring `NaN`. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " trend | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the trend of a variable over time. | \n",
+ " Numeric, DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " n_most_common | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the `n` most common elements. | \n",
+ " Discrete | \n",
+ " Discrete | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " time_since_first | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the time elapsed since the first datetime (in seconds). | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " std | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Computes the dispersion relative to the mean value, ignoring `NaN`. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name type dask_compatible koalas_compatible \\\n",
+ "0 all aggregation True False \n",
+ "1 skew aggregation False False \n",
+ "2 percent_true aggregation True False \n",
+ "3 count aggregation True True \n",
+ "4 num_unique aggregation True True \n",
+ "5 first aggregation False False \n",
+ "6 mode aggregation False False \n",
+ "7 entropy aggregation False False \n",
+ "8 time_since_last aggregation False False \n",
+ "9 any aggregation True False \n",
+ "10 last aggregation False False \n",
+ "11 avg_time_between aggregation False False \n",
+ "12 max aggregation True True \n",
+ "13 median aggregation False False \n",
+ "14 mean aggregation True True \n",
+ "15 num_true aggregation True False \n",
+ "16 min aggregation True True \n",
+ "17 sum aggregation True True \n",
+ "18 trend aggregation False False \n",
+ "19 n_most_common aggregation False False \n",
+ "20 time_since_first aggregation False False \n",
+ "21 std aggregation True True \n",
+ "\n",
+ " description \\\n",
+ "0 Calculates if all values are 'True' in a list. \n",
+ "1 Computes the extent to which a distribution differs from a normal distribution. \n",
+ "2 Determines the percent of `True` values. \n",
+ "3 Determines the total number of values, excluding `NaN`. \n",
+ "4 Determines the number of distinct values, ignoring `NaN` values. \n",
+ "5 Determines the first value in a list. \n",
+ "6 Determines the most commonly repeated value. \n",
+ "7 Calculates the entropy for a categorical variable \n",
+ "8 Calculates the time elapsed since the last datetime (default in seconds). \n",
+ "9 Determines if any value is 'True' in a list. \n",
+ "10 Determines the last value in a list. \n",
+ "11 Computes the average number of seconds between consecutive events. \n",
+ "12 Calculates the highest value, ignoring `NaN` values. \n",
+ "13 Determines the middlemost number in a list of values. \n",
+ "14 Computes the average for a list of values. \n",
+ "15 Counts the number of `True` values. \n",
+ "16 Calculates the smallest value, ignoring `NaN` values. \n",
+ "17 Calculates the total addition, ignoring `NaN`. \n",
+ "18 Calculates the trend of a variable over time. \n",
+ "19 Determines the `n` most common elements. \n",
+ "20 Calculates the time elapsed since the first datetime (in seconds). \n",
+ "21 Computes the dispersion relative to the mean value, ignoring `NaN`. \n",
+ "\n",
+ " valid_inputs return_type \n",
+ "0 Boolean Boolean \n",
+ "1 Numeric Numeric \n",
+ "2 Boolean Numeric \n",
+ "3 Index Numeric \n",
+ "4 Discrete Numeric \n",
+ "5 Variable None \n",
+ "6 Discrete None \n",
+ "7 Categorical Numeric \n",
+ "8 DatetimeTimeIndex Numeric \n",
+ "9 Boolean Boolean \n",
+ "10 Variable None \n",
+ "11 DatetimeTimeIndex Numeric \n",
+ "12 Numeric Numeric \n",
+ "13 Numeric Numeric \n",
+ "14 Numeric Numeric \n",
+ "15 Boolean Numeric \n",
+ "16 Numeric Numeric \n",
+ "17 Numeric Numeric \n",
+ "18 Numeric, DatetimeTimeIndex Numeric \n",
+ "19 Discrete Discrete \n",
+ "20 DatetimeTimeIndex Numeric \n",
+ "21 Numeric Numeric "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "primitives = ft.list_primitives()\n",
+ "pd.options.display.max_colwidth = 100\n",
+ "primitives[primitives['type'] == 'aggregation'].head(primitives[primitives['type'] == 'aggregation'].shape[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "7762885f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ " type | \n",
+ " dask_compatible | \n",
+ " koalas_compatible | \n",
+ " description | \n",
+ " valid_inputs | \n",
+ " return_type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 22 | \n",
+ " url_to_domain | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the domain of a url. | \n",
+ " URL | \n",
+ " Categorical | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " cum_mean | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the cumulative mean. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " minute | \n",
+ " transform | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the minutes value of a datetime. | \n",
+ " Datetime | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " cum_max | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the cumulative maximum. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " age | \n",
+ " transform | \n",
+ " True | \n",
+ " False | \n",
+ " Calculates the age in years as a floating point number given a | \n",
+ " DateOfBirth | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 79 | \n",
+ " greater_than_scalar | \n",
+ " transform | \n",
+ " True | \n",
+ " True | \n",
+ " Determines if values are greater than a given scalar. | \n",
+ " Numeric, Datetime, Ordinal | \n",
+ " Boolean | \n",
+ "
\n",
+ " \n",
+ " 80 | \n",
+ " url_to_protocol | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the protocol (http or https) of a url. | \n",
+ " URL | \n",
+ " Categorical | \n",
+ "
\n",
+ " \n",
+ " 81 | \n",
+ " month | \n",
+ " transform | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the month value of a datetime. | \n",
+ " Datetime | \n",
+ " Ordinal | \n",
+ "
\n",
+ " \n",
+ " 82 | \n",
+ " divide_numeric_scalar | \n",
+ " transform | \n",
+ " True | \n",
+ " True | \n",
+ " Divide each element in the list by a scalar. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 83 | \n",
+ " time_since_previous | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Compute the time since the previous entry in a list. | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
62 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name type dask_compatible koalas_compatible \\\n",
+ "22 url_to_domain transform False False \n",
+ "23 cum_mean transform False False \n",
+ "24 minute transform True True \n",
+ "25 cum_max transform False False \n",
+ "26 age transform True False \n",
+ ".. ... ... ... ... \n",
+ "79 greater_than_scalar transform True True \n",
+ "80 url_to_protocol transform False False \n",
+ "81 month transform True True \n",
+ "82 divide_numeric_scalar transform True True \n",
+ "83 time_since_previous transform False False \n",
+ "\n",
+ " description \\\n",
+ "22 Determines the domain of a url. \n",
+ "23 Calculates the cumulative mean. \n",
+ "24 Determines the minutes value of a datetime. \n",
+ "25 Calculates the cumulative maximum. \n",
+ "26 Calculates the age in years as a floating point number given a \n",
+ ".. ... \n",
+ "79 Determines if values are greater than a given scalar. \n",
+ "80 Determines the protocol (http or https) of a url. \n",
+ "81 Determines the month value of a datetime. \n",
+ "82 Divide each element in the list by a scalar. \n",
+ "83 Compute the time since the previous entry in a list. \n",
+ "\n",
+ " valid_inputs return_type \n",
+ "22 URL Categorical \n",
+ "23 Numeric Numeric \n",
+ "24 Datetime Numeric \n",
+ "25 Numeric Numeric \n",
+ "26 DateOfBirth Numeric \n",
+ ".. ... ... \n",
+ "79 Numeric, Datetime, Ordinal Boolean \n",
+ "80 URL Categorical \n",
+ "81 Datetime Ordinal \n",
+ "82 Numeric Numeric \n",
+ "83 DatetimeTimeIndex Numeric \n",
+ "\n",
+ "[62 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "primitives[primitives['type'] == 'transform'].head(primitives[primitives['type'] == 'transform'].shape[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2a568eb4",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}