From 6cd891a27804ecca43596e75a7715c8930ce7b65 Mon Sep 17 00:00:00 2001
From: benjas <909336740@qq.com>
Date: Wed, 1 Sep 2021 10:16:05 +0800
Subject: [PATCH] Create Automated feature engineering.ipynb
---
...mated feature engineering-checkpoint.ipynb | 1148 +++++++++++++++++
.../Automated feature engineering.ipynb | 1148 +++++++++++++++++
2 files changed, 2296 insertions(+)
create mode 100644 竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb
create mode 100644 竞赛优胜技巧/Automated feature engineering.ipynb
diff --git a/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb b/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb
new file mode 100644
index 0000000..fea5395
--- /dev/null
+++ b/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb
@@ -0,0 +1,1148 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "33127151",
+ "metadata": {},
+ "source": [
+ "# 自动化特征工程"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "66dfb30d",
+ "metadata": {},
+ "source": [
+ "搬运参考:https://www.kaggle.com/liananapalkova/automated-feature-engineering-for-titanic-dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "91896713",
+ "metadata": {},
+ "source": [
+ "### 1.介绍\n",
+ "如果您曾经为您的ML项目手动创建过数百个特性(我相信您做到了),那么您将乐于了解名为“featuretools”的Python包如何帮助完成这项任务。好消息是这个软件包很容易使用。它的目标是自动化特征工程。当然,人类的专业知识是无法替代的,但是“featuretools”可以自动化大量的日常工作。出于探索目的,这里使用fetch_covtype数据集。\n",
+ "\n",
+ "本笔记本的主要内容包括:\n",
+ "\n",
+ "首先,使用自动特征工程(“featuretools”包),从54个特征总数增加到N个。\n",
+ "\n",
+ "其次,应用特征约简和选择方法,从N个特征中选择X个最相关的特征。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "522eb443",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import sys\n",
+ "print(sys.version) # 版本信息"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "51e62bae",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simpleNote: you may need to restart the kernel to use updated packages.\n",
+ "Collecting featuretools\n",
+ " Downloading https://pypi.tuna.tsinghua.edu.cn/packages/8f/32/b5d02df152aff86f720524540ae516a8e15d7a8c53bd4ee06e2b1ed0c263/featuretools-0.26.2-py3-none-any.whl (327 kB)\n",
+ "Requirement already satisfied: numpy>=1.16.6 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (1.19.5)\n",
+ "Requirement already satisfied: dask[dataframe]>=2.12.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (2021.4.0)\n",
+ "Requirement already satisfied: pyyaml>=5.4 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (5.4.1)\n",
+ "Requirement already satisfied: tqdm>=4.32.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (4.59.0)\n",
+ "Requirement already satisfied: scipy>=1.3.2 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (1.6.2)\n",
+ "Requirement already satisfied: click>=7.0.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (7.1.2)\n",
+ "Requirement already satisfied: pandas<2.0.0,>=1.2.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (1.2.4)\n",
+ "Requirement already satisfied: psutil>=5.6.6 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (5.8.0)\n",
+ "Requirement already satisfied: distributed>=2.12.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (2021.4.0)\n",
+ "Requirement already satisfied: cloudpickle>=0.4.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (1.6.0)\n",
+ "Requirement already satisfied: partd>=0.3.10 in d:\\programdata\\anaconda3\\lib\\site-packages (from dask[dataframe]>=2.12.0->featuretools) (1.2.0)\n",
+ "Requirement already satisfied: fsspec>=0.6.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from dask[dataframe]>=2.12.0->featuretools) (0.9.0)\n",
+ "Requirement already satisfied: toolz>=0.8.2 in d:\\programdata\\anaconda3\\lib\\site-packages (from dask[dataframe]>=2.12.0->featuretools) (0.11.1)\n",
+ "Requirement already satisfied: tblib>=1.6.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (1.7.0)\n",
+ "Requirement already satisfied: zict>=0.1.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (2.0.0)\n",
+ "Requirement already satisfied: sortedcontainers!=2.0.0,!=2.0.1 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (2.3.0)\n",
+ "Requirement already satisfied: tornado>=6.0.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (6.1)\n",
+ "Requirement already satisfied: msgpack>=0.6.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (1.0.2)\n",
+ "Requirement already satisfied: setuptools in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (52.0.0.post20210125)\n",
+ "Requirement already satisfied: python-dateutil>=2.7.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas<2.0.0,>=1.2.0->featuretools) (2.8.1)\n",
+ "Requirement already satisfied: pytz>=2017.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas<2.0.0,>=1.2.0->featuretools) (2021.1)\n",
+ "Requirement already satisfied: locket in d:\\programdata\\anaconda3\\lib\\site-packages\\locket-0.2.1-py3.8.egg (from partd>=0.3.10->dask[dataframe]>=2.12.0->featuretools) (0.2.1)\n",
+ "Requirement already satisfied: six>=1.5 in d:\\programdata\\anaconda3\\lib\\site-packages (from python-dateutil>=2.7.3->pandas<2.0.0,>=1.2.0->featuretools) (1.15.0)\n",
+ "Requirement already satisfied: heapdict in d:\\programdata\\anaconda3\\lib\\site-packages (from zict>=0.1.3->distributed>=2.12.0->featuretools) (1.0.1)\n",
+ "Installing collected packages: featuretools\n",
+ "Successfully installed featuretools-0.26.2\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "pip install featuretools"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "43cc9a46",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import time\n",
+ "import pandas as pd\n",
+ "import featuretools as ft\n",
+ "from featuretools.primitives import *\n",
+ "from featuretools.variable_types import Numeric\n",
+ "from sklearn.svm import LinearSVC\n",
+ "from sklearn.feature_selection import SelectFromModel\n",
+ "# Import the required models; `pip install xxx` for any missing package\n",
+ "\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "import xgboost as xgb \n",
+ "import lightgbm as lgb \n",
+ "\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from sklearn.preprocessing import OrdinalEncoder\n",
+ "from sklearn.metrics import log_loss"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "4c17c0bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.datasets import fetch_covtype\n",
+ "data = fetch_covtype()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "bcce5a3d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "七分类任务,处理前: [1 2 3 4 5 6 7]\n",
+ "[5 5 2 ... 3 3 3]\n",
+ "七分类任务,处理后: [0. 1. 2. 3. 4. 5. 6.]\n",
+ "[4. 4. 1. ... 2. 2. 2.]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Preprocessing\n",
+ "X, y = data['data'], data['target']\n",
+ "# Model labels must start at 0, so shift every class value down by 1\n",
+ "print('七分类任务,处理前:',np.unique(y))\n",
+ "print(y)\n",
+ "encoder = OrdinalEncoder()  # renamed: `ord` shadows the Python built-in ord()\n",
+ "y = encoder.fit_transform(y.reshape(-1, 1))\n",
+ "y = y.reshape(-1, )\n",
+ "print('七分类任务,处理后:',np.unique(y))\n",
+ "print(y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "4afeeca5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " Elevation | \n",
+ " Aspect | \n",
+ " Slope | \n",
+ " Horizontal_Distance_To_Hydrology | \n",
+ " Vertical_Distance_To_Hydrology | \n",
+ " Horizontal_Distance_To_Roadways | \n",
+ " Hillshade_9am | \n",
+ " Hillshade_Noon | \n",
+ " Hillshade_3pm | \n",
+ " ... | \n",
+ " Soil_Type_30 | \n",
+ " Soil_Type_31 | \n",
+ " Soil_Type_32 | \n",
+ " Soil_Type_33 | \n",
+ " Soil_Type_34 | \n",
+ " Soil_Type_35 | \n",
+ " Soil_Type_36 | \n",
+ " Soil_Type_37 | \n",
+ " Soil_Type_38 | \n",
+ " Soil_Type_39 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2596.0 | \n",
+ " 51.0 | \n",
+ " 3.0 | \n",
+ " 258.0 | \n",
+ " 0.0 | \n",
+ " 510.0 | \n",
+ " 221.0 | \n",
+ " 232.0 | \n",
+ " 148.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2590.0 | \n",
+ " 56.0 | \n",
+ " 2.0 | \n",
+ " 212.0 | \n",
+ " -6.0 | \n",
+ " 390.0 | \n",
+ " 220.0 | \n",
+ " 235.0 | \n",
+ " 151.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 55 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index Elevation Aspect Slope Horizontal_Distance_To_Hydrology \\\n",
+ "0 0 2596.0 51.0 3.0 258.0 \n",
+ "1 1 2590.0 56.0 2.0 212.0 \n",
+ "\n",
+ " Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways \\\n",
+ "0 0.0 510.0 \n",
+ "1 -6.0 390.0 \n",
+ "\n",
+ " Hillshade_9am Hillshade_Noon Hillshade_3pm ... Soil_Type_30 \\\n",
+ "0 221.0 232.0 148.0 ... 0.0 \n",
+ "1 220.0 235.0 151.0 ... 0.0 \n",
+ "\n",
+ " Soil_Type_31 Soil_Type_32 Soil_Type_33 Soil_Type_34 Soil_Type_35 \\\n",
+ "0 0.0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Soil_Type_36 Soil_Type_37 Soil_Type_38 Soil_Type_39 \n",
+ "0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ "[2 rows x 55 columns]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X = pd.DataFrame(X,columns=data.feature_names)\n",
+ "X = X.reset_index()\n",
+ "X.head(2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f68429bf",
+ "metadata": {},
+ "source": [
+ "### 2.执行自动化特征工程\n",
+ "需要先确认是否有NaN值"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "06f24545",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ft.EntitySet.entity_from_dataframe?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "f2c69a94",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Entityset: fetch_covtype_data\n",
+ " Entities:\n",
+ " X [Rows: 581012, Columns: 55]\n",
+ " Relationships:\n",
+ " No relationships"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "es = ft.EntitySet(id = 'fetch_covtype_data')\n",
+ "es = es.entity_from_dataframe(entity_id = 'X', dataframe = X, \n",
+ " variable_types = \n",
+ " {\n",
+ " 'Aspect': ft.variable_types.Categorical,\n",
+ " 'Slope': ft.variable_types.Categorical,\n",
+ " 'Hillshade_9am': ft.variable_types.Categorical,\n",
+ " 'Hillshade_Noon': ft.variable_types.Categorical,\n",
+ " 'Hillshade_3pm': ft.variable_types.Categorical,\n",
+ " 'Wilderness_Area_0': ft.variable_types.Boolean,\n",
+ " 'Wilderness_Area_1': ft.variable_types.Boolean,\n",
+ " 'Wilderness_Area_2': ft.variable_types.Boolean,\n",
+ " 'Wilderness_Area_3': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_0': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_1': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_2': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_3': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_4': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_5': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_6': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_7': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_8': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_9': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_10': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_11': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_12': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_13': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_14': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_15': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_16': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_17': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_18': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_19': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_20': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_21': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_22': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_23': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_24': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_25': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_26': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_27': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_28': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_29': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_30': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_31': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_32': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_33': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_34': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_35': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_36': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_37': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_38': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_39': ft.variable_types.Boolean\n",
+ " },\n",
+ " index = 'index')\n",
+ "\n",
+ "es"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "770130bc",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Entityset: fetch_covtype_data\n",
+ " Entities:\n",
+ " X [Rows: 581012, Columns: 55]\n",
+ " Wilderness_Area_0 [Rows: 2, Columns: 1]\n",
+ " Wilderness_Area_1 [Rows: 2, Columns: 1]\n",
+ " Wilderness_Area_2 [Rows: 2, Columns: 1]\n",
+ " Wilderness_Area_3 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_0 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_1 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_2 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_3 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_4 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_5 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_6 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_7 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_8 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_9 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_10 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_11 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_12 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_13 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_14 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_15 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_16 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_17 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_18 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_19 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_20 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_21 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_22 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_23 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_24 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_25 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_26 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_27 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_28 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_29 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_30 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_31 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_32 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_33 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_34 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_35 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_36 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_37 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_38 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_39 [Rows: 2, Columns: 1]\n",
+ " Relationships:\n",
+ " X.Wilderness_Area_0 -> Wilderness_Area_0.Wilderness_Area_0\n",
+ " X.Wilderness_Area_1 -> Wilderness_Area_1.Wilderness_Area_1\n",
+ " X.Wilderness_Area_2 -> Wilderness_Area_2.Wilderness_Area_2\n",
+ " X.Wilderness_Area_3 -> Wilderness_Area_3.Wilderness_Area_3\n",
+ " X.Soil_Type_0 -> Soil_Type_0.Soil_Type_0\n",
+ " X.Soil_Type_1 -> Soil_Type_1.Soil_Type_1\n",
+ " X.Soil_Type_2 -> Soil_Type_2.Soil_Type_2\n",
+ " X.Soil_Type_3 -> Soil_Type_3.Soil_Type_3\n",
+ " X.Soil_Type_4 -> Soil_Type_4.Soil_Type_4\n",
+ " X.Soil_Type_5 -> Soil_Type_5.Soil_Type_5\n",
+ " X.Soil_Type_6 -> Soil_Type_6.Soil_Type_6\n",
+ " X.Soil_Type_7 -> Soil_Type_7.Soil_Type_7\n",
+ " X.Soil_Type_8 -> Soil_Type_8.Soil_Type_8\n",
+ " X.Soil_Type_9 -> Soil_Type_9.Soil_Type_9\n",
+ " X.Soil_Type_10 -> Soil_Type_10.Soil_Type_10\n",
+ " X.Soil_Type_11 -> Soil_Type_11.Soil_Type_11\n",
+ " X.Soil_Type_12 -> Soil_Type_12.Soil_Type_12\n",
+ " X.Soil_Type_13 -> Soil_Type_13.Soil_Type_13\n",
+ " X.Soil_Type_14 -> Soil_Type_14.Soil_Type_14\n",
+ " X.Soil_Type_15 -> Soil_Type_15.Soil_Type_15\n",
+ " X.Soil_Type_16 -> Soil_Type_16.Soil_Type_16\n",
+ " X.Soil_Type_17 -> Soil_Type_17.Soil_Type_17\n",
+ " X.Soil_Type_18 -> Soil_Type_18.Soil_Type_18\n",
+ " X.Soil_Type_19 -> Soil_Type_19.Soil_Type_19\n",
+ " X.Soil_Type_20 -> Soil_Type_20.Soil_Type_20\n",
+ " X.Soil_Type_21 -> Soil_Type_21.Soil_Type_21\n",
+ " X.Soil_Type_22 -> Soil_Type_22.Soil_Type_22\n",
+ " X.Soil_Type_23 -> Soil_Type_23.Soil_Type_23\n",
+ " X.Soil_Type_24 -> Soil_Type_24.Soil_Type_24\n",
+ " X.Soil_Type_25 -> Soil_Type_25.Soil_Type_25\n",
+ " X.Soil_Type_26 -> Soil_Type_26.Soil_Type_26\n",
+ " X.Soil_Type_27 -> Soil_Type_27.Soil_Type_27\n",
+ " X.Soil_Type_28 -> Soil_Type_28.Soil_Type_28\n",
+ " X.Soil_Type_29 -> Soil_Type_29.Soil_Type_29\n",
+ " X.Soil_Type_30 -> Soil_Type_30.Soil_Type_30\n",
+ " X.Soil_Type_31 -> Soil_Type_31.Soil_Type_31\n",
+ " X.Soil_Type_32 -> Soil_Type_32.Soil_Type_32\n",
+ " X.Soil_Type_33 -> Soil_Type_33.Soil_Type_33\n",
+ " X.Soil_Type_34 -> Soil_Type_34.Soil_Type_34\n",
+ " X.Soil_Type_35 -> Soil_Type_35.Soil_Type_35\n",
+ " X.Soil_Type_36 -> Soil_Type_36.Soil_Type_36\n",
+ " X.Soil_Type_37 -> Soil_Type_37.Soil_Type_37\n",
+ " X.Soil_Type_38 -> Soil_Type_38.Soil_Type_38\n",
+ " X.Soil_Type_39 -> Soil_Type_39.Soil_Type_39"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Wilderness_Area_0', index='Wilderness_Area_0')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Wilderness_Area_1', index='Wilderness_Area_1')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Wilderness_Area_2', index='Wilderness_Area_2')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Wilderness_Area_3', index='Wilderness_Area_3')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_0', index='Soil_Type_0')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_1', index='Soil_Type_1')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_2', index='Soil_Type_2')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_3', index='Soil_Type_3')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_4', index='Soil_Type_4')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_5', index='Soil_Type_5')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_6', index='Soil_Type_6')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_7', index='Soil_Type_7')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_8', index='Soil_Type_8')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_9', index='Soil_Type_9')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_10', index='Soil_Type_10')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_11', index='Soil_Type_11')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_12', index='Soil_Type_12')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_13', index='Soil_Type_13')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_14', index='Soil_Type_14')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_15', index='Soil_Type_15')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_16', index='Soil_Type_16')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_17', index='Soil_Type_17')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_18', index='Soil_Type_18')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_19', index='Soil_Type_19')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_20', index='Soil_Type_20')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_21', index='Soil_Type_21')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_22', index='Soil_Type_22')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_23', index='Soil_Type_23')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_24', index='Soil_Type_24')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_25', index='Soil_Type_25')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_26', index='Soil_Type_26')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_27', index='Soil_Type_27')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_28', index='Soil_Type_28')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_29', index='Soil_Type_29')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_30', index='Soil_Type_30')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_31', index='Soil_Type_31')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_32', index='Soil_Type_32')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_33', index='Soil_Type_33')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_34', index='Soil_Type_34')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_35', index='Soil_Type_35')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_36', index='Soil_Type_36')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_37', index='Soil_Type_37')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_38', index='Soil_Type_38')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_39', index='Soil_Type_39')\n",
+ "es"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "352fa085",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ " type | \n",
+ " dask_compatible | \n",
+ " koalas_compatible | \n",
+ " description | \n",
+ " valid_inputs | \n",
+ " return_type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " all | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Calculates if all values are 'True' in a list. | \n",
+ " Boolean | \n",
+ " Boolean | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " skew | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Computes the extent to which a distribution differs from a normal distribution. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " percent_true | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Determines the percent of `True` values. | \n",
+ " Boolean | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " count | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the total number of values, excluding `NaN`. | \n",
+ " Index | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " num_unique | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the number of distinct values, ignoring `NaN` values. | \n",
+ " Discrete | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " first | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the first value in a list. | \n",
+ " Variable | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " mode | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the most commonly repeated value. | \n",
+ " Discrete | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " entropy | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the entropy for a categorical variable | \n",
+ " Categorical | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " time_since_last | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the time elapsed since the last datetime (default in seconds). | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " any | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Determines if any value is 'True' in a list. | \n",
+ " Boolean | \n",
+ " Boolean | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " last | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the last value in a list. | \n",
+ " Variable | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " avg_time_between | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Computes the average number of seconds between consecutive events. | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " max | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Calculates the highest value, ignoring `NaN` values. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " median | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the middlemost number in a list of values. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " mean | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Computes the average for a list of values. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " num_true | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Counts the number of `True` values. | \n",
+ " Boolean | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " min | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Calculates the smallest value, ignoring `NaN` values. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " sum | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Calculates the total addition, ignoring `NaN`. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " trend | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the trend of a variable over time. | \n",
+ " Numeric, DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " n_most_common | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the `n` most common elements. | \n",
+ " Discrete | \n",
+ " Discrete | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " time_since_first | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the time elapsed since the first datetime (in seconds). | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " std | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Computes the dispersion relative to the mean value, ignoring `NaN`. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name type dask_compatible koalas_compatible \\\n",
+ "0 all aggregation True False \n",
+ "1 skew aggregation False False \n",
+ "2 percent_true aggregation True False \n",
+ "3 count aggregation True True \n",
+ "4 num_unique aggregation True True \n",
+ "5 first aggregation False False \n",
+ "6 mode aggregation False False \n",
+ "7 entropy aggregation False False \n",
+ "8 time_since_last aggregation False False \n",
+ "9 any aggregation True False \n",
+ "10 last aggregation False False \n",
+ "11 avg_time_between aggregation False False \n",
+ "12 max aggregation True True \n",
+ "13 median aggregation False False \n",
+ "14 mean aggregation True True \n",
+ "15 num_true aggregation True False \n",
+ "16 min aggregation True True \n",
+ "17 sum aggregation True True \n",
+ "18 trend aggregation False False \n",
+ "19 n_most_common aggregation False False \n",
+ "20 time_since_first aggregation False False \n",
+ "21 std aggregation True True \n",
+ "\n",
+ " description \\\n",
+ "0 Calculates if all values are 'True' in a list. \n",
+ "1 Computes the extent to which a distribution differs from a normal distribution. \n",
+ "2 Determines the percent of `True` values. \n",
+ "3 Determines the total number of values, excluding `NaN`. \n",
+ "4 Determines the number of distinct values, ignoring `NaN` values. \n",
+ "5 Determines the first value in a list. \n",
+ "6 Determines the most commonly repeated value. \n",
+ "7 Calculates the entropy for a categorical variable \n",
+ "8 Calculates the time elapsed since the last datetime (default in seconds). \n",
+ "9 Determines if any value is 'True' in a list. \n",
+ "10 Determines the last value in a list. \n",
+ "11 Computes the average number of seconds between consecutive events. \n",
+ "12 Calculates the highest value, ignoring `NaN` values. \n",
+ "13 Determines the middlemost number in a list of values. \n",
+ "14 Computes the average for a list of values. \n",
+ "15 Counts the number of `True` values. \n",
+ "16 Calculates the smallest value, ignoring `NaN` values. \n",
+ "17 Calculates the total addition, ignoring `NaN`. \n",
+ "18 Calculates the trend of a variable over time. \n",
+ "19 Determines the `n` most common elements. \n",
+ "20 Calculates the time elapsed since the first datetime (in seconds). \n",
+ "21 Computes the dispersion relative to the mean value, ignoring `NaN`. \n",
+ "\n",
+ " valid_inputs return_type \n",
+ "0 Boolean Boolean \n",
+ "1 Numeric Numeric \n",
+ "2 Boolean Numeric \n",
+ "3 Index Numeric \n",
+ "4 Discrete Numeric \n",
+ "5 Variable None \n",
+ "6 Discrete None \n",
+ "7 Categorical Numeric \n",
+ "8 DatetimeTimeIndex Numeric \n",
+ "9 Boolean Boolean \n",
+ "10 Variable None \n",
+ "11 DatetimeTimeIndex Numeric \n",
+ "12 Numeric Numeric \n",
+ "13 Numeric Numeric \n",
+ "14 Numeric Numeric \n",
+ "15 Boolean Numeric \n",
+ "16 Numeric Numeric \n",
+ "17 Numeric Numeric \n",
+ "18 Numeric, DatetimeTimeIndex Numeric \n",
+ "19 Discrete Discrete \n",
+ "20 DatetimeTimeIndex Numeric \n",
+ "21 Numeric Numeric "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "primitives = ft.list_primitives()\n",
+ "pd.options.display.max_colwidth = 100\n",
+ "primitives[primitives['type'] == 'aggregation'].head(primitives[primitives['type'] == 'aggregation'].shape[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "7762885f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ " type | \n",
+ " dask_compatible | \n",
+ " koalas_compatible | \n",
+ " description | \n",
+ " valid_inputs | \n",
+ " return_type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 22 | \n",
+ " url_to_domain | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the domain of a url. | \n",
+ " URL | \n",
+ " Categorical | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " cum_mean | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the cumulative mean. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " minute | \n",
+ " transform | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the minutes value of a datetime. | \n",
+ " Datetime | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " cum_max | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the cumulative maximum. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " age | \n",
+ " transform | \n",
+ " True | \n",
+ " False | \n",
+ " Calculates the age in years as a floating point number given a | \n",
+ " DateOfBirth | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 79 | \n",
+ " greater_than_scalar | \n",
+ " transform | \n",
+ " True | \n",
+ " True | \n",
+ " Determines if values are greater than a given scalar. | \n",
+ " Numeric, Datetime, Ordinal | \n",
+ " Boolean | \n",
+ "
\n",
+ " \n",
+ " 80 | \n",
+ " url_to_protocol | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the protocol (http or https) of a url. | \n",
+ " URL | \n",
+ " Categorical | \n",
+ "
\n",
+ " \n",
+ " 81 | \n",
+ " month | \n",
+ " transform | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the month value of a datetime. | \n",
+ " Datetime | \n",
+ " Ordinal | \n",
+ "
\n",
+ " \n",
+ " 82 | \n",
+ " divide_numeric_scalar | \n",
+ " transform | \n",
+ " True | \n",
+ " True | \n",
+ " Divide each element in the list by a scalar. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 83 | \n",
+ " time_since_previous | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Compute the time since the previous entry in a list. | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
62 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name type dask_compatible koalas_compatible \\\n",
+ "22 url_to_domain transform False False \n",
+ "23 cum_mean transform False False \n",
+ "24 minute transform True True \n",
+ "25 cum_max transform False False \n",
+ "26 age transform True False \n",
+ ".. ... ... ... ... \n",
+ "79 greater_than_scalar transform True True \n",
+ "80 url_to_protocol transform False False \n",
+ "81 month transform True True \n",
+ "82 divide_numeric_scalar transform True True \n",
+ "83 time_since_previous transform False False \n",
+ "\n",
+ " description \\\n",
+ "22 Determines the domain of a url. \n",
+ "23 Calculates the cumulative mean. \n",
+ "24 Determines the minutes value of a datetime. \n",
+ "25 Calculates the cumulative maximum. \n",
+ "26 Calculates the age in years as a floating point number given a \n",
+ ".. ... \n",
+ "79 Determines if values are greater than a given scalar. \n",
+ "80 Determines the protocol (http or https) of a url. \n",
+ "81 Determines the month value of a datetime. \n",
+ "82 Divide each element in the list by a scalar. \n",
+ "83 Compute the time since the previous entry in a list. \n",
+ "\n",
+ " valid_inputs return_type \n",
+ "22 URL Categorical \n",
+ "23 Numeric Numeric \n",
+ "24 Datetime Numeric \n",
+ "25 Numeric Numeric \n",
+ "26 DateOfBirth Numeric \n",
+ ".. ... ... \n",
+ "79 Numeric, Datetime, Ordinal Boolean \n",
+ "80 URL Categorical \n",
+ "81 Datetime Ordinal \n",
+ "82 Numeric Numeric \n",
+ "83 DatetimeTimeIndex Numeric \n",
+ "\n",
+ "[62 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "primitives[primitives['type'] == 'transform'].head(primitives[primitives['type'] == 'transform'].shape[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2a568eb4",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/竞赛优胜技巧/Automated feature engineering.ipynb b/竞赛优胜技巧/Automated feature engineering.ipynb
new file mode 100644
index 0000000..fea5395
--- /dev/null
+++ b/竞赛优胜技巧/Automated feature engineering.ipynb
@@ -0,0 +1,1148 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "33127151",
+ "metadata": {},
+ "source": [
+ "# 自动化特征工程"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "66dfb30d",
+ "metadata": {},
+ "source": [
+ "搬运参考:https://www.kaggle.com/liananapalkova/automated-feature-engineering-for-titanic-dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "91896713",
+ "metadata": {},
+ "source": [
+ "### 1.介绍\n",
+ "如果您曾经为您的ML项目手动创建过数百个特性(我相信您做到了),那么您将乐于了解名为“featuretools”的Python包如何帮助完成这项任务。好消息是这个软件包很容易使用。它的目标是自动化特征工程。当然,人类的专业知识是无法替代的,但是“featuretools”可以自动化大量的日常工作。出于探索目的,这里使用fetch_covtype数据集。\n",
+ "\n",
+ "本笔记本的主要内容包括:\n",
+ "\n",
+ "首先,使用自动特征工程(“featuretools”包),从54个特征总数增加到N个。\n",
+ "\n",
+ "其次,应用特征约简和选择方法,从N个特征中选择X个最相关的特征。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "522eb443",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import sys\n",
+ "print(sys.version) # 版本信息"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "51e62bae",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simpleNote: you may need to restart the kernel to use updated packages.\n",
+ "Collecting featuretools\n",
+ " Downloading https://pypi.tuna.tsinghua.edu.cn/packages/8f/32/b5d02df152aff86f720524540ae516a8e15d7a8c53bd4ee06e2b1ed0c263/featuretools-0.26.2-py3-none-any.whl (327 kB)\n",
+ "Requirement already satisfied: numpy>=1.16.6 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (1.19.5)\n",
+ "Requirement already satisfied: dask[dataframe]>=2.12.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (2021.4.0)\n",
+ "Requirement already satisfied: pyyaml>=5.4 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (5.4.1)\n",
+ "Requirement already satisfied: tqdm>=4.32.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (4.59.0)\n",
+ "Requirement already satisfied: scipy>=1.3.2 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (1.6.2)\n",
+ "Requirement already satisfied: click>=7.0.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (7.1.2)\n",
+ "Requirement already satisfied: pandas<2.0.0,>=1.2.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (1.2.4)\n",
+ "Requirement already satisfied: psutil>=5.6.6 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (5.8.0)\n",
+ "Requirement already satisfied: distributed>=2.12.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (2021.4.0)\n",
+ "Requirement already satisfied: cloudpickle>=0.4.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from featuretools) (1.6.0)\n",
+ "Requirement already satisfied: partd>=0.3.10 in d:\\programdata\\anaconda3\\lib\\site-packages (from dask[dataframe]>=2.12.0->featuretools) (1.2.0)\n",
+ "Requirement already satisfied: fsspec>=0.6.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from dask[dataframe]>=2.12.0->featuretools) (0.9.0)\n",
+ "Requirement already satisfied: toolz>=0.8.2 in d:\\programdata\\anaconda3\\lib\\site-packages (from dask[dataframe]>=2.12.0->featuretools) (0.11.1)\n",
+ "Requirement already satisfied: tblib>=1.6.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (1.7.0)\n",
+ "Requirement already satisfied: zict>=0.1.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (2.0.0)\n",
+ "Requirement already satisfied: sortedcontainers!=2.0.0,!=2.0.1 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (2.3.0)\n",
+ "Requirement already satisfied: tornado>=6.0.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (6.1)\n",
+ "Requirement already satisfied: msgpack>=0.6.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (1.0.2)\n",
+ "Requirement already satisfied: setuptools in d:\\programdata\\anaconda3\\lib\\site-packages (from distributed>=2.12.0->featuretools) (52.0.0.post20210125)\n",
+ "Requirement already satisfied: python-dateutil>=2.7.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas<2.0.0,>=1.2.0->featuretools) (2.8.1)\n",
+ "Requirement already satisfied: pytz>=2017.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas<2.0.0,>=1.2.0->featuretools) (2021.1)\n",
+ "Requirement already satisfied: locket in d:\\programdata\\anaconda3\\lib\\site-packages\\locket-0.2.1-py3.8.egg (from partd>=0.3.10->dask[dataframe]>=2.12.0->featuretools) (0.2.1)\n",
+ "Requirement already satisfied: six>=1.5 in d:\\programdata\\anaconda3\\lib\\site-packages (from python-dateutil>=2.7.3->pandas<2.0.0,>=1.2.0->featuretools) (1.15.0)\n",
+ "Requirement already satisfied: heapdict in d:\\programdata\\anaconda3\\lib\\site-packages (from zict>=0.1.3->distributed>=2.12.0->featuretools) (1.0.1)\n",
+ "Installing collected packages: featuretools\n",
+ "Successfully installed featuretools-0.26.2\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "pip install featuretools"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "43cc9a46",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import time\n",
+ "\n",
+ "import featuretools as ft\n",
+ "from featuretools.primitives import *\n",
+ "from featuretools.variable_types import Numeric\n",
+ "from sklearn.svm import LinearSVC\n",
+ "from sklearn.feature_selection import SelectFromModel\n",
+ "# 导入相关模型,没有的pip install xxx 即可\n",
+ "\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "import xgboost as xgb \n",
+ "import lightgbm as lgb \n",
+ "\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from sklearn.preprocessing import OrdinalEncoder\n",
+ "from sklearn.metrics import log_loss"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "4c17c0bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.datasets import fetch_covtype\n",
+ "data = fetch_covtype()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "bcce5a3d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "七分类任务,处理前: [1 2 3 4 5 6 7]\n",
+ "[5 5 2 ... 3 3 3]\n",
+ "七分类任务,处理后: [0. 1. 2. 3. 4. 5. 6.]\n",
+ "[4. 4. 1. ... 2. 2. 2.]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 预处理\n",
+ "X, y = data['data'], data['target']\n",
+ "# 由于模型标签需要从0开始,所以数字需要全部减1\n",
+ "print('七分类任务,处理前:',np.unique(y))\n",
+ "print(y)\n",
+ "ord = OrdinalEncoder()\n",
+ "y = ord.fit_transform(y.reshape(-1, 1))\n",
+ "y = y.reshape(-1, )\n",
+ "print('七分类任务,处理后:',np.unique(y))\n",
+ "print(y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "4afeeca5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " Elevation | \n",
+ " Aspect | \n",
+ " Slope | \n",
+ " Horizontal_Distance_To_Hydrology | \n",
+ " Vertical_Distance_To_Hydrology | \n",
+ " Horizontal_Distance_To_Roadways | \n",
+ " Hillshade_9am | \n",
+ " Hillshade_Noon | \n",
+ " Hillshade_3pm | \n",
+ " ... | \n",
+ " Soil_Type_30 | \n",
+ " Soil_Type_31 | \n",
+ " Soil_Type_32 | \n",
+ " Soil_Type_33 | \n",
+ " Soil_Type_34 | \n",
+ " Soil_Type_35 | \n",
+ " Soil_Type_36 | \n",
+ " Soil_Type_37 | \n",
+ " Soil_Type_38 | \n",
+ " Soil_Type_39 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2596.0 | \n",
+ " 51.0 | \n",
+ " 3.0 | \n",
+ " 258.0 | \n",
+ " 0.0 | \n",
+ " 510.0 | \n",
+ " 221.0 | \n",
+ " 232.0 | \n",
+ " 148.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2590.0 | \n",
+ " 56.0 | \n",
+ " 2.0 | \n",
+ " 212.0 | \n",
+ " -6.0 | \n",
+ " 390.0 | \n",
+ " 220.0 | \n",
+ " 235.0 | \n",
+ " 151.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 55 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index Elevation Aspect Slope Horizontal_Distance_To_Hydrology \\\n",
+ "0 0 2596.0 51.0 3.0 258.0 \n",
+ "1 1 2590.0 56.0 2.0 212.0 \n",
+ "\n",
+ " Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways \\\n",
+ "0 0.0 510.0 \n",
+ "1 -6.0 390.0 \n",
+ "\n",
+ " Hillshade_9am Hillshade_Noon Hillshade_3pm ... Soil_Type_30 \\\n",
+ "0 221.0 232.0 148.0 ... 0.0 \n",
+ "1 220.0 235.0 151.0 ... 0.0 \n",
+ "\n",
+ " Soil_Type_31 Soil_Type_32 Soil_Type_33 Soil_Type_34 Soil_Type_35 \\\n",
+ "0 0.0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Soil_Type_36 Soil_Type_37 Soil_Type_38 Soil_Type_39 \n",
+ "0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ "[2 rows x 55 columns]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X = pd.DataFrame(X,columns=data.feature_names)\n",
+ "X = X.reset_index()\n",
+ "X.head(2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f68429bf",
+ "metadata": {},
+ "source": [
+ "### 2.执行自动化特征工程\n",
+ "需要先确认是否有NaN值"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "06f24545",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "es.entity_from_dataframe?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "f2c69a94",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Entityset: fetch_covtype_data\n",
+ " Entities:\n",
+ " X [Rows: 581012, Columns: 55]\n",
+ " Relationships:\n",
+ " No relationships"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "es = ft.EntitySet(id = 'fetch_covtype_data')\n",
+ "es = es.entity_from_dataframe(entity_id = 'X', dataframe = X, \n",
+ " variable_types = \n",
+ " {\n",
+ " 'Aspect': ft.variable_types.Categorical,\n",
+ " 'Slope': ft.variable_types.Categorical,\n",
+ " 'Hillshade_9am': ft.variable_types.Categorical,\n",
+ " 'Hillshade_Noon': ft.variable_types.Categorical,\n",
+ " 'Hillshade_3pm': ft.variable_types.Categorical,\n",
+ " 'Wilderness_Area_0': ft.variable_types.Boolean,\n",
+ " 'Wilderness_Area_1': ft.variable_types.Boolean,\n",
+ " 'Wilderness_Area_2': ft.variable_types.Boolean,\n",
+ " 'Wilderness_Area_3': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_0': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_1': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_2': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_3': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_4': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_5': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_6': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_7': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_8': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_9': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_10': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_11': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_12': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_13': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_14': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_15': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_16': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_17': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_18': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_19': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_20': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_21': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_22': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_23': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_24': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_25': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_26': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_27': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_28': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_29': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_30': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_31': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_32': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_33': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_34': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_35': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_36': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_37': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_38': ft.variable_types.Boolean,\n",
+ " 'Soil_Type_39': ft.variable_types.Boolean\n",
+ " },\n",
+ " index = 'index')\n",
+ "\n",
+ "es"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "770130bc",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Entityset: fetch_covtype_data\n",
+ " Entities:\n",
+ " X [Rows: 581012, Columns: 55]\n",
+ " Wilderness_Area_0 [Rows: 2, Columns: 1]\n",
+ " Wilderness_Area_1 [Rows: 2, Columns: 1]\n",
+ " Wilderness_Area_2 [Rows: 2, Columns: 1]\n",
+ " Wilderness_Area_3 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_0 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_1 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_2 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_3 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_4 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_5 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_6 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_7 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_8 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_9 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_10 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_11 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_12 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_13 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_14 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_15 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_16 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_17 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_18 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_19 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_20 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_21 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_22 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_23 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_24 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_25 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_26 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_27 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_28 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_29 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_30 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_31 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_32 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_33 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_34 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_35 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_36 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_37 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_38 [Rows: 2, Columns: 1]\n",
+ " Soil_Type_39 [Rows: 2, Columns: 1]\n",
+ " Relationships:\n",
+ " X.Wilderness_Area_0 -> Wilderness_Area_0.Wilderness_Area_0\n",
+ " X.Wilderness_Area_1 -> Wilderness_Area_1.Wilderness_Area_1\n",
+ " X.Wilderness_Area_2 -> Wilderness_Area_2.Wilderness_Area_2\n",
+ " X.Wilderness_Area_3 -> Wilderness_Area_3.Wilderness_Area_3\n",
+ " X.Soil_Type_0 -> Soil_Type_0.Soil_Type_0\n",
+ " X.Soil_Type_1 -> Soil_Type_1.Soil_Type_1\n",
+ " X.Soil_Type_2 -> Soil_Type_2.Soil_Type_2\n",
+ " X.Soil_Type_3 -> Soil_Type_3.Soil_Type_3\n",
+ " X.Soil_Type_4 -> Soil_Type_4.Soil_Type_4\n",
+ " X.Soil_Type_5 -> Soil_Type_5.Soil_Type_5\n",
+ " X.Soil_Type_6 -> Soil_Type_6.Soil_Type_6\n",
+ " X.Soil_Type_7 -> Soil_Type_7.Soil_Type_7\n",
+ " X.Soil_Type_8 -> Soil_Type_8.Soil_Type_8\n",
+ " X.Soil_Type_9 -> Soil_Type_9.Soil_Type_9\n",
+ " X.Soil_Type_10 -> Soil_Type_10.Soil_Type_10\n",
+ " X.Soil_Type_11 -> Soil_Type_11.Soil_Type_11\n",
+ " X.Soil_Type_12 -> Soil_Type_12.Soil_Type_12\n",
+ " X.Soil_Type_13 -> Soil_Type_13.Soil_Type_13\n",
+ " X.Soil_Type_14 -> Soil_Type_14.Soil_Type_14\n",
+ " X.Soil_Type_15 -> Soil_Type_15.Soil_Type_15\n",
+ " X.Soil_Type_16 -> Soil_Type_16.Soil_Type_16\n",
+ " X.Soil_Type_17 -> Soil_Type_17.Soil_Type_17\n",
+ " X.Soil_Type_18 -> Soil_Type_18.Soil_Type_18\n",
+ " X.Soil_Type_19 -> Soil_Type_19.Soil_Type_19\n",
+ " X.Soil_Type_20 -> Soil_Type_20.Soil_Type_20\n",
+ " X.Soil_Type_21 -> Soil_Type_21.Soil_Type_21\n",
+ " X.Soil_Type_22 -> Soil_Type_22.Soil_Type_22\n",
+ " X.Soil_Type_23 -> Soil_Type_23.Soil_Type_23\n",
+ " X.Soil_Type_24 -> Soil_Type_24.Soil_Type_24\n",
+ " X.Soil_Type_25 -> Soil_Type_25.Soil_Type_25\n",
+ " X.Soil_Type_26 -> Soil_Type_26.Soil_Type_26\n",
+ " X.Soil_Type_27 -> Soil_Type_27.Soil_Type_27\n",
+ " X.Soil_Type_28 -> Soil_Type_28.Soil_Type_28\n",
+ " X.Soil_Type_29 -> Soil_Type_29.Soil_Type_29\n",
+ " X.Soil_Type_30 -> Soil_Type_30.Soil_Type_30\n",
+ " X.Soil_Type_31 -> Soil_Type_31.Soil_Type_31\n",
+ " X.Soil_Type_32 -> Soil_Type_32.Soil_Type_32\n",
+ " X.Soil_Type_33 -> Soil_Type_33.Soil_Type_33\n",
+ " X.Soil_Type_34 -> Soil_Type_34.Soil_Type_34\n",
+ " X.Soil_Type_35 -> Soil_Type_35.Soil_Type_35\n",
+ " X.Soil_Type_36 -> Soil_Type_36.Soil_Type_36\n",
+ " X.Soil_Type_37 -> Soil_Type_37.Soil_Type_37\n",
+ " X.Soil_Type_38 -> Soil_Type_38.Soil_Type_38\n",
+ " X.Soil_Type_39 -> Soil_Type_39.Soil_Type_39"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Wilderness_Area_0', index='Wilderness_Area_0')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Wilderness_Area_1', index='Wilderness_Area_1')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Wilderness_Area_2', index='Wilderness_Area_2')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Wilderness_Area_3', index='Wilderness_Area_3')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_0', index='Soil_Type_0')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_1', index='Soil_Type_1')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_2', index='Soil_Type_2')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_3', index='Soil_Type_3')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_4', index='Soil_Type_4')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_5', index='Soil_Type_5')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_6', index='Soil_Type_6')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_7', index='Soil_Type_7')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_8', index='Soil_Type_8')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_9', index='Soil_Type_9')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_10', index='Soil_Type_10')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_11', index='Soil_Type_11')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_12', index='Soil_Type_12')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_13', index='Soil_Type_13')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_14', index='Soil_Type_14')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_15', index='Soil_Type_15')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_16', index='Soil_Type_16')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_17', index='Soil_Type_17')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_18', index='Soil_Type_18')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_19', index='Soil_Type_19')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_20', index='Soil_Type_20')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_21', index='Soil_Type_21')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_22', index='Soil_Type_22')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_23', index='Soil_Type_23')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_24', index='Soil_Type_24')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_25', index='Soil_Type_25')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_26', index='Soil_Type_26')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_27', index='Soil_Type_27')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_28', index='Soil_Type_28')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_29', index='Soil_Type_29')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_30', index='Soil_Type_30')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_31', index='Soil_Type_31')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_32', index='Soil_Type_32')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_33', index='Soil_Type_33')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_34', index='Soil_Type_34')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_35', index='Soil_Type_35')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_36', index='Soil_Type_36')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_37', index='Soil_Type_37')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_38', index='Soil_Type_38')\n",
+ "es = es.normalize_entity(base_entity_id='X', new_entity_id='Soil_Type_39', index='Soil_Type_39')\n",
+ "es"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "352fa085",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ " type | \n",
+ " dask_compatible | \n",
+ " koalas_compatible | \n",
+ " description | \n",
+ " valid_inputs | \n",
+ " return_type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " all | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Calculates if all values are 'True' in a list. | \n",
+ " Boolean | \n",
+ " Boolean | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " skew | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Computes the extent to which a distribution differs from a normal distribution. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " percent_true | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Determines the percent of `True` values. | \n",
+ " Boolean | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " count | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the total number of values, excluding `NaN`. | \n",
+ " Index | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " num_unique | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the number of distinct values, ignoring `NaN` values. | \n",
+ " Discrete | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " first | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the first value in a list. | \n",
+ " Variable | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " mode | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the most commonly repeated value. | \n",
+ " Discrete | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " entropy | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the entropy for a categorical variable | \n",
+ " Categorical | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " time_since_last | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the time elapsed since the last datetime (default in seconds). | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " any | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Determines if any value is 'True' in a list. | \n",
+ " Boolean | \n",
+ " Boolean | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " last | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the last value in a list. | \n",
+ " Variable | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " avg_time_between | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Computes the average number of seconds between consecutive events. | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " max | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Calculates the highest value, ignoring `NaN` values. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " median | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the middlemost number in a list of values. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " mean | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Computes the average for a list of values. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " num_true | \n",
+ " aggregation | \n",
+ " True | \n",
+ " False | \n",
+ " Counts the number of `True` values. | \n",
+ " Boolean | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " min | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Calculates the smallest value, ignoring `NaN` values. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " sum | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Calculates the total addition, ignoring `NaN`. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " trend | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the trend of a variable over time. | \n",
+ " Numeric, DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " n_most_common | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the `n` most common elements. | \n",
+ " Discrete | \n",
+ " Discrete | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " time_since_first | \n",
+ " aggregation | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the time elapsed since the first datetime (in seconds). | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " std | \n",
+ " aggregation | \n",
+ " True | \n",
+ " True | \n",
+ " Computes the dispersion relative to the mean value, ignoring `NaN`. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name type dask_compatible koalas_compatible \\\n",
+ "0 all aggregation True False \n",
+ "1 skew aggregation False False \n",
+ "2 percent_true aggregation True False \n",
+ "3 count aggregation True True \n",
+ "4 num_unique aggregation True True \n",
+ "5 first aggregation False False \n",
+ "6 mode aggregation False False \n",
+ "7 entropy aggregation False False \n",
+ "8 time_since_last aggregation False False \n",
+ "9 any aggregation True False \n",
+ "10 last aggregation False False \n",
+ "11 avg_time_between aggregation False False \n",
+ "12 max aggregation True True \n",
+ "13 median aggregation False False \n",
+ "14 mean aggregation True True \n",
+ "15 num_true aggregation True False \n",
+ "16 min aggregation True True \n",
+ "17 sum aggregation True True \n",
+ "18 trend aggregation False False \n",
+ "19 n_most_common aggregation False False \n",
+ "20 time_since_first aggregation False False \n",
+ "21 std aggregation True True \n",
+ "\n",
+ " description \\\n",
+ "0 Calculates if all values are 'True' in a list. \n",
+ "1 Computes the extent to which a distribution differs from a normal distribution. \n",
+ "2 Determines the percent of `True` values. \n",
+ "3 Determines the total number of values, excluding `NaN`. \n",
+ "4 Determines the number of distinct values, ignoring `NaN` values. \n",
+ "5 Determines the first value in a list. \n",
+ "6 Determines the most commonly repeated value. \n",
+ "7 Calculates the entropy for a categorical variable \n",
+ "8 Calculates the time elapsed since the last datetime (default in seconds). \n",
+ "9 Determines if any value is 'True' in a list. \n",
+ "10 Determines the last value in a list. \n",
+ "11 Computes the average number of seconds between consecutive events. \n",
+ "12 Calculates the highest value, ignoring `NaN` values. \n",
+ "13 Determines the middlemost number in a list of values. \n",
+ "14 Computes the average for a list of values. \n",
+ "15 Counts the number of `True` values. \n",
+ "16 Calculates the smallest value, ignoring `NaN` values. \n",
+ "17 Calculates the total addition, ignoring `NaN`. \n",
+ "18 Calculates the trend of a variable over time. \n",
+ "19 Determines the `n` most common elements. \n",
+ "20 Calculates the time elapsed since the first datetime (in seconds). \n",
+ "21 Computes the dispersion relative to the mean value, ignoring `NaN`. \n",
+ "\n",
+ " valid_inputs return_type \n",
+ "0 Boolean Boolean \n",
+ "1 Numeric Numeric \n",
+ "2 Boolean Numeric \n",
+ "3 Index Numeric \n",
+ "4 Discrete Numeric \n",
+ "5 Variable None \n",
+ "6 Discrete None \n",
+ "7 Categorical Numeric \n",
+ "8 DatetimeTimeIndex Numeric \n",
+ "9 Boolean Boolean \n",
+ "10 Variable None \n",
+ "11 DatetimeTimeIndex Numeric \n",
+ "12 Numeric Numeric \n",
+ "13 Numeric Numeric \n",
+ "14 Numeric Numeric \n",
+ "15 Boolean Numeric \n",
+ "16 Numeric Numeric \n",
+ "17 Numeric Numeric \n",
+ "18 Numeric, DatetimeTimeIndex Numeric \n",
+ "19 Discrete Discrete \n",
+ "20 DatetimeTimeIndex Numeric \n",
+ "21 Numeric Numeric "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "primitives = ft.list_primitives()\n",
+ "pd.options.display.max_colwidth = 100\n",
+ "primitives[primitives['type'] == 'aggregation'].head(primitives[primitives['type'] == 'aggregation'].shape[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "7762885f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ " type | \n",
+ " dask_compatible | \n",
+ " koalas_compatible | \n",
+ " description | \n",
+ " valid_inputs | \n",
+ " return_type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 22 | \n",
+ " url_to_domain | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the domain of a url. | \n",
+ " URL | \n",
+ " Categorical | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " cum_mean | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the cumulative mean. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " minute | \n",
+ " transform | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the minutes value of a datetime. | \n",
+ " Datetime | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " cum_max | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Calculates the cumulative maximum. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " age | \n",
+ " transform | \n",
+ " True | \n",
+ " False | \n",
+ " Calculates the age in years as a floating point number given a | \n",
+ " DateOfBirth | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 79 | \n",
+ " greater_than_scalar | \n",
+ " transform | \n",
+ " True | \n",
+ " True | \n",
+ " Determines if values are greater than a given scalar. | \n",
+ " Numeric, Datetime, Ordinal | \n",
+ " Boolean | \n",
+ "
\n",
+ " \n",
+ " 80 | \n",
+ " url_to_protocol | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Determines the protocol (http or https) of a url. | \n",
+ " URL | \n",
+ " Categorical | \n",
+ "
\n",
+ " \n",
+ " 81 | \n",
+ " month | \n",
+ " transform | \n",
+ " True | \n",
+ " True | \n",
+ " Determines the month value of a datetime. | \n",
+ " Datetime | \n",
+ " Ordinal | \n",
+ "
\n",
+ " \n",
+ " 82 | \n",
+ " divide_numeric_scalar | \n",
+ " transform | \n",
+ " True | \n",
+ " True | \n",
+ " Divide each element in the list by a scalar. | \n",
+ " Numeric | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ " 83 | \n",
+ " time_since_previous | \n",
+ " transform | \n",
+ " False | \n",
+ " False | \n",
+ " Compute the time since the previous entry in a list. | \n",
+ " DatetimeTimeIndex | \n",
+ " Numeric | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
62 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name type dask_compatible koalas_compatible \\\n",
+ "22 url_to_domain transform False False \n",
+ "23 cum_mean transform False False \n",
+ "24 minute transform True True \n",
+ "25 cum_max transform False False \n",
+ "26 age transform True False \n",
+ ".. ... ... ... ... \n",
+ "79 greater_than_scalar transform True True \n",
+ "80 url_to_protocol transform False False \n",
+ "81 month transform True True \n",
+ "82 divide_numeric_scalar transform True True \n",
+ "83 time_since_previous transform False False \n",
+ "\n",
+ " description \\\n",
+ "22 Determines the domain of a url. \n",
+ "23 Calculates the cumulative mean. \n",
+ "24 Determines the minutes value of a datetime. \n",
+ "25 Calculates the cumulative maximum. \n",
+ "26 Calculates the age in years as a floating point number given a \n",
+ ".. ... \n",
+ "79 Determines if values are greater than a given scalar. \n",
+ "80 Determines the protocol (http or https) of a url. \n",
+ "81 Determines the month value of a datetime. \n",
+ "82 Divide each element in the list by a scalar. \n",
+ "83 Compute the time since the previous entry in a list. \n",
+ "\n",
+ " valid_inputs return_type \n",
+ "22 URL Categorical \n",
+ "23 Numeric Numeric \n",
+ "24 Datetime Numeric \n",
+ "25 Numeric Numeric \n",
+ "26 DateOfBirth Numeric \n",
+ ".. ... ... \n",
+ "79 Numeric, Datetime, Ordinal Boolean \n",
+ "80 URL Categorical \n",
+ "81 Datetime Ordinal \n",
+ "82 Numeric Numeric \n",
+ "83 DatetimeTimeIndex Numeric \n",
+ "\n",
+ "[62 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "primitives[primitives['type'] == 'transform'].head(primitives[primitives['type'] == 'transform'].shape[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2a568eb4",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}