diff --git a/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb b/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb index 8428481..488f3e0 100644 --- a/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb +++ b/竞赛优胜技巧/.ipynb_checkpoints/Automated feature engineering-checkpoint.ipynb @@ -13,6 +13,7 @@ "id": "66dfb30d", "metadata": {}, "source": [ + "### 结论:效果一般\n", "搬运参考:https://www.kaggle.com/liananapalkova/automated-feature-engineering-for-titanic-dataset" ] }, @@ -99,13 +100,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 19, "id": "43cc9a46", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import time\n", + "import gc\n", "\n", "import featuretools as ft\n", "from featuretools.primitives import *\n", @@ -115,9 +117,6 @@ "# 导入相关模型,没有的pip install xxx 即可\n", "\n", "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.linear_model import LogisticRegression\n", - "import xgboost as xgb \n", - "import lightgbm as lgb \n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score\n", @@ -127,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "4c17c0bc", "metadata": {}, "outputs": [], @@ -138,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "bcce5a3d", "metadata": {}, "outputs": [ @@ -168,7 +167,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 4, "id": "4afeeca5", "metadata": {}, "outputs": [ @@ -292,7 +291,7 @@ "1 0.0 0.0 0.0 " ] }, - "execution_count": 17, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -306,7 +305,70 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 5, + "id": "af6722f2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexCover_Type
004.0
114.0
\n", + "
" + ], + "text/plain": [ + " index Cover_Type\n", + "0 0 4.0\n", + "1 1 4.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y = pd.DataFrame(y, columns=data.target_names)\n", + "y = y.reset_index()\n", + "y.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "id": "2d34ab5c", "metadata": {}, "outputs": [ @@ -350,7 +412,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 7, "id": "1551c241", "metadata": {}, "outputs": [ @@ -407,10 +469,18 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 8, "id": "06f24545", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Object `es.entity_from_dataframe` not found.\n" + ] + } + ], "source": [ "es.entity_from_dataframe?" ] @@ -433,7 +503,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 9, "id": "f2c69a94", "metadata": {}, "outputs": [ @@ -447,7 +517,7 @@ " No relationships" ] }, - "execution_count": 18, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -479,7 +549,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 10, "id": "770130bc", "metadata": { "scrolled": false @@ -512,7 +582,7 @@ " X.Soil_Type_4 -> Soil_Type_4.Soil_Type_4" ] }, - "execution_count": 19, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -532,7 +602,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 11, "id": "352fa085", "metadata": { "scrolled": true @@ -571,126 +641,116 @@ " \n", " \n", " 0\n", - " all\n", + " sum\n", " aggregation\n", " True\n", - " False\n", - " Calculates if all values are 'True' in a list.\n", - " Boolean\n", - " Boolean\n", + " True\n", + " Calculates the total addition, ignoring `NaN`.\n", + " Numeric\n", + " Numeric\n", " \n", " \n", " 1\n", - " skew\n", + " first\n", " aggregation\n", " False\n", " False\n", - 
" Computes the extent to which a distribution differs from a normal distribution.\n", - " Numeric\n", - " Numeric\n", + " Determines the first value in a list.\n", + " Variable\n", + " None\n", " \n", " \n", " 2\n", - " percent_true\n", + " last\n", " aggregation\n", - " True\n", " False\n", - " Determines the percent of `True` values.\n", - " Boolean\n", - " Numeric\n", + " False\n", + " Determines the last value in a list.\n", + " Variable\n", + " None\n", " \n", " \n", " 3\n", - " count\n", + " trend\n", " aggregation\n", - " True\n", - " True\n", - " Determines the total number of values, excluding `NaN`.\n", - " Index\n", + " False\n", + " False\n", + " Calculates the trend of a variable over time.\n", + " DatetimeTimeIndex, Numeric\n", " Numeric\n", " \n", " \n", " 4\n", - " num_unique\n", + " n_most_common\n", " aggregation\n", - " True\n", - " True\n", - " Determines the number of distinct values, ignoring `NaN` values.\n", + " False\n", + " False\n", + " Determines the `n` most common elements.\n", + " Discrete\n", " Discrete\n", - " Numeric\n", " \n", " \n", " 5\n", - " first\n", + " time_since_last\n", " aggregation\n", " False\n", " False\n", - " Determines the first value in a list.\n", - " Variable\n", - " None\n", + " Calculates the time elapsed since the last datetime (default in seconds).\n", + " DatetimeTimeIndex\n", + " Numeric\n", " \n", " \n", " 6\n", - " mode\n", + " std\n", " aggregation\n", - " False\n", - " False\n", - " Determines the most commonly repeated value.\n", - " Discrete\n", - " None\n", + " True\n", + " True\n", + " Computes the dispersion relative to the mean value, ignoring `NaN`.\n", + " Numeric\n", + " Numeric\n", " \n", " \n", " 7\n", - " entropy\n", + " median\n", " aggregation\n", " False\n", " False\n", - " Calculates the entropy for a categorical variable\n", - " Categorical\n", + " Determines the middlemost number in a list of values.\n", + " Numeric\n", " Numeric\n", " \n", " \n", " 8\n", - " time_since_last\n", + " 
count\n", " aggregation\n", - " False\n", - " False\n", - " Calculates the time elapsed since the last datetime (default in seconds).\n", - " DatetimeTimeIndex\n", + " True\n", + " True\n", + " Determines the total number of values, excluding `NaN`.\n", + " Index\n", " Numeric\n", " \n", " \n", " 9\n", - " any\n", + " percent_true\n", " aggregation\n", " True\n", " False\n", - " Determines if any value is 'True' in a list.\n", - " Boolean\n", + " Determines the percent of `True` values.\n", " Boolean\n", + " Numeric\n", " \n", " \n", " 10\n", - " last\n", - " aggregation\n", - " False\n", - " False\n", - " Determines the last value in a list.\n", - " Variable\n", - " None\n", - " \n", - " \n", - " 11\n", - " avg_time_between\n", + " time_since_first\n", " aggregation\n", " False\n", " False\n", - " Computes the average number of seconds between consecutive events.\n", + " Calculates the time elapsed since the first datetime (in seconds).\n", " DatetimeTimeIndex\n", " Numeric\n", " \n", " \n", - " 12\n", + " 11\n", " max\n", " aggregation\n", " True\n", @@ -700,93 +760,103 @@ " Numeric\n", " \n", " \n", + " 12\n", + " any\n", + " aggregation\n", + " True\n", + " False\n", + " Determines if any value is 'True' in a list.\n", + " Boolean\n", + " Boolean\n", + " \n", + " \n", " 13\n", - " median\n", + " mode\n", " aggregation\n", " False\n", " False\n", - " Determines the middlemost number in a list of values.\n", - " Numeric\n", - " Numeric\n", + " Determines the most commonly repeated value.\n", + " Discrete\n", + " None\n", " \n", " \n", " 14\n", - " mean\n", + " entropy\n", + " aggregation\n", + " False\n", + " False\n", + " Calculates the entropy for a categorical variable\n", + " Categorical\n", + " Numeric\n", + " \n", + " \n", + " 15\n", + " min\n", " aggregation\n", " True\n", " True\n", - " Computes the average for a list of values.\n", + " Calculates the smallest value, ignoring `NaN` values.\n", " Numeric\n", " Numeric\n", " \n", " \n", - " 15\n", - " 
num_true\n", + " 16\n", + " all\n", " aggregation\n", " True\n", " False\n", - " Counts the number of `True` values.\n", + " Calculates if all values are 'True' in a list.\n", + " Boolean\n", " Boolean\n", - " Numeric\n", " \n", " \n", - " 16\n", - " min\n", + " 17\n", + " skew\n", " aggregation\n", - " True\n", - " True\n", - " Calculates the smallest value, ignoring `NaN` values.\n", + " False\n", + " False\n", + " Computes the extent to which a distribution differs from a normal distribution.\n", " Numeric\n", " Numeric\n", " \n", " \n", - " 17\n", - " sum\n", + " 18\n", + " mean\n", " aggregation\n", " True\n", " True\n", - " Calculates the total addition, ignoring `NaN`.\n", + " Computes the average for a list of values.\n", " Numeric\n", " Numeric\n", " \n", " \n", - " 18\n", - " trend\n", + " 19\n", + " avg_time_between\n", " aggregation\n", " False\n", " False\n", - " Calculates the trend of a variable over time.\n", - " Numeric, DatetimeTimeIndex\n", + " Computes the average number of seconds between consecutive events.\n", + " DatetimeTimeIndex\n", " Numeric\n", " \n", " \n", - " 19\n", - " n_most_common\n", - " aggregation\n", - " False\n", - " False\n", - " Determines the `n` most common elements.\n", - " Discrete\n", - " Discrete\n", - " \n", - " \n", " 20\n", - " time_since_first\n", + " num_unique\n", " aggregation\n", - " False\n", - " False\n", - " Calculates the time elapsed since the first datetime (in seconds).\n", - " DatetimeTimeIndex\n", + " True\n", + " True\n", + " Determines the number of distinct values, ignoring `NaN` values.\n", + " Discrete\n", " Numeric\n", " \n", " \n", " 21\n", - " std\n", + " num_true\n", " aggregation\n", " True\n", - " True\n", - " Computes the dispersion relative to the mean value, ignoring `NaN`.\n", - " Numeric\n", + " False\n", + " Counts the number of `True` values.\n", + " Boolean\n", " Numeric\n", " \n", " \n", @@ -795,79 +865,79 @@ ], "text/plain": [ " name type dask_compatible koalas_compatible \\\n", - 
"0 all aggregation True False \n", - "1 skew aggregation False False \n", - "2 percent_true aggregation True False \n", - "3 count aggregation True True \n", - "4 num_unique aggregation True True \n", - "5 first aggregation False False \n", - "6 mode aggregation False False \n", - "7 entropy aggregation False False \n", - "8 time_since_last aggregation False False \n", - "9 any aggregation True False \n", - "10 last aggregation False False \n", - "11 avg_time_between aggregation False False \n", - "12 max aggregation True True \n", - "13 median aggregation False False \n", - "14 mean aggregation True True \n", - "15 num_true aggregation True False \n", - "16 min aggregation True True \n", - "17 sum aggregation True True \n", - "18 trend aggregation False False \n", - "19 n_most_common aggregation False False \n", - "20 time_since_first aggregation False False \n", - "21 std aggregation True True \n", + "0 sum aggregation True True \n", + "1 first aggregation False False \n", + "2 last aggregation False False \n", + "3 trend aggregation False False \n", + "4 n_most_common aggregation False False \n", + "5 time_since_last aggregation False False \n", + "6 std aggregation True True \n", + "7 median aggregation False False \n", + "8 count aggregation True True \n", + "9 percent_true aggregation True False \n", + "10 time_since_first aggregation False False \n", + "11 max aggregation True True \n", + "12 any aggregation True False \n", + "13 mode aggregation False False \n", + "14 entropy aggregation False False \n", + "15 min aggregation True True \n", + "16 all aggregation True False \n", + "17 skew aggregation False False \n", + "18 mean aggregation True True \n", + "19 avg_time_between aggregation False False \n", + "20 num_unique aggregation True True \n", + "21 num_true aggregation True False \n", "\n", " description \\\n", - "0 Calculates if all values are 'True' in a list. \n", - "1 Computes the extent to which a distribution differs from a normal distribution. 
\n", - "2 Determines the percent of `True` values. \n", - "3 Determines the total number of values, excluding `NaN`. \n", - "4 Determines the number of distinct values, ignoring `NaN` values. \n", - "5 Determines the first value in a list. \n", - "6 Determines the most commonly repeated value. \n", - "7 Calculates the entropy for a categorical variable \n", - "8 Calculates the time elapsed since the last datetime (default in seconds). \n", - "9 Determines if any value is 'True' in a list. \n", - "10 Determines the last value in a list. \n", - "11 Computes the average number of seconds between consecutive events. \n", - "12 Calculates the highest value, ignoring `NaN` values. \n", - "13 Determines the middlemost number in a list of values. \n", - "14 Computes the average for a list of values. \n", - "15 Counts the number of `True` values. \n", - "16 Calculates the smallest value, ignoring `NaN` values. \n", - "17 Calculates the total addition, ignoring `NaN`. \n", - "18 Calculates the trend of a variable over time. \n", - "19 Determines the `n` most common elements. \n", - "20 Calculates the time elapsed since the first datetime (in seconds). \n", - "21 Computes the dispersion relative to the mean value, ignoring `NaN`. \n", + "0 Calculates the total addition, ignoring `NaN`. \n", + "1 Determines the first value in a list. \n", + "2 Determines the last value in a list. \n", + "3 Calculates the trend of a variable over time. \n", + "4 Determines the `n` most common elements. \n", + "5 Calculates the time elapsed since the last datetime (default in seconds). \n", + "6 Computes the dispersion relative to the mean value, ignoring `NaN`. \n", + "7 Determines the middlemost number in a list of values. \n", + "8 Determines the total number of values, excluding `NaN`. \n", + "9 Determines the percent of `True` values. \n", + "10 Calculates the time elapsed since the first datetime (in seconds). \n", + "11 Calculates the highest value, ignoring `NaN` values. 
\n", + "12 Determines if any value is 'True' in a list. \n", + "13 Determines the most commonly repeated value. \n", + "14 Calculates the entropy for a categorical variable \n", + "15 Calculates the smallest value, ignoring `NaN` values. \n", + "16 Calculates if all values are 'True' in a list. \n", + "17 Computes the extent to which a distribution differs from a normal distribution. \n", + "18 Computes the average for a list of values. \n", + "19 Computes the average number of seconds between consecutive events. \n", + "20 Determines the number of distinct values, ignoring `NaN` values. \n", + "21 Counts the number of `True` values. \n", "\n", " valid_inputs return_type \n", - "0 Boolean Boolean \n", - "1 Numeric Numeric \n", - "2 Boolean Numeric \n", - "3 Index Numeric \n", - "4 Discrete Numeric \n", - "5 Variable None \n", - "6 Discrete None \n", - "7 Categorical Numeric \n", - "8 DatetimeTimeIndex Numeric \n", - "9 Boolean Boolean \n", - "10 Variable None \n", - "11 DatetimeTimeIndex Numeric \n", - "12 Numeric Numeric \n", - "13 Numeric Numeric \n", - "14 Numeric Numeric \n", - "15 Boolean Numeric \n", - "16 Numeric Numeric \n", + "0 Numeric Numeric \n", + "1 Variable None \n", + "2 Variable None \n", + "3 DatetimeTimeIndex, Numeric Numeric \n", + "4 Discrete Discrete \n", + "5 DatetimeTimeIndex Numeric \n", + "6 Numeric Numeric \n", + "7 Numeric Numeric \n", + "8 Index Numeric \n", + "9 Boolean Numeric \n", + "10 DatetimeTimeIndex Numeric \n", + "11 Numeric Numeric \n", + "12 Boolean Boolean \n", + "13 Discrete None \n", + "14 Categorical Numeric \n", + "15 Numeric Numeric \n", + "16 Boolean Boolean \n", "17 Numeric Numeric \n", - "18 Numeric, DatetimeTimeIndex Numeric \n", - "19 Discrete Discrete \n", - "20 DatetimeTimeIndex Numeric \n", - "21 Numeric Numeric " + "18 Numeric Numeric \n", + "19 DatetimeTimeIndex Numeric \n", + "20 Discrete Numeric \n", + "21 Boolean Numeric " ] }, - "execution_count": 20, + "execution_count": 11, "metadata": {}, "output_type": 
"execute_result" } @@ -880,7 +950,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 12, "id": "7762885f", "metadata": {}, "outputs": [ @@ -917,52 +987,52 @@ " \n", " \n", " 22\n", - " url_to_domain\n", + " greater_than\n", " transform\n", + " True\n", " False\n", - " False\n", - " Determines the domain of a url.\n", - " URL\n", - " Categorical\n", + " Determines if values in one list are greater than another list.\n", + " Ordinal, Datetime, Numeric\n", + " Boolean\n", " \n", " \n", " 23\n", - " cum_mean\n", + " less_than\n", " transform\n", - " False\n", - " False\n", - " Calculates the cumulative mean.\n", - " Numeric\n", - " Numeric\n", + " True\n", + " True\n", + " Determines if values in one list are less than another list.\n", + " Ordinal, Datetime, Numeric\n", + " Boolean\n", " \n", " \n", " 24\n", - " minute\n", + " and\n", " transform\n", " True\n", " True\n", - " Determines the minutes value of a datetime.\n", - " Datetime\n", - " Numeric\n", + " Element-wise logical AND of two lists.\n", + " Boolean\n", + " Boolean\n", " \n", " \n", " 25\n", - " cum_max\n", + " less_than_scalar\n", " transform\n", - " False\n", - " False\n", - " Calculates the cumulative maximum.\n", - " Numeric\n", - " Numeric\n", + " True\n", + " True\n", + " Determines if values are less than a given scalar.\n", + " Ordinal, Datetime, Numeric\n", + " Boolean\n", " \n", " \n", " 26\n", - " age\n", + " modulo_numeric\n", " transform\n", " True\n", - " False\n", - " Calculates the age in years as a floating point number given a\n", - " DateOfBirth\n", + " True\n", + " Element-wise modulo of two lists.\n", + " Numeric\n", " Numeric\n", " \n", " \n", @@ -977,52 +1047,52 @@ " \n", " \n", " 79\n", - " greater_than_scalar\n", + " is_weekend\n", " transform\n", " True\n", " True\n", - " Determines if values are greater than a given scalar.\n", - " Numeric, Datetime, Ordinal\n", + " Determines if a date falls on a weekend.\n", + " Datetime\n", " Boolean\n", " \n", " 
\n", " 80\n", - " url_to_protocol\n", + " num_characters\n", " transform\n", - " False\n", - " False\n", - " Determines the protocol (http or https) of a url.\n", - " URL\n", - " Categorical\n", + " True\n", + " True\n", + " Calculates the number of characters in a string.\n", + " NaturalLanguage\n", + " Numeric\n", " \n", " \n", " 81\n", - " month\n", + " latitude\n", " transform\n", - " True\n", - " True\n", - " Determines the month value of a datetime.\n", - " Datetime\n", - " Ordinal\n", + " False\n", + " False\n", + " Returns the first tuple value in a list of LatLong tuples.\n", + " LatLong\n", + " Numeric\n", " \n", " \n", " 82\n", - " divide_numeric_scalar\n", + " cum_sum\n", " transform\n", - " True\n", - " True\n", - " Divide each element in the list by a scalar.\n", + " False\n", + " False\n", + " Calculates the cumulative sum.\n", " Numeric\n", " Numeric\n", " \n", " \n", " 83\n", - " time_since_previous\n", + " subtract_numeric_scalar\n", " transform\n", - " False\n", - " False\n", - " Compute the time since the previous entry in a list.\n", - " DatetimeTimeIndex\n", + " True\n", + " True\n", + " Subtract a scalar from each element in the list.\n", + " Numeric\n", " Numeric\n", " \n", " \n", @@ -1031,49 +1101,49 @@ "" ], "text/plain": [ - " name type dask_compatible koalas_compatible \\\n", - "22 url_to_domain transform False False \n", - "23 cum_mean transform False False \n", - "24 minute transform True True \n", - "25 cum_max transform False False \n", - "26 age transform True False \n", - ".. ... ... ... ... 
\n", - "79 greater_than_scalar transform True True \n", - "80 url_to_protocol transform False False \n", - "81 month transform True True \n", - "82 divide_numeric_scalar transform True True \n", - "83 time_since_previous transform False False \n", + " name type dask_compatible koalas_compatible \\\n", + "22 greater_than transform True False \n", + "23 less_than transform True True \n", + "24 and transform True True \n", + "25 less_than_scalar transform True True \n", + "26 modulo_numeric transform True True \n", + ".. ... ... ... ... \n", + "79 is_weekend transform True True \n", + "80 num_characters transform True True \n", + "81 latitude transform False False \n", + "82 cum_sum transform False False \n", + "83 subtract_numeric_scalar transform True True \n", "\n", - " description \\\n", - "22 Determines the domain of a url. \n", - "23 Calculates the cumulative mean. \n", - "24 Determines the minutes value of a datetime. \n", - "25 Calculates the cumulative maximum. \n", - "26 Calculates the age in years as a floating point number given a \n", - ".. ... \n", - "79 Determines if values are greater than a given scalar. \n", - "80 Determines the protocol (http or https) of a url. \n", - "81 Determines the month value of a datetime. \n", - "82 Divide each element in the list by a scalar. \n", - "83 Compute the time since the previous entry in a list. \n", + " description \\\n", + "22 Determines if values in one list are greater than another list. \n", + "23 Determines if values in one list are less than another list. \n", + "24 Element-wise logical AND of two lists. \n", + "25 Determines if values are less than a given scalar. \n", + "26 Element-wise modulo of two lists. \n", + ".. ... \n", + "79 Determines if a date falls on a weekend. \n", + "80 Calculates the number of characters in a string. \n", + "81 Returns the first tuple value in a list of LatLong tuples. \n", + "82 Calculates the cumulative sum. \n", + "83 Subtract a scalar from each element in the list. 
\n", "\n", - " valid_inputs return_type \n", - "22 URL Categorical \n", - "23 Numeric Numeric \n", - "24 Datetime Numeric \n", - "25 Numeric Numeric \n", - "26 DateOfBirth Numeric \n", - ".. ... ... \n", - "79 Numeric, Datetime, Ordinal Boolean \n", - "80 URL Categorical \n", - "81 Datetime Ordinal \n", - "82 Numeric Numeric \n", - "83 DatetimeTimeIndex Numeric \n", + " valid_inputs return_type \n", + "22 Ordinal, Datetime, Numeric Boolean \n", + "23 Ordinal, Datetime, Numeric Boolean \n", + "24 Boolean Boolean \n", + "25 Ordinal, Datetime, Numeric Boolean \n", + "26 Numeric Numeric \n", + ".. ... ... \n", + "79 Datetime Boolean \n", + "80 NaturalLanguage Numeric \n", + "81 LatLong Numeric \n", + "82 Numeric Numeric \n", + "83 Numeric Numeric \n", "\n", "[62 rows x 7 columns]" ] }, - "execution_count": 21, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1092,11 +1162,20 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 14, "id": "6d3df2f7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 1min 3s\n" + ] + } + ], "source": [ + "%%time\n", "features, feature_names = ft.dfs(entityset = es, \n", " target_entity = 'X', \n", " max_depth = 2)" @@ -1112,7 +1191,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 15, "id": "9a44a98a", "metadata": {}, "outputs": [ @@ -1653,7 +1732,7 @@ " ]" ] }, - "execution_count": 27, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1664,7 +1743,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 16, "id": "d5036e65", "metadata": {}, "outputs": [ @@ -1703,31 +1782,31 @@ " \n", " \n", " 0\n", - " 3000.267286\n", + " 3000.267334\n", " 2596.0\n", " 1.0\n", " \n", " \n", " 561\n", - " 3000.267286\n", + " 3000.267334\n", " 2596.0\n", " 1.0\n", " \n", " \n", " 2062\n", - " 2926.053180\n", + " 2926.053223\n", " 2596.0\n", " 0.0\n", " \n", 
" \n", " 6946\n", - " 2926.053180\n", + " 2926.053223\n", " 2596.0\n", " 0.0\n", " \n", " \n", " 6976\n", - " 2926.053180\n", + " 2926.053223\n", " 2596.0\n", " 0.0\n", " \n", @@ -1738,14 +1817,14 @@ "text/plain": [ " Wilderness_Area_0.MEAN(X.Elevation) Elevation Wilderness_Area_0\n", "index \n", - "0 3000.267286 2596.0 1.0\n", - "561 3000.267286 2596.0 1.0\n", - "2062 2926.053180 2596.0 0.0\n", - "6946 2926.053180 2596.0 0.0\n", - "6976 2926.053180 2596.0 0.0" + "0 3000.267334 2596.0 1.0\n", + "561 3000.267334 2596.0 1.0\n", + "2062 2926.053223 2596.0 0.0\n", + "6946 2926.053223 2596.0 0.0\n", + "6976 2926.053223 2596.0 0.0" ] }, - "execution_count": 35, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1756,7 +1835,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 17, "id": "ec8b7ccd", "metadata": {}, "outputs": [ @@ -1766,7 +1845,7 @@ "(581012, 532)" ] }, - "execution_count": 36, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1798,7 +1877,7 @@ "id": "75b7cc64", "metadata": {}, "source": [ - "为了解决“维数灾难”,有必要应用特征简化和选择,这意味着从数据中去除低值特征。但请记住,特征选择可能会影响ML模型的性能。棘手的是,ML模型的设计包含一个艺术元素。这绝对不是一个具有严格规则的确定性过程,要想取得成功就必须遵循这些规则。为了得到一个精确的模型,有必要应用、组合和比较几十种方法。在本notebook中,我不会解释所有可能的方法来处理“维度灾难”。我将集中讨论以下方法:\n", + "为了解决“维数灾难”,有必要应用特征约简和选择,这意味着从数据中去除低值特征。但请记住,特征选择可能会影响ML模型的性能。棘手的是,ML模型的设计包含一个艺术元素。这绝对不是一个具有严格规则的确定性过程,要想取得成功就必须遵循这些规则。为了得到一个精确的模型,有必要应用、组合和比较几十种方法。在本notebook中,我不会解释所有可能的方法来处理“维度灾难”。我将集中讨论以下方法:\n", "\n", "* 确定共线特征\n", "\n", @@ -4248,69 +4327,6 @@ "注意,正常情况下我们是不知道测试集的标签,所以这里先做分割,切分训练和预测集合" ] }, - { - "cell_type": "code", - "execution_count": 43, - "id": "1e65ffe8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexCover_Type
004.0
114.0
\n", - "
" - ], - "text/plain": [ - " index Cover_Type\n", - "0 0 4.0\n", - "1 1 4.0" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y = pd.DataFrame(y, columns=data.target_names)\n", - "y = y.reset_index()\n", - "y.head(2)" - ] - }, { "cell_type": "code", "execution_count": 46, @@ -4968,11 +4984,12 @@ } ], "source": [ - "import gc\n", + "\"\"\"\n", "del features_filtered\n", "del features_positive\n", "del fetch_covtype\n", "del df, X,y, X_selected_df,train,test,train_df,test_df,train_X,train_y\n", + "\"\"\"\n", "gc.collect()" ] }, @@ -4986,7 +5003,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 8, "id": "b7241552", "metadata": {}, "outputs": [ @@ -5110,7 +5127,7 @@ "246788 0.0 0.0 0.0 0.0 " ] }, - "execution_count": 65, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -5128,24 +5145,16 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 9, "id": "db3d3b92", "metadata": {}, "outputs": [ { - "ename": "MemoryError", - "evalue": "Unable to allocate 8.55 MiB for an array with shape (160080, 1, 7) and data type float64", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mMemoryError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n", - "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\_forest.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 397\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 398\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moob_score\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 399\u001b[1;33m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_set_oob_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 400\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 401\u001b[0m \u001b[1;31m# Decapsulate classes_ attributes\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\_forest.py\u001b[0m in \u001b[0;36m_set_oob_score\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 528\u001b[0m unsampled_indices = _generate_unsampled_indices(\n\u001b[0;32m 529\u001b[0m estimator.random_state, n_samples, n_samples_bootstrap)\n\u001b[1;32m--> 530\u001b[1;33m p_estimator = estimator.predict_proba(X[unsampled_indices, :],\n\u001b[0m\u001b[0;32m 531\u001b[0m check_input=False)\n\u001b[0;32m 532\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\tree\\_classes.py\u001b[0m in \u001b[0;36mpredict_proba\u001b[1;34m(self, X, check_input)\u001b[0m\n\u001b[0;32m 929\u001b[0m \u001b[0mcheck_is_fitted\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 930\u001b[0m \u001b[0mX\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_X_predict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcheck_input\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 931\u001b[1;33m \u001b[0mproba\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtree_\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 
932\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 933\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mn_outputs_\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32msklearn\\tree\\_tree.pyx\u001b[0m in \u001b[0;36msklearn.tree._tree.Tree.predict\u001b[1;34m()\u001b[0m\n", - "\u001b[1;32msklearn\\tree\\_tree.pyx\u001b[0m in \u001b[0;36msklearn.tree._tree.Tree.predict\u001b[1;34m()\u001b[0m\n", - "\u001b[1;31mMemoryError\u001b[0m: Unable to allocate 8.55 MiB for an array with shape (160080, 1, 7) and data type float64" + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9673328605949619\n", + "Wall time: 14min 30s\n" ] } ], @@ -5157,13 +5166,429 @@ "print(accuracy_score(pred_org_test_y,org_test_y)) # RF" ] }, + { + "cell_type": "markdown", + "id": "50b5f988", + "metadata": {}, + "source": [ + "### 5.2 使用未约简与选择的特征的分数" + ] + }, { "cell_type": "code", - "execution_count": null, - "id": "52e36341", + "execution_count": 18, + "id": "0dc54e8c", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexElevationHorizontal_Distance_To_HydrologyVertical_Distance_To_HydrologyHorizontal_Distance_To_RoadwaysHorizontal_Distance_To_Fire_PointsAspectSlopeHillshade_9amHillshade_Noon...Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points)Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology)Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways)Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology)Soil_Type_4.SUM(X.Elevation)Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points)Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology)Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways)Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology)Cover_Type
002596.0258.00.0510.06279.051.03.0221.0232.0...1324.050751212.6899251558.36195658.2799891.715981e+091.149499e+09156171328.01.364632e+0926848308.04.0
112590.0212.0-6.0390.06225.056.02.0220.0235.0...1324.050751212.6899251558.36195658.2799891.715981e+091.149499e+09156171328.01.364632e+0926848308.04.0
\n", + "

2 rows × 534 columns

\n", + "
" + ], + "text/plain": [ + " index Elevation Horizontal_Distance_To_Hydrology \\\n", + "0 0 2596.0 258.0 \n", + "1 1 2590.0 212.0 \n", + "\n", + " Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways \\\n", + "0 0.0 510.0 \n", + "1 -6.0 390.0 \n", + "\n", + " Horizontal_Distance_To_Fire_Points Aspect Slope Hillshade_9am \\\n", + "0 6279.0 51.0 3.0 221.0 \n", + "1 6225.0 56.0 2.0 220.0 \n", + "\n", + " Hillshade_Noon ... Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) \\\n", + "0 232.0 ... 1324.050751 \n", + "1 235.0 ... 1324.050751 \n", + "\n", + " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) \\\n", + "0 212.689925 \n", + "1 212.689925 \n", + "\n", + " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) \\\n", + "0 1558.361956 \n", + "1 1558.361956 \n", + "\n", + " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) \\\n", + "0 58.279989 \n", + "1 58.279989 \n", + "\n", + " Soil_Type_4.SUM(X.Elevation) \\\n", + "0 1.715981e+09 \n", + "1 1.715981e+09 \n", + "\n", + " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) \\\n", + "0 1.149499e+09 \n", + "1 1.149499e+09 \n", + "\n", + " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) \\\n", + "0 156171328.0 \n", + "1 156171328.0 \n", + "\n", + " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) \\\n", + "0 1.364632e+09 \n", + "1 1.364632e+09 \n", + "\n", + " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) Cover_Type \n", + "0 26848308.0 4.0 \n", + "1 26848308.0 4.0 \n", + "\n", + "[2 rows x 534 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.merge(features, y, on=['index'])\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "637b3a7e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3256" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "del features, X\n", + "gc.collect()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 22, + "id": "4ac537b8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexElevationHorizontal_Distance_To_HydrologyVertical_Distance_To_HydrologyHorizontal_Distance_To_RoadwaysHorizontal_Distance_To_Fire_PointsAspectSlopeHillshade_9amHillshade_Noon...Soil_Type_4.STD(X.Elevation)Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points)Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology)Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways)Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology)Soil_Type_4.SUM(X.Elevation)Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points)Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology)Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways)Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology)
2507282507283351.0726.0124.03813.02271.0206.027.0192.0252.0...277.0455171324.050751212.6899251558.36195658.2799891.715981e+091.149499e+09156171328.01.364632e+0926848308.0
2467882467882732.0212.01.01082.0912.0129.07.0231.0236.0...277.0455171324.050751212.6899251558.36195658.2799891.715981e+091.149499e+09156171328.01.364632e+0926848308.0
\n", + "

2 rows × 533 columns

\n", + "
" + ], + "text/plain": [ + " index Elevation Horizontal_Distance_To_Hydrology \\\n", + "250728 250728 3351.0 726.0 \n", + "246788 246788 2732.0 212.0 \n", + "\n", + " Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways \\\n", + "250728 124.0 3813.0 \n", + "246788 1.0 1082.0 \n", + "\n", + " Horizontal_Distance_To_Fire_Points Aspect Slope Hillshade_9am \\\n", + "250728 2271.0 206.0 27.0 192.0 \n", + "246788 912.0 129.0 7.0 231.0 \n", + "\n", + " Hillshade_Noon ... Soil_Type_4.STD(X.Elevation) \\\n", + "250728 252.0 ... 277.045517 \n", + "246788 236.0 ... 277.045517 \n", + "\n", + " Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) \\\n", + "250728 1324.050751 \n", + "246788 1324.050751 \n", + "\n", + " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) \\\n", + "250728 212.689925 \n", + "246788 212.689925 \n", + "\n", + " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) \\\n", + "250728 1558.361956 \n", + "246788 1558.361956 \n", + "\n", + " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) \\\n", + "250728 58.279989 \n", + "246788 58.279989 \n", + "\n", + " Soil_Type_4.SUM(X.Elevation) \\\n", + "250728 1.715981e+09 \n", + "246788 1.715981e+09 \n", + "\n", + " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) \\\n", + "250728 1.149499e+09 \n", + "246788 1.149499e+09 \n", + "\n", + " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) \\\n", + "250728 156171328.0 \n", + "246788 156171328.0 \n", + "\n", + " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) \\\n", + "250728 1.364632e+09 \n", + "246788 1.364632e+09 \n", + "\n", + " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) \n", + "250728 26848308.0 \n", + "246788 26848308.0 \n", + "\n", + "[2 rows x 533 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df, test_df = train_test_split(df,random_state=42)\n", + "train_X = train_df.drop('Cover_Type',1)\n", + "train_y = train_df['Cover_Type']\n", + "\n", + "test_X = 
test_df.drop('Cover_Type',1)\n", + "test_y = test_df['Cover_Type']\n", + "test_X.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "24c7b22f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "45" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "del df, train_df, test_df\n", + "gc.collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "869777ba", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9442352309418738\n", + "Wall time: 30min 31s\n" + ] + } + ], + "source": [ + "%%time\n", + "random_forest = RandomForestClassifier(n_estimators=500,oob_score=True)\n", + "random_forest.fit(train_X, train_y)\n", + "pred_y = random_forest.predict(test_X)\n", + "print(accuracy_score(pred_y,test_y)) # RF" + ] + }, + { + "cell_type": "markdown", + "id": "3739a43c", + "metadata": {}, + "source": [ + "从结果来看,在这个数据集上,不管是增加的特征,还是增加后过滤的特征,效果都比原始特征差。我也咨询了一些朋友他们试了效果都一般,但是kaggle上很多人点赞,如果你们在哪个数据集上试了效果上涨,请联系我。" + ] } ], "metadata": { diff --git a/竞赛优胜技巧/Automated feature engineering.ipynb b/竞赛优胜技巧/Automated feature engineering.ipynb index 839f48c..488f3e0 100644 --- a/竞赛优胜技巧/Automated feature engineering.ipynb +++ b/竞赛优胜技巧/Automated feature engineering.ipynb @@ -13,6 +13,7 @@ "id": "66dfb30d", "metadata": {}, "source": [ + "### 结论:效果一般\n", "搬运参考:https://www.kaggle.com/liananapalkova/automated-feature-engineering-for-titanic-dataset" ] }, @@ -99,13 +100,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 19, "id": "43cc9a46", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import time\n", + "import gc\n", "\n", "import featuretools as ft\n", "from featuretools.primitives import *\n", @@ -304,6 +306,69 @@ { "cell_type": "code", "execution_count": 5, + "id": "af6722f2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexCover_Type
004.0
114.0
\n", + "
" + ], + "text/plain": [ + " index Cover_Type\n", + "0 0 4.0\n", + "1 1 4.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y = pd.DataFrame(y, columns=data.target_names)\n", + "y = y.reset_index()\n", + "y.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "id": "2d34ab5c", "metadata": {}, "outputs": [ @@ -347,7 +412,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "1551c241", "metadata": {}, "outputs": [ @@ -404,10 +469,18 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 8, "id": "06f24545", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Object `es.entity_from_dataframe` not found.\n" + ] + } + ], "source": [ "es.entity_from_dataframe?" ] @@ -430,7 +503,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 9, "id": "f2c69a94", "metadata": {}, "outputs": [ @@ -444,7 +517,7 @@ " No relationships" ] }, - "execution_count": 18, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -476,7 +549,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 10, "id": "770130bc", "metadata": { "scrolled": false @@ -509,7 +582,7 @@ " X.Soil_Type_4 -> Soil_Type_4.Soil_Type_4" ] }, - "execution_count": 19, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -529,7 +602,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 11, "id": "352fa085", "metadata": { "scrolled": true @@ -568,126 +641,116 @@ " \n", " \n", " 0\n", - " all\n", + " sum\n", " aggregation\n", " True\n", - " False\n", - " Calculates if all values are 'True' in a list.\n", - " Boolean\n", - " Boolean\n", + " True\n", + " Calculates the total addition, ignoring `NaN`.\n", + " Numeric\n", + " Numeric\n", " \n", " \n", " 1\n", - " skew\n", + " first\n", " aggregation\n", " False\n", " False\n", - 
" Computes the extent to which a distribution differs from a normal distribution.\n", - " Numeric\n", - " Numeric\n", + " Determines the first value in a list.\n", + " Variable\n", + " None\n", " \n", " \n", " 2\n", - " percent_true\n", + " last\n", " aggregation\n", - " True\n", " False\n", - " Determines the percent of `True` values.\n", - " Boolean\n", - " Numeric\n", + " False\n", + " Determines the last value in a list.\n", + " Variable\n", + " None\n", " \n", " \n", " 3\n", - " count\n", + " trend\n", " aggregation\n", - " True\n", - " True\n", - " Determines the total number of values, excluding `NaN`.\n", - " Index\n", + " False\n", + " False\n", + " Calculates the trend of a variable over time.\n", + " DatetimeTimeIndex, Numeric\n", " Numeric\n", " \n", " \n", " 4\n", - " num_unique\n", + " n_most_common\n", " aggregation\n", - " True\n", - " True\n", - " Determines the number of distinct values, ignoring `NaN` values.\n", + " False\n", + " False\n", + " Determines the `n` most common elements.\n", + " Discrete\n", " Discrete\n", - " Numeric\n", " \n", " \n", " 5\n", - " first\n", + " time_since_last\n", " aggregation\n", " False\n", " False\n", - " Determines the first value in a list.\n", - " Variable\n", - " None\n", + " Calculates the time elapsed since the last datetime (default in seconds).\n", + " DatetimeTimeIndex\n", + " Numeric\n", " \n", " \n", " 6\n", - " mode\n", + " std\n", " aggregation\n", - " False\n", - " False\n", - " Determines the most commonly repeated value.\n", - " Discrete\n", - " None\n", + " True\n", + " True\n", + " Computes the dispersion relative to the mean value, ignoring `NaN`.\n", + " Numeric\n", + " Numeric\n", " \n", " \n", " 7\n", - " entropy\n", + " median\n", " aggregation\n", " False\n", " False\n", - " Calculates the entropy for a categorical variable\n", - " Categorical\n", + " Determines the middlemost number in a list of values.\n", + " Numeric\n", " Numeric\n", " \n", " \n", " 8\n", - " time_since_last\n", + " 
count\n", " aggregation\n", - " False\n", - " False\n", - " Calculates the time elapsed since the last datetime (default in seconds).\n", - " DatetimeTimeIndex\n", + " True\n", + " True\n", + " Determines the total number of values, excluding `NaN`.\n", + " Index\n", " Numeric\n", " \n", " \n", " 9\n", - " any\n", + " percent_true\n", " aggregation\n", " True\n", " False\n", - " Determines if any value is 'True' in a list.\n", - " Boolean\n", + " Determines the percent of `True` values.\n", " Boolean\n", + " Numeric\n", " \n", " \n", " 10\n", - " last\n", - " aggregation\n", - " False\n", - " False\n", - " Determines the last value in a list.\n", - " Variable\n", - " None\n", - " \n", - " \n", - " 11\n", - " avg_time_between\n", + " time_since_first\n", " aggregation\n", " False\n", " False\n", - " Computes the average number of seconds between consecutive events.\n", + " Calculates the time elapsed since the first datetime (in seconds).\n", " DatetimeTimeIndex\n", " Numeric\n", " \n", " \n", - " 12\n", + " 11\n", " max\n", " aggregation\n", " True\n", @@ -697,93 +760,103 @@ " Numeric\n", " \n", " \n", + " 12\n", + " any\n", + " aggregation\n", + " True\n", + " False\n", + " Determines if any value is 'True' in a list.\n", + " Boolean\n", + " Boolean\n", + " \n", + " \n", " 13\n", - " median\n", + " mode\n", " aggregation\n", " False\n", " False\n", - " Determines the middlemost number in a list of values.\n", - " Numeric\n", - " Numeric\n", + " Determines the most commonly repeated value.\n", + " Discrete\n", + " None\n", " \n", " \n", " 14\n", - " mean\n", + " entropy\n", + " aggregation\n", + " False\n", + " False\n", + " Calculates the entropy for a categorical variable\n", + " Categorical\n", + " Numeric\n", + " \n", + " \n", + " 15\n", + " min\n", " aggregation\n", " True\n", " True\n", - " Computes the average for a list of values.\n", + " Calculates the smallest value, ignoring `NaN` values.\n", " Numeric\n", " Numeric\n", " \n", " \n", - " 15\n", - " 
num_true\n", + " 16\n", + " all\n", " aggregation\n", " True\n", " False\n", - " Counts the number of `True` values.\n", + " Calculates if all values are 'True' in a list.\n", + " Boolean\n", " Boolean\n", - " Numeric\n", " \n", " \n", - " 16\n", - " min\n", + " 17\n", + " skew\n", " aggregation\n", - " True\n", - " True\n", - " Calculates the smallest value, ignoring `NaN` values.\n", + " False\n", + " False\n", + " Computes the extent to which a distribution differs from a normal distribution.\n", " Numeric\n", " Numeric\n", " \n", " \n", - " 17\n", - " sum\n", + " 18\n", + " mean\n", " aggregation\n", " True\n", " True\n", - " Calculates the total addition, ignoring `NaN`.\n", + " Computes the average for a list of values.\n", " Numeric\n", " Numeric\n", " \n", " \n", - " 18\n", - " trend\n", + " 19\n", + " avg_time_between\n", " aggregation\n", " False\n", " False\n", - " Calculates the trend of a variable over time.\n", - " Numeric, DatetimeTimeIndex\n", + " Computes the average number of seconds between consecutive events.\n", + " DatetimeTimeIndex\n", " Numeric\n", " \n", " \n", - " 19\n", - " n_most_common\n", + " 20\n", + " num_unique\n", " aggregation\n", - " False\n", - " False\n", - " Determines the `n` most common elements.\n", - " Discrete\n", + " True\n", + " True\n", + " Determines the number of distinct values, ignoring `NaN` values.\n", " Discrete\n", - " \n", - " \n", - " 20\n", - " time_since_first\n", - " aggregation\n", - " False\n", - " False\n", - " Calculates the time elapsed since the first datetime (in seconds).\n", - " DatetimeTimeIndex\n", - " Numeric\n", + " Numeric\n", " \n", " \n", " 21\n", - " std\n", + " num_true\n", " aggregation\n", " True\n", - " True\n", - " Computes the dispersion relative to the mean value, ignoring `NaN`.\n", - " Numeric\n", + " False\n", + " Counts the number of `True` values.\n", + " Boolean\n", " Numeric\n", " \n", " \n", @@ -792,79 +865,79 @@ ], "text/plain": [ " name type dask_compatible 
koalas_compatible \\\n", - "0 all aggregation True False \n", - "1 skew aggregation False False \n", - "2 percent_true aggregation True False \n", - "3 count aggregation True True \n", - "4 num_unique aggregation True True \n", - "5 first aggregation False False \n", - "6 mode aggregation False False \n", - "7 entropy aggregation False False \n", - "8 time_since_last aggregation False False \n", - "9 any aggregation True False \n", - "10 last aggregation False False \n", - "11 avg_time_between aggregation False False \n", - "12 max aggregation True True \n", - "13 median aggregation False False \n", - "14 mean aggregation True True \n", - "15 num_true aggregation True False \n", - "16 min aggregation True True \n", - "17 sum aggregation True True \n", - "18 trend aggregation False False \n", - "19 n_most_common aggregation False False \n", - "20 time_since_first aggregation False False \n", - "21 std aggregation True True \n", + "0 sum aggregation True True \n", + "1 first aggregation False False \n", + "2 last aggregation False False \n", + "3 trend aggregation False False \n", + "4 n_most_common aggregation False False \n", + "5 time_since_last aggregation False False \n", + "6 std aggregation True True \n", + "7 median aggregation False False \n", + "8 count aggregation True True \n", + "9 percent_true aggregation True False \n", + "10 time_since_first aggregation False False \n", + "11 max aggregation True True \n", + "12 any aggregation True False \n", + "13 mode aggregation False False \n", + "14 entropy aggregation False False \n", + "15 min aggregation True True \n", + "16 all aggregation True False \n", + "17 skew aggregation False False \n", + "18 mean aggregation True True \n", + "19 avg_time_between aggregation False False \n", + "20 num_unique aggregation True True \n", + "21 num_true aggregation True False \n", "\n", " description \\\n", - "0 Calculates if all values are 'True' in a list. 
\n", - "1 Computes the extent to which a distribution differs from a normal distribution. \n", - "2 Determines the percent of `True` values. \n", - "3 Determines the total number of values, excluding `NaN`. \n", - "4 Determines the number of distinct values, ignoring `NaN` values. \n", - "5 Determines the first value in a list. \n", - "6 Determines the most commonly repeated value. \n", - "7 Calculates the entropy for a categorical variable \n", - "8 Calculates the time elapsed since the last datetime (default in seconds). \n", - "9 Determines if any value is 'True' in a list. \n", - "10 Determines the last value in a list. \n", - "11 Computes the average number of seconds between consecutive events. \n", - "12 Calculates the highest value, ignoring `NaN` values. \n", - "13 Determines the middlemost number in a list of values. \n", - "14 Computes the average for a list of values. \n", - "15 Counts the number of `True` values. \n", - "16 Calculates the smallest value, ignoring `NaN` values. \n", - "17 Calculates the total addition, ignoring `NaN`. \n", - "18 Calculates the trend of a variable over time. \n", - "19 Determines the `n` most common elements. \n", - "20 Calculates the time elapsed since the first datetime (in seconds). \n", - "21 Computes the dispersion relative to the mean value, ignoring `NaN`. \n", + "0 Calculates the total addition, ignoring `NaN`. \n", + "1 Determines the first value in a list. \n", + "2 Determines the last value in a list. \n", + "3 Calculates the trend of a variable over time. \n", + "4 Determines the `n` most common elements. \n", + "5 Calculates the time elapsed since the last datetime (default in seconds). \n", + "6 Computes the dispersion relative to the mean value, ignoring `NaN`. \n", + "7 Determines the middlemost number in a list of values. \n", + "8 Determines the total number of values, excluding `NaN`. \n", + "9 Determines the percent of `True` values. 
\n", + "10 Calculates the time elapsed since the first datetime (in seconds). \n", + "11 Calculates the highest value, ignoring `NaN` values. \n", + "12 Determines if any value is 'True' in a list. \n", + "13 Determines the most commonly repeated value. \n", + "14 Calculates the entropy for a categorical variable \n", + "15 Calculates the smallest value, ignoring `NaN` values. \n", + "16 Calculates if all values are 'True' in a list. \n", + "17 Computes the extent to which a distribution differs from a normal distribution. \n", + "18 Computes the average for a list of values. \n", + "19 Computes the average number of seconds between consecutive events. \n", + "20 Determines the number of distinct values, ignoring `NaN` values. \n", + "21 Counts the number of `True` values. \n", "\n", " valid_inputs return_type \n", - "0 Boolean Boolean \n", - "1 Numeric Numeric \n", - "2 Boolean Numeric \n", - "3 Index Numeric \n", - "4 Discrete Numeric \n", - "5 Variable None \n", - "6 Discrete None \n", - "7 Categorical Numeric \n", - "8 DatetimeTimeIndex Numeric \n", - "9 Boolean Boolean \n", - "10 Variable None \n", - "11 DatetimeTimeIndex Numeric \n", - "12 Numeric Numeric \n", - "13 Numeric Numeric \n", - "14 Numeric Numeric \n", - "15 Boolean Numeric \n", - "16 Numeric Numeric \n", + "0 Numeric Numeric \n", + "1 Variable None \n", + "2 Variable None \n", + "3 DatetimeTimeIndex, Numeric Numeric \n", + "4 Discrete Discrete \n", + "5 DatetimeTimeIndex Numeric \n", + "6 Numeric Numeric \n", + "7 Numeric Numeric \n", + "8 Index Numeric \n", + "9 Boolean Numeric \n", + "10 DatetimeTimeIndex Numeric \n", + "11 Numeric Numeric \n", + "12 Boolean Boolean \n", + "13 Discrete None \n", + "14 Categorical Numeric \n", + "15 Numeric Numeric \n", + "16 Boolean Boolean \n", "17 Numeric Numeric \n", - "18 Numeric, DatetimeTimeIndex Numeric \n", - "19 Discrete Discrete \n", - "20 DatetimeTimeIndex Numeric \n", - "21 Numeric Numeric " + "18 Numeric Numeric \n", + "19 DatetimeTimeIndex Numeric 
\n", + "20 Discrete Numeric \n", + "21 Boolean Numeric " ] }, - "execution_count": 20, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -877,7 +950,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 12, "id": "7762885f", "metadata": {}, "outputs": [ @@ -914,52 +987,52 @@ " \n", " \n", " 22\n", - " url_to_domain\n", + " greater_than\n", " transform\n", + " True\n", " False\n", - " False\n", - " Determines the domain of a url.\n", - " URL\n", - " Categorical\n", + " Determines if values in one list are greater than another list.\n", + " Ordinal, Datetime, Numeric\n", + " Boolean\n", " \n", " \n", " 23\n", - " cum_mean\n", + " less_than\n", " transform\n", - " False\n", - " False\n", - " Calculates the cumulative mean.\n", - " Numeric\n", - " Numeric\n", + " True\n", + " True\n", + " Determines if values in one list are less than another list.\n", + " Ordinal, Datetime, Numeric\n", + " Boolean\n", " \n", " \n", " 24\n", - " minute\n", + " and\n", " transform\n", " True\n", " True\n", - " Determines the minutes value of a datetime.\n", - " Datetime\n", - " Numeric\n", + " Element-wise logical AND of two lists.\n", + " Boolean\n", + " Boolean\n", " \n", " \n", " 25\n", - " cum_max\n", + " less_than_scalar\n", " transform\n", - " False\n", - " False\n", - " Calculates the cumulative maximum.\n", - " Numeric\n", - " Numeric\n", + " True\n", + " True\n", + " Determines if values are less than a given scalar.\n", + " Ordinal, Datetime, Numeric\n", + " Boolean\n", " \n", " \n", " 26\n", - " age\n", + " modulo_numeric\n", " transform\n", " True\n", - " False\n", - " Calculates the age in years as a floating point number given a\n", - " DateOfBirth\n", + " True\n", + " Element-wise modulo of two lists.\n", + " Numeric\n", " Numeric\n", " \n", " \n", @@ -974,52 +1047,52 @@ " \n", " \n", " 79\n", - " greater_than_scalar\n", + " is_weekend\n", " transform\n", " True\n", " True\n", - " Determines if values are greater than a 
given scalar.\n", - " Numeric, Datetime, Ordinal\n", + " Determines if a date falls on a weekend.\n", + " Datetime\n", " Boolean\n", " \n", " \n", " 80\n", - " url_to_protocol\n", + " num_characters\n", " transform\n", - " False\n", - " False\n", - " Determines the protocol (http or https) of a url.\n", - " URL\n", - " Categorical\n", + " True\n", + " True\n", + " Calculates the number of characters in a string.\n", + " NaturalLanguage\n", + " Numeric\n", " \n", " \n", " 81\n", - " month\n", + " latitude\n", " transform\n", - " True\n", - " True\n", - " Determines the month value of a datetime.\n", - " Datetime\n", - " Ordinal\n", + " False\n", + " False\n", + " Returns the first tuple value in a list of LatLong tuples.\n", + " LatLong\n", + " Numeric\n", " \n", " \n", " 82\n", - " divide_numeric_scalar\n", + " cum_sum\n", " transform\n", - " True\n", - " True\n", - " Divide each element in the list by a scalar.\n", + " False\n", + " False\n", + " Calculates the cumulative sum.\n", " Numeric\n", " Numeric\n", " \n", " \n", " 83\n", - " time_since_previous\n", + " subtract_numeric_scalar\n", " transform\n", - " False\n", - " False\n", - " Compute the time since the previous entry in a list.\n", - " DatetimeTimeIndex\n", + " True\n", + " True\n", + " Subtract a scalar from each element in the list.\n", + " Numeric\n", " Numeric\n", " \n", " \n", @@ -1028,49 +1101,49 @@ "" ], "text/plain": [ - " name type dask_compatible koalas_compatible \\\n", - "22 url_to_domain transform False False \n", - "23 cum_mean transform False False \n", - "24 minute transform True True \n", - "25 cum_max transform False False \n", - "26 age transform True False \n", - ".. ... ... ... ... 
\n", - "79 greater_than_scalar transform True True \n", - "80 url_to_protocol transform False False \n", - "81 month transform True True \n", - "82 divide_numeric_scalar transform True True \n", - "83 time_since_previous transform False False \n", + " name type dask_compatible koalas_compatible \\\n", + "22 greater_than transform True False \n", + "23 less_than transform True True \n", + "24 and transform True True \n", + "25 less_than_scalar transform True True \n", + "26 modulo_numeric transform True True \n", + ".. ... ... ... ... \n", + "79 is_weekend transform True True \n", + "80 num_characters transform True True \n", + "81 latitude transform False False \n", + "82 cum_sum transform False False \n", + "83 subtract_numeric_scalar transform True True \n", "\n", - " description \\\n", - "22 Determines the domain of a url. \n", - "23 Calculates the cumulative mean. \n", - "24 Determines the minutes value of a datetime. \n", - "25 Calculates the cumulative maximum. \n", - "26 Calculates the age in years as a floating point number given a \n", - ".. ... \n", - "79 Determines if values are greater than a given scalar. \n", - "80 Determines the protocol (http or https) of a url. \n", - "81 Determines the month value of a datetime. \n", - "82 Divide each element in the list by a scalar. \n", - "83 Compute the time since the previous entry in a list. \n", + " description \\\n", + "22 Determines if values in one list are greater than another list. \n", + "23 Determines if values in one list are less than another list. \n", + "24 Element-wise logical AND of two lists. \n", + "25 Determines if values are less than a given scalar. \n", + "26 Element-wise modulo of two lists. \n", + ".. ... \n", + "79 Determines if a date falls on a weekend. \n", + "80 Calculates the number of characters in a string. \n", + "81 Returns the first tuple value in a list of LatLong tuples. \n", + "82 Calculates the cumulative sum. \n", + "83 Subtract a scalar from each element in the list. 
\n", "\n", - " valid_inputs return_type \n", - "22 URL Categorical \n", - "23 Numeric Numeric \n", - "24 Datetime Numeric \n", - "25 Numeric Numeric \n", - "26 DateOfBirth Numeric \n", - ".. ... ... \n", - "79 Numeric, Datetime, Ordinal Boolean \n", - "80 URL Categorical \n", - "81 Datetime Ordinal \n", - "82 Numeric Numeric \n", - "83 DatetimeTimeIndex Numeric \n", + " valid_inputs return_type \n", + "22 Ordinal, Datetime, Numeric Boolean \n", + "23 Ordinal, Datetime, Numeric Boolean \n", + "24 Boolean Boolean \n", + "25 Ordinal, Datetime, Numeric Boolean \n", + "26 Numeric Numeric \n", + ".. ... ... \n", + "79 Datetime Boolean \n", + "80 NaturalLanguage Numeric \n", + "81 LatLong Numeric \n", + "82 Numeric Numeric \n", + "83 Numeric Numeric \n", "\n", "[62 rows x 7 columns]" ] }, - "execution_count": 21, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1089,11 +1162,20 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 14, "id": "6d3df2f7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 1min 3s\n" + ] + } + ], "source": [ + "%%time\n", "features, feature_names = ft.dfs(entityset = es, \n", " target_entity = 'X', \n", " max_depth = 2)" @@ -1109,7 +1191,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 15, "id": "9a44a98a", "metadata": {}, "outputs": [ @@ -1650,7 +1732,7 @@ " ]" ] }, - "execution_count": 27, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1661,7 +1743,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 16, "id": "d5036e65", "metadata": {}, "outputs": [ @@ -1700,31 +1782,31 @@ " \n", " \n", " 0\n", - " 3000.267286\n", + " 3000.267334\n", " 2596.0\n", " 1.0\n", " \n", " \n", " 561\n", - " 3000.267286\n", + " 3000.267334\n", " 2596.0\n", " 1.0\n", " \n", " \n", " 2062\n", - " 2926.053180\n", + " 2926.053223\n", " 2596.0\n", " 0.0\n", " \n", 
" \n", " 6946\n", - " 2926.053180\n", + " 2926.053223\n", " 2596.0\n", " 0.0\n", " \n", " \n", " 6976\n", - " 2926.053180\n", + " 2926.053223\n", " 2596.0\n", " 0.0\n", " \n", @@ -1735,14 +1817,14 @@ "text/plain": [ " Wilderness_Area_0.MEAN(X.Elevation) Elevation Wilderness_Area_0\n", "index \n", - "0 3000.267286 2596.0 1.0\n", - "561 3000.267286 2596.0 1.0\n", - "2062 2926.053180 2596.0 0.0\n", - "6946 2926.053180 2596.0 0.0\n", - "6976 2926.053180 2596.0 0.0" + "0 3000.267334 2596.0 1.0\n", + "561 3000.267334 2596.0 1.0\n", + "2062 2926.053223 2596.0 0.0\n", + "6946 2926.053223 2596.0 0.0\n", + "6976 2926.053223 2596.0 0.0" ] }, - "execution_count": 35, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1753,7 +1835,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 17, "id": "ec8b7ccd", "metadata": {}, "outputs": [ @@ -1763,7 +1845,7 @@ "(581012, 532)" ] }, - "execution_count": 36, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1795,7 +1877,7 @@ "id": "75b7cc64", "metadata": {}, "source": [ - "为了解决“维数灾难”,有必要应用特征简化和选择,这意味着从数据中去除低值特征。但请记住,特征选择可能会影响ML模型的性能。棘手的是,ML模型的设计包含一个艺术元素。这绝对不是一个具有严格规则的确定性过程,要想取得成功就必须遵循这些规则。为了得到一个精确的模型,有必要应用、组合和比较几十种方法。在本notebook中,我不会解释所有可能的方法来处理“维度灾难”。我将集中讨论以下方法:\n", + "为了解决“维数灾难”,有必要应用特征约简和选择,这意味着从数据中去除低值特征。但请记住,特征选择可能会影响ML模型的性能。棘手的是,ML模型的设计包含一个艺术元素。这绝对不是一个具有严格规则的确定性过程,要想取得成功就必须遵循这些规则。为了得到一个精确的模型,有必要应用、组合和比较几十种方法。在本notebook中,我不会解释所有可能的方法来处理“维度灾难”。我将集中讨论以下方法:\n", "\n", "* 确定共线特征\n", "\n", @@ -4902,11 +4984,12 @@ } ], "source": [ - "import gc\n", + "\"\"\"\n", "del features_filtered\n", "del features_positive\n", "del fetch_covtype\n", "del df, X,y, X_selected_df,train,test,train_df,test_df,train_X,train_y\n", + "\"\"\"\n", "gc.collect()" ] }, @@ -4920,7 +5003,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 8, "id": "b7241552", "metadata": {}, "outputs": [ @@ -5044,7 +5127,7 @@ "246788 0.0 0.0 0.0 0.0 " ] }, - "execution_count": 
65, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -5062,24 +5145,16 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 9, "id": "db3d3b92", "metadata": {}, "outputs": [ { - "ename": "MemoryError", - "evalue": "Unable to allocate 8.55 MiB for an array with shape (160080, 1, 7) and data type float64", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mMemoryError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n", - "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\_forest.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 397\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 398\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moob_score\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 399\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_set_oob_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 400\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 401\u001b[0m \u001b[1;31m# Decapsulate classes_ attributes\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\_forest.py\u001b[0m in \u001b[0;36m_set_oob_score\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 528\u001b[0m unsampled_indices = _generate_unsampled_indices(\n\u001b[0;32m 529\u001b[0m estimator.random_state, n_samples, n_samples_bootstrap)\n\u001b[1;32m--> 530\u001b[1;33m p_estimator = estimator.predict_proba(X[unsampled_indices, :],\n\u001b[0m\u001b[0;32m 
531\u001b[0m check_input=False)\n\u001b[0;32m 532\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\tree\\_classes.py\u001b[0m in \u001b[0;36mpredict_proba\u001b[1;34m(self, X, check_input)\u001b[0m\n\u001b[0;32m 929\u001b[0m \u001b[0mcheck_is_fitted\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 930\u001b[0m \u001b[0mX\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_X_predict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcheck_input\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 931\u001b[1;33m \u001b[0mproba\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtree_\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 932\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 933\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mn_outputs_\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32msklearn\\tree\\_tree.pyx\u001b[0m in \u001b[0;36msklearn.tree._tree.Tree.predict\u001b[1;34m()\u001b[0m\n", - "\u001b[1;32msklearn\\tree\\_tree.pyx\u001b[0m in \u001b[0;36msklearn.tree._tree.Tree.predict\u001b[1;34m()\u001b[0m\n", - "\u001b[1;31mMemoryError\u001b[0m: Unable to allocate 8.55 MiB for an array with shape (160080, 1, 7) and data type float64" + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9673328605949619\n", + "Wall time: 14min 30s\n" ] } ], @@ -5091,13 +5166,429 @@ "print(accuracy_score(pred_org_test_y,org_test_y)) # RF" ] }, + { + "cell_type": "markdown", + 
"id": "50b5f988", + "metadata": {}, + "source": [ + "### 5.2 使用未约简与选择的特征的分数" + ] + }, { "cell_type": "code", - "execution_count": null, - "id": "52e36341", + "execution_count": 18, + "id": "0dc54e8c", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexElevationHorizontal_Distance_To_HydrologyVertical_Distance_To_HydrologyHorizontal_Distance_To_RoadwaysHorizontal_Distance_To_Fire_PointsAspectSlopeHillshade_9amHillshade_Noon...Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points)Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology)Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways)Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology)Soil_Type_4.SUM(X.Elevation)Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points)Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology)Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways)Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology)Cover_Type
002596.0258.00.0510.06279.051.03.0221.0232.0...1324.050751212.6899251558.36195658.2799891.715981e+091.149499e+09156171328.01.364632e+0926848308.04.0
112590.0212.0-6.0390.06225.056.02.0220.0235.0...1324.050751212.6899251558.36195658.2799891.715981e+091.149499e+09156171328.01.364632e+0926848308.04.0
\n", + "

2 rows × 534 columns

\n", + "
" + ], + "text/plain": [ + " index Elevation Horizontal_Distance_To_Hydrology \\\n", + "0 0 2596.0 258.0 \n", + "1 1 2590.0 212.0 \n", + "\n", + " Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways \\\n", + "0 0.0 510.0 \n", + "1 -6.0 390.0 \n", + "\n", + " Horizontal_Distance_To_Fire_Points Aspect Slope Hillshade_9am \\\n", + "0 6279.0 51.0 3.0 221.0 \n", + "1 6225.0 56.0 2.0 220.0 \n", + "\n", + " Hillshade_Noon ... Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) \\\n", + "0 232.0 ... 1324.050751 \n", + "1 235.0 ... 1324.050751 \n", + "\n", + " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) \\\n", + "0 212.689925 \n", + "1 212.689925 \n", + "\n", + " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) \\\n", + "0 1558.361956 \n", + "1 1558.361956 \n", + "\n", + " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) \\\n", + "0 58.279989 \n", + "1 58.279989 \n", + "\n", + " Soil_Type_4.SUM(X.Elevation) \\\n", + "0 1.715981e+09 \n", + "1 1.715981e+09 \n", + "\n", + " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) \\\n", + "0 1.149499e+09 \n", + "1 1.149499e+09 \n", + "\n", + " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) \\\n", + "0 156171328.0 \n", + "1 156171328.0 \n", + "\n", + " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) \\\n", + "0 1.364632e+09 \n", + "1 1.364632e+09 \n", + "\n", + " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) Cover_Type \n", + "0 26848308.0 4.0 \n", + "1 26848308.0 4.0 \n", + "\n", + "[2 rows x 534 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.merge(features, y, on=['index'])\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "637b3a7e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3256" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "del features, X\n", + "gc.collect()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 22, + "id": "4ac537b8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexElevationHorizontal_Distance_To_HydrologyVertical_Distance_To_HydrologyHorizontal_Distance_To_RoadwaysHorizontal_Distance_To_Fire_PointsAspectSlopeHillshade_9amHillshade_Noon...Soil_Type_4.STD(X.Elevation)Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points)Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology)Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways)Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology)Soil_Type_4.SUM(X.Elevation)Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points)Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology)Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways)Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology)
2507282507283351.0726.0124.03813.02271.0206.027.0192.0252.0...277.0455171324.050751212.6899251558.36195658.2799891.715981e+091.149499e+09156171328.01.364632e+0926848308.0
2467882467882732.0212.01.01082.0912.0129.07.0231.0236.0...277.0455171324.050751212.6899251558.36195658.2799891.715981e+091.149499e+09156171328.01.364632e+0926848308.0
\n", + "

2 rows × 533 columns

\n", + "
" + ], + "text/plain": [ + " index Elevation Horizontal_Distance_To_Hydrology \\\n", + "250728 250728 3351.0 726.0 \n", + "246788 246788 2732.0 212.0 \n", + "\n", + " Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways \\\n", + "250728 124.0 3813.0 \n", + "246788 1.0 1082.0 \n", + "\n", + " Horizontal_Distance_To_Fire_Points Aspect Slope Hillshade_9am \\\n", + "250728 2271.0 206.0 27.0 192.0 \n", + "246788 912.0 129.0 7.0 231.0 \n", + "\n", + " Hillshade_Noon ... Soil_Type_4.STD(X.Elevation) \\\n", + "250728 252.0 ... 277.045517 \n", + "246788 236.0 ... 277.045517 \n", + "\n", + " Soil_Type_4.STD(X.Horizontal_Distance_To_Fire_Points) \\\n", + "250728 1324.050751 \n", + "246788 1324.050751 \n", + "\n", + " Soil_Type_4.STD(X.Horizontal_Distance_To_Hydrology) \\\n", + "250728 212.689925 \n", + "246788 212.689925 \n", + "\n", + " Soil_Type_4.STD(X.Horizontal_Distance_To_Roadways) \\\n", + "250728 1558.361956 \n", + "246788 1558.361956 \n", + "\n", + " Soil_Type_4.STD(X.Vertical_Distance_To_Hydrology) \\\n", + "250728 58.279989 \n", + "246788 58.279989 \n", + "\n", + " Soil_Type_4.SUM(X.Elevation) \\\n", + "250728 1.715981e+09 \n", + "246788 1.715981e+09 \n", + "\n", + " Soil_Type_4.SUM(X.Horizontal_Distance_To_Fire_Points) \\\n", + "250728 1.149499e+09 \n", + "246788 1.149499e+09 \n", + "\n", + " Soil_Type_4.SUM(X.Horizontal_Distance_To_Hydrology) \\\n", + "250728 156171328.0 \n", + "246788 156171328.0 \n", + "\n", + " Soil_Type_4.SUM(X.Horizontal_Distance_To_Roadways) \\\n", + "250728 1.364632e+09 \n", + "246788 1.364632e+09 \n", + "\n", + " Soil_Type_4.SUM(X.Vertical_Distance_To_Hydrology) \n", + "250728 26848308.0 \n", + "246788 26848308.0 \n", + "\n", + "[2 rows x 533 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df, test_df = train_test_split(df,random_state=42)\n", + "train_X = train_df.drop('Cover_Type',1)\n", + "train_y = train_df['Cover_Type']\n", + "\n", + "test_X = 
test_df.drop('Cover_Type',1)\n", + "test_y = test_df['Cover_Type']\n", + "test_X.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "24c7b22f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "45" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "del df, train_df, test_df\n", + "gc.collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "869777ba", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9442352309418738\n", + "Wall time: 30min 31s\n" + ] + } + ], + "source": [ + "%%time\n", + "random_forest = RandomForestClassifier(n_estimators=500,oob_score=True)\n", + "random_forest.fit(train_X, train_y)\n", + "pred_y = random_forest.predict(test_X)\n", + "print(accuracy_score(pred_y,test_y)) # RF" + ] + }, + { + "cell_type": "markdown", + "id": "3739a43c", + "metadata": {}, + "source": [ + "从结果来看,在这个数据集上,不管是增加的特征,还是增加后过滤的特征,效果都比原始特征差。我也咨询了一些朋友,他们试了效果都一般;但是 Kaggle 上很多人点赞。如果你们在哪个数据集上试了效果有提升,请联系我。" + ] } ], "metadata": {