From 272052520eb08cb249563a1b0c7f54a19e3321f8 Mon Sep 17 00:00:00 2001 From: benjas <909336740@qq.com> Date: Sun, 29 Aug 2021 22:44:34 +0800 Subject: [PATCH] Create Feature Engineering Techniques.ipynb --- ...re Engineering Techniques-checkpoint.ipynb | 274 ++++++++++++++++++ .../Feature Engineering Techniques.ipynb | 274 ++++++++++++++++++ 2 files changed, 548 insertions(+) create mode 100644 竞赛优胜技巧/.ipynb_checkpoints/Feature Engineering Techniques-checkpoint.ipynb create mode 100644 竞赛优胜技巧/Feature Engineering Techniques.ipynb diff --git a/竞赛优胜技巧/.ipynb_checkpoints/Feature Engineering Techniques-checkpoint.ipynb b/竞赛优胜技巧/.ipynb_checkpoints/Feature Engineering Techniques-checkpoint.ipynb new file mode 100644 index 0000000..20c2e02 --- /dev/null +++ b/竞赛优胜技巧/.ipynb_checkpoints/Feature Engineering Techniques-checkpoint.ipynb @@ -0,0 +1,274 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "278c7a1e", + "metadata": {}, + "source": [ + "# 特征工程技术" + ] + }, + { + "cell_type": "markdown", + "id": "67f256b4", + "metadata": {}, + "source": [ + "搬运参考:https://www.kaggle.com/c/ieee-fraud-detection/discussion/108575" + ] + }, + { + "cell_type": "markdown", + "id": "5a28bcf6", + "metadata": {}, + "source": [ + "## 关于编码\n", + "在执行编码时,最好训练和测试集一起编码,如下所示" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0edffa6", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.concat([train[col],test[col]],axis=0)\n", + "# PERFORM FEATURE ENGINEERING HERE\n", + "train[col] = df[:len(train)]\n", + "test[col] = df[len(train):]" + ] + }, + { + "cell_type": "markdown", + "id": "3bd8a464", + "metadata": {}, + "source": [ + "## NAN值加工\n", + "如果将np.nan给LGBM,那么在每个树节点分裂时,它会分裂非 NAN 值,然后将所有 NAN 发送到左节点或右节点,这取决于什么是最好的。\n", + "\n", + "因此,NAN 在每个节点都得到特殊处理,并且可能会变得过拟合。\n", + "\n", + "通过简单地将所有 NAN 转换为低于所有非 NAN 值的负数(例如 - 999),来防止测试集过拟合。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2c552c7", + "metadata": {}, + "outputs": [], + "source": [ + 
"df[col].fillna(-999, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "fe85c377", + "metadata": {}, + "source": [ + "这样LGBM将不再过度处理 NAN。相反,它会给予它与其他数字相同的关注。可以尝试两种方法,看看哪个给出了最高的CV。" + ] + }, + { + "cell_type": "markdown", + "id": "05e77c5a", + "metadata": {}, + "source": [ + "## 标签编码/因式分解/内存减少\n", + "标签编码(分解)将(字符串、类别、对象)列转换为整数。类似get_dummies,不同点在于如果有几十个取值,如果用pd.get_dummies()则会得到好几十列,增加了数据的稀疏性" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "554159aa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
color
00
11
22
31
40
\n", + "
" + ], + "text/plain": [ + " color\n", + "0 0\n", + "1 1\n", + "2 2\n", + "3 1\n", + "4 0" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "df = pd.DataFrame(['green','bule','red','bule','green'],columns=['color'])\n", + "df['color'],_ = df['color'].factorize()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "e5bf12a9", + "metadata": {}, + "source": [ + "之后,可以将其转换为 int8、int16 或 int32 以减少内存,具体取决于 max 是否小于 128、小于 32768 或更大。" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "863fee6f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 5 entries, 0 to 4\n", + "Data columns (total 1 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 color 5 non-null int8 \n", + "dtypes: int8(1)\n", + "memory usage: 133.0 bytes\n" + ] + } + ], + "source": [ + "if df['color'].max()<128:\n", + " df['color'] = df['color'].astype('int8')\n", + "elif df['color'].max()<32768:\n", + " df['color'] = df['color'].astype('int16')\n", + "else: df['color'] = df['color'].astype('int32')\n", + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "1a6bac81", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 5 entries, 0 to 4\n", + "Data columns (total 1 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 color 5 non-null int32\n", + "dtypes: int32(1)\n", + "memory usage: 148.0 bytes\n" + ] + } + ], + "source": [ + "df['color'] = df['color'].astype('int32') # 如果使用int32,可以看到memory usage: 变成148了\n", + "df.info()" + ] + }, + { + "cell_type": "markdown", + "id": "0951f3c7", + "metadata": {}, + "source": [ + "另外,为了减少内存,人们常在其他列上使用流行的 memory_reduce 函数。\n", + "\n", + "一种更简单、更安全的方法是将所有 float64 转换为 float32,将所有 int64 转换为 
int32。(最好避免使用 float16。如果你愿意,可以使用 int8 和 int16)。" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "88368fc6", + "metadata": {}, + "outputs": [], + "source": [ + "for col in df.columns:\n", + " if df[col].dtype=='float64': df[col] = df[col].astype('float32')\n", + " if df[col].dtype=='int64': df[col] = df[col].astype('int32')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ecd48ce", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/竞赛优胜技巧/Feature Engineering Techniques.ipynb b/竞赛优胜技巧/Feature Engineering Techniques.ipynb new file mode 100644 index 0000000..20c2e02 --- /dev/null +++ b/竞赛优胜技巧/Feature Engineering Techniques.ipynb @@ -0,0 +1,274 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "278c7a1e", + "metadata": {}, + "source": [ + "# 特征工程技术" + ] + }, + { + "cell_type": "markdown", + "id": "67f256b4", + "metadata": {}, + "source": [ + "搬运参考:https://www.kaggle.com/c/ieee-fraud-detection/discussion/108575" + ] + }, + { + "cell_type": "markdown", + "id": "5a28bcf6", + "metadata": {}, + "source": [ + "## 关于编码\n", + "在执行编码时,最好训练和测试集一起编码,如下所示" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0edffa6", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.concat([train[col],test[col]],axis=0)\n", + "# PERFORM FEATURE ENGINEERING HERE\n", + "train[col] = df[:len(train)]\n", + "test[col] = df[len(train):]" + ] + }, + { + "cell_type": "markdown", + "id": "3bd8a464", + "metadata": {}, + "source": [ + "## NAN值加工\n", + "如果将np.nan给LGBM,那么在每个树节点分裂时,它会分裂非 NAN 值,然后将所有 
NAN 发送到左节点或右节点,具体取决于哪种划分效果最好。\n", + "\n", + "因此,NAN 在每个节点都得到特殊处理,并且可能会导致过拟合。\n", + "\n", + "通过简单地将所有 NAN 转换为低于所有非 NAN 值的负数(例如 -999),可以减轻这种过拟合。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2c552c7", + "metadata": {}, + "outputs": [], + "source": [ + "df[col].fillna(-999, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "fe85c377", + "metadata": {}, + "source": [ + "这样 LGBM 将不再过度处理 NAN,而是把它当作普通数值同等对待。两种方法(保留 NAN 或填充为 -999)都可以试一下,看看哪个给出的 CV 分数最高。" + ] + }, + { + "cell_type": "markdown", + "id": "05e77c5a", + "metadata": {}, + "source": [ + "## 标签编码/因子化(factorize)/内存减少\n", + "标签编码(因子化)将(字符串、类别、对象)列转换为整数。它与 get_dummies 类似,不同点在于:如果某列有几十个取值,pd.get_dummies() 会得到好几十列,增加数据的稀疏性,而标签编码只保留一列。" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "554159aa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
color
00
11
22
31
40
\n", + "
" + ], + "text/plain": [ + " color\n", + "0 0\n", + "1 1\n", + "2 2\n", + "3 1\n", + "4 0" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "df = pd.DataFrame(['green','bule','red','bule','green'],columns=['color'])\n", + "df['color'],_ = df['color'].factorize()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "e5bf12a9", + "metadata": {}, + "source": [ + "之后,可以将其转换为 int8、int16 或 int32 以减少内存,具体取决于 max 是否小于 128、小于 32768 或更大。" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "863fee6f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 5 entries, 0 to 4\n", + "Data columns (total 1 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 color 5 non-null int8 \n", + "dtypes: int8(1)\n", + "memory usage: 133.0 bytes\n" + ] + } + ], + "source": [ + "if df['color'].max()<128:\n", + " df['color'] = df['color'].astype('int8')\n", + "elif df['color'].max()<32768:\n", + " df['color'] = df['color'].astype('int16')\n", + "else: df['color'] = df['color'].astype('int32')\n", + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "1a6bac81", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 5 entries, 0 to 4\n", + "Data columns (total 1 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 color 5 non-null int32\n", + "dtypes: int32(1)\n", + "memory usage: 148.0 bytes\n" + ] + } + ], + "source": [ + "df['color'] = df['color'].astype('int32') # 如果使用int32,可以看到memory usage: 变成148了\n", + "df.info()" + ] + }, + { + "cell_type": "markdown", + "id": "0951f3c7", + "metadata": {}, + "source": [ + "另外,为了减少内存,人们常在其他列上使用流行的 memory_reduce 函数。\n", + "\n", + "一种更简单、更安全的方法是将所有 float64 转换为 float32,将所有 int64 转换为 
int32。(最好避免使用 float16。如果你愿意,可以使用 int8 和 int16)。" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "88368fc6", + "metadata": {}, + "outputs": [], + "source": [ + "for col in df.columns:\n", + " if df[col].dtype=='float64': df[col] = df[col].astype('float32')\n", + " if df[col].dtype=='int64': df[col] = df[col].astype('int32')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ecd48ce", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}