|
|
|
@ -243,7 +243,7 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "ac6cd644",
|
|
|
|
|
"id": "f5102024",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## 分类特征\n",
|
|
|
|
@ -252,8 +252,8 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 10,
|
|
|
|
|
"id": "e3285fce",
|
|
|
|
|
"execution_count": 17,
|
|
|
|
|
"id": "65eeb045",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
@ -280,7 +280,7 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "d5016f4c",
|
|
|
|
|
"id": "34da34b3",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## Splitting\n",
|
|
|
|
@ -291,7 +291,7 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "66f3fb03",
|
|
|
|
|
"id": "8c42b4ed",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## 组合/转化/交互\n",
|
|
|
|
@ -301,16 +301,16 @@
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "7a267704",
|
|
|
|
|
"id": "1df43cfd",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"df['uid'] = df[‘card1’].astype(str)+’_’+df[‘card2’].astype(str)"
|
|
|
|
|
"df['uid'] = df['card1'].astype(str)+'_'+df['card2'].astype(str)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "7e41e460",
|
|
|
|
|
"id": "054ee902",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"这有助于LGBM将card1和card2一起去与目标关联,并不会在树节点分裂他们。\n",
|
|
|
|
@ -321,7 +321,7 @@
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "15b54354",
|
|
|
|
|
"id": "4cf0ee0f",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
@ -330,7 +330,7 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "e38268bf",
|
|
|
|
|
"id": "7d60d0b6",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## 频率编码\n",
|
|
|
|
@ -339,8 +339,8 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 12,
|
|
|
|
|
"id": "4f6983bd",
|
|
|
|
|
"execution_count": 19,
|
|
|
|
|
"id": "bb167930",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
@ -407,7 +407,7 @@
|
|
|
|
|
"4 0 2"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 12,
|
|
|
|
|
"execution_count": 19,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
@ -418,10 +418,116 @@
|
|
|
|
|
"df"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "48229c86",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## 聚合/组统计\n",
|
|
|
|
|
"为 LGBM 提供组统计数据允许 LGBM 确定某个值对于特定组是常见的还是罕见的。\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"可以通过为 pandas 提供 3 个变量来计算组统计数据。你给它组、感兴趣的变量和统计类型。例如"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 20,
|
|
|
|
|
"id": "76380f6f",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>color</th>\n",
|
|
|
|
|
" <th>color_counts</th>\n",
|
|
|
|
|
" <th>color_counts_sum</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" color color_counts color_counts_sum\n",
|
|
|
|
|
"0 0 2 4\n",
|
|
|
|
|
"1 1 2 4\n",
|
|
|
|
|
"2 2 1 1\n",
|
|
|
|
|
"3 1 2 4\n",
|
|
|
|
|
"4 0 2 4"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 20,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"temp = df.groupby('color')['color_counts'].agg(['mean']).rename({'mean':'color_counts_mean'},axis=1)\n",
|
|
|
|
|
"df = pd.merge(df,temp,on='color',how='left')\n",
|
|
|
|
|
"df"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "fd72f933",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"此处的功能向每一行添加color_counts该行color组的平均值。因此,LGBM 现在可以判断color_counts对它们的color组是否为极少数的部分。"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "26f55eeb",
|
|
|
|
|
"id": "9ed09035",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": []
|
|
|
|
|