|
|
|
@ -243,7 +243,7 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "6091bf47",
|
|
|
|
|
"id": "ac6cd644",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## 分类特征\n",
|
|
|
|
@ -253,7 +253,7 @@
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 10,
|
|
|
|
|
"id": "6061ae00",
|
|
|
|
|
"id": "e3285fce",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
@ -280,7 +280,7 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "94a95b95",
|
|
|
|
|
"id": "d5016f4c",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## Splitting\n",
|
|
|
|
@ -291,7 +291,7 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "87e8b887",
|
|
|
|
|
"id": "66f3fb03",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## 组合/转化/交互\n",
|
|
|
|
@ -301,7 +301,7 @@
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "92515211",
|
|
|
|
|
"id": "7a267704",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
@ -310,7 +310,7 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "b9c66a13",
|
|
|
|
|
"id": "7e41e460",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"这有助于LGBM将card1和card2一起去与目标关联,并不会在树节点分裂他们。\n",
|
|
|
|
@ -321,17 +321,107 @@
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "d50f2c15",
|
|
|
|
|
"id": "15b54354",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"df['x1_x2'] = df['x1'] * df['x2']"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "e38268bf",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## 频率编码\n",
|
|
|
|
|
"频率编码是一种强大的技术,它允许 LGBM 查看列值是罕见的还是常见的。例如,如果您希望 LGBM“查看”哪些颜色不常使用,请尝试"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 12,
|
|
|
|
|
"id": "4f6983bd",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>color</th>\n",
|
|
|
|
|
" <th>color_counts</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" color color_counts\n",
|
|
|
|
|
"0 0 2\n",
|
|
|
|
|
"1 1 2\n",
|
|
|
|
|
"2 2 1\n",
|
|
|
|
|
"3 1 2\n",
|
|
|
|
|
"4 0 2"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 12,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"temp = df['color'].value_counts().to_dict()\n",
|
|
|
|
|
"df['color_counts'] = df['color'].map(temp)\n",
|
|
|
|
|
"df"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "3ce3cb5f",
|
|
|
|
|
"id": "26f55eeb",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": []
|
|
|
|
|