|
|
|
@ -0,0 +1,677 @@
|
|
|
|
|
{
|
|
|
|
|
"cells": [
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "720f2b62",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"# Stacking"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "0b365a02",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## 先说结论,该数据集(fetch_covtype)Stacking的方法比线性加权更好\n",
|
|
|
|
|
"比赛中我们常用线性加权作为最终的融合方式,我们同样也会好奇怎样的线性加权权重更好,下面也会举例子\n",
|
|
|
|
|
"参考:https://github.com/rushter/heamy/tree/master/examples"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "cc8fecb1",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"通过对训练集进行五折交叉验证:各折的折外(out-of-fold)预测拼接为第二层的训练集,各折模型对测试集的预测取平均作为第二层的测试集\n",
|
|
|
|
|
"<img src=\"assets/stacking.jpg\" width=\"50%\">"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 1,
|
|
|
|
|
"id": "18a12000",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n",
|
|
|
|
|
"Collecting heamy\n",
|
|
|
|
|
" Downloading https://pypi.tuna.tsinghua.edu.cn/packages/20/32/2f3e1efa38a8e34f790d90b6d49ef06ab812181ae896c50e89b8750fa5a0/heamy-0.0.7.tar.gz (30 kB)\n",
|
|
|
|
|
"Requirement already satisfied: scikit-learn>=0.17.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (0.24.1)\n",
|
|
|
|
|
"Requirement already satisfied: pandas>=0.17.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.2.4)\n",
|
|
|
|
|
"Requirement already satisfied: six>=1.10.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.15.0)\n",
|
|
|
|
|
"Requirement already satisfied: scipy>=0.16.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.6.2)\n",
|
|
|
|
|
"Requirement already satisfied: numpy>=1.7.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.19.5)\n",
|
|
|
|
|
"Requirement already satisfied: pytz>=2017.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=0.17.0->heamy) (2021.1)\n",
|
|
|
|
|
"Requirement already satisfied: python-dateutil>=2.7.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=0.17.0->heamy) (2.8.1)\n",
|
|
|
|
|
"Requirement already satisfied: threadpoolctl>=2.0.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from scikit-learn>=0.17.0->heamy) (2.1.0)\n",
|
|
|
|
|
"Requirement already satisfied: joblib>=0.11 in d:\\programdata\\anaconda3\\lib\\site-packages (from scikit-learn>=0.17.0->heamy) (1.0.1)\n",
|
|
|
|
|
"Building wheels for collected packages: heamy\n",
|
|
|
|
|
" Building wheel for heamy (setup.py): started\n",
|
|
|
|
|
" Building wheel for heamy (setup.py): finished with status 'done'\n",
|
|
|
|
|
" Created wheel for heamy: filename=heamy-0.0.7-py2.py3-none-any.whl size=15353 sha256=e3ba65b34e2bdee3b90b45b637e28836afdbdb0c9547f76b36fe10d17f8aba8f\n",
|
|
|
|
|
" Stored in directory: c:\\users\\administrator\\appdata\\local\\pip\\cache\\wheels\\6e\\f1\\7d\\048e558da94f495a0ed0d9c09d312e73eb176a092e36774ec2\n",
|
|
|
|
|
"Successfully built heamy\n",
|
|
|
|
|
"Installing collected packages: heamy\n",
|
|
|
|
|
"Successfully installed heamy-0.0.7\n",
|
|
|
|
|
"Note: you may need to restart the kernel to use updated packages.\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Install heamy with the %pip magic so it targets this kernel's environment;\n",
"# pinned to the version this notebook was originally run with.\n",
"%pip install heamy==0.0.7"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 7,
|
|
|
|
|
"id": "69632c6a",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"import sys\n",
|
|
|
|
|
"print(sys.version) # 版本信息"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 61,
|
|
|
|
|
"id": "ca421279",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"import numpy as np\n",
|
|
|
|
|
"import time\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"from heamy.dataset import Dataset\n",
|
|
|
|
|
"from heamy.estimator import Classifier \n",
|
|
|
|
|
"from heamy.pipeline import ModelsPipeline\n",
|
|
|
|
|
"# 导入相关模型,没有的pip install xxx 即可\n",
|
|
|
|
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|
|
|
|
"from sklearn.linear_model import LogisticRegression\n",
|
|
|
|
|
"import xgboost as xgb \n",
|
|
|
|
|
"import lightgbm as lgb \n",
|
|
|
|
|
"\n",
|
|
|
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
|
|
|
"from sklearn.metrics import accuracy_score\n",
|
|
|
|
|
"from sklearn.preprocessing import OrdinalEncoder\n",
|
|
|
|
|
"from sklearn.metrics import log_loss"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "2592fbbd",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## 准备数据集"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 15,
|
|
|
|
|
"id": "9a0fabe1",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"from sklearn.datasets import fetch_covtype\n",
|
|
|
|
|
"data = fetch_covtype()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 47,
|
|
|
|
|
"id": "5bd75178",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"七分类任务,处理前: [1 2 3 4 5 6 7]\n",
|
|
|
|
|
"[5 5 2 ... 3 3 3]\n",
|
|
|
|
|
"七分类任务,处理后: [0. 1. 2. 3. 4. 5. 6.]\n",
|
|
|
|
|
"[4. 4. 1. ... 2. 2. 2.]\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Preprocessing\n",
"X, y = data['data'], data['target']\n",
"# The models expect class labels that start at 0, so the 1..7 targets are re-encoded as 0..6\n",
"print('七分类任务,处理前:',np.unique(y))\n",
"print(y)\n",
"label_encoder = OrdinalEncoder()  # not named `ord`, which would shadow the built-in\n",
"# Keep the 2-D encoder output under its own name so the next line no longer\n",
"# depends on a `y_enc` left over from an earlier kernel session (NameError on a fresh run).\n",
"y_enc = label_encoder.fit_transform(y.reshape(-1, 1))\n",
"y = y_enc.reshape(-1, )\n",
"print('七分类任务,处理后:',np.unique(y))\n",
"print(y)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 48,
|
|
|
|
|
"id": "23d9778c",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"(435759, 54)\n",
|
|
|
|
|
"(145253, 54)\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# 切分训练和测试集\n",
|
|
|
|
|
"X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)\n",
|
|
|
|
|
"print(X_train.shape)\n",
|
|
|
|
|
"print(X_test.shape)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 49,
|
|
|
|
|
"id": "eac48668",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Dataset(5c3ccfb5c81451d098565ef5e7e36ac5)\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Build the heamy Dataset.\n",
"# Quoted from the heamy docs -- use_cache : bool, default True\n",
"#     If use_cache=True then preprocessing step will be cached until function code is changed.\n",
"dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test, y_test=None,use_cache=True) # note y_test=None: the test labels are not passed in, so there is no label leakage\n",
"print(dataset)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 50,
|
|
|
|
|
"id": "fba3f975",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"array([[2.833e+03, 2.580e+02, 2.600e+01, ..., 0.000e+00, 0.000e+00,\n",
|
|
|
|
|
" 0.000e+00],\n",
|
|
|
|
|
" [3.008e+03, 4.500e+01, 2.000e+00, ..., 0.000e+00, 0.000e+00,\n",
|
|
|
|
|
" 0.000e+00],\n",
|
|
|
|
|
" [2.949e+03, 0.000e+00, 1.100e+01, ..., 0.000e+00, 0.000e+00,\n",
|
|
|
|
|
" 0.000e+00],\n",
|
|
|
|
|
" ...,\n",
|
|
|
|
|
" [3.153e+03, 2.870e+02, 1.700e+01, ..., 0.000e+00, 0.000e+00,\n",
|
|
|
|
|
" 0.000e+00],\n",
|
|
|
|
|
" [3.065e+03, 3.480e+02, 2.100e+01, ..., 0.000e+00, 0.000e+00,\n",
|
|
|
|
|
" 0.000e+00],\n",
|
|
|
|
|
" [3.021e+03, 2.600e+01, 1.600e+01, ..., 0.000e+00, 0.000e+00,\n",
|
|
|
|
|
" 0.000e+00]])"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 50,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# 处理后的数据集\n",
|
|
|
|
|
"dataset.X_train"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "d4517ea1",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## 定义模型"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 40,
|
|
|
|
|
"id": "e8393e73",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"def xgb_model(X_train, y_train, X_test, y_test):\n",
"    \"\"\"Train an XGBoost multi-class booster and return class probabilities for X_test.\n",
"\n",
"    The parameters must be exactly X_train, y_train, X_test, y_test;\n",
"    y_test is accepted for that signature but is not used here.\n",
"    \"\"\"\n",
"    # Hyper-parameters can be fixed inside the estimator function.\n",
"    booster_params = {\n",
"        'objective': 'multi:softprob',\n",
"        'eval_metric': 'mlogloss',\n",
"        'verbosity': 0,\n",
"        'num_class': 7,\n",
"        'nthread': -1,\n",
"    }\n",
"    train_matrix = xgb.DMatrix(X_train, y_train)\n",
"    booster = xgb.train(booster_params, train_matrix, num_boost_round=300)\n",
"    # The return value must be the prediction for X_test.\n",
"    return booster.predict(xgb.DMatrix(X_test))\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def lgb_model(X_train, y_train, X_test, y_test,**parameters):\n",
"    \"\"\"Train a LightGBM booster and return its predictions for X_test.\n",
"\n",
"    The hyper-parameters are left open: every extra keyword argument is\n",
"    collected into `parameters` and passed to lgb.train as the params dict.\n",
"    y_test is accepted for the required signature but is not used here.\n",
"    \"\"\"\n",
"    # `**parameters` is always a dict (empty when no keywords are given), so the\n",
"    # former `if parameters is None` guard could never fire and has been removed.\n",
"    lgb_train = lgb.Dataset(X_train, y_train)\n",
"    model = lgb.train(params=parameters, train_set=lgb_train,num_boost_round=300)\n",
"    predict = model.predict(X_test)\n",
"    return predict\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def rf_model(X_train, y_train, X_test, y_test):\n",
"    \"\"\"Fit a random forest on the training data and return class probabilities for X_test.\n",
"\n",
"    y_test is part of the required signature but is not used here.\n",
"    \"\"\"\n",
"    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)\n",
"    forest.fit(X_train, y_train)\n",
"    return forest.predict_proba(X_test)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "0715cf6e",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## 构建和训练模型"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 52,
|
|
|
|
|
"id": "78ab0083",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"params = {\"objective\": \"multiclass\",\n",
|
|
|
|
|
" \"num_class\": 7,\n",
|
|
|
|
|
" \"n_jobs\": -1,\n",
|
|
|
|
|
" \"verbose\": -4, \n",
|
|
|
|
|
" \"metric\": (\"multi_logloss\",)}\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"model_xgb = Classifier(dataset=dataset, estimator=xgb_model, name='xgb',use_cache=False)\n",
|
|
|
|
|
"model_lgb = Classifier(dataset=dataset, estimator=lgb_model, name='lgb',parameters=params,use_cache=False)\n",
|
|
|
|
|
"model_rf = Classifier(dataset=dataset, estimator=rf_model,name='rf',use_cache=False)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"pipeline = ModelsPipeline(model_xgb, model_lgb, model_rf)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 53,
|
|
|
|
|
"id": "173ef0f0",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Best Score (log_loss): 0.18744137777851164\n",
|
|
|
|
|
"Best Weights: [0.36556831 0.00303401 0.63139768]\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"array([0.36556831, 0.00303401, 0.63139768])"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 53,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"pipeline.find_weights(scorer=log_loss, ) # 输出最优权重组合"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 55,
|
|
|
|
|
"id": "80726d19",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# 5折训练构建5折模型特征集,这里比较耗时\n",
|
|
|
|
|
"stack_ds = pipeline.stack(k=5,stratify=False,seed=42,full_test=False) # full_test=False: per the heamy docs, the test-set features are the mean of the fold models' predictions; full_test=True would instead predict the test set from a fit on the full training data"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 56,
|
|
|
|
|
"id": "b25bba3c",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
" xgb_0 xgb_1 xgb_2 xgb_3 xgb_4 \\\n",
|
|
|
|
|
"0 0.177179 0.818728 2.185222e-07 9.264143e-09 4.090067e-03 \n",
|
|
|
|
|
"1 0.005155 0.994845 7.055579e-10 1.326343e-08 6.331572e-09 \n",
|
|
|
|
|
"2 0.293492 0.706508 3.650662e-10 1.017633e-09 8.823530e-09 \n",
|
|
|
|
|
"3 0.478112 0.521816 3.207779e-06 2.878019e-08 1.076500e-08 \n",
|
|
|
|
|
"4 0.992430 0.006652 1.233117e-05 1.887496e-07 1.569583e-06 \n",
|
|
|
|
|
"... ... ... ... ... ... \n",
|
|
|
|
|
"435754 0.988518 0.011477 3.190797e-09 5.645121e-08 2.940739e-09 \n",
|
|
|
|
|
"435755 0.969212 0.030723 2.142020e-08 1.572054e-05 4.321913e-07 \n",
|
|
|
|
|
"435756 0.415850 0.584142 4.283793e-08 7.367601e-08 6.148067e-07 \n",
|
|
|
|
|
"435757 0.602601 0.397399 6.606462e-10 1.015894e-09 7.221973e-08 \n",
|
|
|
|
|
"435758 0.834587 0.165411 3.267833e-09 2.057172e-08 2.078704e-08 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" xgb_5 xgb_6 lgb_0 lgb_1 lgb_2 ... \\\n",
|
|
|
|
|
"0 1.725062e-06 1.048052e-06 0.172406 0.812678 1.416886e-06 ... \n",
|
|
|
|
|
"1 1.435787e-09 1.603579e-10 0.008114 0.991886 0.000000e+00 ... \n",
|
|
|
|
|
"2 6.384080e-10 2.823794e-08 0.817627 0.182372 0.000000e+00 ... \n",
|
|
|
|
|
"3 2.230641e-06 6.630235e-05 0.465733 0.534184 0.000000e+00 ... \n",
|
|
|
|
|
"4 5.604260e-07 9.037877e-04 0.932050 0.043451 0.000000e+00 ... \n",
|
|
|
|
|
"... ... ... ... ... ... ... \n",
|
|
|
|
|
"435754 1.530261e-08 4.466830e-06 0.970593 0.029399 0.000000e+00 ... \n",
|
|
|
|
|
"435755 5.574208e-10 4.977021e-05 0.862591 0.136644 0.000000e+00 ... \n",
|
|
|
|
|
"435756 2.371389e-06 5.185283e-06 0.466886 0.533039 0.000000e+00 ... \n",
|
|
|
|
|
"435757 2.326313e-09 9.193871e-09 0.674250 0.325750 4.092880e-211 ... \n",
|
|
|
|
|
"435758 5.976972e-08 2.204258e-06 0.709320 0.290680 0.000000e+00 ... \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" lgb_4 lgb_5 lgb_6 rf_0 rf_1 rf_2 rf_3 \\\n",
|
|
|
|
|
"0 1.486358e-02 5.093522e-05 6.805300e-08 0.06 0.92 0.0 0.0 \n",
|
|
|
|
|
"1 0.000000e+00 0.000000e+00 0.000000e+00 0.12 0.88 0.0 0.0 \n",
|
|
|
|
|
"2 4.452850e-07 7.825338e-09 1.012052e-07 0.63 0.37 0.0 0.0 \n",
|
|
|
|
|
"3 0.000000e+00 0.000000e+00 8.245405e-05 0.56 0.44 0.0 0.0 \n",
|
|
|
|
|
"4 0.000000e+00 0.000000e+00 2.449972e-02 0.95 0.04 0.0 0.0 \n",
|
|
|
|
|
"... ... ... ... ... ... ... ... \n",
|
|
|
|
|
"435754 0.000000e+00 0.000000e+00 7.871809e-06 0.97 0.03 0.0 0.0 \n",
|
|
|
|
|
"435755 0.000000e+00 0.000000e+00 7.647430e-04 0.93 0.06 0.0 0.0 \n",
|
|
|
|
|
"435756 0.000000e+00 0.000000e+00 7.493861e-05 0.45 0.55 0.0 0.0 \n",
|
|
|
|
|
"435757 0.000000e+00 0.000000e+00 0.000000e+00 0.52 0.48 0.0 0.0 \n",
|
|
|
|
|
"435758 0.000000e+00 0.000000e+00 0.000000e+00 0.87 0.13 0.0 0.0 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" rf_4 rf_5 rf_6 \n",
|
|
|
|
|
"0 0.02 0.0 0.00 \n",
|
|
|
|
|
"1 0.00 0.0 0.00 \n",
|
|
|
|
|
"2 0.00 0.0 0.00 \n",
|
|
|
|
|
"3 0.00 0.0 0.00 \n",
|
|
|
|
|
"4 0.00 0.0 0.01 \n",
|
|
|
|
|
"... ... ... ... \n",
|
|
|
|
|
"435754 0.00 0.0 0.00 \n",
|
|
|
|
|
"435755 0.00 0.0 0.01 \n",
|
|
|
|
|
"435756 0.00 0.0 0.00 \n",
|
|
|
|
|
"435757 0.00 0.0 0.00 \n",
|
|
|
|
|
"435758 0.00 0.0 0.00 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
"[435759 rows x 21 columns]\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Second-level training set: each base model contributes 7 columns, one predicted probability per class (21 columns in total)\n",
|
|
|
|
|
"print(stack_ds.X_train)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 59,
|
|
|
|
|
"id": "835205e9",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
" xgb_0 xgb_1 xgb_2 xgb_3 xgb_4 \\\n",
|
|
|
|
|
"0 9.876224e-01 0.000789 2.774616e-06 4.129093e-07 1.311387e-06 \n",
|
|
|
|
|
"1 5.139124e-02 0.929659 1.852793e-03 1.518293e-07 1.692924e-02 \n",
|
|
|
|
|
"2 7.695035e-04 0.973729 6.878623e-04 1.573823e-07 2.408167e-02 \n",
|
|
|
|
|
"3 3.376913e-02 0.966229 2.024872e-07 7.321523e-08 1.071163e-06 \n",
|
|
|
|
|
"4 1.013981e-03 0.998553 3.794874e-06 8.755425e-08 4.243054e-04 \n",
|
|
|
|
|
"... ... ... ... ... ... \n",
|
|
|
|
|
"145248 9.615189e-01 0.038480 6.486028e-08 1.744931e-08 1.069370e-06 \n",
|
|
|
|
|
"145249 3.055384e-02 0.969440 2.475371e-07 5.530033e-08 4.299908e-06 \n",
|
|
|
|
|
"145250 8.224608e-06 0.058361 9.212288e-01 9.705171e-08 5.440121e-05 \n",
|
|
|
|
|
"145251 9.183387e-01 0.081601 5.612090e-08 1.088283e-08 5.225256e-07 \n",
|
|
|
|
|
"145252 9.203915e-07 0.003578 2.372825e-01 1.582836e-06 3.307252e-07 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" xgb_5 xgb_6 lgb_0 lgb_1 lgb_2 ... \\\n",
|
|
|
|
|
"0 7.851924e-09 1.158422e-02 0.962538 0.004222 9.599869e-23 ... \n",
|
|
|
|
|
"1 1.613036e-04 6.763449e-06 0.070947 0.882463 2.232464e-03 ... \n",
|
|
|
|
|
"2 7.296442e-04 1.884172e-06 0.004029 0.945838 1.014722e-02 ... \n",
|
|
|
|
|
"3 7.227585e-08 1.448203e-09 0.066538 0.933450 1.206630e-06 ... \n",
|
|
|
|
|
"4 4.374311e-06 4.566837e-09 0.001334 0.997391 1.580417e-06 ... \n",
|
|
|
|
|
"... ... ... ... ... ... ... \n",
|
|
|
|
|
"145248 5.049759e-08 4.010809e-07 0.917842 0.082153 2.154302e-17 ... \n",
|
|
|
|
|
"145249 1.255851e-08 1.224208e-06 0.058622 0.941370 1.332795e-12 ... \n",
|
|
|
|
|
"145250 2.034389e-02 3.132630e-06 0.000268 0.083680 8.789707e-01 ... \n",
|
|
|
|
|
"145251 2.566383e-07 5.976933e-05 0.875834 0.123030 2.631276e-12 ... \n",
|
|
|
|
|
"145252 7.591362e-01 6.988637e-08 0.000032 0.037757 2.462795e-01 ... \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" lgb_4 lgb_5 lgb_6 rf_0 rf_1 rf_2 rf_3 \\\n",
|
|
|
|
|
"0 1.726260e-240 0.000000e+00 3.324009e-02 0.984 0.000 0.000 0.0 \n",
|
|
|
|
|
"1 4.396802e-02 3.888647e-04 1.077146e-08 0.106 0.816 0.038 0.0 \n",
|
|
|
|
|
"2 3.329283e-02 6.693269e-03 3.213404e-09 0.008 0.950 0.002 0.0 \n",
|
|
|
|
|
"3 9.908823e-06 1.371448e-07 1.280235e-09 0.078 0.922 0.000 0.0 \n",
|
|
|
|
|
"4 1.273086e-03 7.811306e-07 5.594472e-10 0.004 0.988 0.000 0.0 \n",
|
|
|
|
|
"... ... ... ... ... ... ... ... \n",
|
|
|
|
|
"145248 6.191352e-08 0.000000e+00 4.958509e-06 0.968 0.032 0.000 0.0 \n",
|
|
|
|
|
"145249 7.656931e-06 3.415083e-47 2.271880e-07 0.018 0.972 0.000 0.0 \n",
|
|
|
|
|
"145250 2.052535e-04 3.687570e-02 1.393421e-09 0.000 0.040 0.946 0.0 \n",
|
|
|
|
|
"145251 2.521124e-07 5.749375e-08 1.135236e-03 0.992 0.008 0.000 0.0 \n",
|
|
|
|
|
"145252 2.927400e-06 7.159244e-01 8.608624e-140 0.000 0.018 0.110 0.0 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" rf_4 rf_5 rf_6 \n",
|
|
|
|
|
"0 0.000 0.000 0.016 \n",
|
|
|
|
|
"1 0.034 0.006 0.000 \n",
|
|
|
|
|
"2 0.032 0.008 0.000 \n",
|
|
|
|
|
"3 0.000 0.000 0.000 \n",
|
|
|
|
|
"4 0.008 0.000 0.000 \n",
|
|
|
|
|
"... ... ... ... \n",
|
|
|
|
|
"145248 0.000 0.000 0.000 \n",
|
|
|
|
|
"145249 0.010 0.000 0.000 \n",
|
|
|
|
|
"145250 0.000 0.014 0.000 \n",
|
|
|
|
|
"145251 0.000 0.000 0.000 \n",
|
|
|
|
|
"145252 0.000 0.872 0.000 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
"[145253 rows x 21 columns]\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Second-level test set: each base model contributes 7 columns, one predicted probability per class (21 columns in total)\n",
|
|
|
|
|
"print(stack_ds.X_test) "
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 60,
|
|
|
|
|
"id": "b9db35dc",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# 用lr做最后一层\n",
|
|
|
|
|
"stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={\"solver\": 'lbfgs', \"max_iter\": 1000},use_cache=False)\n",
|
|
|
|
|
"predict_stack = stacker.predict()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 64,
|
|
|
|
|
"id": "a4a48219",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"[[9.95173402e-01 2.67623709e-03 4.23846755e-08 ... 3.15435935e-05\n",
|
|
|
|
|
" 5.66194220e-06 2.11044140e-03]\n",
|
|
|
|
|
" [2.23612439e-02 9.70927685e-01 1.23929922e-03 ... 4.49727904e-03\n",
|
|
|
|
|
" 8.73983383e-04 9.97020226e-05]\n",
|
|
|
|
|
" [6.22588197e-03 9.89402233e-01 9.81655972e-04 ... 2.83331258e-03\n",
|
|
|
|
|
" 5.22139184e-04 3.45071569e-05]\n",
|
|
|
|
|
" ...\n",
|
|
|
|
|
" [5.36335125e-06 2.06267200e-03 9.90604140e-01 ... 8.55252386e-04\n",
|
|
|
|
|
" 4.18405061e-03 1.64678945e-05]\n",
|
|
|
|
|
" [9.96602824e-01 2.15991442e-03 7.27481581e-08 ... 3.63552051e-05\n",
|
|
|
|
|
" 6.80942632e-06 1.19199377e-03]\n",
|
|
|
|
|
" [5.89156494e-05 1.15333400e-03 1.09178439e-02 ... 3.09244417e-04\n",
|
|
|
|
|
" 9.85167196e-01 2.21261408e-05]]\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"print(predict_stack) # stacking后的结果"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "1372d4f8",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## 验证结果"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "52ef71d4",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"### 单模分数"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 65,
|
|
|
|
|
"id": "a28806a0",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"0.9284696357390209\n",
|
|
|
|
|
"0.8890005714167694\n",
|
|
|
|
|
"0.9511404239499356\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Hold-out accuracy of each base model. Every model owns 7 adjacent probability\n",
"# columns of stack_ds.X_test, in pipeline order: XGB, LGB, RF.\n",
"for first_col in (0, 7, 14):\n",
"    base_proba = stack_ds.X_test.iloc[:, first_col:first_col + 7].values\n",
"    # accuracy_score takes (y_true, y_pred); the score is identical either way.\n",
"    print(accuracy_score(y_test, np.argmax(base_proba, axis=1)))"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "2e9423ce",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"### 线性加权分数"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 70,
|
|
|
|
|
"id": "d2b50ba4",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"主观根据结果blending: 0.9425209806337908\n",
|
|
|
|
|
"根据最优权重的blending: 0.9488616414118813\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Blending (linear weighting) scores\n",
"xgb_t = stack_ds.X_test.iloc[:, :7].values\n",
"lgb_t = stack_ds.X_test.iloc[:, 7:14].values\n",
"rf_t = stack_ds.X_test.iloc[:, 14:].values\n",
"\n",
"# Hand-picked weights, chosen subjectively from how well each single model scored\n",
"result = 0.3*xgb_t+0.2*lgb_t+0.5*rf_t\n",
"print('主观根据结果blending:', accuracy_score(y_test, np.argmax(result, axis=1)))\n",
"# Weights copied from the find_weights output -- Best Weights: [0.36556831 0.00303401 0.63139768]\n",
"result = 0.36556831*xgb_t+0.00303401*lgb_t+0.63139768*rf_t\n",
"print('根据最优权重的blending:',accuracy_score(y_test, np.argmax(result, axis=1)))"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "dfec8968",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"可以观察到最优权重比我们主观选权重更优"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "e8daf1e3",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"### stacking的分数"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 71,
|
|
|
|
|
"id": "4930b407",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"0.957439777491687\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Hold-out accuracy of the stacked model; accuracy_score takes (y_true, y_pred)\n",
"print(accuracy_score(y_test, np.argmax(predict_stack, axis=1)))"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "6311fbd7",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## 再说结论,该数据集(fetch_covtype)Stacking的方法更好"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
"display_name": "Python 3",
|
|
|
|
|
"language": "python",
|
|
|
|
|
"name": "python3"
|
|
|
|
|
},
|
|
|
|
|
"language_info": {
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
"name": "ipython",
|
|
|
|
|
"version": 3
|
|
|
|
|
},
|
|
|
|
|
"file_extension": ".py",
|
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
|
"name": "python",
|
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
|
"version": "3.8.8"
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
"nbformat_minor": 5
|
|
|
|
|
}
|