diff --git a/竞赛优胜技巧/.ipynb_checkpoints/Stacking-checkpoint.ipynb b/竞赛优胜技巧/.ipynb_checkpoints/Stacking-checkpoint.ipynb
new file mode 100644
index 0000000..ef7f238
--- /dev/null
+++ b/竞赛优胜技巧/.ipynb_checkpoints/Stacking-checkpoint.ipynb
@@ -0,0 +1,677 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "720f2b62",
+ "metadata": {},
+ "source": [
+ "# Stacking"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0b365a02",
+ "metadata": {},
+ "source": [
+ "## 先说结论,该数据集(fetch_covtype)Stacking的方法比线性加权更好\n",
+ "比赛中我们常用线性加权作为最终的融合方式,我们同样也会好奇怎样的线性加权权重更好,下面也会举例子\n",
+ "参考:https://github.com/rushter/heamy/tree/master/examples"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cc8fecb1",
+ "metadata": {},
+ "source": [
+ "通过对训练集做五折交叉验证,把各基模型的折外(out-of-fold)预测作为第二层模型的训练特征;基模型对测试集的预测则作为第二层的测试特征\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "18a12000",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n",
+ "Collecting heamy\n",
+ " Downloading https://pypi.tuna.tsinghua.edu.cn/packages/20/32/2f3e1efa38a8e34f790d90b6d49ef06ab812181ae896c50e89b8750fa5a0/heamy-0.0.7.tar.gz (30 kB)\n",
+ "Requirement already satisfied: scikit-learn>=0.17.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (0.24.1)\n",
+ "Requirement already satisfied: pandas>=0.17.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.2.4)\n",
+ "Requirement already satisfied: six>=1.10.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.15.0)\n",
+ "Requirement already satisfied: scipy>=0.16.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.6.2)\n",
+ "Requirement already satisfied: numpy>=1.7.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.19.5)\n",
+ "Requirement already satisfied: pytz>=2017.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=0.17.0->heamy) (2021.1)\n",
+ "Requirement already satisfied: python-dateutil>=2.7.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=0.17.0->heamy) (2.8.1)\n",
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from scikit-learn>=0.17.0->heamy) (2.1.0)\n",
+ "Requirement already satisfied: joblib>=0.11 in d:\\programdata\\anaconda3\\lib\\site-packages (from scikit-learn>=0.17.0->heamy) (1.0.1)\n",
+ "Building wheels for collected packages: heamy\n",
+ " Building wheel for heamy (setup.py): started\n",
+ " Building wheel for heamy (setup.py): finished with status 'done'\n",
+ " Created wheel for heamy: filename=heamy-0.0.7-py2.py3-none-any.whl size=15353 sha256=e3ba65b34e2bdee3b90b45b637e28836afdbdb0c9547f76b36fe10d17f8aba8f\n",
+ " Stored in directory: c:\\users\\administrator\\appdata\\local\\pip\\cache\\wheels\\6e\\f1\\7d\\048e558da94f495a0ed0d9c09d312e73eb176a092e36774ec2\n",
+ "Successfully built heamy\n",
+ "Installing collected packages: heamy\n",
+ "Successfully installed heamy-0.0.7\n",
+ "Note: you may need to restart the kernel to use updated packages.\n"
+ ]
+ }
+ ],
+ "source": [
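+ "# Install heamy into the kernel's environment; pinned to the version this notebook was run with\n",
+ "%pip install heamy==0.0.7"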
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "69632c6a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import sys\n",
+ "print(sys.version) # 版本信息"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "id": "ca421279",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import time\n",
+ "\n",
+ "from heamy.dataset import Dataset\n",
+ "from heamy.estimator import Classifier \n",
+ "from heamy.pipeline import ModelsPipeline\n",
+ "# 导入相关模型,没有的pip install xxx 即可\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "import xgboost as xgb \n",
+ "import lightgbm as lgb \n",
+ "\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from sklearn.preprocessing import OrdinalEncoder\n",
+ "from sklearn.metrics import log_loss"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2592fbbd",
+ "metadata": {},
+ "source": [
+ "## 准备数据集"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "9a0fabe1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.datasets import fetch_covtype\n",
+ "data = fetch_covtype()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "5bd75178",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "七分类任务,处理前: [1 2 3 4 5 6 7]\n",
+ "[5 5 2 ... 3 3 3]\n",
+ "七分类任务,处理后: [0. 1. 2. 3. 4. 5. 6.]\n",
+ "[4. 4. 1. ... 2. 2. 2.]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 预处理\n",
+ "X, y = data['data'], data['target']\n",
+ "# 由于模型标签需要从0开始,所以数字需要全部减1\n",
+ "print('七分类任务,处理前:',np.unique(y))\n",
+ "print(y)\n",
+ "label_encoder = OrdinalEncoder()  # named to avoid shadowing the built-in ord()\n",
+ "y = label_encoder.fit_transform(y.reshape(-1, 1))\n",
+ "y = y.reshape(-1, )\n",
+ "print('七分类任务,处理后:',np.unique(y))\n",
+ "print(y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "23d9778c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(435759, 54)\n",
+ "(145253, 54)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 切分训练和测试集\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)\n",
+ "print(X_train.shape)\n",
+ "print(X_test.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "eac48668",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Dataset(5c3ccfb5c81451d098565ef5e7e36ac5)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 创建数据集\n",
+ "'''use_cache : bool, default True\n",
+ "    If use_cache=True then preprocessing step will be cached until function code is changed.'''\n",
+ "dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test, y_test=None,use_cache=True) # 注意这里的y_test=None,即不存在数据泄露\n",
+ "print(dataset)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "fba3f975",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[2.833e+03, 2.580e+02, 2.600e+01, ..., 0.000e+00, 0.000e+00,\n",
+ " 0.000e+00],\n",
+ " [3.008e+03, 4.500e+01, 2.000e+00, ..., 0.000e+00, 0.000e+00,\n",
+ " 0.000e+00],\n",
+ " [2.949e+03, 0.000e+00, 1.100e+01, ..., 0.000e+00, 0.000e+00,\n",
+ " 0.000e+00],\n",
+ " ...,\n",
+ " [3.153e+03, 2.870e+02, 1.700e+01, ..., 0.000e+00, 0.000e+00,\n",
+ " 0.000e+00],\n",
+ " [3.065e+03, 3.480e+02, 2.100e+01, ..., 0.000e+00, 0.000e+00,\n",
+ " 0.000e+00],\n",
+ " [3.021e+03, 2.600e+01, 1.600e+01, ..., 0.000e+00, 0.000e+00,\n",
+ " 0.000e+00]])"
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 处理后的数据集\n",
+ "dataset.X_train"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d4517ea1",
+ "metadata": {},
+ "source": [
+ "## 定义模型"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "e8393e73",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def xgb_model(X_train, y_train, X_test, y_test):\n",
+ " \"\"\"参数必须为X_train,y_train,X_test,y_test\"\"\"\n",
+ " # 可以内置参数\n",
+ " params = {'objective': 'multi:softprob',\n",
+ " \"eval_metric\": 'mlogloss',\n",
+ " \"verbosity\": 0,\n",
+ " 'num_class': 7,\n",
+ " 'nthread': -1}\n",
+ " dtrain = xgb.DMatrix(X_train, y_train)\n",
+ " dtest = xgb.DMatrix(X_test)\n",
+ " model = xgb.train(params, dtrain, num_boost_round=300)\n",
+ " predict = model.predict(dtest)\n",
+ " return predict # 返回值必须为X_test的预测\n",
+ "\n",
+ "\n",
+ "def lgb_model(X_train, y_train, X_test, y_test,**parameters):\n",
+ "    # Parameters can also be exposed through **parameters; it collects keyword arguments, so it is always a dict (possibly empty) and needs no None check\n",
+ " lgb_train = lgb.Dataset(X_train, y_train)\n",
+ " model = lgb.train(params=parameters, train_set=lgb_train,num_boost_round=300)\n",
+ " predict = model.predict(X_test)\n",
+ " return predict\n",
+ "\n",
+ "\n",
+ "def rf_model(X_train, y_train, X_test, y_test):\n",
+ " params = {\"n_estimators\": 100, \"n_jobs\": -1}\n",
+ " model = RandomForestClassifier(**params).fit(X_train, y_train)\n",
+ " predict = model.predict_proba(X_test)\n",
+ " return predict"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0715cf6e",
+ "metadata": {},
+ "source": [
+ "## 构建和训练模型"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "id": "78ab0083",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = {\"objective\": \"multiclass\",\n",
+ " \"num_class\": 7,\n",
+ " \"n_jobs\": -1,\n",
+ " \"verbose\": -4, \n",
+ " \"metric\": (\"multi_logloss\",)}\n",
+ "\n",
+ "model_xgb = Classifier(dataset=dataset, estimator=xgb_model, name='xgb',use_cache=False)\n",
+ "model_lgb = Classifier(dataset=dataset, estimator=lgb_model, name='lgb',parameters=params,use_cache=False)\n",
+ "model_rf = Classifier(dataset=dataset, estimator=rf_model,name='rf',use_cache=False)\n",
+ "\n",
+ "pipeline = ModelsPipeline(model_xgb, model_lgb, model_rf)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "173ef0f0",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Best Score (log_loss): 0.18744137777851164\n",
+ "Best Weights: [0.36556831 0.00303401 0.63139768]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([0.36556831, 0.00303401, 0.63139768])"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pipeline.find_weights(scorer=log_loss, ) # 输出最优权重组合"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "80726d19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
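+ "# Build the second-level dataset with 5-fold stacking (slow: every base model is trained once per fold)\n",
+ "stack_ds = pipeline.stack(k=5,stratify=False,seed=42,full_test=False) # full_test=False: test-set features are the mean of the fold models' predictions; True: refit on the full training set to predict the test set"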
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "b25bba3c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " xgb_0 xgb_1 xgb_2 xgb_3 xgb_4 \\\n",
+ "0 0.177179 0.818728 2.185222e-07 9.264143e-09 4.090067e-03 \n",
+ "1 0.005155 0.994845 7.055579e-10 1.326343e-08 6.331572e-09 \n",
+ "2 0.293492 0.706508 3.650662e-10 1.017633e-09 8.823530e-09 \n",
+ "3 0.478112 0.521816 3.207779e-06 2.878019e-08 1.076500e-08 \n",
+ "4 0.992430 0.006652 1.233117e-05 1.887496e-07 1.569583e-06 \n",
+ "... ... ... ... ... ... \n",
+ "435754 0.988518 0.011477 3.190797e-09 5.645121e-08 2.940739e-09 \n",
+ "435755 0.969212 0.030723 2.142020e-08 1.572054e-05 4.321913e-07 \n",
+ "435756 0.415850 0.584142 4.283793e-08 7.367601e-08 6.148067e-07 \n",
+ "435757 0.602601 0.397399 6.606462e-10 1.015894e-09 7.221973e-08 \n",
+ "435758 0.834587 0.165411 3.267833e-09 2.057172e-08 2.078704e-08 \n",
+ "\n",
+ " xgb_5 xgb_6 lgb_0 lgb_1 lgb_2 ... \\\n",
+ "0 1.725062e-06 1.048052e-06 0.172406 0.812678 1.416886e-06 ... \n",
+ "1 1.435787e-09 1.603579e-10 0.008114 0.991886 0.000000e+00 ... \n",
+ "2 6.384080e-10 2.823794e-08 0.817627 0.182372 0.000000e+00 ... \n",
+ "3 2.230641e-06 6.630235e-05 0.465733 0.534184 0.000000e+00 ... \n",
+ "4 5.604260e-07 9.037877e-04 0.932050 0.043451 0.000000e+00 ... \n",
+ "... ... ... ... ... ... ... \n",
+ "435754 1.530261e-08 4.466830e-06 0.970593 0.029399 0.000000e+00 ... \n",
+ "435755 5.574208e-10 4.977021e-05 0.862591 0.136644 0.000000e+00 ... \n",
+ "435756 2.371389e-06 5.185283e-06 0.466886 0.533039 0.000000e+00 ... \n",
+ "435757 2.326313e-09 9.193871e-09 0.674250 0.325750 4.092880e-211 ... \n",
+ "435758 5.976972e-08 2.204258e-06 0.709320 0.290680 0.000000e+00 ... \n",
+ "\n",
+ " lgb_4 lgb_5 lgb_6 rf_0 rf_1 rf_2 rf_3 \\\n",
+ "0 1.486358e-02 5.093522e-05 6.805300e-08 0.06 0.92 0.0 0.0 \n",
+ "1 0.000000e+00 0.000000e+00 0.000000e+00 0.12 0.88 0.0 0.0 \n",
+ "2 4.452850e-07 7.825338e-09 1.012052e-07 0.63 0.37 0.0 0.0 \n",
+ "3 0.000000e+00 0.000000e+00 8.245405e-05 0.56 0.44 0.0 0.0 \n",
+ "4 0.000000e+00 0.000000e+00 2.449972e-02 0.95 0.04 0.0 0.0 \n",
+ "... ... ... ... ... ... ... ... \n",
+ "435754 0.000000e+00 0.000000e+00 7.871809e-06 0.97 0.03 0.0 0.0 \n",
+ "435755 0.000000e+00 0.000000e+00 7.647430e-04 0.93 0.06 0.0 0.0 \n",
+ "435756 0.000000e+00 0.000000e+00 7.493861e-05 0.45 0.55 0.0 0.0 \n",
+ "435757 0.000000e+00 0.000000e+00 0.000000e+00 0.52 0.48 0.0 0.0 \n",
+ "435758 0.000000e+00 0.000000e+00 0.000000e+00 0.87 0.13 0.0 0.0 \n",
+ "\n",
+ " rf_4 rf_5 rf_6 \n",
+ "0 0.02 0.0 0.00 \n",
+ "1 0.00 0.0 0.00 \n",
+ "2 0.00 0.0 0.00 \n",
+ "3 0.00 0.0 0.00 \n",
+ "4 0.00 0.0 0.01 \n",
+ "... ... ... ... \n",
+ "435754 0.00 0.0 0.00 \n",
+ "435755 0.00 0.0 0.01 \n",
+ "435756 0.00 0.0 0.00 \n",
+ "435757 0.00 0.0 0.00 \n",
+ "435758 0.00 0.0 0.00 \n",
+ "\n",
+ "[435759 rows x 21 columns]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Second-level training features: 3 base models x 7 class probabilities = 21 columns\n",
+ "print(stack_ds.X_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "id": "835205e9",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " xgb_0 xgb_1 xgb_2 xgb_3 xgb_4 \\\n",
+ "0 9.876224e-01 0.000789 2.774616e-06 4.129093e-07 1.311387e-06 \n",
+ "1 5.139124e-02 0.929659 1.852793e-03 1.518293e-07 1.692924e-02 \n",
+ "2 7.695035e-04 0.973729 6.878623e-04 1.573823e-07 2.408167e-02 \n",
+ "3 3.376913e-02 0.966229 2.024872e-07 7.321523e-08 1.071163e-06 \n",
+ "4 1.013981e-03 0.998553 3.794874e-06 8.755425e-08 4.243054e-04 \n",
+ "... ... ... ... ... ... \n",
+ "145248 9.615189e-01 0.038480 6.486028e-08 1.744931e-08 1.069370e-06 \n",
+ "145249 3.055384e-02 0.969440 2.475371e-07 5.530033e-08 4.299908e-06 \n",
+ "145250 8.224608e-06 0.058361 9.212288e-01 9.705171e-08 5.440121e-05 \n",
+ "145251 9.183387e-01 0.081601 5.612090e-08 1.088283e-08 5.225256e-07 \n",
+ "145252 9.203915e-07 0.003578 2.372825e-01 1.582836e-06 3.307252e-07 \n",
+ "\n",
+ " xgb_5 xgb_6 lgb_0 lgb_1 lgb_2 ... \\\n",
+ "0 7.851924e-09 1.158422e-02 0.962538 0.004222 9.599869e-23 ... \n",
+ "1 1.613036e-04 6.763449e-06 0.070947 0.882463 2.232464e-03 ... \n",
+ "2 7.296442e-04 1.884172e-06 0.004029 0.945838 1.014722e-02 ... \n",
+ "3 7.227585e-08 1.448203e-09 0.066538 0.933450 1.206630e-06 ... \n",
+ "4 4.374311e-06 4.566837e-09 0.001334 0.997391 1.580417e-06 ... \n",
+ "... ... ... ... ... ... ... \n",
+ "145248 5.049759e-08 4.010809e-07 0.917842 0.082153 2.154302e-17 ... \n",
+ "145249 1.255851e-08 1.224208e-06 0.058622 0.941370 1.332795e-12 ... \n",
+ "145250 2.034389e-02 3.132630e-06 0.000268 0.083680 8.789707e-01 ... \n",
+ "145251 2.566383e-07 5.976933e-05 0.875834 0.123030 2.631276e-12 ... \n",
+ "145252 7.591362e-01 6.988637e-08 0.000032 0.037757 2.462795e-01 ... \n",
+ "\n",
+ " lgb_4 lgb_5 lgb_6 rf_0 rf_1 rf_2 rf_3 \\\n",
+ "0 1.726260e-240 0.000000e+00 3.324009e-02 0.984 0.000 0.000 0.0 \n",
+ "1 4.396802e-02 3.888647e-04 1.077146e-08 0.106 0.816 0.038 0.0 \n",
+ "2 3.329283e-02 6.693269e-03 3.213404e-09 0.008 0.950 0.002 0.0 \n",
+ "3 9.908823e-06 1.371448e-07 1.280235e-09 0.078 0.922 0.000 0.0 \n",
+ "4 1.273086e-03 7.811306e-07 5.594472e-10 0.004 0.988 0.000 0.0 \n",
+ "... ... ... ... ... ... ... ... \n",
+ "145248 6.191352e-08 0.000000e+00 4.958509e-06 0.968 0.032 0.000 0.0 \n",
+ "145249 7.656931e-06 3.415083e-47 2.271880e-07 0.018 0.972 0.000 0.0 \n",
+ "145250 2.052535e-04 3.687570e-02 1.393421e-09 0.000 0.040 0.946 0.0 \n",
+ "145251 2.521124e-07 5.749375e-08 1.135236e-03 0.992 0.008 0.000 0.0 \n",
+ "145252 2.927400e-06 7.159244e-01 8.608624e-140 0.000 0.018 0.110 0.0 \n",
+ "\n",
+ " rf_4 rf_5 rf_6 \n",
+ "0 0.000 0.000 0.016 \n",
+ "1 0.034 0.006 0.000 \n",
+ "2 0.032 0.008 0.000 \n",
+ "3 0.000 0.000 0.000 \n",
+ "4 0.008 0.000 0.000 \n",
+ "... ... ... ... \n",
+ "145248 0.000 0.000 0.000 \n",
+ "145249 0.010 0.000 0.000 \n",
+ "145250 0.000 0.014 0.000 \n",
+ "145251 0.000 0.000 0.000 \n",
+ "145252 0.000 0.872 0.000 \n",
+ "\n",
+ "[145253 rows x 21 columns]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Second-level test features: 3 base models x 7 class probabilities = 21 columns\n",
+ "print(stack_ds.X_test) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "id": "b9db35dc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 用lr做最后一层\n",
+ "stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={\"solver\": 'lbfgs', \"max_iter\": 1000},use_cache=False)\n",
+ "predict_stack = stacker.predict()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "id": "a4a48219",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[[9.95173402e-01 2.67623709e-03 4.23846755e-08 ... 3.15435935e-05\n",
+ " 5.66194220e-06 2.11044140e-03]\n",
+ " [2.23612439e-02 9.70927685e-01 1.23929922e-03 ... 4.49727904e-03\n",
+ " 8.73983383e-04 9.97020226e-05]\n",
+ " [6.22588197e-03 9.89402233e-01 9.81655972e-04 ... 2.83331258e-03\n",
+ " 5.22139184e-04 3.45071569e-05]\n",
+ " ...\n",
+ " [5.36335125e-06 2.06267200e-03 9.90604140e-01 ... 8.55252386e-04\n",
+ " 4.18405061e-03 1.64678945e-05]\n",
+ " [9.96602824e-01 2.15991442e-03 7.27481581e-08 ... 3.63552051e-05\n",
+ " 6.80942632e-06 1.19199377e-03]\n",
+ " [5.89156494e-05 1.15333400e-03 1.09178439e-02 ... 3.09244417e-04\n",
+ " 9.85167196e-01 2.21261408e-05]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(predict_stack) # stacking后的结果"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1372d4f8",
+ "metadata": {},
+ "source": [
+ "## 验证结果"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "52ef71d4",
+ "metadata": {},
+ "source": [
+ "### 单模分数"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "id": "a28806a0",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.9284696357390209\n",
+ "0.8890005714167694\n",
+ "0.9511404239499356\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(accuracy_score(np.argmax(stack_ds.X_test.iloc[:, :7].values, axis=1),y_test)) # XGB\n",
+ "print(accuracy_score(np.argmax(stack_ds.X_test.iloc[:, 7:14].values, axis=1),y_test)) # LGB\n",
+ "print(accuracy_score(np.argmax(stack_ds.X_test.iloc[:, 14:].values, axis=1),y_test)) # RF"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2e9423ce",
+ "metadata": {},
+ "source": [
+ "### 线性加权分数"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "id": "d2b50ba4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "主观根据结果blending: 0.9425209806337908\n",
+ "根据最优权重的blending: 0.9488616414118813\n"
+ ]
+ }
+ ],
+ "source": [
+ "# blending的分数\n",
+ "xgb_t = stack_ds.X_test.iloc[:, :7].values\n",
+ "lgb_t = stack_ds.X_test.iloc[:, 7:14].values\n",
+ "rf_t = stack_ds.X_test.iloc[:, 14:].values\n",
+ "\n",
+ "# 根据分数好坏随机定\n",
+ "result = 0.3*xgb_t+0.2*lgb_t+0.5*rf_t\n",
+ "print('主观根据结果blending:', accuracy_score(np.argmax(result, axis=1), y_test))\n",
+ "# 根据上面提供的最优权重 Best Weights: [0.36556831 0.00303401 0.63139768]\n",
+ "result = 0.36556831*xgb_t+0.00303401*lgb_t+0.63139768*rf_t\n",
+ "print('根据最优权重的blending:',accuracy_score(np.argmax(result, axis=1), y_test))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dfec8968",
+ "metadata": {},
+ "source": [
+ "可以观察到最优权重比我们主观选权重更优"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e8daf1e3",
+ "metadata": {},
+ "source": [
+ "### stacking的分数"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "id": "4930b407",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.957439777491687\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(accuracy_score(np.argmax(predict_stack, axis=1), y_test))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6311fbd7",
+ "metadata": {},
+ "source": [
+ "## 再说结论,该数据集(fetch_covtype)Stacking的方法更好"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/竞赛优胜技巧/Stacking.ipynb b/竞赛优胜技巧/Stacking.ipynb
new file mode 100644
index 0000000..ef7f238
--- /dev/null
+++ b/竞赛优胜技巧/Stacking.ipynb
@@ -0,0 +1,677 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "720f2b62",
+ "metadata": {},
+ "source": [
+ "# Stacking"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0b365a02",
+ "metadata": {},
+ "source": [
+ "## 先说结论,该数据集(fetch_covtype)Stacking的方法比线性加权更好\n",
+ "比赛中我们常用线性加权作为最终的融合方式,我们同样也会好奇怎样的线性加权权重更好,下面也会举例子\n",
+ "参考:https://github.com/rushter/heamy/tree/master/examples"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cc8fecb1",
+ "metadata": {},
+ "source": [
+ "通过对训练集做五折交叉验证,把各基模型的折外(out-of-fold)预测作为第二层模型的训练特征;基模型对测试集的预测则作为第二层的测试特征\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "18a12000",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n",
+ "Collecting heamy\n",
+ " Downloading https://pypi.tuna.tsinghua.edu.cn/packages/20/32/2f3e1efa38a8e34f790d90b6d49ef06ab812181ae896c50e89b8750fa5a0/heamy-0.0.7.tar.gz (30 kB)\n",
+ "Requirement already satisfied: scikit-learn>=0.17.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (0.24.1)\n",
+ "Requirement already satisfied: pandas>=0.17.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.2.4)\n",
+ "Requirement already satisfied: six>=1.10.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.15.0)\n",
+ "Requirement already satisfied: scipy>=0.16.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.6.2)\n",
+ "Requirement already satisfied: numpy>=1.7.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.19.5)\n",
+ "Requirement already satisfied: pytz>=2017.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=0.17.0->heamy) (2021.1)\n",
+ "Requirement already satisfied: python-dateutil>=2.7.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=0.17.0->heamy) (2.8.1)\n",
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from scikit-learn>=0.17.0->heamy) (2.1.0)\n",
+ "Requirement already satisfied: joblib>=0.11 in d:\\programdata\\anaconda3\\lib\\site-packages (from scikit-learn>=0.17.0->heamy) (1.0.1)\n",
+ "Building wheels for collected packages: heamy\n",
+ " Building wheel for heamy (setup.py): started\n",
+ " Building wheel for heamy (setup.py): finished with status 'done'\n",
+ " Created wheel for heamy: filename=heamy-0.0.7-py2.py3-none-any.whl size=15353 sha256=e3ba65b34e2bdee3b90b45b637e28836afdbdb0c9547f76b36fe10d17f8aba8f\n",
+ " Stored in directory: c:\\users\\administrator\\appdata\\local\\pip\\cache\\wheels\\6e\\f1\\7d\\048e558da94f495a0ed0d9c09d312e73eb176a092e36774ec2\n",
+ "Successfully built heamy\n",
+ "Installing collected packages: heamy\n",
+ "Successfully installed heamy-0.0.7\n",
+ "Note: you may need to restart the kernel to use updated packages.\n"
+ ]
+ }
+ ],
+ "source": [
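+ "# Install heamy into the kernel's environment; pinned to the version this notebook was run with\n",
+ "%pip install heamy==0.0.7"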
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "69632c6a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import sys\n",
+ "print(sys.version) # 版本信息"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "id": "ca421279",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import time\n",
+ "\n",
+ "from heamy.dataset import Dataset\n",
+ "from heamy.estimator import Classifier \n",
+ "from heamy.pipeline import ModelsPipeline\n",
+ "# 导入相关模型,没有的pip install xxx 即可\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "import xgboost as xgb \n",
+ "import lightgbm as lgb \n",
+ "\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from sklearn.preprocessing import OrdinalEncoder\n",
+ "from sklearn.metrics import log_loss"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2592fbbd",
+ "metadata": {},
+ "source": [
+ "## 准备数据集"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "9a0fabe1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.datasets import fetch_covtype\n",
+ "data = fetch_covtype()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "5bd75178",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "七分类任务,处理前: [1 2 3 4 5 6 7]\n",
+ "[5 5 2 ... 3 3 3]\n",
+ "七分类任务,处理后: [0. 1. 2. 3. 4. 5. 6.]\n",
+ "[4. 4. 1. ... 2. 2. 2.]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 预处理\n",
+ "X, y = data['data'], data['target']\n",
+ "# 由于模型标签需要从0开始,所以数字需要全部减1\n",
+ "print('七分类任务,处理前:',np.unique(y))\n",
+ "print(y)\n",
+ "label_encoder = OrdinalEncoder()  # named to avoid shadowing the built-in ord()\n",
+ "y = label_encoder.fit_transform(y.reshape(-1, 1))\n",
+ "y = y.reshape(-1, )\n",
+ "print('七分类任务,处理后:',np.unique(y))\n",
+ "print(y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "23d9778c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(435759, 54)\n",
+ "(145253, 54)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 切分训练和测试集\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)\n",
+ "print(X_train.shape)\n",
+ "print(X_test.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "eac48668",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Dataset(5c3ccfb5c81451d098565ef5e7e36ac5)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 创建数据集\n",
+ "'''use_cache : bool, default True\n",
+ "    If use_cache=True then preprocessing step will be cached until function code is changed.'''\n",
+ "dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test, y_test=None,use_cache=True) # 注意这里的y_test=None,即不存在数据泄露\n",
+ "print(dataset)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "fba3f975",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[2.833e+03, 2.580e+02, 2.600e+01, ..., 0.000e+00, 0.000e+00,\n",
+ " 0.000e+00],\n",
+ " [3.008e+03, 4.500e+01, 2.000e+00, ..., 0.000e+00, 0.000e+00,\n",
+ " 0.000e+00],\n",
+ " [2.949e+03, 0.000e+00, 1.100e+01, ..., 0.000e+00, 0.000e+00,\n",
+ " 0.000e+00],\n",
+ " ...,\n",
+ " [3.153e+03, 2.870e+02, 1.700e+01, ..., 0.000e+00, 0.000e+00,\n",
+ " 0.000e+00],\n",
+ " [3.065e+03, 3.480e+02, 2.100e+01, ..., 0.000e+00, 0.000e+00,\n",
+ " 0.000e+00],\n",
+ " [3.021e+03, 2.600e+01, 1.600e+01, ..., 0.000e+00, 0.000e+00,\n",
+ " 0.000e+00]])"
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 处理后的数据集\n",
+ "dataset.X_train"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d4517ea1",
+ "metadata": {},
+ "source": [
+ "## 定义模型"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "e8393e73",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def xgb_model(X_train, y_train, X_test, y_test):\n",
+ " \"\"\"参数必须为X_train,y_train,X_test,y_test\"\"\"\n",
+ " # 可以内置参数\n",
+ " params = {'objective': 'multi:softprob',\n",
+ " \"eval_metric\": 'mlogloss',\n",
+ " \"verbosity\": 0,\n",
+ " 'num_class': 7,\n",
+ " 'nthread': -1}\n",
+ " dtrain = xgb.DMatrix(X_train, y_train)\n",
+ " dtest = xgb.DMatrix(X_test)\n",
+ " model = xgb.train(params, dtrain, num_boost_round=300)\n",
+ " predict = model.predict(dtest)\n",
+ " return predict # 返回值必须为X_test的预测\n",
+ "\n",
+ "\n",
+ "def lgb_model(X_train, y_train, X_test, y_test,**parameters):\n",
+ "    # Parameters can also be exposed through **parameters; it collects keyword arguments, so it is always a dict (possibly empty) and needs no None check\n",
+ " lgb_train = lgb.Dataset(X_train, y_train)\n",
+ " model = lgb.train(params=parameters, train_set=lgb_train,num_boost_round=300)\n",
+ " predict = model.predict(X_test)\n",
+ " return predict\n",
+ "\n",
+ "\n",
+ "def rf_model(X_train, y_train, X_test, y_test):\n",
+ " params = {\"n_estimators\": 100, \"n_jobs\": -1}\n",
+ " model = RandomForestClassifier(**params).fit(X_train, y_train)\n",
+ " predict = model.predict_proba(X_test)\n",
+ " return predict"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0715cf6e",
+ "metadata": {},
+ "source": [
+ "## 构建和训练模型"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "id": "78ab0083",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = {\"objective\": \"multiclass\",\n",
+ " \"num_class\": 7,\n",
+ " \"n_jobs\": -1,\n",
+ " \"verbose\": -4, \n",
+ " \"metric\": (\"multi_logloss\",)}\n",
+ "\n",
+ "model_xgb = Classifier(dataset=dataset, estimator=xgb_model, name='xgb',use_cache=False)\n",
+ "model_lgb = Classifier(dataset=dataset, estimator=lgb_model, name='lgb',parameters=params,use_cache=False)\n",
+ "model_rf = Classifier(dataset=dataset, estimator=rf_model,name='rf',use_cache=False)\n",
+ "\n",
+ "pipeline = ModelsPipeline(model_xgb, model_lgb, model_rf)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "173ef0f0",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Best Score (log_loss): 0.18744137777851164\n",
+ "Best Weights: [0.36556831 0.00303401 0.63139768]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([0.36556831, 0.00303401, 0.63139768])"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pipeline.find_weights(scorer=log_loss, ) # 输出最优权重组合"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "80726d19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
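+ "# Build the second-level dataset with 5-fold stacking (slow: every base model is trained once per fold)\n",
+ "stack_ds = pipeline.stack(k=5,stratify=False,seed=42,full_test=False) # full_test=False: test-set features are the mean of the fold models' predictions; True: refit on the full training set to predict the test set"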
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "b25bba3c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " xgb_0 xgb_1 xgb_2 xgb_3 xgb_4 \\\n",
+ "0 0.177179 0.818728 2.185222e-07 9.264143e-09 4.090067e-03 \n",
+ "1 0.005155 0.994845 7.055579e-10 1.326343e-08 6.331572e-09 \n",
+ "2 0.293492 0.706508 3.650662e-10 1.017633e-09 8.823530e-09 \n",
+ "3 0.478112 0.521816 3.207779e-06 2.878019e-08 1.076500e-08 \n",
+ "4 0.992430 0.006652 1.233117e-05 1.887496e-07 1.569583e-06 \n",
+ "... ... ... ... ... ... \n",
+ "435754 0.988518 0.011477 3.190797e-09 5.645121e-08 2.940739e-09 \n",
+ "435755 0.969212 0.030723 2.142020e-08 1.572054e-05 4.321913e-07 \n",
+ "435756 0.415850 0.584142 4.283793e-08 7.367601e-08 6.148067e-07 \n",
+ "435757 0.602601 0.397399 6.606462e-10 1.015894e-09 7.221973e-08 \n",
+ "435758 0.834587 0.165411 3.267833e-09 2.057172e-08 2.078704e-08 \n",
+ "\n",
+ " xgb_5 xgb_6 lgb_0 lgb_1 lgb_2 ... \\\n",
+ "0 1.725062e-06 1.048052e-06 0.172406 0.812678 1.416886e-06 ... \n",
+ "1 1.435787e-09 1.603579e-10 0.008114 0.991886 0.000000e+00 ... \n",
+ "2 6.384080e-10 2.823794e-08 0.817627 0.182372 0.000000e+00 ... \n",
+ "3 2.230641e-06 6.630235e-05 0.465733 0.534184 0.000000e+00 ... \n",
+ "4 5.604260e-07 9.037877e-04 0.932050 0.043451 0.000000e+00 ... \n",
+ "... ... ... ... ... ... ... \n",
+ "435754 1.530261e-08 4.466830e-06 0.970593 0.029399 0.000000e+00 ... \n",
+ "435755 5.574208e-10 4.977021e-05 0.862591 0.136644 0.000000e+00 ... \n",
+ "435756 2.371389e-06 5.185283e-06 0.466886 0.533039 0.000000e+00 ... \n",
+ "435757 2.326313e-09 9.193871e-09 0.674250 0.325750 4.092880e-211 ... \n",
+ "435758 5.976972e-08 2.204258e-06 0.709320 0.290680 0.000000e+00 ... \n",
+ "\n",
+ " lgb_4 lgb_5 lgb_6 rf_0 rf_1 rf_2 rf_3 \\\n",
+ "0 1.486358e-02 5.093522e-05 6.805300e-08 0.06 0.92 0.0 0.0 \n",
+ "1 0.000000e+00 0.000000e+00 0.000000e+00 0.12 0.88 0.0 0.0 \n",
+ "2 4.452850e-07 7.825338e-09 1.012052e-07 0.63 0.37 0.0 0.0 \n",
+ "3 0.000000e+00 0.000000e+00 8.245405e-05 0.56 0.44 0.0 0.0 \n",
+ "4 0.000000e+00 0.000000e+00 2.449972e-02 0.95 0.04 0.0 0.0 \n",
+ "... ... ... ... ... ... ... ... \n",
+ "435754 0.000000e+00 0.000000e+00 7.871809e-06 0.97 0.03 0.0 0.0 \n",
+ "435755 0.000000e+00 0.000000e+00 7.647430e-04 0.93 0.06 0.0 0.0 \n",
+ "435756 0.000000e+00 0.000000e+00 7.493861e-05 0.45 0.55 0.0 0.0 \n",
+ "435757 0.000000e+00 0.000000e+00 0.000000e+00 0.52 0.48 0.0 0.0 \n",
+ "435758 0.000000e+00 0.000000e+00 0.000000e+00 0.87 0.13 0.0 0.0 \n",
+ "\n",
+ " rf_4 rf_5 rf_6 \n",
+ "0 0.02 0.0 0.00 \n",
+ "1 0.00 0.0 0.00 \n",
+ "2 0.00 0.0 0.00 \n",
+ "3 0.00 0.0 0.00 \n",
+ "4 0.00 0.0 0.01 \n",
+ "... ... ... ... \n",
+ "435754 0.00 0.0 0.00 \n",
+ "435755 0.00 0.0 0.01 \n",
+ "435756 0.00 0.0 0.00 \n",
+ "435757 0.00 0.0 0.00 \n",
+ "435758 0.00 0.0 0.00 \n",
+ "\n",
+ "[435759 rows x 21 columns]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 第二层的训练集:xgb、lgb、rf 三个模型各输出7列(对应7个类别的预测概率),共21列特征\n",
+ "print(stack_ds.X_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "id": "835205e9",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " xgb_0 xgb_1 xgb_2 xgb_3 xgb_4 \\\n",
+ "0 9.876224e-01 0.000789 2.774616e-06 4.129093e-07 1.311387e-06 \n",
+ "1 5.139124e-02 0.929659 1.852793e-03 1.518293e-07 1.692924e-02 \n",
+ "2 7.695035e-04 0.973729 6.878623e-04 1.573823e-07 2.408167e-02 \n",
+ "3 3.376913e-02 0.966229 2.024872e-07 7.321523e-08 1.071163e-06 \n",
+ "4 1.013981e-03 0.998553 3.794874e-06 8.755425e-08 4.243054e-04 \n",
+ "... ... ... ... ... ... \n",
+ "145248 9.615189e-01 0.038480 6.486028e-08 1.744931e-08 1.069370e-06 \n",
+ "145249 3.055384e-02 0.969440 2.475371e-07 5.530033e-08 4.299908e-06 \n",
+ "145250 8.224608e-06 0.058361 9.212288e-01 9.705171e-08 5.440121e-05 \n",
+ "145251 9.183387e-01 0.081601 5.612090e-08 1.088283e-08 5.225256e-07 \n",
+ "145252 9.203915e-07 0.003578 2.372825e-01 1.582836e-06 3.307252e-07 \n",
+ "\n",
+ " xgb_5 xgb_6 lgb_0 lgb_1 lgb_2 ... \\\n",
+ "0 7.851924e-09 1.158422e-02 0.962538 0.004222 9.599869e-23 ... \n",
+ "1 1.613036e-04 6.763449e-06 0.070947 0.882463 2.232464e-03 ... \n",
+ "2 7.296442e-04 1.884172e-06 0.004029 0.945838 1.014722e-02 ... \n",
+ "3 7.227585e-08 1.448203e-09 0.066538 0.933450 1.206630e-06 ... \n",
+ "4 4.374311e-06 4.566837e-09 0.001334 0.997391 1.580417e-06 ... \n",
+ "... ... ... ... ... ... ... \n",
+ "145248 5.049759e-08 4.010809e-07 0.917842 0.082153 2.154302e-17 ... \n",
+ "145249 1.255851e-08 1.224208e-06 0.058622 0.941370 1.332795e-12 ... \n",
+ "145250 2.034389e-02 3.132630e-06 0.000268 0.083680 8.789707e-01 ... \n",
+ "145251 2.566383e-07 5.976933e-05 0.875834 0.123030 2.631276e-12 ... \n",
+ "145252 7.591362e-01 6.988637e-08 0.000032 0.037757 2.462795e-01 ... \n",
+ "\n",
+ " lgb_4 lgb_5 lgb_6 rf_0 rf_1 rf_2 rf_3 \\\n",
+ "0 1.726260e-240 0.000000e+00 3.324009e-02 0.984 0.000 0.000 0.0 \n",
+ "1 4.396802e-02 3.888647e-04 1.077146e-08 0.106 0.816 0.038 0.0 \n",
+ "2 3.329283e-02 6.693269e-03 3.213404e-09 0.008 0.950 0.002 0.0 \n",
+ "3 9.908823e-06 1.371448e-07 1.280235e-09 0.078 0.922 0.000 0.0 \n",
+ "4 1.273086e-03 7.811306e-07 5.594472e-10 0.004 0.988 0.000 0.0 \n",
+ "... ... ... ... ... ... ... ... \n",
+ "145248 6.191352e-08 0.000000e+00 4.958509e-06 0.968 0.032 0.000 0.0 \n",
+ "145249 7.656931e-06 3.415083e-47 2.271880e-07 0.018 0.972 0.000 0.0 \n",
+ "145250 2.052535e-04 3.687570e-02 1.393421e-09 0.000 0.040 0.946 0.0 \n",
+ "145251 2.521124e-07 5.749375e-08 1.135236e-03 0.992 0.008 0.000 0.0 \n",
+ "145252 2.927400e-06 7.159244e-01 8.608624e-140 0.000 0.018 0.110 0.0 \n",
+ "\n",
+ " rf_4 rf_5 rf_6 \n",
+ "0 0.000 0.000 0.016 \n",
+ "1 0.034 0.006 0.000 \n",
+ "2 0.032 0.008 0.000 \n",
+ "3 0.000 0.000 0.000 \n",
+ "4 0.008 0.000 0.000 \n",
+ "... ... ... ... \n",
+ "145248 0.000 0.000 0.000 \n",
+ "145249 0.010 0.000 0.000 \n",
+ "145250 0.000 0.014 0.000 \n",
+ "145251 0.000 0.000 0.000 \n",
+ "145252 0.000 0.872 0.000 \n",
+ "\n",
+ "[145253 rows x 21 columns]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 第二层的测试集:xgb、lgb、rf 三个模型各输出7列(对应7个类别的预测概率),共21列特征\n",
+ "print(stack_ds.X_test) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "id": "b9db35dc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 用lr做最后一层\n",
+ "stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={\"solver\": 'lbfgs', \"max_iter\": 1000},use_cache=False)\n",
+ "predict_stack = stacker.predict()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "id": "a4a48219",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[[9.95173402e-01 2.67623709e-03 4.23846755e-08 ... 3.15435935e-05\n",
+ " 5.66194220e-06 2.11044140e-03]\n",
+ " [2.23612439e-02 9.70927685e-01 1.23929922e-03 ... 4.49727904e-03\n",
+ " 8.73983383e-04 9.97020226e-05]\n",
+ " [6.22588197e-03 9.89402233e-01 9.81655972e-04 ... 2.83331258e-03\n",
+ " 5.22139184e-04 3.45071569e-05]\n",
+ " ...\n",
+ " [5.36335125e-06 2.06267200e-03 9.90604140e-01 ... 8.55252386e-04\n",
+ " 4.18405061e-03 1.64678945e-05]\n",
+ " [9.96602824e-01 2.15991442e-03 7.27481581e-08 ... 3.63552051e-05\n",
+ " 6.80942632e-06 1.19199377e-03]\n",
+ " [5.89156494e-05 1.15333400e-03 1.09178439e-02 ... 3.09244417e-04\n",
+ " 9.85167196e-01 2.21261408e-05]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(predict_stack) # stacking后的结果"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1372d4f8",
+ "metadata": {},
+ "source": [
+ "## 验证结果"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "52ef71d4",
+ "metadata": {},
+ "source": [
+ "### 单模分数"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "id": "a28806a0",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.9284696357390209\n",
+ "0.8890005714167694\n",
+ "0.9511404239499356\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(accuracy_score(np.argmax(stack_ds.X_test.iloc[:, :7].values, axis=1),y_test)) # XGB\n",
+ "print(accuracy_score(np.argmax(stack_ds.X_test.iloc[:, 7:14].values, axis=1),y_test)) # LGB\n",
+ "print(accuracy_score(np.argmax(stack_ds.X_test.iloc[:, 14:].values, axis=1),y_test)) # RF"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2e9423ce",
+ "metadata": {},
+ "source": [
+ "### 线性加权分数"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "id": "d2b50ba4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "主观根据结果blending: 0.9425209806337908\n",
+ "根据最优权重的blending: 0.9488616414118813\n"
+ ]
+ }
+ ],
+ "source": [
+ "# blending的分数\n",
+ "xgb_t = stack_ds.X_test.iloc[:, :7].values\n",
+ "lgb_t = stack_ds.X_test.iloc[:, 7:14].values\n",
+ "rf_t = stack_ds.X_test.iloc[:, 14:].values\n",
+ "\n",
+ "# 根据单模分数的高低主观设定权重(分数越高,权重越大)\n",
+ "result = 0.3*xgb_t+0.2*lgb_t+0.5*rf_t\n",
+ "print('主观根据结果blending:', accuracy_score(np.argmax(result, axis=1), y_test))\n",
+ "# 使用上面提供的最优权重 Best Weights: [0.36556831 0.00303401 0.63139768](依次对应 xgb、lgb、rf)\n",
+ "result = 0.36556831*xgb_t+0.00303401*lgb_t+0.63139768*rf_t\n",
+ "print('根据最优权重的blending:',accuracy_score(np.argmax(result, axis=1), y_test))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dfec8968",
+ "metadata": {},
+ "source": [
+ "可以观察到最优权重(0.9489)比我们主观选的权重(0.9425)更优;但需要注意,两种线性加权的分数都略低于单模最好的 RF(0.9511)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e8daf1e3",
+ "metadata": {},
+ "source": [
+ "### stacking的分数"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "id": "4930b407",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.957439777491687\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(accuracy_score(np.argmax(predict_stack, axis=1), y_test))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6311fbd7",
+ "metadata": {},
+ "source": [
+ "## 再说结论:在该数据集(fetch_covtype)上,Stacking(0.9574)的效果优于单模(最高 0.9511)和线性加权(最高 0.9489)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/竞赛优胜技巧/assets/stacking.jpg b/竞赛优胜技巧/assets/stacking.jpg
new file mode 100644
index 0000000..fead306
Binary files /dev/null and b/竞赛优胜技巧/assets/stacking.jpg differ