diff --git a/竞赛优胜技巧/.ipynb_checkpoints/Stacking-checkpoint.ipynb b/竞赛优胜技巧/.ipynb_checkpoints/Stacking-checkpoint.ipynb new file mode 100644 index 0000000..ef7f238 --- /dev/null +++ b/竞赛优胜技巧/.ipynb_checkpoints/Stacking-checkpoint.ipynb @@ -0,0 +1,677 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "720f2b62", + "metadata": {}, + "source": [ + "# Stacking" + ] + }, + { + "cell_type": "markdown", + "id": "0b365a02", + "metadata": {}, + "source": [ + "## 先说结论,该数据集(fetch_covtype)Stacking的方法比线性加权更好\n", + "比赛中我们常用线性加权作为最终的融合方式,我们同样也会好奇怎样的线性加权权重更好,下面也会举例子\n", + "参考:https://github.com/rushter/heamy/tree/master/examples" + ] + }, + { + "cell_type": "markdown", + "id": "cc8fecb1", + "metadata": {}, + "source": [ + "通过对训练集进行五折验证,将验证结果作为第二层的训练和测试集合\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "18a12000", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n", + "Collecting heamy\n", + " Downloading https://pypi.tuna.tsinghua.edu.cn/packages/20/32/2f3e1efa38a8e34f790d90b6d49ef06ab812181ae896c50e89b8750fa5a0/heamy-0.0.7.tar.gz (30 kB)\n", + "Requirement already satisfied: scikit-learn>=0.17.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (0.24.1)\n", + "Requirement already satisfied: pandas>=0.17.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.2.4)\n", + "Requirement already satisfied: six>=1.10.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.15.0)\n", + "Requirement already satisfied: scipy>=0.16.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.6.2)\n", + "Requirement already satisfied: numpy>=1.7.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.19.5)\n", + "Requirement already satisfied: pytz>=2017.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=0.17.0->heamy) (2021.1)\n", + "Requirement already satisfied: 
python-dateutil>=2.7.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=0.17.0->heamy) (2.8.1)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from scikit-learn>=0.17.0->heamy) (2.1.0)\n", + "Requirement already satisfied: joblib>=0.11 in d:\\programdata\\anaconda3\\lib\\site-packages (from scikit-learn>=0.17.0->heamy) (1.0.1)\n", + "Building wheels for collected packages: heamy\n", + " Building wheel for heamy (setup.py): started\n", + " Building wheel for heamy (setup.py): finished with status 'done'\n", + " Created wheel for heamy: filename=heamy-0.0.7-py2.py3-none-any.whl size=15353 sha256=e3ba65b34e2bdee3b90b45b637e28836afdbdb0c9547f76b36fe10d17f8aba8f\n", + " Stored in directory: c:\\users\\administrator\\appdata\\local\\pip\\cache\\wheels\\6e\\f1\\7d\\048e558da94f495a0ed0d9c09d312e73eb176a092e36774ec2\n", + "Successfully built heamy\n", + "Installing collected packages: heamy\n", + "Successfully installed heamy-0.0.7\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install heamy # 安装相关包" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "69632c6a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]\n" + ] + } + ], + "source": [ + "import sys\n", + "print(sys.version) # 版本信息" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "ca421279", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import time\n", + "\n", + "from heamy.dataset import Dataset\n", + "from heamy.estimator import Classifier \n", + "from heamy.pipeline import ModelsPipeline\n", + "# 导入相关模型,没有的pip install xxx 即可\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "import xgboost as xgb \n", + "import lightgbm as 
lgb \n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.preprocessing import OrdinalEncoder\n", + "from sklearn.metrics import log_loss" + ] + }, + { + "cell_type": "markdown", + "id": "2592fbbd", + "metadata": {}, + "source": [ + "## 准备数据集" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "9a0fabe1", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import fetch_covtype\n", + "data = fetch_covtype()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "5bd75178", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "七分类任务,处理前: [1 2 3 4 5 6 7]\n", + "[5 5 2 ... 3 3 3]\n", + "七分类任务,处理后: [0. 1. 2. 3. 4. 5. 6.]\n", + "[4. 4. 1. ... 2. 2. 2.]\n" + ] + } + ], + "source": [ + "# Preprocessing\n", + "X, y = data['data'], data['target']\n", + "# The models expect class labels that start at 0, so the labels 1..7 are re-encoded as 0..6\n", + "print('七分类任务,处理前:',np.unique(y))\n", + "print(y)\n", + "label_encoder = OrdinalEncoder()\n", + "y_enc = label_encoder.fit_transform(y.reshape(-1, 1))  # assigned here so the next line also works on a fresh kernel\n", + "y = y_enc.reshape(-1, )\n", + "print('七分类任务,处理后:',np.unique(y))\n", + "print(y)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "23d9778c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(435759, 54)\n", + "(145253, 54)\n" + ] + } + ], + "source": [ + "# Split into training and test sets\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)\n", + "print(X_train.shape)\n", + "print(X_test.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "eac48668", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset(5c3ccfb5c81451d098565ef5e7e36ac5)\n" + ] + } + ], + "source": [ + "# Build the heamy dataset\n", + "'''use_cache : bool, default True\n", + " If use_cache=True then preprocessing step will be cached until function code is changed.'''\n", + "dataset = 
Dataset(X_train=X_train, y_train=y_train, X_test=X_test, y_test=None,use_cache=True) # note y_test=None here, so the test labels cannot leak into training\n", + "print(dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "fba3f975", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[2.833e+03, 2.580e+02, 2.600e+01, ..., 0.000e+00, 0.000e+00,\n", + " 0.000e+00],\n", + " [3.008e+03, 4.500e+01, 2.000e+00, ..., 0.000e+00, 0.000e+00,\n", + " 0.000e+00],\n", + " [2.949e+03, 0.000e+00, 1.100e+01, ..., 0.000e+00, 0.000e+00,\n", + " 0.000e+00],\n", + " ...,\n", + " [3.153e+03, 2.870e+02, 1.700e+01, ..., 0.000e+00, 0.000e+00,\n", + " 0.000e+00],\n", + " [3.065e+03, 3.480e+02, 2.100e+01, ..., 0.000e+00, 0.000e+00,\n", + " 0.000e+00],\n", + " [3.021e+03, 2.600e+01, 1.600e+01, ..., 0.000e+00, 0.000e+00,\n", + " 0.000e+00]])" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The dataset after processing\n", + "dataset.X_train" + ] + }, + { + "cell_type": "markdown", + "id": "d4517ea1", + "metadata": {}, + "source": [ + "## 定义模型" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "e8393e73", + "metadata": {}, + "outputs": [], + "source": [ + "def xgb_model(X_train, y_train, X_test, y_test):\n", + "    \"\"\"XGBoost base learner; the parameters must be X_train, y_train, X_test, y_test.\"\"\"\n", + "    # Hyper-parameters can be hard-coded inside the estimator\n", + "    params = {'objective': 'multi:softprob',\n", + "              \"eval_metric\": 'mlogloss',\n", + "              \"verbosity\": 0,\n", + "              'num_class': 7,\n", + "              'nthread': -1}\n", + "    dtrain = xgb.DMatrix(X_train, y_train)\n", + "    dtest = xgb.DMatrix(X_test)\n", + "    model = xgb.train(params, dtrain, num_boost_round=300)\n", + "    predict = model.predict(dtest)\n", + "    return predict  # the return value must be the prediction for X_test\n", + "\n", + "\n", + "def lgb_model(X_train, y_train, X_test, y_test,**parameters):\n", + "    \"\"\"LightGBM base learner; extra keyword arguments are handed to lgb.train as its params.\"\"\"\n", + "    # **parameters is always a dict (empty when nothing is passed), never None,\n", + "    # so it goes to lgb.train directly without a None guard\n", + "    lgb_train = lgb.Dataset(X_train, y_train)\n", + "    model = lgb.train(params=parameters, 
train_set=lgb_train,num_boost_round=300)\n", + "    predict = model.predict(X_test)\n", + "    return predict\n", + "\n", + "\n", + "def rf_model(X_train, y_train, X_test, y_test):\n", + "    \"\"\"Random-forest base learner; returns the class probabilities predicted for X_test.\"\"\"\n", + "    params = {\"n_estimators\": 100, \"n_jobs\": -1, \"random_state\": 42}  # seeded so reruns build the same forest\n", + "    model = RandomForestClassifier(**params).fit(X_train, y_train)\n", + "    return model.predict_proba(X_test)" + ] + }, + { + "cell_type": "markdown", + "id": "0715cf6e", + "metadata": {}, + "source": [ + "## 构建和训练模型" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "78ab0083", + "metadata": {}, + "outputs": [], + "source": [ + "params = {\"objective\": \"multiclass\",\n", + " \"num_class\": 7,\n", + " \"n_jobs\": -1,\n", + " \"verbose\": -4, \n", + " \"metric\": (\"multi_logloss\",)}\n", + "\n", + "model_xgb = Classifier(dataset=dataset, estimator=xgb_model, name='xgb',use_cache=False)\n", + "model_lgb = Classifier(dataset=dataset, estimator=lgb_model, name='lgb',parameters=params,use_cache=False)\n", + "model_rf = Classifier(dataset=dataset, estimator=rf_model,name='rf',use_cache=False)\n", + "\n", + "pipeline = ModelsPipeline(model_xgb, model_lgb, model_rf)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "173ef0f0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best Score (log_loss): 0.18744137777851164\n", + "Best Weights: [0.36556831 0.00303401 0.63139768]\n" + ] + }, + { + "data": { + "text/plain": [ + "array([0.36556831, 0.00303401, 0.63139768])" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.find_weights(scorer=log_loss, ) # prints the best weight combination" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "80726d19", + "metadata": {}, + "outputs": [], + "source": [ + "# 5-fold training that builds the stacked feature set; this step is slow\n", + "stack_ds = pipeline.stack(k=5,stratify=False,seed=42,full_test=False) # full_test selects predicting everything vs. only the current fold's validation set" + ] + }, + { + "cell_type": "code", + 
"execution_count": 56, + "id": "b25bba3c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " xgb_0 xgb_1 xgb_2 xgb_3 xgb_4 \\\n", + "0 0.177179 0.818728 2.185222e-07 9.264143e-09 4.090067e-03 \n", + "1 0.005155 0.994845 7.055579e-10 1.326343e-08 6.331572e-09 \n", + "2 0.293492 0.706508 3.650662e-10 1.017633e-09 8.823530e-09 \n", + "3 0.478112 0.521816 3.207779e-06 2.878019e-08 1.076500e-08 \n", + "4 0.992430 0.006652 1.233117e-05 1.887496e-07 1.569583e-06 \n", + "... ... ... ... ... ... \n", + "435754 0.988518 0.011477 3.190797e-09 5.645121e-08 2.940739e-09 \n", + "435755 0.969212 0.030723 2.142020e-08 1.572054e-05 4.321913e-07 \n", + "435756 0.415850 0.584142 4.283793e-08 7.367601e-08 6.148067e-07 \n", + "435757 0.602601 0.397399 6.606462e-10 1.015894e-09 7.221973e-08 \n", + "435758 0.834587 0.165411 3.267833e-09 2.057172e-08 2.078704e-08 \n", + "\n", + " xgb_5 xgb_6 lgb_0 lgb_1 lgb_2 ... \\\n", + "0 1.725062e-06 1.048052e-06 0.172406 0.812678 1.416886e-06 ... \n", + "1 1.435787e-09 1.603579e-10 0.008114 0.991886 0.000000e+00 ... \n", + "2 6.384080e-10 2.823794e-08 0.817627 0.182372 0.000000e+00 ... \n", + "3 2.230641e-06 6.630235e-05 0.465733 0.534184 0.000000e+00 ... \n", + "4 5.604260e-07 9.037877e-04 0.932050 0.043451 0.000000e+00 ... \n", + "... ... ... ... ... ... ... \n", + "435754 1.530261e-08 4.466830e-06 0.970593 0.029399 0.000000e+00 ... \n", + "435755 5.574208e-10 4.977021e-05 0.862591 0.136644 0.000000e+00 ... \n", + "435756 2.371389e-06 5.185283e-06 0.466886 0.533039 0.000000e+00 ... \n", + "435757 2.326313e-09 9.193871e-09 0.674250 0.325750 4.092880e-211 ... \n", + "435758 5.976972e-08 2.204258e-06 0.709320 0.290680 0.000000e+00 ... 
\n", + "\n", + " lgb_4 lgb_5 lgb_6 rf_0 rf_1 rf_2 rf_3 \\\n", + "0 1.486358e-02 5.093522e-05 6.805300e-08 0.06 0.92 0.0 0.0 \n", + "1 0.000000e+00 0.000000e+00 0.000000e+00 0.12 0.88 0.0 0.0 \n", + "2 4.452850e-07 7.825338e-09 1.012052e-07 0.63 0.37 0.0 0.0 \n", + "3 0.000000e+00 0.000000e+00 8.245405e-05 0.56 0.44 0.0 0.0 \n", + "4 0.000000e+00 0.000000e+00 2.449972e-02 0.95 0.04 0.0 0.0 \n", + "... ... ... ... ... ... ... ... \n", + "435754 0.000000e+00 0.000000e+00 7.871809e-06 0.97 0.03 0.0 0.0 \n", + "435755 0.000000e+00 0.000000e+00 7.647430e-04 0.93 0.06 0.0 0.0 \n", + "435756 0.000000e+00 0.000000e+00 7.493861e-05 0.45 0.55 0.0 0.0 \n", + "435757 0.000000e+00 0.000000e+00 0.000000e+00 0.52 0.48 0.0 0.0 \n", + "435758 0.000000e+00 0.000000e+00 0.000000e+00 0.87 0.13 0.0 0.0 \n", + "\n", + " rf_4 rf_5 rf_6 \n", + "0 0.02 0.0 0.00 \n", + "1 0.00 0.0 0.00 \n", + "2 0.00 0.0 0.00 \n", + "3 0.00 0.0 0.00 \n", + "4 0.00 0.0 0.01 \n", + "... ... ... ... \n", + "435754 0.00 0.0 0.00 \n", + "435755 0.00 0.0 0.01 \n", + "435756 0.00 0.0 0.00 \n", + "435757 0.00 0.0 0.00 \n", + "435758 0.00 0.0 0.00 \n", + "\n", + "[435759 rows x 21 columns]\n" + ] + } + ], + "source": [ + "# 模型输出的训练集,7个特征对应7个标签的预测概率\n", + "print(stack_ds.X_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "835205e9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " xgb_0 xgb_1 xgb_2 xgb_3 xgb_4 \\\n", + "0 9.876224e-01 0.000789 2.774616e-06 4.129093e-07 1.311387e-06 \n", + "1 5.139124e-02 0.929659 1.852793e-03 1.518293e-07 1.692924e-02 \n", + "2 7.695035e-04 0.973729 6.878623e-04 1.573823e-07 2.408167e-02 \n", + "3 3.376913e-02 0.966229 2.024872e-07 7.321523e-08 1.071163e-06 \n", + "4 1.013981e-03 0.998553 3.794874e-06 8.755425e-08 4.243054e-04 \n", + "... ... ... ... ... ... 
\n", + "145248 9.615189e-01 0.038480 6.486028e-08 1.744931e-08 1.069370e-06 \n", + "145249 3.055384e-02 0.969440 2.475371e-07 5.530033e-08 4.299908e-06 \n", + "145250 8.224608e-06 0.058361 9.212288e-01 9.705171e-08 5.440121e-05 \n", + "145251 9.183387e-01 0.081601 5.612090e-08 1.088283e-08 5.225256e-07 \n", + "145252 9.203915e-07 0.003578 2.372825e-01 1.582836e-06 3.307252e-07 \n", + "\n", + " xgb_5 xgb_6 lgb_0 lgb_1 lgb_2 ... \\\n", + "0 7.851924e-09 1.158422e-02 0.962538 0.004222 9.599869e-23 ... \n", + "1 1.613036e-04 6.763449e-06 0.070947 0.882463 2.232464e-03 ... \n", + "2 7.296442e-04 1.884172e-06 0.004029 0.945838 1.014722e-02 ... \n", + "3 7.227585e-08 1.448203e-09 0.066538 0.933450 1.206630e-06 ... \n", + "4 4.374311e-06 4.566837e-09 0.001334 0.997391 1.580417e-06 ... \n", + "... ... ... ... ... ... ... \n", + "145248 5.049759e-08 4.010809e-07 0.917842 0.082153 2.154302e-17 ... \n", + "145249 1.255851e-08 1.224208e-06 0.058622 0.941370 1.332795e-12 ... \n", + "145250 2.034389e-02 3.132630e-06 0.000268 0.083680 8.789707e-01 ... \n", + "145251 2.566383e-07 5.976933e-05 0.875834 0.123030 2.631276e-12 ... \n", + "145252 7.591362e-01 6.988637e-08 0.000032 0.037757 2.462795e-01 ... \n", + "\n", + " lgb_4 lgb_5 lgb_6 rf_0 rf_1 rf_2 rf_3 \\\n", + "0 1.726260e-240 0.000000e+00 3.324009e-02 0.984 0.000 0.000 0.0 \n", + "1 4.396802e-02 3.888647e-04 1.077146e-08 0.106 0.816 0.038 0.0 \n", + "2 3.329283e-02 6.693269e-03 3.213404e-09 0.008 0.950 0.002 0.0 \n", + "3 9.908823e-06 1.371448e-07 1.280235e-09 0.078 0.922 0.000 0.0 \n", + "4 1.273086e-03 7.811306e-07 5.594472e-10 0.004 0.988 0.000 0.0 \n", + "... ... ... ... ... ... ... ... 
\n", + "145248 6.191352e-08 0.000000e+00 4.958509e-06 0.968 0.032 0.000 0.0 \n", + "145249 7.656931e-06 3.415083e-47 2.271880e-07 0.018 0.972 0.000 0.0 \n", + "145250 2.052535e-04 3.687570e-02 1.393421e-09 0.000 0.040 0.946 0.0 \n", + "145251 2.521124e-07 5.749375e-08 1.135236e-03 0.992 0.008 0.000 0.0 \n", + "145252 2.927400e-06 7.159244e-01 8.608624e-140 0.000 0.018 0.110 0.0 \n", + "\n", + " rf_4 rf_5 rf_6 \n", + "0 0.000 0.000 0.016 \n", + "1 0.034 0.006 0.000 \n", + "2 0.032 0.008 0.000 \n", + "3 0.000 0.000 0.000 \n", + "4 0.008 0.000 0.000 \n", + "... ... ... ... \n", + "145248 0.000 0.000 0.000 \n", + "145249 0.010 0.000 0.000 \n", + "145250 0.000 0.014 0.000 \n", + "145251 0.000 0.000 0.000 \n", + "145252 0.000 0.872 0.000 \n", + "\n", + "[145253 rows x 21 columns]\n" + ] + } + ], + "source": [ + "# 模型输出的测试集,7个特征对应7个标签的预测概率\n", + "print(stack_ds.X_test) " + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "b9db35dc", + "metadata": {}, + "outputs": [], + "source": [ + "# 用lr做最后一层\n", + "stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={\"solver\": 'lbfgs', \"max_iter\": 1000},use_cache=False)\n", + "predict_stack = stacker.predict()" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "a4a48219", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[9.95173402e-01 2.67623709e-03 4.23846755e-08 ... 3.15435935e-05\n", + " 5.66194220e-06 2.11044140e-03]\n", + " [2.23612439e-02 9.70927685e-01 1.23929922e-03 ... 4.49727904e-03\n", + " 8.73983383e-04 9.97020226e-05]\n", + " [6.22588197e-03 9.89402233e-01 9.81655972e-04 ... 2.83331258e-03\n", + " 5.22139184e-04 3.45071569e-05]\n", + " ...\n", + " [5.36335125e-06 2.06267200e-03 9.90604140e-01 ... 8.55252386e-04\n", + " 4.18405061e-03 1.64678945e-05]\n", + " [9.96602824e-01 2.15991442e-03 7.27481581e-08 ... 
3.63552051e-05\n", + " 6.80942632e-06 1.19199377e-03]\n", + " [5.89156494e-05 1.15333400e-03 1.09178439e-02 ... 3.09244417e-04\n", + " 9.85167196e-01 2.21261408e-05]]\n" + ] + } + ], + "source": [ + "print(predict_stack) # stacking后的结果" + ] + }, + { + "cell_type": "markdown", + "id": "1372d4f8", + "metadata": {}, + "source": [ + "## 验证结果" + ] + }, + { + "cell_type": "markdown", + "id": "52ef71d4", + "metadata": {}, + "source": [ + "### 单模分数" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "a28806a0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9284696357390209\n", + "0.8890005714167694\n", + "0.9511404239499356\n" + ] + } + ], + "source": [ + "print(accuracy_score(np.argmax(stack_ds.X_test.iloc[:, :7].values, axis=1),y_test)) # XGB\n", + "print(accuracy_score(np.argmax(stack_ds.X_test.iloc[:, 7:14].values, axis=1),y_test)) # LGB\n", + "print(accuracy_score(np.argmax(stack_ds.X_test.iloc[:, 14:].values, axis=1),y_test)) # RF" + ] + }, + { + "cell_type": "markdown", + "id": "2e9423ce", + "metadata": {}, + "source": [ + "### 线性加权分数" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "d2b50ba4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "主观根据结果blending: 0.9425209806337908\n", + "根据最优权重的blending: 0.9488616414118813\n" + ] + } + ], + "source": [ + "# blending的分数\n", + "xgb_t = stack_ds.X_test.iloc[:, :7].values\n", + "lgb_t = stack_ds.X_test.iloc[:, 7:14].values\n", + "rf_t = stack_ds.X_test.iloc[:, 14:].values\n", + "\n", + "# 根据分数好坏随机定\n", + "result = 0.3*xgb_t+0.2*lgb_t+0.5*rf_t\n", + "print('主观根据结果blending:', accuracy_score(np.argmax(result, axis=1), y_test))\n", + "# 根据上面提供的最优权重 Best Weights: [0.36556831 0.00303401 0.63139768]\n", + "result = 0.36556831*xgb_t+0.00303401*lgb_t+0.63139768*rf_t\n", + "print('根据最优权重的blending:',accuracy_score(np.argmax(result, axis=1), y_test))" + ] + }, + { + "cell_type": "markdown", + 
"id": "dfec8968", + "metadata": {}, + "source": [ + "可以观察到最优权重比我们主观选权重更优" + ] + }, + { + "cell_type": "markdown", + "id": "e8daf1e3", + "metadata": {}, + "source": [ + "### stacking的分数" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "4930b407", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.957439777491687\n" + ] + } + ], + "source": [ + "print(accuracy_score(np.argmax(predict_stack, axis=1), y_test))" + ] + }, + { + "cell_type": "markdown", + "id": "6311fbd7", + "metadata": {}, + "source": [ + "## 再说结论,该数据集(fetch_covtype)Stacking的方法更好" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/竞赛优胜技巧/Stacking.ipynb b/竞赛优胜技巧/Stacking.ipynb new file mode 100644 index 0000000..ef7f238 --- /dev/null +++ b/竞赛优胜技巧/Stacking.ipynb @@ -0,0 +1,677 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "720f2b62", + "metadata": {}, + "source": [ + "# Stacking" + ] + }, + { + "cell_type": "markdown", + "id": "0b365a02", + "metadata": {}, + "source": [ + "## 先说结论,该数据集(fetch_covtype)Stacking的方法比线性加权更好\n", + "比赛中我们常用线性加权作为最终的融合方式,我们同样也会好奇怎样的线性加权权重更好,下面也会举例子\n", + "参考:https://github.com/rushter/heamy/tree/master/examples" + ] + }, + { + "cell_type": "markdown", + "id": "cc8fecb1", + "metadata": {}, + "source": [ + "通过对训练集进行五折验证,将验证结果作为第二层的训练和测试集合\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "18a12000", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n", + "Collecting heamy\n", + " 
Downloading https://pypi.tuna.tsinghua.edu.cn/packages/20/32/2f3e1efa38a8e34f790d90b6d49ef06ab812181ae896c50e89b8750fa5a0/heamy-0.0.7.tar.gz (30 kB)\n", + "Requirement already satisfied: scikit-learn>=0.17.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (0.24.1)\n", + "Requirement already satisfied: pandas>=0.17.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.2.4)\n", + "Requirement already satisfied: six>=1.10.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.15.0)\n", + "Requirement already satisfied: scipy>=0.16.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.6.2)\n", + "Requirement already satisfied: numpy>=1.7.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from heamy) (1.19.5)\n", + "Requirement already satisfied: pytz>=2017.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=0.17.0->heamy) (2021.1)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in d:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=0.17.0->heamy) (2.8.1)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in d:\\programdata\\anaconda3\\lib\\site-packages (from scikit-learn>=0.17.0->heamy) (2.1.0)\n", + "Requirement already satisfied: joblib>=0.11 in d:\\programdata\\anaconda3\\lib\\site-packages (from scikit-learn>=0.17.0->heamy) (1.0.1)\n", + "Building wheels for collected packages: heamy\n", + " Building wheel for heamy (setup.py): started\n", + " Building wheel for heamy (setup.py): finished with status 'done'\n", + " Created wheel for heamy: filename=heamy-0.0.7-py2.py3-none-any.whl size=15353 sha256=e3ba65b34e2bdee3b90b45b637e28836afdbdb0c9547f76b36fe10d17f8aba8f\n", + " Stored in directory: c:\\users\\administrator\\appdata\\local\\pip\\cache\\wheels\\6e\\f1\\7d\\048e558da94f495a0ed0d9c09d312e73eb176a092e36774ec2\n", + "Successfully built heamy\n", + "Installing collected packages: heamy\n", + "Successfully installed heamy-0.0.7\n", + "Note: you may need to 
restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install heamy # 安装相关包" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "69632c6a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]\n" + ] + } + ], + "source": [ + "import sys\n", + "print(sys.version) # 版本信息" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "ca421279", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import time\n", + "\n", + "from heamy.dataset import Dataset\n", + "from heamy.estimator import Classifier \n", + "from heamy.pipeline import ModelsPipeline\n", + "# 导入相关模型,没有的pip install xxx 即可\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "import xgboost as xgb \n", + "import lightgbm as lgb \n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.preprocessing import OrdinalEncoder\n", + "from sklearn.metrics import log_loss" + ] + }, + { + "cell_type": "markdown", + "id": "2592fbbd", + "metadata": {}, + "source": [ + "## 准备数据集" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "9a0fabe1", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import fetch_covtype\n", + "data = fetch_covtype()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "5bd75178", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "七分类任务,处理前: [1 2 3 4 5 6 7]\n", + "[5 5 2 ... 3 3 3]\n", + "七分类任务,处理后: [0. 1. 2. 3. 4. 5. 6.]\n", + "[4. 4. 1. ... 2. 2. 
2.]\n" + ] + } + ], + "source": [ + "# Preprocessing\n", + "X, y = data['data'], data['target']\n", + "# The models expect class labels that start at 0, so the labels 1..7 are re-encoded as 0..6\n", + "print('七分类任务,处理前:',np.unique(y))\n", + "print(y)\n", + "label_encoder = OrdinalEncoder()\n", + "y_enc = label_encoder.fit_transform(y.reshape(-1, 1))  # assigned here so the next line also works on a fresh kernel\n", + "y = y_enc.reshape(-1, )\n", + "print('七分类任务,处理后:',np.unique(y))\n", + "print(y)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "23d9778c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(435759, 54)\n", + "(145253, 54)\n" + ] + } + ], + "source": [ + "# Split into training and test sets\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)\n", + "print(X_train.shape)\n", + "print(X_test.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "eac48668", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset(5c3ccfb5c81451d098565ef5e7e36ac5)\n" + ] + } + ], + "source": [ + "# Build the heamy dataset\n", + "'''use_cache : bool, default True\n", + " If use_cache=True then preprocessing step will be cached until function code is changed.'''\n", + "dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test, y_test=None,use_cache=True) # note y_test=None here, so the test labels cannot leak into training\n", + "print(dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "fba3f975", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[2.833e+03, 2.580e+02, 2.600e+01, ..., 0.000e+00, 0.000e+00,\n", + " 0.000e+00],\n", + " [3.008e+03, 4.500e+01, 2.000e+00, ..., 0.000e+00, 0.000e+00,\n", + " 0.000e+00],\n", + " [2.949e+03, 0.000e+00, 1.100e+01, ..., 0.000e+00, 0.000e+00,\n", + " 0.000e+00],\n", + " ...,\n", + " [3.153e+03, 2.870e+02, 1.700e+01, ..., 0.000e+00, 0.000e+00,\n", + " 0.000e+00],\n", + " [3.065e+03, 3.480e+02, 2.100e+01, ..., 0.000e+00, 0.000e+00,\n", + " 0.000e+00],\n", + " [3.021e+03, 2.600e+01, 1.600e+01, ..., 0.000e+00, 0.000e+00,\n", + " 0.000e+00]])" + ] + }, + 
"execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The dataset after processing\n", + "dataset.X_train" + ] + }, + { + "cell_type": "markdown", + "id": "d4517ea1", + "metadata": {}, + "source": [ + "## 定义模型" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "e8393e73", + "metadata": {}, + "outputs": [], + "source": [ + "def xgb_model(X_train, y_train, X_test, y_test):\n", + "    \"\"\"XGBoost base learner; the parameters must be X_train, y_train, X_test, y_test.\"\"\"\n", + "    # Hyper-parameters can be hard-coded inside the estimator\n", + "    params = {'objective': 'multi:softprob',\n", + "              \"eval_metric\": 'mlogloss',\n", + "              \"verbosity\": 0,\n", + "              'num_class': 7,\n", + "              'nthread': -1}\n", + "    dtrain = xgb.DMatrix(X_train, y_train)\n", + "    dtest = xgb.DMatrix(X_test)\n", + "    model = xgb.train(params, dtrain, num_boost_round=300)\n", + "    predict = model.predict(dtest)\n", + "    return predict  # the return value must be the prediction for X_test\n", + "\n", + "\n", + "def lgb_model(X_train, y_train, X_test, y_test,**parameters):\n", + "    \"\"\"LightGBM base learner; extra keyword arguments are handed to lgb.train as its params.\"\"\"\n", + "    # **parameters is always a dict (empty when nothing is passed), never None,\n", + "    # so it goes to lgb.train directly without a None guard\n", + "    lgb_train = lgb.Dataset(X_train, y_train)\n", + "    model = lgb.train(params=parameters, train_set=lgb_train,num_boost_round=300)\n", + "    predict = model.predict(X_test)\n", + "    return predict\n", + "\n", + "\n", + "def rf_model(X_train, y_train, X_test, y_test):\n", + "    \"\"\"Random-forest base learner; returns the class probabilities predicted for X_test.\"\"\"\n", + "    params = {\"n_estimators\": 100, \"n_jobs\": -1, \"random_state\": 42}  # seeded so reruns build the same forest\n", + "    model = RandomForestClassifier(**params).fit(X_train, y_train)\n", + "    return model.predict_proba(X_test)" + ] + }, + { + "cell_type": "markdown", + "id": "0715cf6e", + "metadata": {}, + "source": [ + "## 构建和训练模型" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "78ab0083", + "metadata": {}, + "outputs": [], + "source": [ + "params = {\"objective\": \"multiclass\",\n", + " \"num_class\": 7,\n", + " \"n_jobs\": -1,\n", + " \"verbose\": -4, \n", + " \"metric\": (\"multi_logloss\",)}\n", + "\n", + "model_xgb = Classifier(dataset=dataset, estimator=xgb_model, 
name='xgb',use_cache=False)\n", + "model_lgb = Classifier(dataset=dataset, estimator=lgb_model, name='lgb',parameters=params,use_cache=False)\n", + "model_rf = Classifier(dataset=dataset, estimator=rf_model,name='rf',use_cache=False)\n", + "\n", + "pipeline = ModelsPipeline(model_xgb, model_lgb, model_rf)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "173ef0f0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best Score (log_loss): 0.18744137777851164\n", + "Best Weights: [0.36556831 0.00303401 0.63139768]\n" + ] + }, + { + "data": { + "text/plain": [ + "array([0.36556831, 0.00303401, 0.63139768])" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.find_weights(scorer=log_loss, ) # 输出最优权重组合" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "80726d19", + "metadata": {}, + "outputs": [], + "source": [ + "# 5折训练构建5折模型特征集,这里比较耗时\n", + "stack_ds = pipeline.stack(k=5,stratify=False,seed=42,full_test=False) # full_test指明预测全部还是预测当前折的验证集" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "b25bba3c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " xgb_0 xgb_1 xgb_2 xgb_3 xgb_4 \\\n", + "0 0.177179 0.818728 2.185222e-07 9.264143e-09 4.090067e-03 \n", + "1 0.005155 0.994845 7.055579e-10 1.326343e-08 6.331572e-09 \n", + "2 0.293492 0.706508 3.650662e-10 1.017633e-09 8.823530e-09 \n", + "3 0.478112 0.521816 3.207779e-06 2.878019e-08 1.076500e-08 \n", + "4 0.992430 0.006652 1.233117e-05 1.887496e-07 1.569583e-06 \n", + "... ... ... ... ... ... 
\n", + "435754 0.988518 0.011477 3.190797e-09 5.645121e-08 2.940739e-09 \n", + "435755 0.969212 0.030723 2.142020e-08 1.572054e-05 4.321913e-07 \n", + "435756 0.415850 0.584142 4.283793e-08 7.367601e-08 6.148067e-07 \n", + "435757 0.602601 0.397399 6.606462e-10 1.015894e-09 7.221973e-08 \n", + "435758 0.834587 0.165411 3.267833e-09 2.057172e-08 2.078704e-08 \n", + "\n", + " xgb_5 xgb_6 lgb_0 lgb_1 lgb_2 ... \\\n", + "0 1.725062e-06 1.048052e-06 0.172406 0.812678 1.416886e-06 ... \n", + "1 1.435787e-09 1.603579e-10 0.008114 0.991886 0.000000e+00 ... \n", + "2 6.384080e-10 2.823794e-08 0.817627 0.182372 0.000000e+00 ... \n", + "3 2.230641e-06 6.630235e-05 0.465733 0.534184 0.000000e+00 ... \n", + "4 5.604260e-07 9.037877e-04 0.932050 0.043451 0.000000e+00 ... \n", + "... ... ... ... ... ... ... \n", + "435754 1.530261e-08 4.466830e-06 0.970593 0.029399 0.000000e+00 ... \n", + "435755 5.574208e-10 4.977021e-05 0.862591 0.136644 0.000000e+00 ... \n", + "435756 2.371389e-06 5.185283e-06 0.466886 0.533039 0.000000e+00 ... \n", + "435757 2.326313e-09 9.193871e-09 0.674250 0.325750 4.092880e-211 ... \n", + "435758 5.976972e-08 2.204258e-06 0.709320 0.290680 0.000000e+00 ... \n", + "\n", + " lgb_4 lgb_5 lgb_6 rf_0 rf_1 rf_2 rf_3 \\\n", + "0 1.486358e-02 5.093522e-05 6.805300e-08 0.06 0.92 0.0 0.0 \n", + "1 0.000000e+00 0.000000e+00 0.000000e+00 0.12 0.88 0.0 0.0 \n", + "2 4.452850e-07 7.825338e-09 1.012052e-07 0.63 0.37 0.0 0.0 \n", + "3 0.000000e+00 0.000000e+00 8.245405e-05 0.56 0.44 0.0 0.0 \n", + "4 0.000000e+00 0.000000e+00 2.449972e-02 0.95 0.04 0.0 0.0 \n", + "... ... ... ... ... ... ... ... 
\n", + "435754 0.000000e+00 0.000000e+00 7.871809e-06 0.97 0.03 0.0 0.0 \n", + "435755 0.000000e+00 0.000000e+00 7.647430e-04 0.93 0.06 0.0 0.0 \n", + "435756 0.000000e+00 0.000000e+00 7.493861e-05 0.45 0.55 0.0 0.0 \n", + "435757 0.000000e+00 0.000000e+00 0.000000e+00 0.52 0.48 0.0 0.0 \n", + "435758 0.000000e+00 0.000000e+00 0.000000e+00 0.87 0.13 0.0 0.0 \n", + "\n", + " rf_4 rf_5 rf_6 \n", + "0 0.02 0.0 0.00 \n", + "1 0.00 0.0 0.00 \n", + "2 0.00 0.0 0.00 \n", + "3 0.00 0.0 0.00 \n", + "4 0.00 0.0 0.01 \n", + "... ... ... ... \n", + "435754 0.00 0.0 0.00 \n", + "435755 0.00 0.0 0.01 \n", + "435756 0.00 0.0 0.00 \n", + "435757 0.00 0.0 0.00 \n", + "435758 0.00 0.0 0.00 \n", + "\n", + "[435759 rows x 21 columns]\n" + ] + } + ], + "source": [ + "# Training features produced by the stacked models: 21 columns = 3 models (xgb, lgb, rf) x 7 class probabilities each\n", + "print(stack_ds.X_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "835205e9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " xgb_0 xgb_1 xgb_2 xgb_3 xgb_4 \\\n", + "0 9.876224e-01 0.000789 2.774616e-06 4.129093e-07 1.311387e-06 \n", + "1 5.139124e-02 0.929659 1.852793e-03 1.518293e-07 1.692924e-02 \n", + "2 7.695035e-04 0.973729 6.878623e-04 1.573823e-07 2.408167e-02 \n", + "3 3.376913e-02 0.966229 2.024872e-07 7.321523e-08 1.071163e-06 \n", + "4 1.013981e-03 0.998553 3.794874e-06 8.755425e-08 4.243054e-04 \n", + "... ... ... ... ... ... \n", + "145248 9.615189e-01 0.038480 6.486028e-08 1.744931e-08 1.069370e-06 \n", + "145249 3.055384e-02 0.969440 2.475371e-07 5.530033e-08 4.299908e-06 \n", + "145250 8.224608e-06 0.058361 9.212288e-01 9.705171e-08 5.440121e-05 \n", + "145251 9.183387e-01 0.081601 5.612090e-08 1.088283e-08 5.225256e-07 \n", + "145252 9.203915e-07 0.003578 2.372825e-01 1.582836e-06 3.307252e-07 \n", + "\n", + " xgb_5 xgb_6 lgb_0 lgb_1 lgb_2 ... 
\\\n", + "0 7.851924e-09 1.158422e-02 0.962538 0.004222 9.599869e-23 ... \n", + "1 1.613036e-04 6.763449e-06 0.070947 0.882463 2.232464e-03 ... 
\n", + "2 7.296442e-04 1.884172e-06 0.004029 0.945838 1.014722e-02 ... \n", + "3 7.227585e-08 1.448203e-09 0.066538 0.933450 1.206630e-06 ... \n", + "4 4.374311e-06 4.566837e-09 0.001334 0.997391 1.580417e-06 ... \n", + "... ... ... ... ... ... ... \n", + "145248 5.049759e-08 4.010809e-07 0.917842 0.082153 2.154302e-17 ... \n", + "145249 1.255851e-08 1.224208e-06 0.058622 0.941370 1.332795e-12 ... \n", + "145250 2.034389e-02 3.132630e-06 0.000268 0.083680 8.789707e-01 ... \n", + "145251 2.566383e-07 5.976933e-05 0.875834 0.123030 2.631276e-12 ... \n", + "145252 7.591362e-01 6.988637e-08 0.000032 0.037757 2.462795e-01 ... \n", + "\n", + " lgb_4 lgb_5 lgb_6 rf_0 rf_1 rf_2 rf_3 \\\n", + "0 1.726260e-240 0.000000e+00 3.324009e-02 0.984 0.000 0.000 0.0 \n", + "1 4.396802e-02 3.888647e-04 1.077146e-08 0.106 0.816 0.038 0.0 \n", + "2 3.329283e-02 6.693269e-03 3.213404e-09 0.008 0.950 0.002 0.0 \n", + "3 9.908823e-06 1.371448e-07 1.280235e-09 0.078 0.922 0.000 0.0 \n", + "4 1.273086e-03 7.811306e-07 5.594472e-10 0.004 0.988 0.000 0.0 \n", + "... ... ... ... ... ... ... ... \n", + "145248 6.191352e-08 0.000000e+00 4.958509e-06 0.968 0.032 0.000 0.0 \n", + "145249 7.656931e-06 3.415083e-47 2.271880e-07 0.018 0.972 0.000 0.0 \n", + "145250 2.052535e-04 3.687570e-02 1.393421e-09 0.000 0.040 0.946 0.0 \n", + "145251 2.521124e-07 5.749375e-08 1.135236e-03 0.992 0.008 0.000 0.0 \n", + "145252 2.927400e-06 7.159244e-01 8.608624e-140 0.000 0.018 0.110 0.0 \n", + "\n", + " rf_4 rf_5 rf_6 \n", + "0 0.000 0.000 0.016 \n", + "1 0.034 0.006 0.000 \n", + "2 0.032 0.008 0.000 \n", + "3 0.000 0.000 0.000 \n", + "4 0.008 0.000 0.000 \n", + "... ... ... ... 
\n", + "145248 0.000 0.000 0.000 \n", + "145249 0.010 0.000 0.000 \n", + "145250 0.000 0.014 0.000 \n", + "145251 0.000 0.000 0.000 \n", + "145252 0.000 0.872 0.000 \n", + "\n", + "[145253 rows x 21 columns]\n" + ] + } + ], + "source": [ + "# Test features produced by the stacked models: 21 columns = 3 models (xgb, lgb, rf) x 7 class probabilities each\n", + "print(stack_ds.X_test) " + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "b9db35dc", + "metadata": {}, + "outputs": [], + "source": [ + "# Use logistic regression (lr) as the final layer\n", + "stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={\"solver\": 'lbfgs', \"max_iter\": 1000},use_cache=False)\n", + "predict_stack = stacker.predict()" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "a4a48219", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[9.95173402e-01 2.67623709e-03 4.23846755e-08 ... 3.15435935e-05\n", + " 5.66194220e-06 2.11044140e-03]\n", + " [2.23612439e-02 9.70927685e-01 1.23929922e-03 ... 4.49727904e-03\n", + " 8.73983383e-04 9.97020226e-05]\n", + " [6.22588197e-03 9.89402233e-01 9.81655972e-04 ... 2.83331258e-03\n", + " 5.22139184e-04 3.45071569e-05]\n", + " ...\n", + " [5.36335125e-06 2.06267200e-03 9.90604140e-01 ... 8.55252386e-04\n", + " 4.18405061e-03 1.64678945e-05]\n", + " [9.96602824e-01 2.15991442e-03 7.27481581e-08 ... 3.63552051e-05\n", + " 6.80942632e-06 1.19199377e-03]\n", + " [5.89156494e-05 1.15333400e-03 1.09178439e-02 ... 
3.09244417e-04\n", + " 9.85167196e-01 2.21261408e-05]]\n" + ] + } + ], + "source": [ + "print(predict_stack) # stacking后的结果" + ] + }, + { + "cell_type": "markdown", + "id": "1372d4f8", + "metadata": {}, + "source": [ + "## 验证结果" + ] + }, + { + "cell_type": "markdown", + "id": "52ef71d4", + "metadata": {}, + "source": [ + "### 单模分数" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "a28806a0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9284696357390209\n", + "0.8890005714167694\n", + "0.9511404239499356\n" + ] + } + ], + "source": [ + "print(accuracy_score(np.argmax(stack_ds.X_test.iloc[:, :7].values, axis=1),y_test)) # XGB\n", + "print(accuracy_score(np.argmax(stack_ds.X_test.iloc[:, 7:14].values, axis=1),y_test)) # LGB\n", + "print(accuracy_score(np.argmax(stack_ds.X_test.iloc[:, 14:].values, axis=1),y_test)) # RF" + ] + }, + { + "cell_type": "markdown", + "id": "2e9423ce", + "metadata": {}, + "source": [ + "### 线性加权分数" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "d2b50ba4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "主观根据结果blending: 0.9425209806337908\n", + "根据最优权重的blending: 0.9488616414118813\n" + ] + } + ], + "source": [ + "# blending的分数\n", + "xgb_t = stack_ds.X_test.iloc[:, :7].values\n", + "lgb_t = stack_ds.X_test.iloc[:, 7:14].values\n", + "rf_t = stack_ds.X_test.iloc[:, 14:].values\n", + "\n", + "# 根据分数好坏随机定\n", + "result = 0.3*xgb_t+0.2*lgb_t+0.5*rf_t\n", + "print('主观根据结果blending:', accuracy_score(np.argmax(result, axis=1), y_test))\n", + "# 根据上面提供的最优权重 Best Weights: [0.36556831 0.00303401 0.63139768]\n", + "result = 0.36556831*xgb_t+0.00303401*lgb_t+0.63139768*rf_t\n", + "print('根据最优权重的blending:',accuracy_score(np.argmax(result, axis=1), y_test))" + ] + }, + { + "cell_type": "markdown", + "id": "dfec8968", + "metadata": {}, + "source": [ + "可以观察到最优权重比我们主观选权重更优" + ] + }, + { + "cell_type": "markdown", 
+ "id": "e8daf1e3", + "metadata": {}, + "source": [ + "### stacking的分数" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "4930b407", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.957439777491687\n" + ] + } + ], + "source": [ + "print(accuracy_score(np.argmax(predict_stack, axis=1), y_test))" + ] + }, + { + "cell_type": "markdown", + "id": "6311fbd7", + "metadata": {}, + "source": [ + "## 再说结论,该数据集(fetch_covtype)Stacking的方法更好" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/竞赛优胜技巧/assets/stacking.jpg b/竞赛优胜技巧/assets/stacking.jpg new file mode 100644 index 0000000..fead306 Binary files /dev/null and b/竞赛优胜技巧/assets/stacking.jpg differ