# Stacking

## 先说结论，该数据集（fetch_covtype）Stacking的方法相比Blending和线性加权更好
比赛中我们常用线性加权作为最终的融合方式，我们同样也会好奇怎样的线性加权权重更好，下面也会举例子
参考：https://github.com/rushter/heamy/tree/master/examples

通过对训练集进行五折交叉验证，将各折验证集上的预测结果拼接起来作为第二层的训练集，基模型对测试集的预测结果作为第二层的测试集
<img src="assets/stacking.jpg" width="50%">

In [1]:
pip install heamy  # 安装相关包

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting heamy
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/20/32/2f3e1efa38a8e34f790d90b6d49ef06ab812181ae896c50e89b8750fa5a0/heamy-0.0.7.tar.gz (30 kB)
Building wheels for collected packages: heamy
  Building wheel for heamy (setup.py): started
  Building wheel for heamy (setup.py): finished with status 'done'
  Created wheel for heamy: filename=heamy-0.0.7-py2.py3-none-any.whl size=15353 sha256=e3ba65b34e2bdee3b90b45b637e28836afdbdb0c9547f76b36fe10d17f8aba8f
  Stored in directory: c:\users\administrator\appdata\local\pip\cache\wheels\6e\f1\7d\048e558da94f495a0ed0d9c09d312e73eb176a092e36774ec2
Successfully built heamy
Installing collected packages: heamy
Successfully installed heamy-0.0.7
Note: you may need to restart the kernel to use updated packages.


In [1]:
import sys
print(sys.version)  # Python interpreter version information

3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]


In [2]:
import numpy as np
import time

from heamy.dataset import Dataset
from heamy.estimator import Classifier 
from heamy.pipeline import ModelsPipeline
# 导入相关模型，没有的pip install xxx 即可
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb 
import lightgbm as lgb 

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import log_loss

## 准备数据集

In [3]:
from sklearn.datasets import fetch_covtype
# Load the covtype dataset; the result is indexed by 'data' and 'target' in the preprocessing cell.
data = fetch_covtype()

In [6]:
# Preprocessing: separate the feature matrix from the class labels.
X, y = data['data'], data['target']
# The models need labels that start at 0, so the raw labels 1..7 are
# re-encoded to 0..6 (the recorded output shows them as floats).
print('七分类任务，处理前：', np.unique(y))
print(y)
# Named `label_encoder` rather than `ord` so the builtin ord() is not shadowed.
label_encoder = OrdinalEncoder()
# The encoder is given a 2-D column, hence the reshape to (-1, 1).
y = label_encoder.fit_transform(y.reshape(-1, 1))
# Flatten the encoded column back to a 1-D label vector.
y = y.reshape(-1, )
print('七分类任务，处理后：', np.unique(y))
print(y)

七分类任务，处理前： [1 2 3 4 5 6 7]
[5 5 2 ... 3 3 3]
七分类任务，处理后： [0. 1. 2. 3. 4. 5. 6.]
[4. 4. 1. ... 2. 2. 2.]


In [7]:
# Split into training and test sets; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)
# Show both shapes, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(435759, 54)
(145253, 54)


In [8]:
# Create the heamy dataset.
# Quoted note on `use_cache` (bool, default True): if use_cache=True then the
# preprocessing step will be cached until the function code is changed.
# y_test=None: the test labels are not handed to the dataset, so they cannot
# leak into training.
dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test, y_test=None, use_cache=True)
print(dataset)

Dataset(5c3ccfb5c81451d098565ef5e7e36ac5)


In [9]:
# Training features as stored on the dataset object
dataset.X_train

array([[2.833e+03, 2.580e+02, 2.600e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [3.008e+03, 4.500e+01, 2.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.949e+03, 0.000e+00, 1.100e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [3.153e+03, 2.870e+02, 1.700e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [3.065e+03, 3.480e+02, 2.100e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [3.021e+03, 2.600e+01, 1.600e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

## 定义模型

In [10]:
def xgb_model(X_train, y_train, X_test, y_test):
    """Train an XGBoost booster on the training data and predict X_test.

    The parameters must be named X_train, y_train, X_test, y_test.
    ``y_test`` is not used inside this function.

    The booster settings are fixed inside the function: a 7-class
    ``multi:softprob`` objective evaluated with ``mlogloss``, trained for
    300 boosting rounds.

    Returns:
        The booster's prediction for X_test (the return value must be the
        prediction for X_test).
    """
    # Settings can be hard-coded here instead of being passed in.
    booster_params = {
        'objective': 'multi:softprob',
        "eval_metric": 'mlogloss',
        "verbosity": 0,
        'num_class': 7,
        'nthread': -1,
    }
    booster = xgb.train(
        booster_params,
        xgb.DMatrix(X_train, y_train),
        num_boost_round=300,
    )
    return booster.predict(xgb.DMatrix(X_test))


def lgb_model(X_train, y_train, X_test, y_test, **parameters):
    """Train a LightGBM booster on the training data and predict X_test.

    Unlike the estimators with built-in settings, this one exposes the
    LightGBM parameters: they arrive as keyword arguments and are forwarded
    unchanged to ``lgb.train``.

    Args:
        X_train: training features.
        y_train: training labels.
        X_test: features to predict.
        y_test: not used inside this function.
        **parameters: LightGBM training parameters.

    Returns:
        The result of ``model.predict(X_test)``.
    """
    # ``**parameters`` always binds a dict (empty when no keywords are given),
    # so it can never be None and is forwarded without any check.
    lgb_train = lgb.Dataset(X_train, y_train)
    model = lgb.train(params=parameters, train_set=lgb_train, num_boost_round=300)
    return model.predict(X_test)


def rf_model(X_train, y_train, X_test, y_test):
    """Fit a random forest on the training data and return predict_proba for X_test.

    The forest uses 100 estimators and ``n_jobs=-1``. ``y_test`` is not used
    inside this function.
    """
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    forest.fit(X_train, y_train)
    return forest.predict_proba(X_test)

## 构建和训练模型

In [11]:
# LightGBM settings, handed to the 'lgb' Classifier through its `parameters` argument.
params = dict(
    objective="multiclass",
    num_class=7,
    n_jobs=-1,
    verbose=-4,
    metric=("multi_logloss",),
)

# One Classifier per base model, all on the same dataset and with use_cache=False.
model_xgb = Classifier(
    dataset=dataset,
    estimator=xgb_model,
    name='xgb',
    use_cache=False,
)
model_lgb = Classifier(
    dataset=dataset,
    estimator=lgb_model,
    name='lgb',
    parameters=params,
    use_cache=False,
)
model_rf = Classifier(
    dataset=dataset,
    estimator=rf_model,
    name='rf',
    use_cache=False,
)

# Group the three base models so they can be weighted, stacked and blended together.
pipeline = ModelsPipeline(model_xgb, model_lgb, model_rf)

In [13]:
%%time
pipeline.find_weights(scorer=log_loss)  # outputs the best weight combination, scored with log_loss

Best Score (log_loss): 0.18646435443714865
Best Weights: [2.53464919e-01 1.48562205e-20 7.46535081e-01]
Wall time: 14min 19s


array([2.53464919e-01, 1.48562205e-20, 7.46535081e-01])

In [22]:
%%time
# Build the stacked feature set with 5 folds (k=5); this step is time-consuming.
stack_ds = pipeline.stack(k=5,seed=42)

Wall time: 1h 39min 20s


In [26]:
# Stacked training set: each base model contributes 7 columns, one predicted probability per class label.
stack_ds.X_train.head()

Unnamed: 0,xgb_0,xgb_1,xgb_2,xgb_3,xgb_4,xgb_5,xgb_6,lgb_0,lgb_1,lgb_2,...,lgb_4,lgb_5,lgb_6,rf_0,rf_1,rf_2,rf_3,rf_4,rf_5,rf_6
0,0.177179,0.818728,2.185222e-07,9.264143e-09,0.004090067,1.725062e-06,1.048052e-06,0.179625,0.804684,3e-06,...,0.01562435,6.370849e-05,1.004259e-08,0.03,0.96,0.0,0.0,0.01,0.0,0.0
1,0.005155,0.994845,7.055579e-10,1.326343e-08,6.331572e-09,1.435787e-09,1.603579e-10,0.008114,0.991886,0.0,...,0.0,0.0,0.0,0.13,0.87,0.0,0.0,0.0,0.0,0.0
2,0.293492,0.706508,3.650662e-10,1.017633e-09,8.82353e-09,6.38408e-10,2.823794e-08,0.831445,0.168555,0.0,...,4.999034e-07,4.01519e-09,4.997854e-09,0.63,0.37,0.0,0.0,0.0,0.0,0.0
3,0.478112,0.521816,3.207779e-06,2.878019e-08,1.0765e-08,2.230641e-06,6.630235e-05,0.465733,0.534184,0.0,...,0.0,0.0,8.245405e-05,0.55,0.45,0.0,0.0,0.0,0.0,0.0
4,0.99243,0.006652,1.233117e-05,1.887496e-07,1.569583e-06,5.60426e-07,0.0009037877,0.93205,0.043451,0.0,...,0.0,0.0,0.02449972,0.97,0.03,0.0,0.0,0.0,0.0,0.0


In [28]:
# Stacked test set: each base model contributes 7 columns, one predicted probability per class label.
stack_ds.X_test.head()

Unnamed: 0,xgb_0,xgb_1,xgb_2,xgb_3,xgb_4,xgb_5,xgb_6,lgb_0,lgb_1,lgb_2,...,lgb_4,lgb_5,lgb_6,rf_0,rf_1,rf_2,rf_3,rf_4,rf_5,rf_6
0,0.991493,0.00021,4.796058e-06,6.178684e-08,6.947614e-07,2.41049e-09,0.008291459,0.0,0.0,0.0,...,0.0,0.0,1.0,0.99,0.0,0.0,0.0,0.0,0.0,0.01
1,0.024731,0.964372,0.0006387765,4.205048e-08,0.01006575,0.0001879628,4.830114e-06,0.065073,0.877354,0.001678,...,0.05530599,0.000589,8.601484e-11,0.09,0.8,0.05,0.0,0.06,0.0,0.0
2,0.00078,0.979776,0.0008593459,1.267791e-07,0.01710379,0.001477527,2.521008e-06,0.005164,0.933849,0.016355,...,0.03657553,0.008057,0.0,0.01,0.97,0.0,0.0,0.01,0.01,0.0
3,0.042695,0.957304,2.283268e-08,4.387427e-08,4.175481e-07,4.406019e-08,6.909629e-10,0.054392,0.945608,0.0,...,4.285638e-08,0.0,0.0,0.04,0.96,0.0,0.0,0.0,0.0,0.0
4,0.000457,0.999334,3.366338e-06,4.893879e-08,0.0002045808,7.889498e-07,1.415576e-09,0.001367,0.995857,0.0,...,0.002776106,0.0,0.0,0.0,0.99,0.0,0.0,0.01,0.0,0.0


In [30]:
%%time
# Final layer: logistic regression on top of the stacked dataset.
stacker = Classifier(
    dataset=stack_ds,
    estimator=LogisticRegression,
    parameters={"solver": 'lbfgs', "max_iter": 1000},
    use_cache=False,
)
predict_stack = stacker.predict()

Wall time: 1min 53s


In [32]:
print(predict_stack)  # result after stacking

[[9.99137967e-01 4.76574513e-04 3.32764186e-07 ... 3.03237700e-06
  1.62161211e-06 3.79326620e-04]
 [2.00732175e-02 9.71682830e-01 1.51484266e-03 ... 5.69138554e-03
  9.40725428e-04 9.60822625e-05]
 [5.54556002e-03 9.91048437e-01 8.04840682e-04 ... 2.11437934e-03
  4.56463787e-04 3.00919502e-05]
 ...
 [4.60179790e-06 1.78298095e-03 9.91553958e-01 ... 7.26752933e-04
  3.79135124e-03 1.53584401e-05]
 [9.96307096e-01 2.43558944e-03 1.01596361e-07 ... 3.94985596e-05
  7.41569805e-06 1.20819024e-03]
 [5.34671504e-05 7.62534718e-04 5.58323657e-03 ... 2.11410908e-04
  9.91805379e-01 1.69502656e-05]]


## 验证结果

### 单模分数

In [33]:
# Single-model accuracy on the test split. Columns 0-6, 7-13 and 14 onward of
# the stacked test set are sliced out as the XGB, LGB and RF probabilities.
# accuracy_score is documented as (y_true, y_pred), so the ground truth goes
# first; the score itself is the same in either order.
print(accuracy_score(y_test, np.argmax(stack_ds.X_test.iloc[:, :7].values, axis=1)))  # XGB
print(accuracy_score(y_test, np.argmax(stack_ds.X_test.iloc[:, 7:14].values, axis=1)))  # LGB
print(accuracy_score(y_test, np.argmax(stack_ds.X_test.iloc[:, 14:].values, axis=1)))  # RF

0.9254473229468583
0.8412562907478676
0.9535087055000585


In [35]:
%%time
# Check that running the single model directly gives a result consistent with the pipeline's.
rf_predict = rf_model(X_train, y_train, X_test, None)
# Ground truth first, matching accuracy_score(y_true, y_pred).
print(accuracy_score(y_test, np.argmax(rf_predict, axis=1)))  # RF

0.9537840870756542
Wall time: 50.8 s


### 线性加权分数

In [37]:
# Linear-weighting scores: combine the three models' test-set probabilities.
xgb_t = stack_ds.X_test.iloc[:, :7].values
lgb_t = stack_ds.X_test.iloc[:, 7:14].values
rf_t = stack_ds.X_test.iloc[:, 14:].values

# Weights picked by hand, loosely following how well each model scored.
result = 0.2*xgb_t+0.1*lgb_t+0.7*rf_t
# Ground truth first, matching accuracy_score(y_true, y_pred).
print('主观根据结果blending：', accuracy_score(y_test, np.argmax(result, axis=1)))
# Weights copied from the find_weights output: Best Weights: [2.53464919e-01 1.48562205e-20 7.46535081e-01]
result = 2.53464919e-01*xgb_t+1.48562205e-20*lgb_t+7.46535081e-01*rf_t
print('根据最优权重的线性加权：', accuracy_score(y_test, np.argmax(result, axis=1)))

主观根据结果blending： 0.9504106627746071
根据最优权重的线性加权： 0.9530749795184953


可以观察到最优权重比我们主观选的权重更优，但对比单模（RF）结果反而下降了

### Blending的分数

In [17]:
%%time
# Blending: build the blended dataset, then put logistic regression on top of it.
blend_ds = pipeline.blend(seed=111)
blender = Classifier(
    dataset=blend_ds,
    estimator=LogisticRegression,
    parameters={"solver": 'lbfgs', "max_iter": 1000},
    use_cache=False,
)
predict_blend = blender.predict()

Wall time: 14min 10s


In [19]:
# Blending accuracy; ground truth first, matching accuracy_score(y_true, y_pred).
print(accuracy_score(y_test, np.argmax(predict_blend, axis=1)))

0.9546859617357301


使用Blending的分数有所提升

### Stacking的分数

In [38]:
# Stacking accuracy; ground truth first, matching accuracy_score(y_true, y_pred).
print(accuracy_score(y_test, np.argmax(predict_stack, axis=1)))

0.9589887988544127


可以明显看到提升的效果

## 再说结论，该数据集（fetch_covtype）Stacking的方法相比Blending和线性加权更好