diff --git a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型预测及评估.ipynb b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型预测及评估.ipynb new file mode 100644 index 0000000..c442ea5 --- /dev/null +++ b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型预测及评估.ipynb @@ -0,0 +1,1296 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import lightgbm as lgb # 模型\n", + "import pandas as pd # 数据处理包\n", + "import numpy as np # 数据处理包\n", + "from sklearn import metrics # 混淆矩阵\n", + "from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split # 分层五折验证包、寻找最优参函数、切分数据\n", + "from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix # 准确率、roc计算、auc计算、混淆矩阵\n", + "import matplotlib.pyplot as plt # 图形处理包\n", + "import itertools # 处理混淆矩阵\n", + "import gc # 处理缓存,有兴趣的可以搜搜怎么使用\n", + "import warnings # 忽略普通警告,不打印太多东西\n", + "warnings.filterwarnings('ignore')\n", + "plt.rcParams['font.sans-serif']=['SimHei'] # 让图形可以显示中文\n", + "plt.rcParams['axes.unicode_minus']=False" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | user_id | \n", + "sku_id | \n", + "cate | \n", + "action_before_3_1.0_x | \n", + "action_before_3_2.0_x | \n", + "action_before_3_3.0_x | \n", + "action_before_3_4.0_x | \n", + "action_before_3_5.0_x | \n", + "action_before_3_6.0_x | \n", + "action_before_3_1.0_y | \n", + "... | \n", + "cate_action_5_mean | \n", + "cate_action_6_mean | \n", + "has_bad_comment | \n", + "bad_comment_rate | \n", + "comment_num_0 | \n", + "comment_num_1 | \n", + "comment_num_2 | \n", + "comment_num_3 | \n", + "comment_num_4 | \n", + "label | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "202633.0 | \n", + "12564.0 | \n", + "8.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "2.0 | \n", + "1.0 | \n", + "... | \n", + "20.866667 | \n", + "5167.6 | \n", + "1.0 | \n", + "0.0260 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "1.0 | \n", + "
1 | \n", + "218498.0 | \n", + "149854.0 | \n", + "8.0 | \n", + "4.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "4.0 | \n", + "2.0 | \n", + "... | \n", + "20.866667 | \n", + "5167.6 | \n", + "1.0 | \n", + "0.0403 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "1.0 | \n", + "
2 | \n", + "221842.0 | \n", + "75877.0 | \n", + "8.0 | \n", + "3.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "5.0 | \n", + "79.0 | \n", + "... | \n", + "20.866667 | \n", + "5167.6 | \n", + "1.0 | \n", + "0.0245 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "1.0 | \n", + "
3 | \n", + "222886.0 | \n", + "154636.0 | \n", + "8.0 | \n", + "20.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "26.0 | \n", + "10.0 | \n", + "... | \n", + "20.866667 | \n", + "5167.6 | \n", + "1.0 | \n", + "0.0208 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "1.0 | \n", + "
4 | \n", + "235240.0 | \n", + "38222.0 | \n", + "8.0 | \n", + "30.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "28.0 | \n", + "55.0 | \n", + "... | \n", + "20.866667 | \n", + "5167.6 | \n", + "1.0 | \n", + "0.0166 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "1.0 | \n", + "
5 rows × 237 columns
\n", + "\n", + " | user_id | \n", + "sku_id | \n", + "cate | \n", + "action_before_3_1.0_x | \n", + "action_before_3_2.0_x | \n", + "action_before_3_3.0_x | \n", + "action_before_3_4.0_x | \n", + "action_before_3_5.0_x | \n", + "action_before_3_6.0_x | \n", + "action_before_3_1.0_y | \n", + "... | \n", + "cate_action_4_mean | \n", + "cate_action_5_mean | \n", + "cate_action_6_mean | \n", + "has_bad_comment | \n", + "bad_comment_rate | \n", + "comment_num_0 | \n", + "comment_num_1 | \n", + "comment_num_2 | \n", + "comment_num_3 | \n", + "comment_num_4 | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "202633.0 | \n", + "12564.0 | \n", + "8.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "2.0 | \n", + "1.0 | \n", + "... | \n", + "8.4 | \n", + "20.866667 | \n", + "5167.6 | \n", + "1.0 | \n", + "0.0260 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
1 | \n", + "218498.0 | \n", + "149854.0 | \n", + "8.0 | \n", + "4.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "4.0 | \n", + "2.0 | \n", + "... | \n", + "8.4 | \n", + "20.866667 | \n", + "5167.6 | \n", + "1.0 | \n", + "0.0403 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
2 | \n", + "221842.0 | \n", + "75877.0 | \n", + "8.0 | \n", + "3.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "5.0 | \n", + "79.0 | \n", + "... | \n", + "8.4 | \n", + "20.866667 | \n", + "5167.6 | \n", + "1.0 | \n", + "0.0245 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
3 | \n", + "222886.0 | \n", + "154636.0 | \n", + "8.0 | \n", + "20.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "26.0 | \n", + "10.0 | \n", + "... | \n", + "8.4 | \n", + "20.866667 | \n", + "5167.6 | \n", + "1.0 | \n", + "0.0208 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
4 | \n", + "235240.0 | \n", + "38222.0 | \n", + "8.0 | \n", + "30.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "28.0 | \n", + "55.0 | \n", + "... | \n", + "8.4 | \n", + "20.866667 | \n", + "5167.6 | \n", + "1.0 | \n", + "0.0166 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
5 rows × 236 columns
\n", + "\n", + " | label | \n", + "
---|---|
0 | \n", + "1.0 | \n", + "
1 | \n", + "1.0 | \n", + "
2 | \n", + "1.0 | \n", + "
3 | \n", + "1.0 | \n", + "
4 | \n", + "1.0 | \n", + "
\n", + " | cate | \n", + "action_before_3_1.0_x | \n", + "action_before_3_2.0_x | \n", + "action_before_3_3.0_x | \n", + "action_before_3_4.0_x | \n", + "action_before_3_5.0_x | \n", + "action_before_3_6.0_x | \n", + "action_before_3_1.0_y | \n", + "action_before_3_2.0_y | \n", + "action_before_3_3.0_y | \n", + "... | \n", + "cate_action_4_mean | \n", + "cate_action_5_mean | \n", + "cate_action_6_mean | \n", + "has_bad_comment | \n", + "bad_comment_rate | \n", + "comment_num_0 | \n", + "comment_num_1 | \n", + "comment_num_2 | \n", + "comment_num_3 | \n", + "comment_num_4 | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "8.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "2.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "8.4 | \n", + "20.866667 | \n", + "5167.6 | \n", + "1.0 | \n", + "0.0260 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
1 | \n", + "8.0 | \n", + "4.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "4.0 | \n", + "2.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "8.4 | \n", + "20.866667 | \n", + "5167.6 | \n", + "1.0 | \n", + "0.0403 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
2 | \n", + "8.0 | \n", + "3.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "5.0 | \n", + "79.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "8.4 | \n", + "20.866667 | \n", + "5167.6 | \n", + "1.0 | \n", + "0.0245 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
3 | \n", + "8.0 | \n", + "20.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "26.0 | \n", + "10.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "8.4 | \n", + "20.866667 | \n", + "5167.6 | \n", + "1.0 | \n", + "0.0208 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
4 | \n", + "8.0 | \n", + "30.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "28.0 | \n", + "55.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "8.4 | \n", + "20.866667 | \n", + "5167.6 | \n", + "1.0 | \n", + "0.0166 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
5 rows × 234 columns
\n", + "\n", + " | user_id | \n", + "sku_id | \n", + "cate | \n", + "action_before_3_1.0_x | \n", + "action_before_3_2.0_x | \n", + "action_before_3_3.0_x | \n", + "action_before_3_4.0_x | \n", + "action_before_3_5.0_x | \n", + "action_before_3_6.0_x | \n", + "action_before_3_1.0_y | \n", + "... | \n", + "cate_action_4_mean | \n", + "cate_action_5_mean | \n", + "cate_action_6_mean | \n", + "has_bad_comment | \n", + "bad_comment_rate | \n", + "comment_num_0 | \n", + "comment_num_1 | \n", + "comment_num_2 | \n", + "comment_num_3 | \n", + "comment_num_4 | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "200005.0 | \n", + "67444.0 | \n", + "4.0 | \n", + "2.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "3.0 | \n", + "26.0 | \n", + "... | \n", + "73.400000 | \n", + "169.366667 | \n", + "48251.0 | \n", + "1.0 | \n", + "0.0821 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
1 | \n", + "200005.0 | \n", + "72967.0 | \n", + "4.0 | \n", + "26.0 | \n", + "1.0 | \n", + "0.0 | \n", + "1.0 | \n", + "0.0 | \n", + "30.0 | \n", + "2.0 | \n", + "... | \n", + "73.400000 | \n", + "169.366667 | \n", + "48251.0 | \n", + "1.0 | \n", + "0.0196 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
2 | \n", + "200007.0 | \n", + "26229.0 | \n", + "9.0 | \n", + "2.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "2.0 | \n", + "12.0 | \n", + "... | \n", + "20.766667 | \n", + "56.700000 | \n", + "12937.7 | \n", + "1.0 | \n", + "0.0198 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
3 | \n", + "200007.0 | \n", + "63315.0 | \n", + "9.0 | \n", + "4.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "3.0 | \n", + "10.0 | \n", + "... | \n", + "20.766667 | \n", + "56.700000 | \n", + "12937.7 | \n", + "1.0 | \n", + "0.0476 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
4 | \n", + "200007.0 | \n", + "126404.0 | \n", + "9.0 | \n", + "4.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "3.0 | \n", + "10.0 | \n", + "... | \n", + "20.766667 | \n", + "56.700000 | \n", + "12937.7 | \n", + "0.0 | \n", + "0.0000 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
5 rows × 236 columns
\n", + "\n", + " | user_id | \n", + "sku_id | \n", + "cate | \n", + "action_before_3_1.0_x | \n", + "action_before_3_2.0_x | \n", + "action_before_3_3.0_x | \n", + "action_before_3_4.0_x | \n", + "action_before_3_5.0_x | \n", + "action_before_3_6.0_x | \n", + "action_before_3_1.0_y | \n", + "... | \n", + "cate_action_4_mean | \n", + "cate_action_5_mean | \n", + "cate_action_6_mean | \n", + "has_bad_comment | \n", + "bad_comment_rate | \n", + "comment_num_0 | \n", + "comment_num_1 | \n", + "comment_num_2 | \n", + "comment_num_3 | \n", + "comment_num_4 | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "200005.0 | \n", + "67444.0 | \n", + "4.0 | \n", + "2.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "3.0 | \n", + "26.0 | \n", + "... | \n", + "73.400000 | \n", + "169.366667 | \n", + "48251.0 | \n", + "1.0 | \n", + "0.0821 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
1 | \n", + "200005.0 | \n", + "72967.0 | \n", + "4.0 | \n", + "26.0 | \n", + "1.0 | \n", + "0.0 | \n", + "1.0 | \n", + "0.0 | \n", + "30.0 | \n", + "2.0 | \n", + "... | \n", + "73.400000 | \n", + "169.366667 | \n", + "48251.0 | \n", + "1.0 | \n", + "0.0196 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
2 | \n", + "200007.0 | \n", + "26229.0 | \n", + "9.0 | \n", + "2.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "2.0 | \n", + "12.0 | \n", + "... | \n", + "20.766667 | \n", + "56.700000 | \n", + "12937.7 | \n", + "1.0 | \n", + "0.0198 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
3 | \n", + "200007.0 | \n", + "63315.0 | \n", + "9.0 | \n", + "4.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "3.0 | \n", + "10.0 | \n", + "... | \n", + "20.766667 | \n", + "56.700000 | \n", + "12937.7 | \n", + "1.0 | \n", + "0.0476 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
4 | \n", + "200007.0 | \n", + "126404.0 | \n", + "9.0 | \n", + "4.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "3.0 | \n", + "10.0 | \n", + "... | \n", + "20.766667 | \n", + "56.700000 | \n", + "12937.7 | \n", + "0.0 | \n", + "0.0000 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
5 rows × 236 columns
\n", + "