From 31c0dbe61c9a70df32bce18b4e416b62adb5b308 Mon Sep 17 00:00:00 2001 From: benjas <909336740@qq.com> Date: Fri, 5 Feb 2021 13:40:02 +0800 Subject: [PATCH] =?UTF-8?q?Create=204-=E6=A8=A1=E5=9E=8B=E8=AE=AD=E7=BB=83?= =?UTF-8?q?=E5=92=8C=E9=A2=84=E6=B5=8B.ipynb?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../4-模型训练和预测.ipynb | 2505 +++++++++++++++++ 1 file changed, 2505 insertions(+) create mode 100644 机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型训练和预测.ipynb diff --git a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型训练和预测.ipynb b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型训练和预测.ipynb new file mode 100644 index 0000000..491fa75 --- /dev/null +++ b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型训练和预测.ipynb @@ -0,0 +1,2505 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import pandas as pd\n", + "import numpy as np\n", + "import xgboost as xgb\n", + "from sklearn.model_selection import train_test_split\n", + "import operator\n", + "from matplotlib import pylab as plt\n", + "from datetime import datetime\n", + "import time\n", + "from sklearn.model_selection import GridSearchCV" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idsku_idcateaction_before_3_1.0_xaction_before_3_2.0_xaction_before_3_3.0_xaction_before_3_4.0_xaction_before_3_5.0_xaction_before_3_6.0_xaction_before_3_1.0_y...cate_action_5_meancate_action_6_meanhas_bad_commentbad_comment_ratecomment_num_0comment_num_1comment_num_2comment_num_3comment_num_4label
0202633.012564.08.01.00.00.00.00.02.01.0...20.8666675167.61.00.02600.00.00.00.01.01.0
1218498.0149854.08.04.00.00.00.00.04.02.0...20.8666675167.61.00.04030.00.00.00.01.01.0
2221842.075877.08.03.00.00.00.00.05.079.0...20.8666675167.61.00.02450.00.00.00.01.01.0
3222886.0154636.08.020.01.00.00.00.026.010.0...20.8666675167.61.00.02080.00.00.00.01.01.0
4235240.038222.08.030.01.00.00.00.028.055.0...20.8666675167.61.00.01660.00.00.00.01.01.0
\n", + "

5 rows × 237 columns

\n", + "
" + ], + "text/plain": [ + " user_id sku_id cate action_before_3_1.0_x action_before_3_2.0_x \\\n", + "0 202633.0 12564.0 8.0 1.0 0.0 \n", + "1 218498.0 149854.0 8.0 4.0 0.0 \n", + "2 221842.0 75877.0 8.0 3.0 0.0 \n", + "3 222886.0 154636.0 8.0 20.0 1.0 \n", + "4 235240.0 38222.0 8.0 30.0 1.0 \n", + "\n", + " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "\n", + " action_before_3_6.0_x action_before_3_1.0_y ... cate_action_5_mean \\\n", + "0 2.0 1.0 ... 20.866667 \n", + "1 4.0 2.0 ... 20.866667 \n", + "2 5.0 79.0 ... 20.866667 \n", + "3 26.0 10.0 ... 20.866667 \n", + "4 28.0 55.0 ... 20.866667 \n", + "\n", + " cate_action_6_mean has_bad_comment bad_comment_rate comment_num_0 \\\n", + "0 5167.6 1.0 0.0260 0.0 \n", + "1 5167.6 1.0 0.0403 0.0 \n", + "2 5167.6 1.0 0.0245 0.0 \n", + "3 5167.6 1.0 0.0208 0.0 \n", + "4 5167.6 1.0 0.0166 0.0 \n", + "\n", + " comment_num_1 comment_num_2 comment_num_3 comment_num_4 label \n", + "0 0.0 0.0 0.0 1.0 1.0 \n", + "1 0.0 0.0 0.0 1.0 1.0 \n", + "2 0.0 0.0 0.0 1.0 1.0 \n", + "3 0.0 0.0 0.0 1.0 1.0 \n", + "4 0.0 0.0 0.0 1.0 1.0 \n", + "\n", + "[5 rows x 237 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = pd.read_csv('data/train_set.csv')\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['user_id', 'sku_id', 'cate', 'action_before_3_1.0_x',\n", + " 'action_before_3_2.0_x', 'action_before_3_3.0_x',\n", + " 'action_before_3_4.0_x', 'action_before_3_5.0_x',\n", + " 'action_before_3_6.0_x', 'action_before_3_1.0_y',\n", + " ...\n", + " 'cate_action_5_mean', 'cate_action_6_mean', 'has_bad_comment',\n", + " 'bad_comment_rate', 'comment_num_0', 'comment_num_1', 'comment_num_2',\n", + " 'comment_num_3', 'comment_num_4', 
'label'],\n", + " dtype='object', length=237)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idsku_idcateaction_before_3_1.0_xaction_before_3_2.0_xaction_before_3_3.0_xaction_before_3_4.0_xaction_before_3_5.0_xaction_before_3_6.0_xaction_before_3_1.0_y...cate_action_4_meancate_action_5_meancate_action_6_meanhas_bad_commentbad_comment_ratecomment_num_0comment_num_1comment_num_2comment_num_3comment_num_4
0202633.012564.08.01.00.00.00.00.02.01.0...8.420.8666675167.61.00.02600.00.00.00.01.0
1218498.0149854.08.04.00.00.00.00.04.02.0...8.420.8666675167.61.00.04030.00.00.00.01.0
2221842.075877.08.03.00.00.00.00.05.079.0...8.420.8666675167.61.00.02450.00.00.00.01.0
3222886.0154636.08.020.01.00.00.00.026.010.0...8.420.8666675167.61.00.02080.00.00.00.01.0
4235240.038222.08.030.01.00.00.00.028.055.0...8.420.8666675167.61.00.01660.00.00.00.01.0
\n", + "

5 rows × 236 columns

\n", + "
" + ], + "text/plain": [ + " user_id sku_id cate action_before_3_1.0_x action_before_3_2.0_x \\\n", + "0 202633.0 12564.0 8.0 1.0 0.0 \n", + "1 218498.0 149854.0 8.0 4.0 0.0 \n", + "2 221842.0 75877.0 8.0 3.0 0.0 \n", + "3 222886.0 154636.0 8.0 20.0 1.0 \n", + "4 235240.0 38222.0 8.0 30.0 1.0 \n", + "\n", + " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "\n", + " action_before_3_6.0_x action_before_3_1.0_y ... cate_action_4_mean \\\n", + "0 2.0 1.0 ... 8.4 \n", + "1 4.0 2.0 ... 8.4 \n", + "2 5.0 79.0 ... 8.4 \n", + "3 26.0 10.0 ... 8.4 \n", + "4 28.0 55.0 ... 8.4 \n", + "\n", + " cate_action_5_mean cate_action_6_mean has_bad_comment bad_comment_rate \\\n", + "0 20.866667 5167.6 1.0 0.0260 \n", + "1 20.866667 5167.6 1.0 0.0403 \n", + "2 20.866667 5167.6 1.0 0.0245 \n", + "3 20.866667 5167.6 1.0 0.0208 \n", + "4 20.866667 5167.6 1.0 0.0166 \n", + "\n", + " comment_num_0 comment_num_1 comment_num_2 comment_num_3 comment_num_4 \n", + "0 0.0 0.0 0.0 0.0 1.0 \n", + "1 0.0 0.0 0.0 0.0 1.0 \n", + "2 0.0 0.0 0.0 0.0 1.0 \n", + "3 0.0 0.0 0.0 0.0 1.0 \n", + "4 0.0 0.0 0.0 0.0 1.0 \n", + "\n", + "[5 rows x 236 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_x = data.loc[:,data.columns != 'label'] # 将数据集拆分为特征(data_x)和标签(data_y)\n", + "data_y = data.loc[:,data.columns == 'label']\n", + "data_x.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
label
01.0
11.0
21.0
31.0
41.0
\n", + "
" + ], + "text/plain": [ + " label\n", + "0 1.0\n", + "1 1.0\n", + "2 1.0\n", + "3 1.0\n", + "4 1.0" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_y.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(11695, 236)\n", + "(2924, 236)\n" + ] + } + ], + "source": [ + "x_train, x_test, y_train, y_test = train_test_split(data_x,data_y,test_size = 0.2, random_state = 0) # 数据切分成两份,训练和测试,8:2切分\n", + "print(x_train.shape)\n", + "print(x_test.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# 上面测试集刚好有近3000条,再切分成两份:前1500条作为训练时的验证集,其余作为最终预测用的测试集\n", + "x_val = x_test.iloc[:1500,:]\n", + "y_val = y_test.iloc[:1500,:]\n", + "\n", + "x_test = x_test.iloc[1500:,:] \n", + "y_test = y_test.iloc[1500:,:]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1500, 236)\n", + "(1424, 236)\n" + ] + } + ], + "source": [ + "print(x_val.shape)\n", + "print(x_test.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cateaction_before_3_1.0_xaction_before_3_2.0_xaction_before_3_3.0_xaction_before_3_4.0_xaction_before_3_5.0_xaction_before_3_6.0_xaction_before_3_1.0_yaction_before_3_2.0_yaction_before_3_3.0_y...cate_action_4_meancate_action_5_meancate_action_6_meanhas_bad_commentbad_comment_ratecomment_num_0comment_num_1comment_num_2comment_num_3comment_num_4
21574.02.00.00.00.00.04.048.00.00.0...25.30000052.13333316112.0333331.00.03440.00.00.00.01.0
246410.02.00.00.00.00.07.0288.00.00.0...0.4333335.0666671273.5000001.00.01320.00.00.00.01.0
103268.02.01.01.00.00.03.042.02.01.0...35.233333149.26666732299.2333331.00.02130.00.00.00.01.0
70258.02.00.00.00.00.02.036.01.00.0...25.733333116.80000024942.6666670.00.00000.00.01.00.00.0
66257.013.00.00.00.00.022.092.05.01.0...17.00000036.1666679447.2666671.00.08000.00.00.00.01.0
\n", + "

5 rows × 234 columns

\n", + "
" + ], + "text/plain": [ + " cate action_before_3_1.0_x action_before_3_2.0_x \\\n", + "2157 4.0 2.0 0.0 \n", + "2464 10.0 2.0 0.0 \n", + "10326 8.0 2.0 1.0 \n", + "7025 8.0 2.0 0.0 \n", + "6625 7.0 13.0 0.0 \n", + "\n", + " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n", + "2157 0.0 0.0 0.0 \n", + "2464 0.0 0.0 0.0 \n", + "10326 1.0 0.0 0.0 \n", + "7025 0.0 0.0 0.0 \n", + "6625 0.0 0.0 0.0 \n", + "\n", + " action_before_3_6.0_x action_before_3_1.0_y action_before_3_2.0_y \\\n", + "2157 4.0 48.0 0.0 \n", + "2464 7.0 288.0 0.0 \n", + "10326 3.0 42.0 2.0 \n", + "7025 2.0 36.0 1.0 \n", + "6625 22.0 92.0 5.0 \n", + "\n", + " action_before_3_3.0_y ... cate_action_4_mean cate_action_5_mean \\\n", + "2157 0.0 ... 25.300000 52.133333 \n", + "2464 0.0 ... 0.433333 5.066667 \n", + "10326 1.0 ... 35.233333 149.266667 \n", + "7025 0.0 ... 25.733333 116.800000 \n", + "6625 1.0 ... 17.000000 36.166667 \n", + "\n", + " cate_action_6_mean has_bad_comment bad_comment_rate comment_num_0 \\\n", + "2157 16112.033333 1.0 0.0344 0.0 \n", + "2464 1273.500000 1.0 0.0132 0.0 \n", + "10326 32299.233333 1.0 0.0213 0.0 \n", + "7025 24942.666667 0.0 0.0000 0.0 \n", + "6625 9447.266667 1.0 0.0800 0.0 \n", + "\n", + " comment_num_1 comment_num_2 comment_num_3 comment_num_4 \n", + "2157 0.0 0.0 0.0 1.0 \n", + "2464 0.0 0.0 0.0 1.0 \n", + "10326 0.0 0.0 0.0 1.0 \n", + "7025 0.0 1.0 0.0 0.0 \n", + "6625 0.0 0.0 0.0 1.0 \n", + "\n", + "[5 rows x 234 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "del x_train['user_id']\n", + "del x_train['sku_id']\n", + "\n", + "del x_val['user_id']\n", + "del x_val['sku_id']\n", + "\n", + "x_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "dtrain = xgb.DMatrix(x_train, label=y_train)\n", + "dvalid = xgb.DMatrix(x_val, label=y_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, 
+ "metadata": {}, + "outputs": [], + "source": [ + "param = {'n_estimators': 4000, 'max_depth': 3, 'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0, \n", + " 'colsample_bytree': 0.8, 'scale_pos_weight':10, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic',\n", + " 'eval_metric':'auc'}" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0]\ttrain-auc:0.938547\teval-auc:0.934522\n", + "Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.\n", + "\n", + "Will train until eval-auc hasn't improved in 10 rounds.\n", + "[1]\ttrain-auc:0.947568\teval-auc:0.944769\n", + "[2]\ttrain-auc:0.952758\teval-auc:0.949358\n", + "[3]\ttrain-auc:0.955704\teval-auc:0.952481\n", + "[4]\ttrain-auc:0.95525\teval-auc:0.952343\n", + "[5]\ttrain-auc:0.957462\teval-auc:0.95475\n", + "[6]\ttrain-auc:0.957636\teval-auc:0.955133\n", + "[7]\ttrain-auc:0.958327\teval-auc:0.955077\n", + "[8]\ttrain-auc:0.958339\teval-auc:0.95549\n", + "[9]\ttrain-auc:0.958235\teval-auc:0.955479\n", + "[10]\ttrain-auc:0.958922\teval-auc:0.955668\n", + "[11]\ttrain-auc:0.959378\teval-auc:0.956423\n", + "[12]\ttrain-auc:0.959666\teval-auc:0.956724\n", + "[13]\ttrain-auc:0.959674\teval-auc:0.957192\n", + "[14]\ttrain-auc:0.960227\teval-auc:0.957447\n", + "[15]\ttrain-auc:0.960316\teval-auc:0.957583\n", + "[16]\ttrain-auc:0.961338\teval-auc:0.959102\n", + "[17]\ttrain-auc:0.961986\teval-auc:0.958905\n", + "[18]\ttrain-auc:0.962297\teval-auc:0.959365\n", + "[19]\ttrain-auc:0.962798\teval-auc:0.959863\n", + "[20]\ttrain-auc:0.963266\teval-auc:0.960243\n", + "[21]\ttrain-auc:0.963661\teval-auc:0.96069\n", + "[22]\ttrain-auc:0.964377\teval-auc:0.961733\n", + "[23]\ttrain-auc:0.964741\teval-auc:0.962419\n", + "[24]\ttrain-auc:0.964886\teval-auc:0.962837\n", + "[25]\ttrain-auc:0.965193\teval-auc:0.963015\n", + "[26]\ttrain-auc:0.965453\teval-auc:0.963356\n", + 
"[27]\ttrain-auc:0.965766\teval-auc:0.963154\n", + "[28]\ttrain-auc:0.965954\teval-auc:0.963015\n", + "[29]\ttrain-auc:0.966586\teval-auc:0.963877\n", + "[30]\ttrain-auc:0.966813\teval-auc:0.963994\n", + "[31]\ttrain-auc:0.967003\teval-auc:0.96425\n", + "[32]\ttrain-auc:0.967469\teval-auc:0.965021\n", + "[33]\ttrain-auc:0.967952\teval-auc:0.964936\n", + "[34]\ttrain-auc:0.968365\teval-auc:0.965487\n", + "[35]\ttrain-auc:0.968576\teval-auc:0.965891\n", + "[36]\ttrain-auc:0.968961\teval-auc:0.966418\n", + "[37]\ttrain-auc:0.969475\teval-auc:0.966474\n", + "[38]\ttrain-auc:0.96979\teval-auc:0.966923\n", + "[39]\ttrain-auc:0.970028\teval-auc:0.967189\n", + "[40]\ttrain-auc:0.970177\teval-auc:0.967397\n", + "[41]\ttrain-auc:0.970596\teval-auc:0.967048\n", + "[42]\ttrain-auc:0.970871\teval-auc:0.967607\n", + "[43]\ttrain-auc:0.971206\teval-auc:0.968102\n", + "[44]\ttrain-auc:0.971298\teval-auc:0.968176\n", + "[45]\ttrain-auc:0.971754\teval-auc:0.968493\n", + "[46]\ttrain-auc:0.971813\teval-auc:0.968546\n", + "[47]\ttrain-auc:0.972115\teval-auc:0.968902\n", + "[48]\ttrain-auc:0.972266\teval-auc:0.968961\n", + "[49]\ttrain-auc:0.972328\teval-auc:0.969046\n", + "[50]\ttrain-auc:0.972632\teval-auc:0.968804\n", + "[51]\ttrain-auc:0.973076\teval-auc:0.968977\n", + "[52]\ttrain-auc:0.973468\teval-auc:0.969573\n", + "[53]\ttrain-auc:0.973681\teval-auc:0.969738\n", + "[54]\ttrain-auc:0.973826\teval-auc:0.970062\n", + "[55]\ttrain-auc:0.974159\teval-auc:0.970142\n", + "[56]\ttrain-auc:0.974259\teval-auc:0.970254\n", + "[57]\ttrain-auc:0.974533\teval-auc:0.970278\n", + "[58]\ttrain-auc:0.974716\teval-auc:0.970485\n", + "[59]\ttrain-auc:0.974808\teval-auc:0.970538\n", + "[60]\ttrain-auc:0.975131\teval-auc:0.9709\n", + "[61]\ttrain-auc:0.975251\teval-auc:0.97106\n", + "[62]\ttrain-auc:0.97532\teval-auc:0.971113\n", + "[63]\ttrain-auc:0.975468\teval-auc:0.971262\n", + "[64]\ttrain-auc:0.975523\teval-auc:0.971342\n", + "[65]\ttrain-auc:0.975621\teval-auc:0.971342\n", + 
"[66]\ttrain-auc:0.975726\teval-auc:0.97132\n", + "[67]\ttrain-auc:0.975945\teval-auc:0.971496\n", + "[68]\ttrain-auc:0.976067\teval-auc:0.971703\n", + "[69]\ttrain-auc:0.976234\teval-auc:0.971991\n", + "[70]\ttrain-auc:0.976296\teval-auc:0.972044\n", + "[71]\ttrain-auc:0.976386\teval-auc:0.972129\n", + "[72]\ttrain-auc:0.976636\teval-auc:0.972087\n", + "[73]\ttrain-auc:0.976809\teval-auc:0.972156\n", + "[74]\ttrain-auc:0.97688\teval-auc:0.972251\n", + "[75]\ttrain-auc:0.977256\teval-auc:0.972459\n", + "[76]\ttrain-auc:0.977306\teval-auc:0.972507\n", + "[77]\ttrain-auc:0.977407\teval-auc:0.972603\n", + "[78]\ttrain-auc:0.977514\teval-auc:0.972656\n", + "[79]\ttrain-auc:0.977588\teval-auc:0.972757\n", + "[80]\ttrain-auc:0.977843\teval-auc:0.972972\n", + "[81]\ttrain-auc:0.977938\teval-auc:0.973036\n", + "[82]\ttrain-auc:0.978056\teval-auc:0.972962\n", + "[83]\ttrain-auc:0.97829\teval-auc:0.973058\n", + "[84]\ttrain-auc:0.978366\teval-auc:0.973132\n", + "[85]\ttrain-auc:0.97844\teval-auc:0.973132\n", + "[86]\ttrain-auc:0.978461\teval-auc:0.973143\n", + "[87]\ttrain-auc:0.97852\teval-auc:0.973207\n", + "[88]\ttrain-auc:0.978731\teval-auc:0.973457\n", + "[89]\ttrain-auc:0.978776\teval-auc:0.973499\n", + "[90]\ttrain-auc:0.978881\teval-auc:0.973446\n", + "[91]\ttrain-auc:0.979052\teval-auc:0.973494\n", + "[92]\ttrain-auc:0.979078\teval-auc:0.973499\n", + "[93]\ttrain-auc:0.979186\teval-auc:0.973637\n", + "[94]\ttrain-auc:0.9793\teval-auc:0.973712\n", + "[95]\ttrain-auc:0.979578\teval-auc:0.973733\n", + "[96]\ttrain-auc:0.979638\teval-auc:0.973797\n", + "[97]\ttrain-auc:0.979718\teval-auc:0.974021\n", + "[98]\ttrain-auc:0.979887\teval-auc:0.973978\n", + "[99]\ttrain-auc:0.9799\teval-auc:0.973957\n", + "[100]\ttrain-auc:0.979966\teval-auc:0.974106\n", + "[101]\ttrain-auc:0.980003\teval-auc:0.974159\n", + "[102]\ttrain-auc:0.98012\teval-auc:0.973994\n", + "[103]\ttrain-auc:0.980258\teval-auc:0.973962\n", + "[104]\ttrain-auc:0.980323\teval-auc:0.973903\n", + 
"[105]\ttrain-auc:0.980386\teval-auc:0.973999\n", + "[106]\ttrain-auc:0.980468\teval-auc:0.973946\n", + "[107]\ttrain-auc:0.980523\teval-auc:0.974058\n", + "[108]\ttrain-auc:0.980577\teval-auc:0.974116\n", + "[109]\ttrain-auc:0.98073\teval-auc:0.974239\n", + "[110]\ttrain-auc:0.98088\teval-auc:0.974244\n", + "[111]\ttrain-auc:0.980953\teval-auc:0.974377\n", + "[112]\ttrain-auc:0.981079\teval-auc:0.974409\n", + "[113]\ttrain-auc:0.981224\teval-auc:0.974499\n", + "[114]\ttrain-auc:0.981241\teval-auc:0.974515\n", + "[115]\ttrain-auc:0.981318\teval-auc:0.97434\n", + "[116]\ttrain-auc:0.981389\teval-auc:0.97451\n", + "[117]\ttrain-auc:0.981489\teval-auc:0.974537\n", + "[118]\ttrain-auc:0.981613\teval-auc:0.974654\n", + "[119]\ttrain-auc:0.981645\teval-auc:0.974765\n", + "[120]\ttrain-auc:0.981738\teval-auc:0.974739\n", + "[121]\ttrain-auc:0.98188\teval-auc:0.974707\n", + "[122]\ttrain-auc:0.98195\teval-auc:0.974643\n", + "[123]\ttrain-auc:0.982098\teval-auc:0.974659\n", + "[124]\ttrain-auc:0.982177\teval-auc:0.974723\n", + "[125]\ttrain-auc:0.982389\teval-auc:0.974941\n", + "[126]\ttrain-auc:0.982517\teval-auc:0.97509\n", + "[127]\ttrain-auc:0.982527\teval-auc:0.975132\n", + "[128]\ttrain-auc:0.982643\teval-auc:0.97517\n", + "[129]\ttrain-auc:0.982795\teval-auc:0.97509\n", + "[130]\ttrain-auc:0.982866\teval-auc:0.975122\n", + "[131]\ttrain-auc:0.98296\teval-auc:0.975186\n", + "[132]\ttrain-auc:0.983059\teval-auc:0.975223\n", + "[133]\ttrain-auc:0.983209\teval-auc:0.975143\n", + "[134]\ttrain-auc:0.983343\teval-auc:0.975239\n", + "[135]\ttrain-auc:0.983497\teval-auc:0.975266\n", + "[136]\ttrain-auc:0.983545\teval-auc:0.975228\n", + "[137]\ttrain-auc:0.98368\teval-auc:0.975196\n", + "[138]\ttrain-auc:0.983674\teval-auc:0.975244\n", + "[139]\ttrain-auc:0.983737\teval-auc:0.975223\n", + "[140]\ttrain-auc:0.983804\teval-auc:0.97518\n", + "[141]\ttrain-auc:0.983939\teval-auc:0.975143\n", + "[142]\ttrain-auc:0.983985\teval-auc:0.975159\n", + 
"[143]\ttrain-auc:0.984077\teval-auc:0.975095\n", + "[144]\ttrain-auc:0.984248\teval-auc:0.975074\n", + "[145]\ttrain-auc:0.984285\teval-auc:0.975042\n", + "Stopping. Best iteration:\n", + "[135]\ttrain-auc:0.983497\teval-auc:0.975266\n", + "\n" + ] + } + ], + "source": [ + "num_round = param['n_estimators']\n", + "\n", + "plst = param.items()\n", + "evallist = [(dtrain, 'train'), (dvalid, 'eval')]\n", + "best = xgb.train(plst, dtrain, num_round, evallist, early_stopping_rounds=10) # 寻找最优参\n", + "best.save_model('bst.model')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'best_iteration': '135',\n", + " 'best_msg': '[135]\\ttrain-auc:0.983497\\teval-auc:0.975266',\n", + " 'best_score': '0.975266'}" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "best.attributes()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "def create_feature_map(features):\n", + " outfile = open(r'xgb.fmap', 'w')\n", + " i = 0\n", + " for feat in features:\n", + " outfile.write('{0}\\t{1}\\tq\\n'.format(i, feat))\n", + " i = i + 1\n", + " outfile.close()\n", + "\n", + "\n", + "features = list(x_train.columns[:])\n", + "create_feature_map(features)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "def feature_importance(best_xgb):\n", + " importance = best_xgb.get_fscore(fmap=r'xgb.fmap')\n", + " importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)\n", + "\n", + " df = pd.DataFrame(importance, columns=['feature', 'fscore'])\n", + " df['fscore'] = df['fscore'] / df['fscore'].sum()\n", + " file_name = 'data/feature_importance_' + str(datetime.now().date())[5:] + '.csv'\n", + " df.to_csv(file_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + 
"source": [ + "feature_importance(best)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0featurefscore
00brand0.077852
11action_before_10_5.0_x0.041611
22bad_comment_rate0.038926
33product_action_5_ratio0.028188
44user_lv_cd_20.025503
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 feature fscore\n", + "0 0 brand 0.077852\n", + "1 1 action_before_10_5.0_x 0.041611\n", + "2 2 bad_comment_rate 0.038926\n", + "3 3 product_action_5_ratio 0.028188\n", + "4 4 user_lv_cd_2 0.025503" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fi = pd.read_csv('data/feature_importance_02-05.csv')\n", + "fi.sort_values(\"fscore\", inplace=True, ascending=False)\n", + "fi.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idsku_idcateaction_before_3_1.0_xaction_before_3_2.0_xaction_before_3_3.0_xaction_before_3_4.0_xaction_before_3_5.0_xaction_before_3_6.0_xaction_before_3_1.0_y...cate_action_4_meancate_action_5_meancate_action_6_meanhas_bad_commentbad_comment_ratecomment_num_0comment_num_1comment_num_2comment_num_3comment_num_4
6765235394.024371.08.08.00.00.00.01.018.012.0...25.733333116.824942.6666671.00.02060.00.00.00.01.0
13767272948.0108907.04.08.00.00.00.00.07.00.0...61.633333143.941669.2333330.00.00000.00.01.00.00.0
9672245846.063026.010.01.00.00.00.00.00.047.0...1.1000009.02361.6000001.00.09380.00.00.01.00.0
9116272178.066704.09.02.00.00.00.00.04.0112.0...10.40000032.57264.2000000.00.00000.00.00.00.00.0
10055216485.0131364.06.02.01.01.00.00.06.028.0...38.23333388.017558.0333331.00.04730.00.00.00.01.0
\n", + "

5 rows × 236 columns

\n", + "
" + ], + "text/plain": [ + " user_id sku_id cate action_before_3_1.0_x action_before_3_2.0_x \\\n", + "6765 235394.0 24371.0 8.0 8.0 0.0 \n", + "13767 272948.0 108907.0 4.0 8.0 0.0 \n", + "9672 245846.0 63026.0 10.0 1.0 0.0 \n", + "9116 272178.0 66704.0 9.0 2.0 0.0 \n", + "10055 216485.0 131364.0 6.0 2.0 1.0 \n", + "\n", + " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n", + "6765 0.0 0.0 1.0 \n", + "13767 0.0 0.0 0.0 \n", + "9672 0.0 0.0 0.0 \n", + "9116 0.0 0.0 0.0 \n", + "10055 1.0 0.0 0.0 \n", + "\n", + " action_before_3_6.0_x action_before_3_1.0_y ... cate_action_4_mean \\\n", + "6765 18.0 12.0 ... 25.733333 \n", + "13767 7.0 0.0 ... 61.633333 \n", + "9672 0.0 47.0 ... 1.100000 \n", + "9116 4.0 112.0 ... 10.400000 \n", + "10055 6.0 28.0 ... 38.233333 \n", + "\n", + " cate_action_5_mean cate_action_6_mean has_bad_comment \\\n", + "6765 116.8 24942.666667 1.0 \n", + "13767 143.9 41669.233333 0.0 \n", + "9672 9.0 2361.600000 1.0 \n", + "9116 32.5 7264.200000 0.0 \n", + "10055 88.0 17558.033333 1.0 \n", + "\n", + " bad_comment_rate comment_num_0 comment_num_1 comment_num_2 \\\n", + "6765 0.0206 0.0 0.0 0.0 \n", + "13767 0.0000 0.0 0.0 1.0 \n", + "9672 0.0938 0.0 0.0 0.0 \n", + "9116 0.0000 0.0 0.0 0.0 \n", + "10055 0.0473 0.0 0.0 0.0 \n", + "\n", + " comment_num_3 comment_num_4 \n", + "6765 0.0 1.0 \n", + "13767 0.0 0.0 \n", + "9672 1.0 0.0 \n", + "9116 0.0 0.0 \n", + "10055 0.0 1.0 \n", + "\n", + "[5 rows x 236 columns]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "users = x_test[['user_id', 'sku_id', 'cate']].copy()\n", + "del x_test['user_id']\n", + "del x_test['sku_id']\n", + "x_test_DMatrix = xgb.DMatrix(x_test)\n", + "y_pred = bst.predict(x_test_DMatrix, ntree_limit=bst.best_ntree_limit)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cateaction_before_3_1.0_xaction_before_3_2.0_xaction_before_3_3.0_xaction_before_3_4.0_xaction_before_3_5.0_xaction_before_3_6.0_xaction_before_3_1.0_yaction_before_3_2.0_yaction_before_3_3.0_y...cate_action_5_meancate_action_6_meanhas_bad_commentbad_comment_ratecomment_num_0comment_num_1comment_num_2comment_num_3comment_num_4pred_label
67658.08.00.00.00.01.018.012.00.00.0...116.824942.6666671.00.02060.00.00.00.01.00.453736
137674.08.00.00.00.00.07.00.00.00.0...143.941669.2333330.00.00000.00.01.00.00.00.002793
967210.01.00.00.00.00.00.047.00.00.0...9.02361.6000001.00.09380.00.00.01.00.00.000167
91169.02.00.00.00.00.04.0112.00.00.0...32.57264.2000000.00.00000.00.00.00.00.00.000225
100556.02.01.01.00.00.06.028.01.00.0...88.017558.0333331.00.04730.00.00.00.01.00.000507
\n", + "

5 rows × 235 columns

\n", + "
" + ], + "text/plain": [ + " cate action_before_3_1.0_x action_before_3_2.0_x \\\n", + "6765 8.0 8.0 0.0 \n", + "13767 4.0 8.0 0.0 \n", + "9672 10.0 1.0 0.0 \n", + "9116 9.0 2.0 0.0 \n", + "10055 6.0 2.0 1.0 \n", + "\n", + " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n", + "6765 0.0 0.0 1.0 \n", + "13767 0.0 0.0 0.0 \n", + "9672 0.0 0.0 0.0 \n", + "9116 0.0 0.0 0.0 \n", + "10055 1.0 0.0 0.0 \n", + "\n", + " action_before_3_6.0_x action_before_3_1.0_y action_before_3_2.0_y \\\n", + "6765 18.0 12.0 0.0 \n", + "13767 7.0 0.0 0.0 \n", + "9672 0.0 47.0 0.0 \n", + "9116 4.0 112.0 0.0 \n", + "10055 6.0 28.0 1.0 \n", + "\n", + " action_before_3_3.0_y ... cate_action_5_mean cate_action_6_mean \\\n", + "6765 0.0 ... 116.8 24942.666667 \n", + "13767 0.0 ... 143.9 41669.233333 \n", + "9672 0.0 ... 9.0 2361.600000 \n", + "9116 0.0 ... 32.5 7264.200000 \n", + "10055 0.0 ... 88.0 17558.033333 \n", + "\n", + " has_bad_comment bad_comment_rate comment_num_0 comment_num_1 \\\n", + "6765 1.0 0.0206 0.0 0.0 \n", + "13767 0.0 0.0000 0.0 0.0 \n", + "9672 1.0 0.0938 0.0 0.0 \n", + "9116 0.0 0.0000 0.0 0.0 \n", + "10055 1.0 0.0473 0.0 0.0 \n", + "\n", + " comment_num_2 comment_num_3 comment_num_4 pred_label \n", + "6765 0.0 0.0 1.0 0.453736 \n", + "13767 1.0 0.0 0.0 0.002793 \n", + "9672 0.0 1.0 0.0 0.000167 \n", + "9116 0.0 0.0 0.0 0.000225 \n", + "10055 0.0 0.0 1.0 0.000507 \n", + "\n", + "[5 rows x 235 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_test['pred_label'] = y_pred\n", + "x_test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cateaction_before_3_1.0_xaction_before_3_2.0_xaction_before_3_3.0_xaction_before_3_4.0_xaction_before_3_5.0_xaction_before_3_6.0_xaction_before_3_1.0_yaction_before_3_2.0_yaction_before_3_3.0_y...cate_action_5_meancate_action_6_meanhas_bad_commentbad_comment_ratecomment_num_0comment_num_1comment_num_2comment_num_3comment_num_4pred_label
67658.08.00.00.00.01.018.012.00.00.0...116.824942.6666671.00.02060.00.00.00.01.00.0
137674.08.00.00.00.00.07.00.00.00.0...143.941669.2333330.00.00000.00.01.00.00.00.0
967210.01.00.00.00.00.00.047.00.00.0...9.02361.6000001.00.09380.00.00.01.00.00.0
91169.02.00.00.00.00.04.0112.00.00.0...32.57264.2000000.00.00000.00.00.00.00.00.0
100556.02.01.01.00.00.06.028.01.00.0...88.017558.0333331.00.04730.00.00.00.01.00.0
\n", + "

5 rows × 235 columns

\n", + "
" + ], + "text/plain": [ + " cate action_before_3_1.0_x action_before_3_2.0_x \\\n", + "6765 8.0 8.0 0.0 \n", + "13767 4.0 8.0 0.0 \n", + "9672 10.0 1.0 0.0 \n", + "9116 9.0 2.0 0.0 \n", + "10055 6.0 2.0 1.0 \n", + "\n", + " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n", + "6765 0.0 0.0 1.0 \n", + "13767 0.0 0.0 0.0 \n", + "9672 0.0 0.0 0.0 \n", + "9116 0.0 0.0 0.0 \n", + "10055 1.0 0.0 0.0 \n", + "\n", + " action_before_3_6.0_x action_before_3_1.0_y action_before_3_2.0_y \\\n", + "6765 18.0 12.0 0.0 \n", + "13767 7.0 0.0 0.0 \n", + "9672 0.0 47.0 0.0 \n", + "9116 4.0 112.0 0.0 \n", + "10055 6.0 28.0 1.0 \n", + "\n", + " action_before_3_3.0_y ... cate_action_5_mean cate_action_6_mean \\\n", + "6765 0.0 ... 116.8 24942.666667 \n", + "13767 0.0 ... 143.9 41669.233333 \n", + "9672 0.0 ... 9.0 2361.600000 \n", + "9116 0.0 ... 32.5 7264.200000 \n", + "10055 0.0 ... 88.0 17558.033333 \n", + "\n", + " has_bad_comment bad_comment_rate comment_num_0 comment_num_1 \\\n", + "6765 1.0 0.0206 0.0 0.0 \n", + "13767 0.0 0.0000 0.0 0.0 \n", + "9672 1.0 0.0938 0.0 0.0 \n", + "9116 0.0 0.0000 0.0 0.0 \n", + "10055 1.0 0.0473 0.0 0.0 \n", + "\n", + " comment_num_2 comment_num_3 comment_num_4 pred_label \n", + "6765 0.0 0.0 1.0 0.0 \n", + "13767 1.0 0.0 0.0 0.0 \n", + "9672 0.0 1.0 0.0 0.0 \n", + "9116 0.0 0.0 0.0 0.0 \n", + "10055 0.0 0.0 1.0 0.0 \n", + "\n", + "[5 rows x 235 columns]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def label(column):\n", + " if column['pred_label'] > 0.5:\n", + " #rint ('yes')\n", + " column['pred_label'] = 1\n", + " else:\n", + " column['pred_label'] = 0\n", + " return column\n", + "x_test = x_test.apply(label,axis = 1)\n", + "x_test.head() " + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cateaction_before_3_1.0_xaction_before_3_2.0_xaction_before_3_3.0_xaction_before_3_4.0_xaction_before_3_5.0_xaction_before_3_6.0_xaction_before_3_1.0_yaction_before_3_2.0_yaction_before_3_3.0_y...cate_action_6_meanhas_bad_commentbad_comment_ratecomment_num_0comment_num_1comment_num_2comment_num_3comment_num_4pred_labeltrue_label
67658.08.00.00.00.01.018.012.00.00.0...24942.6666671.00.02060.00.00.00.01.00.00.0
137674.08.00.00.00.00.07.00.00.00.0...41669.2333330.00.00000.00.01.00.00.00.00.0
967210.01.00.00.00.00.00.047.00.00.0...2361.6000001.00.09380.00.00.01.00.00.00.0
91169.02.00.00.00.00.04.0112.00.00.0...7264.2000000.00.00000.00.00.00.00.00.00.0
100556.02.01.01.00.00.06.028.01.00.0...17558.0333331.00.04730.00.00.00.01.00.00.0
\n", + "

5 rows × 236 columns

\n", + "
" + ], + "text/plain": [ + " cate action_before_3_1.0_x action_before_3_2.0_x \\\n", + "6765 8.0 8.0 0.0 \n", + "13767 4.0 8.0 0.0 \n", + "9672 10.0 1.0 0.0 \n", + "9116 9.0 2.0 0.0 \n", + "10055 6.0 2.0 1.0 \n", + "\n", + " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n", + "6765 0.0 0.0 1.0 \n", + "13767 0.0 0.0 0.0 \n", + "9672 0.0 0.0 0.0 \n", + "9116 0.0 0.0 0.0 \n", + "10055 1.0 0.0 0.0 \n", + "\n", + " action_before_3_6.0_x action_before_3_1.0_y action_before_3_2.0_y \\\n", + "6765 18.0 12.0 0.0 \n", + "13767 7.0 0.0 0.0 \n", + "9672 0.0 47.0 0.0 \n", + "9116 4.0 112.0 0.0 \n", + "10055 6.0 28.0 1.0 \n", + "\n", + " action_before_3_3.0_y ... cate_action_6_mean has_bad_comment \\\n", + "6765 0.0 ... 24942.666667 1.0 \n", + "13767 0.0 ... 41669.233333 0.0 \n", + "9672 0.0 ... 2361.600000 1.0 \n", + "9116 0.0 ... 7264.200000 0.0 \n", + "10055 0.0 ... 17558.033333 1.0 \n", + "\n", + " bad_comment_rate comment_num_0 comment_num_1 comment_num_2 \\\n", + "6765 0.0206 0.0 0.0 0.0 \n", + "13767 0.0000 0.0 0.0 1.0 \n", + "9672 0.0938 0.0 0.0 0.0 \n", + "9116 0.0000 0.0 0.0 0.0 \n", + "10055 0.0473 0.0 0.0 0.0 \n", + "\n", + " comment_num_3 comment_num_4 pred_label true_label \n", + "6765 0.0 1.0 0.0 0.0 \n", + "13767 0.0 0.0 0.0 0.0 \n", + "9672 1.0 0.0 0.0 0.0 \n", + "9116 0.0 0.0 0.0 0.0 \n", + "10055 0.0 1.0 0.0 0.0 \n", + "\n", + "[5 rows x 236 columns]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_test['true_label'] = y_test\n", + "x_test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cateaction_before_3_1.0_xaction_before_3_2.0_xaction_before_3_3.0_xaction_before_3_4.0_xaction_before_3_5.0_xaction_before_3_6.0_xaction_before_3_1.0_yaction_before_3_2.0_yaction_before_3_3.0_y...bad_comment_ratecomment_num_0comment_num_1comment_num_2comment_num_3comment_num_4pred_labeltrue_labeluser_idsku_id
67658.08.00.00.00.01.018.012.00.00.0...0.02060.00.00.00.01.00.00.0235394.024371.0
137674.08.00.00.00.00.07.00.00.00.0...0.00000.00.01.00.00.00.00.0272948.0108907.0
967210.01.00.00.00.00.00.047.00.00.0...0.09380.00.00.01.00.00.00.0245846.063026.0
91169.02.00.00.00.00.04.0112.00.00.0...0.00000.00.00.00.00.00.00.0272178.066704.0
100556.02.01.01.00.00.06.028.01.00.0...0.04730.00.00.00.01.00.00.0216485.0131364.0
\n", + "

5 rows × 238 columns

\n", + "
" + ], + "text/plain": [ + " cate action_before_3_1.0_x action_before_3_2.0_x \\\n", + "6765 8.0 8.0 0.0 \n", + "13767 4.0 8.0 0.0 \n", + "9672 10.0 1.0 0.0 \n", + "9116 9.0 2.0 0.0 \n", + "10055 6.0 2.0 1.0 \n", + "\n", + " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n", + "6765 0.0 0.0 1.0 \n", + "13767 0.0 0.0 0.0 \n", + "9672 0.0 0.0 0.0 \n", + "9116 0.0 0.0 0.0 \n", + "10055 1.0 0.0 0.0 \n", + "\n", + " action_before_3_6.0_x action_before_3_1.0_y action_before_3_2.0_y \\\n", + "6765 18.0 12.0 0.0 \n", + "13767 7.0 0.0 0.0 \n", + "9672 0.0 47.0 0.0 \n", + "9116 4.0 112.0 0.0 \n", + "10055 6.0 28.0 1.0 \n", + "\n", + " action_before_3_3.0_y ... bad_comment_rate comment_num_0 \\\n", + "6765 0.0 ... 0.0206 0.0 \n", + "13767 0.0 ... 0.0000 0.0 \n", + "9672 0.0 ... 0.0938 0.0 \n", + "9116 0.0 ... 0.0000 0.0 \n", + "10055 0.0 ... 0.0473 0.0 \n", + "\n", + " comment_num_1 comment_num_2 comment_num_3 comment_num_4 pred_label \\\n", + "6765 0.0 0.0 0.0 1.0 0.0 \n", + "13767 0.0 1.0 0.0 0.0 0.0 \n", + "9672 0.0 0.0 1.0 0.0 0.0 \n", + "9116 0.0 0.0 0.0 0.0 0.0 \n", + "10055 0.0 0.0 0.0 1.0 0.0 \n", + "\n", + " true_label user_id sku_id \n", + "6765 0.0 235394.0 24371.0 \n", + "13767 0.0 272948.0 108907.0 \n", + "9672 0.0 245846.0 63026.0 \n", + "9116 0.0 272178.0 66704.0 \n", + "10055 0.0 216485.0 131364.0 \n", + "\n", + "[5 rows x 238 columns]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_test['user_id'] = users['user_id']\n", + "x_test['sku_id'] = users['sku_id']\n", + "x_test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "126\n", + "248\n", + "267\n" + ] + } + ], + "source": [ + "# 所有购买用户\n", + "all_user_set = x_test[x_test['true_label']==1]['user_id'].unique()\n", + "print (len(all_user_set))\n", + "# 所有预测购买的用户\n", + "all_user_test_set = 
x_test[x_test['pred_label'] == 1]['user_id'].unique()\n", + "print (len(all_user_test_set))\n", + "all_user_test_item_pair = x_test[x_test['pred_label'] == 1]['user_id'].map(str) + '-' + x_test[x_test['pred_label'] == 1]['sku_id'].map(str)\n", + "all_user_test_item_pair = np.array(all_user_test_item_pair)\n", + "print (len(all_user_test_item_pair))" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "所有用户中预测购买用户的准确率为 0.4838709677419355\n", + "所有用户中预测购买用户的召回率0.9523809523809523\n" + ] + } + ], + "source": [ + "pos, neg = 0,0\n", + "for user_id in all_user_test_set:\n", + " if user_id in all_user_set:\n", + " pos += 1\n", + " else:\n", + " neg += 1\n", + "all_user_acc = 1.0 * pos / ( pos + neg)\n", + "all_user_recall = 1.0 * pos / len(all_user_set)\n", + "print ('所有用户中预测购买用户的准确率为 ' + str(all_user_acc))\n", + "print ('所有用户中预测购买用户的召回率' + str(all_user_recall))" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "144\n", + "所有用户中预测购买商品的准确率为 0.5131086142322098\n", + "所有用户中预测购买商品的召回率0.9513888888888888\n", + "F11=0.527086383601757\n", + "F12=0.7091097308488614\n", + "score=0.6363003919500196\n" + ] + } + ], + "source": [ + "#所有实际商品对\n", + "all_user_item_pair = x_test[x_test['true_label']==1]['user_id'].map(str) + '-' + x_test[x_test['true_label']==1]['sku_id'].map(str)\n", + "all_user_item_pair = np.array(all_user_item_pair)\n", + "print (len(all_user_item_pair))\n", + "pos, neg = 0, 0\n", + "for user_item_pair in all_user_test_item_pair:\n", + " #print (user_item_pair)\n", + " if user_item_pair in all_user_item_pair:\n", + " pos += 1\n", + " else:\n", + " neg += 1\n", + "all_item_acc = 1.0 * pos / ( pos + neg)\n", + "all_item_recall = 1.0 * pos / len(all_user_item_pair)\n", + "print ('所有用户中预测购买商品的准确率为 ' + str(all_item_acc))\n", + "print ('所有用户中预测购买商品的召回率' + 
str(all_item_recall))\n", + "F11 = 6.0 * all_user_recall * all_user_acc / (5.0 * all_user_recall + all_user_acc)\n", + "F12 = 5.0 * all_item_acc * all_item_recall / (2.0 * all_item_recall + 3 * all_item_acc)\n", + "score = 0.4 * F11 + 0.6 * F12\n", + "print ('F11=' + str(F11))\n", + "print ('F12=' + str(F12))\n", + "print ('score=' + str(score))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}