diff --git a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型训练和预测.ipynb b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型训练和预测.ipynb deleted file mode 100644 index 7984da2..0000000 --- a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型训练和预测.ipynb +++ /dev/null @@ -1,2505 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "import pandas as pd\n", - "import numpy as np\n", - "import xgboost as xgb\n", - "from sklearn.model_selection import train_test_split\n", - "import operator\n", - "from matplotlib import pylab as plt\n", - "from datetime import datetime\n", - "import time\n", - "from sklearn.model_selection import GridSearchCV" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
# %% Load the engineered training set
data = pd.read_csv('data/train_set.csv')  # feature table that also carries the `label` column
data.head()

# %% Inspect the column names
data.columns

# %% Separate features from the label
# A single boolean mask picks out the `label` column; its complement is the
# feature set. `data_y` is deliberately kept as a one-column DataFrame (not a
# Series) because later cells slice it with two-axis `iloc[rows, :]`.
is_label = data.columns == 'label'
data_x = data.loc[:, ~is_label]
data_y = data.loc[:, is_label]
data_x.head()
# %% Peek at the label frame
data_y.head()

# %% Hold out 20% of the rows; the hold-out is later divided into validation and test
VAL_ROWS = 1500  # leading hold-out rows used for early-stopping validation; the remainder is the test set

# random_state pins the shuffle so the 8:2 split is reproducible.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    data_x, data_y, test_size=0.2, random_state=0
)
print(x_train.shape)
print(x_holdout.shape)

# %% Divide the hold-out into validation and test parts
# Slice from the untouched hold-out frames instead of rebinding x_test to a
# slice of itself: that made the cell non-idempotent (every re-run dropped
# another 1500 rows from the test set).
x_val = x_holdout.iloc[:VAL_ROWS, :]
y_val = y_holdout.iloc[:VAL_ROWS, :]

x_test = x_holdout.iloc[VAL_ROWS:, :]
y_test = y_holdout.iloc[VAL_ROWS:, :]

# %% Confirm the sizes of the two parts
print(x_val.shape)
print(x_test.shape)
# %% Remove identifier columns so the model trains on features only
# drop() returns new frames rather than deleting columns in place from frames
# that earlier cells already displayed; errors='ignore' makes re-running this
# cell a no-op instead of a KeyError.
ID_COLUMNS = ['user_id', 'sku_id']

x_train = x_train.drop(columns=ID_COLUMNS, errors='ignore')
x_val = x_val.drop(columns=ID_COLUMNS, errors='ignore')

x_train.head()

# %% Wrap the training and validation splits in XGBoost's DMatrix container
dtrain = xgb.DMatrix(x_train, label=y_train)
dvalid = xgb.DMatrix(x_val, label=y_val)
# %% Hyper-parameters
# 'n_estimators' is only used below as the boosting-round budget; it is kept in
# `param` so the dict keeps the same keys as before.
param = {'n_estimators': 4000, 'max_depth': 3, 'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0,
         'colsample_bytree': 0.8, 'scale_pos_weight': 10, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic',
         'eval_metric': 'auc'}

# %% Train with early stopping on the validation split
num_round = param['n_estimators']

# xgb.train takes the round count as its own argument, so 'n_estimators' is
# filtered out of the booster parameters, and a real dict is passed instead of
# a dict_items view.
plst = {key: value for key, value in param.items() if key != 'n_estimators'}
evallist = [(dtrain, 'train'), (dvalid, 'eval')]
# Stops once the last entry of evallist ('eval') has not improved for 10 rounds.
best = xgb.train(plst, dtrain, num_round, evallist, early_stopping_rounds=10)
best.save_model('bst.model')

# %% Best iteration / score recorded by early stopping
best.attributes()

# %% Feature-map helper
def create_feature_map(features, fmap_path='xgb.fmap'):
    """Write an XGBoost feature-map file for ``features``.

    Each feature becomes one tab-separated line ``index, name, q``, where the
    index is the feature's position in ``features`` and ``q`` is XGBoost's
    feature-map type marker for a quantitative feature.

    Args:
        features: iterable of feature names, in training-column order.
        fmap_path: destination file; defaults to ``'xgb.fmap'`` as before.
    """
    # The context manager closes the file even if a write raises.
    with open(fmap_path, 'w') as outfile:
        for index, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(index, feat))


# %% Write the feature map for the training columns
features = list(x_train.columns[:])
create_feature_map(features)

# %% Feature-importance helper
def feature_importance(best_xgb, fmap_path='xgb.fmap'):
    """Save normalised split-count importances of ``best_xgb`` to a dated CSV.

    Args:
        best_xgb: trained booster exposing ``get_fscore``.
        fmap_path: feature-map file used to name the features.

    Returns:
        Path of the CSV that was written
        (``data/feature_importance_<MM-DD>.csv`` for today's date), so callers
        can read it back without hard-coding a date.
    """
    importance = best_xgb.get_fscore(fmap=fmap_path)
    ranked = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)

    df = pd.DataFrame(ranked, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()  # normalise so the scores sum to 1
    file_name = 'data/feature_importance_' + datetime.now().strftime('%m-%d') + '.csv'
    df.to_csv(file_name)
    return file_name

# %% Compute and store the importances, remembering where they were written
fi_path = feature_importance(best)

# %% Inspect the most important features
# Read back the file that was just written (not a fixed date), and use the
# saved row index as the index instead of an 'Unnamed: 0' column.
fi = pd.read_csv(fi_path, index_col=0).sort_values('fscore', ascending=False)
fi.head()
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idsku_idcateaction_before_3_1.0_xaction_before_3_2.0_xaction_before_3_3.0_xaction_before_3_4.0_xaction_before_3_5.0_xaction_before_3_6.0_xaction_before_3_1.0_y...cate_action_4_meancate_action_5_meancate_action_6_meanhas_bad_commentbad_comment_ratecomment_num_0comment_num_1comment_num_2comment_num_3comment_num_4
6765235394.024371.08.08.00.00.00.01.018.012.0...25.733333116.824942.6666671.00.02060.00.00.00.01.0
13767272948.0108907.04.08.00.00.00.00.07.00.0...61.633333143.941669.2333330.00.00000.00.01.00.00.0
9672245846.063026.010.01.00.00.00.00.00.047.0...1.1000009.02361.6000001.00.09380.00.00.01.00.0
9116272178.066704.09.02.00.00.00.00.04.0112.0...10.40000032.57264.2000000.00.00000.00.00.00.00.0
10055216485.0131364.06.02.01.01.00.00.06.028.0...38.23333388.017558.0333331.00.04730.00.00.00.01.0
\n", - "

5 rows × 236 columns

\n", - "
" - ], - "text/plain": [ - " user_id sku_id cate action_before_3_1.0_x action_before_3_2.0_x \\\n", - "6765 235394.0 24371.0 8.0 8.0 0.0 \n", - "13767 272948.0 108907.0 4.0 8.0 0.0 \n", - "9672 245846.0 63026.0 10.0 1.0 0.0 \n", - "9116 272178.0 66704.0 9.0 2.0 0.0 \n", - "10055 216485.0 131364.0 6.0 2.0 1.0 \n", - "\n", - " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n", - "6765 0.0 0.0 1.0 \n", - "13767 0.0 0.0 0.0 \n", - "9672 0.0 0.0 0.0 \n", - "9116 0.0 0.0 0.0 \n", - "10055 1.0 0.0 0.0 \n", - "\n", - " action_before_3_6.0_x action_before_3_1.0_y ... cate_action_4_mean \\\n", - "6765 18.0 12.0 ... 25.733333 \n", - "13767 7.0 0.0 ... 61.633333 \n", - "9672 0.0 47.0 ... 1.100000 \n", - "9116 4.0 112.0 ... 10.400000 \n", - "10055 6.0 28.0 ... 38.233333 \n", - "\n", - " cate_action_5_mean cate_action_6_mean has_bad_comment \\\n", - "6765 116.8 24942.666667 1.0 \n", - "13767 143.9 41669.233333 0.0 \n", - "9672 9.0 2361.600000 1.0 \n", - "9116 32.5 7264.200000 0.0 \n", - "10055 88.0 17558.033333 1.0 \n", - "\n", - " bad_comment_rate comment_num_0 comment_num_1 comment_num_2 \\\n", - "6765 0.0206 0.0 0.0 0.0 \n", - "13767 0.0000 0.0 0.0 1.0 \n", - "9672 0.0938 0.0 0.0 0.0 \n", - "9116 0.0000 0.0 0.0 0.0 \n", - "10055 0.0473 0.0 0.0 0.0 \n", - "\n", - " comment_num_3 comment_num_4 \n", - "6765 0.0 1.0 \n", - "13767 0.0 0.0 \n", - "9672 1.0 0.0 \n", - "9116 0.0 0.0 \n", - "10055 0.0 1.0 \n", - "\n", - "[5 rows x 236 columns]" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x_test.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "users = x_test[['user_id', 'sku_id', 'cate']].copy()\n", - "del x_test['user_id']\n", - "del x_test['sku_id']\n", - "x_test_DMatrix = xgb.DMatrix(x_test)\n", - "y_pred = bst.predict(x_test_DMatrix, ntree_limit=bst.best_ntree_limit)" - ] - }, - { - "cell_type": "code", - 
"execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
cateaction_before_3_1.0_xaction_before_3_2.0_xaction_before_3_3.0_xaction_before_3_4.0_xaction_before_3_5.0_xaction_before_3_6.0_xaction_before_3_1.0_yaction_before_3_2.0_yaction_before_3_3.0_y...cate_action_5_meancate_action_6_meanhas_bad_commentbad_comment_ratecomment_num_0comment_num_1comment_num_2comment_num_3comment_num_4pred_label
67658.08.00.00.00.01.018.012.00.00.0...116.824942.6666671.00.02060.00.00.00.01.00.453736
137674.08.00.00.00.00.07.00.00.00.0...143.941669.2333330.00.00000.00.01.00.00.00.002793
967210.01.00.00.00.00.00.047.00.00.0...9.02361.6000001.00.09380.00.00.01.00.00.000167
91169.02.00.00.00.00.04.0112.00.00.0...32.57264.2000000.00.00000.00.00.00.00.00.000225
100556.02.01.01.00.00.06.028.01.00.0...88.017558.0333331.00.04730.00.00.00.01.00.000507
\n", - "

5 rows × 235 columns

\n", - "
# %% [cell 33] Attach the model output to the feature frame.
# At this point 'pred_label' holds the raw scores returned by bst.predict
# (the displayed values are floats such as 0.453736), not hard 0/1 labels.
x_test['pred_label'] = y_pred
x_test.head()
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
cateaction_before_3_1.0_xaction_before_3_2.0_xaction_before_3_3.0_xaction_before_3_4.0_xaction_before_3_5.0_xaction_before_3_6.0_xaction_before_3_1.0_yaction_before_3_2.0_yaction_before_3_3.0_y...cate_action_5_meancate_action_6_meanhas_bad_commentbad_comment_ratecomment_num_0comment_num_1comment_num_2comment_num_3comment_num_4pred_label
67658.08.00.00.00.01.018.012.00.00.0...116.824942.6666671.00.02060.00.00.00.01.00.0
137674.08.00.00.00.00.07.00.00.00.0...143.941669.2333330.00.00000.00.01.00.00.00.0
967210.01.00.00.00.00.00.047.00.00.0...9.02361.6000001.00.09380.00.00.01.00.00.0
91169.02.00.00.00.00.04.0112.00.00.0...32.57264.2000000.00.00000.00.00.00.00.00.0
100556.02.01.01.00.00.06.028.01.00.0...88.017558.0333331.00.04730.00.00.00.01.00.0
\n", - "

5 rows × 235 columns

\n", - "
# %% [cell 34] Turn the predicted scores into hard 0/1 labels.
def label(column, threshold=0.5):
    """Return a copy of one row with ``pred_label`` replaced by a hard label.

    Parameters
    ----------
    column : pandas.Series (or any mapping that provides ``copy()``)
        A single row containing a ``'pred_label'`` entry with the model score.
    threshold : float, default 0.5
        Scores strictly greater than this become 1; everything else becomes 0
        (a NaN score never compares greater, so it also becomes 0).

    Returns
    -------
    A copy of ``column`` whose ``'pred_label'`` is 1 or 0. The object passed in
    is left untouched: ``DataFrame.apply`` does not support functions that
    mutate the row they are given.
    """
    labelled = column.copy()
    labelled['pred_label'] = 1 if labelled['pred_label'] > threshold else 0
    return labelled


# Vectorised equivalent of ``x_test.apply(label, axis=1)``: one comparison on
# the score column instead of a Python call per row. Only 'pred_label' is
# rewritten, so the other columns keep their dtypes. The cast to float keeps
# the 0.0 / 1.0 representation that the row-wise version produced.
x_test = x_test.assign(pred_label=(x_test['pred_label'] > 0.5).astype(float))
x_test.head()
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
cateaction_before_3_1.0_xaction_before_3_2.0_xaction_before_3_3.0_xaction_before_3_4.0_xaction_before_3_5.0_xaction_before_3_6.0_xaction_before_3_1.0_yaction_before_3_2.0_yaction_before_3_3.0_y...cate_action_6_meanhas_bad_commentbad_comment_ratecomment_num_0comment_num_1comment_num_2comment_num_3comment_num_4pred_labeltrue_label
67658.08.00.00.00.01.018.012.00.00.0...24942.6666671.00.02060.00.00.00.01.00.00.0
137674.08.00.00.00.00.07.00.00.00.0...41669.2333330.00.00000.00.01.00.00.00.00.0
967210.01.00.00.00.00.00.047.00.00.0...2361.6000001.00.09380.00.00.01.00.00.00.0
91169.02.00.00.00.00.04.0112.00.00.0...7264.2000000.00.00000.00.00.00.00.00.00.0
100556.02.01.01.00.00.06.028.01.00.0...17558.0333331.00.04730.00.00.00.01.00.00.0
\n", - "

5 rows × 236 columns

\n", - "
# %% [cell 35] Put the ground-truth labels next to the predicted ones.
# NOTE(review): if y_test is a pandas Series this assignment aligns on the
# index, so it presumably shares x_test's index (same split) — confirm where
# y_test is created.
x_test['true_label'] = y_test
x_test.head()
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
cateaction_before_3_1.0_xaction_before_3_2.0_xaction_before_3_3.0_xaction_before_3_4.0_xaction_before_3_5.0_xaction_before_3_6.0_xaction_before_3_1.0_yaction_before_3_2.0_yaction_before_3_3.0_y...bad_comment_ratecomment_num_0comment_num_1comment_num_2comment_num_3comment_num_4pred_labeltrue_labeluser_idsku_id
67658.08.00.00.00.01.018.012.00.00.0...0.02060.00.00.00.01.00.00.0235394.024371.0
137674.08.00.00.00.00.07.00.00.00.0...0.00000.00.01.00.00.00.00.0272948.0108907.0
967210.01.00.00.00.00.00.047.00.00.0...0.09380.00.00.01.00.00.00.0245846.063026.0
91169.02.00.00.00.00.04.0112.00.00.0...0.00000.00.00.00.00.00.00.0272178.066704.0
100556.02.01.01.00.00.06.028.01.00.0...0.04730.00.00.00.01.00.00.0216485.0131364.0
\n", - "

5 rows × 238 columns

\n", - "
# %% [cell 36] Re-attach the identifier columns that were set aside before prediction.
x_test['user_id'] = users['user_id']
x_test['sku_id'] = users['sku_id']
x_test.head()


# %% Helpers shared by the evaluation cells below.
def _count_hits(predicted, actual):
    """Count how many predicted entries do / do not occur in ``actual``.

    Every element of ``predicted`` is counted once per occurrence, so
    duplicates are not collapsed. ``actual`` is converted to a set so each
    membership test is O(1).

    Returns
    -------
    (pos, neg) : tuple of int
        ``pos`` predicted entries found in ``actual``, ``neg`` not found.
    """
    actual_lookup = set(actual)
    hits = sum(1 for entry in predicted if entry in actual_lookup)
    return hits, len(predicted) - hits


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 if the denominator is 0.

    Guards the precision / recall / F-score computations against
    ZeroDivisionError when nothing is predicted positive or the evaluation
    split contains no positive labels.
    """
    return 1.0 * numerator / denominator if denominator else 0.0


# %% [cell 37] Actual vs. predicted buyers and (user, item) pairs.
# Filter once and reuse the two row subsets below.
actual_rows = x_test[x_test['true_label'] == 1]
predicted_rows = x_test[x_test['pred_label'] == 1]

# All users who actually purchased
all_user_set = actual_rows['user_id'].unique()
print(len(all_user_set))
# All users predicted to purchase
all_user_test_set = predicted_rows['user_id'].unique()
print(len(all_user_test_set))
# Predicted "user-sku" pair keys, one per predicted-positive row
all_user_test_item_pair = predicted_rows['user_id'].map(str) + '-' + predicted_rows['sku_id'].map(str)
all_user_test_item_pair = np.array(all_user_test_item_pair)
print(len(all_user_test_item_pair))

# %% [cell 38] User-level precision and recall.
pos, neg = _count_hits(all_user_test_set, all_user_set)
all_user_acc = _safe_ratio(pos, pos + neg)
all_user_recall = _safe_ratio(pos, len(all_user_set))
print('所有用户中预测购买用户的准确率为 ' + str(all_user_acc))
print('所有用户中预测购买用户的召回率' + str(all_user_recall))

# %% [cell 40] Pair-level precision / recall and the combined score.
# NOTE(review): the saved execution counts jump from 38 to 40, so the stored
# outputs were not produced by a single top-to-bottom run — re-verify with
# Restart & Run All.
# All actual (user, item) purchase pairs
all_user_item_pair = actual_rows['user_id'].map(str) + '-' + actual_rows['sku_id'].map(str)
all_user_item_pair = np.array(all_user_item_pair)
print(len(all_user_item_pair))
pos, neg = _count_hits(all_user_test_item_pair, all_user_item_pair)
all_item_acc = _safe_ratio(pos, pos + neg)
all_item_recall = _safe_ratio(pos, len(all_user_item_pair))
print('所有用户中预测购买商品的准确率为 ' + str(all_item_acc))
print('所有用户中预测购买商品的召回率' + str(all_item_recall))
# NOTE(review): the 6/5 and 5/2/3 weights and the 0.4 / 0.6 blend presumably
# follow the competition's official scoring rule — confirm against the rules.
F11 = _safe_ratio(6.0 * all_user_recall * all_user_acc, 5.0 * all_user_recall + all_user_acc)
F12 = _safe_ratio(5.0 * all_item_acc * all_item_recall, 2.0 * all_item_recall + 3 * all_item_acc)
score = 0.4 * F11 + 0.6 * F12
print('F11=' + str(F11))
print('F12=' + str(F12))
print('score=' + str(score))