From 31c0dbe61c9a70df32bce18b4e416b62adb5b308 Mon Sep 17 00:00:00 2001
From: benjas <909336740@qq.com>
Date: Fri, 5 Feb 2021 13:40:02 +0800
Subject: [PATCH] =?UTF-8?q?Create=204-=E6=A8=A1=E5=9E=8B=E8=AE=AD=E7=BB=83?=
=?UTF-8?q?=E5=92=8C=E9=A2=84=E6=B5=8B.ipynb?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../4-模型训练和预测.ipynb | 2505 +++++++++++++++++
1 file changed, 2505 insertions(+)
create mode 100644 机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型训练和预测.ipynb
diff --git a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型训练和预测.ipynb b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型训练和预测.ipynb
new file mode 100644
index 0000000..491fa75
--- /dev/null
+++ b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型训练和预测.ipynb
@@ -0,0 +1,2505 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import xgboost as xgb\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "import operator\n",
+ "from matplotlib import pylab as plt\n",
+ "from datetime import datetime\n",
+ "import time\n",
+ "from sklearn.model_selection import GridSearchCV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user_id | \n",
+ " sku_id | \n",
+ " cate | \n",
+ " action_before_3_1.0_x | \n",
+ " action_before_3_2.0_x | \n",
+ " action_before_3_3.0_x | \n",
+ " action_before_3_4.0_x | \n",
+ " action_before_3_5.0_x | \n",
+ " action_before_3_6.0_x | \n",
+ " action_before_3_1.0_y | \n",
+ " ... | \n",
+ " cate_action_5_mean | \n",
+ " cate_action_6_mean | \n",
+ " has_bad_comment | \n",
+ " bad_comment_rate | \n",
+ " comment_num_0 | \n",
+ " comment_num_1 | \n",
+ " comment_num_2 | \n",
+ " comment_num_3 | \n",
+ " comment_num_4 | \n",
+ " label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 202633.0 | \n",
+ " 12564.0 | \n",
+ " 8.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " ... | \n",
+ " 20.866667 | \n",
+ " 5167.6 | \n",
+ " 1.0 | \n",
+ " 0.0260 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 218498.0 | \n",
+ " 149854.0 | \n",
+ " 8.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 2.0 | \n",
+ " ... | \n",
+ " 20.866667 | \n",
+ " 5167.6 | \n",
+ " 1.0 | \n",
+ " 0.0403 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 221842.0 | \n",
+ " 75877.0 | \n",
+ " 8.0 | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 79.0 | \n",
+ " ... | \n",
+ " 20.866667 | \n",
+ " 5167.6 | \n",
+ " 1.0 | \n",
+ " 0.0245 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 222886.0 | \n",
+ " 154636.0 | \n",
+ " 8.0 | \n",
+ " 20.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 26.0 | \n",
+ " 10.0 | \n",
+ " ... | \n",
+ " 20.866667 | \n",
+ " 5167.6 | \n",
+ " 1.0 | \n",
+ " 0.0208 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 235240.0 | \n",
+ " 38222.0 | \n",
+ " 8.0 | \n",
+ " 30.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 28.0 | \n",
+ " 55.0 | \n",
+ " ... | \n",
+ " 20.866667 | \n",
+ " 5167.6 | \n",
+ " 1.0 | \n",
+ " 0.0166 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 237 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " user_id sku_id cate action_before_3_1.0_x action_before_3_2.0_x \\\n",
+ "0 202633.0 12564.0 8.0 1.0 0.0 \n",
+ "1 218498.0 149854.0 8.0 4.0 0.0 \n",
+ "2 221842.0 75877.0 8.0 3.0 0.0 \n",
+ "3 222886.0 154636.0 8.0 20.0 1.0 \n",
+ "4 235240.0 38222.0 8.0 30.0 1.0 \n",
+ "\n",
+ " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n",
+ "0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 \n",
+ "\n",
+ " action_before_3_6.0_x action_before_3_1.0_y ... cate_action_5_mean \\\n",
+ "0 2.0 1.0 ... 20.866667 \n",
+ "1 4.0 2.0 ... 20.866667 \n",
+ "2 5.0 79.0 ... 20.866667 \n",
+ "3 26.0 10.0 ... 20.866667 \n",
+ "4 28.0 55.0 ... 20.866667 \n",
+ "\n",
+ " cate_action_6_mean has_bad_comment bad_comment_rate comment_num_0 \\\n",
+ "0 5167.6 1.0 0.0260 0.0 \n",
+ "1 5167.6 1.0 0.0403 0.0 \n",
+ "2 5167.6 1.0 0.0245 0.0 \n",
+ "3 5167.6 1.0 0.0208 0.0 \n",
+ "4 5167.6 1.0 0.0166 0.0 \n",
+ "\n",
+ " comment_num_1 comment_num_2 comment_num_3 comment_num_4 label \n",
+ "0 0.0 0.0 0.0 1.0 1.0 \n",
+ "1 0.0 0.0 0.0 1.0 1.0 \n",
+ "2 0.0 0.0 0.0 1.0 1.0 \n",
+ "3 0.0 0.0 0.0 1.0 1.0 \n",
+ "4 0.0 0.0 0.0 1.0 1.0 \n",
+ "\n",
+ "[5 rows x 237 columns]"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = pd.read_csv('data/train_set.csv') # feature table that also carries the 'label' column\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['user_id', 'sku_id', 'cate', 'action_before_3_1.0_x',\n",
+ " 'action_before_3_2.0_x', 'action_before_3_3.0_x',\n",
+ " 'action_before_3_4.0_x', 'action_before_3_5.0_x',\n",
+ " 'action_before_3_6.0_x', 'action_before_3_1.0_y',\n",
+ " ...\n",
+ " 'cate_action_5_mean', 'cate_action_6_mean', 'has_bad_comment',\n",
+ " 'bad_comment_rate', 'comment_num_0', 'comment_num_1', 'comment_num_2',\n",
+ " 'comment_num_3', 'comment_num_4', 'label'],\n",
+ " dtype='object', length=237)"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.columns # list the column names; 'label' is the last one"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user_id | \n",
+ " sku_id | \n",
+ " cate | \n",
+ " action_before_3_1.0_x | \n",
+ " action_before_3_2.0_x | \n",
+ " action_before_3_3.0_x | \n",
+ " action_before_3_4.0_x | \n",
+ " action_before_3_5.0_x | \n",
+ " action_before_3_6.0_x | \n",
+ " action_before_3_1.0_y | \n",
+ " ... | \n",
+ " cate_action_4_mean | \n",
+ " cate_action_5_mean | \n",
+ " cate_action_6_mean | \n",
+ " has_bad_comment | \n",
+ " bad_comment_rate | \n",
+ " comment_num_0 | \n",
+ " comment_num_1 | \n",
+ " comment_num_2 | \n",
+ " comment_num_3 | \n",
+ " comment_num_4 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 202633.0 | \n",
+ " 12564.0 | \n",
+ " 8.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " ... | \n",
+ " 8.4 | \n",
+ " 20.866667 | \n",
+ " 5167.6 | \n",
+ " 1.0 | \n",
+ " 0.0260 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 218498.0 | \n",
+ " 149854.0 | \n",
+ " 8.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 2.0 | \n",
+ " ... | \n",
+ " 8.4 | \n",
+ " 20.866667 | \n",
+ " 5167.6 | \n",
+ " 1.0 | \n",
+ " 0.0403 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 221842.0 | \n",
+ " 75877.0 | \n",
+ " 8.0 | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 79.0 | \n",
+ " ... | \n",
+ " 8.4 | \n",
+ " 20.866667 | \n",
+ " 5167.6 | \n",
+ " 1.0 | \n",
+ " 0.0245 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 222886.0 | \n",
+ " 154636.0 | \n",
+ " 8.0 | \n",
+ " 20.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 26.0 | \n",
+ " 10.0 | \n",
+ " ... | \n",
+ " 8.4 | \n",
+ " 20.866667 | \n",
+ " 5167.6 | \n",
+ " 1.0 | \n",
+ " 0.0208 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 235240.0 | \n",
+ " 38222.0 | \n",
+ " 8.0 | \n",
+ " 30.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 28.0 | \n",
+ " 55.0 | \n",
+ " ... | \n",
+ " 8.4 | \n",
+ " 20.866667 | \n",
+ " 5167.6 | \n",
+ " 1.0 | \n",
+ " 0.0166 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 236 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " user_id sku_id cate action_before_3_1.0_x action_before_3_2.0_x \\\n",
+ "0 202633.0 12564.0 8.0 1.0 0.0 \n",
+ "1 218498.0 149854.0 8.0 4.0 0.0 \n",
+ "2 221842.0 75877.0 8.0 3.0 0.0 \n",
+ "3 222886.0 154636.0 8.0 20.0 1.0 \n",
+ "4 235240.0 38222.0 8.0 30.0 1.0 \n",
+ "\n",
+ " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n",
+ "0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 \n",
+ "\n",
+ " action_before_3_6.0_x action_before_3_1.0_y ... cate_action_4_mean \\\n",
+ "0 2.0 1.0 ... 8.4 \n",
+ "1 4.0 2.0 ... 8.4 \n",
+ "2 5.0 79.0 ... 8.4 \n",
+ "3 26.0 10.0 ... 8.4 \n",
+ "4 28.0 55.0 ... 8.4 \n",
+ "\n",
+ " cate_action_5_mean cate_action_6_mean has_bad_comment bad_comment_rate \\\n",
+ "0 20.866667 5167.6 1.0 0.0260 \n",
+ "1 20.866667 5167.6 1.0 0.0403 \n",
+ "2 20.866667 5167.6 1.0 0.0245 \n",
+ "3 20.866667 5167.6 1.0 0.0208 \n",
+ "4 20.866667 5167.6 1.0 0.0166 \n",
+ "\n",
+ " comment_num_0 comment_num_1 comment_num_2 comment_num_3 comment_num_4 \n",
+ "0 0.0 0.0 0.0 0.0 1.0 \n",
+ "1 0.0 0.0 0.0 0.0 1.0 \n",
+ "2 0.0 0.0 0.0 0.0 1.0 \n",
+ "3 0.0 0.0 0.0 0.0 1.0 \n",
+ "4 0.0 0.0 0.0 0.0 1.0 \n",
+ "\n",
+ "[5 rows x 236 columns]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data_x = data.loc[:,data.columns != 'label'] # features: every column except 'label'\n",
+ "data_y = data.loc[:,data.columns == 'label'] # target: the 'label' column, kept as a one-column DataFrame\n",
+ "data_x.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " label\n",
+ "0 1.0\n",
+ "1 1.0\n",
+ "2 1.0\n",
+ "3 1.0\n",
+ "4 1.0"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data_y.head() # preview the target frame"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(11695, 236)\n",
+ "(2924, 236)\n"
+ ]
+ }
+ ],
+ "source": [
+ "x_train, x_test, y_train, y_test = train_test_split(data_x,data_y,test_size = 0.2, random_state = 0) # split into train and held-out parts, 80:20, with a fixed seed\n",
+ "print(x_train.shape)\n",
+ "print(x_test.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The held-out part above has close to 3000 rows; cut it in two: the first 1500 rows become the validation set used during training, the rest stay as the test set for prediction\n",
+ "x_val = x_test.iloc[:1500,:]\n",
+ "y_val = y_test.iloc[:1500,:]\n",
+ "\n",
+ "x_test = x_test.iloc[1500:,:] \n",
+ "y_test = y_test.iloc[1500:,:]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(1500, 236)\n",
+ "(1424, 236)\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(x_val.shape) # validation rows\n",
+ "print(x_test.shape) # remaining test rows"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " cate | \n",
+ " action_before_3_1.0_x | \n",
+ " action_before_3_2.0_x | \n",
+ " action_before_3_3.0_x | \n",
+ " action_before_3_4.0_x | \n",
+ " action_before_3_5.0_x | \n",
+ " action_before_3_6.0_x | \n",
+ " action_before_3_1.0_y | \n",
+ " action_before_3_2.0_y | \n",
+ " action_before_3_3.0_y | \n",
+ " ... | \n",
+ " cate_action_4_mean | \n",
+ " cate_action_5_mean | \n",
+ " cate_action_6_mean | \n",
+ " has_bad_comment | \n",
+ " bad_comment_rate | \n",
+ " comment_num_0 | \n",
+ " comment_num_1 | \n",
+ " comment_num_2 | \n",
+ " comment_num_3 | \n",
+ " comment_num_4 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2157 | \n",
+ " 4.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 48.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 25.300000 | \n",
+ " 52.133333 | \n",
+ " 16112.033333 | \n",
+ " 1.0 | \n",
+ " 0.0344 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 2464 | \n",
+ " 10.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 7.0 | \n",
+ " 288.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.433333 | \n",
+ " 5.066667 | \n",
+ " 1273.500000 | \n",
+ " 1.0 | \n",
+ " 0.0132 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 10326 | \n",
+ " 8.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 42.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " ... | \n",
+ " 35.233333 | \n",
+ " 149.266667 | \n",
+ " 32299.233333 | \n",
+ " 1.0 | \n",
+ " 0.0213 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 7025 | \n",
+ " 8.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 36.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 25.733333 | \n",
+ " 116.800000 | \n",
+ " 24942.666667 | \n",
+ " 0.0 | \n",
+ " 0.0000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 6625 | \n",
+ " 7.0 | \n",
+ " 13.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 22.0 | \n",
+ " 92.0 | \n",
+ " 5.0 | \n",
+ " 1.0 | \n",
+ " ... | \n",
+ " 17.000000 | \n",
+ " 36.166667 | \n",
+ " 9447.266667 | \n",
+ " 1.0 | \n",
+ " 0.0800 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 234 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " cate action_before_3_1.0_x action_before_3_2.0_x \\\n",
+ "2157 4.0 2.0 0.0 \n",
+ "2464 10.0 2.0 0.0 \n",
+ "10326 8.0 2.0 1.0 \n",
+ "7025 8.0 2.0 0.0 \n",
+ "6625 7.0 13.0 0.0 \n",
+ "\n",
+ " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n",
+ "2157 0.0 0.0 0.0 \n",
+ "2464 0.0 0.0 0.0 \n",
+ "10326 1.0 0.0 0.0 \n",
+ "7025 0.0 0.0 0.0 \n",
+ "6625 0.0 0.0 0.0 \n",
+ "\n",
+ " action_before_3_6.0_x action_before_3_1.0_y action_before_3_2.0_y \\\n",
+ "2157 4.0 48.0 0.0 \n",
+ "2464 7.0 288.0 0.0 \n",
+ "10326 3.0 42.0 2.0 \n",
+ "7025 2.0 36.0 1.0 \n",
+ "6625 22.0 92.0 5.0 \n",
+ "\n",
+ " action_before_3_3.0_y ... cate_action_4_mean cate_action_5_mean \\\n",
+ "2157 0.0 ... 25.300000 52.133333 \n",
+ "2464 0.0 ... 0.433333 5.066667 \n",
+ "10326 1.0 ... 35.233333 149.266667 \n",
+ "7025 0.0 ... 25.733333 116.800000 \n",
+ "6625 1.0 ... 17.000000 36.166667 \n",
+ "\n",
+ " cate_action_6_mean has_bad_comment bad_comment_rate comment_num_0 \\\n",
+ "2157 16112.033333 1.0 0.0344 0.0 \n",
+ "2464 1273.500000 1.0 0.0132 0.0 \n",
+ "10326 32299.233333 1.0 0.0213 0.0 \n",
+ "7025 24942.666667 0.0 0.0000 0.0 \n",
+ "6625 9447.266667 1.0 0.0800 0.0 \n",
+ "\n",
+ " comment_num_1 comment_num_2 comment_num_3 comment_num_4 \n",
+ "2157 0.0 0.0 0.0 1.0 \n",
+ "2464 0.0 0.0 0.0 1.0 \n",
+ "10326 0.0 0.0 0.0 1.0 \n",
+ "7025 0.0 1.0 0.0 0.0 \n",
+ "6625 0.0 0.0 0.0 1.0 \n",
+ "\n",
+ "[5 rows x 234 columns]"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "del x_train['user_id'] # identifier columns are removed before the DMatrix is built, so they are not model features\n",
+ "del x_train['sku_id']\n",
+ "\n",
+ "del x_val['user_id'] # same treatment for the validation frame\n",
+ "del x_val['sku_id']\n",
+ "\n",
+ "x_train.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dtrain = xgb.DMatrix(x_train, label=y_train) # training features with their labels\n",
+ "dvalid = xgb.DMatrix(x_val, label=y_val) # validation features with their labels, evaluated during training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "param = {'n_estimators': 4000, 'max_depth': 3, 'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0, # 'n_estimators' is read back by the training cell as the number of boosting rounds\n",
+ " 'colsample_bytree': 0.8, 'scale_pos_weight':10, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic',\n",
+ " 'eval_metric':'auc'} # AUC is the metric reported for the train and eval sets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[0]\ttrain-auc:0.938547\teval-auc:0.934522\n",
+ "Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.\n",
+ "\n",
+ "Will train until eval-auc hasn't improved in 10 rounds.\n",
+ "[1]\ttrain-auc:0.947568\teval-auc:0.944769\n",
+ "[2]\ttrain-auc:0.952758\teval-auc:0.949358\n",
+ "[3]\ttrain-auc:0.955704\teval-auc:0.952481\n",
+ "[4]\ttrain-auc:0.95525\teval-auc:0.952343\n",
+ "[5]\ttrain-auc:0.957462\teval-auc:0.95475\n",
+ "[6]\ttrain-auc:0.957636\teval-auc:0.955133\n",
+ "[7]\ttrain-auc:0.958327\teval-auc:0.955077\n",
+ "[8]\ttrain-auc:0.958339\teval-auc:0.95549\n",
+ "[9]\ttrain-auc:0.958235\teval-auc:0.955479\n",
+ "[10]\ttrain-auc:0.958922\teval-auc:0.955668\n",
+ "[11]\ttrain-auc:0.959378\teval-auc:0.956423\n",
+ "[12]\ttrain-auc:0.959666\teval-auc:0.956724\n",
+ "[13]\ttrain-auc:0.959674\teval-auc:0.957192\n",
+ "[14]\ttrain-auc:0.960227\teval-auc:0.957447\n",
+ "[15]\ttrain-auc:0.960316\teval-auc:0.957583\n",
+ "[16]\ttrain-auc:0.961338\teval-auc:0.959102\n",
+ "[17]\ttrain-auc:0.961986\teval-auc:0.958905\n",
+ "[18]\ttrain-auc:0.962297\teval-auc:0.959365\n",
+ "[19]\ttrain-auc:0.962798\teval-auc:0.959863\n",
+ "[20]\ttrain-auc:0.963266\teval-auc:0.960243\n",
+ "[21]\ttrain-auc:0.963661\teval-auc:0.96069\n",
+ "[22]\ttrain-auc:0.964377\teval-auc:0.961733\n",
+ "[23]\ttrain-auc:0.964741\teval-auc:0.962419\n",
+ "[24]\ttrain-auc:0.964886\teval-auc:0.962837\n",
+ "[25]\ttrain-auc:0.965193\teval-auc:0.963015\n",
+ "[26]\ttrain-auc:0.965453\teval-auc:0.963356\n",
+ "[27]\ttrain-auc:0.965766\teval-auc:0.963154\n",
+ "[28]\ttrain-auc:0.965954\teval-auc:0.963015\n",
+ "[29]\ttrain-auc:0.966586\teval-auc:0.963877\n",
+ "[30]\ttrain-auc:0.966813\teval-auc:0.963994\n",
+ "[31]\ttrain-auc:0.967003\teval-auc:0.96425\n",
+ "[32]\ttrain-auc:0.967469\teval-auc:0.965021\n",
+ "[33]\ttrain-auc:0.967952\teval-auc:0.964936\n",
+ "[34]\ttrain-auc:0.968365\teval-auc:0.965487\n",
+ "[35]\ttrain-auc:0.968576\teval-auc:0.965891\n",
+ "[36]\ttrain-auc:0.968961\teval-auc:0.966418\n",
+ "[37]\ttrain-auc:0.969475\teval-auc:0.966474\n",
+ "[38]\ttrain-auc:0.96979\teval-auc:0.966923\n",
+ "[39]\ttrain-auc:0.970028\teval-auc:0.967189\n",
+ "[40]\ttrain-auc:0.970177\teval-auc:0.967397\n",
+ "[41]\ttrain-auc:0.970596\teval-auc:0.967048\n",
+ "[42]\ttrain-auc:0.970871\teval-auc:0.967607\n",
+ "[43]\ttrain-auc:0.971206\teval-auc:0.968102\n",
+ "[44]\ttrain-auc:0.971298\teval-auc:0.968176\n",
+ "[45]\ttrain-auc:0.971754\teval-auc:0.968493\n",
+ "[46]\ttrain-auc:0.971813\teval-auc:0.968546\n",
+ "[47]\ttrain-auc:0.972115\teval-auc:0.968902\n",
+ "[48]\ttrain-auc:0.972266\teval-auc:0.968961\n",
+ "[49]\ttrain-auc:0.972328\teval-auc:0.969046\n",
+ "[50]\ttrain-auc:0.972632\teval-auc:0.968804\n",
+ "[51]\ttrain-auc:0.973076\teval-auc:0.968977\n",
+ "[52]\ttrain-auc:0.973468\teval-auc:0.969573\n",
+ "[53]\ttrain-auc:0.973681\teval-auc:0.969738\n",
+ "[54]\ttrain-auc:0.973826\teval-auc:0.970062\n",
+ "[55]\ttrain-auc:0.974159\teval-auc:0.970142\n",
+ "[56]\ttrain-auc:0.974259\teval-auc:0.970254\n",
+ "[57]\ttrain-auc:0.974533\teval-auc:0.970278\n",
+ "[58]\ttrain-auc:0.974716\teval-auc:0.970485\n",
+ "[59]\ttrain-auc:0.974808\teval-auc:0.970538\n",
+ "[60]\ttrain-auc:0.975131\teval-auc:0.9709\n",
+ "[61]\ttrain-auc:0.975251\teval-auc:0.97106\n",
+ "[62]\ttrain-auc:0.97532\teval-auc:0.971113\n",
+ "[63]\ttrain-auc:0.975468\teval-auc:0.971262\n",
+ "[64]\ttrain-auc:0.975523\teval-auc:0.971342\n",
+ "[65]\ttrain-auc:0.975621\teval-auc:0.971342\n",
+ "[66]\ttrain-auc:0.975726\teval-auc:0.97132\n",
+ "[67]\ttrain-auc:0.975945\teval-auc:0.971496\n",
+ "[68]\ttrain-auc:0.976067\teval-auc:0.971703\n",
+ "[69]\ttrain-auc:0.976234\teval-auc:0.971991\n",
+ "[70]\ttrain-auc:0.976296\teval-auc:0.972044\n",
+ "[71]\ttrain-auc:0.976386\teval-auc:0.972129\n",
+ "[72]\ttrain-auc:0.976636\teval-auc:0.972087\n",
+ "[73]\ttrain-auc:0.976809\teval-auc:0.972156\n",
+ "[74]\ttrain-auc:0.97688\teval-auc:0.972251\n",
+ "[75]\ttrain-auc:0.977256\teval-auc:0.972459\n",
+ "[76]\ttrain-auc:0.977306\teval-auc:0.972507\n",
+ "[77]\ttrain-auc:0.977407\teval-auc:0.972603\n",
+ "[78]\ttrain-auc:0.977514\teval-auc:0.972656\n",
+ "[79]\ttrain-auc:0.977588\teval-auc:0.972757\n",
+ "[80]\ttrain-auc:0.977843\teval-auc:0.972972\n",
+ "[81]\ttrain-auc:0.977938\teval-auc:0.973036\n",
+ "[82]\ttrain-auc:0.978056\teval-auc:0.972962\n",
+ "[83]\ttrain-auc:0.97829\teval-auc:0.973058\n",
+ "[84]\ttrain-auc:0.978366\teval-auc:0.973132\n",
+ "[85]\ttrain-auc:0.97844\teval-auc:0.973132\n",
+ "[86]\ttrain-auc:0.978461\teval-auc:0.973143\n",
+ "[87]\ttrain-auc:0.97852\teval-auc:0.973207\n",
+ "[88]\ttrain-auc:0.978731\teval-auc:0.973457\n",
+ "[89]\ttrain-auc:0.978776\teval-auc:0.973499\n",
+ "[90]\ttrain-auc:0.978881\teval-auc:0.973446\n",
+ "[91]\ttrain-auc:0.979052\teval-auc:0.973494\n",
+ "[92]\ttrain-auc:0.979078\teval-auc:0.973499\n",
+ "[93]\ttrain-auc:0.979186\teval-auc:0.973637\n",
+ "[94]\ttrain-auc:0.9793\teval-auc:0.973712\n",
+ "[95]\ttrain-auc:0.979578\teval-auc:0.973733\n",
+ "[96]\ttrain-auc:0.979638\teval-auc:0.973797\n",
+ "[97]\ttrain-auc:0.979718\teval-auc:0.974021\n",
+ "[98]\ttrain-auc:0.979887\teval-auc:0.973978\n",
+ "[99]\ttrain-auc:0.9799\teval-auc:0.973957\n",
+ "[100]\ttrain-auc:0.979966\teval-auc:0.974106\n",
+ "[101]\ttrain-auc:0.980003\teval-auc:0.974159\n",
+ "[102]\ttrain-auc:0.98012\teval-auc:0.973994\n",
+ "[103]\ttrain-auc:0.980258\teval-auc:0.973962\n",
+ "[104]\ttrain-auc:0.980323\teval-auc:0.973903\n",
+ "[105]\ttrain-auc:0.980386\teval-auc:0.973999\n",
+ "[106]\ttrain-auc:0.980468\teval-auc:0.973946\n",
+ "[107]\ttrain-auc:0.980523\teval-auc:0.974058\n",
+ "[108]\ttrain-auc:0.980577\teval-auc:0.974116\n",
+ "[109]\ttrain-auc:0.98073\teval-auc:0.974239\n",
+ "[110]\ttrain-auc:0.98088\teval-auc:0.974244\n",
+ "[111]\ttrain-auc:0.980953\teval-auc:0.974377\n",
+ "[112]\ttrain-auc:0.981079\teval-auc:0.974409\n",
+ "[113]\ttrain-auc:0.981224\teval-auc:0.974499\n",
+ "[114]\ttrain-auc:0.981241\teval-auc:0.974515\n",
+ "[115]\ttrain-auc:0.981318\teval-auc:0.97434\n",
+ "[116]\ttrain-auc:0.981389\teval-auc:0.97451\n",
+ "[117]\ttrain-auc:0.981489\teval-auc:0.974537\n",
+ "[118]\ttrain-auc:0.981613\teval-auc:0.974654\n",
+ "[119]\ttrain-auc:0.981645\teval-auc:0.974765\n",
+ "[120]\ttrain-auc:0.981738\teval-auc:0.974739\n",
+ "[121]\ttrain-auc:0.98188\teval-auc:0.974707\n",
+ "[122]\ttrain-auc:0.98195\teval-auc:0.974643\n",
+ "[123]\ttrain-auc:0.982098\teval-auc:0.974659\n",
+ "[124]\ttrain-auc:0.982177\teval-auc:0.974723\n",
+ "[125]\ttrain-auc:0.982389\teval-auc:0.974941\n",
+ "[126]\ttrain-auc:0.982517\teval-auc:0.97509\n",
+ "[127]\ttrain-auc:0.982527\teval-auc:0.975132\n",
+ "[128]\ttrain-auc:0.982643\teval-auc:0.97517\n",
+ "[129]\ttrain-auc:0.982795\teval-auc:0.97509\n",
+ "[130]\ttrain-auc:0.982866\teval-auc:0.975122\n",
+ "[131]\ttrain-auc:0.98296\teval-auc:0.975186\n",
+ "[132]\ttrain-auc:0.983059\teval-auc:0.975223\n",
+ "[133]\ttrain-auc:0.983209\teval-auc:0.975143\n",
+ "[134]\ttrain-auc:0.983343\teval-auc:0.975239\n",
+ "[135]\ttrain-auc:0.983497\teval-auc:0.975266\n",
+ "[136]\ttrain-auc:0.983545\teval-auc:0.975228\n",
+ "[137]\ttrain-auc:0.98368\teval-auc:0.975196\n",
+ "[138]\ttrain-auc:0.983674\teval-auc:0.975244\n",
+ "[139]\ttrain-auc:0.983737\teval-auc:0.975223\n",
+ "[140]\ttrain-auc:0.983804\teval-auc:0.97518\n",
+ "[141]\ttrain-auc:0.983939\teval-auc:0.975143\n",
+ "[142]\ttrain-auc:0.983985\teval-auc:0.975159\n",
+ "[143]\ttrain-auc:0.984077\teval-auc:0.975095\n",
+ "[144]\ttrain-auc:0.984248\teval-auc:0.975074\n",
+ "[145]\ttrain-auc:0.984285\teval-auc:0.975042\n",
+ "Stopping. Best iteration:\n",
+ "[135]\ttrain-auc:0.983497\teval-auc:0.975266\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "num_round = param['n_estimators'] # upper bound on boosting rounds\n",
+ "\n",
+ "plst = list(param.items()) # materialise the (name, value) pairs; dict.items() is only a view on Python 3\n",
+ "evallist = [(dtrain, 'train'), (dvalid, 'eval')] # the last entry ('eval') is the one early stopping watches\n",
+ "best = xgb.train(plst, dtrain, num_round, evallist, early_stopping_rounds=10) # train; stop once eval-auc has not improved for 10 rounds\n",
+ "best.save_model('bst.model')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'best_iteration': '135',\n",
+ " 'best_msg': '[135]\\ttrain-auc:0.983497\\teval-auc:0.975266',\n",
+ " 'best_score': '0.975266'}"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "best.attributes() # early-stopping metadata: best_iteration, best_msg, best_score"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def create_feature_map(features):\n",
+ "    \"\"\"Write the feature map file 'xgb.fmap' in the working directory.\n",
+ "\n",
+ "    Emits one tab-separated line per feature: its position in `features`, its name, and 'q'.\"\"\"\n",
+ "    with open(r'xgb.fmap', 'w') as outfile:  # closes the file even if a write raises\n",
+ "        for i, feat in enumerate(features):\n",
+ "            outfile.write('{0}\\t{1}\\tq\\n'.format(i, feat))\n",
+ "\n",
+ "\n",
+ "features = list(x_train.columns[:]) # column order defines the indices written to the feature map\n",
+ "create_feature_map(features)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def feature_importance(best_xgb):\n",
+ "    \"\"\"Rank the scores from best_xgb.get_fscore, normalise them to sum to 1, and save them as CSV.\"\"\"\n",
+ "    ranked = sorted(best_xgb.get_fscore(fmap=r'xgb.fmap').items(),\n",
+ "                    key=operator.itemgetter(1), reverse=True)\n",
+ "    df = pd.DataFrame(ranked, columns=['feature', 'fscore'])\n",
+ "    df['fscore'] = df['fscore'] / df['fscore'].sum()\n",
+ "    # str(date)[5:] drops the 'YYYY-' prefix, so the file name is stamped with today's MM-DD.\n",
+ "    df.to_csv('data/feature_importance_' + str(datetime.now().date())[5:] + '.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "feature_importance(best) # writes data/feature_importance_MM-DD.csv, stamped with today's date"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " feature | \n",
+ " fscore | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " brand | \n",
+ " 0.077852 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " action_before_10_5.0_x | \n",
+ " 0.041611 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " bad_comment_rate | \n",
+ " 0.038926 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " product_action_5_ratio | \n",
+ " 0.028188 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4 | \n",
+ " user_lv_cd_2 | \n",
+ " 0.025503 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unnamed: 0 feature fscore\n",
+ "0 0 brand 0.077852\n",
+ "1 1 action_before_10_5.0_x 0.041611\n",
+ "2 2 bad_comment_rate 0.038926\n",
+ "3 3 product_action_5_ratio 0.028188\n",
+ "4 4 user_lv_cd_2 0.025503"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "fi = pd.read_csv('data/feature_importance_02-05.csv') # NOTE: '02-05' is the MM-DD stamp chosen by feature_importance(); adjust it when re-running on another day\n",
+ "fi.sort_values(\"fscore\", inplace=True, ascending=False) # highest normalised score first\n",
+ "fi.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user_id | \n",
+ " sku_id | \n",
+ " cate | \n",
+ " action_before_3_1.0_x | \n",
+ " action_before_3_2.0_x | \n",
+ " action_before_3_3.0_x | \n",
+ " action_before_3_4.0_x | \n",
+ " action_before_3_5.0_x | \n",
+ " action_before_3_6.0_x | \n",
+ " action_before_3_1.0_y | \n",
+ " ... | \n",
+ " cate_action_4_mean | \n",
+ " cate_action_5_mean | \n",
+ " cate_action_6_mean | \n",
+ " has_bad_comment | \n",
+ " bad_comment_rate | \n",
+ " comment_num_0 | \n",
+ " comment_num_1 | \n",
+ " comment_num_2 | \n",
+ " comment_num_3 | \n",
+ " comment_num_4 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 6765 | \n",
+ " 235394.0 | \n",
+ " 24371.0 | \n",
+ " 8.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 18.0 | \n",
+ " 12.0 | \n",
+ " ... | \n",
+ " 25.733333 | \n",
+ " 116.8 | \n",
+ " 24942.666667 | \n",
+ " 1.0 | \n",
+ " 0.0206 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 13767 | \n",
+ " 272948.0 | \n",
+ " 108907.0 | \n",
+ " 4.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 7.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 61.633333 | \n",
+ " 143.9 | \n",
+ " 41669.233333 | \n",
+ " 0.0 | \n",
+ " 0.0000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 9672 | \n",
+ " 245846.0 | \n",
+ " 63026.0 | \n",
+ " 10.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 47.0 | \n",
+ " ... | \n",
+ " 1.100000 | \n",
+ " 9.0 | \n",
+ " 2361.600000 | \n",
+ " 1.0 | \n",
+ " 0.0938 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 9116 | \n",
+ " 272178.0 | \n",
+ " 66704.0 | \n",
+ " 9.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 112.0 | \n",
+ " ... | \n",
+ " 10.400000 | \n",
+ " 32.5 | \n",
+ " 7264.200000 | \n",
+ " 0.0 | \n",
+ " 0.0000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 10055 | \n",
+ " 216485.0 | \n",
+ " 131364.0 | \n",
+ " 6.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 6.0 | \n",
+ " 28.0 | \n",
+ " ... | \n",
+ " 38.233333 | \n",
+ " 88.0 | \n",
+ " 17558.033333 | \n",
+ " 1.0 | \n",
+ " 0.0473 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 236 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " user_id sku_id cate action_before_3_1.0_x action_before_3_2.0_x \\\n",
+ "6765 235394.0 24371.0 8.0 8.0 0.0 \n",
+ "13767 272948.0 108907.0 4.0 8.0 0.0 \n",
+ "9672 245846.0 63026.0 10.0 1.0 0.0 \n",
+ "9116 272178.0 66704.0 9.0 2.0 0.0 \n",
+ "10055 216485.0 131364.0 6.0 2.0 1.0 \n",
+ "\n",
+ " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n",
+ "6765 0.0 0.0 1.0 \n",
+ "13767 0.0 0.0 0.0 \n",
+ "9672 0.0 0.0 0.0 \n",
+ "9116 0.0 0.0 0.0 \n",
+ "10055 1.0 0.0 0.0 \n",
+ "\n",
+ " action_before_3_6.0_x action_before_3_1.0_y ... cate_action_4_mean \\\n",
+ "6765 18.0 12.0 ... 25.733333 \n",
+ "13767 7.0 0.0 ... 61.633333 \n",
+ "9672 0.0 47.0 ... 1.100000 \n",
+ "9116 4.0 112.0 ... 10.400000 \n",
+ "10055 6.0 28.0 ... 38.233333 \n",
+ "\n",
+ " cate_action_5_mean cate_action_6_mean has_bad_comment \\\n",
+ "6765 116.8 24942.666667 1.0 \n",
+ "13767 143.9 41669.233333 0.0 \n",
+ "9672 9.0 2361.600000 1.0 \n",
+ "9116 32.5 7264.200000 0.0 \n",
+ "10055 88.0 17558.033333 1.0 \n",
+ "\n",
+ " bad_comment_rate comment_num_0 comment_num_1 comment_num_2 \\\n",
+ "6765 0.0206 0.0 0.0 0.0 \n",
+ "13767 0.0000 0.0 0.0 1.0 \n",
+ "9672 0.0938 0.0 0.0 0.0 \n",
+ "9116 0.0000 0.0 0.0 0.0 \n",
+ "10055 0.0473 0.0 0.0 0.0 \n",
+ "\n",
+ " comment_num_3 comment_num_4 \n",
+ "6765 0.0 1.0 \n",
+ "13767 0.0 0.0 \n",
+ "9672 1.0 0.0 \n",
+ "9116 0.0 0.0 \n",
+ "10055 0.0 1.0 \n",
+ "\n",
+ "[5 rows x 236 columns]"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "x_test.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "users = x_test[['user_id', 'sku_id', 'cate']].copy()\n",
+ "del x_test['user_id']\n",
+ "del x_test['sku_id']\n",
+ "x_test_DMatrix = xgb.DMatrix(x_test)\n",
+ "y_pred = bst.predict(x_test_DMatrix, ntree_limit=bst.best_ntree_limit)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " cate | \n",
+ " action_before_3_1.0_x | \n",
+ " action_before_3_2.0_x | \n",
+ " action_before_3_3.0_x | \n",
+ " action_before_3_4.0_x | \n",
+ " action_before_3_5.0_x | \n",
+ " action_before_3_6.0_x | \n",
+ " action_before_3_1.0_y | \n",
+ " action_before_3_2.0_y | \n",
+ " action_before_3_3.0_y | \n",
+ " ... | \n",
+ " cate_action_5_mean | \n",
+ " cate_action_6_mean | \n",
+ " has_bad_comment | \n",
+ " bad_comment_rate | \n",
+ " comment_num_0 | \n",
+ " comment_num_1 | \n",
+ " comment_num_2 | \n",
+ " comment_num_3 | \n",
+ " comment_num_4 | \n",
+ " pred_label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 6765 | \n",
+ " 8.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 18.0 | \n",
+ " 12.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 116.8 | \n",
+ " 24942.666667 | \n",
+ " 1.0 | \n",
+ " 0.0206 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.453736 | \n",
+ "
\n",
+ " \n",
+ " 13767 | \n",
+ " 4.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 7.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 143.9 | \n",
+ " 41669.233333 | \n",
+ " 0.0 | \n",
+ " 0.0000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.002793 | \n",
+ "
\n",
+ " \n",
+ " 9672 | \n",
+ " 10.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 47.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 9.0 | \n",
+ " 2361.600000 | \n",
+ " 1.0 | \n",
+ " 0.0938 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.000167 | \n",
+ "
\n",
+ " \n",
+ " 9116 | \n",
+ " 9.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 112.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 32.5 | \n",
+ " 7264.200000 | \n",
+ " 0.0 | \n",
+ " 0.0000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.000225 | \n",
+ "
\n",
+ " \n",
+ " 10055 | \n",
+ " 6.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 6.0 | \n",
+ " 28.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 88.0 | \n",
+ " 17558.033333 | \n",
+ " 1.0 | \n",
+ " 0.0473 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.000507 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 235 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " cate action_before_3_1.0_x action_before_3_2.0_x \\\n",
+ "6765 8.0 8.0 0.0 \n",
+ "13767 4.0 8.0 0.0 \n",
+ "9672 10.0 1.0 0.0 \n",
+ "9116 9.0 2.0 0.0 \n",
+ "10055 6.0 2.0 1.0 \n",
+ "\n",
+ " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n",
+ "6765 0.0 0.0 1.0 \n",
+ "13767 0.0 0.0 0.0 \n",
+ "9672 0.0 0.0 0.0 \n",
+ "9116 0.0 0.0 0.0 \n",
+ "10055 1.0 0.0 0.0 \n",
+ "\n",
+ " action_before_3_6.0_x action_before_3_1.0_y action_before_3_2.0_y \\\n",
+ "6765 18.0 12.0 0.0 \n",
+ "13767 7.0 0.0 0.0 \n",
+ "9672 0.0 47.0 0.0 \n",
+ "9116 4.0 112.0 0.0 \n",
+ "10055 6.0 28.0 1.0 \n",
+ "\n",
+ " action_before_3_3.0_y ... cate_action_5_mean cate_action_6_mean \\\n",
+ "6765 0.0 ... 116.8 24942.666667 \n",
+ "13767 0.0 ... 143.9 41669.233333 \n",
+ "9672 0.0 ... 9.0 2361.600000 \n",
+ "9116 0.0 ... 32.5 7264.200000 \n",
+ "10055 0.0 ... 88.0 17558.033333 \n",
+ "\n",
+ " has_bad_comment bad_comment_rate comment_num_0 comment_num_1 \\\n",
+ "6765 1.0 0.0206 0.0 0.0 \n",
+ "13767 0.0 0.0000 0.0 0.0 \n",
+ "9672 1.0 0.0938 0.0 0.0 \n",
+ "9116 0.0 0.0000 0.0 0.0 \n",
+ "10055 1.0 0.0473 0.0 0.0 \n",
+ "\n",
+ " comment_num_2 comment_num_3 comment_num_4 pred_label \n",
+ "6765 0.0 0.0 1.0 0.453736 \n",
+ "13767 1.0 0.0 0.0 0.002793 \n",
+ "9672 0.0 1.0 0.0 0.000167 \n",
+ "9116 0.0 0.0 0.0 0.000225 \n",
+ "10055 0.0 0.0 1.0 0.000507 \n",
+ "\n",
+ "[5 rows x 235 columns]"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "x_test['pred_label'] = y_pred\n",
+ "x_test.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " cate | \n",
+ " action_before_3_1.0_x | \n",
+ " action_before_3_2.0_x | \n",
+ " action_before_3_3.0_x | \n",
+ " action_before_3_4.0_x | \n",
+ " action_before_3_5.0_x | \n",
+ " action_before_3_6.0_x | \n",
+ " action_before_3_1.0_y | \n",
+ " action_before_3_2.0_y | \n",
+ " action_before_3_3.0_y | \n",
+ " ... | \n",
+ " cate_action_5_mean | \n",
+ " cate_action_6_mean | \n",
+ " has_bad_comment | \n",
+ " bad_comment_rate | \n",
+ " comment_num_0 | \n",
+ " comment_num_1 | \n",
+ " comment_num_2 | \n",
+ " comment_num_3 | \n",
+ " comment_num_4 | \n",
+ " pred_label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 6765 | \n",
+ " 8.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 18.0 | \n",
+ " 12.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 116.8 | \n",
+ " 24942.666667 | \n",
+ " 1.0 | \n",
+ " 0.0206 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 13767 | \n",
+ " 4.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 7.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 143.9 | \n",
+ " 41669.233333 | \n",
+ " 0.0 | \n",
+ " 0.0000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 9672 | \n",
+ " 10.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 47.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 9.0 | \n",
+ " 2361.600000 | \n",
+ " 1.0 | \n",
+ " 0.0938 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 9116 | \n",
+ " 9.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 112.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 32.5 | \n",
+ " 7264.200000 | \n",
+ " 0.0 | \n",
+ " 0.0000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 10055 | \n",
+ " 6.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 6.0 | \n",
+ " 28.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 88.0 | \n",
+ " 17558.033333 | \n",
+ " 1.0 | \n",
+ " 0.0473 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 235 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " cate action_before_3_1.0_x action_before_3_2.0_x \\\n",
+ "6765 8.0 8.0 0.0 \n",
+ "13767 4.0 8.0 0.0 \n",
+ "9672 10.0 1.0 0.0 \n",
+ "9116 9.0 2.0 0.0 \n",
+ "10055 6.0 2.0 1.0 \n",
+ "\n",
+ " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n",
+ "6765 0.0 0.0 1.0 \n",
+ "13767 0.0 0.0 0.0 \n",
+ "9672 0.0 0.0 0.0 \n",
+ "9116 0.0 0.0 0.0 \n",
+ "10055 1.0 0.0 0.0 \n",
+ "\n",
+ " action_before_3_6.0_x action_before_3_1.0_y action_before_3_2.0_y \\\n",
+ "6765 18.0 12.0 0.0 \n",
+ "13767 7.0 0.0 0.0 \n",
+ "9672 0.0 47.0 0.0 \n",
+ "9116 4.0 112.0 0.0 \n",
+ "10055 6.0 28.0 1.0 \n",
+ "\n",
+ " action_before_3_3.0_y ... cate_action_5_mean cate_action_6_mean \\\n",
+ "6765 0.0 ... 116.8 24942.666667 \n",
+ "13767 0.0 ... 143.9 41669.233333 \n",
+ "9672 0.0 ... 9.0 2361.600000 \n",
+ "9116 0.0 ... 32.5 7264.200000 \n",
+ "10055 0.0 ... 88.0 17558.033333 \n",
+ "\n",
+ " has_bad_comment bad_comment_rate comment_num_0 comment_num_1 \\\n",
+ "6765 1.0 0.0206 0.0 0.0 \n",
+ "13767 0.0 0.0000 0.0 0.0 \n",
+ "9672 1.0 0.0938 0.0 0.0 \n",
+ "9116 0.0 0.0000 0.0 0.0 \n",
+ "10055 1.0 0.0473 0.0 0.0 \n",
+ "\n",
+ " comment_num_2 comment_num_3 comment_num_4 pred_label \n",
+ "6765 0.0 0.0 1.0 0.0 \n",
+ "13767 1.0 0.0 0.0 0.0 \n",
+ "9672 0.0 1.0 0.0 0.0 \n",
+ "9116 0.0 0.0 0.0 0.0 \n",
+ "10055 0.0 0.0 1.0 0.0 \n",
+ "\n",
+ "[5 rows x 235 columns]"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def label(column):\n",
+ " if column['pred_label'] > 0.5:\n",
+ " #print ('yes')\n",
+ " column['pred_label'] = 1\n",
+ " else:\n",
+ " column['pred_label'] = 0\n",
+ " return column\n",
+ "x_test = x_test.apply(label,axis = 1)\n",
+ "x_test.head() "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " cate | \n",
+ " action_before_3_1.0_x | \n",
+ " action_before_3_2.0_x | \n",
+ " action_before_3_3.0_x | \n",
+ " action_before_3_4.0_x | \n",
+ " action_before_3_5.0_x | \n",
+ " action_before_3_6.0_x | \n",
+ " action_before_3_1.0_y | \n",
+ " action_before_3_2.0_y | \n",
+ " action_before_3_3.0_y | \n",
+ " ... | \n",
+ " cate_action_6_mean | \n",
+ " has_bad_comment | \n",
+ " bad_comment_rate | \n",
+ " comment_num_0 | \n",
+ " comment_num_1 | \n",
+ " comment_num_2 | \n",
+ " comment_num_3 | \n",
+ " comment_num_4 | \n",
+ " pred_label | \n",
+ " true_label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 6765 | \n",
+ " 8.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 18.0 | \n",
+ " 12.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 24942.666667 | \n",
+ " 1.0 | \n",
+ " 0.0206 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 13767 | \n",
+ " 4.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 7.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 41669.233333 | \n",
+ " 0.0 | \n",
+ " 0.0000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 9672 | \n",
+ " 10.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 47.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 2361.600000 | \n",
+ " 1.0 | \n",
+ " 0.0938 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 9116 | \n",
+ " 9.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 112.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 7264.200000 | \n",
+ " 0.0 | \n",
+ " 0.0000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 10055 | \n",
+ " 6.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 6.0 | \n",
+ " 28.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 17558.033333 | \n",
+ " 1.0 | \n",
+ " 0.0473 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 236 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " cate action_before_3_1.0_x action_before_3_2.0_x \\\n",
+ "6765 8.0 8.0 0.0 \n",
+ "13767 4.0 8.0 0.0 \n",
+ "9672 10.0 1.0 0.0 \n",
+ "9116 9.0 2.0 0.0 \n",
+ "10055 6.0 2.0 1.0 \n",
+ "\n",
+ " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n",
+ "6765 0.0 0.0 1.0 \n",
+ "13767 0.0 0.0 0.0 \n",
+ "9672 0.0 0.0 0.0 \n",
+ "9116 0.0 0.0 0.0 \n",
+ "10055 1.0 0.0 0.0 \n",
+ "\n",
+ " action_before_3_6.0_x action_before_3_1.0_y action_before_3_2.0_y \\\n",
+ "6765 18.0 12.0 0.0 \n",
+ "13767 7.0 0.0 0.0 \n",
+ "9672 0.0 47.0 0.0 \n",
+ "9116 4.0 112.0 0.0 \n",
+ "10055 6.0 28.0 1.0 \n",
+ "\n",
+ " action_before_3_3.0_y ... cate_action_6_mean has_bad_comment \\\n",
+ "6765 0.0 ... 24942.666667 1.0 \n",
+ "13767 0.0 ... 41669.233333 0.0 \n",
+ "9672 0.0 ... 2361.600000 1.0 \n",
+ "9116 0.0 ... 7264.200000 0.0 \n",
+ "10055 0.0 ... 17558.033333 1.0 \n",
+ "\n",
+ " bad_comment_rate comment_num_0 comment_num_1 comment_num_2 \\\n",
+ "6765 0.0206 0.0 0.0 0.0 \n",
+ "13767 0.0000 0.0 0.0 1.0 \n",
+ "9672 0.0938 0.0 0.0 0.0 \n",
+ "9116 0.0000 0.0 0.0 0.0 \n",
+ "10055 0.0473 0.0 0.0 0.0 \n",
+ "\n",
+ " comment_num_3 comment_num_4 pred_label true_label \n",
+ "6765 0.0 1.0 0.0 0.0 \n",
+ "13767 0.0 0.0 0.0 0.0 \n",
+ "9672 1.0 0.0 0.0 0.0 \n",
+ "9116 0.0 0.0 0.0 0.0 \n",
+ "10055 0.0 1.0 0.0 0.0 \n",
+ "\n",
+ "[5 rows x 236 columns]"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "x_test['true_label'] = y_test\n",
+ "x_test.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " cate | \n",
+ " action_before_3_1.0_x | \n",
+ " action_before_3_2.0_x | \n",
+ " action_before_3_3.0_x | \n",
+ " action_before_3_4.0_x | \n",
+ " action_before_3_5.0_x | \n",
+ " action_before_3_6.0_x | \n",
+ " action_before_3_1.0_y | \n",
+ " action_before_3_2.0_y | \n",
+ " action_before_3_3.0_y | \n",
+ " ... | \n",
+ " bad_comment_rate | \n",
+ " comment_num_0 | \n",
+ " comment_num_1 | \n",
+ " comment_num_2 | \n",
+ " comment_num_3 | \n",
+ " comment_num_4 | \n",
+ " pred_label | \n",
+ " true_label | \n",
+ " user_id | \n",
+ " sku_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 6765 | \n",
+ " 8.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 18.0 | \n",
+ " 12.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0206 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 235394.0 | \n",
+ " 24371.0 | \n",
+ "
\n",
+ " \n",
+ " 13767 | \n",
+ " 4.0 | \n",
+ " 8.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 7.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 272948.0 | \n",
+ " 108907.0 | \n",
+ "
\n",
+ " \n",
+ " 9672 | \n",
+ " 10.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 47.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0938 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 245846.0 | \n",
+ " 63026.0 | \n",
+ "
\n",
+ " \n",
+ " 9116 | \n",
+ " 9.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 112.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 272178.0 | \n",
+ " 66704.0 | \n",
+ "
\n",
+ " \n",
+ " 10055 | \n",
+ " 6.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 6.0 | \n",
+ " 28.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0473 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 216485.0 | \n",
+ " 131364.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 238 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " cate action_before_3_1.0_x action_before_3_2.0_x \\\n",
+ "6765 8.0 8.0 0.0 \n",
+ "13767 4.0 8.0 0.0 \n",
+ "9672 10.0 1.0 0.0 \n",
+ "9116 9.0 2.0 0.0 \n",
+ "10055 6.0 2.0 1.0 \n",
+ "\n",
+ " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n",
+ "6765 0.0 0.0 1.0 \n",
+ "13767 0.0 0.0 0.0 \n",
+ "9672 0.0 0.0 0.0 \n",
+ "9116 0.0 0.0 0.0 \n",
+ "10055 1.0 0.0 0.0 \n",
+ "\n",
+ " action_before_3_6.0_x action_before_3_1.0_y action_before_3_2.0_y \\\n",
+ "6765 18.0 12.0 0.0 \n",
+ "13767 7.0 0.0 0.0 \n",
+ "9672 0.0 47.0 0.0 \n",
+ "9116 4.0 112.0 0.0 \n",
+ "10055 6.0 28.0 1.0 \n",
+ "\n",
+ " action_before_3_3.0_y ... bad_comment_rate comment_num_0 \\\n",
+ "6765 0.0 ... 0.0206 0.0 \n",
+ "13767 0.0 ... 0.0000 0.0 \n",
+ "9672 0.0 ... 0.0938 0.0 \n",
+ "9116 0.0 ... 0.0000 0.0 \n",
+ "10055 0.0 ... 0.0473 0.0 \n",
+ "\n",
+ " comment_num_1 comment_num_2 comment_num_3 comment_num_4 pred_label \\\n",
+ "6765 0.0 0.0 0.0 1.0 0.0 \n",
+ "13767 0.0 1.0 0.0 0.0 0.0 \n",
+ "9672 0.0 0.0 1.0 0.0 0.0 \n",
+ "9116 0.0 0.0 0.0 0.0 0.0 \n",
+ "10055 0.0 0.0 0.0 1.0 0.0 \n",
+ "\n",
+ " true_label user_id sku_id \n",
+ "6765 0.0 235394.0 24371.0 \n",
+ "13767 0.0 272948.0 108907.0 \n",
+ "9672 0.0 245846.0 63026.0 \n",
+ "9116 0.0 272178.0 66704.0 \n",
+ "10055 0.0 216485.0 131364.0 \n",
+ "\n",
+ "[5 rows x 238 columns]"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "x_test['user_id'] = users['user_id']\n",
+ "x_test['sku_id'] = users['sku_id']\n",
+ "x_test.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "126\n",
+ "248\n",
+ "267\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 所有购买用户\n",
+ "all_user_set = x_test[x_test['true_label']==1]['user_id'].unique()\n",
+ "print (len(all_user_set))\n",
+ "# 所有预测购买的用户\n",
+ "all_user_test_set = x_test[x_test['pred_label'] == 1]['user_id'].unique()\n",
+ "print (len(all_user_test_set))\n",
+ "all_user_test_item_pair = x_test[x_test['pred_label'] == 1]['user_id'].map(str) + '-' + x_test[x_test['pred_label'] == 1]['sku_id'].map(str)\n",
+ "all_user_test_item_pair = np.array(all_user_test_item_pair)\n",
+ "print (len(all_user_test_item_pair))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "所有用户中预测购买用户的准确率为 0.4838709677419355\n",
+ "所有用户中预测购买用户的召回率0.9523809523809523\n"
+ ]
+ }
+ ],
+ "source": [
+ "pos, neg = 0,0\n",
+ "for user_id in all_user_test_set:\n",
+ " if user_id in all_user_set:\n",
+ " pos += 1\n",
+ " else:\n",
+ " neg += 1\n",
+ "all_user_acc = 1.0 * pos / ( pos + neg)\n",
+ "all_user_recall = 1.0 * pos / len(all_user_set)\n",
+ "print ('所有用户中预测购买用户的准确率为 ' + str(all_user_acc))\n",
+ "print ('所有用户中预测购买用户的召回率' + str(all_user_recall))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "144\n",
+ "所有用户中预测购买商品的准确率为 0.5131086142322098\n",
+ "所有用户中预测购买商品的召回率0.9513888888888888\n",
+ "F11=0.527086383601757\n",
+ "F12=0.7091097308488614\n",
+ "score=0.6363003919500196\n"
+ ]
+ }
+ ],
+ "source": [
+ "#所有实际商品对\n",
+ "all_user_item_pair = x_test[x_test['true_label']==1]['user_id'].map(str) + '-' + x_test[x_test['true_label']==1]['sku_id'].map(str)\n",
+ "all_user_item_pair = np.array(all_user_item_pair)\n",
+ "print (len(all_user_item_pair))\n",
+ "pos, neg = 0, 0\n",
+ "for user_item_pair in all_user_test_item_pair:\n",
+ " #print (user_item_pair)\n",
+ " if user_item_pair in all_user_item_pair:\n",
+ " pos += 1\n",
+ " else:\n",
+ " neg += 1\n",
+ "all_item_acc = 1.0 * pos / ( pos + neg)\n",
+ "all_item_recall = 1.0 * pos / len(all_user_item_pair)\n",
+ "print ('所有用户中预测购买商品的准确率为 ' + str(all_item_acc))\n",
+ "print ('所有用户中预测购买商品的召回率' + str(all_item_recall))\n",
+ "F11 = 6.0 * all_user_recall * all_user_acc / (5.0 * all_user_recall + all_user_acc)\n",
+ "F12 = 5.0 * all_item_acc * all_item_recall / (2.0 * all_item_recall + 3 * all_item_acc)\n",
+ "score = 0.4 * F11 + 0.6 * F12\n",
+ "print ('F11=' + str(F11))\n",
+ "print ('F12=' + str(F12))\n",
+ "print ('score=' + str(score))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}