From a1908d1e9018f3fc451dfd05232d1ce2a4816146 Mon Sep 17 00:00:00 2001
From: benjas <909336740@qq.com>
Date: Sun, 14 Feb 2021 10:54:13 +0800
Subject: [PATCH] =?UTF-8?q?Delete=204-=E6=A8=A1=E5=9E=8B=E8=AE=AD=E7=BB=83?=
=?UTF-8?q?=E5=92=8C=E9=A2=84=E6=B5=8B.ipynb?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../4-模型训练和预测.ipynb | 2505 -----------------
1 file changed, 2505 deletions(-)
delete mode 100644 机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型训练和预测.ipynb
diff --git a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型训练和预测.ipynb b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型训练和预测.ipynb
deleted file mode 100644
index 7984da2..0000000
--- a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/4-模型训练和预测.ipynb
+++ /dev/null
@@ -1,2505 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import sys\n",
- "import pandas as pd\n",
- "import numpy as np\n",
- "import xgboost as xgb\n",
- "from sklearn.model_selection import train_test_split\n",
- "import operator\n",
- "from matplotlib import pylab as plt\n",
- "from datetime import datetime\n",
- "import time\n",
- "from sklearn.model_selection import GridSearchCV"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " user_id | \n",
- " sku_id | \n",
- " cate | \n",
- " action_before_3_1.0_x | \n",
- " action_before_3_2.0_x | \n",
- " action_before_3_3.0_x | \n",
- " action_before_3_4.0_x | \n",
- " action_before_3_5.0_x | \n",
- " action_before_3_6.0_x | \n",
- " action_before_3_1.0_y | \n",
- " ... | \n",
- " cate_action_5_mean | \n",
- " cate_action_6_mean | \n",
- " has_bad_comment | \n",
- " bad_comment_rate | \n",
- " comment_num_0 | \n",
- " comment_num_1 | \n",
- " comment_num_2 | \n",
- " comment_num_3 | \n",
- " comment_num_4 | \n",
- " label | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 202633.0 | \n",
- " 12564.0 | \n",
- " 8.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 2.0 | \n",
- " 1.0 | \n",
- " ... | \n",
- " 20.866667 | \n",
- " 5167.6 | \n",
- " 1.0 | \n",
- " 0.0260 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 218498.0 | \n",
- " 149854.0 | \n",
- " 8.0 | \n",
- " 4.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 4.0 | \n",
- " 2.0 | \n",
- " ... | \n",
- " 20.866667 | \n",
- " 5167.6 | \n",
- " 1.0 | \n",
- " 0.0403 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 221842.0 | \n",
- " 75877.0 | \n",
- " 8.0 | \n",
- " 3.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 5.0 | \n",
- " 79.0 | \n",
- " ... | \n",
- " 20.866667 | \n",
- " 5167.6 | \n",
- " 1.0 | \n",
- " 0.0245 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 222886.0 | \n",
- " 154636.0 | \n",
- " 8.0 | \n",
- " 20.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 26.0 | \n",
- " 10.0 | \n",
- " ... | \n",
- " 20.866667 | \n",
- " 5167.6 | \n",
- " 1.0 | \n",
- " 0.0208 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 235240.0 | \n",
- " 38222.0 | \n",
- " 8.0 | \n",
- " 30.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 28.0 | \n",
- " 55.0 | \n",
- " ... | \n",
- " 20.866667 | \n",
- " 5167.6 | \n",
- " 1.0 | \n",
- " 0.0166 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 237 columns
\n",
- "
"
- ],
- "text/plain": [
- " user_id sku_id cate action_before_3_1.0_x action_before_3_2.0_x \\\n",
- "0 202633.0 12564.0 8.0 1.0 0.0 \n",
- "1 218498.0 149854.0 8.0 4.0 0.0 \n",
- "2 221842.0 75877.0 8.0 3.0 0.0 \n",
- "3 222886.0 154636.0 8.0 20.0 1.0 \n",
- "4 235240.0 38222.0 8.0 30.0 1.0 \n",
- "\n",
- " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n",
- "0 0.0 0.0 0.0 \n",
- "1 0.0 0.0 0.0 \n",
- "2 0.0 0.0 0.0 \n",
- "3 0.0 0.0 0.0 \n",
- "4 0.0 0.0 0.0 \n",
- "\n",
- " action_before_3_6.0_x action_before_3_1.0_y ... cate_action_5_mean \\\n",
- "0 2.0 1.0 ... 20.866667 \n",
- "1 4.0 2.0 ... 20.866667 \n",
- "2 5.0 79.0 ... 20.866667 \n",
- "3 26.0 10.0 ... 20.866667 \n",
- "4 28.0 55.0 ... 20.866667 \n",
- "\n",
- " cate_action_6_mean has_bad_comment bad_comment_rate comment_num_0 \\\n",
- "0 5167.6 1.0 0.0260 0.0 \n",
- "1 5167.6 1.0 0.0403 0.0 \n",
- "2 5167.6 1.0 0.0245 0.0 \n",
- "3 5167.6 1.0 0.0208 0.0 \n",
- "4 5167.6 1.0 0.0166 0.0 \n",
- "\n",
- " comment_num_1 comment_num_2 comment_num_3 comment_num_4 label \n",
- "0 0.0 0.0 0.0 1.0 1.0 \n",
- "1 0.0 0.0 0.0 1.0 1.0 \n",
- "2 0.0 0.0 0.0 1.0 1.0 \n",
- "3 0.0 0.0 0.0 1.0 1.0 \n",
- "4 0.0 0.0 0.0 1.0 1.0 \n",
- "\n",
- "[5 rows x 237 columns]"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "data = pd.read_csv('data/train_set.csv') # 读取训练数据\n",
- "data.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['user_id', 'sku_id', 'cate', 'action_before_3_1.0_x',\n",
- " 'action_before_3_2.0_x', 'action_before_3_3.0_x',\n",
- " 'action_before_3_4.0_x', 'action_before_3_5.0_x',\n",
- " 'action_before_3_6.0_x', 'action_before_3_1.0_y',\n",
- " ...\n",
- " 'cate_action_5_mean', 'cate_action_6_mean', 'has_bad_comment',\n",
- " 'bad_comment_rate', 'comment_num_0', 'comment_num_1', 'comment_num_2',\n",
- " 'comment_num_3', 'comment_num_4', 'label'],\n",
- " dtype='object', length=237)"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "data.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " user_id | \n",
- " sku_id | \n",
- " cate | \n",
- " action_before_3_1.0_x | \n",
- " action_before_3_2.0_x | \n",
- " action_before_3_3.0_x | \n",
- " action_before_3_4.0_x | \n",
- " action_before_3_5.0_x | \n",
- " action_before_3_6.0_x | \n",
- " action_before_3_1.0_y | \n",
- " ... | \n",
- " cate_action_4_mean | \n",
- " cate_action_5_mean | \n",
- " cate_action_6_mean | \n",
- " has_bad_comment | \n",
- " bad_comment_rate | \n",
- " comment_num_0 | \n",
- " comment_num_1 | \n",
- " comment_num_2 | \n",
- " comment_num_3 | \n",
- " comment_num_4 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 202633.0 | \n",
- " 12564.0 | \n",
- " 8.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 2.0 | \n",
- " 1.0 | \n",
- " ... | \n",
- " 8.4 | \n",
- " 20.866667 | \n",
- " 5167.6 | \n",
- " 1.0 | \n",
- " 0.0260 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 218498.0 | \n",
- " 149854.0 | \n",
- " 8.0 | \n",
- " 4.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 4.0 | \n",
- " 2.0 | \n",
- " ... | \n",
- " 8.4 | \n",
- " 20.866667 | \n",
- " 5167.6 | \n",
- " 1.0 | \n",
- " 0.0403 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 221842.0 | \n",
- " 75877.0 | \n",
- " 8.0 | \n",
- " 3.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 5.0 | \n",
- " 79.0 | \n",
- " ... | \n",
- " 8.4 | \n",
- " 20.866667 | \n",
- " 5167.6 | \n",
- " 1.0 | \n",
- " 0.0245 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 222886.0 | \n",
- " 154636.0 | \n",
- " 8.0 | \n",
- " 20.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 26.0 | \n",
- " 10.0 | \n",
- " ... | \n",
- " 8.4 | \n",
- " 20.866667 | \n",
- " 5167.6 | \n",
- " 1.0 | \n",
- " 0.0208 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 235240.0 | \n",
- " 38222.0 | \n",
- " 8.0 | \n",
- " 30.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 28.0 | \n",
- " 55.0 | \n",
- " ... | \n",
- " 8.4 | \n",
- " 20.866667 | \n",
- " 5167.6 | \n",
- " 1.0 | \n",
- " 0.0166 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 236 columns
\n",
- "
"
- ],
- "text/plain": [
- " user_id sku_id cate action_before_3_1.0_x action_before_3_2.0_x \\\n",
- "0 202633.0 12564.0 8.0 1.0 0.0 \n",
- "1 218498.0 149854.0 8.0 4.0 0.0 \n",
- "2 221842.0 75877.0 8.0 3.0 0.0 \n",
- "3 222886.0 154636.0 8.0 20.0 1.0 \n",
- "4 235240.0 38222.0 8.0 30.0 1.0 \n",
- "\n",
- " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n",
- "0 0.0 0.0 0.0 \n",
- "1 0.0 0.0 0.0 \n",
- "2 0.0 0.0 0.0 \n",
- "3 0.0 0.0 0.0 \n",
- "4 0.0 0.0 0.0 \n",
- "\n",
- " action_before_3_6.0_x action_before_3_1.0_y ... cate_action_4_mean \\\n",
- "0 2.0 1.0 ... 8.4 \n",
- "1 4.0 2.0 ... 8.4 \n",
- "2 5.0 79.0 ... 8.4 \n",
- "3 26.0 10.0 ... 8.4 \n",
- "4 28.0 55.0 ... 8.4 \n",
- "\n",
- " cate_action_5_mean cate_action_6_mean has_bad_comment bad_comment_rate \\\n",
- "0 20.866667 5167.6 1.0 0.0260 \n",
- "1 20.866667 5167.6 1.0 0.0403 \n",
- "2 20.866667 5167.6 1.0 0.0245 \n",
- "3 20.866667 5167.6 1.0 0.0208 \n",
- "4 20.866667 5167.6 1.0 0.0166 \n",
- "\n",
- " comment_num_0 comment_num_1 comment_num_2 comment_num_3 comment_num_4 \n",
- "0 0.0 0.0 0.0 0.0 1.0 \n",
- "1 0.0 0.0 0.0 0.0 1.0 \n",
- "2 0.0 0.0 0.0 0.0 1.0 \n",
- "3 0.0 0.0 0.0 0.0 1.0 \n",
- "4 0.0 0.0 0.0 0.0 1.0 \n",
- "\n",
- "[5 rows x 236 columns]"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "data_x = data.loc[:,data.columns != 'label'] # 将训练数据集分成特征和标签\n",
- "data_y = data.loc[:,data.columns == 'label']\n",
- "data_x.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " label | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " label\n",
- "0 1.0\n",
- "1 1.0\n",
- "2 1.0\n",
- "3 1.0\n",
- "4 1.0"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "data_y.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(11695, 236)\n",
- "(2924, 236)\n"
- ]
- }
- ],
- "source": [
- "x_train, x_test, y_train, y_test = train_test_split(data_x,data_y,test_size = 0.2, random_state = 0) # 数据切分成两份,训练和测试,8:2切分\n",
- "print(x_train.shape)\n",
- "print(x_test.shape)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [],
- "source": [
- "# 上面测试集刚好有近3000条,二切分成训练时的验证和预测\n",
- "x_val = x_test.iloc[:1500,:]\n",
- "y_val = y_test.iloc[:1500,:]\n",
- "\n",
- "x_test = x_test.iloc[1500:,:] \n",
- "y_test = y_test.iloc[1500:,:]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(1500, 236)\n",
- "(1424, 236)\n"
- ]
- }
- ],
- "source": [
- "print(x_val.shape)\n",
- "print(x_test.shape)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " cate | \n",
- " action_before_3_1.0_x | \n",
- " action_before_3_2.0_x | \n",
- " action_before_3_3.0_x | \n",
- " action_before_3_4.0_x | \n",
- " action_before_3_5.0_x | \n",
- " action_before_3_6.0_x | \n",
- " action_before_3_1.0_y | \n",
- " action_before_3_2.0_y | \n",
- " action_before_3_3.0_y | \n",
- " ... | \n",
- " cate_action_4_mean | \n",
- " cate_action_5_mean | \n",
- " cate_action_6_mean | \n",
- " has_bad_comment | \n",
- " bad_comment_rate | \n",
- " comment_num_0 | \n",
- " comment_num_1 | \n",
- " comment_num_2 | \n",
- " comment_num_3 | \n",
- " comment_num_4 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 2157 | \n",
- " 4.0 | \n",
- " 2.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 4.0 | \n",
- " 48.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 25.300000 | \n",
- " 52.133333 | \n",
- " 16112.033333 | \n",
- " 1.0 | \n",
- " 0.0344 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 2464 | \n",
- " 10.0 | \n",
- " 2.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 7.0 | \n",
- " 288.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 0.433333 | \n",
- " 5.066667 | \n",
- " 1273.500000 | \n",
- " 1.0 | \n",
- " 0.0132 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 10326 | \n",
- " 8.0 | \n",
- " 2.0 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 3.0 | \n",
- " 42.0 | \n",
- " 2.0 | \n",
- " 1.0 | \n",
- " ... | \n",
- " 35.233333 | \n",
- " 149.266667 | \n",
- " 32299.233333 | \n",
- " 1.0 | \n",
- " 0.0213 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 7025 | \n",
- " 8.0 | \n",
- " 2.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 2.0 | \n",
- " 36.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 25.733333 | \n",
- " 116.800000 | \n",
- " 24942.666667 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 6625 | \n",
- " 7.0 | \n",
- " 13.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 22.0 | \n",
- " 92.0 | \n",
- " 5.0 | \n",
- " 1.0 | \n",
- " ... | \n",
- " 17.000000 | \n",
- " 36.166667 | \n",
- " 9447.266667 | \n",
- " 1.0 | \n",
- " 0.0800 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 234 columns
\n",
- "
"
- ],
- "text/plain": [
- " cate action_before_3_1.0_x action_before_3_2.0_x \\\n",
- "2157 4.0 2.0 0.0 \n",
- "2464 10.0 2.0 0.0 \n",
- "10326 8.0 2.0 1.0 \n",
- "7025 8.0 2.0 0.0 \n",
- "6625 7.0 13.0 0.0 \n",
- "\n",
- " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n",
- "2157 0.0 0.0 0.0 \n",
- "2464 0.0 0.0 0.0 \n",
- "10326 1.0 0.0 0.0 \n",
- "7025 0.0 0.0 0.0 \n",
- "6625 0.0 0.0 0.0 \n",
- "\n",
- " action_before_3_6.0_x action_before_3_1.0_y action_before_3_2.0_y \\\n",
- "2157 4.0 48.0 0.0 \n",
- "2464 7.0 288.0 0.0 \n",
- "10326 3.0 42.0 2.0 \n",
- "7025 2.0 36.0 1.0 \n",
- "6625 22.0 92.0 5.0 \n",
- "\n",
- " action_before_3_3.0_y ... cate_action_4_mean cate_action_5_mean \\\n",
- "2157 0.0 ... 25.300000 52.133333 \n",
- "2464 0.0 ... 0.433333 5.066667 \n",
- "10326 1.0 ... 35.233333 149.266667 \n",
- "7025 0.0 ... 25.733333 116.800000 \n",
- "6625 1.0 ... 17.000000 36.166667 \n",
- "\n",
- " cate_action_6_mean has_bad_comment bad_comment_rate comment_num_0 \\\n",
- "2157 16112.033333 1.0 0.0344 0.0 \n",
- "2464 1273.500000 1.0 0.0132 0.0 \n",
- "10326 32299.233333 1.0 0.0213 0.0 \n",
- "7025 24942.666667 0.0 0.0000 0.0 \n",
- "6625 9447.266667 1.0 0.0800 0.0 \n",
- "\n",
- " comment_num_1 comment_num_2 comment_num_3 comment_num_4 \n",
- "2157 0.0 0.0 0.0 1.0 \n",
- "2464 0.0 0.0 0.0 1.0 \n",
- "10326 0.0 0.0 0.0 1.0 \n",
- "7025 0.0 1.0 0.0 0.0 \n",
- "6625 0.0 0.0 0.0 1.0 \n",
- "\n",
- "[5 rows x 234 columns]"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "del x_train['user_id']\n",
- "del x_train['sku_id']\n",
- "\n",
- "del x_val['user_id']\n",
- "del x_val['sku_id']\n",
- "\n",
- "x_train.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [],
- "source": [
- "dtrain = xgb.DMatrix(x_train, label=y_train)\n",
- "dvalid = xgb.DMatrix(x_val, label=y_val)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [],
- "source": [
- "param = {'n_estimators': 4000, 'max_depth': 3, 'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0, \n",
- " 'colsample_bytree': 0.8, 'scale_pos_weight':10, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic',\n",
- " 'eval_metric':'auc'}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[0]\ttrain-auc:0.938547\teval-auc:0.934522\n",
- "Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.\n",
- "\n",
- "Will train until eval-auc hasn't improved in 10 rounds.\n",
- "[1]\ttrain-auc:0.947568\teval-auc:0.944769\n",
- "[2]\ttrain-auc:0.952758\teval-auc:0.949358\n",
- "[3]\ttrain-auc:0.955704\teval-auc:0.952481\n",
- "[4]\ttrain-auc:0.95525\teval-auc:0.952343\n",
- "[5]\ttrain-auc:0.957462\teval-auc:0.95475\n",
- "[6]\ttrain-auc:0.957636\teval-auc:0.955133\n",
- "[7]\ttrain-auc:0.958327\teval-auc:0.955077\n",
- "[8]\ttrain-auc:0.958339\teval-auc:0.95549\n",
- "[9]\ttrain-auc:0.958235\teval-auc:0.955479\n",
- "[10]\ttrain-auc:0.958922\teval-auc:0.955668\n",
- "[11]\ttrain-auc:0.959378\teval-auc:0.956423\n",
- "[12]\ttrain-auc:0.959666\teval-auc:0.956724\n",
- "[13]\ttrain-auc:0.959674\teval-auc:0.957192\n",
- "[14]\ttrain-auc:0.960227\teval-auc:0.957447\n",
- "[15]\ttrain-auc:0.960316\teval-auc:0.957583\n",
- "[16]\ttrain-auc:0.961338\teval-auc:0.959102\n",
- "[17]\ttrain-auc:0.961986\teval-auc:0.958905\n",
- "[18]\ttrain-auc:0.962297\teval-auc:0.959365\n",
- "[19]\ttrain-auc:0.962798\teval-auc:0.959863\n",
- "[20]\ttrain-auc:0.963266\teval-auc:0.960243\n",
- "[21]\ttrain-auc:0.963661\teval-auc:0.96069\n",
- "[22]\ttrain-auc:0.964377\teval-auc:0.961733\n",
- "[23]\ttrain-auc:0.964741\teval-auc:0.962419\n",
- "[24]\ttrain-auc:0.964886\teval-auc:0.962837\n",
- "[25]\ttrain-auc:0.965193\teval-auc:0.963015\n",
- "[26]\ttrain-auc:0.965453\teval-auc:0.963356\n",
- "[27]\ttrain-auc:0.965766\teval-auc:0.963154\n",
- "[28]\ttrain-auc:0.965954\teval-auc:0.963015\n",
- "[29]\ttrain-auc:0.966586\teval-auc:0.963877\n",
- "[30]\ttrain-auc:0.966813\teval-auc:0.963994\n",
- "[31]\ttrain-auc:0.967003\teval-auc:0.96425\n",
- "[32]\ttrain-auc:0.967469\teval-auc:0.965021\n",
- "[33]\ttrain-auc:0.967952\teval-auc:0.964936\n",
- "[34]\ttrain-auc:0.968365\teval-auc:0.965487\n",
- "[35]\ttrain-auc:0.968576\teval-auc:0.965891\n",
- "[36]\ttrain-auc:0.968961\teval-auc:0.966418\n",
- "[37]\ttrain-auc:0.969475\teval-auc:0.966474\n",
- "[38]\ttrain-auc:0.96979\teval-auc:0.966923\n",
- "[39]\ttrain-auc:0.970028\teval-auc:0.967189\n",
- "[40]\ttrain-auc:0.970177\teval-auc:0.967397\n",
- "[41]\ttrain-auc:0.970596\teval-auc:0.967048\n",
- "[42]\ttrain-auc:0.970871\teval-auc:0.967607\n",
- "[43]\ttrain-auc:0.971206\teval-auc:0.968102\n",
- "[44]\ttrain-auc:0.971298\teval-auc:0.968176\n",
- "[45]\ttrain-auc:0.971754\teval-auc:0.968493\n",
- "[46]\ttrain-auc:0.971813\teval-auc:0.968546\n",
- "[47]\ttrain-auc:0.972115\teval-auc:0.968902\n",
- "[48]\ttrain-auc:0.972266\teval-auc:0.968961\n",
- "[49]\ttrain-auc:0.972328\teval-auc:0.969046\n",
- "[50]\ttrain-auc:0.972632\teval-auc:0.968804\n",
- "[51]\ttrain-auc:0.973076\teval-auc:0.968977\n",
- "[52]\ttrain-auc:0.973468\teval-auc:0.969573\n",
- "[53]\ttrain-auc:0.973681\teval-auc:0.969738\n",
- "[54]\ttrain-auc:0.973826\teval-auc:0.970062\n",
- "[55]\ttrain-auc:0.974159\teval-auc:0.970142\n",
- "[56]\ttrain-auc:0.974259\teval-auc:0.970254\n",
- "[57]\ttrain-auc:0.974533\teval-auc:0.970278\n",
- "[58]\ttrain-auc:0.974716\teval-auc:0.970485\n",
- "[59]\ttrain-auc:0.974808\teval-auc:0.970538\n",
- "[60]\ttrain-auc:0.975131\teval-auc:0.9709\n",
- "[61]\ttrain-auc:0.975251\teval-auc:0.97106\n",
- "[62]\ttrain-auc:0.97532\teval-auc:0.971113\n",
- "[63]\ttrain-auc:0.975468\teval-auc:0.971262\n",
- "[64]\ttrain-auc:0.975523\teval-auc:0.971342\n",
- "[65]\ttrain-auc:0.975621\teval-auc:0.971342\n",
- "[66]\ttrain-auc:0.975726\teval-auc:0.97132\n",
- "[67]\ttrain-auc:0.975945\teval-auc:0.971496\n",
- "[68]\ttrain-auc:0.976067\teval-auc:0.971703\n",
- "[69]\ttrain-auc:0.976234\teval-auc:0.971991\n",
- "[70]\ttrain-auc:0.976296\teval-auc:0.972044\n",
- "[71]\ttrain-auc:0.976386\teval-auc:0.972129\n",
- "[72]\ttrain-auc:0.976636\teval-auc:0.972087\n",
- "[73]\ttrain-auc:0.976809\teval-auc:0.972156\n",
- "[74]\ttrain-auc:0.97688\teval-auc:0.972251\n",
- "[75]\ttrain-auc:0.977256\teval-auc:0.972459\n",
- "[76]\ttrain-auc:0.977306\teval-auc:0.972507\n",
- "[77]\ttrain-auc:0.977407\teval-auc:0.972603\n",
- "[78]\ttrain-auc:0.977514\teval-auc:0.972656\n",
- "[79]\ttrain-auc:0.977588\teval-auc:0.972757\n",
- "[80]\ttrain-auc:0.977843\teval-auc:0.972972\n",
- "[81]\ttrain-auc:0.977938\teval-auc:0.973036\n",
- "[82]\ttrain-auc:0.978056\teval-auc:0.972962\n",
- "[83]\ttrain-auc:0.97829\teval-auc:0.973058\n",
- "[84]\ttrain-auc:0.978366\teval-auc:0.973132\n",
- "[85]\ttrain-auc:0.97844\teval-auc:0.973132\n",
- "[86]\ttrain-auc:0.978461\teval-auc:0.973143\n",
- "[87]\ttrain-auc:0.97852\teval-auc:0.973207\n",
- "[88]\ttrain-auc:0.978731\teval-auc:0.973457\n",
- "[89]\ttrain-auc:0.978776\teval-auc:0.973499\n",
- "[90]\ttrain-auc:0.978881\teval-auc:0.973446\n",
- "[91]\ttrain-auc:0.979052\teval-auc:0.973494\n",
- "[92]\ttrain-auc:0.979078\teval-auc:0.973499\n",
- "[93]\ttrain-auc:0.979186\teval-auc:0.973637\n",
- "[94]\ttrain-auc:0.9793\teval-auc:0.973712\n",
- "[95]\ttrain-auc:0.979578\teval-auc:0.973733\n",
- "[96]\ttrain-auc:0.979638\teval-auc:0.973797\n",
- "[97]\ttrain-auc:0.979718\teval-auc:0.974021\n",
- "[98]\ttrain-auc:0.979887\teval-auc:0.973978\n",
- "[99]\ttrain-auc:0.9799\teval-auc:0.973957\n",
- "[100]\ttrain-auc:0.979966\teval-auc:0.974106\n",
- "[101]\ttrain-auc:0.980003\teval-auc:0.974159\n",
- "[102]\ttrain-auc:0.98012\teval-auc:0.973994\n",
- "[103]\ttrain-auc:0.980258\teval-auc:0.973962\n",
- "[104]\ttrain-auc:0.980323\teval-auc:0.973903\n",
- "[105]\ttrain-auc:0.980386\teval-auc:0.973999\n",
- "[106]\ttrain-auc:0.980468\teval-auc:0.973946\n",
- "[107]\ttrain-auc:0.980523\teval-auc:0.974058\n",
- "[108]\ttrain-auc:0.980577\teval-auc:0.974116\n",
- "[109]\ttrain-auc:0.98073\teval-auc:0.974239\n",
- "[110]\ttrain-auc:0.98088\teval-auc:0.974244\n",
- "[111]\ttrain-auc:0.980953\teval-auc:0.974377\n",
- "[112]\ttrain-auc:0.981079\teval-auc:0.974409\n",
- "[113]\ttrain-auc:0.981224\teval-auc:0.974499\n",
- "[114]\ttrain-auc:0.981241\teval-auc:0.974515\n",
- "[115]\ttrain-auc:0.981318\teval-auc:0.97434\n",
- "[116]\ttrain-auc:0.981389\teval-auc:0.97451\n",
- "[117]\ttrain-auc:0.981489\teval-auc:0.974537\n",
- "[118]\ttrain-auc:0.981613\teval-auc:0.974654\n",
- "[119]\ttrain-auc:0.981645\teval-auc:0.974765\n",
- "[120]\ttrain-auc:0.981738\teval-auc:0.974739\n",
- "[121]\ttrain-auc:0.98188\teval-auc:0.974707\n",
- "[122]\ttrain-auc:0.98195\teval-auc:0.974643\n",
- "[123]\ttrain-auc:0.982098\teval-auc:0.974659\n",
- "[124]\ttrain-auc:0.982177\teval-auc:0.974723\n",
- "[125]\ttrain-auc:0.982389\teval-auc:0.974941\n",
- "[126]\ttrain-auc:0.982517\teval-auc:0.97509\n",
- "[127]\ttrain-auc:0.982527\teval-auc:0.975132\n",
- "[128]\ttrain-auc:0.982643\teval-auc:0.97517\n",
- "[129]\ttrain-auc:0.982795\teval-auc:0.97509\n",
- "[130]\ttrain-auc:0.982866\teval-auc:0.975122\n",
- "[131]\ttrain-auc:0.98296\teval-auc:0.975186\n",
- "[132]\ttrain-auc:0.983059\teval-auc:0.975223\n",
- "[133]\ttrain-auc:0.983209\teval-auc:0.975143\n",
- "[134]\ttrain-auc:0.983343\teval-auc:0.975239\n",
- "[135]\ttrain-auc:0.983497\teval-auc:0.975266\n",
- "[136]\ttrain-auc:0.983545\teval-auc:0.975228\n",
- "[137]\ttrain-auc:0.98368\teval-auc:0.975196\n",
- "[138]\ttrain-auc:0.983674\teval-auc:0.975244\n",
- "[139]\ttrain-auc:0.983737\teval-auc:0.975223\n",
- "[140]\ttrain-auc:0.983804\teval-auc:0.97518\n",
- "[141]\ttrain-auc:0.983939\teval-auc:0.975143\n",
- "[142]\ttrain-auc:0.983985\teval-auc:0.975159\n",
- "[143]\ttrain-auc:0.984077\teval-auc:0.975095\n",
- "[144]\ttrain-auc:0.984248\teval-auc:0.975074\n",
- "[145]\ttrain-auc:0.984285\teval-auc:0.975042\n",
- "Stopping. Best iteration:\n",
- "[135]\ttrain-auc:0.983497\teval-auc:0.975266\n",
- "\n"
- ]
- }
- ],
- "source": [
- "num_round = param['n_estimators']\n",
- "\n",
- "plst = param.items()\n",
- "evallist = [(dtrain, 'train'), (dvalid, 'eval')]\n",
- "best = xgb.train(plst, dtrain, num_round, evallist, early_stopping_rounds=10) # 寻找最优参\n",
- "best.save_model('bst.model')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'best_iteration': '135',\n",
- " 'best_msg': '[135]\\ttrain-auc:0.983497\\teval-auc:0.975266',\n",
- " 'best_score': '0.975266'}"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "best.attributes()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [],
- "source": [
- "def create_feature_map(features):\n",
- " outfile = open(r'xgb.fmap', 'w')\n",
- " i = 0\n",
- " for feat in features:\n",
- " outfile.write('{0}\\t{1}\\tq\\n'.format(i, feat))\n",
- " i = i + 1\n",
- " outfile.close()\n",
- "\n",
- "\n",
- "features = list(x_train.columns[:])\n",
- "create_feature_map(features)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [],
- "source": [
- "def feature_importance(best_xgb):\n",
- " importance = best_xgb.get_fscore(fmap=r'xgb.fmap')\n",
- " importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)\n",
- "\n",
- " df = pd.DataFrame(importance, columns=['feature', 'fscore'])\n",
- " df['fscore'] = df['fscore'] / df['fscore'].sum()\n",
- " file_name = 'data/feature_importance_' + str(datetime.now().date())[5:] + '.csv'\n",
- " df.to_csv(file_name)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [],
- "source": [
- "feature_importance(best)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Unnamed: 0 | \n",
- " feature | \n",
- " fscore | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0 | \n",
- " brand | \n",
- " 0.077852 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 1 | \n",
- " action_before_10_5.0_x | \n",
- " 0.041611 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2 | \n",
- " bad_comment_rate | \n",
- " 0.038926 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 3 | \n",
- " product_action_5_ratio | \n",
- " 0.028188 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 4 | \n",
- " user_lv_cd_2 | \n",
- " 0.025503 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Unnamed: 0 feature fscore\n",
- "0 0 brand 0.077852\n",
- "1 1 action_before_10_5.0_x 0.041611\n",
- "2 2 bad_comment_rate 0.038926\n",
- "3 3 product_action_5_ratio 0.028188\n",
- "4 4 user_lv_cd_2 0.025503"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "fi = pd.read_csv('data/feature_importance_02-05.csv')\n",
- "fi.sort_values(\"fscore\", inplace=True, ascending=False)\n",
- "fi.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " user_id | \n",
- " sku_id | \n",
- " cate | \n",
- " action_before_3_1.0_x | \n",
- " action_before_3_2.0_x | \n",
- " action_before_3_3.0_x | \n",
- " action_before_3_4.0_x | \n",
- " action_before_3_5.0_x | \n",
- " action_before_3_6.0_x | \n",
- " action_before_3_1.0_y | \n",
- " ... | \n",
- " cate_action_4_mean | \n",
- " cate_action_5_mean | \n",
- " cate_action_6_mean | \n",
- " has_bad_comment | \n",
- " bad_comment_rate | \n",
- " comment_num_0 | \n",
- " comment_num_1 | \n",
- " comment_num_2 | \n",
- " comment_num_3 | \n",
- " comment_num_4 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 6765 | \n",
- " 235394.0 | \n",
- " 24371.0 | \n",
- " 8.0 | \n",
- " 8.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 18.0 | \n",
- " 12.0 | \n",
- " ... | \n",
- " 25.733333 | \n",
- " 116.8 | \n",
- " 24942.666667 | \n",
- " 1.0 | \n",
- " 0.0206 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 13767 | \n",
- " 272948.0 | \n",
- " 108907.0 | \n",
- " 4.0 | \n",
- " 8.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 7.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 61.633333 | \n",
- " 143.9 | \n",
- " 41669.233333 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 9672 | \n",
- " 245846.0 | \n",
- " 63026.0 | \n",
- " 10.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 47.0 | \n",
- " ... | \n",
- " 1.100000 | \n",
- " 9.0 | \n",
- " 2361.600000 | \n",
- " 1.0 | \n",
- " 0.0938 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 9116 | \n",
- " 272178.0 | \n",
- " 66704.0 | \n",
- " 9.0 | \n",
- " 2.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 4.0 | \n",
- " 112.0 | \n",
- " ... | \n",
- " 10.400000 | \n",
- " 32.5 | \n",
- " 7264.200000 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 10055 | \n",
- " 216485.0 | \n",
- " 131364.0 | \n",
- " 6.0 | \n",
- " 2.0 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 6.0 | \n",
- " 28.0 | \n",
- " ... | \n",
- " 38.233333 | \n",
- " 88.0 | \n",
- " 17558.033333 | \n",
- " 1.0 | \n",
- " 0.0473 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 236 columns
\n",
- "
"
- ],
- "text/plain": [
- " user_id sku_id cate action_before_3_1.0_x action_before_3_2.0_x \\\n",
- "6765 235394.0 24371.0 8.0 8.0 0.0 \n",
- "13767 272948.0 108907.0 4.0 8.0 0.0 \n",
- "9672 245846.0 63026.0 10.0 1.0 0.0 \n",
- "9116 272178.0 66704.0 9.0 2.0 0.0 \n",
- "10055 216485.0 131364.0 6.0 2.0 1.0 \n",
- "\n",
- " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n",
- "6765 0.0 0.0 1.0 \n",
- "13767 0.0 0.0 0.0 \n",
- "9672 0.0 0.0 0.0 \n",
- "9116 0.0 0.0 0.0 \n",
- "10055 1.0 0.0 0.0 \n",
- "\n",
- " action_before_3_6.0_x action_before_3_1.0_y ... cate_action_4_mean \\\n",
- "6765 18.0 12.0 ... 25.733333 \n",
- "13767 7.0 0.0 ... 61.633333 \n",
- "9672 0.0 47.0 ... 1.100000 \n",
- "9116 4.0 112.0 ... 10.400000 \n",
- "10055 6.0 28.0 ... 38.233333 \n",
- "\n",
- " cate_action_5_mean cate_action_6_mean has_bad_comment \\\n",
- "6765 116.8 24942.666667 1.0 \n",
- "13767 143.9 41669.233333 0.0 \n",
- "9672 9.0 2361.600000 1.0 \n",
- "9116 32.5 7264.200000 0.0 \n",
- "10055 88.0 17558.033333 1.0 \n",
- "\n",
- " bad_comment_rate comment_num_0 comment_num_1 comment_num_2 \\\n",
- "6765 0.0206 0.0 0.0 0.0 \n",
- "13767 0.0000 0.0 0.0 1.0 \n",
- "9672 0.0938 0.0 0.0 0.0 \n",
- "9116 0.0000 0.0 0.0 0.0 \n",
- "10055 0.0473 0.0 0.0 0.0 \n",
- "\n",
- " comment_num_3 comment_num_4 \n",
- "6765 0.0 1.0 \n",
- "13767 0.0 0.0 \n",
- "9672 1.0 0.0 \n",
- "9116 0.0 0.0 \n",
- "10055 0.0 1.0 \n",
- "\n",
- "[5 rows x 236 columns]"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "x_test.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {},
- "outputs": [],
- "source": [
- "users = x_test[['user_id', 'sku_id', 'cate']].copy()\n",
- "del x_test['user_id']\n",
- "del x_test['sku_id']\n",
- "x_test_DMatrix = xgb.DMatrix(x_test)\n",
- "y_pred = bst.predict(x_test_DMatrix, ntree_limit=bst.best_ntree_limit)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " cate | \n",
- " action_before_3_1.0_x | \n",
- " action_before_3_2.0_x | \n",
- " action_before_3_3.0_x | \n",
- " action_before_3_4.0_x | \n",
- " action_before_3_5.0_x | \n",
- " action_before_3_6.0_x | \n",
- " action_before_3_1.0_y | \n",
- " action_before_3_2.0_y | \n",
- " action_before_3_3.0_y | \n",
- " ... | \n",
- " cate_action_5_mean | \n",
- " cate_action_6_mean | \n",
- " has_bad_comment | \n",
- " bad_comment_rate | \n",
- " comment_num_0 | \n",
- " comment_num_1 | \n",
- " comment_num_2 | \n",
- " comment_num_3 | \n",
- " comment_num_4 | \n",
- " pred_label | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 6765 | \n",
- " 8.0 | \n",
- " 8.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 18.0 | \n",
- " 12.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 116.8 | \n",
- " 24942.666667 | \n",
- " 1.0 | \n",
- " 0.0206 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.453736 | \n",
- "
\n",
- " \n",
- " 13767 | \n",
- " 4.0 | \n",
- " 8.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 7.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 143.9 | \n",
- " 41669.233333 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.002793 | \n",
- "
\n",
- " \n",
- " 9672 | \n",
- " 10.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 47.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 9.0 | \n",
- " 2361.600000 | \n",
- " 1.0 | \n",
- " 0.0938 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.000167 | \n",
- "
\n",
- " \n",
- " 9116 | \n",
- " 9.0 | \n",
- " 2.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 4.0 | \n",
- " 112.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 32.5 | \n",
- " 7264.200000 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.000225 | \n",
- "
\n",
- " \n",
- " 10055 | \n",
- " 6.0 | \n",
- " 2.0 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 6.0 | \n",
- " 28.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 88.0 | \n",
- " 17558.033333 | \n",
- " 1.0 | \n",
- " 0.0473 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.000507 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 235 columns
\n",
- "
"
- ],
- "text/plain": [
- " cate action_before_3_1.0_x action_before_3_2.0_x \\\n",
- "6765 8.0 8.0 0.0 \n",
- "13767 4.0 8.0 0.0 \n",
- "9672 10.0 1.0 0.0 \n",
- "9116 9.0 2.0 0.0 \n",
- "10055 6.0 2.0 1.0 \n",
- "\n",
- " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n",
- "6765 0.0 0.0 1.0 \n",
- "13767 0.0 0.0 0.0 \n",
- "9672 0.0 0.0 0.0 \n",
- "9116 0.0 0.0 0.0 \n",
- "10055 1.0 0.0 0.0 \n",
- "\n",
- " action_before_3_6.0_x action_before_3_1.0_y action_before_3_2.0_y \\\n",
- "6765 18.0 12.0 0.0 \n",
- "13767 7.0 0.0 0.0 \n",
- "9672 0.0 47.0 0.0 \n",
- "9116 4.0 112.0 0.0 \n",
- "10055 6.0 28.0 1.0 \n",
- "\n",
- " action_before_3_3.0_y ... cate_action_5_mean cate_action_6_mean \\\n",
- "6765 0.0 ... 116.8 24942.666667 \n",
- "13767 0.0 ... 143.9 41669.233333 \n",
- "9672 0.0 ... 9.0 2361.600000 \n",
- "9116 0.0 ... 32.5 7264.200000 \n",
- "10055 0.0 ... 88.0 17558.033333 \n",
- "\n",
- " has_bad_comment bad_comment_rate comment_num_0 comment_num_1 \\\n",
- "6765 1.0 0.0206 0.0 0.0 \n",
- "13767 0.0 0.0000 0.0 0.0 \n",
- "9672 1.0 0.0938 0.0 0.0 \n",
- "9116 0.0 0.0000 0.0 0.0 \n",
- "10055 1.0 0.0473 0.0 0.0 \n",
- "\n",
- " comment_num_2 comment_num_3 comment_num_4 pred_label \n",
- "6765 0.0 0.0 1.0 0.453736 \n",
- "13767 1.0 0.0 0.0 0.002793 \n",
- "9672 0.0 1.0 0.0 0.000167 \n",
- "9116 0.0 0.0 0.0 0.000225 \n",
- "10055 0.0 0.0 1.0 0.000507 \n",
- "\n",
- "[5 rows x 235 columns]"
- ]
- },
- "execution_count": 33,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "x_test['pred_label'] = y_pred\n",
- "x_test.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " cate | \n",
- " action_before_3_1.0_x | \n",
- " action_before_3_2.0_x | \n",
- " action_before_3_3.0_x | \n",
- " action_before_3_4.0_x | \n",
- " action_before_3_5.0_x | \n",
- " action_before_3_6.0_x | \n",
- " action_before_3_1.0_y | \n",
- " action_before_3_2.0_y | \n",
- " action_before_3_3.0_y | \n",
- " ... | \n",
- " cate_action_5_mean | \n",
- " cate_action_6_mean | \n",
- " has_bad_comment | \n",
- " bad_comment_rate | \n",
- " comment_num_0 | \n",
- " comment_num_1 | \n",
- " comment_num_2 | \n",
- " comment_num_3 | \n",
- " comment_num_4 | \n",
- " pred_label | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 6765 | \n",
- " 8.0 | \n",
- " 8.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 18.0 | \n",
- " 12.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 116.8 | \n",
- " 24942.666667 | \n",
- " 1.0 | \n",
- " 0.0206 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 13767 | \n",
- " 4.0 | \n",
- " 8.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 7.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 143.9 | \n",
- " 41669.233333 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 9672 | \n",
- " 10.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 47.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 9.0 | \n",
- " 2361.600000 | \n",
- " 1.0 | \n",
- " 0.0938 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 9116 | \n",
- " 9.0 | \n",
- " 2.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 4.0 | \n",
- " 112.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 32.5 | \n",
- " 7264.200000 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 10055 | \n",
- " 6.0 | \n",
- " 2.0 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 6.0 | \n",
- " 28.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 88.0 | \n",
- " 17558.033333 | \n",
- " 1.0 | \n",
- " 0.0473 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 235 columns
\n",
- "
"
- ],
- "text/plain": [
- " cate action_before_3_1.0_x action_before_3_2.0_x \\\n",
- "6765 8.0 8.0 0.0 \n",
- "13767 4.0 8.0 0.0 \n",
- "9672 10.0 1.0 0.0 \n",
- "9116 9.0 2.0 0.0 \n",
- "10055 6.0 2.0 1.0 \n",
- "\n",
- " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n",
- "6765 0.0 0.0 1.0 \n",
- "13767 0.0 0.0 0.0 \n",
- "9672 0.0 0.0 0.0 \n",
- "9116 0.0 0.0 0.0 \n",
- "10055 1.0 0.0 0.0 \n",
- "\n",
- " action_before_3_6.0_x action_before_3_1.0_y action_before_3_2.0_y \\\n",
- "6765 18.0 12.0 0.0 \n",
- "13767 7.0 0.0 0.0 \n",
- "9672 0.0 47.0 0.0 \n",
- "9116 4.0 112.0 0.0 \n",
- "10055 6.0 28.0 1.0 \n",
- "\n",
- " action_before_3_3.0_y ... cate_action_5_mean cate_action_6_mean \\\n",
- "6765 0.0 ... 116.8 24942.666667 \n",
- "13767 0.0 ... 143.9 41669.233333 \n",
- "9672 0.0 ... 9.0 2361.600000 \n",
- "9116 0.0 ... 32.5 7264.200000 \n",
- "10055 0.0 ... 88.0 17558.033333 \n",
- "\n",
- " has_bad_comment bad_comment_rate comment_num_0 comment_num_1 \\\n",
- "6765 1.0 0.0206 0.0 0.0 \n",
- "13767 0.0 0.0000 0.0 0.0 \n",
- "9672 1.0 0.0938 0.0 0.0 \n",
- "9116 0.0 0.0000 0.0 0.0 \n",
- "10055 1.0 0.0473 0.0 0.0 \n",
- "\n",
- " comment_num_2 comment_num_3 comment_num_4 pred_label \n",
- "6765 0.0 0.0 1.0 0.0 \n",
- "13767 1.0 0.0 0.0 0.0 \n",
- "9672 0.0 1.0 0.0 0.0 \n",
- "9116 0.0 0.0 0.0 0.0 \n",
- "10055 0.0 0.0 1.0 0.0 \n",
- "\n",
- "[5 rows x 235 columns]"
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "def label(column):\n",
- " if column['pred_label'] > 0.5:\n",
- " #rint ('yes')\n",
- " column['pred_label'] = 1\n",
- " else:\n",
- " column['pred_label'] = 0\n",
- " return column\n",
- "x_test = x_test.apply(label,axis = 1)\n",
- "x_test.head() "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " cate | \n",
- " action_before_3_1.0_x | \n",
- " action_before_3_2.0_x | \n",
- " action_before_3_3.0_x | \n",
- " action_before_3_4.0_x | \n",
- " action_before_3_5.0_x | \n",
- " action_before_3_6.0_x | \n",
- " action_before_3_1.0_y | \n",
- " action_before_3_2.0_y | \n",
- " action_before_3_3.0_y | \n",
- " ... | \n",
- " cate_action_6_mean | \n",
- " has_bad_comment | \n",
- " bad_comment_rate | \n",
- " comment_num_0 | \n",
- " comment_num_1 | \n",
- " comment_num_2 | \n",
- " comment_num_3 | \n",
- " comment_num_4 | \n",
- " pred_label | \n",
- " true_label | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 6765 | \n",
- " 8.0 | \n",
- " 8.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 18.0 | \n",
- " 12.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 24942.666667 | \n",
- " 1.0 | \n",
- " 0.0206 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 13767 | \n",
- " 4.0 | \n",
- " 8.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 7.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 41669.233333 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 9672 | \n",
- " 10.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 47.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 2361.600000 | \n",
- " 1.0 | \n",
- " 0.0938 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 9116 | \n",
- " 9.0 | \n",
- " 2.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 4.0 | \n",
- " 112.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 7264.200000 | \n",
- " 0.0 | \n",
- " 0.0000 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 10055 | \n",
- " 6.0 | \n",
- " 2.0 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 6.0 | \n",
- " 28.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 17558.033333 | \n",
- " 1.0 | \n",
- " 0.0473 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 236 columns
\n",
- "
"
- ],
- "text/plain": [
- " cate action_before_3_1.0_x action_before_3_2.0_x \\\n",
- "6765 8.0 8.0 0.0 \n",
- "13767 4.0 8.0 0.0 \n",
- "9672 10.0 1.0 0.0 \n",
- "9116 9.0 2.0 0.0 \n",
- "10055 6.0 2.0 1.0 \n",
- "\n",
- " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n",
- "6765 0.0 0.0 1.0 \n",
- "13767 0.0 0.0 0.0 \n",
- "9672 0.0 0.0 0.0 \n",
- "9116 0.0 0.0 0.0 \n",
- "10055 1.0 0.0 0.0 \n",
- "\n",
- " action_before_3_6.0_x action_before_3_1.0_y action_before_3_2.0_y \\\n",
- "6765 18.0 12.0 0.0 \n",
- "13767 7.0 0.0 0.0 \n",
- "9672 0.0 47.0 0.0 \n",
- "9116 4.0 112.0 0.0 \n",
- "10055 6.0 28.0 1.0 \n",
- "\n",
- " action_before_3_3.0_y ... cate_action_6_mean has_bad_comment \\\n",
- "6765 0.0 ... 24942.666667 1.0 \n",
- "13767 0.0 ... 41669.233333 0.0 \n",
- "9672 0.0 ... 2361.600000 1.0 \n",
- "9116 0.0 ... 7264.200000 0.0 \n",
- "10055 0.0 ... 17558.033333 1.0 \n",
- "\n",
- " bad_comment_rate comment_num_0 comment_num_1 comment_num_2 \\\n",
- "6765 0.0206 0.0 0.0 0.0 \n",
- "13767 0.0000 0.0 0.0 1.0 \n",
- "9672 0.0938 0.0 0.0 0.0 \n",
- "9116 0.0000 0.0 0.0 0.0 \n",
- "10055 0.0473 0.0 0.0 0.0 \n",
- "\n",
- " comment_num_3 comment_num_4 pred_label true_label \n",
- "6765 0.0 1.0 0.0 0.0 \n",
- "13767 0.0 0.0 0.0 0.0 \n",
- "9672 1.0 0.0 0.0 0.0 \n",
- "9116 0.0 0.0 0.0 0.0 \n",
- "10055 0.0 1.0 0.0 0.0 \n",
- "\n",
- "[5 rows x 236 columns]"
- ]
- },
- "execution_count": 35,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "x_test['true_label'] = y_test\n",
- "x_test.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " cate | \n",
- " action_before_3_1.0_x | \n",
- " action_before_3_2.0_x | \n",
- " action_before_3_3.0_x | \n",
- " action_before_3_4.0_x | \n",
- " action_before_3_5.0_x | \n",
- " action_before_3_6.0_x | \n",
- " action_before_3_1.0_y | \n",
- " action_before_3_2.0_y | \n",
- " action_before_3_3.0_y | \n",
- " ... | \n",
- " bad_comment_rate | \n",
- " comment_num_0 | \n",
- " comment_num_1 | \n",
- " comment_num_2 | \n",
- " comment_num_3 | \n",
- " comment_num_4 | \n",
- " pred_label | \n",
- " true_label | \n",
- " user_id | \n",
- " sku_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 6765 | \n",
- " 8.0 | \n",
- " 8.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 18.0 | \n",
- " 12.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 0.0206 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 235394.0 | \n",
- " 24371.0 | \n",
- "
\n",
- " \n",
- " 13767 | \n",
- " 4.0 | \n",
- " 8.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 7.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 0.0000 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 272948.0 | \n",
- " 108907.0 | \n",
- "
\n",
- " \n",
- " 9672 | \n",
- " 10.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 47.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 0.0938 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 245846.0 | \n",
- " 63026.0 | \n",
- "
\n",
- " \n",
- " 9116 | \n",
- " 9.0 | \n",
- " 2.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 4.0 | \n",
- " 112.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 0.0000 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 272178.0 | \n",
- " 66704.0 | \n",
- "
\n",
- " \n",
- " 10055 | \n",
- " 6.0 | \n",
- " 2.0 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 6.0 | \n",
- " 28.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " ... | \n",
- " 0.0473 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 216485.0 | \n",
- " 131364.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 238 columns
\n",
- "
"
- ],
- "text/plain": [
- " cate action_before_3_1.0_x action_before_3_2.0_x \\\n",
- "6765 8.0 8.0 0.0 \n",
- "13767 4.0 8.0 0.0 \n",
- "9672 10.0 1.0 0.0 \n",
- "9116 9.0 2.0 0.0 \n",
- "10055 6.0 2.0 1.0 \n",
- "\n",
- " action_before_3_3.0_x action_before_3_4.0_x action_before_3_5.0_x \\\n",
- "6765 0.0 0.0 1.0 \n",
- "13767 0.0 0.0 0.0 \n",
- "9672 0.0 0.0 0.0 \n",
- "9116 0.0 0.0 0.0 \n",
- "10055 1.0 0.0 0.0 \n",
- "\n",
- " action_before_3_6.0_x action_before_3_1.0_y action_before_3_2.0_y \\\n",
- "6765 18.0 12.0 0.0 \n",
- "13767 7.0 0.0 0.0 \n",
- "9672 0.0 47.0 0.0 \n",
- "9116 4.0 112.0 0.0 \n",
- "10055 6.0 28.0 1.0 \n",
- "\n",
- " action_before_3_3.0_y ... bad_comment_rate comment_num_0 \\\n",
- "6765 0.0 ... 0.0206 0.0 \n",
- "13767 0.0 ... 0.0000 0.0 \n",
- "9672 0.0 ... 0.0938 0.0 \n",
- "9116 0.0 ... 0.0000 0.0 \n",
- "10055 0.0 ... 0.0473 0.0 \n",
- "\n",
- " comment_num_1 comment_num_2 comment_num_3 comment_num_4 pred_label \\\n",
- "6765 0.0 0.0 0.0 1.0 0.0 \n",
- "13767 0.0 1.0 0.0 0.0 0.0 \n",
- "9672 0.0 0.0 1.0 0.0 0.0 \n",
- "9116 0.0 0.0 0.0 0.0 0.0 \n",
- "10055 0.0 0.0 0.0 1.0 0.0 \n",
- "\n",
- " true_label user_id sku_id \n",
- "6765 0.0 235394.0 24371.0 \n",
- "13767 0.0 272948.0 108907.0 \n",
- "9672 0.0 245846.0 63026.0 \n",
- "9116 0.0 272178.0 66704.0 \n",
- "10055 0.0 216485.0 131364.0 \n",
- "\n",
- "[5 rows x 238 columns]"
- ]
- },
- "execution_count": 36,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "x_test['user_id'] = users['user_id']\n",
- "x_test['sku_id'] = users['sku_id']\n",
- "x_test.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "126\n",
- "248\n",
- "267\n"
- ]
- }
- ],
- "source": [
- "# 所有购买用户\n",
- "all_user_set = x_test[x_test['true_label']==1]['user_id'].unique()\n",
- "print (len(all_user_set))\n",
- "# 所有预测购买的用户\n",
- "all_user_test_set = x_test[x_test['pred_label'] == 1]['user_id'].unique()\n",
- "print (len(all_user_test_set))\n",
- "all_user_test_item_pair = x_test[x_test['pred_label'] == 1]['user_id'].map(str) + '-' + x_test[x_test['pred_label'] == 1]['sku_id'].map(str)\n",
- "all_user_test_item_pair = np.array(all_user_test_item_pair)\n",
- "print (len(all_user_test_item_pair))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "所有用户中预测购买用户的准确率为 0.4838709677419355\n",
- "所有用户中预测购买用户的召回率0.9523809523809523\n"
- ]
- }
- ],
- "source": [
- "pos, neg = 0,0\n",
- "for user_id in all_user_test_set:\n",
- " if user_id in all_user_set:\n",
- " pos += 1\n",
- " else:\n",
- " neg += 1\n",
- "all_user_acc = 1.0 * pos / (pos + neg)\n",
- "all_user_recall = 1.0 * pos / len(all_user_set)\n",
- "print ('所有用户中预测购买用户的准确率为 ' + str(all_user_acc))\n",
- "print ('所有用户中预测购买用户的召回率' + str(all_user_recall))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "144\n",
- "所有用户中预测购买商品的准确率为 0.5131086142322098\n",
- "所有用户中预测购买商品的召回率0.9513888888888888\n",
- "F11=0.527086383601757\n",
- "F12=0.7091097308488614\n",
- "score=0.6363003919500196\n"
- ]
- }
- ],
- "source": [
- "#所有实际商品对\n",
- "all_user_item_pair = x_test[x_test['true_label']==1]['user_id'].map(str) + '-' + x_test[x_test['true_label']==1]['sku_id'].map(str)\n",
- "all_user_item_pair = np.array(all_user_item_pair)\n",
- "print (len(all_user_item_pair))\n",
- "pos, neg = 0, 0\n",
- "for user_item_pair in all_user_test_item_pair:\n",
- " #print (user_item_pair)\n",
- " if user_item_pair in all_user_item_pair:\n",
- " pos += 1\n",
- " else:\n",
- " neg += 1\n",
- "all_item_acc = 1.0 * pos / ( pos + neg)\n",
- "all_item_recall = 1.0 * pos / len(all_user_item_pair)\n",
- "print ('所有用户中预测购买商品的准确率为 ' + str(all_item_acc))\n",
- "print ('所有用户中预测购买商品的召回率' + str(all_item_recall))\n",
- "F11 = 6.0 * all_user_recall * all_user_acc / (5.0 * all_user_recall + all_user_acc)\n",
- "F12 = 5.0 * all_item_acc * all_item_recall / (2.0 * all_item_recall + 3 * all_item_acc)\n",
- "score = 0.4 * F11 + 0.6 * F12\n",
- "print ('F11=' + str(F11))\n",
- "print ('F12=' + str(F12))\n",
- "print ('score=' + str(score))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}