From 4ac06866071f24bbd5b840498fb03fffa6a8d215 Mon Sep 17 00:00:00 2001
From: benjas <909336740@qq.com>
Date: Mon, 25 Jan 2021 11:17:42 +0800
Subject: [PATCH] Update. Create the Item table
---
.../数据清洗.ipynb | 1743 ++++++++++++++++-
1 file changed, 1712 insertions(+), 31 deletions(-)
diff --git a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/数据清洗.ipynb b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/数据清洗.ipynb
index fc69279..2224c2f 100644
--- a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/数据清洗.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/数据清洗.ipynb
@@ -1488,7 +1488,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -1500,7 +1500,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -1516,14 +1516,14 @@
" group['favor_num'] = type_cnt[5]\n",
" group['click_num'] = type_cnt[6]\n",
" \n",
- " return group[['user_id', 'browse_num', 'addcart_num',\n",
+ " return group[['sku_id', 'browse_num', 'addcart_num',\n",
" 'delcart_num', 'buy_num', 'favor_num',\n",
" 'click_num']]"
]
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -1534,22 +1534,22 @@
" loop = True\n",
" while loop:\n",
" try:\n",
- " chunk = reader.get_chunk(chunk_size)[[\"user_id\", \"type\"]]\n",
+ " chunk = reader.get_chunk(chunk_size)[[\"sku_id\", \"type\"]]\n",
" chunks.append(chunk)\n",
" except StopIteration:\n",
" loop = False\n",
" print(\"Iteration is stopped\")\n",
" \n",
" df_ac = pd.concat(chunks, ignore_index=True)\n",
- " df_ac = df_ac.groupby(['user_id'], as_index=False).apply(add_type_count)\n",
- " df_ac = df_ac.drop_duplicates('user_id')\n",
+ " df_ac = df_ac.groupby(['sku_id'], as_index=False).apply(add_type_count)\n",
+ " df_ac = df_ac.drop_duplicates('sku_id')\n",
" \n",
" return df_ac"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -1567,7 +1567,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -1578,7 +1578,7 @@
" df_ac.append(get_from_action_data(fname=ACTION_201604_FILE))\n",
" \n",
" df_ac = pd.concat(df_ac, ignore_index=True)\n",
- " df_ac = df_ac.groupby(['user_id'], as_index=False).sum()\n",
+ " df_ac = df_ac.groupby(['sku_id'], as_index=False).sum()\n",
"\n",
" df_ac['buy_addcart_ratio'] = df_ac['buy_num'] / df_ac['addcart_num']\n",
" df_ac['buy_browse_ratio'] = df_ac['buy_num'] / df_ac['browse_num']\n",
@@ -1595,45 +1595,1726 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {},
"outputs": [
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Iteration is stopped\n"
- ]
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sku_id | \n",
+ " a1 | \n",
+ " a2 | \n",
+ " a3 | \n",
+ " cate | \n",
+ " brand | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 8 | \n",
+ " 489 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 100002 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 8 | \n",
+ " 489 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 100003 | \n",
+ " 1 | \n",
+ " -1 | \n",
+ " -1 | \n",
+ " 8 | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 100006 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 8 | \n",
+ " 545 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 10001 | \n",
+ " -1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 8 | \n",
+ " 244 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sku_id a1 a2 a3 cate brand\n",
+ "0 10 3 1 1 8 489\n",
+ "1 100002 3 2 2 8 489\n",
+ "2 100003 1 -1 -1 8 30\n",
+ "3 100006 1 2 1 8 545\n",
+ "4 10001 -1 1 2 8 244"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
"item_base = get_from_jdata_product()\n",
- "item_behavior = merge_action_data()\n",
- "item_comment = get_from_jdata_comment()\n",
- "\n",
- "# SQL: left join\n",
- "item_behavior = pd.merge(item_base, item_behavior, on=['sku_id'], how='left')\n",
- "item_behavior = pd.merge(item_behavior, item_comment, on=['sku_id'], how='left')\n",
- " \n",
- "item_behavior.to_csv(ITEM_TABLE_FILE, index=False)"
+ "item_base.head()"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Iteration is stopped\n",
+ "Iteration is stopped\n",
+ "Iteration is stopped\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sku_id | \n",
+ " browse_num | \n",
+ " addcart_num | \n",
+ " delcart_num | \n",
+ " buy_num | \n",
+ " favor_num | \n",
+ " click_num | \n",
+ " buy_addcart_ratio | \n",
+ " buy_browse_ratio | \n",
+ " buy_click_ratio | \n",
+ " buy_favor_ratio | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 55 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 79 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 18 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 36 | \n",
+ " 107 | \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 186 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 37 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 10 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 40 | \n",
+ " 79 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 179 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sku_id browse_num addcart_num delcart_num buy_num favor_num \\\n",
+ "0 2 55 0 0 0 0 \n",
+ "1 18 2 0 0 0 0 \n",
+ "2 36 107 4 0 0 1 \n",
+ "3 37 5 0 0 0 0 \n",
+ "4 40 79 2 2 0 0 \n",
+ "\n",
+ " click_num buy_addcart_ratio buy_browse_ratio buy_click_ratio \\\n",
+ "0 79 NaN 0.0 0.0 \n",
+ "1 2 NaN 0.0 0.0 \n",
+ "2 186 0.0 0.0 0.0 \n",
+ "3 10 NaN 0.0 0.0 \n",
+ "4 179 0.0 0.0 0.0 \n",
+ "\n",
+ " buy_favor_ratio \n",
+ "0 NaN \n",
+ "1 NaN \n",
+ "2 0.0 \n",
+ "3 NaN \n",
+ "4 NaN "
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "item_table = pd.read_csv(ITEM_TABLE_FILE)\n",
- "item_table = haed()"
+ "item_behavior = merge_action_data()\n",
+ "item_behavior.head()"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sku_id | \n",
+ " comment_num | \n",
+ " has_bad_comment | \n",
+ " bad_comment_rate | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 512006 | \n",
+ " 1000 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0.0417 | \n",
+ "
\n",
+ " \n",
+ " 512007 | \n",
+ " 10000 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0.0000 | \n",
+ "
\n",
+ " \n",
+ " 512008 | \n",
+ " 100011 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 0.0376 | \n",
+ "
\n",
+ " \n",
+ " 512009 | \n",
+ " 100018 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 0.0000 | \n",
+ "
\n",
+ " \n",
+ " 512010 | \n",
+ " 100020 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 0.0000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sku_id comment_num has_bad_comment bad_comment_rate\n",
+ "512006 1000 3 1 0.0417\n",
+ "512007 10000 2 0 0.0000\n",
+ "512008 100011 4 1 0.0376\n",
+ "512009 100018 3 0 0.0000\n",
+ "512010 100020 3 0 0.0000"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "item_comment = get_from_jdata_comment()\n",
+ "item_comment.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# SQL: left join\n",
+ "item_behavior = pd.merge(item_base, item_behavior, on=['sku_id'], how='left')\n",
+ "item_behavior = pd.merge(item_behavior, item_comment, on=['sku_id'], how='left')\n",
+ " \n",
+ "item_behavior.to_csv(ITEM_TABLE_FILE, index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sku_id | \n",
+ " a1 | \n",
+ " a2 | \n",
+ " a3 | \n",
+ " cate | \n",
+ " brand | \n",
+ " browse_num | \n",
+ " addcart_num | \n",
+ " delcart_num | \n",
+ " buy_num | \n",
+ " favor_num | \n",
+ " click_num | \n",
+ " buy_addcart_ratio | \n",
+ " buy_browse_ratio | \n",
+ " buy_click_ratio | \n",
+ " buy_favor_ratio | \n",
+ " comment_num | \n",
+ " has_bad_comment | \n",
+ " bad_comment_rate | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 8 | \n",
+ " 489 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 100002 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 8 | \n",
+ " 489 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 100003 | \n",
+ " 1 | \n",
+ " -1 | \n",
+ " -1 | \n",
+ " 8 | \n",
+ " 30 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 100006 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 8 | \n",
+ " 545 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 10001 | \n",
+ " -1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 8 | \n",
+ " 244 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sku_id a1 a2 a3 cate brand browse_num addcart_num delcart_num \\\n",
+ "0 10 3 1 1 8 489 NaN NaN NaN \n",
+ "1 100002 3 2 2 8 489 NaN NaN NaN \n",
+ "2 100003 1 -1 -1 8 30 NaN NaN NaN \n",
+ "3 100006 1 2 1 8 545 NaN NaN NaN \n",
+ "4 10001 -1 1 2 8 244 NaN NaN NaN \n",
+ "\n",
+ " buy_num favor_num click_num buy_addcart_ratio buy_browse_ratio \\\n",
+ "0 NaN NaN NaN NaN NaN \n",
+ "1 NaN NaN NaN NaN NaN \n",
+ "2 NaN NaN NaN NaN NaN \n",
+ "3 NaN NaN NaN NaN NaN \n",
+ "4 NaN NaN NaN NaN NaN \n",
+ "\n",
+ " buy_click_ratio buy_favor_ratio comment_num has_bad_comment \\\n",
+ "0 NaN NaN NaN NaN \n",
+ "1 NaN NaN NaN NaN \n",
+ "2 NaN NaN NaN NaN \n",
+ "3 NaN NaN NaN NaN \n",
+ "4 NaN NaN NaN NaN \n",
+ "\n",
+ " bad_comment_rate \n",
+ "0 NaN \n",
+ "1 NaN \n",
+ "2 NaN \n",
+ "3 NaN \n",
+ "4 NaN "
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "item_table = pd.read_csv(ITEM_TABLE_FILE)\n",
+ "item_table.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 数据清洗\n",
+ "\n",
+ "用户清洗"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user_id | \n",
+ " age | \n",
+ " sex | \n",
+ " user_lv_cd | \n",
+ " browse_num | \n",
+ " addcart_num | \n",
+ " delcart_num | \n",
+ " buy_num | \n",
+ " favor_num | \n",
+ " click_num | \n",
+ " buy_addcart_ratio | \n",
+ " buy_browse_ratio | \n",
+ " buy_click_ratio | \n",
+ " buy_favor_ratio | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 105,321.000 | \n",
+ " 105,318.000 | \n",
+ " 105,318.000 | \n",
+ " 105,321.000 | \n",
+ " 105,180.000 | \n",
+ " 105,180.000 | \n",
+ " 105,180.000 | \n",
+ " 105,180.000 | \n",
+ " 105,180.000 | \n",
+ " 105,180.000 | \n",
+ " 72,129.000 | \n",
+ " 105,172.000 | \n",
+ " 103,197.000 | \n",
+ " 45,986.000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 252,661.000 | \n",
+ " 2.773 | \n",
+ " 1.113 | \n",
+ " 3.850 | \n",
+ " 180.466 | \n",
+ " 5.471 | \n",
+ " 2.434 | \n",
+ " 0.459 | \n",
+ " 1.045 | \n",
+ " 291.222 | \n",
+ " 0.147 | \n",
+ " 0.005 | \n",
+ " 0.009 | \n",
+ " 0.552 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 30,403.698 | \n",
+ " 1.672 | \n",
+ " 0.956 | \n",
+ " 1.072 | \n",
+ " 273.437 | \n",
+ " 10.618 | \n",
+ " 5.600 | \n",
+ " 1.048 | \n",
+ " 3.442 | \n",
+ " 460.031 | \n",
+ " 0.270 | \n",
+ " 0.022 | \n",
+ " 0.074 | \n",
+ " 0.473 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 200,001.000 | \n",
+ " -1.000 | \n",
+ " 0.000 | \n",
+ " 1.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 226,331.000 | \n",
+ " 3.000 | \n",
+ " 0.000 | \n",
+ " 3.000 | \n",
+ " 40.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 59.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 252,661.000 | \n",
+ " 3.000 | \n",
+ " 2.000 | \n",
+ " 4.000 | \n",
+ " 94.000 | \n",
+ " 2.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 148.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 1.000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 278,991.000 | \n",
+ " 4.000 | \n",
+ " 2.000 | \n",
+ " 5.000 | \n",
+ " 212.000 | \n",
+ " 6.000 | \n",
+ " 3.000 | \n",
+ " 1.000 | \n",
+ " 0.000 | \n",
+ " 342.000 | \n",
+ " 0.167 | \n",
+ " 0.002 | \n",
+ " 0.001 | \n",
+ " 1.000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 305,321.000 | \n",
+ " 6.000 | \n",
+ " 2.000 | \n",
+ " 5.000 | \n",
+ " 7,605.000 | \n",
+ " 369.000 | \n",
+ " 231.000 | \n",
+ " 50.000 | \n",
+ " 99.000 | \n",
+ " 15,302.000 | \n",
+ " 1.000 | \n",
+ " 1.000 | \n",
+ " 1.000 | \n",
+ " 1.000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " user_id age sex user_lv_cd browse_num \\\n",
+ "count 105,321.000 105,318.000 105,318.000 105,321.000 105,180.000 \n",
+ "mean 252,661.000 2.773 1.113 3.850 180.466 \n",
+ "std 30,403.698 1.672 0.956 1.072 273.437 \n",
+ "min 200,001.000 -1.000 0.000 1.000 0.000 \n",
+ "25% 226,331.000 3.000 0.000 3.000 40.000 \n",
+ "50% 252,661.000 3.000 2.000 4.000 94.000 \n",
+ "75% 278,991.000 4.000 2.000 5.000 212.000 \n",
+ "max 305,321.000 6.000 2.000 5.000 7,605.000 \n",
+ "\n",
+ " addcart_num delcart_num buy_num favor_num click_num \\\n",
+ "count 105,180.000 105,180.000 105,180.000 105,180.000 105,180.000 \n",
+ "mean 5.471 2.434 0.459 1.045 291.222 \n",
+ "std 10.618 5.600 1.048 3.442 460.031 \n",
+ "min 0.000 0.000 0.000 0.000 0.000 \n",
+ "25% 0.000 0.000 0.000 0.000 59.000 \n",
+ "50% 2.000 0.000 0.000 0.000 148.000 \n",
+ "75% 6.000 3.000 1.000 0.000 342.000 \n",
+ "max 369.000 231.000 50.000 99.000 15,302.000 \n",
+ "\n",
+ " buy_addcart_ratio buy_browse_ratio buy_click_ratio buy_favor_ratio \n",
+ "count 72,129.000 105,172.000 103,197.000 45,986.000 \n",
+ "mean 0.147 0.005 0.009 0.552 \n",
+ "std 0.270 0.022 0.074 0.473 \n",
+ "min 0.000 0.000 0.000 0.000 \n",
+ "25% 0.000 0.000 0.000 0.000 \n",
+ "50% 0.000 0.000 0.000 1.000 \n",
+ "75% 0.167 0.002 0.001 1.000 \n",
+ "max 1.000 1.000 1.000 1.000 "
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_user = pd.read_csv('data/User_table.csv',header=0)\n",
+ "pd.options.display.float_format = '{:,.3f}'.format #输出格式设置,保留三位小数\n",
+ "df_user.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "由上述统计信息可知:count 行中 user_id 共有 105321 条记录,而 age、sex 只有 105318 条,即有 3 个用户缺少 age、sex 字段\n",
+ "\n",
+ "浏览、加购、删购、购买等行为字段却只有 105180 条非空记录,说明有 141 个用户没有任何交互记录;上述两类用户都可以删除"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "删除没有age,sex字段的用户"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user_id | \n",
+ " age | \n",
+ " sex | \n",
+ " user_lv_cd | \n",
+ " browse_num | \n",
+ " addcart_num | \n",
+ " delcart_num | \n",
+ " buy_num | \n",
+ " favor_num | \n",
+ " click_num | \n",
+ " buy_addcart_ratio | \n",
+ " buy_browse_ratio | \n",
+ " buy_click_ratio | \n",
+ " buy_favor_ratio | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 34072 | \n",
+ " 234073 | \n",
+ " nan | \n",
+ " nan | \n",
+ " 1 | \n",
+ " 32.000 | \n",
+ " 6.000 | \n",
+ " 4.000 | \n",
+ " 1.000 | \n",
+ " 0.000 | \n",
+ " 41.000 | \n",
+ " 0.167 | \n",
+ " 0.031 | \n",
+ " 0.024 | \n",
+ " 1.000 | \n",
+ "
\n",
+ " \n",
+ " 38905 | \n",
+ " 238906 | \n",
+ " nan | \n",
+ " nan | \n",
+ " 1 | \n",
+ " 171.000 | \n",
+ " 3.000 | \n",
+ " 2.000 | \n",
+ " 2.000 | \n",
+ " 3.000 | \n",
+ " 464.000 | \n",
+ " 0.667 | \n",
+ " 0.012 | \n",
+ " 0.004 | \n",
+ " 0.667 | \n",
+ "
\n",
+ " \n",
+ " 67704 | \n",
+ " 267705 | \n",
+ " nan | \n",
+ " nan | \n",
+ " 1 | \n",
+ " 342.000 | \n",
+ " 18.000 | \n",
+ " 8.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 743.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " nan | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " user_id age sex user_lv_cd browse_num addcart_num delcart_num \\\n",
+ "34072 234073 nan nan 1 32.000 6.000 4.000 \n",
+ "38905 238906 nan nan 1 171.000 3.000 2.000 \n",
+ "67704 267705 nan nan 1 342.000 18.000 8.000 \n",
+ "\n",
+ " buy_num favor_num click_num buy_addcart_ratio buy_browse_ratio \\\n",
+ "34072 1.000 0.000 41.000 0.167 0.031 \n",
+ "38905 2.000 3.000 464.000 0.667 0.012 \n",
+ "67704 0.000 0.000 743.000 0.000 0.000 \n",
+ "\n",
+ " buy_click_ratio buy_favor_ratio \n",
+ "34072 0.024 1.000 \n",
+ "38905 0.004 0.667 \n",
+ "67704 0.000 nan "
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_user[df_user['age'].isnull()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user_id | \n",
+ " age | \n",
+ " sex | \n",
+ " user_lv_cd | \n",
+ " browse_num | \n",
+ " addcart_num | \n",
+ " delcart_num | \n",
+ " buy_num | \n",
+ " favor_num | \n",
+ " click_num | \n",
+ " buy_addcart_ratio | \n",
+ " buy_browse_ratio | \n",
+ " buy_click_ratio | \n",
+ " buy_favor_ratio | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 105,318.000 | \n",
+ " 105,318.000 | \n",
+ " 105,318.000 | \n",
+ " 105,318.000 | \n",
+ " 105,177.000 | \n",
+ " 105,177.000 | \n",
+ " 105,177.000 | \n",
+ " 105,177.000 | \n",
+ " 105,177.000 | \n",
+ " 105,177.000 | \n",
+ " 72,126.000 | \n",
+ " 105,169.000 | \n",
+ " 103,194.000 | \n",
+ " 45,984.000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 252,661.164 | \n",
+ " 2.773 | \n",
+ " 1.113 | \n",
+ " 3.850 | \n",
+ " 180.466 | \n",
+ " 5.471 | \n",
+ " 2.434 | \n",
+ " 0.459 | \n",
+ " 1.045 | \n",
+ " 291.219 | \n",
+ " 0.147 | \n",
+ " 0.005 | \n",
+ " 0.009 | \n",
+ " 0.552 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 30,404.012 | \n",
+ " 1.672 | \n",
+ " 0.956 | \n",
+ " 1.071 | \n",
+ " 273.440 | \n",
+ " 10.618 | \n",
+ " 5.600 | \n",
+ " 1.048 | \n",
+ " 3.442 | \n",
+ " 460.034 | \n",
+ " 0.270 | \n",
+ " 0.022 | \n",
+ " 0.074 | \n",
+ " 0.473 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 200,001.000 | \n",
+ " -1.000 | \n",
+ " 0.000 | \n",
+ " 1.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 226,330.250 | \n",
+ " 3.000 | \n",
+ " 0.000 | \n",
+ " 3.000 | \n",
+ " 40.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 59.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 252,661.500 | \n",
+ " 3.000 | \n",
+ " 2.000 | \n",
+ " 4.000 | \n",
+ " 94.000 | \n",
+ " 2.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 148.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 1.000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 278,991.750 | \n",
+ " 4.000 | \n",
+ " 2.000 | \n",
+ " 5.000 | \n",
+ " 212.000 | \n",
+ " 6.000 | \n",
+ " 3.000 | \n",
+ " 1.000 | \n",
+ " 0.000 | \n",
+ " 342.000 | \n",
+ " 0.167 | \n",
+ " 0.002 | \n",
+ " 0.001 | \n",
+ " 1.000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 305,321.000 | \n",
+ " 6.000 | \n",
+ " 2.000 | \n",
+ " 5.000 | \n",
+ " 7,605.000 | \n",
+ " 369.000 | \n",
+ " 231.000 | \n",
+ " 50.000 | \n",
+ " 99.000 | \n",
+ " 15,302.000 | \n",
+ " 1.000 | \n",
+ " 1.000 | \n",
+ " 1.000 | \n",
+ " 1.000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " user_id age sex user_lv_cd browse_num \\\n",
+ "count 105,318.000 105,318.000 105,318.000 105,318.000 105,177.000 \n",
+ "mean 252,661.164 2.773 1.113 3.850 180.466 \n",
+ "std 30,404.012 1.672 0.956 1.071 273.440 \n",
+ "min 200,001.000 -1.000 0.000 1.000 0.000 \n",
+ "25% 226,330.250 3.000 0.000 3.000 40.000 \n",
+ "50% 252,661.500 3.000 2.000 4.000 94.000 \n",
+ "75% 278,991.750 4.000 2.000 5.000 212.000 \n",
+ "max 305,321.000 6.000 2.000 5.000 7,605.000 \n",
+ "\n",
+ " addcart_num delcart_num buy_num favor_num click_num \\\n",
+ "count 105,177.000 105,177.000 105,177.000 105,177.000 105,177.000 \n",
+ "mean 5.471 2.434 0.459 1.045 291.219 \n",
+ "std 10.618 5.600 1.048 3.442 460.034 \n",
+ "min 0.000 0.000 0.000 0.000 0.000 \n",
+ "25% 0.000 0.000 0.000 0.000 59.000 \n",
+ "50% 2.000 0.000 0.000 0.000 148.000 \n",
+ "75% 6.000 3.000 1.000 0.000 342.000 \n",
+ "max 369.000 231.000 50.000 99.000 15,302.000 \n",
+ "\n",
+ " buy_addcart_ratio buy_browse_ratio buy_click_ratio buy_favor_ratio \n",
+ "count 72,126.000 105,169.000 103,194.000 45,984.000 \n",
+ "mean 0.147 0.005 0.009 0.552 \n",
+ "std 0.270 0.022 0.074 0.473 \n",
+ "min 0.000 0.000 0.000 0.000 \n",
+ "25% 0.000 0.000 0.000 0.000 \n",
+ "50% 0.000 0.000 0.000 1.000 \n",
+ "75% 0.167 0.002 0.001 1.000 \n",
+ "max 1.000 1.000 1.000 1.000 "
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "delete_list = df_user[df_user['age'].isnull()].index\n",
+ "df_user.drop(delete_list,axis=0,inplace=True)\n",
+ "df_user.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "删除无交互记录的用户"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "105177\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_naction = df_user[(df_user['browse_num'].isnull()) & (df_user['addcart_num'].isnull()) & (df_user['delcart_num'].isnull()) & (df_user['buy_num'].isnull()) & (df_user['favor_num'].isnull()) & (df_user['click_num'].isnull())]\n",
+ "df_user.drop(df_naction.index,axis=0,inplace=True)\n",
+ "print(len(df_user))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "统计并删除无购买记录的用户"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "75694\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_bzero = df_user[df_user['buy_num']==0]\n",
+ "# 输出购买数为0的总记录数\n",
+ "print(len(df_bzero))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user_id | \n",
+ " age | \n",
+ " sex | \n",
+ " user_lv_cd | \n",
+ " browse_num | \n",
+ " addcart_num | \n",
+ " delcart_num | \n",
+ " buy_num | \n",
+ " favor_num | \n",
+ " click_num | \n",
+ " buy_addcart_ratio | \n",
+ " buy_browse_ratio | \n",
+ " buy_click_ratio | \n",
+ " buy_favor_ratio | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 29,483.000 | \n",
+ " 29,483.000 | \n",
+ " 29,483.000 | \n",
+ " 29,483.000 | \n",
+ " 29,483.000 | \n",
+ " 29,483.000 | \n",
+ " 29,483.000 | \n",
+ " 29,483.000 | \n",
+ " 29,483.000 | \n",
+ " 29,483.000 | \n",
+ " 29,483.000 | \n",
+ " 29,483.000 | \n",
+ " 29,483.000 | \n",
+ " 29,483.000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 250,746.445 | \n",
+ " 2.914 | \n",
+ " 1.025 | \n",
+ " 4.272 | \n",
+ " 302.488 | \n",
+ " 10.525 | \n",
+ " 4.673 | \n",
+ " 1.637 | \n",
+ " 1.677 | \n",
+ " 486.653 | \n",
+ " 0.360 | \n",
+ " 0.018 | \n",
+ " 0.030 | \n",
+ " 0.862 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 29,979.676 | \n",
+ " 1.490 | \n",
+ " 0.959 | \n",
+ " 0.808 | \n",
+ " 391.535 | \n",
+ " 14.301 | \n",
+ " 7.568 | \n",
+ " 1.412 | \n",
+ " 4.584 | \n",
+ " 658.671 | \n",
+ " 0.320 | \n",
+ " 0.038 | \n",
+ " 0.136 | \n",
+ " 0.287 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 200,001.000 | \n",
+ " -1.000 | \n",
+ " 0.000 | \n",
+ " 2.000 | \n",
+ " 1.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 1.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.004 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.010 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 225,058.500 | \n",
+ " 3.000 | \n",
+ " 0.000 | \n",
+ " 4.000 | \n",
+ " 76.000 | \n",
+ " 3.000 | \n",
+ " 0.000 | \n",
+ " 1.000 | \n",
+ " 0.000 | \n",
+ " 116.000 | \n",
+ " 0.118 | \n",
+ " 0.004 | \n",
+ " 0.002 | \n",
+ " 1.000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 249,144.000 | \n",
+ " 3.000 | \n",
+ " 1.000 | \n",
+ " 4.000 | \n",
+ " 178.000 | \n",
+ " 6.000 | \n",
+ " 2.000 | \n",
+ " 1.000 | \n",
+ " 0.000 | \n",
+ " 282.000 | \n",
+ " 0.250 | \n",
+ " 0.008 | \n",
+ " 0.005 | \n",
+ " 1.000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 276,252.500 | \n",
+ " 4.000 | \n",
+ " 2.000 | \n",
+ " 5.000 | \n",
+ " 381.000 | \n",
+ " 13.000 | \n",
+ " 6.000 | \n",
+ " 2.000 | \n",
+ " 1.000 | \n",
+ " 604.000 | \n",
+ " 0.500 | \n",
+ " 0.018 | \n",
+ " 0.012 | \n",
+ " 1.000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 305,318.000 | \n",
+ " 6.000 | \n",
+ " 2.000 | \n",
+ " 5.000 | \n",
+ " 7,605.000 | \n",
+ " 288.000 | \n",
+ " 178.000 | \n",
+ " 50.000 | \n",
+ " 96.000 | \n",
+ " 15,302.000 | \n",
+ " 1.000 | \n",
+ " 1.000 | \n",
+ " 1.000 | \n",
+ " 1.000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " user_id age sex user_lv_cd browse_num addcart_num \\\n",
+ "count 29,483.000 29,483.000 29,483.000 29,483.000 29,483.000 29,483.000 \n",
+ "mean 250,746.445 2.914 1.025 4.272 302.488 10.525 \n",
+ "std 29,979.676 1.490 0.959 0.808 391.535 14.301 \n",
+ "min 200,001.000 -1.000 0.000 2.000 1.000 0.000 \n",
+ "25% 225,058.500 3.000 0.000 4.000 76.000 3.000 \n",
+ "50% 249,144.000 3.000 1.000 4.000 178.000 6.000 \n",
+ "75% 276,252.500 4.000 2.000 5.000 381.000 13.000 \n",
+ "max 305,318.000 6.000 2.000 5.000 7,605.000 288.000 \n",
+ "\n",
+ " delcart_num buy_num favor_num click_num buy_addcart_ratio \\\n",
+ "count 29,483.000 29,483.000 29,483.000 29,483.000 29,483.000 \n",
+ "mean 4.673 1.637 1.677 486.653 0.360 \n",
+ "std 7.568 1.412 4.584 658.671 0.320 \n",
+ "min 0.000 1.000 0.000 0.000 0.004 \n",
+ "25% 0.000 1.000 0.000 116.000 0.118 \n",
+ "50% 2.000 1.000 0.000 282.000 0.250 \n",
+ "75% 6.000 2.000 1.000 604.000 0.500 \n",
+ "max 178.000 50.000 96.000 15,302.000 1.000 \n",
+ "\n",
+ " buy_browse_ratio buy_click_ratio buy_favor_ratio \n",
+ "count 29,483.000 29,483.000 29,483.000 \n",
+ "mean 0.018 0.030 0.862 \n",
+ "std 0.038 0.136 0.287 \n",
+ "min 0.000 0.000 0.010 \n",
+ "25% 0.004 0.002 1.000 \n",
+ "50% 0.008 0.005 1.000 \n",
+ "75% 0.018 0.012 1.000 \n",
+ "max 1.000 1.000 1.000 "
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_user = df_user[df_user['buy_num']!=0] # 只要有购买记录的\n",
+ "df_user.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "删除爬虫及惰性用户\n",
+ "\n",
+ "由上表可知,浏览购买转换比和点击购买转换比的均值分别为 0.018 和 0.030,因此这里将浏览购买转换比或点击购买转换比小于 0.0005 的用户视为惰性用户并删除"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "90\n"
+ ]
+ }
+ ],
+ "source": [
+ "bindex = df_user[df_user['buy_browse_ratio']<0.0005].index\n",
+ "print (len(bindex))\n",
+ "df_user.drop(bindex,axis=0,inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "323\n"
+ ]
+ }
+ ],
+ "source": [
+ "cindex = df_user[df_user['buy_click_ratio']<0.0005].index\n",
+ "print (len(cindex))\n",
+ "df_user.drop(cindex,axis=0,inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user_id | \n",
+ " age | \n",
+ " sex | \n",
+ " user_lv_cd | \n",
+ " browse_num | \n",
+ " addcart_num | \n",
+ " delcart_num | \n",
+ " buy_num | \n",
+ " favor_num | \n",
+ " click_num | \n",
+ " buy_addcart_ratio | \n",
+ " buy_browse_ratio | \n",
+ " buy_click_ratio | \n",
+ " buy_favor_ratio | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 29,070.000 | \n",
+ " 29,070.000 | \n",
+ " 29,070.000 | \n",
+ " 29,070.000 | \n",
+ " 29,070.000 | \n",
+ " 29,070.000 | \n",
+ " 29,070.000 | \n",
+ " 29,070.000 | \n",
+ " 29,070.000 | \n",
+ " 29,070.000 | \n",
+ " 29,070.000 | \n",
+ " 29,070.000 | \n",
+ " 29,070.000 | \n",
+ " 29,070.000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 250,767.099 | \n",
+ " 2.910 | \n",
+ " 1.028 | \n",
+ " 4.268 | \n",
+ " 280.260 | \n",
+ " 10.145 | \n",
+ " 4.457 | \n",
+ " 1.644 | \n",
+ " 1.589 | \n",
+ " 447.113 | \n",
+ " 0.364 | \n",
+ " 0.019 | \n",
+ " 0.031 | \n",
+ " 0.866 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 29,998.870 | \n",
+ " 1.492 | \n",
+ " 0.959 | \n",
+ " 0.809 | \n",
+ " 325.129 | \n",
+ " 13.443 | \n",
+ " 6.998 | \n",
+ " 1.420 | \n",
+ " 4.294 | \n",
+ " 530.994 | \n",
+ " 0.320 | \n",
+ " 0.038 | \n",
+ " 0.137 | \n",
+ " 0.282 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 200,001.000 | \n",
+ " -1.000 | \n",
+ " 0.000 | \n",
+ " 2.000 | \n",
+ " 1.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 1.000 | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ " 0.004 | \n",
+ " 0.001 | \n",
+ " 0.001 | \n",
+ " 0.018 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 225,036.000 | \n",
+ " 3.000 | \n",
+ " 0.000 | \n",
+ " 4.000 | \n",
+ " 75.000 | \n",
+ " 3.000 | \n",
+ " 0.000 | \n",
+ " 1.000 | \n",
+ " 0.000 | \n",
+ " 114.000 | \n",
+ " 0.125 | \n",
+ " 0.004 | \n",
+ " 0.002 | \n",
+ " 1.000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 249,200.500 | \n",
+ " 3.000 | \n",
+ " 1.000 | \n",
+ " 4.000 | \n",
+ " 174.000 | \n",
+ " 6.000 | \n",
+ " 2.000 | \n",
+ " 1.000 | \n",
+ " 0.000 | \n",
+ " 275.000 | \n",
+ " 0.250 | \n",
+ " 0.008 | \n",
+ " 0.005 | \n",
+ " 1.000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 276,284.000 | \n",
+ " 4.000 | \n",
+ " 2.000 | \n",
+ " 5.000 | \n",
+ " 366.000 | \n",
+ " 13.000 | \n",
+ " 6.000 | \n",
+ " 2.000 | \n",
+ " 1.000 | \n",
+ " 585.000 | \n",
+ " 0.500 | \n",
+ " 0.018 | \n",
+ " 0.012 | \n",
+ " 1.000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 305,318.000 | \n",
+ " 6.000 | \n",
+ " 2.000 | \n",
+ " 5.000 | \n",
+ " 5,007.000 | \n",
+ " 288.000 | \n",
+ " 158.000 | \n",
+ " 50.000 | \n",
+ " 69.000 | \n",
+ " 8,156.000 | \n",
+ " 1.000 | \n",
+ " 1.000 | \n",
+ " 1.000 | \n",
+ " 1.000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " user_id age sex user_lv_cd browse_num addcart_num \\\n",
+ "count 29,070.000 29,070.000 29,070.000 29,070.000 29,070.000 29,070.000 \n",
+ "mean 250,767.099 2.910 1.028 4.268 280.260 10.145 \n",
+ "std 29,998.870 1.492 0.959 0.809 325.129 13.443 \n",
+ "min 200,001.000 -1.000 0.000 2.000 1.000 0.000 \n",
+ "25% 225,036.000 3.000 0.000 4.000 75.000 3.000 \n",
+ "50% 249,200.500 3.000 1.000 4.000 174.000 6.000 \n",
+ "75% 276,284.000 4.000 2.000 5.000 366.000 13.000 \n",
+ "max 305,318.000 6.000 2.000 5.000 5,007.000 288.000 \n",
+ "\n",
+ " delcart_num buy_num favor_num click_num buy_addcart_ratio \\\n",
+ "count 29,070.000 29,070.000 29,070.000 29,070.000 29,070.000 \n",
+ "mean 4.457 1.644 1.589 447.113 0.364 \n",
+ "std 6.998 1.420 4.294 530.994 0.320 \n",
+ "min 0.000 1.000 0.000 0.000 0.004 \n",
+ "25% 0.000 1.000 0.000 114.000 0.125 \n",
+ "50% 2.000 1.000 0.000 275.000 0.250 \n",
+ "75% 6.000 2.000 1.000 585.000 0.500 \n",
+ "max 158.000 50.000 69.000 8,156.000 1.000 \n",
+ "\n",
+ " buy_browse_ratio buy_click_ratio buy_favor_ratio \n",
+ "count 29,070.000 29,070.000 29,070.000 \n",
+ "mean 0.019 0.031 0.866 \n",
+ "std 0.038 0.137 0.282 \n",
+ "min 0.001 0.001 0.018 \n",
+ "25% 0.004 0.002 1.000 \n",
+ "50% 0.008 0.005 1.000 \n",
+ "75% 0.018 0.012 1.000 \n",
+ "max 1.000 1.000 1.000 "
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_user.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_user.to_csv(\"data/JData_FUser.csv\", index=None)"
+ ]
}
],
"metadata": {