diff --git a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/数据清洗.ipynb b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/数据清洗.ipynb index fc69279..2224c2f 100644 --- a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/数据清洗.ipynb +++ b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/数据清洗.ipynb @@ -1488,7 +1488,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -1500,7 +1500,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -1516,14 +1516,14 @@ " group['favor_num'] = type_cnt[5]\n", " group['click_num'] = type_cnt[6]\n", " \n", - " return group[['user_id', 'browse_num', 'addcart_num',\n", + " return group[['sku_id', 'browse_num', 'addcart_num',\n", " 'delcart_num', 'buy_num', 'favor_num',\n", " 'click_num']]" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -1534,22 +1534,22 @@ " loop = True\n", " while loop:\n", " try:\n", - " chunk = reader.get_chunk(chunk_size)[[\"user_id\", \"type\"]]\n", + " chunk = reader.get_chunk(chunk_size)[[\"sku_id\", \"type\"]]\n", " chunks.append(chunk)\n", " except StopIteration:\n", " loop = False\n", " print(\"Iteration is stopped\")\n", " \n", " df_ac = pd.concat(chunks, ignore_index=True)\n", - " df_ac = df_ac.groupby(['user_id'], as_index=False).apply(add_type_count)\n", - " df_ac = df_ac.drop_duplicates('user_id')\n", + " df_ac = df_ac.groupby(['sku_id'], as_index=False).apply(add_type_count)\n", + " df_ac = df_ac.drop_duplicates('sku_id')\n", " \n", " return df_ac" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -1567,7 +1567,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -1578,7 +1578,7 @@ " df_ac.append(get_from_action_data(fname=ACTION_201604_FILE))\n", " \n", " df_ac = pd.concat(df_ac, ignore_index=True)\n", - " df_ac = df_ac.groupby(['user_id'], as_index=False).sum()\n", + " df_ac = df_ac.groupby(['sku_id'], as_index=False).sum()\n", "\n", " df_ac['buy_addcart_ratio'] = df_ac['buy_num'] / df_ac['addcart_num']\n", " df_ac['buy_browse_ratio'] = df_ac['buy_num'] / df_ac['browse_num']\n", @@ -1595,45 +1595,1726 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration is stopped\n" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sku_ida1a2a3catebrand
0103118489
11000023228489
21000031-1-1830
31000061218545
410001-1128244
\n", + "
" + ], + "text/plain": [ + " sku_id a1 a2 a3 cate brand\n", + "0 10 3 1 1 8 489\n", + "1 100002 3 2 2 8 489\n", + "2 100003 1 -1 -1 8 30\n", + "3 100006 1 2 1 8 545\n", + "4 10001 -1 1 2 8 244" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "item_base = get_from_jdata_product()\n", - "item_behavior = merge_action_data()\n", - "item_comment = get_from_jdata_comment()\n", - "\n", - "# SQL: left join\n", - "item_behavior = pd.merge(item_base, item_behavior, on=['sku_id'], how='left')\n", - "item_behavior = pd.merge(item_behavior, item_comment, on=['sku_id'], how='left')\n", - " \n", - "item_behavior.to_csv(ITEM_TABLE_FILE, index=False)" + "item_base.head()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Iteration is stopped\n", + "Iteration is stopped\n", + "Iteration is stopped\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sku_idbrowse_numaddcart_numdelcart_numbuy_numfavor_numclick_numbuy_addcart_ratiobuy_browse_ratiobuy_click_ratiobuy_favor_ratio
0255000079NaN0.00.0NaN
118200002NaN0.00.0NaN
23610740011860.00.00.00.0
3375000010NaN0.00.0NaN
4407922001790.00.00.0NaN
\n", + "
" + ], + "text/plain": [ + " sku_id browse_num addcart_num delcart_num buy_num favor_num \\\n", + "0 2 55 0 0 0 0 \n", + "1 18 2 0 0 0 0 \n", + "2 36 107 4 0 0 1 \n", + "3 37 5 0 0 0 0 \n", + "4 40 79 2 2 0 0 \n", + "\n", + " click_num buy_addcart_ratio buy_browse_ratio buy_click_ratio \\\n", + "0 79 NaN 0.0 0.0 \n", + "1 2 NaN 0.0 0.0 \n", + "2 186 0.0 0.0 0.0 \n", + "3 10 NaN 0.0 0.0 \n", + "4 179 0.0 0.0 0.0 \n", + "\n", + " buy_favor_ratio \n", + "0 NaN \n", + "1 NaN \n", + "2 0.0 \n", + "3 NaN \n", + "4 NaN " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "item_table = pd.read_csv(ITEM_TABLE_FILE)\n", - "item_table = haed()" + "item_behavior = merge_action_data()\n", + "item_behavior.head()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sku_idcomment_numhas_bad_commentbad_comment_rate
5120061000310.0417
51200710000200.0000
512008100011410.0376
512009100018300.0000
512010100020300.0000
\n", + "
" + ], + "text/plain": [ + " sku_id comment_num has_bad_comment bad_comment_rate\n", + "512006 1000 3 1 0.0417\n", + "512007 10000 2 0 0.0000\n", + "512008 100011 4 1 0.0376\n", + "512009 100018 3 0 0.0000\n", + "512010 100020 3 0 0.0000" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item_comment = get_from_jdata_comment()\n", + "item_comment.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# SQL: left join\n", + "item_behavior = pd.merge(item_base, item_behavior, on=['sku_id'], how='left')\n", + "item_behavior = pd.merge(item_behavior, item_comment, on=['sku_id'], how='left')\n", + " \n", + "item_behavior.to_csv(ITEM_TABLE_FILE, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sku_ida1a2a3catebrandbrowse_numaddcart_numdelcart_numbuy_numfavor_numclick_numbuy_addcart_ratiobuy_browse_ratiobuy_click_ratiobuy_favor_ratiocomment_numhas_bad_commentbad_comment_rate
0103118489NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
11000023228489NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
21000031-1-1830NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
31000061218545NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
410001-1128244NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " sku_id a1 a2 a3 cate brand browse_num addcart_num delcart_num \\\n", + "0 10 3 1 1 8 489 NaN NaN NaN \n", + "1 100002 3 2 2 8 489 NaN NaN NaN \n", + "2 100003 1 -1 -1 8 30 NaN NaN NaN \n", + "3 100006 1 2 1 8 545 NaN NaN NaN \n", + "4 10001 -1 1 2 8 244 NaN NaN NaN \n", + "\n", + " buy_num favor_num click_num buy_addcart_ratio buy_browse_ratio \\\n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN NaN \n", + "\n", + " buy_click_ratio buy_favor_ratio comment_num has_bad_comment \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " bad_comment_rate \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item_table = pd.read_csv(ITEM_TABLE_FILE)\n", + "item_table.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 数据清洗\n", + "\n", + "用户清洗" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idagesexuser_lv_cdbrowse_numaddcart_numdelcart_numbuy_numfavor_numclick_numbuy_addcart_ratiobuy_browse_ratiobuy_click_ratiobuy_favor_ratio
count105,321.000105,318.000105,318.000105,321.000105,180.000105,180.000105,180.000105,180.000105,180.000105,180.00072,129.000105,172.000103,197.00045,986.000
mean252,661.0002.7731.1133.850180.4665.4712.4340.4591.045291.2220.1470.0050.0090.552
std30,403.6981.6720.9561.072273.43710.6185.6001.0483.442460.0310.2700.0220.0740.473
min200,001.000-1.0000.0001.0000.0000.0000.0000.0000.0000.0000.0000.0000.0000.000
25%226,331.0003.0000.0003.00040.0000.0000.0000.0000.00059.0000.0000.0000.0000.000
50%252,661.0003.0002.0004.00094.0002.0000.0000.0000.000148.0000.0000.0000.0001.000
75%278,991.0004.0002.0005.000212.0006.0003.0001.0000.000342.0000.1670.0020.0011.000
max305,321.0006.0002.0005.0007,605.000369.000231.00050.00099.00015,302.0001.0001.0001.0001.000
\n", + "
" + ], + "text/plain": [ + " user_id age sex user_lv_cd browse_num \\\n", + "count 105,321.000 105,318.000 105,318.000 105,321.000 105,180.000 \n", + "mean 252,661.000 2.773 1.113 3.850 180.466 \n", + "std 30,403.698 1.672 0.956 1.072 273.437 \n", + "min 200,001.000 -1.000 0.000 1.000 0.000 \n", + "25% 226,331.000 3.000 0.000 3.000 40.000 \n", + "50% 252,661.000 3.000 2.000 4.000 94.000 \n", + "75% 278,991.000 4.000 2.000 5.000 212.000 \n", + "max 305,321.000 6.000 2.000 5.000 7,605.000 \n", + "\n", + " addcart_num delcart_num buy_num favor_num click_num \\\n", + "count 105,180.000 105,180.000 105,180.000 105,180.000 105,180.000 \n", + "mean 5.471 2.434 0.459 1.045 291.222 \n", + "std 10.618 5.600 1.048 3.442 460.031 \n", + "min 0.000 0.000 0.000 0.000 0.000 \n", + "25% 0.000 0.000 0.000 0.000 59.000 \n", + "50% 2.000 0.000 0.000 0.000 148.000 \n", + "75% 6.000 3.000 1.000 0.000 342.000 \n", + "max 369.000 231.000 50.000 99.000 15,302.000 \n", + "\n", + " buy_addcart_ratio buy_browse_ratio buy_click_ratio buy_favor_ratio \n", + "count 72,129.000 105,172.000 103,197.000 45,986.000 \n", + "mean 0.147 0.005 0.009 0.552 \n", + "std 0.270 0.022 0.074 0.473 \n", + "min 0.000 0.000 0.000 0.000 \n", + "25% 0.000 0.000 0.000 0.000 \n", + "50% 0.000 0.000 0.000 1.000 \n", + "75% 0.167 0.002 0.001 1.000 \n", + "max 1.000 1.000 1.000 1.000 " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_user = pd.read_csv('data/User_table.csv',header=0)\n", + "pd.options.display.float_format = '{:,.3f}'.format #输出格式设置,保留三位小数\n", + "df_user.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "由上述统计信息发现: 第一行中根据User_id统计发现有105321个用户,发现有3个用户没有age,sex字段\n", + "\n", + "根据浏览、加购、删购、购买等记录却只有105180条记录,说明存在用户无任何交互记录,因此可以删除上述用户" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "删除没有age,sex字段的用户" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idagesexuser_lv_cdbrowse_numaddcart_numdelcart_numbuy_numfavor_numclick_numbuy_addcart_ratiobuy_browse_ratiobuy_click_ratiobuy_favor_ratio
34072234073nannan132.0006.0004.0001.0000.00041.0000.1670.0310.0241.000
38905238906nannan1171.0003.0002.0002.0003.000464.0000.6670.0120.0040.667
67704267705nannan1342.00018.0008.0000.0000.000743.0000.0000.0000.000nan
\n", + "
" + ], + "text/plain": [ + " user_id age sex user_lv_cd browse_num addcart_num delcart_num \\\n", + "34072 234073 nan nan 1 32.000 6.000 4.000 \n", + "38905 238906 nan nan 1 171.000 3.000 2.000 \n", + "67704 267705 nan nan 1 342.000 18.000 8.000 \n", + "\n", + " buy_num favor_num click_num buy_addcart_ratio buy_browse_ratio \\\n", + "34072 1.000 0.000 41.000 0.167 0.031 \n", + "38905 2.000 3.000 464.000 0.667 0.012 \n", + "67704 0.000 0.000 743.000 0.000 0.000 \n", + "\n", + " buy_click_ratio buy_favor_ratio \n", + "34072 0.024 1.000 \n", + "38905 0.004 0.667 \n", + "67704 0.000 nan " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_user[df_user['age'].isnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idagesexuser_lv_cdbrowse_numaddcart_numdelcart_numbuy_numfavor_numclick_numbuy_addcart_ratiobuy_browse_ratiobuy_click_ratiobuy_favor_ratio
count105,318.000105,318.000105,318.000105,318.000105,177.000105,177.000105,177.000105,177.000105,177.000105,177.00072,126.000105,169.000103,194.00045,984.000
mean252,661.1642.7731.1133.850180.4665.4712.4340.4591.045291.2190.1470.0050.0090.552
std30,404.0121.6720.9561.071273.44010.6185.6001.0483.442460.0340.2700.0220.0740.473
min200,001.000-1.0000.0001.0000.0000.0000.0000.0000.0000.0000.0000.0000.0000.000
25%226,330.2503.0000.0003.00040.0000.0000.0000.0000.00059.0000.0000.0000.0000.000
50%252,661.5003.0002.0004.00094.0002.0000.0000.0000.000148.0000.0000.0000.0001.000
75%278,991.7504.0002.0005.000212.0006.0003.0001.0000.000342.0000.1670.0020.0011.000
max305,321.0006.0002.0005.0007,605.000369.000231.00050.00099.00015,302.0001.0001.0001.0001.000
\n", + "
" + ], + "text/plain": [ + " user_id age sex user_lv_cd browse_num \\\n", + "count 105,318.000 105,318.000 105,318.000 105,318.000 105,177.000 \n", + "mean 252,661.164 2.773 1.113 3.850 180.466 \n", + "std 30,404.012 1.672 0.956 1.071 273.440 \n", + "min 200,001.000 -1.000 0.000 1.000 0.000 \n", + "25% 226,330.250 3.000 0.000 3.000 40.000 \n", + "50% 252,661.500 3.000 2.000 4.000 94.000 \n", + "75% 278,991.750 4.000 2.000 5.000 212.000 \n", + "max 305,321.000 6.000 2.000 5.000 7,605.000 \n", + "\n", + " addcart_num delcart_num buy_num favor_num click_num \\\n", + "count 105,177.000 105,177.000 105,177.000 105,177.000 105,177.000 \n", + "mean 5.471 2.434 0.459 1.045 291.219 \n", + "std 10.618 5.600 1.048 3.442 460.034 \n", + "min 0.000 0.000 0.000 0.000 0.000 \n", + "25% 0.000 0.000 0.000 0.000 59.000 \n", + "50% 2.000 0.000 0.000 0.000 148.000 \n", + "75% 6.000 3.000 1.000 0.000 342.000 \n", + "max 369.000 231.000 50.000 99.000 15,302.000 \n", + "\n", + " buy_addcart_ratio buy_browse_ratio buy_click_ratio buy_favor_ratio \n", + "count 72,126.000 105,169.000 103,194.000 45,984.000 \n", + "mean 0.147 0.005 0.009 0.552 \n", + "std 0.270 0.022 0.074 0.473 \n", + "min 0.000 0.000 0.000 0.000 \n", + "25% 0.000 0.000 0.000 0.000 \n", + "50% 0.000 0.000 0.000 1.000 \n", + "75% 0.167 0.002 0.001 1.000 \n", + "max 1.000 1.000 1.000 1.000 " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "delete_list = df_user[df_user['age'].isnull()].index\n", + "df_user.drop(delete_list,axis=0,inplace=True)\n", + "df_user.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "删除无交互记录的用户" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "105177\n" + ] + } + ], + "source": [ + "df_naction = df_user[(df_user['browse_num'].isnull()) & (df_user['addcart_num'].isnull()) & (df_user['delcart_num'].isnull()) & (df_user['buy_num'].isnull()) & (df_user['favor_num'].isnull()) & (df_user['click_num'].isnull())]\n", + "df_user.drop(df_naction.index,axis=0,inplace=True)\n", + "print(len(df_user))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "统计并删除无购买记录的用户" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "75694\n" + ] + } + ], + "source": [ + "df_bzero = df_user[df_user['buy_num']==0]\n", + "# 输出购买数为0的总记录数\n", + "print(len(df_bzero))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idagesexuser_lv_cdbrowse_numaddcart_numdelcart_numbuy_numfavor_numclick_numbuy_addcart_ratiobuy_browse_ratiobuy_click_ratiobuy_favor_ratio
count29,483.00029,483.00029,483.00029,483.00029,483.00029,483.00029,483.00029,483.00029,483.00029,483.00029,483.00029,483.00029,483.00029,483.000
mean250,746.4452.9141.0254.272302.48810.5254.6731.6371.677486.6530.3600.0180.0300.862
std29,979.6761.4900.9590.808391.53514.3017.5681.4124.584658.6710.3200.0380.1360.287
min200,001.000-1.0000.0002.0001.0000.0000.0001.0000.0000.0000.0040.0000.0000.010
25%225,058.5003.0000.0004.00076.0003.0000.0001.0000.000116.0000.1180.0040.0021.000
50%249,144.0003.0001.0004.000178.0006.0002.0001.0000.000282.0000.2500.0080.0051.000
75%276,252.5004.0002.0005.000381.00013.0006.0002.0001.000604.0000.5000.0180.0121.000
max305,318.0006.0002.0005.0007,605.000288.000178.00050.00096.00015,302.0001.0001.0001.0001.000
\n", + "
" + ], + "text/plain": [ + " user_id age sex user_lv_cd browse_num addcart_num \\\n", + "count 29,483.000 29,483.000 29,483.000 29,483.000 29,483.000 29,483.000 \n", + "mean 250,746.445 2.914 1.025 4.272 302.488 10.525 \n", + "std 29,979.676 1.490 0.959 0.808 391.535 14.301 \n", + "min 200,001.000 -1.000 0.000 2.000 1.000 0.000 \n", + "25% 225,058.500 3.000 0.000 4.000 76.000 3.000 \n", + "50% 249,144.000 3.000 1.000 4.000 178.000 6.000 \n", + "75% 276,252.500 4.000 2.000 5.000 381.000 13.000 \n", + "max 305,318.000 6.000 2.000 5.000 7,605.000 288.000 \n", + "\n", + " delcart_num buy_num favor_num click_num buy_addcart_ratio \\\n", + "count 29,483.000 29,483.000 29,483.000 29,483.000 29,483.000 \n", + "mean 4.673 1.637 1.677 486.653 0.360 \n", + "std 7.568 1.412 4.584 658.671 0.320 \n", + "min 0.000 1.000 0.000 0.000 0.004 \n", + "25% 0.000 1.000 0.000 116.000 0.118 \n", + "50% 2.000 1.000 0.000 282.000 0.250 \n", + "75% 6.000 2.000 1.000 604.000 0.500 \n", + "max 178.000 50.000 96.000 15,302.000 1.000 \n", + "\n", + " buy_browse_ratio buy_click_ratio buy_favor_ratio \n", + "count 29,483.000 29,483.000 29,483.000 \n", + "mean 0.018 0.030 0.862 \n", + "std 0.038 0.136 0.287 \n", + "min 0.000 0.000 0.010 \n", + "25% 0.004 0.002 1.000 \n", + "50% 0.008 0.005 1.000 \n", + "75% 0.018 0.012 1.000 \n", + "max 1.000 1.000 1.000 " + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_user = df_user[df_user['buy_num']!=0] # 只要有购买记录的\n", + "df_user.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "删除爬虫及惰性用户\n", + "\n", + "由上表所知,浏览购买转换比和点击购买转换比均值为0.018,0.030,因此这里认为浏览购买转换比和点击购买转换比小于0.0005的用户为惰性用户" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "90\n" + ] + } + ], + "source": [ + "bindex = df_user[df_user['buy_browse_ratio']<0.0005].index\n", + "print (len(bindex))\n", + "df_user.drop(bindex,axis=0,inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "323\n" + ] + } + ], + "source": [ + "cindex = df_user[df_user['buy_click_ratio']<0.0005].index\n", + "print (len(cindex))\n", + "df_user.drop(cindex,axis=0,inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idagesexuser_lv_cdbrowse_numaddcart_numdelcart_numbuy_numfavor_numclick_numbuy_addcart_ratiobuy_browse_ratiobuy_click_ratiobuy_favor_ratio
count29,070.00029,070.00029,070.00029,070.00029,070.00029,070.00029,070.00029,070.00029,070.00029,070.00029,070.00029,070.00029,070.00029,070.000
mean250,767.0992.9101.0284.268280.26010.1454.4571.6441.589447.1130.3640.0190.0310.866
std29,998.8701.4920.9590.809325.12913.4436.9981.4204.294530.9940.3200.0380.1370.282
min200,001.000-1.0000.0002.0001.0000.0000.0001.0000.0000.0000.0040.0010.0010.018
25%225,036.0003.0000.0004.00075.0003.0000.0001.0000.000114.0000.1250.0040.0021.000
50%249,200.5003.0001.0004.000174.0006.0002.0001.0000.000275.0000.2500.0080.0051.000
75%276,284.0004.0002.0005.000366.00013.0006.0002.0001.000585.0000.5000.0180.0121.000
max305,318.0006.0002.0005.0005,007.000288.000158.00050.00069.0008,156.0001.0001.0001.0001.000
\n", + "
" + ], + "text/plain": [ + " user_id age sex user_lv_cd browse_num addcart_num \\\n", + "count 29,070.000 29,070.000 29,070.000 29,070.000 29,070.000 29,070.000 \n", + "mean 250,767.099 2.910 1.028 4.268 280.260 10.145 \n", + "std 29,998.870 1.492 0.959 0.809 325.129 13.443 \n", + "min 200,001.000 -1.000 0.000 2.000 1.000 0.000 \n", + "25% 225,036.000 3.000 0.000 4.000 75.000 3.000 \n", + "50% 249,200.500 3.000 1.000 4.000 174.000 6.000 \n", + "75% 276,284.000 4.000 2.000 5.000 366.000 13.000 \n", + "max 305,318.000 6.000 2.000 5.000 5,007.000 288.000 \n", + "\n", + " delcart_num buy_num favor_num click_num buy_addcart_ratio \\\n", + "count 29,070.000 29,070.000 29,070.000 29,070.000 29,070.000 \n", + "mean 4.457 1.644 1.589 447.113 0.364 \n", + "std 6.998 1.420 4.294 530.994 0.320 \n", + "min 0.000 1.000 0.000 0.000 0.004 \n", + "25% 0.000 1.000 0.000 114.000 0.125 \n", + "50% 2.000 1.000 0.000 275.000 0.250 \n", + "75% 6.000 2.000 1.000 585.000 0.500 \n", + "max 158.000 50.000 69.000 8,156.000 1.000 \n", + "\n", + " buy_browse_ratio buy_click_ratio buy_favor_ratio \n", + "count 29,070.000 29,070.000 29,070.000 \n", + "mean 0.019 0.031 0.866 \n", + "std 0.038 0.137 0.282 \n", + "min 0.001 0.001 0.018 \n", + "25% 0.004 0.002 1.000 \n", + "50% 0.008 0.005 1.000 \n", + "75% 0.018 0.012 1.000 \n", + "max 1.000 1.000 1.000 " + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_user.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "df_user.to_csv(\"data/JData_FUser.csv\", index=None)" + ] } ], "metadata": {