|
|
@ -862,7 +862,7 @@
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"source": [
|
|
|
|
"### user_table\n",
|
|
|
|
"### user_table特征包括:\n",
|
|
|
|
"* user_table特征包括:\n",
|
|
|
|
"* user_table特征包括:\n",
|
|
|
|
"* user_id(用户id),age(年龄),sex(性别),\n",
|
|
|
|
"* user_id(用户id),age(年龄),sex(性别),\n",
|
|
|
|
"* user_lv_cd(用户级别),browse_num(浏览数),\n",
|
|
|
|
"* user_lv_cd(用户级别),browse_num(浏览数),\n",
|
|
|
@ -873,7 +873,7 @@
|
|
|
|
"* buy_click_ratio(购买点击转化率),\n",
|
|
|
|
"* buy_click_ratio(购买点击转化率),\n",
|
|
|
|
"* buy_favor_ratio(购买收藏转化率)\n",
|
|
|
|
"* buy_favor_ratio(购买收藏转化率)\n",
|
|
|
|
"\n",
|
|
|
|
"\n",
|
|
|
|
"### item_table特征包括:\n",
|
|
|
|
"### item_table特征包括:\n",
|
|
|
|
"* sku_id(商品id),attr1,attr2,\n",
|
|
|
|
"* sku_id(商品id),attr1,attr2,\n",
|
|
|
|
"* attr3,cate,brand,browse_num,\n",
|
|
|
|
"* attr3,cate,brand,browse_num,\n",
|
|
|
|
"* addcart_num,delcart_num,\n",
|
|
|
|
"* addcart_num,delcart_num,\n",
|
|
|
@ -899,13 +899,13 @@
|
|
|
|
"outputs": [],
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"source": [
|
|
|
|
"# 定义文件名\n",
|
|
|
|
"# 定义文件名\n",
|
|
|
|
"ACTION_201602_FILE = \"data/JData_Action_201602.csv\" # 11M条\n",
|
|
|
|
"ACTION_201602_FILE = \"data/JData_Action_201602.csv\"\n",
|
|
|
|
"ACTION_201603_FILE = \"data/JData_Action_201603.csv\" #26M 条\n",
|
|
|
|
"ACTION_201603_FILE = \"data/JData_Action_201603.csv\"\n",
|
|
|
|
"ACTION_201604_FILE = \"data/JData_Action_201604.csv\" #13M条\n",
|
|
|
|
"ACTION_201604_FILE = \"data/JData_Action_201604.csv\"\n",
|
|
|
|
"COMMENT_FILE = \"data/JData_Comment.csv\" #560K条\n",
|
|
|
|
"COMMENT_FILE = \"data/JData_Comment.csv\"\n",
|
|
|
|
"PRODUCT_FILE = \"data/JData_Product.csv\" #24k\n",
|
|
|
|
"PRODUCT_FILE = \"data/JData_Product.csv\"\n",
|
|
|
|
"USER_FILE = \"data/JData_User.csv\" # 105K 条\n",
|
|
|
|
"USER_FILE = \"data/JData_User.csv\"\n",
|
|
|
|
" \n",
|
|
|
|
"\n",
|
|
|
|
"USER_TABLE_FILE = \"data/user_table.csv\"\n",
|
|
|
|
"USER_TABLE_FILE = \"data/user_table.csv\"\n",
|
|
|
|
"ITEM_TABLE_FILE = \"data/item_table.csv\""
|
|
|
|
"ITEM_TABLE_FILE = \"data/item_table.csv\""
|
|
|
|
]
|
|
|
|
]
|
|
|
@ -1450,6 +1450,184 @@
|
|
|
|
"user_table.head()"
|
|
|
|
"user_table.head()"
|
|
|
|
]
|
|
|
|
]
|
|
|
|
},
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"### 构建Item_table"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 1,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"# 定义文件名\n",
|
|
|
|
|
|
|
|
"ACTION_201602_FILE = \"data/JData_Action_201602.csv\"\n",
|
|
|
|
|
|
|
|
"ACTION_201603_FILE = \"data/JData_Action_201603.csv\"\n",
|
|
|
|
|
|
|
|
"ACTION_201604_FILE = \"data/JData_Action_201604.csv\"\n",
|
|
|
|
|
|
|
|
"COMMENT_FILE = \"data/JData_Comment.csv\"\n",
|
|
|
|
|
|
|
|
"PRODUCT_FILE = \"data/JData_Product.csv\"\n",
|
|
|
|
|
|
|
|
"USER_FILE = \"data/JData_User.csv\"\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"USER_TABLE_FILE = \"data/user_table.csv\"\n",
|
|
|
|
|
|
|
|
"ITEM_TABLE_FILE = \"data/item_table.csv\""
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 2,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"import pandas as pd\n",
|
|
|
|
|
|
|
|
"import numpy as np\n",
|
|
|
|
|
|
|
|
"from collections import Counter"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 3,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"# 读取Product中商品\n",
|
|
|
|
|
|
|
|
"def get_from_jdata_product():\n",
|
|
|
|
|
|
|
|
" df_item = pd.read_csv(PRODUCT_FILE, header=0,encoding='gbk')\n",
|
|
|
|
|
|
|
|
" return df_item"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 4,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"# 功能函数: 对每一个商品分组的数据进行统计\n",
|
|
|
|
|
|
|
|
"def add_type_count(group):\n",
|
|
|
|
|
|
|
|
" behavior_type = group.type.astype(int) \n",
|
|
|
|
|
|
|
|
" type_cnt = Counter(behavior_type)\n",
|
|
|
|
|
|
|
|
" \n",
|
|
|
|
|
|
|
|
" group['browse_num'] = type_cnt[1]\n",
|
|
|
|
|
|
|
|
" group['addcart_num'] = type_cnt[2]\n",
|
|
|
|
|
|
|
|
" group['delcart_num'] = type_cnt[3]\n",
|
|
|
|
|
|
|
|
" group['buy_num'] = type_cnt[4]\n",
|
|
|
|
|
|
|
|
" group['favor_num'] = type_cnt[5]\n",
|
|
|
|
|
|
|
|
" group['click_num'] = type_cnt[6]\n",
|
|
|
|
|
|
|
|
" \n",
|
|
|
|
|
|
|
|
" return group[['user_id', 'browse_num', 'addcart_num',\n",
|
|
|
|
|
|
|
|
" 'delcart_num', 'buy_num', 'favor_num',\n",
|
|
|
|
|
|
|
|
" 'click_num']]"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 5,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"# 对action数据进行统计\n",
|
|
|
|
|
|
|
|
"def get_from_action_data(fname, chunk_size=50000):\n",
|
|
|
|
|
|
|
|
" reader = pd.read_csv(fname, header=0, iterator=True,encoding='gbk')\n",
|
|
|
|
|
|
|
|
" chunks = []\n",
|
|
|
|
|
|
|
|
" loop = True\n",
|
|
|
|
|
|
|
|
" while loop:\n",
|
|
|
|
|
|
|
|
" try:\n",
|
|
|
|
|
|
|
|
" chunk = reader.get_chunk(chunk_size)[[\"user_id\", \"type\"]]\n",
|
|
|
|
|
|
|
|
" chunks.append(chunk)\n",
|
|
|
|
|
|
|
|
" except StopIteration:\n",
|
|
|
|
|
|
|
|
" loop = False\n",
|
|
|
|
|
|
|
|
" print(\"Iteration is stopped\")\n",
|
|
|
|
|
|
|
|
" \n",
|
|
|
|
|
|
|
|
" df_ac = pd.concat(chunks, ignore_index=True)\n",
|
|
|
|
|
|
|
|
" df_ac = df_ac.groupby(['user_id'], as_index=False).apply(add_type_count)\n",
|
|
|
|
|
|
|
|
" df_ac = df_ac.drop_duplicates('user_id')\n",
|
|
|
|
|
|
|
|
" \n",
|
|
|
|
|
|
|
|
" return df_ac"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 6,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"# 获取评论中的商品数据,如果存在某一个商品有两个日期的评论,我们取最晚的那一个\n",
|
|
|
|
|
|
|
|
"def get_from_jdata_comment():\n",
|
|
|
|
|
|
|
|
" df_cmt = pd.read_csv(COMMENT_FILE, header=0)\n",
|
|
|
|
|
|
|
|
" df_cmt['dt'] = pd.to_datetime(df_cmt['dt'])\n",
|
|
|
|
|
|
|
|
" # find latest comment index\n",
|
|
|
|
|
|
|
|
" idx = df_cmt.groupby(['sku_id'])['dt'].transform(max) == df_cmt['dt'] # keep only rows whose dt equals the latest dt of their sku_id\n",
|
|
|
|
|
|
|
|
" df_cmt = df_cmt[idx]\n",
|
|
|
|
|
|
|
|
" \n",
|
|
|
|
|
|
|
|
" return df_cmt[['sku_id', 'comment_num',\n",
|
|
|
|
|
|
|
|
" 'has_bad_comment', 'bad_comment_rate']]"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 7,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"def merge_action_data():\n",
|
|
|
|
|
|
|
|
" df_ac = []\n",
|
|
|
|
|
|
|
|
" df_ac.append(get_from_action_data(fname=ACTION_201602_FILE))\n",
|
|
|
|
|
|
|
|
" df_ac.append(get_from_action_data(fname=ACTION_201603_FILE))\n",
|
|
|
|
|
|
|
|
" df_ac.append(get_from_action_data(fname=ACTION_201604_FILE))\n",
|
|
|
|
|
|
|
|
" \n",
|
|
|
|
|
|
|
|
" df_ac = pd.concat(df_ac, ignore_index=True)\n",
|
|
|
|
|
|
|
|
" df_ac = df_ac.groupby(['user_id'], as_index=False).sum()\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" df_ac['buy_addcart_ratio'] = df_ac['buy_num'] / df_ac['addcart_num']\n",
|
|
|
|
|
|
|
|
" df_ac['buy_browse_ratio'] = df_ac['buy_num'] / df_ac['browse_num']\n",
|
|
|
|
|
|
|
|
" df_ac['buy_click_ratio'] = df_ac['buy_num'] / df_ac['click_num']\n",
|
|
|
|
|
|
|
|
" df_ac['buy_favor_ratio'] = df_ac['buy_num'] / df_ac['favor_num']\n",
|
|
|
|
|
|
|
|
" \n",
|
|
|
|
|
|
|
|
" df_ac.loc[df_ac['buy_addcart_ratio'] > 1., 'buy_addcart_ratio'] = 1.\n",
|
|
|
|
|
|
|
|
" df_ac.loc[df_ac['buy_browse_ratio'] > 1., 'buy_browse_ratio'] = 1.\n",
|
|
|
|
|
|
|
|
" df_ac.loc[df_ac['buy_click_ratio'] > 1., 'buy_click_ratio'] = 1.\n",
|
|
|
|
|
|
|
|
" df_ac.loc[df_ac['buy_favor_ratio'] > 1., 'buy_favor_ratio'] = 1.\n",
|
|
|
|
|
|
|
|
" \n",
|
|
|
|
|
|
|
|
" return df_ac"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"Iteration is stopped\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"item_base = get_from_jdata_product()\n",
|
|
|
|
|
|
|
|
"item_behavior = merge_action_data()\n",
|
|
|
|
|
|
|
|
"item_comment = get_from_jdata_comment()\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
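"# SQL: left join. NOTE(review): merge_action_data() groups by user_id, so its result appears to have no sku_id column - confirm the action statistics should be keyed by sku_id before merging\n",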
|
|
|
|
|
|
|
|
"item_behavior = pd.merge(item_base, item_behavior, on=['sku_id'], how='left')\n",
|
|
|
|
|
|
|
|
"item_behavior = pd.merge(item_behavior, item_comment, on=['sku_id'], how='left')\n",
|
|
|
|
|
|
|
|
" \n",
|
|
|
|
|
|
|
|
"item_behavior.to_csv(ITEM_TABLE_FILE, index=False)"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"item_table = pd.read_csv(ITEM_TABLE_FILE)\n",
|
|
|
|
|
|
|
|
"item_table.head()"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"execution_count": null,
|
|
|
|