Add. Create the Item table

pull/2/head
benjas 5 years ago
parent d4bb88515a
commit c27374e6d0

@ -862,7 +862,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### user_table\n", "### user_table特征包括:\n",
"* user_table特征包括:\n", "* user_table特征包括:\n",
"* user_id(用户id),age(年龄),sex(性别),\n", "* user_id(用户id),age(年龄),sex(性别),\n",
"* user_lv_cd(用户级别),browse_num(浏览数),\n", "* user_lv_cd(用户级别),browse_num(浏览数),\n",
@ -873,7 +873,7 @@
"* buy_click_ratio(购买点击转化率),\n", "* buy_click_ratio(购买点击转化率),\n",
"* buy_favor_ratio(购买收藏转化率)\n", "* buy_favor_ratio(购买收藏转化率)\n",
"\n", "\n",
"### item_table特征包括:\n", "### item_table特征包括\n",
"* sku_id(商品id),attr1,attr2,\n", "* sku_id(商品id),attr1,attr2,\n",
"* attr3,cate,brand,browse_num,\n", "* attr3,cate,brand,browse_num,\n",
"* addcart_num,delcart_num,\n", "* addcart_num,delcart_num,\n",
@ -899,13 +899,13 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# 定义文件名\n", "# 定义文件名\n",
"ACTION_201602_FILE = \"data/JData_Action_201602.csv\" # 11M条\n", "ACTION_201602_FILE = \"data/JData_Action_201602.csv\"\n",
"ACTION_201603_FILE = \"data/JData_Action_201603.csv\" #26M 条\n", "ACTION_201603_FILE = \"data/JData_Action_201603.csv\"\n",
"ACTION_201604_FILE = \"data/JData_Action_201604.csv\" #13M条\n", "ACTION_201604_FILE = \"data/JData_Action_201604.csv\"\n",
"COMMENT_FILE = \"data/JData_Comment.csv\" #560K条\n", "COMMENT_FILE = \"data/JData_Comment.csv\"\n",
"PRODUCT_FILE = \"data/JData_Product.csv\" #24k\n", "PRODUCT_FILE = \"data/JData_Product.csv\"\n",
"USER_FILE = \"data/JData_User.csv\" # 105K 条\n", "USER_FILE = \"data/JData_User.csv\"\n",
" \n", "\n",
"USER_TABLE_FILE = \"data/user_table.csv\"\n", "USER_TABLE_FILE = \"data/user_table.csv\"\n",
"ITEM_TABLE_FILE = \"data/item_table.csv\"" "ITEM_TABLE_FILE = \"data/item_table.csv\""
] ]
@ -1450,6 +1450,184 @@
"user_table.head()" "user_table.head()"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 构建Item_table"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# 定义文件名\n",
"ACTION_201602_FILE = \"data/JData_Action_201602.csv\"\n",
"ACTION_201603_FILE = \"data/JData_Action_201603.csv\"\n",
"ACTION_201604_FILE = \"data/JData_Action_201604.csv\"\n",
"COMMENT_FILE = \"data/JData_Comment.csv\"\n",
"PRODUCT_FILE = \"data/JData_Product.csv\"\n",
"USER_FILE = \"data/JData_User.csv\"\n",
"\n",
"USER_TABLE_FILE = \"data/user_table.csv\"\n",
"ITEM_TABLE_FILE = \"data/item_table.csv\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Load the products listed in the Product file\n",
"def get_from_jdata_product():\n",
"    \"\"\"Read PRODUCT_FILE (GBK-decoded CSV, first row is the header) into a DataFrame.\"\"\"\n",
"    return pd.read_csv(PRODUCT_FILE, encoding='gbk', header=0)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Helper: count every behaviour type per item (sku_id)\n",
"def add_type_count(group):\n",
"    \"\"\"Count the six behaviour types for every sku_id present in group.\n",
"\n",
"    group must provide the columns 'sku_id' and 'type'; 'type' is cast to int\n",
"    and codes 1..6 are mapped to browse/addcart/delcart/buy/favor/click.\n",
"    The input frame is not modified.\n",
"\n",
"    Returns a DataFrame with one row per sku_id and the columns\n",
"    sku_id, browse_num, addcart_num, delcart_num, buy_num, favor_num, click_num.\n",
"    \"\"\"\n",
"    type_columns = {1: 'browse_num', 2: 'addcart_num', 3: 'delcart_num',\n",
"                    4: 'buy_num', 5: 'favor_num', 6: 'click_num'}\n",
"\n",
"    typed = group.assign(type=group['type'].astype(int))\n",
"    counts = typed.groupby(['sku_id', 'type']).size().unstack(fill_value=0)\n",
"    # Guarantee all six count columns, even for codes that never occur in this frame\n",
"    counts = counts.reindex(columns=list(type_columns), fill_value=0)\n",
"    counts = counts.rename(columns=type_columns)\n",
"    counts.columns.name = None\n",
"\n",
"    return counts.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Aggregate one action file into per-item behaviour counts\n",
"def get_from_action_data(fname, chunk_size=50000):\n",
"    \"\"\"Return per-sku_id behaviour counts for the action file fname.\n",
"\n",
"    The file is read chunk_size rows at a time and only the 'sku_id' and\n",
"    'type' columns are loaded. The item table is keyed by sku_id, so the\n",
"    counts must be grouped by item, not by user.\n",
"    \"\"\"\n",
"    reader = pd.read_csv(fname, header=0, usecols=['sku_id', 'type'],\n",
"                         chunksize=chunk_size, encoding='gbk')\n",
"    # Count inside each chunk so only the small count tables are kept in memory\n",
"    partial_counts = [add_type_count(chunk) for chunk in reader]\n",
"    print(\"Iteration is stopped\")\n",
"\n",
"    df_ac = pd.concat(partial_counts, ignore_index=True)\n",
"    # An item can appear in several chunks: add its partial counts together\n",
"    df_ac = df_ac.groupby('sku_id', as_index=False).sum()\n",
"\n",
"    return df_ac"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Comment statistics per item; when an item has comment rows for several dates, keep the latest one\n",
"def get_from_jdata_comment():\n",
"    \"\"\"Return the comment statistics of the most recent 'dt' for every sku_id.\n",
"\n",
"    Output columns: sku_id, comment_num, has_bad_comment, bad_comment_rate.\n",
"    \"\"\"\n",
"    df_cmt = pd.read_csv(COMMENT_FILE, header=0)\n",
"    df_cmt['dt'] = pd.to_datetime(df_cmt['dt'])\n",
"    # Keep the rows whose date equals the newest date of their sku_id\n",
"    latest_dt = df_cmt.groupby('sku_id')['dt'].transform('max')\n",
"    df_cmt = df_cmt[df_cmt['dt'] == latest_dt]\n",
"    # Several rows may share the newest date; keep one so the later left join\n",
"    # cannot multiply product rows\n",
"    df_cmt = df_cmt.drop_duplicates('sku_id', keep='last')\n",
"\n",
"    return df_cmt[['sku_id', 'comment_num',\n",
"                   'has_bad_comment', 'bad_comment_rate']]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def merge_action_data():\n",
"    \"\"\"Combine the three action files into one per-sku_id behaviour table.\n",
"\n",
"    Adds four conversion ratios (buy_num divided by addcart/browse/click/favor\n",
"    counts), each capped at 1. A zero denominator with buy_num > 0 gives inf,\n",
"    which the cap turns into 1; 0/0 stays NaN.\n",
"    \"\"\"\n",
"    action_files = (ACTION_201602_FILE, ACTION_201603_FILE, ACTION_201604_FILE)\n",
"    df_ac = pd.concat([get_from_action_data(fname=fname) for fname in action_files],\n",
"                      ignore_index=True)\n",
"    # Grouped by item: the result is later joined with the product table on sku_id\n",
"    df_ac = df_ac.groupby('sku_id', as_index=False).sum()\n",
"\n",
"    ratio_sources = (('buy_addcart_ratio', 'addcart_num'),\n",
"                     ('buy_browse_ratio', 'browse_num'),\n",
"                     ('buy_click_ratio', 'click_num'),\n",
"                     ('buy_favor_ratio', 'favor_num'))\n",
"    for ratio_name, denominator in ratio_sources:\n",
"        df_ac[ratio_name] = (df_ac['buy_num'] / df_ac[denominator]).clip(upper=1.)\n",
"\n",
"    return df_ac"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iteration is stopped\n"
]
}
],
"source": [
"# Build the item table from the product file, the action counts and the comment statistics\n",
"item_base = get_from_jdata_product()\n",
"item_behavior = merge_action_data()\n",
"item_comment = get_from_jdata_comment()\n",
"\n",
"# Left joins on sku_id keep every product row, even without actions or comments\n",
"item_behavior = (\n",
"    item_base\n",
"    .merge(item_behavior, on=['sku_id'], how='left')\n",
"    .merge(item_comment, on=['sku_id'], how='left')\n",
")\n",
"\n",
"item_behavior.to_csv(ITEM_TABLE_FILE, index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Reload the saved item table and preview its first rows\n",
"item_table = pd.read_csv(ITEM_TABLE_FILE)\n",
"item_table.head()"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,

Loading…
Cancel
Save