Fix the feature engineering of users and products

pull/2/head
benjas 5 years ago
parent 87cef42d55
commit f1f12a5b08

@ -3092,45 +3092,40 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 43, "execution_count": 51,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def make_actions(user, product, all_actions, train_start_date):\n", "def make_set(start_date, end_date, csv_path):\n",
" train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)\n", " start_days = datetime.strptime(start_date, '%Y-%m-%d') - timedelta(days=30)\n",
" train_end_date = train_end_date.strftime('%Y-%m-%d')\n",
" # 修正prod_acc,cate_acc的时间跨度\n",
" start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n",
" start_days = start_days.strftime('%Y-%m-%d')\n", " start_days = start_days.strftime('%Y-%m-%d')\n",
" print (train_end_date)\n", " all_actions = get_all_action()\n",
" user_acc = get_recent_user_feat(train_end_date, all_actions)\n", " print(\"get all actions!\")\n",
" print ('get_recent_user_feat finsihed')\n", " user = get_basic_user_feat()\n",
" print('get_basic_user_feat finsihed')\n",
" product = get_basic_product_feat()\n",
" print('get_basic_product_feat finsihed')\n",
" \n", " \n",
" user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n", " user_acc = get_recent_user_feat(end_date, all_actions)\n",
" print ('get_user_cate_feature finished')\n", " print('get_accumulate_user_feat finsihed')\n",
" \n", " \n",
" product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n", " user_cate = get_user_cate_feature(start_date, end_date, all_actions)\n",
" print ('get_accumulate_product_feat finsihed')\n", " print('get_user_cate_feature finished')\n",
" cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n",
" print ('get_accumulate_cate_feat finsihed')\n",
" comment_acc = get_comments_product_feat(train_end_date)\n",
" print ('get_comments_product_feat finished')\n",
" # 标记\n",
" test_start_date = train_end_date\n",
" test_end_date = datetime.strptime(test_start_date, '%Y-%m-%d') + timedelta(days=5)\n",
" test_end_date = test_end_date.strftime('%Y-%m-%d')\n",
" labels = get_labels(test_start_date, test_end_date, all_actions)\n",
" print (\"get labels\")\n",
" \n", " \n",
" product_acc = get_accumulate_product_feat(start_days, end_date, all_actions)\n",
" print('get_accumulate_product_feat finsihed')\n",
" cate_acc = get_accumulate_cate_feat(start_days, end_date, all_actions)\n",
" print('get_accumulate_cate_feat finsihed')\n",
" comment_acc = get_comments_product_feat(end_date)\n",
"\n",
" actions = None\n", " actions = None\n",
" for i in (3, 5, 7, 10, 15, 21, 30):\n", " for i in (3, 5, 7, 10, 15, 21, 30):\n",
" start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)\n", " start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=i)\n",
" start_days = start_days.strftime('%Y-%m-%d')\n", " start_days = start_days.strftime('%Y-%m-%d')\n",
" if actions is None:\n", " if actions is None:\n",
" actions = get_action_feat(start_days, train_end_date, all_actions, i)\n", " actions = get_action_feat(start_days, end_date, all_actions,i)\n",
" else:\n", " else:\n",
" # 注意这里的拼接key\n", " actions = pd.merge(actions, get_action_feat(start_days, end_date, all_actions,i), how='left',\n",
" actions = pd.merge(actions, get_action_feat(start_days, train_end_date, all_actions, i), how='left',\n",
" on=['user_id', 'sku_id', 'cate'])\n", " on=['user_id', 'sku_id', 'cate'])\n",
"\n", "\n",
" actions = pd.merge(actions, user, how='left', on='user_id')\n", " actions = pd.merge(actions, user, how='left', on='user_id')\n",
@ -3142,269 +3137,34 @@
" actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n", " actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n",
" actions = pd.merge(actions, cate_acc, how='left', on='cate')\n", " actions = pd.merge(actions, cate_acc, how='left', on='cate')\n",
" actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n", " actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n",
" actions = pd.merge(actions, labels, how='left', on=['user_id', 'sku_id'])\n", "\n",
" # 主要是填充拼接商品基本特征、评论特征、标签之后的空值\n",
" actions = actions.fillna(0)\n", " actions = actions.fillna(0)\n",
"# return actions\n",
" # 采样\n",
" action_postive = actions[actions['label'] == 1]\n",
" action_negative = actions[actions['label'] == 0]\n",
" del actions\n",
" neg_len = len(action_postive) * 10\n",
" action_negative = action_negative.sample(n=neg_len)\n",
" action_sample = pd.concat([action_postive, action_negative], ignore_index=True) \n",
" \n", " \n",
" return action_sample" " actions.to_csv(csv_path, index=False)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 44, "execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"def make_train_set(train_start_date, setNums ,f_path, all_actions):\n",
"    \"\"\"Build a sliding-window training set and write it to a CSV file.\n",
"\n",
"    make_actions is called once per window; after every round the window\n",
"    start moves forward by one day. The per-window frames are stacked and\n",
"    saved to f_path without the index.\n",
"\n",
"    Args:\n",
"        train_start_date: start of the first window, a 'YYYY-MM-DD' string.\n",
"        setNums: number of one-day-shifted windows to build (at least 1).\n",
"        f_path: destination CSV path.\n",
"        all_actions: action log, passed through to make_actions unchanged.\n",
"\n",
"    Raises:\n",
"        ValueError: if setNums is smaller than 1.\n",
"    \"\"\"\n",
"    if setNums < 1:\n",
"        # Without this guard the failure only showed up at the very end,\n",
"        # as an AttributeError on None.\n",
"        raise ValueError('setNums must be at least 1, got {0}'.format(setNums))\n",
"\n",
"    user = get_basic_user_feat()\n",
"    print ('get_basic_user_feat finsihed')\n",
"    product = get_basic_product_feat()\n",
"    print ('get_basic_product_feat finsihed')\n",
"\n",
"    # Sliding window: build several training/validation slices.\n",
"    window_frames = []\n",
"    for i in range(setNums):\n",
"        print (train_start_date)\n",
"        window_frames.append(make_actions(user, product, all_actions, train_start_date))\n",
"        # Every following window starts one day later.\n",
"        next_start = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=1)\n",
"        train_start_date = next_start.strftime('%Y-%m-%d')\n",
"        print (\"round {0}/{1} over!\".format(i+1, setNums))\n",
"\n",
"    # Stack once at the end instead of re-copying the growing frame every round.\n",
"    train_actions = pd.concat(window_frames, ignore_index=True)\n",
"    train_actions.to_csv(f_path, index=False)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"get all actions!\n",
"get_basic_user_feat finsihed\n", "get_basic_user_feat finsihed\n",
"get_basic_product_feat finsihed\n", "get_basic_product_feat finsihed\n",
"2016-02-01\n", "get_accumulate_user_feat finsihed\n",
"2016-02-04\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"D:\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:6692: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
"of pandas will change to not sort by default.\n",
"\n",
"To accept the future behavior, pass 'sort=False'.\n",
"\n",
"To retain the current behavior and silence the warning, pass 'sort=True'.\n",
"\n",
" sort=sort)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"round 1/20 over!\n",
"2016-02-02\n",
"2016-02-05\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 2/20 over!\n",
"2016-02-03\n",
"2016-02-06\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 3/20 over!\n",
"2016-02-04\n",
"2016-02-07\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 4/20 over!\n",
"2016-02-05\n",
"2016-02-08\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 5/20 over!\n",
"2016-02-06\n",
"2016-02-09\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 6/20 over!\n",
"2016-02-07\n",
"2016-02-10\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 7/20 over!\n",
"2016-02-08\n",
"2016-02-11\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 8/20 over!\n",
"2016-02-09\n",
"2016-02-12\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 9/20 over!\n",
"2016-02-10\n",
"2016-02-13\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 10/20 over!\n",
"2016-02-11\n",
"2016-02-14\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 11/20 over!\n",
"2016-02-12\n",
"2016-02-15\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 12/20 over!\n",
"2016-02-13\n",
"2016-02-16\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 13/20 over!\n",
"2016-02-14\n",
"2016-02-17\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 14/20 over!\n",
"2016-02-15\n",
"2016-02-18\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 15/20 over!\n",
"2016-02-16\n",
"2016-02-19\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 16/20 over!\n",
"2016-02-17\n",
"2016-02-20\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 17/20 over!\n",
"2016-02-18\n",
"2016-02-21\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 18/20 over!\n",
"2016-02-19\n",
"2016-02-22\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 19/20 over!\n",
"2016-02-20\n",
"2016-02-23\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n", "get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n", "get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n", "get_accumulate_cate_feat finsihed\n"
"get_comments_product_feat finished\n",
"get labels\n",
"round 20/20 over!\n"
] ]
} }
], ],
"source": [ "source": [
"# 训练集\n", "# 训练集\n",
"train_start_date = '2016-02-01'\n", "make_set('2016-02-01', '2016-03-30', 'data/train_set.csv')"
"make_train_set(train_start_date, 20, 'data/train_set.csv',all_actions)"
] ]
}, },
{ {
@ -3416,75 +3176,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 46, "execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"def make_val_answer(val_start_date, val_end_date, all_actions, label_val_s1_path):\n",
"    \"\"\"Save the distinct (user_id, sku_id) pairs with type == 4 and cate == 8.\n",
"\n",
"    Rows come from get_actions(val_start_date, val_end_date, all_actions);\n",
"    the result is written to label_val_s1_path as CSV without the index.\n",
"\n",
"    NOTE(review): type 4 presumably marks a purchase and cate 8 the target\n",
"    category -- confirm against the data dictionary.\n",
"    \"\"\"\n",
"    window = get_actions(val_start_date, val_end_date, all_actions)\n",
"    is_target = (window['type'] == 4) & (window['cate'] == 8)\n",
"    pairs = window.loc[is_target, ['user_id', 'sku_id']].drop_duplicates()\n",
"    pairs.to_csv(label_val_s1_path, index=False)\n",
"\n",
"def make_val_set(train_start_date, train_end_date, val_s1_path, label_val_s1_path=None):\n",
"    \"\"\"Build the validation feature table and its ground-truth file.\n",
"\n",
"    The feature table (windowed action features joined with user, product,\n",
"    category and comment features, NaN filled with 0) is written to\n",
"    val_s1_path. The ground truth for the 5 days starting at train_end_date\n",
"    is written by make_val_answer to label_val_s1_path.\n",
"\n",
"    Args:\n",
"        train_start_date: start date ('YYYY-MM-DD') given to get_user_cate_feature.\n",
"        train_end_date: end date ('YYYY-MM-DD') of every feature window.\n",
"        val_s1_path: CSV path of the feature table.\n",
"        label_val_s1_path: CSV path of the ground truth. Defaults to\n",
"            val_s1_path with '_label' inserted before the extension.\n",
"\n",
"    Raises:\n",
"        ValueError: if label_val_s1_path equals val_s1_path.\n",
"    \"\"\"\n",
"    import os  # local import: only needed to derive the default label path\n",
"\n",
"    if label_val_s1_path is None:\n",
"        # The labels used to be written to val_s1_path and were then\n",
"        # overwritten by the feature table, so they never survived.\n",
"        stem, ext = os.path.splitext(val_s1_path)\n",
"        label_val_s1_path = stem + '_label' + ext\n",
"    if label_val_s1_path == val_s1_path:\n",
"        raise ValueError('label_val_s1_path must differ from val_s1_path')\n",
"\n",
"    end_dt = datetime.strptime(train_end_date, '%Y-%m-%d')\n",
"    # Product and category accumulations look back 30 days from the end date.\n",
"    acc_start = (end_dt - timedelta(days=30)).strftime('%Y-%m-%d')\n",
"\n",
"    all_actions = get_all_action()\n",
"    print (\"get all actions!\")\n",
"    user = get_basic_user_feat()\n",
"    print ('get_basic_user_feat finsihed')\n",
"\n",
"    product = get_basic_product_feat()\n",
"    print ('get_basic_product_feat finsihed')\n",
"    user_acc = get_recent_user_feat(train_end_date, all_actions)\n",
"    print ('get_recent_user_feat finsihed')\n",
"    # NOTE(review): user_cate is computed but not joined into the table. The\n",
"    # original called actions.append(user_cate) and discarded the result, which\n",
"    # had no effect (and DataFrame.append no longer exists in pandas 2.x).\n",
"    # Restoring the merge on 'user_id' would change the output columns, so\n",
"    # confirm the intended schema first.\n",
"    user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n",
"    print ('get_user_cate_feature finished')\n",
"\n",
"    product_acc = get_accumulate_product_feat(acc_start, train_end_date, all_actions)\n",
"    print ('get_accumulate_product_feat finsihed')\n",
"    cate_acc = get_accumulate_cate_feat(acc_start, train_end_date, all_actions)\n",
"    print ('get_accumulate_cate_feat finsihed')\n",
"    comment_acc = get_comments_product_feat(train_end_date)\n",
"    print ('get_comments_product_feat finished')\n",
"\n",
"    # One action-feature frame per look-back length, left-joined onto the first.\n",
"    actions = None\n",
"    for i in (3, 5, 7, 10, 15, 21, 30):\n",
"        window_start = (end_dt - timedelta(days=i)).strftime('%Y-%m-%d')\n",
"        window_feat = get_action_feat(window_start, train_end_date, all_actions, i)\n",
"        if actions is None:\n",
"            actions = window_feat\n",
"        else:\n",
"            actions = pd.merge(actions, window_feat, how='left',\n",
"                               on=['user_id', 'sku_id', 'cate'])\n",
"\n",
"    actions = pd.merge(actions, user, how='left', on='user_id')\n",
"    actions = pd.merge(actions, user_acc, how='left', on='user_id')\n",
"    # Mind the join key here: basic product features join on sku_id AND cate.\n",
"    actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])\n",
"    actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n",
"    actions = pd.merge(actions, cate_acc, how='left', on='cate')\n",
"    actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n",
"    actions = actions.fillna(0)\n",
"\n",
"    # Record what users really bought afterwards, for later validation.\n",
"    val_start_date = train_end_date\n",
"    val_end_date = (end_dt + timedelta(days=5)).strftime('%Y-%m-%d')\n",
"    make_val_answer(val_start_date, val_end_date, all_actions, label_val_s1_path)\n",
"\n",
"    actions.to_csv(val_s1_path, index=False)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -3494,76 +3186,21 @@
"get all actions!\n", "get all actions!\n",
"get_basic_user_feat finsihed\n", "get_basic_user_feat finsihed\n",
"get_basic_product_feat finsihed\n", "get_basic_product_feat finsihed\n",
"get_recent_user_feat finsihed\n", "get_accumulate_user_feat finsihed\n",
"get_user_cate_feature finished\n", "get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n", "get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n", "get_accumulate_cate_feat finsihed\n"
"get_comments_product_feat finished\n"
] ]
} }
], ],
"source": [ "source": [
"# 验证集\n", "# 验证集\n",
"make_val_set('2016-02-23', '2016-02-26', 'data/val_set.csv')" "make_set('2016-04-01', '2016-04-10', 'data/val_set.csv')"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"def make_test_set(train_start_date, train_end_date, csv_path='data/test_set.csv'):\n",
"    \"\"\"Build the feature table used for prediction and write it to a CSV file.\n",
"\n",
"    Windowed action features (3 to 30 days back from train_end_date) are\n",
"    left-joined with user, product, category and comment features; NaN is\n",
"    filled with 0 and the table is saved without the index.\n",
"\n",
"    Args:\n",
"        train_start_date: start date ('YYYY-MM-DD') given to get_user_cate_feature.\n",
"        train_end_date: end date ('YYYY-MM-DD') of every feature window.\n",
"        csv_path: destination CSV path; defaults to the previously\n",
"            hard-coded 'data/test_set.csv'.\n",
"    \"\"\"\n",
"    end_dt = datetime.strptime(train_end_date, '%Y-%m-%d')\n",
"    # Product and category accumulations look back 30 days from the end date.\n",
"    acc_start = (end_dt - timedelta(days=30)).strftime('%Y-%m-%d')\n",
"\n",
"    all_actions = get_all_action()\n",
"    print(\"get all actions!\")\n",
"    user = get_basic_user_feat()\n",
"    print('get_basic_user_feat finsihed')\n",
"    product = get_basic_product_feat()\n",
"    print('get_basic_product_feat finsihed')\n",
"\n",
"    user_acc = get_recent_user_feat(train_end_date, all_actions)\n",
"    print('get_accumulate_user_feat finsihed')\n",
"\n",
"    # NOTE(review): user_cate is computed but not joined into the table. The\n",
"    # original called actions.append(user_cate) and discarded the result, which\n",
"    # had no effect (and DataFrame.append no longer exists in pandas 2.x).\n",
"    # Restoring the merge on 'user_id' would change the output columns, so\n",
"    # confirm the intended schema first.\n",
"    user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n",
"    print('get_user_cate_feature finished')\n",
"\n",
"    product_acc = get_accumulate_product_feat(acc_start, train_end_date, all_actions)\n",
"    print('get_accumulate_product_feat finsihed')\n",
"    cate_acc = get_accumulate_cate_feat(acc_start, train_end_date, all_actions)\n",
"    print('get_accumulate_cate_feat finsihed')\n",
"    comment_acc = get_comments_product_feat(train_end_date)\n",
"\n",
"    # One action-feature frame per look-back length, left-joined onto the first.\n",
"    actions = None\n",
"    for i in (3, 5, 7, 10, 15, 21, 30):\n",
"        window_start = (end_dt - timedelta(days=i)).strftime('%Y-%m-%d')\n",
"        window_feat = get_action_feat(window_start, train_end_date, all_actions, i)\n",
"        if actions is None:\n",
"            actions = window_feat\n",
"        else:\n",
"            actions = pd.merge(actions, window_feat, how='left',\n",
"                               on=['user_id', 'sku_id', 'cate'])\n",
"\n",
"    actions = pd.merge(actions, user, how='left', on='user_id')\n",
"    actions = pd.merge(actions, user_acc, how='left', on='user_id')\n",
"    # Mind the join key here: basic product features join on sku_id AND cate.\n",
"    actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])\n",
"    actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n",
"    actions = pd.merge(actions, cate_acc, how='left', on='cate')\n",
"    actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n",
"\n",
"    actions = actions.fillna(0)\n",
"\n",
"    actions.to_csv(csv_path, index=False)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 50, "execution_count": 54,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -3582,9 +3219,7 @@
], ],
"source": [ "source": [
"# 预测结果\n", "# 预测结果\n",
"sub_start_date = '2016-04-13'\n", "make_set('2016-04-11', '2016-04-16', 'data/test_set.csv')"
"sub_end_date = '2016-04-16'\n",
"make_test_set(sub_start_date, sub_end_date)"
] ]
}, },
{ {

Loading…
Cancel
Save