@ -3092,45 +3092,40 @@
},
{
"cell_type": "code",
"execution_count": 43,
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"def make_actions(user, product, all_actions, train_start_date):\n",
" train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)\n",
" train_end_date = train_end_date.strftime('%Y-%m-%d')\n",
" # 修正prod_acc,cate_acc的时间跨度\n",
" start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n",
"def make_set(start_date, end_date, csv_path):\n",
" start_days = datetime.strptime(start_date, '%Y-%m-%d') - timedelta(days=30)\n",
" start_days = start_days.strftime('%Y-%m-%d')\n",
" print (train_end_date)\n",
" user_acc = get_recent_user_feat(train_end_date, all_actions)\n",
" print ('get_recent_user_feat finsihed')\n",
" all_actions = get_all_action()\n",
" print(\"get all actions!\")\n",
" user = get_basic_user_feat()\n",
" print('get_basic_user_feat finsihed')\n",
" product = get_basic_product_feat()\n",
" print('get_basic_product_feat finsihed')\n",
" \n",
" user_acc = get_recent_user_feat(end_date, all_actions)\n",
" print('get_accumulate_user_feat finsihed')\n",
" \n",
" user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n",
" user_cate = get_user_cate_feature(start_date, end_date, all_actions)\n",
" print('get_user_cate_feature finished')\n",
" \n",
" product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n",
" product_acc = get_accumulate_product_feat(start_days, end_date, all_actions)\n",
" print('get_accumulate_product_feat finsihed')\n",
" cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n",
" cate_acc = get_accumulate_cate_feat(start_days, end_date, all_actions)\n",
" print('get_accumulate_cate_feat finsihed')\n",
" comment_acc = get_comments_product_feat(train_end_date)\n",
" print ('get_comments_product_feat finished')\n",
" # 标记\n",
" test_start_date = train_end_date\n",
" test_end_date = datetime.strptime(test_start_date, '%Y-%m-%d') + timedelta(days=5)\n",
" test_end_date = test_end_date.strftime('%Y-%m-%d')\n",
" labels = get_labels(test_start_date, test_end_date, all_actions)\n",
" print (\"get labels\")\n",
" comment_acc = get_comments_product_feat(end_date)\n",
"\n",
" actions = None\n",
" for i in (3, 5, 7, 10, 15, 21, 30):\n",
" start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)\n",
" start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=i)\n",
" start_days = start_days.strftime('%Y-%m-%d')\n",
" if actions is None:\n",
" actions = get_action_feat(start_days, train_end_date, all_actions, i)\n",
" actions = get_action_feat(start_days, end_date, all_actions,i)\n",
" else:\n",
" # 注意这里的拼接key\n",
" actions = pd.merge(actions, get_action_feat(start_days, train_end_date, all_actions, i), how='left',\n",
" actions = pd.merge(actions, get_action_feat(start_days, end_date, all_actions,i), how='left',\n",
" on=['user_id', 'sku_id', 'cate'])\n",
"\n",
" actions = pd.merge(actions, user, how='left', on='user_id')\n",
@ -3142,269 +3137,34 @@
" actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n",
" actions = pd.merge(actions, cate_acc, how='left', on='cate')\n",
" actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n",
" actions = pd.merge(actions, labels, how='left', on=['user_id', 'sku_id'])\n",
" # 主要是填充拼接商品基本特征、评论特征、标签之后的空值\n",
" actions = actions.fillna(0)\n",
"# return actions\n",
" # 采样\n",
" action_postive = actions[actions['label'] == 1]\n",
" action_negative = actions[actions['label'] == 0]\n",
" del actions\n",
" neg_len = len(action_postive) * 10\n",
" action_negative = action_negative.sample(n=neg_len)\n",
" action_sample = pd.concat([action_postive, action_negative], ignore_index=True) \n",
"\n",
" return action_sample"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"def make_train_set(train_start_date, setNums ,f_path, all_actions):\n",
" train_actions = None\n",
" #all_actions = get_all_action()\n",
" #print (\"get all actions!\")\n",
" user = get_basic_user_feat()\n",
" print ('get_basic_user_feat finsihed')\n",
" product = get_basic_product_feat()\n",
" print ('get_basic_product_feat finsihed')\n",
" # 滑窗,构造多组训练集/验证集\n",
" for i in range(setNums):\n",
" print (train_start_date)\n",
" if train_actions is None:\n",
" train_actions = make_actions(user, product, all_actions, train_start_date)\n",
" else:\n",
" train_actions = pd.concat([train_actions, make_actions(user, product, all_actions, train_start_date)],\n",
" ignore_index=True)\n",
" # 接下来每次移动一天\n",
" train_start_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=1)\n",
" train_start_date = train_start_date.strftime('%Y-%m-%d')\n",
" print (\"round {0}/{1} over!\".format(i+1, setNums))\n",
" actions = actions.fillna(0)\n",
" \n",
" train_actions.to_csv(f_path, index=False)"
" actions.to_csv(csv_path, index=False)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"get all actions!\n",
"get_basic_user_feat finsihed\n",
"get_basic_product_feat finsihed\n",
"2016-02-01\n",
"2016-02-04\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"D:\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:6692: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
"of pandas will change to not sort by default.\n",
"\n",
"To accept the future behavior, pass 'sort=False'.\n",
"\n",
"To retain the current behavior and silence the warning, pass 'sort=True'.\n",
"\n",
" sort=sort)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"round 1/20 over!\n",
"2016-02-02\n",
"2016-02-05\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 2/20 over!\n",
"2016-02-03\n",
"2016-02-06\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 3/20 over!\n",
"2016-02-04\n",
"2016-02-07\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 4/20 over!\n",
"2016-02-05\n",
"2016-02-08\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 5/20 over!\n",
"2016-02-06\n",
"2016-02-09\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 6/20 over!\n",
"2016-02-07\n",
"2016-02-10\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 7/20 over!\n",
"2016-02-08\n",
"2016-02-11\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 8/20 over!\n",
"2016-02-09\n",
"2016-02-12\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 9/20 over!\n",
"2016-02-10\n",
"2016-02-13\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 10/20 over!\n",
"2016-02-11\n",
"2016-02-14\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 11/20 over!\n",
"2016-02-12\n",
"2016-02-15\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 12/20 over!\n",
"2016-02-13\n",
"2016-02-16\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 13/20 over!\n",
"2016-02-14\n",
"2016-02-17\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 14/20 over!\n",
"2016-02-15\n",
"2016-02-18\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 15/20 over!\n",
"2016-02-16\n",
"2016-02-19\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 16/20 over!\n",
"2016-02-17\n",
"2016-02-20\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 17/20 over!\n",
"2016-02-18\n",
"2016-02-21\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 18/20 over!\n",
"2016-02-19\n",
"2016-02-22\n",
"get_recent_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 19/20 over!\n",
"2016-02-20\n",
"2016-02-23\n",
"get_recent_user_feat finsihed\n",
"get_accumulate_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"round 20/20 over!\n"
"get_accumulate_cate_feat finsihed\n"
]
}
],
"source": [
"# 训练集\n",
"train_start_date = '2016-02-01'\n",
"make_train_set(train_start_date, 20, 'data/train_set.csv',all_actions)"
"make_set('2016-02-01', '2016-03-30', 'data/train_set.csv')"
]
},
{
@ -3416,75 +3176,7 @@
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"def make_val_answer(val_start_date, val_end_date, all_actions, label_val_s1_path):\n",
" actions = get_actions(val_start_date, val_end_date,all_actions)\n",
" actions = actions[(actions['type'] == 4) & (actions['cate'] == 8)]\n",
" actions = actions[['user_id', 'sku_id']]\n",
" actions = actions.drop_duplicates()\n",
" actions.to_csv(label_val_s1_path, index=False)\n",
"\n",
"def make_val_set(train_start_date, train_end_date, val_s1_path):\n",
" # 修改时间跨度\n",
" start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n",
" start_days = start_days.strftime('%Y-%m-%d')\n",
" all_actions = get_all_action()\n",
" print (\"get all actions!\")\n",
" user = get_basic_user_feat()\n",
" print ('get_basic_user_feat finsihed')\n",
" \n",
" product = get_basic_product_feat()\n",
" print ('get_basic_product_feat finsihed')\n",
" user_acc = get_recent_user_feat(train_end_date, all_actions)\n",
" print ('get_recent_user_feat finsihed')\n",
" user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n",
" print ('get_user_cate_feature finished')\n",
" \n",
" product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n",
" print ('get_accumulate_product_feat finsihed')\n",
" cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n",
" print ('get_accumulate_cate_feat finsihed')\n",
" comment_acc = get_comments_product_feat(train_end_date)\n",
" print ('get_comments_product_feat finished')\n",
" \n",
" actions = None\n",
" for i in (3, 5, 7, 10, 15, 21, 30):\n",
" start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)\n",
" start_days = start_days.strftime('%Y-%m-%d')\n",
" if actions is None:\n",
" actions = get_action_feat(start_days, train_end_date, all_actions,i)\n",
" else:\n",
" actions = pd.merge(actions, get_action_feat(start_days, train_end_date,all_actions,i), how='left',\n",
" on=['user_id', 'sku_id', 'cate'])\n",
"\n",
" actions = pd.merge(actions, user, how='left', on='user_id')\n",
" actions = pd.merge(actions, user_acc, how='left', on='user_id')\n",
"# actions = pd.merge(actions, user_cate, how='left', on='user_id')\n",
" actions.append(user_cate)\n",
" # 注意这里的拼接key\n",
" actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])\n",
" actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n",
" actions = pd.merge(actions, cate_acc, how='left', on='cate')\n",
" actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n",
" actions = actions.fillna(0)\n",
" \n",
" \n",
"# print actions\n",
" # 构造真实用户购买情况作为后续验证\n",
" val_start_date = train_end_date\n",
" val_end_date = datetime.strptime(val_start_date, '%Y-%m-%d') + timedelta(days=5)\n",
" val_end_date = val_end_date.strftime('%Y-%m-%d')\n",
" make_val_answer(val_start_date, val_end_date, all_actions, val_s1_path)\n",
" \n",
" actions.to_csv(val_s1_path, index=False)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"execution_count": 53,
"metadata": {},
"outputs": [
{
@ -3494,76 +3186,21 @@
"get all actions!\n",
"get_basic_user_feat finsihed\n",
"get_basic_product_feat finsihed\n",
"get_recent_user_feat finsihed\n",
"get_accumulate_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n"
"get_accumulate_cate_feat finsihed\n"
]
}
],
"source": [
"# 验证集\n",
"make_val_set('2016-02-23', '2016-02-26', 'data/val_set.csv')"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"def make_test_set(train_start_date, train_end_date):\n",
" start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n",
" start_days = start_days.strftime('%Y-%m-%d')\n",
" all_actions = get_all_action()\n",
" print(\"get all actions!\")\n",
" user = get_basic_user_feat()\n",
" print('get_basic_user_feat finsihed')\n",
" product = get_basic_product_feat()\n",
" print('get_basic_product_feat finsihed')\n",
" \n",
" user_acc = get_recent_user_feat(train_end_date, all_actions)\n",
" print('get_accumulate_user_feat finsihed')\n",
" \n",
" user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n",
" print('get_user_cate_feature finished')\n",
" \n",
" product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n",
" print('get_accumulate_product_feat finsihed')\n",
" cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n",
" print('get_accumulate_cate_feat finsihed')\n",
" comment_acc = get_comments_product_feat(train_end_date)\n",
"\n",
" actions = None\n",
" for i in (3, 5, 7, 10, 15, 21, 30):\n",
" start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)\n",
" start_days = start_days.strftime('%Y-%m-%d')\n",
" if actions is None:\n",
" actions = get_action_feat(start_days, train_end_date, all_actions,i)\n",
" else:\n",
" actions = pd.merge(actions, get_action_feat(start_days, train_end_date,all_actions,i), how='left',\n",
" on=['user_id', 'sku_id', 'cate'])\n",
"\n",
" actions = pd.merge(actions, user, how='left', on='user_id')\n",
" actions = pd.merge(actions, user_acc, how='left', on='user_id')\n",
"# actions = pd.merge(actions, user_cate, how='left', on='user_id')\n",
" actions.append(user_cate)\n",
" # 注意这里的拼接key\n",
" actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])\n",
" actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n",
" actions = pd.merge(actions, cate_acc, how='left', on='cate')\n",
" actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n",
"\n",
" actions = actions.fillna(0)\n",
" \n",
"\n",
" actions.to_csv(\"data/test_set.csv\", index=False)"
"make_set('2016-04-01', '2016-04-10', 'data/val_set.csv')"
]
},
{
"cell_type": "code",
"execution_count": 50,
"execution_count": 54,
"metadata": {},
"outputs": [
{
@ -3582,9 +3219,7 @@
],
"source": [
"# 预测结果\n",
"sub_start_date = '2016-04-13'\n",
"sub_end_date = '2016-04-16'\n",
"make_test_set(sub_start_date, sub_end_date)"
"make_set('2016-04-11', '2016-04-16', 'data/test_set.csv')"
]
},
{