diff --git a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程.ipynb b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程.ipynb index 035fe39..f919f40 100644 --- a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程.ipynb +++ b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程.ipynb @@ -3092,45 +3092,40 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ - "def make_actions(user, product, all_actions, train_start_date):\n", - " train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)\n", - " train_end_date = train_end_date.strftime('%Y-%m-%d')\n", - " # 修正prod_acc,cate_acc的时间跨度\n", - " start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n", + "def make_set(start_date, end_date, csv_path):\n", + " start_days = datetime.strptime(start_date, '%Y-%m-%d') - timedelta(days=30)\n", " start_days = start_days.strftime('%Y-%m-%d')\n", - " print (train_end_date)\n", - " user_acc = get_recent_user_feat(train_end_date, all_actions)\n", - " print ('get_recent_user_feat finsihed')\n", + " all_actions = get_all_action()\n", + " print(\"get all actions!\")\n", + " user = get_basic_user_feat()\n", + " print('get_basic_user_feat finsihed')\n", + " product = get_basic_product_feat()\n", + " print('get_basic_product_feat finsihed')\n", " \n", - " user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n", - " print ('get_user_cate_feature finished')\n", + " user_acc = get_recent_user_feat(end_date, all_actions)\n", + " print('get_accumulate_user_feat finsihed')\n", " \n", - " product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n", - " print ('get_accumulate_product_feat finsihed')\n", - " cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n", - " print ('get_accumulate_cate_feat finsihed')\n", - " comment_acc = get_comments_product_feat(train_end_date)\n", - " print ('get_comments_product_feat finished')\n", - " # 标记\n", - " test_start_date = train_end_date\n", - " test_end_date = datetime.strptime(test_start_date, '%Y-%m-%d') + timedelta(days=5)\n", - " test_end_date = test_end_date.strftime('%Y-%m-%d')\n", - " labels = get_labels(test_start_date, test_end_date, all_actions)\n", - " print (\"get labels\")\n", + " user_cate = get_user_cate_feature(start_date, end_date, all_actions)\n", + " print('get_user_cate_feature finished')\n", " \n", + " product_acc = get_accumulate_product_feat(start_days, end_date, all_actions)\n", + " print('get_accumulate_product_feat finsihed')\n", + " cate_acc = get_accumulate_cate_feat(start_days, end_date, all_actions)\n", + " print('get_accumulate_cate_feat finsihed')\n", + " comment_acc = get_comments_product_feat(end_date)\n", + "\n", " actions = None\n", " for i in (3, 5, 7, 10, 15, 21, 30):\n", - " start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)\n", + " start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=i)\n", " start_days = start_days.strftime('%Y-%m-%d')\n", " if actions is None:\n", - " actions = get_action_feat(start_days, train_end_date, all_actions, i)\n", + " actions = get_action_feat(start_days, end_date, all_actions,i)\n", " else:\n", - " # 注意这里的拼接key\n", - " actions = pd.merge(actions, get_action_feat(start_days, train_end_date, all_actions, i), how='left',\n", + " actions = pd.merge(actions, get_action_feat(start_days, end_date, all_actions,i), how='left',\n", " on=['user_id', 'sku_id', 'cate'])\n", "\n", " actions = pd.merge(actions, user, how='left', on='user_id')\n", @@ -3142,269 +3137,34 @@ " actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n", " actions = pd.merge(actions, cate_acc, how='left', on='cate')\n", " actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n", - " actions = pd.merge(actions, labels, how='left', on=['user_id', 'sku_id'])\n", - " # 主要是填充拼接商品基本特征、评论特征、标签之后的空值\n", + "\n", " actions = actions.fillna(0)\n", - "# return actions\n", - " # 采样\n", - " action_postive = actions[actions['label'] == 1]\n", - " action_negative = actions[actions['label'] == 0]\n", - " del actions\n", - " neg_len = len(action_postive) * 10\n", - " action_negative = action_negative.sample(n=neg_len)\n", - " action_sample = pd.concat([action_postive, action_negative], ignore_index=True) \n", " \n", - " return action_sample" + " actions.to_csv(csv_path, index=False)" ] }, { "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [], - "source": [ - "def make_train_set(train_start_date, setNums ,f_path, all_actions):\n", - " train_actions = None\n", - " #all_actions = get_all_action()\n", - " #print (\"get all actions!\")\n", - " user = get_basic_user_feat()\n", - " print ('get_basic_user_feat finsihed')\n", - " product = get_basic_product_feat()\n", - " print ('get_basic_product_feat finsihed')\n", - " # 滑窗,构造多组训练集/验证集\n", - " for i in range(setNums):\n", - " print (train_start_date)\n", - " if train_actions is None:\n", - " train_actions = make_actions(user, product, all_actions, train_start_date)\n", - " else:\n", - " train_actions = pd.concat([train_actions, make_actions(user, product, all_actions, train_start_date)],\n", - " ignore_index=True)\n", - " # 接下来每次移动一天\n", - " train_start_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=1)\n", - " train_start_date = train_start_date.strftime('%Y-%m-%d')\n", - " print (\"round {0}/{1} over!\".format(i+1, setNums))\n", - "\n", - " train_actions.to_csv(f_path, index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, + "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "get all actions!\n", "get_basic_user_feat finsihed\n", "get_basic_product_feat finsihed\n", - "2016-02-01\n", - "2016-02-04\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "D:\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:6692: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n", - "of pandas will change to not sort by default.\n", - "\n", - "To accept the future behavior, pass 'sort=False'.\n", - "\n", - "To retain the current behavior and silence the warning, pass 'sort=True'.\n", - "\n", - " sort=sort)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "round 1/20 over!\n", - "2016-02-02\n", - "2016-02-05\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 2/20 over!\n", - "2016-02-03\n", - "2016-02-06\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 3/20 over!\n", - "2016-02-04\n", - "2016-02-07\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 4/20 over!\n", - "2016-02-05\n", - "2016-02-08\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 5/20 over!\n", - "2016-02-06\n", - "2016-02-09\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 6/20 over!\n", - "2016-02-07\n", - "2016-02-10\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 7/20 over!\n", - "2016-02-08\n", - "2016-02-11\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 8/20 over!\n", - "2016-02-09\n", - "2016-02-12\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 9/20 over!\n", - "2016-02-10\n", - "2016-02-13\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 10/20 over!\n", - "2016-02-11\n", - "2016-02-14\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 11/20 over!\n", - "2016-02-12\n", - "2016-02-15\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 12/20 over!\n", - "2016-02-13\n", - "2016-02-16\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 13/20 over!\n", - "2016-02-14\n", - "2016-02-17\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 14/20 over!\n", - "2016-02-15\n", - "2016-02-18\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 15/20 over!\n", - "2016-02-16\n", - "2016-02-19\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 16/20 over!\n", - "2016-02-17\n", - "2016-02-20\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 17/20 over!\n", - "2016-02-18\n", - "2016-02-21\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 18/20 over!\n", - "2016-02-19\n", - "2016-02-22\n", - "get_recent_user_feat finsihed\n", - "get_user_cate_feature finished\n", - "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 19/20 over!\n", - "2016-02-20\n", - "2016-02-23\n", - "get_recent_user_feat finsihed\n", + "get_accumulate_user_feat finsihed\n", "get_user_cate_feature finished\n", "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n", - "get labels\n", - "round 20/20 over!\n" + "get_accumulate_cate_feat finsihed\n" ] } ], "source": [ "# 训练集\n", - "train_start_date = '2016-02-01'\n", - "make_train_set(train_start_date, 20, 'data/train_set.csv',all_actions)" + "make_set('2016-02-01', '2016-03-30', 'data/train_set.csv')" ] }, { @@ -3416,75 +3176,7 @@ }, { "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "def make_val_answer(val_start_date, val_end_date, all_actions, label_val_s1_path):\n", - " actions = get_actions(val_start_date, val_end_date,all_actions)\n", - " actions = actions[(actions['type'] == 4) & (actions['cate'] == 8)]\n", - " actions = actions[['user_id', 'sku_id']]\n", - " actions = actions.drop_duplicates()\n", - " actions.to_csv(label_val_s1_path, index=False)\n", - "\n", - "def make_val_set(train_start_date, train_end_date, val_s1_path):\n", - " # 修改时间跨度\n", - " start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n", - " start_days = start_days.strftime('%Y-%m-%d')\n", - " all_actions = get_all_action()\n", - " print (\"get all actions!\")\n", - " user = get_basic_user_feat()\n", - " print ('get_basic_user_feat finsihed')\n", - " \n", - " product = get_basic_product_feat()\n", - " print ('get_basic_product_feat finsihed')\n", - " user_acc = get_recent_user_feat(train_end_date, all_actions)\n", - " print ('get_recent_user_feat finsihed')\n", - " user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n", - " print ('get_user_cate_feature finished')\n", - " \n", - " product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n", - " print ('get_accumulate_product_feat finsihed')\n", - " cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n", - " print ('get_accumulate_cate_feat finsihed')\n", - " comment_acc = get_comments_product_feat(train_end_date)\n", - " print ('get_comments_product_feat finished')\n", - " \n", - " actions = None\n", - " for i in (3, 5, 7, 10, 15, 21, 30):\n", - " start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)\n", - " start_days = start_days.strftime('%Y-%m-%d')\n", - " if actions is None:\n", - " actions = get_action_feat(start_days, train_end_date, all_actions,i)\n", - " else:\n", - " actions = pd.merge(actions, get_action_feat(start_days, train_end_date,all_actions,i), how='left',\n", - " on=['user_id', 'sku_id', 'cate'])\n", - "\n", - " actions = pd.merge(actions, user, how='left', on='user_id')\n", - " actions = pd.merge(actions, user_acc, how='left', on='user_id')\n", - "# actions = pd.merge(actions, user_cate, how='left', on='user_id')\n", - " actions.append(user_cate)\n", - " # 注意这里的拼接key\n", - " actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])\n", - " actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n", - " actions = pd.merge(actions, cate_acc, how='left', on='cate')\n", - " actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n", - " actions = actions.fillna(0)\n", - " \n", - " \n", - "# print actions\n", - " # 构造真实用户购买情况作为后续验证\n", - " val_start_date = train_end_date\n", - " val_end_date = datetime.strptime(val_start_date, '%Y-%m-%d') + timedelta(days=5)\n", - " val_end_date = val_end_date.strftime('%Y-%m-%d')\n", - " make_val_answer(val_start_date, val_end_date, all_actions, val_s1_path)\n", - " \n", - " actions.to_csv(val_s1_path, index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, + "execution_count": 53, "metadata": {}, "outputs": [ { @@ -3494,76 +3186,21 @@ "get all actions!\n", "get_basic_user_feat finsihed\n", "get_basic_product_feat finsihed\n", - "get_recent_user_feat finsihed\n", + "get_accumulate_user_feat finsihed\n", "get_user_cate_feature finished\n", "get_accumulate_product_feat finsihed\n", - "get_accumulate_cate_feat finsihed\n", - "get_comments_product_feat finished\n" + "get_accumulate_cate_feat finsihed\n" ] } ], "source": [ "# 验证集\n", - "make_val_set('2016-02-23', '2016-02-26', 'data/val_set.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [], - "source": [ - "def make_test_set(train_start_date, train_end_date):\n", - " start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n", - " start_days = start_days.strftime('%Y-%m-%d')\n", - " all_actions = get_all_action()\n", - " print(\"get all actions!\")\n", - " user = get_basic_user_feat()\n", - " print('get_basic_user_feat finsihed')\n", - " product = get_basic_product_feat()\n", - " print('get_basic_product_feat finsihed')\n", - " \n", - " user_acc = get_recent_user_feat(train_end_date, all_actions)\n", - " print('get_accumulate_user_feat finsihed')\n", - " \n", - " user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n", - " print('get_user_cate_feature finished')\n", - " \n", - " product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n", - " print('get_accumulate_product_feat finsihed')\n", - " cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n", - " print('get_accumulate_cate_feat finsihed')\n", - " comment_acc = get_comments_product_feat(train_end_date)\n", - "\n", - " actions = None\n", - " for i in (3, 5, 7, 10, 15, 21, 30):\n", - " start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)\n", - " start_days = start_days.strftime('%Y-%m-%d')\n", - " if actions is None:\n", - " actions = get_action_feat(start_days, train_end_date, all_actions,i)\n", - " else:\n", - " actions = pd.merge(actions, get_action_feat(start_days, train_end_date,all_actions,i), how='left',\n", - " on=['user_id', 'sku_id', 'cate'])\n", - "\n", - " actions = pd.merge(actions, user, how='left', on='user_id')\n", - " actions = pd.merge(actions, user_acc, how='left', on='user_id')\n", - "# actions = pd.merge(actions, user_cate, how='left', on='user_id')\n", - " actions.append(user_cate)\n", - " # 注意这里的拼接key\n", - " actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])\n", - " actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n", - " actions = pd.merge(actions, cate_acc, how='left', on='cate')\n", - " actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n", - "\n", - " actions = actions.fillna(0)\n", - " \n", - "\n", - " actions.to_csv(\"data/test_set.csv\", index=False)" + "make_set('2016-04-01', '2016-04-10', 'data/val_set.csv')" ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 54, "metadata": {}, "outputs": [ { @@ -3582,9 +3219,7 @@ ], "source": [ "# 预测结果\n", - "sub_start_date = '2016-04-13'\n", - "sub_end_date = '2016-04-16'\n", - "make_test_set(sub_start_date, sub_end_date)" + "make_set('2016-04-11', '2016-04-16', 'data/test_set.csv')" ] }, {