Fix the feature engineering of users and products

5 years ago · f1f12a5b08
parent 87cef42d55
commit f1f12a5b08
1 changed files with 35 additions and 400 deletions
--- a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程.ipynb
@ -3092,45 +3092,40 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
-    "def make_actions(user, product, all_actions, train_start_date):\n",
-    "    train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)\n",
-    "    train_end_date = train_end_date.strftime('%Y-%m-%d')\n",
-    "    # 修正prod_acc,cate_acc的时间跨度\n",
-    "    start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n",
+    "def make_set(start_date, end_date, csv_path):\n",
+    "    start_days = datetime.strptime(start_date, '%Y-%m-%d') - timedelta(days=30)\n",
    "    start_days = start_days.strftime('%Y-%m-%d')\n",
-    "    print (train_end_date)\n",
-    "    user_acc = get_recent_user_feat(train_end_date, all_actions)\n",
-    "    print ('get_recent_user_feat finsihed')\n",
+    "    all_actions = get_all_action()\n",
+    "    print(\"get all actions!\")\n",
+    "    user = get_basic_user_feat()\n",
+    "    print('get_basic_user_feat finsihed')\n",
+    "    product = get_basic_product_feat()\n",
+    "    print('get_basic_product_feat finsihed')\n",
+    "    \n",
+    "    user_acc = get_recent_user_feat(end_date, all_actions)\n",
+    "    print('get_accumulate_user_feat finsihed')\n",
    "    \n",
-    "    user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n",
+    "    user_cate = get_user_cate_feature(start_date, end_date, all_actions)\n",
    "    print('get_user_cate_feature finished')\n",
    "    \n",
-    "    product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n",
+    "    product_acc = get_accumulate_product_feat(start_days, end_date, all_actions)\n",
    "    print('get_accumulate_product_feat finsihed')\n",
-    "    cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n",
+    "    cate_acc = get_accumulate_cate_feat(start_days, end_date, all_actions)\n",
    "    print('get_accumulate_cate_feat finsihed')\n",
-    "    comment_acc = get_comments_product_feat(train_end_date)\n",
-    "    print ('get_comments_product_feat finished')\n",
-    "    # 标记\n",
-    "    test_start_date = train_end_date\n",
-    "    test_end_date = datetime.strptime(test_start_date, '%Y-%m-%d') + timedelta(days=5)\n",
-    "    test_end_date = test_end_date.strftime('%Y-%m-%d')\n",
-    "    labels = get_labels(test_start_date, test_end_date, all_actions)\n",
-    "    print (\"get labels\")\n",
+    "    comment_acc = get_comments_product_feat(end_date)\n",
    "\n",
    "    actions = None\n",
    "    for i in (3, 5, 7, 10, 15, 21, 30):\n",
-    "        start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)\n",
+    "        start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=i)\n",
    "        start_days = start_days.strftime('%Y-%m-%d')\n",
    "        if actions is None:\n",
-    "            actions = get_action_feat(start_days, train_end_date, all_actions, i)\n",
+    "            actions = get_action_feat(start_days, end_date, all_actions,i)\n",
    "        else:\n",
-    "            # 注意这里的拼接key\n",
-    "            actions = pd.merge(actions, get_action_feat(start_days, train_end_date, all_actions, i), how='left',\n",
+    "            actions = pd.merge(actions, get_action_feat(start_days, end_date, all_actions,i), how='left',\n",
    "                               on=['user_id', 'sku_id', 'cate'])\n",
    "\n",
    "    actions = pd.merge(actions, user, how='left', on='user_id')\n",
@ -3142,269 +3137,34 @@
    "    actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n",
    "    actions = pd.merge(actions, cate_acc, how='left', on='cate')\n",
    "    actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n",
-    "    actions = pd.merge(actions, labels, how='left', on=['user_id', 'sku_id'])\n",
-    "    # 主要是填充拼接商品基本特征、评论特征、标签之后的空值\n",
-    "    actions = actions.fillna(0)\n",
-    "#     return actions\n",
-    "    # 采样\n",
-    "    action_postive = actions[actions['label'] == 1]\n",
-    "    action_negative = actions[actions['label'] == 0]\n",
-    "    del actions\n",
-    "    neg_len = len(action_postive) * 10\n",
-    "    action_negative = action_negative.sample(n=neg_len)\n",
-    "    action_sample = pd.concat([action_postive, action_negative], ignore_index=True)    \n",
    "\n",
-    "    return action_sample"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 44,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def make_train_set(train_start_date, setNums ,f_path, all_actions):\n",
-    "    train_actions = None\n",
-    "    #all_actions = get_all_action()\n",
-    "    #print (\"get all actions!\")\n",
-    "    user = get_basic_user_feat()\n",
-    "    print ('get_basic_user_feat finsihed')\n",
-    "    product = get_basic_product_feat()\n",
-    "    print ('get_basic_product_feat finsihed')\n",
-    "    # 滑窗,构造多组训练集/验证集\n",
-    "    for i in range(setNums):\n",
-    "        print (train_start_date)\n",
-    "        if train_actions is None:\n",
-    "            train_actions = make_actions(user, product, all_actions, train_start_date)\n",
-    "        else:\n",
-    "            train_actions = pd.concat([train_actions, make_actions(user, product, all_actions, train_start_date)],\n",
-    "                                          ignore_index=True)\n",
-    "        # 接下来每次移动一天\n",
-    "        train_start_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=1)\n",
-    "        train_start_date = train_start_date.strftime('%Y-%m-%d')\n",
-    "        print (\"round {0}/{1} over!\".format(i+1, setNums))\n",
+    "    actions = actions.fillna(0)\n",
    "    \n",
-    "    train_actions.to_csv(f_path, index=False)"
+    "    actions.to_csv(csv_path, index=False)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
+      "get all actions!\n",
      "get_basic_user_feat finsihed\n",
      "get_basic_product_feat finsihed\n",
-      "2016-02-01\n",
-      "2016-02-04\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "D:\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:6692: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
-      "of pandas will change to not sort by default.\n",
-      "\n",
-      "To accept the future behavior, pass 'sort=False'.\n",
-      "\n",
-      "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
-      "\n",
-      "  sort=sort)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "round 1/20 over!\n",
-      "2016-02-02\n",
-      "2016-02-05\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 2/20 over!\n",
-      "2016-02-03\n",
-      "2016-02-06\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 3/20 over!\n",
-      "2016-02-04\n",
-      "2016-02-07\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 4/20 over!\n",
-      "2016-02-05\n",
-      "2016-02-08\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 5/20 over!\n",
-      "2016-02-06\n",
-      "2016-02-09\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 6/20 over!\n",
-      "2016-02-07\n",
-      "2016-02-10\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 7/20 over!\n",
-      "2016-02-08\n",
-      "2016-02-11\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 8/20 over!\n",
-      "2016-02-09\n",
-      "2016-02-12\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 9/20 over!\n",
-      "2016-02-10\n",
-      "2016-02-13\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 10/20 over!\n",
-      "2016-02-11\n",
-      "2016-02-14\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 11/20 over!\n",
-      "2016-02-12\n",
-      "2016-02-15\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 12/20 over!\n",
-      "2016-02-13\n",
-      "2016-02-16\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 13/20 over!\n",
-      "2016-02-14\n",
-      "2016-02-17\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 14/20 over!\n",
-      "2016-02-15\n",
-      "2016-02-18\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 15/20 over!\n",
-      "2016-02-16\n",
-      "2016-02-19\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 16/20 over!\n",
-      "2016-02-17\n",
-      "2016-02-20\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 17/20 over!\n",
-      "2016-02-18\n",
-      "2016-02-21\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 18/20 over!\n",
-      "2016-02-19\n",
-      "2016-02-22\n",
-      "get_recent_user_feat finsihed\n",
-      "get_user_cate_feature finished\n",
-      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 19/20 over!\n",
-      "2016-02-20\n",
-      "2016-02-23\n",
-      "get_recent_user_feat finsihed\n",
+      "get_accumulate_user_feat finsihed\n",
      "get_user_cate_feature finished\n",
      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n",
-      "get labels\n",
-      "round 20/20 over!\n"
+      "get_accumulate_cate_feat finsihed\n"
     ]
    }
   ],
   "source": [
    "# 训练集\n",
-    "train_start_date = '2016-02-01'\n",
-    "make_train_set(train_start_date, 20, 'data/train_set.csv',all_actions)"
+    "make_set('2016-02-01', '2016-03-30', 'data/train_set.csv')"
   ]
  },
  {
@ -3416,75 +3176,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 46,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def make_val_answer(val_start_date, val_end_date, all_actions, label_val_s1_path):\n",
-    "    actions = get_actions(val_start_date, val_end_date,all_actions)\n",
-    "    actions = actions[(actions['type'] == 4) & (actions['cate'] == 8)]\n",
-    "    actions = actions[['user_id', 'sku_id']]\n",
-    "    actions = actions.drop_duplicates()\n",
-    "    actions.to_csv(label_val_s1_path, index=False)\n",
-    "\n",
-    "def make_val_set(train_start_date, train_end_date, val_s1_path):\n",
-    "    # 修改时间跨度\n",
-    "    start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n",
-    "    start_days = start_days.strftime('%Y-%m-%d')\n",
-    "    all_actions = get_all_action()\n",
-    "    print (\"get all actions!\")\n",
-    "    user = get_basic_user_feat()\n",
-    "    print ('get_basic_user_feat finsihed')\n",
-    "    \n",
-    "    product = get_basic_product_feat()\n",
-    "    print ('get_basic_product_feat finsihed')\n",
-    "    user_acc = get_recent_user_feat(train_end_date, all_actions)\n",
-    "    print ('get_recent_user_feat finsihed')\n",
-    "    user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n",
-    "    print ('get_user_cate_feature finished')\n",
-    " \n",
-    "    product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n",
-    "    print ('get_accumulate_product_feat finsihed')\n",
-    "    cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n",
-    "    print ('get_accumulate_cate_feat finsihed')\n",
-    "    comment_acc = get_comments_product_feat(train_end_date)\n",
-    "    print ('get_comments_product_feat finished')\n",
-    "    \n",
-    "    actions = None\n",
-    "    for i in (3, 5, 7, 10, 15, 21, 30):\n",
-    "        start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)\n",
-    "        start_days = start_days.strftime('%Y-%m-%d')\n",
-    "        if actions is None:\n",
-    "            actions = get_action_feat(start_days, train_end_date, all_actions,i)\n",
-    "        else:\n",
-    "            actions = pd.merge(actions, get_action_feat(start_days, train_end_date,all_actions,i), how='left',\n",
-    "                               on=['user_id', 'sku_id', 'cate'])\n",
-    "\n",
-    "    actions = pd.merge(actions, user, how='left', on='user_id')\n",
-    "    actions = pd.merge(actions, user_acc, how='left', on='user_id')\n",
-    "#     actions = pd.merge(actions, user_cate, how='left', on='user_id')\n",
-    "    actions.append(user_cate)\n",
-    "    # 注意这里的拼接key\n",
-    "    actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])\n",
-    "    actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n",
-    "    actions = pd.merge(actions, cate_acc, how='left', on='cate')\n",
-    "    actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n",
-    "    actions = actions.fillna(0)\n",
-    "   \n",
-    "    \n",
-    "#     print actions\n",
-    "    # 构造真实用户购买情况作为后续验证\n",
-    "    val_start_date = train_end_date\n",
-    "    val_end_date = datetime.strptime(val_start_date, '%Y-%m-%d') + timedelta(days=5)\n",
-    "    val_end_date = val_end_date.strftime('%Y-%m-%d')\n",
-    "    make_val_answer(val_start_date, val_end_date, all_actions, val_s1_path)\n",
-    "    \n",
-    "    actions.to_csv(val_s1_path, index=False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
@ -3494,76 +3186,21 @@
      "get all actions!\n",
      "get_basic_user_feat finsihed\n",
      "get_basic_product_feat finsihed\n",
-      "get_recent_user_feat finsihed\n",
+      "get_accumulate_user_feat finsihed\n",
      "get_user_cate_feature finished\n",
      "get_accumulate_product_feat finsihed\n",
-      "get_accumulate_cate_feat finsihed\n",
-      "get_comments_product_feat finished\n"
+      "get_accumulate_cate_feat finsihed\n"
     ]
    }
   ],
   "source": [
    "# 验证集\n",
-    "make_val_set('2016-02-23', '2016-02-26', 'data/val_set.csv')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 49,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def make_test_set(train_start_date, train_end_date):\n",
-    "    start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n",
-    "    start_days = start_days.strftime('%Y-%m-%d')\n",
-    "    all_actions = get_all_action()\n",
-    "    print(\"get all actions!\")\n",
-    "    user = get_basic_user_feat()\n",
-    "    print('get_basic_user_feat finsihed')\n",
-    "    product = get_basic_product_feat()\n",
-    "    print('get_basic_product_feat finsihed')\n",
-    "    \n",
-    "    user_acc = get_recent_user_feat(train_end_date, all_actions)\n",
-    "    print('get_accumulate_user_feat finsihed')\n",
-    "    \n",
-    "    user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n",
-    "    print('get_user_cate_feature finished')\n",
-    "    \n",
-    "    product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n",
-    "    print('get_accumulate_product_feat finsihed')\n",
-    "    cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n",
-    "    print('get_accumulate_cate_feat finsihed')\n",
-    "    comment_acc = get_comments_product_feat(train_end_date)\n",
-    "\n",
-    "    actions = None\n",
-    "    for i in (3, 5, 7, 10, 15, 21, 30):\n",
-    "        start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)\n",
-    "        start_days = start_days.strftime('%Y-%m-%d')\n",
-    "        if actions is None:\n",
-    "            actions = get_action_feat(start_days, train_end_date, all_actions,i)\n",
-    "        else:\n",
-    "            actions = pd.merge(actions, get_action_feat(start_days, train_end_date,all_actions,i), how='left',\n",
-    "                               on=['user_id', 'sku_id', 'cate'])\n",
-    "\n",
-    "    actions = pd.merge(actions, user, how='left', on='user_id')\n",
-    "    actions = pd.merge(actions, user_acc, how='left', on='user_id')\n",
-    "#     actions = pd.merge(actions, user_cate, how='left', on='user_id')\n",
-    "    actions.append(user_cate)\n",
-    "    # 注意这里的拼接key\n",
-    "    actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])\n",
-    "    actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n",
-    "    actions = pd.merge(actions, cate_acc, how='left', on='cate')\n",
-    "    actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n",
-    "\n",
-    "    actions = actions.fillna(0)\n",
-    "    \n",
-    "\n",
-    "    actions.to_csv(\"data/test_set.csv\", index=False)"
+    "make_set('2016-04-01', '2016-04-10', 'data/val_set.csv')"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
@ -3582,9 +3219,7 @@
   ],
   "source": [
    "# 预测结果\n",
-    "sub_start_date = '2016-04-13'\n",
-    "sub_end_date = '2016-04-16'\n",
-    "make_test_set(sub_start_date, sub_end_date)"
+    "make_set('2016-04-11', '2016-04-16', 'data/test_set.csv')"
   ]
  },
  {