From 280a75e0feb60f1bdbf82a9ff94cfd2f7435c222 Mon Sep 17 00:00:00 2001 From: benjas <909336740@qq.com> Date: Mon, 8 Feb 2021 22:27:50 +0800 Subject: [PATCH] Update some methods --- .../3-特征工程 - 副本.ipynb | 3619 +++++++++++++++++ 1 file changed, 3619 insertions(+) create mode 100644 机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程 - 副本.ipynb diff --git a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程 - 副本.ipynb b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程 - 副本.ipynb new file mode 100644 index 0000000..035fe39 --- /dev/null +++ b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程 - 副本.ipynb @@ -0,0 +1,3619 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3 - Feature Engineering" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Basic user features:\n", + "* Extract the basic user features. Since the user's own attributes are mostly categorical, one-hot encode age, sex and usr_lv_cd; the user registration time is left unprocessed for now.\n", + "\n", + "Basic product features:\n", + "* Extract the basic features from the product file\n", + "* One-hot encode the attributes a1, a2 and a3\n", + "* Use the product category and brand directly as features; different brands carry different influence and purchasing power\n", + "\n", + "Comment features:\n", + "* Split by time window\n", + "* One-hot encode the comment count: 0 means no comments, 1 means exactly 1 comment, 2 means 2-10 comments, 3 means 11-50 comments, 4 means more than 50 comments; the levels 0~4 are one-hot encoded\n", + "\n", + "Behavior features:\n", + "* Split by time window; in general, the most recent actions influence the future most clearly\n", + "* One-hot encode the action types: levels 1~6\n", + "* Group by user-category and by user-category-item, then compute\n", + "* the user's action counts on the other items in the same category\n", + "* accumulated action counts over different windows (3, 5, 7, 10, 15, 21, 30 days)\n", + "\n", + "Accumulated user features:\n", + "* Split by time window\n", + "* For each user action type\n", + "* purchase conversion rate\n", + "* mean; as above, over different windows: 3 days, 5 days, ...\n", + "\n", + "Recent user behavior features:\n", + "* On top of the accumulated user features above, extract the user's features over the last month and the last three days, then compute the share of the month's behavior that remains once the most recent three days are removed\n", + "\n", + "User behavior across items of the same category:\n", + "* Counts of each action type per category for each user\n", + "* The share of each category's action counts in the user's action counts over all categories\n", + "\n", + "Accumulated product features:\n", + "\n", + "* Split by time window\n", + "* For each action type on the product\n", + "* purchase conversion rate\n", + "* mean\n", + "\n", + "Category features:\n", + "* For each product category, per time window\n", + "* purchase conversion rate\n", + "* mean" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from datetime import datetime\n", + "from datetime import timedelta\n", + "import pandas as pd\n", + "import pickle\n", + "import os\n", + "import math\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 11485424 entries, 0 to 11485423\n", + "Data columns (total 7 columns):\n", + "user_id float32\n", + "sku_id float32\n", + "time object\n", + "model_id float32\n", + "type float32\n", + "cate float32\n", + "brand float32\n", + "dtypes: float32(6), object(1)\n", + "memory usage: 350.5+ MB\n" + ] + } + ], + "source": [ + "# float32 lowers the memory footprint\n", + "test = pd.read_csv('data/JData_Action_201602.csv')\n", + "test[['user_id','sku_id','model_id','type','cate','brand']] = test[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n", + "test.dtypes\n", + "test.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 11485424 entries, 0 to 11485423\n", + "Data columns (total 7 columns):\n", + "user_id int64\n", + "sku_id int64\n", + "time object\n", + "model_id float64\n", + "type int64\n", + "cate int64\n", + "brand int64\n", + "dtypes: float64(1), int64(5), object(1)\n", + "memory usage: 613.4+ MB\n" + ] + } + ], + "source": [ + "# without converting to float32\n", + "test = pd.read_csv('data/JData_Action_201602.csv')\n", + "# test[['user_id','sku_id','model_id','type','cate','brand']] = test[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n", + "test.dtypes\n", + "test.info()" + ] + },
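 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A minimal sketch of an alternative (not in the original notebook): `pd.read_csv` accepts a `dtype` mapping, so the columns can be loaded as `float32` directly instead of being cast afterwards. Column names as above; `model_id` has to stay floating point because it contains NaN." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# hedged sketch: downcast at read time rather than after the fact\n", + "float32_cols = {c: 'float32' for c in ['user_id', 'sku_id', 'model_id', 'type', 'cate', 'brand']}\n", + "test = pd.read_csv('data/JData_Action_201602.csv', dtype=float32_cols)\n", + "test.info() # should report roughly the same ~350 MB as the astype version" + ] + },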
"cell_type": "markdown", + "metadata": {}, + "source": [ + "可以明显看到`int64`使用的memory usage: 613.4+ MB,比`float32`的多了近一倍的内存使用" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# 数据路径\n", + "action_1_path = r'data/JData_Action_201602.csv'\n", + "action_2_path = r'data/JData_Action_201603.csv'\n", + "action_3_path = r'data/JData_Action_201604.csv'\n", + " \n", + "comment_path = r'data/JData_Comment.csv'\n", + "product_path = r'data/JData_Product.csv'\n", + "user_path = r'data/JData_User.csv'\n", + " \n", + "comment_date = [\n", + " \"2016-02-01\", \"2016-02-08\", \"2016-02-15\", \"2016-02-22\", \"2016-02-29\",\n", + " \"2016-03-07\", \"2016-03-14\", \"2016-03-21\", \"2016-03-28\", \"2016-04-04\",\n", + " \"2016-04-11\", \"2016-04-15\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# 基本方法\n", + "def get_actions_0():\n", + " action = pd.read_csv(action_0_path)\n", + " return action\n", + " \n", + "def get_actions_1():\n", + " action = pd.read_csv(action_1_path)\n", + " action[['user_id','sku_id','model_id','type','cate','brand']] = action[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n", + " return action\n", + "\n", + "def get_actions_2():\n", + " action = pd.read_csv(action_2_path)\n", + " action[['user_id','sku_id','model_id','type','cate','brand']] = action[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n", + " \n", + " return action\n", + "def get_actions_3():\n", + " action = pd.read_csv(action_3_path)\n", + " action[['user_id','sku_id','model_id','type','cate','brand']] = action[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n", + " \n", + " return action\n", + " \n", + "#如果电脑性能好就不用分块\n", + "def get_actions_10():\n", + " \n", + " reader = pd.read_csv(action_1_path, iterator=True)\n", + " reader[['user_id','sku_id','model_id','type','cate','brand']] = reader[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n", + " chunks = []\n", + " loop = True\n", + " while loop:\n", + " try:\n", + " chunk = reader.get_chunk(50000)\n", + " chunks.append(chunk)\n", + " except StopIteration:\n", + " loop = False\n", + " print(\"Iteration is stopped\")\n", + " action = pd.concat(chunks, ignore_index=True)\n", + " \n", + " return action\n", + "def get_actions_20():\n", + " \n", + " reader = pd.read_csv(action_2_path, iterator=True)\n", + " reader[['user_id','sku_id','model_id','type','cate','brand']] = reader[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n", + " chunks = []\n", + " loop = True\n", + " while loop:\n", + " try:\n", + " chunk = reader.get_chunk(50000)\n", + " chunks.append(chunk)\n", + " except StopIteration:\n", + " loop = False\n", + " print(\"Iteration is stopped\")\n", + " action = pd.concat(chunks, ignore_index=True)\n", + " \n", + " return action\n", + "def get_actions_30():\n", + " \n", + " reader = pd.read_csv(action_3_path, iterator=True)\n", + " reader[['user_id','sku_id','model_id','type','cate','brand']] = reader[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n", + " chunks = []\n", + " loop = True\n", + " while loop:\n", + " try:\n", + " chunk = reader.get_chunk(50000)\n", + " chunks.append(chunk)\n", + " except StopIteration:\n", + " loop = False\n", + " print(\"Iteration is stopped\")\n", + " action = pd.concat(chunks, ignore_index=True)\n", + " \n", + " return action\n", + "\n", + "# 
+ "# Read and concatenate all the action record files\n", + "def get_all_action():\n", + " action_1 = get_actions_1()\n", + " action_2 = get_actions_2()\n", + " action_3 = get_actions_3()\n", + " actions = pd.concat([action_1, action_2, action_3]) # type: pd.DataFrame\n", + " \n", + " return actions\n", + " \n", + "# Get the action records inside a given time window\n", + "def get_actions(start_date, end_date, all_actions):\n", + " \"\"\"\n", + " :param start_date:\n", + " :param end_date:\n", + " :return: actions: pd.DataFrame\n", + " \"\"\"\n", + " actions = all_actions[(all_actions.time >= start_date) & (all_actions.time < end_date)].copy()\n", + " return actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### User Features\n", + "### Basic user features\n", + "Extract the basic user features. Since the user's own attributes are mostly categorical, one-hot encode age, sex and usr_lv_cd; the user registration time is left unprocessed for now" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import preprocessing\n", + " \n", + "def get_basic_user_feat():\n", + " # Handle the Chinese characters in the age column: read with the right encoding, drop missing rows, turn the labels numeric, then one-hot encode; sex is also cast to a numeric type\n", + " user = pd.read_csv(user_path, encoding='gbk')\n", + " # axis=0/1 = rows/columns containing missing values; how=any/all = drop if any value is missing / only if all are; inplace = modify the original frame\n", + " user.dropna(axis=0, how='any',inplace=True)\n", + " user['sex'] = user['sex'].astype(int) \n", + " user['age'] = user['age'].astype(int)\n", + " le = preprocessing.LabelEncoder()\n", + " age_df = le.fit_transform(user['age']) # normalize the labels to 0,1,2,3...\n", + " \n", + " age_df = pd.get_dummies(age_df, prefix='age') # turn the encoded values into 0/1 one-hot columns\n", + "# e.g. the original age values [0,1,2,3] become:\n", + "# age_0 [1,0,0,0]\n", + "# age_1 [0,1,0,0]\n", + "# age_2 [0,0,1,0]\n", + "# age_3 [0,0,0,1]; a cell is 1 where the value matches and 0 everywhere else, and column name + value = new column name\n", + " sex_df = pd.get_dummies(user['sex'], prefix='sex')\n", + " user_lv_df = pd.get_dummies(user['user_lv_cd'], prefix='user_lv_cd')\n", + " user = pd.concat([user['user_id'], age_df, sex_df, user_lv_df], axis=1) # merge\n", + " return user" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "user_id False\n", + "age True\n", + "sex True\n", + "user_lv_cd False\n", + "user_reg_tm True\n", + "dtype: bool" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user = pd.read_csv(user_path, encoding='gbk')\n", + "user.isnull().any() # check for missing values; True means the column has some" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idagesexuser_lv_cduser_reg_tm
34072234073NaNNaN1NaN
34072234073NaNNaN1NaN
34072234073NaNNaN1NaN
38905238906NaNNaN1NaN
38905238906NaNNaN1NaN
38905238906NaNNaN1NaN
67704267705NaNNaN1NaN
67704267705NaNNaN1NaN
67704267705NaNNaN1NaN
\n", + "
" + ], + "text/plain": [ + " user_id age sex user_lv_cd user_reg_tm\n", + "34072 234073 NaN NaN 1 NaN\n", + "34072 234073 NaN NaN 1 NaN\n", + "34072 234073 NaN NaN 1 NaN\n", + "38905 238906 NaN NaN 1 NaN\n", + "38905 238906 NaN NaN 1 NaN\n", + "38905 238906 NaN NaN 1 NaN\n", + "67704 267705 NaN NaN 1 NaN\n", + "67704 267705 NaN NaN 1 NaN\n", + "67704 267705 NaN NaN 1 NaN" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user[user.isnull().values==True] # 查看空值的部分" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "这些数据不仅没有年龄、性别、注册时间,数据只有9条,比较少,我们直接删除" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "user_id False\n", + "age False\n", + "sex False\n", + "user_lv_cd False\n", + "user_reg_tm False\n", + "dtype: bool" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user.dropna(axis=0, how='any',inplace=True)\n", + "user.isnull().any() # 判断是否有空值,已经全部为False" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 商品特征\n", + "### 商品基本特征\n", + "根据商品文件获取基本的特征,针对属性a1,a2,a3进行独热编码,商品类别和品牌直接作为特征" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def get_basic_product_feat():\n", + " product = pd.read_csv(product_path)\n", + " attr1_df = pd.get_dummies(product[\"a1\"], prefix=\"a1\")\n", + " attr2_df = pd.get_dummies(product[\"a2\"], prefix=\"a2\")\n", + " attr3_df = pd.get_dummies(product[\"a3\"], prefix=\"a3\")\n", + " product = pd.concat([product[['sku_id', 'cate', 'brand']], attr1_df, attr2_df, attr3_df], axis=1)\n", + " return product" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 评论特征\n", + "* 分时间段\n", + "* 对评论数进行独热编码" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def get_comments_product_feat(end_date):\n", + " comments = pd.read_csv(comment_path)\n", + " comment_date_end = end_date\n", + " comment_date_begin = comment_date[0]\n", + " for date in reversed(comment_date):\n", + " if date < comment_date_end:\n", + " comment_date_begin = date\n", + " break\n", + " comments = comments[comments.dt==comment_date_begin]\n", + " df = pd.get_dummies(comments['comment_num'], prefix='comment_num')\n", + " # 为了防止某个时间段不具备评论数为0的情况(测试集出现过这种情况)\n", + " for i in range(0, 5):\n", + " if 'comment_num_' + str(i) not in df.columns:\n", + " df['comment_num_' + str(i)] = 0\n", + " df = df[['comment_num_0', 'comment_num_1', 'comment_num_2', 'comment_num_3', 'comment_num_4']]\n", + " \n", + " comments = pd.concat([comments, df], axis=1) # type: pd.DataFrame\n", + " #del comments['dt']\n", + " #del comments['comment_num']\n", + " comments = comments[['sku_id', 'has_bad_comment', 'bad_comment_rate','comment_num_0', 'comment_num_1', \n", + " 'comment_num_2', 'comment_num_3', 'comment_num_4']]\n", + " return comments" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "train_start_date = '2016-02-01'\n", + "train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)\n", + "train_end_date = train_end_date.strftime('%Y-%m-%d')\n", + "day = 3\n", + " \n", + "start_date = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=day)\n", + "start_date = start_date.strftime('%Y-%m-%d')" + ] + }, + { + "cell_type": "code", + 
"execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sku_idhas_bad_commentbad_comment_ratecomment_num_0comment_num_1comment_num_2comment_num_3comment_num_4
0100010.041700010
11000000.000000100
210001110.037600001
310001800.000000010
410002000.000000010
\n", + "
" + ], + "text/plain": [ + " sku_id has_bad_comment bad_comment_rate comment_num_0 comment_num_1 \\\n", + "0 1000 1 0.0417 0 0 \n", + "1 10000 0 0.0000 0 0 \n", + "2 100011 1 0.0376 0 0 \n", + "3 100018 0 0.0000 0 0 \n", + "4 100020 0 0.0000 0 0 \n", + "\n", + " comment_num_2 comment_num_3 comment_num_4 \n", + "0 0 1 0 \n", + "1 1 0 0 \n", + "2 0 0 1 \n", + "3 0 1 0 \n", + "4 0 1 0 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comments = pd.read_csv(comment_path)\n", + "comment_date_end = train_end_date\n", + "comment_date_begin = comment_date[0]\n", + "for date in reversed(comment_date):\n", + " if date < comment_date_end:\n", + " comment_date_begin = date\n", + " break\n", + "comments = comments[comments.dt==comment_date_begin]\n", + "df = pd.get_dummies(comments['comment_num'], prefix='comment_num')\n", + "for i in range(0, 5):\n", + " if 'comment_num_' + str(i) not in df.columns:\n", + " df['comment_num_' + str(i)] = 0\n", + "df = df[['comment_num_0', 'comment_num_1', 'comment_num_2', 'comment_num_3', 'comment_num_4']]\n", + " \n", + "comments = pd.concat([comments, df], axis=1) # type: pd.DataFrame\n", + "comments = comments[['sku_id', 'has_bad_comment', 'bad_comment_rate','comment_num_0', 'comment_num_1', \n", + " 'comment_num_2', 'comment_num_3', 'comment_num_4']]\n", + "comments.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "comment_num_0/1/2/3/4分别对应着,0无评论/1表示1条/2表示2-10条/3表示11-50条/4表示大于50条,bad_comment_rate差评率,has_bad_comment是否包含差评。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 行为特征\n", + "* 分时间段\n", + "* 对行为类别进行独热编码\n", + "* 分别按照用户-类别行为分组和用户-类别-商品行为分组统计,然后计算\n", + " * 用户对同类别下其他商品的行为计数\n", + " * 针对用户对同类别下目标商品的行为计数与该时间段的行为均值作差" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def get_action_feat(start_date, end_date, all_actions, i):\n", + " actions = get_actions(start_date, end_date, all_actions)\n", + " actions = actions[['user_id', 'sku_id', 'cate','type']]\n", + " # 不同时间累积的行为计数(3,5,7,10,15,21,30)\n", + " df = pd.get_dummies(actions['type'], prefix='action_before_%s' %i)\n", + " before_date = 'action_before_%s' %i\n", + " actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame\n", + " # 分组统计,用户-类别-商品,不同用户对不同类别下商品的行为计数\n", + " actions = actions.groupby(['user_id', 'sku_id','cate'], as_index=False).sum()\n", + " # 分组统计,用户-类别,不同用户对不同商品类别的行为计数\n", + " user_cate = actions.groupby(['user_id','cate'], as_index=False).sum()\n", + " del user_cate['sku_id']\n", + " del user_cate['type']\n", + " actions = pd.merge(actions, user_cate, how='left', on=['user_id','cate'])\n", + " #本类别下其他商品点击量\n", + " # 前述两种分组含有相同名称的不同行为的计数,系统会自动针对名称调整添加后缀,x,y,所以这里作差统计的是同一类别下其他商品的行为计数\n", + " actions[before_date+'_1.0_y'] = actions[before_date+'_1.0_y'] - actions[before_date+'_1.0_x']\n", + " actions[before_date+'_2.0_y'] = actions[before_date+'_2.0_y'] - actions[before_date+'_2.0_x']\n", + " actions[before_date+'_3.0_y'] = actions[before_date+'_3.0_y'] - actions[before_date+'_3.0_x']\n", + " actions[before_date+'_4.0_y'] = actions[before_date+'_4.0_y'] - actions[before_date+'_4.0_x']\n", + " actions[before_date+'_5.0_y'] = actions[before_date+'_5.0_y'] - actions[before_date+'_5.0_x']\n", + " actions[before_date+'_6.0_y'] = actions[before_date+'_6.0_y'] - actions[before_date+'_6.0_x']\n", + " # 统计用户对不同类别下商品计数与该类别下商品行为计数均值(对时间)的差值\n", + " actions[before_date+'minus_mean_1'] = actions[before_date+'_1.0_x'] - 
 + " actions[before_date+'minus_mean_1'] = actions[before_date+'_1.0_x'] - (actions[before_date+'_1.0_x']/i)\n", + " actions[before_date+'minus_mean_2'] = actions[before_date+'_2.0_x'] - (actions[before_date+'_2.0_x']/i)\n", + " actions[before_date+'minus_mean_3'] = actions[before_date+'_3.0_x'] - (actions[before_date+'_3.0_x']/i)\n", + " actions[before_date+'minus_mean_4'] = actions[before_date+'_4.0_x'] - (actions[before_date+'_4.0_x']/i)\n", + " actions[before_date+'minus_mean_5'] = actions[before_date+'_5.0_x'] - (actions[before_date+'_5.0_x']/i)\n", + " actions[before_date+'minus_mean_6'] = actions[before_date+'_6.0_x'] - (actions[before_date+'_6.0_x']/i)\n", + " del actions['type']\n", + " \n", + " return actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### User-Behavior\n", + "#### Accumulated user features\n", + "* Split by time window\n", + "* For each user action type\n", + " * purchase conversion rate\n", + " * mean" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def get_accumulate_user_feat(end_date, all_actions, day):\n", + " start_date = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=day)\n", + " start_date = start_date.strftime('%Y-%m-%d')\n", + " before_date = 'user_action_%s' % day\n", + "\n", + " feature = [\n", + " 'user_id', before_date + '_1', before_date + '_2', before_date + '_3',\n", + " before_date + '_4', before_date + '_5', before_date + '_6',\n", + " before_date + '_1_ratio', before_date + '_2_ratio',\n", + " before_date + '_3_ratio', before_date + '_5_ratio',\n", + " before_date + '_6_ratio', before_date + '_1_mean',\n", + " before_date + '_2_mean', before_date + '_3_mean',\n", + " before_date + '_4_mean', before_date + '_5_mean',\n", + " before_date + '_6_mean', before_date + '_1_std',\n", + " before_date + '_2_std', before_date + '_3_std', before_date + '_4_std',\n", + " before_date + '_5_std', before_date + '_6_std'\n", + " ]\n", + "\n", + " actions = get_actions(start_date, end_date, all_actions)\n", + " df = pd.get_dummies(actions['type'], prefix=before_date)\n", + "\n", + " actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())\n", + "\n", + " actions = pd.concat([actions[['user_id', 'date']], df], axis=1)\n", + " \n", + " # group by user and compute each action type's conversion rate and mean\n", + " actions = actions.groupby(['user_id'], as_index=False).sum()\n", + "\n", + " actions[before_date + '_1_ratio'] = np.log(1 + actions[before_date + '_4.0']) - np.log(1 + actions[before_date +'_1.0'])\n", + " actions[before_date + '_2_ratio'] = np.log(1 + actions[before_date + '_4.0']) - np.log(1 + actions[before_date +'_2.0'])\n", + " actions[before_date + '_3_ratio'] = np.log(1 + actions[before_date + '_4.0']) - np.log(1 + actions[before_date +'_3.0'])\n", + " actions[before_date + '_5_ratio'] = np.log(1 + actions[before_date + '_4.0']) - np.log(1 + actions[before_date +'_5.0'])\n", + " actions[before_date + '_6_ratio'] = np.log(1 + actions[before_date + '_4.0']) - np.log(1 + actions[before_date +'_6.0'])\n", + " # means\n", + " actions[before_date + '_1_mean'] = actions[before_date + '_1.0'] / day\n", + " actions[before_date + '_2_mean'] = actions[before_date + '_2.0'] / day\n", + " actions[before_date + '_3_mean'] = actions[before_date + '_3.0'] / day\n", + " actions[before_date + '_4_mean'] = actions[before_date + '_4.0'] / day\n", + " actions[before_date + '_5_mean'] = actions[before_date + '_5.0'] / day\n", + " actions[before_date + '_6_mean'] = actions[before_date + '_6.0'] / day\n", + " #actions = pd.merge(actions, actions_date, how='left', on='user_id')\n", + " #actions = actions[feature]\n", + " return actions" + ] + },
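 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A minimal worked example of the log-ratio form used above (toy numbers, not part of the pipeline): `log(1 + buys) - log(1 + other)` equals `log((1 + buys) / (1 + other))`, a smoothed log conversion rate that stays finite even when a count is zero." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# purely illustrative counts; type 4 is the purchase action used throughout this notebook\n", + "demo = pd.DataFrame({'user_action_3_1.0': [10, 0, 4], 'user_action_3_4.0': [2, 1, 0]})\n", + "demo['user_action_3_1_ratio'] = np.log(1 + demo['user_action_3_4.0']) - np.log(1 + demo['user_action_3_1.0'])\n", + "demo # first row: log(3) - log(11) = -1.299; higher (less negative) means better conversion" + ] + },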
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户近期行为特征\n", + "在上面针对用户进行累积特征提取的基础上,分别提取用户近一个月、近三天的特征,然后提取一个月内用户除去最近三天的行为占据一个月的行为的比重" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def get_recent_user_feat(end_date, all_actions):\n", + " actions_3 = get_accumulate_user_feat(end_date, all_actions, 3)\n", + " actions_30 = get_accumulate_user_feat(end_date, all_actions, 30)\n", + " actions = pd.merge(actions_3, actions_30, how ='left', on='user_id')\n", + " del actions_3\n", + " del actions_30\n", + " \n", + " actions['recent_action1'] = np.log(1 + actions['user_action_30_1.0']-actions['user_action_3_1.0']) - np.log(1 + actions['user_action_30_1.0'])\n", + " actions['recent_action2'] = np.log(1 + actions['user_action_30_2.0']-actions['user_action_3_2.0']) - np.log(1 + actions['user_action_30_2.0'])\n", + " actions['recent_action3'] = np.log(1 + actions['user_action_30_3.0']-actions['user_action_3_3.0']) - np.log(1 + actions['user_action_30_3.0'])\n", + " actions['recent_action4'] = np.log(1 + actions['user_action_30_4.0']-actions['user_action_3_4.0']) - np.log(1 + actions['user_action_30_4.0'])\n", + " actions['recent_action5'] = np.log(1 + actions['user_action_30_5.0']-actions['user_action_3_5.0']) - np.log(1 + actions['user_action_30_5.0'])\n", + " actions['recent_action6'] = np.log(1 + actions['user_action_30_6.0']-actions['user_action_3_6.0']) - np.log(1 + actions['user_action_30_6.0'])\n", + " \n", + " return actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户对同类别下各种商品的行为\n", + "* 用户对各个类别的各项行为操作统计\n", + "* 用户对各个类别操作行为统计占对所有类别操作行为统计的比重" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "#增加了用户对不同类别的交互特征\n", + "def get_user_cate_feature(start_date, end_date, all_actions):\n", + " actions = get_actions(start_date, end_date, all_actions)\n", + " actions = actions[['user_id', 'cate', 'type']]\n", + " df = pd.get_dummies(actions['type'], prefix='type')\n", + " actions = pd.concat([actions[['user_id', 'cate']], df], axis=1)\n", + " actions = actions.groupby(['user_id', 'cate']).sum()\n", + " actions = actions.unstack()\n", + " actions.columns = actions.columns.swaplevel(0, 1)\n", + " actions.columns = actions.columns.droplevel()\n", + " actions.columns = [\n", + " 'cate_4_type1', 'cate_5_type1', 'cate_6_type1', 'cate_7_type1',\n", + " 'cate_8_type1', 'cate_9_type1', 'cate_10_type1', 'cate_11_type1',\n", + " 'cate_4_type2', 'cate_5_type2', 'cate_6_type2', 'cate_7_type2',\n", + " 'cate_8_type2', 'cate_9_type2', 'cate_10_type2', 'cate_11_type2',\n", + " 'cate_4_type3', 'cate_5_type3', 'cate_6_type3', 'cate_7_type3',\n", + " 'cate_8_type3', 'cate_9_type3', 'cate_10_type3', 'cate_11_type3',\n", + " 'cate_4_type4', 'cate_5_type4', 'cate_6_type4', 'cate_7_type4',\n", + " 'cate_8_type4', 'cate_9_type4', 'cate_10_type4', 'cate_11_type4',\n", + " 'cate_4_type5', 'cate_5_type5', 'cate_6_type5', 'cate_7_type5',\n", + " 'cate_8_type5', 'cate_9_type5', 'cate_10_type5', 'cate_11_type5',\n", + " 'cate_4_type6', 'cate_5_type6', 'cate_6_type6', 'cate_7_type6',\n", + " 'cate_8_type6', 'cate_9_type6', 'cate_10_type6', 'cate_11_type6'\n", + " ]\n", + " actions = actions.fillna(0)\n", + " actions['cate_action_sum'] = actions.sum(axis=1)\n", + " actions['cate8_percentage'] = (\n", + " actions['cate_8_type1'] + actions['cate_8_type2'] +\n", + " actions['cate_8_type3'] + actions['cate_8_type4'] +\n", + " actions['cate_8_type5'] + 
actions['cate_8_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate4_percentage'] = (\n", + " actions['cate_4_type1'] + actions['cate_4_type2'] +\n", + " actions['cate_4_type3'] + actions['cate_4_type4'] +\n", + " actions['cate_4_type5'] + actions['cate_4_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate5_percentage'] = (\n", + " actions['cate_5_type1'] + actions['cate_5_type2'] +\n", + " actions['cate_5_type3'] + actions['cate_5_type4'] +\n", + " actions['cate_5_type5'] + actions['cate_5_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate6_percentage'] = (\n", + " actions['cate_6_type1'] + actions['cate_6_type2'] +\n", + " actions['cate_6_type3'] + actions['cate_6_type4'] +\n", + " actions['cate_6_type5'] + actions['cate_6_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate7_percentage'] = (\n", + " actions['cate_7_type1'] + actions['cate_7_type2'] +\n", + " actions['cate_7_type3'] + actions['cate_7_type4'] +\n", + " actions['cate_7_type5'] + actions['cate_7_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate9_percentage'] = (\n", + " actions['cate_9_type1'] + actions['cate_9_type2'] +\n", + " actions['cate_9_type3'] + actions['cate_9_type4'] +\n", + " actions['cate_9_type5'] + actions['cate_9_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate10_percentage'] = (\n", + " actions['cate_10_type1'] + actions['cate_10_type2'] +\n", + " actions['cate_10_type3'] + actions['cate_10_type4'] +\n", + " actions['cate_10_type5'] + actions['cate_10_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate11_percentage'] = (\n", + " actions['cate_11_type1'] + actions['cate_11_type2'] +\n", + " actions['cate_11_type3'] + actions['cate_11_type4'] +\n", + " actions['cate_11_type5'] + actions['cate_11_type6']\n", + " ) / actions['cate_action_sum']\n", + "\n", + " actions['cate8_type1_percentage'] = np.log(\n", + " 1 + actions['cate_8_type1']) - np.log(\n", + " 1 + actions['cate_8_type1'] + actions['cate_4_type1'] +\n", + " actions['cate_5_type1'] + actions['cate_6_type1'] +\n", + " actions['cate_7_type1'] + actions['cate_9_type1'] +\n", + " actions['cate_10_type1'] + actions['cate_11_type1'])\n", + "\n", + " actions['cate8_type2_percentage'] = np.log(\n", + " 1 + actions['cate_8_type2']) - np.log(\n", + " 1 + actions['cate_8_type2'] + actions['cate_4_type2'] +\n", + " actions['cate_5_type2'] + actions['cate_6_type2'] +\n", + " actions['cate_7_type2'] + actions['cate_9_type2'] +\n", + " actions['cate_10_type2'] + actions['cate_11_type2'])\n", + " actions['cate8_type3_percentage'] = np.log(\n", + " 1 + actions['cate_8_type3']) - np.log(\n", + " 1 + actions['cate_8_type3'] + actions['cate_4_type3'] +\n", + " actions['cate_5_type3'] + actions['cate_6_type3'] +\n", + " actions['cate_7_type3'] + actions['cate_9_type3'] +\n", + " actions['cate_10_type3'] + actions['cate_11_type3'])\n", + " actions['cate8_type4_percentage'] = np.log(\n", + " 1 + actions['cate_8_type4']) - np.log(\n", + " 1 + actions['cate_8_type4'] + actions['cate_4_type4'] +\n", + " actions['cate_5_type4'] + actions['cate_6_type4'] +\n", + " actions['cate_7_type4'] + actions['cate_9_type4'] +\n", + " actions['cate_10_type4'] + actions['cate_11_type4'])\n", + " actions['cate8_type5_percentage'] = np.log(\n", + " 1 + actions['cate_8_type5']) - np.log(\n", + " 1 + actions['cate_8_type5'] + actions['cate_4_type5'] +\n", + " actions['cate_5_type5'] + actions['cate_6_type5'] +\n", + " actions['cate_7_type5'] + actions['cate_9_type5'] +\n", 
+ " actions['cate_10_type5'] + actions['cate_11_type5'])\n", + " actions['cate8_type6_percentage'] = np.log(\n", + " 1 + actions['cate_8_type6']) - np.log(\n", + " 1 + actions['cate_8_type6'] + actions['cate_4_type6'] +\n", + " actions['cate_5_type6'] + actions['cate_6_type6'] +\n", + " actions['cate_7_type6'] + actions['cate_9_type6'] +\n", + " actions['cate_10_type6'] + actions['cate_11_type6'])\n", + " actions['user_id'] = actions.index\n", + " actions = actions[[\n", + " 'user_id', 'cate8_percentage', 'cate4_percentage', 'cate5_percentage',\n", + " 'cate6_percentage', 'cate7_percentage', 'cate9_percentage',\n", + " 'cate10_percentage', 'cate11_percentage', 'cate8_type1_percentage',\n", + " 'cate8_type2_percentage', 'cate8_type3_percentage',\n", + " 'cate8_type4_percentage', 'cate8_type5_percentage',\n", + " 'cate8_type6_percentage'\n", + " ]]\n", + " return actions" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2016-02-01\n", + "2016-02-04\n" + ] + } + ], + "source": [ + "train_start_date = '2016-02-01'\n", + "train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)\n", + "train_end_date = train_end_date.strftime('%Y-%m-%d')\n", + "day = 3\n", + "\n", + "start_date = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=day)\n", + "start_date = start_date.strftime('%Y-%m-%d')\n", + "\n", + "print (start_date)\n", + "print (train_end_date)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "all_actions = get_all_action()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idcatetype
29272629.010.01.0
30272629.010.01.0
31272629.010.06.0
32272629.010.01.0
33272629.010.06.0
\n", + "
" + ], + "text/plain": [ + " user_id cate type\n", + "29 272629.0 10.0 1.0\n", + "30 272629.0 10.0 1.0\n", + "31 272629.0 10.0 6.0\n", + "32 272629.0 10.0 1.0\n", + "33 272629.0 10.0 6.0" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions = get_actions(start_date, train_end_date, all_actions)\n", + "actions = actions[['user_id', 'cate', 'type']]\n", + "actions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
type_1.0type_2.0type_3.0type_4.0type_5.0type_6.0
user_idcate
200002.04.016.00.00.00.00.020.0
5.04.00.00.00.00.06.0
7.04.00.00.00.00.03.0
8.04.00.00.00.00.012.0
200003.04.08.00.00.00.00.012.0
\n", + "
" + ], + "text/plain": [ + " type_1.0 type_2.0 type_3.0 type_4.0 type_5.0 type_6.0\n", + "user_id cate \n", + "200002.0 4.0 16.0 0.0 0.0 0.0 0.0 20.0\n", + " 5.0 4.0 0.0 0.0 0.0 0.0 6.0\n", + " 7.0 4.0 0.0 0.0 0.0 0.0 3.0\n", + " 8.0 4.0 0.0 0.0 0.0 0.0 12.0\n", + "200003.0 4.0 8.0 0.0 0.0 0.0 0.0 12.0" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.get_dummies(actions['type'], prefix='type')\n", + "actions = pd.concat([actions[['user_id', 'cate']], df], axis=1)\n", + "actions = actions.groupby(['user_id', 'cate']).sum()\n", + "actions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
type_1.0type_2.0...type_5.0type_6.0
cate4.05.06.07.08.09.010.011.04.05.0...10.011.04.05.06.07.08.09.010.011.0
user_id
200002.016.04.0NaN4.04.0NaNNaNNaN0.00.0...NaNNaN20.06.0NaN3.012.0NaNNaNNaN
200003.08.0NaNNaNNaN12.0NaNNaNNaN0.0NaN...NaNNaN12.0NaNNaNNaN19.0NaNNaNNaN
200008.0NaNNaNNaN8.0NaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaN20.0NaNNaNNaNNaN
200023.0NaNNaNNaNNaN1.0NaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaN0.0NaNNaNNaN
200030.08.0NaNNaNNaNNaNNaNNaNNaN0.0NaN...NaNNaN17.0NaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 48 columns

\n", + "
" + ], + "text/plain": [ + " type_1.0 type_2.0 ... \\\n", + "cate 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 4.0 5.0 ... \n", + "user_id ... \n", + "200002.0 16.0 4.0 NaN 4.0 4.0 NaN NaN NaN 0.0 0.0 ... \n", + "200003.0 8.0 NaN NaN NaN 12.0 NaN NaN NaN 0.0 NaN ... \n", + "200008.0 NaN NaN NaN 8.0 NaN NaN NaN NaN NaN NaN ... \n", + "200023.0 NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN ... \n", + "200030.0 8.0 NaN NaN NaN NaN NaN NaN NaN 0.0 NaN ... \n", + "\n", + " type_5.0 type_6.0 \n", + "cate 10.0 11.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 \n", + "user_id \n", + "200002.0 NaN NaN 20.0 6.0 NaN 3.0 12.0 NaN NaN NaN \n", + "200003.0 NaN NaN 12.0 NaN NaN NaN 19.0 NaN NaN NaN \n", + "200008.0 NaN NaN NaN NaN NaN 20.0 NaN NaN NaN NaN \n", + "200023.0 NaN NaN NaN NaN NaN NaN 0.0 NaN NaN NaN \n", + "200030.0 NaN NaN 17.0 NaN NaN NaN NaN NaN NaN NaN \n", + "\n", + "[5 rows x 48 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions = actions.unstack()\n", + "actions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MultiIndex(levels=[['type_1.0', 'type_2.0', 'type_3.0', 'type_4.0', 'type_5.0', 'type_6.0'], [4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]],\n", + " codes=[[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5], [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]],\n", + " names=[None, 'cate'])" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MultiIndex(levels=[[4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0], ['type_1.0', 'type_2.0', 'type_3.0', 'type_4.0', 'type_5.0', 'type_6.0']],\n", + " codes=[[0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7], [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5]],\n", + " names=['cate', None])" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions.columns = actions.columns.swaplevel(0, 1)\n", + "actions.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['type_1.0', 'type_1.0', 'type_1.0', 'type_1.0', 'type_1.0', 'type_1.0',\n", + " 'type_1.0', 'type_1.0', 'type_2.0', 'type_2.0', 'type_2.0', 'type_2.0',\n", + " 'type_2.0', 'type_2.0', 'type_2.0', 'type_2.0', 'type_3.0', 'type_3.0',\n", + " 'type_3.0', 'type_3.0', 'type_3.0', 'type_3.0', 'type_3.0', 'type_3.0',\n", + " 'type_4.0', 'type_4.0', 'type_4.0', 'type_4.0', 'type_4.0', 'type_4.0',\n", + " 'type_4.0', 'type_4.0', 'type_5.0', 'type_5.0', 'type_5.0', 'type_5.0',\n", + " 'type_5.0', 'type_5.0', 'type_5.0', 'type_5.0', 'type_6.0', 'type_6.0',\n", + " 'type_6.0', 'type_6.0', 'type_6.0', 'type_6.0', 'type_6.0', 'type_6.0'],\n", + " dtype='object')" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions.columns = actions.columns.droplevel()\n", + "actions.columns" + ] + 
}, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['cate_4_type1', 'cate_5_type1', 'cate_6_type1', 'cate_7_type1',\n", + " 'cate_8_type1', 'cate_9_type1', 'cate_10_type1', 'cate_11_type1',\n", + " 'cate_4_type2', 'cate_5_type2', 'cate_6_type2', 'cate_7_type2',\n", + " 'cate_8_type2', 'cate_9_type2', 'cate_10_type2', 'cate_11_type2',\n", + " 'cate_4_type3', 'cate_5_type3', 'cate_6_type3', 'cate_7_type3',\n", + " 'cate_8_type3', 'cate_9_type3', 'cate_10_type3', 'cate_11_type3',\n", + " 'cate_4_type4', 'cate_5_type4', 'cate_6_type4', 'cate_7_type4',\n", + " 'cate_8_type4', 'cate_9_type4', 'cate_10_type4', 'cate_11_type4',\n", + " 'cate_4_type5', 'cate_5_type5', 'cate_6_type5', 'cate_7_type5',\n", + " 'cate_8_type5', 'cate_9_type5', 'cate_10_type5', 'cate_11_type5',\n", + " 'cate_4_type6', 'cate_5_type6', 'cate_6_type6', 'cate_7_type6',\n", + " 'cate_8_type6', 'cate_9_type6', 'cate_10_type6', 'cate_11_type6'],\n", + " dtype='object')" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions.columns = [\n", + " 'cate_4_type1', 'cate_5_type1', 'cate_6_type1', 'cate_7_type1',\n", + " 'cate_8_type1', 'cate_9_type1', 'cate_10_type1', 'cate_11_type1',\n", + " 'cate_4_type2', 'cate_5_type2', 'cate_6_type2', 'cate_7_type2',\n", + " 'cate_8_type2', 'cate_9_type2', 'cate_10_type2', 'cate_11_type2',\n", + " 'cate_4_type3', 'cate_5_type3', 'cate_6_type3', 'cate_7_type3',\n", + " 'cate_8_type3', 'cate_9_type3', 'cate_10_type3', 'cate_11_type3',\n", + " 'cate_4_type4', 'cate_5_type4', 'cate_6_type4', 'cate_7_type4',\n", + " 'cate_8_type4', 'cate_9_type4', 'cate_10_type4', 'cate_11_type4',\n", + " 'cate_4_type5', 'cate_5_type5', 'cate_6_type5', 'cate_7_type5',\n", + " 'cate_8_type5', 'cate_9_type5', 'cate_10_type5', 'cate_11_type5',\n", + " 'cate_4_type6', 'cate_5_type6', 'cate_6_type6', 'cate_7_type6',\n", + " 'cate_8_type6', 'cate_9_type6', 'cate_10_type6', 'cate_11_type6'\n", + " ]\n", + "actions.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cate_4_type1cate_5_type1cate_6_type1cate_7_type1cate_8_type1cate_9_type1cate_10_type1cate_11_type1cate_4_type2cate_5_type2...cate_11_type5cate_4_type6cate_5_type6cate_6_type6cate_7_type6cate_8_type6cate_9_type6cate_10_type6cate_11_type6cate_action_sum
user_id
200002.016.04.00.04.04.00.00.00.00.00.0...0.020.06.00.03.012.00.00.00.069.0
200003.08.00.00.00.012.00.00.00.00.00.0...0.012.00.00.00.019.00.00.00.051.0
200008.00.00.00.08.00.00.00.00.00.00.0...0.00.00.00.020.00.00.00.00.028.0
200023.00.00.00.00.01.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.01.0
200030.08.00.00.00.00.00.00.00.00.00.0...0.017.00.00.00.00.00.00.00.025.0
\n", + "

5 rows × 49 columns

\n", + "
" + ], + "text/plain": [ + " cate_4_type1 cate_5_type1 cate_6_type1 cate_7_type1 \\\n", + "user_id \n", + "200002.0 16.0 4.0 0.0 4.0 \n", + "200003.0 8.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 8.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 8.0 0.0 0.0 0.0 \n", + "\n", + " cate_8_type1 cate_9_type1 cate_10_type1 cate_11_type1 \\\n", + "user_id \n", + "200002.0 4.0 0.0 0.0 0.0 \n", + "200003.0 12.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 0.0 \n", + "200023.0 1.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_4_type2 cate_5_type2 ... cate_11_type5 cate_4_type6 \\\n", + "user_id ... \n", + "200002.0 0.0 0.0 ... 0.0 20.0 \n", + "200003.0 0.0 0.0 ... 0.0 12.0 \n", + "200008.0 0.0 0.0 ... 0.0 0.0 \n", + "200023.0 0.0 0.0 ... 0.0 0.0 \n", + "200030.0 0.0 0.0 ... 0.0 17.0 \n", + "\n", + " cate_5_type6 cate_6_type6 cate_7_type6 cate_8_type6 \\\n", + "user_id \n", + "200002.0 6.0 0.0 3.0 12.0 \n", + "200003.0 0.0 0.0 0.0 19.0 \n", + "200008.0 0.0 0.0 20.0 0.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_9_type6 cate_10_type6 cate_11_type6 cate_action_sum \n", + "user_id \n", + "200002.0 0.0 0.0 0.0 69.0 \n", + "200003.0 0.0 0.0 0.0 51.0 \n", + "200008.0 0.0 0.0 0.0 28.0 \n", + "200023.0 0.0 0.0 0.0 1.0 \n", + "200030.0 0.0 0.0 0.0 25.0 \n", + "\n", + "[5 rows x 49 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions = actions.fillna(0)\n", + "actions['cate_action_sum'] = actions.sum(axis=1)\n", + "actions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cate_4_type1cate_5_type1cate_6_type1cate_7_type1cate_8_type1cate_9_type1cate_10_type1cate_11_type1cate_4_type2cate_5_type2...cate_4_type6cate_5_type6cate_6_type6cate_7_type6cate_8_type6cate_9_type6cate_10_type6cate_11_type6cate_action_sumcate8_percentage
user_id
200002.016.04.00.04.04.00.00.00.00.00.0...20.06.00.03.012.00.00.00.069.00.231884
200003.08.00.00.00.012.00.00.00.00.00.0...12.00.00.00.019.00.00.00.051.00.607843
200008.00.00.00.08.00.00.00.00.00.00.0...0.00.00.020.00.00.00.00.028.00.000000
200023.00.00.00.00.01.00.00.00.00.00.0...0.00.00.00.00.00.00.00.01.01.000000
200030.08.00.00.00.00.00.00.00.00.00.0...17.00.00.00.00.00.00.00.025.00.000000
\n", + "

5 rows × 50 columns

\n", + "
" + ], + "text/plain": [ + " cate_4_type1 cate_5_type1 cate_6_type1 cate_7_type1 \\\n", + "user_id \n", + "200002.0 16.0 4.0 0.0 4.0 \n", + "200003.0 8.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 8.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 8.0 0.0 0.0 0.0 \n", + "\n", + " cate_8_type1 cate_9_type1 cate_10_type1 cate_11_type1 \\\n", + "user_id \n", + "200002.0 4.0 0.0 0.0 0.0 \n", + "200003.0 12.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 0.0 \n", + "200023.0 1.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_4_type2 cate_5_type2 ... cate_4_type6 cate_5_type6 \\\n", + "user_id ... \n", + "200002.0 0.0 0.0 ... 20.0 6.0 \n", + "200003.0 0.0 0.0 ... 12.0 0.0 \n", + "200008.0 0.0 0.0 ... 0.0 0.0 \n", + "200023.0 0.0 0.0 ... 0.0 0.0 \n", + "200030.0 0.0 0.0 ... 17.0 0.0 \n", + "\n", + " cate_6_type6 cate_7_type6 cate_8_type6 cate_9_type6 \\\n", + "user_id \n", + "200002.0 0.0 3.0 12.0 0.0 \n", + "200003.0 0.0 0.0 19.0 0.0 \n", + "200008.0 0.0 20.0 0.0 0.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_10_type6 cate_11_type6 cate_action_sum cate8_percentage \n", + "user_id \n", + "200002.0 0.0 0.0 69.0 0.231884 \n", + "200003.0 0.0 0.0 51.0 0.607843 \n", + "200008.0 0.0 0.0 28.0 0.000000 \n", + "200023.0 0.0 0.0 1.0 1.000000 \n", + "200030.0 0.0 0.0 25.0 0.000000 \n", + "\n", + "[5 rows x 50 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions['cate8_percentage'] = (\n", + " actions['cate_8_type1'] + actions['cate_8_type2'] +\n", + " actions['cate_8_type3'] + actions['cate_8_type4'] +\n", + " actions['cate_8_type5'] + actions['cate_8_type6']\n", + " ) / actions['cate_action_sum']\n", + "actions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cate_4_type1cate_5_type1cate_6_type1cate_7_type1cate_8_type1cate_9_type1cate_10_type1cate_11_type1cate_4_type2cate_5_type2...cate_5_type6cate_6_type6cate_7_type6cate_8_type6cate_9_type6cate_10_type6cate_11_type6cate_action_sumcate8_percentagecate8_type1_percentage
user_id
200002.016.04.00.04.04.00.00.00.00.00.0...6.00.03.012.00.00.00.069.00.231884-1.757858
200003.08.00.00.00.012.00.00.00.00.00.0...0.00.00.019.00.00.00.051.00.607843-0.479573
200008.00.00.00.08.00.00.00.00.00.00.0...0.00.020.00.00.00.00.028.00.000000-2.197225
200023.00.00.00.00.01.00.00.00.00.00.0...0.00.00.00.00.00.00.01.01.0000000.000000
200030.08.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.025.00.000000-2.197225
\n", + "

5 rows × 51 columns

\n", + "
" + ], + "text/plain": [ + " cate_4_type1 cate_5_type1 cate_6_type1 cate_7_type1 \\\n", + "user_id \n", + "200002.0 16.0 4.0 0.0 4.0 \n", + "200003.0 8.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 8.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 8.0 0.0 0.0 0.0 \n", + "\n", + " cate_8_type1 cate_9_type1 cate_10_type1 cate_11_type1 \\\n", + "user_id \n", + "200002.0 4.0 0.0 0.0 0.0 \n", + "200003.0 12.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 0.0 \n", + "200023.0 1.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_4_type2 cate_5_type2 ... cate_5_type6 cate_6_type6 \\\n", + "user_id ... \n", + "200002.0 0.0 0.0 ... 6.0 0.0 \n", + "200003.0 0.0 0.0 ... 0.0 0.0 \n", + "200008.0 0.0 0.0 ... 0.0 0.0 \n", + "200023.0 0.0 0.0 ... 0.0 0.0 \n", + "200030.0 0.0 0.0 ... 0.0 0.0 \n", + "\n", + " cate_7_type6 cate_8_type6 cate_9_type6 cate_10_type6 \\\n", + "user_id \n", + "200002.0 3.0 12.0 0.0 0.0 \n", + "200003.0 0.0 19.0 0.0 0.0 \n", + "200008.0 20.0 0.0 0.0 0.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_11_type6 cate_action_sum cate8_percentage \\\n", + "user_id \n", + "200002.0 0.0 69.0 0.231884 \n", + "200003.0 0.0 51.0 0.607843 \n", + "200008.0 0.0 28.0 0.000000 \n", + "200023.0 0.0 1.0 1.000000 \n", + "200030.0 0.0 25.0 0.000000 \n", + "\n", + " cate8_type1_percentage \n", + "user_id \n", + "200002.0 -1.757858 \n", + "200003.0 -0.479573 \n", + "200008.0 -2.197225 \n", + "200023.0 0.000000 \n", + "200030.0 -2.197225 \n", + "\n", + "[5 rows x 51 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions['cate8_type1_percentage'] = np.log(\n", + " 1 + actions['cate_8_type1']) - np.log(\n", + " 1 + actions['cate_8_type1'] + actions['cate_4_type1'] +\n", + " actions['cate_5_type1'] + actions['cate_6_type1'] +\n", + " actions['cate_7_type1'] + actions['cate_9_type1'] +\n", + " actions['cate_10_type1'] + actions['cate_11_type1'])\n", + "actions.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 商品-行为\n", + "#### 累积商品特征\n", + "* 分时间段\n", + "* 针对商品的不同行为的\n", + " * 购买转化率\n", + " * 均值\n", + " * 标准差" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "def get_accumulate_product_feat(start_date, end_date, all_actions):\n", + " feature = [\n", + " 'sku_id', 'product_action_1', 'product_action_2',\n", + " 'product_action_3', 'product_action_4',\n", + " 'product_action_5', 'product_action_6',\n", + " 'product_action_1_ratio', 'product_action_2_ratio',\n", + " 'product_action_3_ratio', 'product_action_5_ratio',\n", + " 'product_action_6_ratio', 'product_action_1_mean',\n", + " 'product_action_2_mean', 'product_action_3_mean',\n", + " 'product_action_4_mean', 'product_action_5_mean',\n", + " 'product_action_6_mean', 'product_action_1_std',\n", + " 'product_action_2_std', 'product_action_3_std', 'product_action_4_std',\n", + " 'product_action_5_std', 'product_action_6_std'\n", + " ]\n", + "\n", + " actions = get_actions(start_date, end_date, all_actions)\n", + " df = pd.get_dummies(actions['type'], prefix='product_action')\n", + " # 按照商品-日期分组,计算某个时间段该商品的各项行为的标准差\n", + " actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())\n", + " actions = pd.concat([actions[['sku_id', 'date']], df], axis=1)\n", + " actions = actions.groupby(['sku_id'], as_index=False).sum()\n", + " days_interal = (datetime.strptime(end_date, '%Y-%m-%d') - datetime.strptime(start_date, 
 + " days_interval = (datetime.strptime(end_date, '%Y-%m-%d') - datetime.strptime(start_date, '%Y-%m-%d')).days\n", + " \n", + " # group by product and compute the purchase conversion rates\n", + " actions['product_action_1_ratio'] = np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_1.0'])\n", + " actions['product_action_2_ratio'] = np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_2.0'])\n", + " actions['product_action_3_ratio'] = np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_3.0'])\n", + " actions['product_action_5_ratio'] = np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_5.0'])\n", + " actions['product_action_6_ratio'] = np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_6.0'])\n", + " \n", + " # compute the mean of each action type\n", + " actions['product_action_1_mean'] = actions[\n", + " 'product_action_1.0'] / days_interval\n", + " actions['product_action_2_mean'] = actions[\n", + " 'product_action_2.0'] / days_interval\n", + " actions['product_action_3_mean'] = actions[\n", + " 'product_action_3.0'] / days_interval\n", + " actions['product_action_4_mean'] = actions[\n", + " 'product_action_4.0'] / days_interval\n", + " actions['product_action_5_mean'] = actions[\n", + " 'product_action_5.0'] / days_interval\n", + " actions['product_action_6_mean'] = actions[\n", + " 'product_action_6.0'] / days_interval\n", + " return actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Category Features\n", + "#### For each product category, per time window\n", + "* purchase conversion rate\n", + " * standard deviation\n", + " * mean" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "def get_accumulate_cate_feat(start_date, end_date, all_actions):\n", + " feature = ['cate','cate_action_1', 'cate_action_2', 'cate_action_3', 'cate_action_4', 'cate_action_5', \n", + " 'cate_action_6', 'cate_action_1_ratio', 'cate_action_2_ratio', \n", + " 'cate_action_3_ratio', 'cate_action_5_ratio', 'cate_action_6_ratio', 'cate_action_1_mean',\n", + " 'cate_action_2_mean', 'cate_action_3_mean', 'cate_action_4_mean', 'cate_action_5_mean',\n", + " 'cate_action_6_mean', 'cate_action_1_std', 'cate_action_2_std', 'cate_action_3_std',\n", + " 'cate_action_4_std', 'cate_action_5_std', 'cate_action_6_std']\n", + " actions = get_actions(start_date, end_date, all_actions)\n", + " actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())\n", + " df = pd.get_dummies(actions['type'], prefix='cate_action')\n", + " actions = pd.concat([actions[['cate','date']], df], axis=1)\n", + " \n", + " # group by category and compute each category's action conversion rates\n", + " actions = actions.groupby(['cate'], as_index=False).sum()\n", + " days_interval = (datetime.strptime(end_date, '%Y-%m-%d')-datetime.strptime(start_date, '%Y-%m-%d')).days\n", + " \n", + " actions['cate_action_1_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_1.0']))\n", + " actions['cate_action_2_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_2.0']))\n", + " actions['cate_action_3_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_3.0']))\n", + " actions['cate_action_5_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_5.0']))\n", + " actions['cate_action_6_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_6.0']))\n", + " \n", + " # group by category and compute the per-day mean of each action type over the window\n", + " actions['cate_action_1_mean'] = actions['cate_action_1.0'] / days_interval\n", + " actions['cate_action_2_mean'] = actions['cate_action_2.0'] / days_interval\n", + " actions['cate_action_3_mean'] = actions['cate_action_3.0'] / days_interval\n", + " actions['cate_action_4_mean'] = actions['cate_action_4.0'] / days_interval\n", + " actions['cate_action_5_mean'] = actions['cate_action_5.0'] / days_interval\n", + " actions['cate_action_6_mean'] = actions['cate_action_6.0'] / days_interval\n", + " return actions" + ] + },
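 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A minimal usage sketch (illustrative only, assuming `all_actions` has been loaded with `get_all_action()`): build the accumulated product and category features over a hypothetical one-month window." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# hypothetical window; any (start, end) pair inside the data range works the same way\n", + "prod_feat = get_accumulate_product_feat('2016-02-01', '2016-03-01', all_actions)\n", + "cate_feat = get_accumulate_cate_feat('2016-02-01', '2016-03-01', all_actions)\n", + "print(prod_feat.shape, cate_feat.shape)" + ] + },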
+    "    actions['cate_action_3_mean'] = actions['cate_action_3.0'] / days_interval\n",
+    "    actions['cate_action_4_mean'] = actions['cate_action_4.0'] / days_interval\n",
+    "    actions['cate_action_5_mean'] = actions['cate_action_5.0'] / days_interval\n",
+    "    actions['cate_action_6_mean'] = actions['cate_action_6.0'] / days_interval\n",
+    "    return actions"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Building the training/test sets\n",
+    "### Building the training/validation sets\n",
+    "Labels are generated with a sliding window: within each window, (user, sku) pairs that produce a purchase in the label window are marked 1.\n",
+    "All of the features above are then merged together; a small sketch of the window arithmetic appears a couple of cells below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_labels(start_date, end_date, all_actions):\n",
+    "    actions = get_actions(start_date, end_date, all_actions)\n",
+    "#     actions = actions[actions['type'] == 4]\n",
+    "    # restrict to purchases (type 4) of cate 8, matching the prediction target\n",
+    "    actions = actions[(actions['type'] == 4) & (actions['cate']==8)]\n",
+    "    \n",
+    "    actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum()\n",
+    "    actions['label'] = 1\n",
+    "    actions = actions[['user_id', 'sku_id', 'label']]\n",
+    "    return actions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "get all actions!\n"
+     ]
+    }
+   ],
+   "source": [
+    "train_start_date = '2016-03-01'\n",
+    "train_actions = None\n",
+    "all_actions = get_all_action()\n",
+    "print (\"get all actions!\")"
+   ]
+  },
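+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal, standalone sketch of the sliding-window date arithmetic used below (illustrative only; the real run slides 20 windows): each window uses 3 days of behaviour for the short-term features and the following 5 days for labels."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch of the sliding-window arithmetic (illustrative, not part of the pipeline)\n",
+    "from datetime import datetime, timedelta\n",
+    "\n",
+    "start = datetime.strptime('2016-02-01', '%Y-%m-%d')\n",
+    "for i in range(3):  # the real run below slides 20 times, one day per round\n",
+    "    feat_end = start + timedelta(days=3)      # feature window: [start, feat_end)\n",
+    "    label_end = feat_end + timedelta(days=5)  # label window:   [feat_end, label_end)\n",
+    "    print(start.date(), '->', feat_end.date(), '| labels:', feat_end.date(), '->', label_end.date())\n",
+    "    start += timedelta(days=1)"
+   ]
+  },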
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idsku_idtimemodel_idtypecatebrand
0266079.0138778.02016-01-31 23:59:02NaN1.08.0403.0
1266079.0138778.02016-01-31 23:59:030.06.08.0403.0
2200719.061226.02016-01-31 23:59:07NaN1.08.030.0
3200719.061226.02016-01-31 23:59:080.06.08.030.0
4263587.072348.02016-01-31 23:59:08NaN1.05.0159.0
\n", + "
" + ], + "text/plain": [ + " user_id sku_id time model_id type cate brand\n", + "0 266079.0 138778.0 2016-01-31 23:59:02 NaN 1.0 8.0 403.0\n", + "1 266079.0 138778.0 2016-01-31 23:59:03 0.0 6.0 8.0 403.0\n", + "2 200719.0 61226.0 2016-01-31 23:59:07 NaN 1.0 8.0 30.0\n", + "3 200719.0 61226.0 2016-01-31 23:59:08 0.0 6.0 8.0 30.0\n", + "4 263587.0 72348.0 2016-01-31 23:59:08 NaN 1.0 5.0 159.0" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_actions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 50601736 entries, 0 to 13199933\n", + "Data columns (total 7 columns):\n", + "user_id float32\n", + "sku_id float32\n", + "time object\n", + "model_id float32\n", + "type float32\n", + "cate float32\n", + "brand float32\n", + "dtypes: float32(6), object(1)\n", + "memory usage: 1.9+ GB\n" + ] + } + ], + "source": [ + "all_actions.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(50601736, 7)" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_actions.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "get_basic_user_feat finsihed\n" + ] + } + ], + "source": [ + "user = get_basic_user_feat()\n", + "print ('get_basic_user_feat finsihed')" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idage_0age_1age_2age_3age_4age_5age_6sex_0sex_1sex_2user_lv_cd_1user_lv_cd_2user_lv_cd_3user_lv_cd_4user_lv_cd_5
0200001.00.00.00.00.00.00.01.00.00.01.00.00.00.00.01.0
1200002.01.00.00.00.00.00.00.01.00.00.01.00.00.00.00.0
2200003.00.00.00.00.01.00.00.00.01.00.00.00.00.01.00.0
3200004.01.00.00.00.00.00.00.00.00.01.01.00.00.00.00.0
4200005.00.00.01.00.00.00.00.01.00.00.00.00.00.01.00.0
\n", + "
" + ], + "text/plain": [ + " user_id age_0 age_1 age_2 age_3 age_4 age_5 age_6 sex_0 sex_1 \\\n", + "0 200001.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 \n", + "1 200002.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 \n", + "2 200003.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n", + "3 200004.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "4 200005.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 \n", + "\n", + " sex_2 user_lv_cd_1 user_lv_cd_2 user_lv_cd_3 user_lv_cd_4 user_lv_cd_5 \n", + "0 1.0 0.0 0.0 0.0 0.0 1.0 \n", + "1 0.0 1.0 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 1.0 0.0 \n", + "3 1.0 1.0 0.0 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 1.0 0.0 " + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "get_basic_product_feat finsihed\n" + ] + } + ], + "source": [ + "product = get_basic_product_feat()\n", + "print ('get_basic_product_feat finsihed')" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sku_idcatebranda1_-1a1_1a1_2a1_3a2_-1a2_1a2_2a3_-1a3_1a3_2
01084890001010010
110000284890001001001
21000038300100100100
310000685450100001010
41000182441000010001
\n", + "
" + ], + "text/plain": [ + " sku_id cate brand a1_-1 a1_1 a1_2 a1_3 a2_-1 a2_1 a2_2 a3_-1 \\\n", + "0 10 8 489 0 0 0 1 0 1 0 0 \n", + "1 100002 8 489 0 0 0 1 0 0 1 0 \n", + "2 100003 8 30 0 1 0 0 1 0 0 1 \n", + "3 100006 8 545 0 1 0 0 0 0 1 0 \n", + "4 10001 8 244 1 0 0 0 0 1 0 0 \n", + "\n", + " a3_1 a3_2 \n", + "0 1 0 \n", + "1 0 1 \n", + "2 0 0 \n", + "3 1 0 \n", + "4 0 1 " + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "product.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "datetime.datetime(2016, 3, 4, 0, 0)" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_start_date = '2016-03-01'\n", + "train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)\n", + "train_end_date" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2016-03-04\n" + ] + } + ], + "source": [ + "train_end_date = train_end_date.strftime('%Y-%m-%d')\n", + "# 修正prod_acc,cate_acc的时间跨度\n", + "start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n", + "start_days = start_days.strftime('%Y-%m-%d')\n", + "print (train_end_date)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "def make_actions(user, product, all_actions, train_start_date):\n", + " train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)\n", + " train_end_date = train_end_date.strftime('%Y-%m-%d')\n", + " # 修正prod_acc,cate_acc的时间跨度\n", + " start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n", + " start_days = start_days.strftime('%Y-%m-%d')\n", + " print (train_end_date)\n", + " user_acc = get_recent_user_feat(train_end_date, all_actions)\n", + " print ('get_recent_user_feat finsihed')\n", + " \n", + " user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n", + " print ('get_user_cate_feature finished')\n", + " \n", + " product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n", + " print ('get_accumulate_product_feat finsihed')\n", + " cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n", + " print ('get_accumulate_cate_feat finsihed')\n", + " comment_acc = get_comments_product_feat(train_end_date)\n", + " print ('get_comments_product_feat finished')\n", + " # 标记\n", + " test_start_date = train_end_date\n", + " test_end_date = datetime.strptime(test_start_date, '%Y-%m-%d') + timedelta(days=5)\n", + " test_end_date = test_end_date.strftime('%Y-%m-%d')\n", + " labels = get_labels(test_start_date, test_end_date, all_actions)\n", + " print (\"get labels\")\n", + " \n", + " actions = None\n", + " for i in (3, 5, 7, 10, 15, 21, 30):\n", + " start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)\n", + " start_days = start_days.strftime('%Y-%m-%d')\n", + " if actions is None:\n", + " actions = get_action_feat(start_days, train_end_date, all_actions, i)\n", + " else:\n", + " # 注意这里的拼接key\n", + " actions = pd.merge(actions, get_action_feat(start_days, train_end_date, all_actions, i), how='left',\n", + " on=['user_id', 'sku_id', 'cate'])\n", + "\n", + " actions = pd.merge(actions, user, how='left', on='user_id')\n", + " actions = pd.merge(actions, user_acc, 
+    "    actions = pd.merge(actions, user_acc, how='left', on='user_id')\n",
+    "    # the original 'actions.append(user_cate)' was a no-op (its result was never\n",
+    "    # assigned); merge the user-category features instead. This assumes user_cate\n",
+    "    # exposes user_id as a column -- reset_index() it first if it is the index.\n",
+    "    actions = pd.merge(actions, user_cate, how='left', on='user_id')\n",
+    "    # note the merge keys here\n",
+    "    actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])\n",
+    "    actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n",
+    "    actions = pd.merge(actions, cate_acc, how='left', on='cate')\n",
+    "    actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n",
+    "    actions = pd.merge(actions, labels, how='left', on=['user_id', 'sku_id'])\n",
+    "    # fill the NaNs introduced by merging product features, comment features and labels\n",
+    "    actions = actions.fillna(0)\n",
+    "#     return actions\n",
+    "    # downsample: keep every positive and sample negatives at 10:1\n",
+    "    action_positive = actions[actions['label'] == 1]\n",
+    "    action_negative = actions[actions['label'] == 0]\n",
+    "    del actions\n",
+    "    neg_len = len(action_positive) * 10\n",
+    "    action_negative = action_negative.sample(n=neg_len)\n",
+    "    action_sample = pd.concat([action_positive, action_negative], ignore_index=True)\n",
+    "    \n",
+    "    return action_sample"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def make_train_set(train_start_date, setNums, f_path, all_actions):\n",
+    "    train_actions = None\n",
+    "    #all_actions = get_all_action()\n",
+    "    #print (\"get all actions!\")\n",
+    "    user = get_basic_user_feat()\n",
+    "    print ('get_basic_user_feat finished')\n",
+    "    product = get_basic_product_feat()\n",
+    "    print ('get_basic_product_feat finished')\n",
+    "    # slide the window to build several training/validation sets\n",
+    "    for i in range(setNums):\n",
+    "        print (train_start_date)\n",
+    "        if train_actions is None:\n",
+    "            train_actions = make_actions(user, product, all_actions, train_start_date)\n",
+    "        else:\n",
+    "            train_actions = pd.concat([train_actions, make_actions(user, product, all_actions, train_start_date)],\n",
+    "                                      ignore_index=True)\n",
+    "        # move the window forward one day each round\n",
+    "        train_start_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=1)\n",
+    "        train_start_date = train_start_date.strftime('%Y-%m-%d')\n",
+    "        print (\"round {0}/{1} over!\".format(i+1, setNums))\n",
+    "\n",
+    "    train_actions.to_csv(f_path, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "get_basic_user_feat finished\n",
+      "get_basic_product_feat finished\n",
+      "2016-02-01\n",
+      "2016-02-04\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 1/20 over!\n",
+      "2016-02-02\n",
+      "2016-02-05\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 2/20 over!\n",
+      "2016-02-03\n",
+      "2016-02-06\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 3/20 over!\n",
+      "2016-02-04\n",
+      "2016-02-07\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 4/20 over!\n",
+      "2016-02-05\n",
+      "2016-02-08\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 5/20 over!\n",
+      "2016-02-06\n",
+      "2016-02-09\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 6/20 over!\n",
+      "2016-02-07\n",
+      "2016-02-10\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 7/20 over!\n",
+      "2016-02-08\n",
+      "2016-02-11\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 8/20 over!\n",
+      "2016-02-09\n",
+      "2016-02-12\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 9/20 over!\n",
+      "2016-02-10\n",
+      "2016-02-13\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 10/20 over!\n",
+      "2016-02-11\n",
+      "2016-02-14\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 11/20 over!\n",
+      "2016-02-12\n",
+      "2016-02-15\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 12/20 over!\n",
+      "2016-02-13\n",
+      "2016-02-16\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 13/20 over!\n",
+      "2016-02-14\n",
+      "2016-02-17\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 14/20 over!\n",
+      "2016-02-15\n",
+      "2016-02-18\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 15/20 over!\n",
+      "2016-02-16\n",
+      "2016-02-19\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 16/20 over!\n",
+      "2016-02-17\n",
+      "2016-02-20\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 17/20 over!\n",
+      "2016-02-18\n",
+      "2016-02-21\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 18/20 over!\n",
+      "2016-02-19\n",
+      "2016-02-22\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 19/20 over!\n",
+      "2016-02-20\n",
+      "2016-02-23\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 20/20 over!\n"
+     ]
+    }
+   ],
+   "source": [
+    "# training set\n",
+    "train_start_date = '2016-02-01'\n",
+    "make_train_set(train_start_date, 20, 'data/train_set.csv', all_actions)"
+   ]
+  },
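+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`make_actions` keeps every positive example and samples negatives at a 10:1 negative:positive ratio. An optional sanity check of the class balance of the file just written (assumes `data/train_set.csv` exists and carries the `label` column):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional check: verify the ~10:1 negative:positive ratio of the saved set\n",
+    "import pandas as pd\n",
+    "\n",
+    "train = pd.read_csv('data/train_set.csv')\n",
+    "print(train['label'].value_counts())\n",
+    "print('neg/pos ratio:', (train['label'] == 0).sum() / max((train['label'] == 1).sum(), 1))"
+   ]
+  },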
"get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 13/20 over!\n", + "2016-02-14\n", + "2016-02-17\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 14/20 over!\n", + "2016-02-15\n", + "2016-02-18\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 15/20 over!\n", + "2016-02-16\n", + "2016-02-19\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 16/20 over!\n", + "2016-02-17\n", + "2016-02-20\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 17/20 over!\n", + "2016-02-18\n", + "2016-02-21\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 18/20 over!\n", + "2016-02-19\n", + "2016-02-22\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 19/20 over!\n", + "2016-02-20\n", + "2016-02-23\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 20/20 over!\n" + ] + } + ], + "source": [ + "# 训练集\n", + "train_start_date = '2016-02-01'\n", + "make_train_set(train_start_date, 20, 'data/train_set.csv',all_actions)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 构造验证集(线下测试集)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "def make_val_answer(val_start_date, val_end_date, all_actions, label_val_s1_path):\n", + " actions = get_actions(val_start_date, val_end_date,all_actions)\n", + " actions = actions[(actions['type'] == 4) & (actions['cate'] == 8)]\n", + " actions = actions[['user_id', 'sku_id']]\n", + " actions = actions.drop_duplicates()\n", + " actions.to_csv(label_val_s1_path, index=False)\n", + "\n", + "def make_val_set(train_start_date, train_end_date, val_s1_path):\n", + " # 修改时间跨度\n", + " start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n", + " start_days = start_days.strftime('%Y-%m-%d')\n", + " all_actions = get_all_action()\n", + " print (\"get all actions!\")\n", + " user = get_basic_user_feat()\n", + " print ('get_basic_user_feat finsihed')\n", + " \n", + " product = get_basic_product_feat()\n", + " print ('get_basic_product_feat finsihed')\n", + " user_acc = get_recent_user_feat(train_end_date, all_actions)\n", + " print ('get_recent_user_feat finsihed')\n", + " user_cate = get_user_cate_feature(train_start_date, 
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "get all actions!\n",
+      "get_basic_user_feat finished\n",
+      "get_basic_product_feat finished\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n"
+     ]
+    }
+   ],
+   "source": [
+    "# validation set\n",
+    "make_val_set('2016-02-23', '2016-02-26', 'data/val_set.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def make_test_set(train_start_date, train_end_date):\n",
+    "    start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n",
+    "    start_days = start_days.strftime('%Y-%m-%d')\n",
+    "    all_actions = get_all_action()\n",
+    "    print(\"get all actions!\")\n",
+    "    user = get_basic_user_feat()\n",
+    "    print('get_basic_user_feat finished')\n",
+    "    product = get_basic_product_feat()\n",
+    "    print('get_basic_product_feat finished')\n",
+    "    \n",
+    "    user_acc = get_recent_user_feat(train_end_date, all_actions)\n",
+    "    print('get_accumulate_user_feat finished')\n",
+    "    \n",
+    "    user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n",
+    "    print('get_user_cate_feature finished')\n",
+    "    \n",
+    "    product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n",
+    "    print('get_accumulate_product_feat finished')\n",
+    "    cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n",
+    "    print('get_accumulate_cate_feat finished')\n",
+    "    comment_acc = get_comments_product_feat(train_end_date)\n",
+    "\n",
+    "    actions = None\n",
+    "    for i in (3, 5, 7, 10, 15, 21, 30):\n",
+    "        start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)\n",
+    "        start_days = start_days.strftime('%Y-%m-%d')\n",
+    "        if actions is None:\n",
+    "            actions = get_action_feat(start_days, train_end_date, all_actions, i)\n",
+    "        else:\n",
+    "            actions = pd.merge(actions, get_action_feat(start_days, train_end_date, all_actions, i), how='left',\n",
+    "                               on=['user_id', 'sku_id', 'cate'])\n",
+    "\n",
+    "    actions = pd.merge(actions, user, how='left', on='user_id')\n",
+    "    actions = pd.merge(actions, user_acc, how='left', on='user_id')\n",
+    "    # as in make_actions, the original 'actions.append(user_cate)' was a no-op;\n",
+    "    # merge the user-category features instead (same user_id-column assumption)\n",
+    "    actions = pd.merge(actions, user_cate, how='left', on='user_id')\n",
+    "    # note the merge keys here\n",
+    "    actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])\n",
+    "    actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n",
+    "    actions = pd.merge(actions, cate_acc, how='left', on='cate')\n",
+    "    actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n",
+    "\n",
+    "    actions = actions.fillna(0)\n",
+    "\n",
+    "    actions.to_csv(\"data/test_set.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "get all actions!\n",
+      "get_basic_user_feat finished\n",
+      "get_basic_product_feat finished\n",
+      "get_accumulate_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n"
+     ]
+    }
+   ],
+   "source": [
+    "# prediction (submission) set\n",
+    "sub_start_date = '2016-04-13'\n",
+    "sub_end_date = '2016-04-16'\n",
+    "make_test_set(sub_start_date, sub_end_date)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}