Add. Basic feature structure

4 years ago · 9536eb3967
parent e10173ed04
commit 9536eb3967
1 changed files with 719 additions and 1 deletions
--- a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/.ipynb_checkpoints/3-特征工程-checkpoint.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/.ipynb_checkpoints/3-特征工程-checkpoint.ipynb
@ -44,6 +44,7 @@
    "* 用户对各个类别操作行为统计占对所有类别操作行为统计的比重\n",
    "\n",
    "累积商品特征:\n",
+    "\n",
    "* 分时间段\n",
    "* 针对商品的不同行为的\n",
    "* 购买转化率\n",
@ -55,12 +56,729 @@
    "* 均值"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "from datetime import datetime\n",
+    "from datetime import timedelta\n",
+    "import pandas as pd\n",
+    "import pickle\n",
+    "import os\n",
+    "import math\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 11485424 entries, 0 to 11485423\n",
+      "Data columns (total 7 columns):\n",
+      "user_id     float32\n",
+      "sku_id      float32\n",
+      "time        object\n",
+      "model_id    float32\n",
+      "type        float32\n",
+      "cate        float32\n",
+      "brand       float32\n",
+      "dtypes: float32(6), object(1)\n",
+      "memory usage: 350.5+ MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "#float32 降低内存消耗\n",
+    "test = pd.read_csv('data/JData_Action_201602.csv')\n",
+    "test[['user_id','sku_id','model_id','type','cate','brand']] = test[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n",
+    "test.dtypes\n",
+    "test.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 11485424 entries, 0 to 11485423\n",
+      "Data columns (total 7 columns):\n",
+      "user_id     int64\n",
+      "sku_id      int64\n",
+      "time        object\n",
+      "model_id    float64\n",
+      "type        int64\n",
+      "cate        int64\n",
+      "brand       int64\n",
+      "dtypes: float64(1), int64(5), object(1)\n",
+      "memory usage: 613.4+ MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 不转换float32\n",
+    "test = pd.read_csv('data/JData_Action_201602.csv')\n",
+    "# test[['user_id','sku_id','model_id','type','cate','brand']] = test[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n",
+    "test.dtypes\n",
+    "test.info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "可以明显看到`int64`使用的memory usage: 613.4+ MB，比`float32`的多了近一倍的内存使用"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 数据路径\n",
+    "action_1_path = r'data/JData_Action_201602.csv'\n",
+    "action_2_path = r'data/JData_Action_201603.csv'\n",
+    "action_3_path = r'data/JData_Action_201604.csv'\n",
+    " \n",
+    "comment_path = r'data/JData_Comment.csv'\n",
+    "product_path = r'data/JData_Product.csv'\n",
+    "user_path = r'data/JData_User.csv'\n",
+    " \n",
+    "comment_date = [\n",
+    "    \"2016-02-01\", \"2016-02-08\", \"2016-02-15\", \"2016-02-22\", \"2016-02-29\",\n",
+    "    \"2016-03-07\", \"2016-03-14\", \"2016-03-21\", \"2016-03-28\", \"2016-04-04\",\n",
+    "    \"2016-04-11\", \"2016-04-15\"\n",
+    "]"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": []
+   "source": [
+    "# 基本方法\n",
+    "def get_actions_0():\n",
+    "    action = pd.read_csv(action_0_path)\n",
+    "    return action\n",
+    " \n",
+    "def _read_actions_as_float32(path):\n",
+    "    \"\"\"Read the action CSV at ``path`` and cast its numeric columns to float32.\n",
+    "\n",
+    "    The cast replaces the default 64-bit dtypes with 32-bit ones to save memory;\n",
+    "    the ``time`` column is left untouched.\n",
+    "    \"\"\"\n",
+    "    numeric_cols = ['user_id', 'sku_id', 'model_id', 'type', 'cate', 'brand']\n",
+    "    action = pd.read_csv(path)\n",
+    "    action[numeric_cols] = action[numeric_cols].astype('float32')\n",
+    "    return action\n",
+    "\n",
+    "\n",
+    "def get_actions_1():\n",
+    "    \"\"\"Return the action log stored at ``action_1_path`` with float32 columns.\"\"\"\n",
+    "    return _read_actions_as_float32(action_1_path)\n",
+    "\n",
+    "\n",
+    "def get_actions_2():\n",
+    "    \"\"\"Return the action log stored at ``action_2_path`` with float32 columns.\"\"\"\n",
+    "    return _read_actions_as_float32(action_2_path)\n",
+    "\n",
+    "\n",
+    "def get_actions_3():\n",
+    "    \"\"\"Return the action log stored at ``action_3_path`` with float32 columns.\"\"\"\n",
+    "    return _read_actions_as_float32(action_3_path)\n",
+    " \n",
+    "# Chunked loaders; not needed if the machine has enough memory to read a file at once\n",
+    "def _read_actions_in_chunks(path, chunk_size=50000):\n",
+    "    \"\"\"Read the action CSV at ``path`` chunk by chunk and return one DataFrame.\n",
+    "\n",
+    "    Each chunk is cast to float32 before it is kept. The cast has to be applied\n",
+    "    to the chunks: the object returned by ``read_csv`` in iterator mode is a\n",
+    "    reader, not a DataFrame, and cannot be indexed by column.\n",
+    "\n",
+    "    :param path: CSV file to read\n",
+    "    :param chunk_size: number of rows read per chunk\n",
+    "    :return: all chunks concatenated with a fresh 0..n-1 index\n",
+    "    \"\"\"\n",
+    "    numeric_cols = ['user_id', 'sku_id', 'model_id', 'type', 'cate', 'brand']\n",
+    "    chunks = []\n",
+    "    for chunk in pd.read_csv(path, chunksize=chunk_size):\n",
+    "        chunk[numeric_cols] = chunk[numeric_cols].astype('float32')\n",
+    "        chunks.append(chunk)\n",
+    "    print(\"Iteration is stopped\")\n",
+    "    action = pd.concat(chunks, ignore_index=True)\n",
+    "    return action\n",
+    "\n",
+    "\n",
+    "def get_actions_10():\n",
+    "    \"\"\"Chunked variant of get_actions_1: load ``action_1_path`` as float32.\"\"\"\n",
+    "    return _read_actions_in_chunks(action_1_path)\n",
+    "\n",
+    "\n",
+    "def get_actions_20():\n",
+    "    \"\"\"Chunked variant of get_actions_2: load ``action_2_path`` as float32.\"\"\"\n",
+    "    return _read_actions_in_chunks(action_2_path)\n",
+    "\n",
+    "\n",
+    "def get_actions_30():\n",
+    "    \"\"\"Chunked variant of get_actions_3: load ``action_3_path`` as float32.\"\"\"\n",
+    "    return _read_actions_in_chunks(action_3_path)\n",
+    "\n",
+    "# Load every monthly action file and stack them into a single frame\n",
+    "def get_all_action():\n",
+    "    \"\"\"Return the three action logs concatenated row-wise.\n",
+    "\n",
+    "    The row labels of the individual files are kept as they are (the\n",
+    "    concatenation does not renumber the index).\n",
+    "    \"\"\"\n",
+    "    monthly_frames = [get_actions_1(), get_actions_2(), get_actions_3()]\n",
+    "    return pd.concat(monthly_frames)  # type: pd.DataFrame\n",
+    " \n",
+    "# 获取某个时间段的行为记录\n",
+    "def get_actions(start_date, end_date, all_actions):\n",
+    "    \"\"\"\n",
+    "    :param start_date:\n",
+    "    :param end_date:\n",
+    "    :return: actions: pd.Dataframe\n",
+    "    \"\"\"\n",
+    "    actions = all_actions[(all_actions.time >= start_date) & (all_actions.time < end_date)].copy()\n",
+    "    return actions"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 用户特征\n",
+    "### 用户基本特征\n",
+    "获取基本的用户特征，基于用户本身属性多为类别特征的特点，对age,sex,user_lv_cd进行独热编码操作，对于用户注册时间暂时不处理"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn import preprocessing\n",
+    " \n",
+    "def get_basic_user_feat():\n",
+    "    \"\"\"Build one-hot encoded basic user features from the user file.\n",
+    "\n",
+    "    Rows with any missing value are dropped. ``age``, ``sex`` and\n",
+    "    ``user_lv_cd`` are one-hot encoded; ``user_reg_tm`` is not used.\n",
+    "\n",
+    "    :return: DataFrame with ``user_id`` followed by the ``age_*``, ``sex_*``\n",
+    "        and ``user_lv_cd_*`` indicator columns, one row per remaining user\n",
+    "    \"\"\"\n",
+    "    # The user file is read with the GBK encoding\n",
+    "    user = pd.read_csv(user_path, encoding='gbk')\n",
+    "    # Drop every row that has at least one missing field\n",
+    "    user.dropna(axis=0, how='any', inplace=True)\n",
+    "    user['sex'] = user['sex'].astype(int)\n",
+    "    user['age'] = user['age'].astype(int)\n",
+    "    # Map the age values to consecutive codes 0, 1, 2, 3, ...\n",
+    "    le = preprocessing.LabelEncoder()\n",
+    "    # fit_transform returns a bare array. Re-attach the index of `user` so the\n",
+    "    # age dummies stay on the rows they belong to: dropna leaves gaps in the\n",
+    "    # index and concat(axis=1) aligns on index labels, so a fresh 0..n-1 index\n",
+    "    # would shift the age columns and introduce NaN rows.\n",
+    "    age_codes = pd.Series(le.fit_transform(user['age']), index=user.index)\n",
+    " \n",
+    "    # One indicator column per code, named prefix + '_' + value,\n",
+    "    # e.g. codes [0, 1, 2, 3] become age_0, age_1, age_2, age_3\n",
+    "    age_df = pd.get_dummies(age_codes, prefix='age')\n",
+    "    sex_df = pd.get_dummies(user['sex'], prefix='sex')\n",
+    "    user_lv_df = pd.get_dummies(user['user_lv_cd'], prefix='user_lv_cd')\n",
+    "    user = pd.concat([user['user_id'], age_df, sex_df, user_lv_df], axis=1)\n",
+    "    return user"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "user_id        False\n",
+       "age             True\n",
+       "sex             True\n",
+       "user_lv_cd     False\n",
+       "user_reg_tm     True\n",
+       "dtype: bool"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "user = pd.read_csv(user_path, encoding='gbk')\n",
+    "user.isnull().any()  # 判断是否有空值，True为有"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>user_id</th>\n",
+       "      <th>age</th>\n",
+       "      <th>sex</th>\n",
+       "      <th>user_lv_cd</th>\n",
+       "      <th>user_reg_tm</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>34072</th>\n",
+       "      <td>234073</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>34072</th>\n",
+       "      <td>234073</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>34072</th>\n",
+       "      <td>234073</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>38905</th>\n",
+       "      <td>238906</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>38905</th>\n",
+       "      <td>238906</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>38905</th>\n",
+       "      <td>238906</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>67704</th>\n",
+       "      <td>267705</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>67704</th>\n",
+       "      <td>267705</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>67704</th>\n",
+       "      <td>267705</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       user_id  age  sex  user_lv_cd user_reg_tm\n",
+       "34072   234073  NaN  NaN           1         NaN\n",
+       "34072   234073  NaN  NaN           1         NaN\n",
+       "34072   234073  NaN  NaN           1         NaN\n",
+       "38905   238906  NaN  NaN           1         NaN\n",
+       "38905   238906  NaN  NaN           1         NaN\n",
+       "38905   238906  NaN  NaN           1         NaN\n",
+       "67704   267705  NaN  NaN           1         NaN\n",
+       "67704   267705  NaN  NaN           1         NaN\n",
+       "67704   267705  NaN  NaN           1         NaN"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "user[user.isnull().values==True]  # 查看空值的部分"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "这些数据不仅没有年龄、性别、注册时间，而且实际只涉及3个用户（上表显示9行，是因为每个用户有3个缺失字段，同一行被重复列出了3次），数量很少，我们直接删除"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "user_id        False\n",
+       "age            False\n",
+       "sex            False\n",
+       "user_lv_cd     False\n",
+       "user_reg_tm    False\n",
+       "dtype: bool"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "user.dropna(axis=0, how='any',inplace=True)\n",
+    "user.isnull().any()  # 判断是否有空值，已经全部为False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 商品特征\n",
+    "### 商品基本特征\n",
+    "根据商品文件获取基本的特征，针对属性a1,a2,a3进行独热编码，商品类别和品牌直接作为特征"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_basic_product_feat():\n",
+    "    \"\"\"Build basic product features from the product file.\n",
+    "\n",
+    "    ``sku_id``, ``cate`` and ``brand`` are kept as they are; the attributes\n",
+    "    ``a1``, ``a2`` and ``a3`` are each expanded into one-hot columns.\n",
+    "    \"\"\"\n",
+    "    product = pd.read_csv(product_path)\n",
+    "    id_cols = product[['sku_id', 'cate', 'brand']]\n",
+    "    one_hot_parts = [pd.get_dummies(product[attr], prefix=attr) for attr in ('a1', 'a2', 'a3')]\n",
+    "    return pd.concat([id_cols] + one_hot_parts, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 评论特征\n",
+    "* 分时间段\n",
+    "* 对评论数进行独热编码"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_comments_product_feat(end_date):\n",
+    "    \"\"\"Return per-product comment features for the snapshot preceding ``end_date``.\n",
+    "\n",
+    "    :param end_date: date string; the latest entry of ``comment_date`` that is\n",
+    "        strictly smaller is used, falling back to the first entry\n",
+    "    :return: DataFrame with ``sku_id``, ``has_bad_comment``, ``bad_comment_rate``\n",
+    "        and the indicator columns ``comment_num_0`` .. ``comment_num_4``\n",
+    "    \"\"\"\n",
+    "    comments = pd.read_csv(comment_path)\n",
+    "    # Walk the dates from newest to oldest and stop at the first one before end_date\n",
+    "    snapshot = next((d for d in reversed(comment_date) if d < end_date), comment_date[0])\n",
+    "    comments = comments[comments.dt == snapshot]\n",
+    "    bucket_cols = ['comment_num_%d' % level for level in range(5)]\n",
+    "    dummies = pd.get_dummies(comments['comment_num'], prefix='comment_num')\n",
+    "    # A snapshot may not contain every comment_num value (the test set had no 0);\n",
+    "    # add the missing indicator columns as all-zero so the layout is always the same\n",
+    "    for col in bucket_cols:\n",
+    "        if col not in dummies.columns:\n",
+    "            dummies[col] = 0\n",
+    "    dummies = dummies[bucket_cols]\n",
+    "\n",
+    "    comments = pd.concat([comments, dummies], axis=1)  # type: pd.DataFrame\n",
+    "    kept_cols = ['sku_id', 'has_bad_comment', 'bad_comment_rate'] + bucket_cols\n",
+    "    return comments[kept_cols]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_start_date = '2016-02-01'\n",
+    "train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)\n",
+    "train_end_date = train_end_date.strftime('%Y-%m-%d')\n",
+    "day = 3\n",
+    " \n",
+    "start_date = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=day)\n",
+    "start_date = start_date.strftime('%Y-%m-%d')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>sku_id</th>\n",
+       "      <th>has_bad_comment</th>\n",
+       "      <th>bad_comment_rate</th>\n",
+       "      <th>comment_num_0</th>\n",
+       "      <th>comment_num_1</th>\n",
+       "      <th>comment_num_2</th>\n",
+       "      <th>comment_num_3</th>\n",
+       "      <th>comment_num_4</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1000</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.0417</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>10000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>100011</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.0376</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>100018</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>100020</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   sku_id  has_bad_comment  bad_comment_rate  comment_num_0  comment_num_1  \\\n",
+       "0    1000                1            0.0417              0              0   \n",
+       "1   10000                0            0.0000              0              0   \n",
+       "2  100011                1            0.0376              0              0   \n",
+       "3  100018                0            0.0000              0              0   \n",
+       "4  100020                0            0.0000              0              0   \n",
+       "\n",
+       "   comment_num_2  comment_num_3  comment_num_4  \n",
+       "0              0              1              0  \n",
+       "1              1              0              0  \n",
+       "2              0              0              1  \n",
+       "3              0              1              0  \n",
+       "4              0              1              0  "
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "comments = pd.read_csv(comment_path)\n",
+    "comment_date_end = train_end_date\n",
+    "comment_date_begin = comment_date[0]\n",
+    "for date in reversed(comment_date):\n",
+    "     if date < comment_date_end:\n",
+    "        comment_date_begin = date\n",
+    "        break\n",
+    "comments = comments[comments.dt==comment_date_begin]\n",
+    "df = pd.get_dummies(comments['comment_num'], prefix='comment_num')\n",
+    "for i in range(0, 5):\n",
+    "    if 'comment_num_' + str(i) not in df.columns:\n",
+    "         df['comment_num_' + str(i)] = 0\n",
+    "df = df[['comment_num_0', 'comment_num_1', 'comment_num_2', 'comment_num_3', 'comment_num_4']]\n",
+    "    \n",
+    "comments = pd.concat([comments, df], axis=1) # type: pd.DataFrame\n",
+    "comments = comments[['sku_id', 'has_bad_comment', 'bad_comment_rate','comment_num_0', 'comment_num_1', \n",
+    "                        'comment_num_2', 'comment_num_3', 'comment_num_4']]\n",
+    "comments.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "comment_num_0/1/2/3/4分别对应着，0无评论/1表示1条/2表示2-10条/3表示11-50条/4表示大于50条，bad_comment_rate差评率，has_bad_comment是否包含差评。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 行为特征\n",
+    "* 分时间段\n",
+    "* 对行为类别进行独热编码\n",
+    "* 分别按照用户-类别行为分组和用户-类别-商品行为分组统计，然后计算\n",
+    "    * 用户对同类别下其他商品的行为计数\n",
+    "    * 针对用户对同类别下目标商品的行为计数与该时间段的行为均值作差"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_action_feat(start_date, end_date, all_actions, i):\n",
+    "    \"\"\"Count each user's actions per product and per category inside a time window.\n",
+    "\n",
+    "    With ``p = 'action_before_%s' % i`` and action types t = 1..6, the result has\n",
+    "    one row per (user_id, sku_id, cate) and these columns:\n",
+    "\n",
+    "    * ``p_<t>.0_x``: number of type-t actions of the user on this product\n",
+    "    * ``p_<t>.0_y``: number of type-t actions of the user on the other\n",
+    "      products of the same category\n",
+    "    * ``p + 'minus_mean_<t>'``: the ``_x`` count minus that count divided by ``i``\n",
+    "\n",
+    "    :param start_date: start of the window, passed to get_actions\n",
+    "    :param end_date: end of the window, passed to get_actions\n",
+    "    :param all_actions: action log with user_id, sku_id, cate, type and time columns\n",
+    "    :param i: window label (3, 5, 7, 10, 15, 21, 30 in this notebook); used in\n",
+    "        the column names and as the divisor of the mean\n",
+    "    :return: the feature DataFrame described above (``cate`` is kept)\n",
+    "    \"\"\"\n",
+    "    actions = get_actions(start_date, end_date, all_actions)\n",
+    "    actions = actions[['user_id', 'sku_id', 'cate', 'type']]\n",
+    "    before_date = 'action_before_%s' % i\n",
+    "    # One-hot encode the action type. The cast makes the dummy column names end\n",
+    "    # in '<t>.0' even when `type` was loaded as an integer column.\n",
+    "    df = pd.get_dummies(actions['type'].astype('float32'), prefix=before_date)\n",
+    "    # A window may contain no action of some type; add those columns as zeros\n",
+    "    # so that the lookups below never fail\n",
+    "    for t in range(1, 7):\n",
+    "        type_col = before_date + '_%d.0' % t\n",
+    "        if type_col not in df.columns:\n",
+    "            df[type_col] = 0\n",
+    "    actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame\n",
+    "    # Counts per user / product / category\n",
+    "    actions = actions.groupby(['user_id', 'sku_id', 'cate'], as_index=False).sum()\n",
+    "    # Counts per user / category\n",
+    "    user_cate = actions.groupby(['user_id', 'cate'], as_index=False).sum()\n",
+    "    del user_cate['sku_id']\n",
+    "    del user_cate['type']\n",
+    "    # Both frames carry the same count columns, so the merge suffixes them:\n",
+    "    # _x = per-product count, _y = per-category count\n",
+    "    actions = pd.merge(actions, user_cate, how='left', on=['user_id', 'cate'])\n",
+    "    for t in range(1, 7):\n",
+    "        own_col = before_date + '_%d.0_x' % t\n",
+    "        cate_col = before_date + '_%d.0_y' % t\n",
+    "        # Category total minus this product = actions on the other products\n",
+    "        actions[cate_col] = actions[cate_col] - actions[own_col]\n",
+    "        # Count on this product minus that count averaged over i\n",
+    "        actions[before_date + 'minus_mean_%d' % t] = actions[own_col] - (actions[own_col] / i)\n",
+    "    del actions['type']\n",
+    "    # cate is kept as a feature\n",
+    " \n",
+    "    return actions\n",
+    " \n",
+    " \n",
+    "actions = get_actions(start_date, train_end_date, all_actions)\n",
+    "actions = actions[['user_id', 'sku_id', 'cate','type']]\n",
+    "    # 不同时间累积的行为计数（3,5,7,10,15,21,30）\n",
+    "df = pd.get_dummies(actions['type'], prefix='action_before_%s' %3)\n",
+    "before_date = 'action_before_%s' %3\n",
+    "actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame\n",
+    "    # 分组统计，用户-类别-商品,不同用户对不同类别下商品的行为计数\n",
+    "actions = actions.groupby(['user_id', 'sku_id','cate'], as_index=False).sum()\n",
+    "actions.head(20)"
+   ]
  }
 ],
 "metadata": {