From 280a75e0feb60f1bdbf82a9ff94cfd2f7435c222 Mon Sep 17 00:00:00 2001 From: benjas <909336740@qq.com> Date: Mon, 8 Feb 2021 22:27:50 +0800 Subject: [PATCH] Update some methods --- .../3-特征工程 - 副本.ipynb | 3619 +++++++++++++++++ 1 file changed, 3619 insertions(+) create mode 100644 机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程 - 副本.ipynb diff --git a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程 - 副本.ipynb b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程 - 副本.ipynb new file mode 100644 index 0000000..035fe39 --- /dev/null +++ b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程 - 副本.ipynb @@ -0,0 +1,3619 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3 - Feature Engineering" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Basic user features:\n", + "* Extract the basic user features. Since the user's own attributes are mostly categorical, one-hot encode age, sex and usr_lv_cd; the user registration time is left unprocessed for now.\n", + "\n", + "Basic product features:\n", + "* Extract the basic features from the product file\n", + "* One-hot encode the attributes a1, a2 and a3\n", + "* Use the product category and brand directly as features; different brands carry different influence and purchasing power\n", + "\n", + "Comment features:\n", + "* Split by time window\n", + "* One-hot encode the comment count: 0 means no comments, 1 means exactly 1 comment, 2 means 2-10 comments, 3 means 11-50 comments, 4 means more than 50 comments; the levels 0~4 are one-hot encoded\n", + "\n", + "Behavior features:\n", + "* Split by time window; in general, the most recent actions influence the future most clearly\n", + "* One-hot encode the action types: levels 1~6\n", + "* Group by user-category and by user-category-item, then compute\n", + "* the user's action counts on the other items in the same category\n", + "* accumulated action counts over different windows (3, 5, 7, 10, 15, 21, 30 days)\n", + "\n", + "Accumulated user features:\n", + "* Split by time window\n", + "* For each user action type\n", + "* purchase conversion rate\n", + "* mean; as above, over different windows: 3 days, 5 days, ...\n", + "\n", + "Recent user behavior features:\n", + "* On top of the accumulated user features above, extract the user's features over the last month and the last three days, then compute the share of the month's behavior that remains once the most recent three days are removed\n", + "\n", + "User behavior across items of the same category:\n", + "* Counts of each action type per category for each user\n", + "* The share of each category's action counts in the user's action counts over all categories\n", + "\n", + "Accumulated product features:\n", + "\n", + "* Split by time window\n", + "* For each action type on the product\n", + "* purchase conversion rate\n", + "* mean\n", + "\n", + "Category features:\n", + "* For each product category, per time window\n", + "* purchase conversion rate\n", + "* mean" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from datetime import datetime\n", + "from datetime import timedelta\n", + "import pandas as pd\n", + "import pickle\n", + "import os\n", + "import math\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 11485424 entries, 0 to 11485423\n", + "Data columns (total 7 columns):\n", + "user_id float32\n", + "sku_id float32\n", + "time object\n", + "model_id float32\n", + "type float32\n", + "cate float32\n", + "brand float32\n", + "dtypes: float32(6), object(1)\n", + "memory usage: 350.5+ MB\n" + ] + } + ], + "source": [ + "# float32 lowers the memory footprint\n", + "test = pd.read_csv('data/JData_Action_201602.csv')\n", + "test[['user_id','sku_id','model_id','type','cate','brand']] = test[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n", + "test.dtypes\n", + "test.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 11485424 entries, 0 to 11485423\n", + "Data columns (total 7 columns):\n", + "user_id int64\n", + "sku_id int64\n", + "time object\n", + "model_id float64\n", + "type int64\n", + "cate int64\n", + "brand int64\n", + "dtypes: float64(1), int64(5), object(1)\n", + "memory usage: 613.4+ MB\n" + ] + } + ], + "source": [ + "# without converting to float32\n", + "test = pd.read_csv('data/JData_Action_201602.csv')\n", + "# test[['user_id','sku_id','model_id','type','cate','brand']] = test[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n", + "test.dtypes\n", + "test.info()" + ] + },
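 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A minimal sketch of an alternative (not in the original notebook): `pd.read_csv` accepts a `dtype` mapping, so the columns can be loaded as `float32` directly instead of being cast afterwards. Column names as above; `model_id` has to stay floating point because it contains NaN." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# hedged sketch: downcast at read time rather than after the fact\n", + "float32_cols = {c: 'float32' for c in ['user_id', 'sku_id', 'model_id', 'type', 'cate', 'brand']}\n", + "test = pd.read_csv('data/JData_Action_201602.csv', dtype=float32_cols)\n", + "test.info() # should report roughly the same ~350 MB as the astype version" + ] + },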
"cell_type": "markdown", + "metadata": {}, + "source": [ + "可以明显看到`int64`使用的memory usage: 613.4+ MB,比`float32`的多了近一倍的内存使用" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# 数据路径\n", + "action_1_path = r'data/JData_Action_201602.csv'\n", + "action_2_path = r'data/JData_Action_201603.csv'\n", + "action_3_path = r'data/JData_Action_201604.csv'\n", + " \n", + "comment_path = r'data/JData_Comment.csv'\n", + "product_path = r'data/JData_Product.csv'\n", + "user_path = r'data/JData_User.csv'\n", + " \n", + "comment_date = [\n", + " \"2016-02-01\", \"2016-02-08\", \"2016-02-15\", \"2016-02-22\", \"2016-02-29\",\n", + " \"2016-03-07\", \"2016-03-14\", \"2016-03-21\", \"2016-03-28\", \"2016-04-04\",\n", + " \"2016-04-11\", \"2016-04-15\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# 基本方法\n", + "def get_actions_0():\n", + " action = pd.read_csv(action_0_path)\n", + " return action\n", + " \n", + "def get_actions_1():\n", + " action = pd.read_csv(action_1_path)\n", + " action[['user_id','sku_id','model_id','type','cate','brand']] = action[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n", + " return action\n", + "\n", + "def get_actions_2():\n", + " action = pd.read_csv(action_2_path)\n", + " action[['user_id','sku_id','model_id','type','cate','brand']] = action[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n", + " \n", + " return action\n", + "def get_actions_3():\n", + " action = pd.read_csv(action_3_path)\n", + " action[['user_id','sku_id','model_id','type','cate','brand']] = action[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n", + " \n", + " return action\n", + " \n", + "#如果电脑性能好就不用分块\n", + "def get_actions_10():\n", + " \n", + " reader = pd.read_csv(action_1_path, iterator=True)\n", + " reader[['user_id','sku_id','model_id','type','cate','brand']] = reader[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n", + " chunks = []\n", + " loop = True\n", + " while loop:\n", + " try:\n", + " chunk = reader.get_chunk(50000)\n", + " chunks.append(chunk)\n", + " except StopIteration:\n", + " loop = False\n", + " print(\"Iteration is stopped\")\n", + " action = pd.concat(chunks, ignore_index=True)\n", + " \n", + " return action\n", + "def get_actions_20():\n", + " \n", + " reader = pd.read_csv(action_2_path, iterator=True)\n", + " reader[['user_id','sku_id','model_id','type','cate','brand']] = reader[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n", + " chunks = []\n", + " loop = True\n", + " while loop:\n", + " try:\n", + " chunk = reader.get_chunk(50000)\n", + " chunks.append(chunk)\n", + " except StopIteration:\n", + " loop = False\n", + " print(\"Iteration is stopped\")\n", + " action = pd.concat(chunks, ignore_index=True)\n", + " \n", + " return action\n", + "def get_actions_30():\n", + " \n", + " reader = pd.read_csv(action_3_path, iterator=True)\n", + " reader[['user_id','sku_id','model_id','type','cate','brand']] = reader[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n", + " chunks = []\n", + " loop = True\n", + " while loop:\n", + " try:\n", + " chunk = reader.get_chunk(50000)\n", + " chunks.append(chunk)\n", + " except StopIteration:\n", + " loop = False\n", + " print(\"Iteration is stopped\")\n", + " action = pd.concat(chunks, ignore_index=True)\n", + " \n", + " return action\n", + "\n", + "# 
+ "# Read and concatenate all the action record files\n", + "def get_all_action():\n", + " action_1 = get_actions_1()\n", + " action_2 = get_actions_2()\n", + " action_3 = get_actions_3()\n", + " actions = pd.concat([action_1, action_2, action_3]) # type: pd.DataFrame\n", + " \n", + " return actions\n", + " \n", + "# Get the action records inside a given time window\n", + "def get_actions(start_date, end_date, all_actions):\n", + " \"\"\"\n", + " :param start_date:\n", + " :param end_date:\n", + " :return: actions: pd.DataFrame\n", + " \"\"\"\n", + " actions = all_actions[(all_actions.time >= start_date) & (all_actions.time < end_date)].copy()\n", + " return actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### User Features\n", + "### Basic user features\n", + "Extract the basic user features. Since the user's own attributes are mostly categorical, one-hot encode age, sex and usr_lv_cd; the user registration time is left unprocessed for now" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import preprocessing\n", + " \n", + "def get_basic_user_feat():\n", + " # Handle the Chinese characters in the age column: read with the right encoding, drop missing rows, turn the labels numeric, then one-hot encode; sex is also cast to a numeric type\n", + " user = pd.read_csv(user_path, encoding='gbk')\n", + " # axis=0/1 = rows/columns containing missing values; how=any/all = drop if any value is missing / only if all are; inplace = modify the original frame\n", + " user.dropna(axis=0, how='any',inplace=True)\n", + " user['sex'] = user['sex'].astype(int) \n", + " user['age'] = user['age'].astype(int)\n", + " le = preprocessing.LabelEncoder()\n", + " age_df = le.fit_transform(user['age']) # normalize the labels to 0,1,2,3...\n", + " \n", + " age_df = pd.get_dummies(age_df, prefix='age') # turn the encoded values into 0/1 one-hot columns\n", + "# e.g. the original age values [0,1,2,3] become:\n", + "# age_0 [1,0,0,0]\n", + "# age_1 [0,1,0,0]\n", + "# age_2 [0,0,1,0]\n", + "# age_3 [0,0,0,1]; a cell is 1 where the value matches and 0 everywhere else, and column name + value = new column name\n", + " sex_df = pd.get_dummies(user['sex'], prefix='sex')\n", + " user_lv_df = pd.get_dummies(user['user_lv_cd'], prefix='user_lv_cd')\n", + " user = pd.concat([user['user_id'], age_df, sex_df, user_lv_df], axis=1) # merge\n", + " return user" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "user_id False\n", + "age True\n", + "sex True\n", + "user_lv_cd False\n", + "user_reg_tm True\n", + "dtype: bool" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user = pd.read_csv(user_path, encoding='gbk')\n", + "user.isnull().any() # check for missing values; True means the column has some" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idagesexuser_lv_cduser_reg_tm
34072234073NaNNaN1NaN
34072234073NaNNaN1NaN
34072234073NaNNaN1NaN
38905238906NaNNaN1NaN
38905238906NaNNaN1NaN
38905238906NaNNaN1NaN
67704267705NaNNaN1NaN
67704267705NaNNaN1NaN
67704267705NaNNaN1NaN
\n", + "
" + ], + "text/plain": [ + " user_id age sex user_lv_cd user_reg_tm\n", + "34072 234073 NaN NaN 1 NaN\n", + "34072 234073 NaN NaN 1 NaN\n", + "34072 234073 NaN NaN 1 NaN\n", + "38905 238906 NaN NaN 1 NaN\n", + "38905 238906 NaN NaN 1 NaN\n", + "38905 238906 NaN NaN 1 NaN\n", + "67704 267705 NaN NaN 1 NaN\n", + "67704 267705 NaN NaN 1 NaN\n", + "67704 267705 NaN NaN 1 NaN" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user[user.isnull().values==True] # 查看空值的部分" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "这些数据不仅没有年龄、性别、注册时间,数据只有9条,比较少,我们直接删除" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "user_id False\n", + "age False\n", + "sex False\n", + "user_lv_cd False\n", + "user_reg_tm False\n", + "dtype: bool" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user.dropna(axis=0, how='any',inplace=True)\n", + "user.isnull().any() # 判断是否有空值,已经全部为False" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 商品特征\n", + "### 商品基本特征\n", + "根据商品文件获取基本的特征,针对属性a1,a2,a3进行独热编码,商品类别和品牌直接作为特征" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def get_basic_product_feat():\n", + " product = pd.read_csv(product_path)\n", + " attr1_df = pd.get_dummies(product[\"a1\"], prefix=\"a1\")\n", + " attr2_df = pd.get_dummies(product[\"a2\"], prefix=\"a2\")\n", + " attr3_df = pd.get_dummies(product[\"a3\"], prefix=\"a3\")\n", + " product = pd.concat([product[['sku_id', 'cate', 'brand']], attr1_df, attr2_df, attr3_df], axis=1)\n", + " return product" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 评论特征\n", + "* 分时间段\n", + "* 对评论数进行独热编码" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def get_comments_product_feat(end_date):\n", + " comments = pd.read_csv(comment_path)\n", + " comment_date_end = end_date\n", + " comment_date_begin = comment_date[0]\n", + " for date in reversed(comment_date):\n", + " if date < comment_date_end:\n", + " comment_date_begin = date\n", + " break\n", + " comments = comments[comments.dt==comment_date_begin]\n", + " df = pd.get_dummies(comments['comment_num'], prefix='comment_num')\n", + " # 为了防止某个时间段不具备评论数为0的情况(测试集出现过这种情况)\n", + " for i in range(0, 5):\n", + " if 'comment_num_' + str(i) not in df.columns:\n", + " df['comment_num_' + str(i)] = 0\n", + " df = df[['comment_num_0', 'comment_num_1', 'comment_num_2', 'comment_num_3', 'comment_num_4']]\n", + " \n", + " comments = pd.concat([comments, df], axis=1) # type: pd.DataFrame\n", + " #del comments['dt']\n", + " #del comments['comment_num']\n", + " comments = comments[['sku_id', 'has_bad_comment', 'bad_comment_rate','comment_num_0', 'comment_num_1', \n", + " 'comment_num_2', 'comment_num_3', 'comment_num_4']]\n", + " return comments" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "train_start_date = '2016-02-01'\n", + "train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)\n", + "train_end_date = train_end_date.strftime('%Y-%m-%d')\n", + "day = 3\n", + " \n", + "start_date = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=day)\n", + "start_date = start_date.strftime('%Y-%m-%d')" + ] + }, + { + "cell_type": "code", + 
"execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sku_idhas_bad_commentbad_comment_ratecomment_num_0comment_num_1comment_num_2comment_num_3comment_num_4
0100010.041700010
11000000.000000100
210001110.037600001
310001800.000000010
410002000.000000010
\n", + "
" + ], + "text/plain": [ + " sku_id has_bad_comment bad_comment_rate comment_num_0 comment_num_1 \\\n", + "0 1000 1 0.0417 0 0 \n", + "1 10000 0 0.0000 0 0 \n", + "2 100011 1 0.0376 0 0 \n", + "3 100018 0 0.0000 0 0 \n", + "4 100020 0 0.0000 0 0 \n", + "\n", + " comment_num_2 comment_num_3 comment_num_4 \n", + "0 0 1 0 \n", + "1 1 0 0 \n", + "2 0 0 1 \n", + "3 0 1 0 \n", + "4 0 1 0 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comments = pd.read_csv(comment_path)\n", + "comment_date_end = train_end_date\n", + "comment_date_begin = comment_date[0]\n", + "for date in reversed(comment_date):\n", + " if date < comment_date_end:\n", + " comment_date_begin = date\n", + " break\n", + "comments = comments[comments.dt==comment_date_begin]\n", + "df = pd.get_dummies(comments['comment_num'], prefix='comment_num')\n", + "for i in range(0, 5):\n", + " if 'comment_num_' + str(i) not in df.columns:\n", + " df['comment_num_' + str(i)] = 0\n", + "df = df[['comment_num_0', 'comment_num_1', 'comment_num_2', 'comment_num_3', 'comment_num_4']]\n", + " \n", + "comments = pd.concat([comments, df], axis=1) # type: pd.DataFrame\n", + "comments = comments[['sku_id', 'has_bad_comment', 'bad_comment_rate','comment_num_0', 'comment_num_1', \n", + " 'comment_num_2', 'comment_num_3', 'comment_num_4']]\n", + "comments.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "comment_num_0/1/2/3/4分别对应着,0无评论/1表示1条/2表示2-10条/3表示11-50条/4表示大于50条,bad_comment_rate差评率,has_bad_comment是否包含差评。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 行为特征\n", + "* 分时间段\n", + "* 对行为类别进行独热编码\n", + "* 分别按照用户-类别行为分组和用户-类别-商品行为分组统计,然后计算\n", + " * 用户对同类别下其他商品的行为计数\n", + " * 针对用户对同类别下目标商品的行为计数与该时间段的行为均值作差" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def get_action_feat(start_date, end_date, all_actions, i):\n", + " actions = get_actions(start_date, end_date, all_actions)\n", + " actions = actions[['user_id', 'sku_id', 'cate','type']]\n", + " # 不同时间累积的行为计数(3,5,7,10,15,21,30)\n", + " df = pd.get_dummies(actions['type'], prefix='action_before_%s' %i)\n", + " before_date = 'action_before_%s' %i\n", + " actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame\n", + " # 分组统计,用户-类别-商品,不同用户对不同类别下商品的行为计数\n", + " actions = actions.groupby(['user_id', 'sku_id','cate'], as_index=False).sum()\n", + " # 分组统计,用户-类别,不同用户对不同商品类别的行为计数\n", + " user_cate = actions.groupby(['user_id','cate'], as_index=False).sum()\n", + " del user_cate['sku_id']\n", + " del user_cate['type']\n", + " actions = pd.merge(actions, user_cate, how='left', on=['user_id','cate'])\n", + " #本类别下其他商品点击量\n", + " # 前述两种分组含有相同名称的不同行为的计数,系统会自动针对名称调整添加后缀,x,y,所以这里作差统计的是同一类别下其他商品的行为计数\n", + " actions[before_date+'_1.0_y'] = actions[before_date+'_1.0_y'] - actions[before_date+'_1.0_x']\n", + " actions[before_date+'_2.0_y'] = actions[before_date+'_2.0_y'] - actions[before_date+'_2.0_x']\n", + " actions[before_date+'_3.0_y'] = actions[before_date+'_3.0_y'] - actions[before_date+'_3.0_x']\n", + " actions[before_date+'_4.0_y'] = actions[before_date+'_4.0_y'] - actions[before_date+'_4.0_x']\n", + " actions[before_date+'_5.0_y'] = actions[before_date+'_5.0_y'] - actions[before_date+'_5.0_x']\n", + " actions[before_date+'_6.0_y'] = actions[before_date+'_6.0_y'] - actions[before_date+'_6.0_x']\n", + " # 统计用户对不同类别下商品计数与该类别下商品行为计数均值(对时间)的差值\n", + " actions[before_date+'minus_mean_1'] = actions[before_date+'_1.0_x'] - 
 + " actions[before_date+'minus_mean_1'] = actions[before_date+'_1.0_x'] - (actions[before_date+'_1.0_x']/i)\n", + " actions[before_date+'minus_mean_2'] = actions[before_date+'_2.0_x'] - (actions[before_date+'_2.0_x']/i)\n", + " actions[before_date+'minus_mean_3'] = actions[before_date+'_3.0_x'] - (actions[before_date+'_3.0_x']/i)\n", + " actions[before_date+'minus_mean_4'] = actions[before_date+'_4.0_x'] - (actions[before_date+'_4.0_x']/i)\n", + " actions[before_date+'minus_mean_5'] = actions[before_date+'_5.0_x'] - (actions[before_date+'_5.0_x']/i)\n", + " actions[before_date+'minus_mean_6'] = actions[before_date+'_6.0_x'] - (actions[before_date+'_6.0_x']/i)\n", + " del actions['type']\n", + " \n", + " return actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### User-Behavior\n", + "#### Accumulated user features\n", + "* Split by time window\n", + "* For each user action type\n", + " * purchase conversion rate\n", + " * mean" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def get_accumulate_user_feat(end_date, all_actions, day):\n", + " start_date = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=day)\n", + " start_date = start_date.strftime('%Y-%m-%d')\n", + " before_date = 'user_action_%s' % day\n", + "\n", + " feature = [\n", + " 'user_id', before_date + '_1', before_date + '_2', before_date + '_3',\n", + " before_date + '_4', before_date + '_5', before_date + '_6',\n", + " before_date + '_1_ratio', before_date + '_2_ratio',\n", + " before_date + '_3_ratio', before_date + '_5_ratio',\n", + " before_date + '_6_ratio', before_date + '_1_mean',\n", + " before_date + '_2_mean', before_date + '_3_mean',\n", + " before_date + '_4_mean', before_date + '_5_mean',\n", + " before_date + '_6_mean', before_date + '_1_std',\n", + " before_date + '_2_std', before_date + '_3_std', before_date + '_4_std',\n", + " before_date + '_5_std', before_date + '_6_std'\n", + " ]\n", + "\n", + " actions = get_actions(start_date, end_date, all_actions)\n", + " df = pd.get_dummies(actions['type'], prefix=before_date)\n", + "\n", + " actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())\n", + "\n", + " actions = pd.concat([actions[['user_id', 'date']], df], axis=1)\n", + " \n", + " # group by user and compute each action type's conversion rate and mean\n", + " actions = actions.groupby(['user_id'], as_index=False).sum()\n", + "\n", + " actions[before_date + '_1_ratio'] = np.log(1 + actions[before_date + '_4.0']) - np.log(1 + actions[before_date +'_1.0'])\n", + " actions[before_date + '_2_ratio'] = np.log(1 + actions[before_date + '_4.0']) - np.log(1 + actions[before_date +'_2.0'])\n", + " actions[before_date + '_3_ratio'] = np.log(1 + actions[before_date + '_4.0']) - np.log(1 + actions[before_date +'_3.0'])\n", + " actions[before_date + '_5_ratio'] = np.log(1 + actions[before_date + '_4.0']) - np.log(1 + actions[before_date +'_5.0'])\n", + " actions[before_date + '_6_ratio'] = np.log(1 + actions[before_date + '_4.0']) - np.log(1 + actions[before_date +'_6.0'])\n", + " # means\n", + " actions[before_date + '_1_mean'] = actions[before_date + '_1.0'] / day\n", + " actions[before_date + '_2_mean'] = actions[before_date + '_2.0'] / day\n", + " actions[before_date + '_3_mean'] = actions[before_date + '_3.0'] / day\n", + " actions[before_date + '_4_mean'] = actions[before_date + '_4.0'] / day\n", + " actions[before_date + '_5_mean'] = actions[before_date + '_5.0'] / day\n", + " actions[before_date + '_6_mean'] = actions[before_date + '_6.0'] / day\n", + " #actions = pd.merge(actions, actions_date, how='left', on='user_id')\n", + " #actions = actions[feature]\n", + " return actions" + ] + },
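 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A minimal worked example of the log-ratio form used above (toy numbers, not part of the pipeline): `log(1 + buys) - log(1 + other)` equals `log((1 + buys) / (1 + other))`, a smoothed log conversion rate that stays finite even when a count is zero." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# purely illustrative counts; type 4 is the purchase action used throughout this notebook\n", + "demo = pd.DataFrame({'user_action_3_1.0': [10, 0, 4], 'user_action_3_4.0': [2, 1, 0]})\n", + "demo['user_action_3_1_ratio'] = np.log(1 + demo['user_action_3_4.0']) - np.log(1 + demo['user_action_3_1.0'])\n", + "demo # first row: log(3) - log(11) = -1.299; higher (less negative) means better conversion" + ] + },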
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户近期行为特征\n", + "在上面针对用户进行累积特征提取的基础上,分别提取用户近一个月、近三天的特征,然后提取一个月内用户除去最近三天的行为占据一个月的行为的比重" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def get_recent_user_feat(end_date, all_actions):\n", + " actions_3 = get_accumulate_user_feat(end_date, all_actions, 3)\n", + " actions_30 = get_accumulate_user_feat(end_date, all_actions, 30)\n", + " actions = pd.merge(actions_3, actions_30, how ='left', on='user_id')\n", + " del actions_3\n", + " del actions_30\n", + " \n", + " actions['recent_action1'] = np.log(1 + actions['user_action_30_1.0']-actions['user_action_3_1.0']) - np.log(1 + actions['user_action_30_1.0'])\n", + " actions['recent_action2'] = np.log(1 + actions['user_action_30_2.0']-actions['user_action_3_2.0']) - np.log(1 + actions['user_action_30_2.0'])\n", + " actions['recent_action3'] = np.log(1 + actions['user_action_30_3.0']-actions['user_action_3_3.0']) - np.log(1 + actions['user_action_30_3.0'])\n", + " actions['recent_action4'] = np.log(1 + actions['user_action_30_4.0']-actions['user_action_3_4.0']) - np.log(1 + actions['user_action_30_4.0'])\n", + " actions['recent_action5'] = np.log(1 + actions['user_action_30_5.0']-actions['user_action_3_5.0']) - np.log(1 + actions['user_action_30_5.0'])\n", + " actions['recent_action6'] = np.log(1 + actions['user_action_30_6.0']-actions['user_action_3_6.0']) - np.log(1 + actions['user_action_30_6.0'])\n", + " \n", + " return actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户对同类别下各种商品的行为\n", + "* 用户对各个类别的各项行为操作统计\n", + "* 用户对各个类别操作行为统计占对所有类别操作行为统计的比重" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "#增加了用户对不同类别的交互特征\n", + "def get_user_cate_feature(start_date, end_date, all_actions):\n", + " actions = get_actions(start_date, end_date, all_actions)\n", + " actions = actions[['user_id', 'cate', 'type']]\n", + " df = pd.get_dummies(actions['type'], prefix='type')\n", + " actions = pd.concat([actions[['user_id', 'cate']], df], axis=1)\n", + " actions = actions.groupby(['user_id', 'cate']).sum()\n", + " actions = actions.unstack()\n", + " actions.columns = actions.columns.swaplevel(0, 1)\n", + " actions.columns = actions.columns.droplevel()\n", + " actions.columns = [\n", + " 'cate_4_type1', 'cate_5_type1', 'cate_6_type1', 'cate_7_type1',\n", + " 'cate_8_type1', 'cate_9_type1', 'cate_10_type1', 'cate_11_type1',\n", + " 'cate_4_type2', 'cate_5_type2', 'cate_6_type2', 'cate_7_type2',\n", + " 'cate_8_type2', 'cate_9_type2', 'cate_10_type2', 'cate_11_type2',\n", + " 'cate_4_type3', 'cate_5_type3', 'cate_6_type3', 'cate_7_type3',\n", + " 'cate_8_type3', 'cate_9_type3', 'cate_10_type3', 'cate_11_type3',\n", + " 'cate_4_type4', 'cate_5_type4', 'cate_6_type4', 'cate_7_type4',\n", + " 'cate_8_type4', 'cate_9_type4', 'cate_10_type4', 'cate_11_type4',\n", + " 'cate_4_type5', 'cate_5_type5', 'cate_6_type5', 'cate_7_type5',\n", + " 'cate_8_type5', 'cate_9_type5', 'cate_10_type5', 'cate_11_type5',\n", + " 'cate_4_type6', 'cate_5_type6', 'cate_6_type6', 'cate_7_type6',\n", + " 'cate_8_type6', 'cate_9_type6', 'cate_10_type6', 'cate_11_type6'\n", + " ]\n", + " actions = actions.fillna(0)\n", + " actions['cate_action_sum'] = actions.sum(axis=1)\n", + " actions['cate8_percentage'] = (\n", + " actions['cate_8_type1'] + actions['cate_8_type2'] +\n", + " actions['cate_8_type3'] + actions['cate_8_type4'] +\n", + " actions['cate_8_type5'] + 
actions['cate_8_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate4_percentage'] = (\n", + " actions['cate_4_type1'] + actions['cate_4_type2'] +\n", + " actions['cate_4_type3'] + actions['cate_4_type4'] +\n", + " actions['cate_4_type5'] + actions['cate_4_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate5_percentage'] = (\n", + " actions['cate_5_type1'] + actions['cate_5_type2'] +\n", + " actions['cate_5_type3'] + actions['cate_5_type4'] +\n", + " actions['cate_5_type5'] + actions['cate_5_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate6_percentage'] = (\n", + " actions['cate_6_type1'] + actions['cate_6_type2'] +\n", + " actions['cate_6_type3'] + actions['cate_6_type4'] +\n", + " actions['cate_6_type5'] + actions['cate_6_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate7_percentage'] = (\n", + " actions['cate_7_type1'] + actions['cate_7_type2'] +\n", + " actions['cate_7_type3'] + actions['cate_7_type4'] +\n", + " actions['cate_7_type5'] + actions['cate_7_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate9_percentage'] = (\n", + " actions['cate_9_type1'] + actions['cate_9_type2'] +\n", + " actions['cate_9_type3'] + actions['cate_9_type4'] +\n", + " actions['cate_9_type5'] + actions['cate_9_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate10_percentage'] = (\n", + " actions['cate_10_type1'] + actions['cate_10_type2'] +\n", + " actions['cate_10_type3'] + actions['cate_10_type4'] +\n", + " actions['cate_10_type5'] + actions['cate_10_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate11_percentage'] = (\n", + " actions['cate_11_type1'] + actions['cate_11_type2'] +\n", + " actions['cate_11_type3'] + actions['cate_11_type4'] +\n", + " actions['cate_11_type5'] + actions['cate_11_type6']\n", + " ) / actions['cate_action_sum']\n", + "\n", + " actions['cate8_type1_percentage'] = np.log(\n", + " 1 + actions['cate_8_type1']) - np.log(\n", + " 1 + actions['cate_8_type1'] + actions['cate_4_type1'] +\n", + " actions['cate_5_type1'] + actions['cate_6_type1'] +\n", + " actions['cate_7_type1'] + actions['cate_9_type1'] +\n", + " actions['cate_10_type1'] + actions['cate_11_type1'])\n", + "\n", + " actions['cate8_type2_percentage'] = np.log(\n", + " 1 + actions['cate_8_type2']) - np.log(\n", + " 1 + actions['cate_8_type2'] + actions['cate_4_type2'] +\n", + " actions['cate_5_type2'] + actions['cate_6_type2'] +\n", + " actions['cate_7_type2'] + actions['cate_9_type2'] +\n", + " actions['cate_10_type2'] + actions['cate_11_type2'])\n", + " actions['cate8_type3_percentage'] = np.log(\n", + " 1 + actions['cate_8_type3']) - np.log(\n", + " 1 + actions['cate_8_type3'] + actions['cate_4_type3'] +\n", + " actions['cate_5_type3'] + actions['cate_6_type3'] +\n", + " actions['cate_7_type3'] + actions['cate_9_type3'] +\n", + " actions['cate_10_type3'] + actions['cate_11_type3'])\n", + " actions['cate8_type4_percentage'] = np.log(\n", + " 1 + actions['cate_8_type4']) - np.log(\n", + " 1 + actions['cate_8_type4'] + actions['cate_4_type4'] +\n", + " actions['cate_5_type4'] + actions['cate_6_type4'] +\n", + " actions['cate_7_type4'] + actions['cate_9_type4'] +\n", + " actions['cate_10_type4'] + actions['cate_11_type4'])\n", + " actions['cate8_type5_percentage'] = np.log(\n", + " 1 + actions['cate_8_type5']) - np.log(\n", + " 1 + actions['cate_8_type5'] + actions['cate_4_type5'] +\n", + " actions['cate_5_type5'] + actions['cate_6_type5'] +\n", + " actions['cate_7_type5'] + actions['cate_9_type5'] +\n", 
+ " actions['cate_10_type5'] + actions['cate_11_type5'])\n", + " actions['cate8_type6_percentage'] = np.log(\n", + " 1 + actions['cate_8_type6']) - np.log(\n", + " 1 + actions['cate_8_type6'] + actions['cate_4_type6'] +\n", + " actions['cate_5_type6'] + actions['cate_6_type6'] +\n", + " actions['cate_7_type6'] + actions['cate_9_type6'] +\n", + " actions['cate_10_type6'] + actions['cate_11_type6'])\n", + " actions['user_id'] = actions.index\n", + " actions = actions[[\n", + " 'user_id', 'cate8_percentage', 'cate4_percentage', 'cate5_percentage',\n", + " 'cate6_percentage', 'cate7_percentage', 'cate9_percentage',\n", + " 'cate10_percentage', 'cate11_percentage', 'cate8_type1_percentage',\n", + " 'cate8_type2_percentage', 'cate8_type3_percentage',\n", + " 'cate8_type4_percentage', 'cate8_type5_percentage',\n", + " 'cate8_type6_percentage'\n", + " ]]\n", + " return actions" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2016-02-01\n", + "2016-02-04\n" + ] + } + ], + "source": [ + "train_start_date = '2016-02-01'\n", + "train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)\n", + "train_end_date = train_end_date.strftime('%Y-%m-%d')\n", + "day = 3\n", + "\n", + "start_date = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=day)\n", + "start_date = start_date.strftime('%Y-%m-%d')\n", + "\n", + "print (start_date)\n", + "print (train_end_date)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "all_actions = get_all_action()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idcatetype
29272629.010.01.0
30272629.010.01.0
31272629.010.06.0
32272629.010.01.0
33272629.010.06.0
\n", + "
" + ], + "text/plain": [ + " user_id cate type\n", + "29 272629.0 10.0 1.0\n", + "30 272629.0 10.0 1.0\n", + "31 272629.0 10.0 6.0\n", + "32 272629.0 10.0 1.0\n", + "33 272629.0 10.0 6.0" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions = get_actions(start_date, train_end_date, all_actions)\n", + "actions = actions[['user_id', 'cate', 'type']]\n", + "actions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
type_1.0type_2.0type_3.0type_4.0type_5.0type_6.0
user_idcate
200002.04.016.00.00.00.00.020.0
5.04.00.00.00.00.06.0
7.04.00.00.00.00.03.0
8.04.00.00.00.00.012.0
200003.04.08.00.00.00.00.012.0
\n", + "
" + ], + "text/plain": [ + " type_1.0 type_2.0 type_3.0 type_4.0 type_5.0 type_6.0\n", + "user_id cate \n", + "200002.0 4.0 16.0 0.0 0.0 0.0 0.0 20.0\n", + " 5.0 4.0 0.0 0.0 0.0 0.0 6.0\n", + " 7.0 4.0 0.0 0.0 0.0 0.0 3.0\n", + " 8.0 4.0 0.0 0.0 0.0 0.0 12.0\n", + "200003.0 4.0 8.0 0.0 0.0 0.0 0.0 12.0" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.get_dummies(actions['type'], prefix='type')\n", + "actions = pd.concat([actions[['user_id', 'cate']], df], axis=1)\n", + "actions = actions.groupby(['user_id', 'cate']).sum()\n", + "actions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
type_1.0type_2.0...type_5.0type_6.0
cate4.05.06.07.08.09.010.011.04.05.0...10.011.04.05.06.07.08.09.010.011.0
user_id
200002.016.04.0NaN4.04.0NaNNaNNaN0.00.0...NaNNaN20.06.0NaN3.012.0NaNNaNNaN
200003.08.0NaNNaNNaN12.0NaNNaNNaN0.0NaN...NaNNaN12.0NaNNaNNaN19.0NaNNaNNaN
200008.0NaNNaNNaN8.0NaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaN20.0NaNNaNNaNNaN
200023.0NaNNaNNaNNaN1.0NaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaN0.0NaNNaNNaN
200030.08.0NaNNaNNaNNaNNaNNaNNaN0.0NaN...NaNNaN17.0NaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 48 columns

\n", + "
" + ], + "text/plain": [ + " type_1.0 type_2.0 ... \\\n", + "cate 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 4.0 5.0 ... \n", + "user_id ... \n", + "200002.0 16.0 4.0 NaN 4.0 4.0 NaN NaN NaN 0.0 0.0 ... \n", + "200003.0 8.0 NaN NaN NaN 12.0 NaN NaN NaN 0.0 NaN ... \n", + "200008.0 NaN NaN NaN 8.0 NaN NaN NaN NaN NaN NaN ... \n", + "200023.0 NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN ... \n", + "200030.0 8.0 NaN NaN NaN NaN NaN NaN NaN 0.0 NaN ... \n", + "\n", + " type_5.0 type_6.0 \n", + "cate 10.0 11.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 \n", + "user_id \n", + "200002.0 NaN NaN 20.0 6.0 NaN 3.0 12.0 NaN NaN NaN \n", + "200003.0 NaN NaN 12.0 NaN NaN NaN 19.0 NaN NaN NaN \n", + "200008.0 NaN NaN NaN NaN NaN 20.0 NaN NaN NaN NaN \n", + "200023.0 NaN NaN NaN NaN NaN NaN 0.0 NaN NaN NaN \n", + "200030.0 NaN NaN 17.0 NaN NaN NaN NaN NaN NaN NaN \n", + "\n", + "[5 rows x 48 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions = actions.unstack()\n", + "actions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MultiIndex(levels=[['type_1.0', 'type_2.0', 'type_3.0', 'type_4.0', 'type_5.0', 'type_6.0'], [4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]],\n", + " codes=[[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5], [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]],\n", + " names=[None, 'cate'])" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MultiIndex(levels=[[4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0], ['type_1.0', 'type_2.0', 'type_3.0', 'type_4.0', 'type_5.0', 'type_6.0']],\n", + " codes=[[0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7], [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5]],\n", + " names=['cate', None])" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions.columns = actions.columns.swaplevel(0, 1)\n", + "actions.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['type_1.0', 'type_1.0', 'type_1.0', 'type_1.0', 'type_1.0', 'type_1.0',\n", + " 'type_1.0', 'type_1.0', 'type_2.0', 'type_2.0', 'type_2.0', 'type_2.0',\n", + " 'type_2.0', 'type_2.0', 'type_2.0', 'type_2.0', 'type_3.0', 'type_3.0',\n", + " 'type_3.0', 'type_3.0', 'type_3.0', 'type_3.0', 'type_3.0', 'type_3.0',\n", + " 'type_4.0', 'type_4.0', 'type_4.0', 'type_4.0', 'type_4.0', 'type_4.0',\n", + " 'type_4.0', 'type_4.0', 'type_5.0', 'type_5.0', 'type_5.0', 'type_5.0',\n", + " 'type_5.0', 'type_5.0', 'type_5.0', 'type_5.0', 'type_6.0', 'type_6.0',\n", + " 'type_6.0', 'type_6.0', 'type_6.0', 'type_6.0', 'type_6.0', 'type_6.0'],\n", + " dtype='object')" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions.columns = actions.columns.droplevel()\n", + "actions.columns" + ] + 
}, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['cate_4_type1', 'cate_5_type1', 'cate_6_type1', 'cate_7_type1',\n", + " 'cate_8_type1', 'cate_9_type1', 'cate_10_type1', 'cate_11_type1',\n", + " 'cate_4_type2', 'cate_5_type2', 'cate_6_type2', 'cate_7_type2',\n", + " 'cate_8_type2', 'cate_9_type2', 'cate_10_type2', 'cate_11_type2',\n", + " 'cate_4_type3', 'cate_5_type3', 'cate_6_type3', 'cate_7_type3',\n", + " 'cate_8_type3', 'cate_9_type3', 'cate_10_type3', 'cate_11_type3',\n", + " 'cate_4_type4', 'cate_5_type4', 'cate_6_type4', 'cate_7_type4',\n", + " 'cate_8_type4', 'cate_9_type4', 'cate_10_type4', 'cate_11_type4',\n", + " 'cate_4_type5', 'cate_5_type5', 'cate_6_type5', 'cate_7_type5',\n", + " 'cate_8_type5', 'cate_9_type5', 'cate_10_type5', 'cate_11_type5',\n", + " 'cate_4_type6', 'cate_5_type6', 'cate_6_type6', 'cate_7_type6',\n", + " 'cate_8_type6', 'cate_9_type6', 'cate_10_type6', 'cate_11_type6'],\n", + " dtype='object')" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions.columns = [\n", + " 'cate_4_type1', 'cate_5_type1', 'cate_6_type1', 'cate_7_type1',\n", + " 'cate_8_type1', 'cate_9_type1', 'cate_10_type1', 'cate_11_type1',\n", + " 'cate_4_type2', 'cate_5_type2', 'cate_6_type2', 'cate_7_type2',\n", + " 'cate_8_type2', 'cate_9_type2', 'cate_10_type2', 'cate_11_type2',\n", + " 'cate_4_type3', 'cate_5_type3', 'cate_6_type3', 'cate_7_type3',\n", + " 'cate_8_type3', 'cate_9_type3', 'cate_10_type3', 'cate_11_type3',\n", + " 'cate_4_type4', 'cate_5_type4', 'cate_6_type4', 'cate_7_type4',\n", + " 'cate_8_type4', 'cate_9_type4', 'cate_10_type4', 'cate_11_type4',\n", + " 'cate_4_type5', 'cate_5_type5', 'cate_6_type5', 'cate_7_type5',\n", + " 'cate_8_type5', 'cate_9_type5', 'cate_10_type5', 'cate_11_type5',\n", + " 'cate_4_type6', 'cate_5_type6', 'cate_6_type6', 'cate_7_type6',\n", + " 'cate_8_type6', 'cate_9_type6', 'cate_10_type6', 'cate_11_type6'\n", + " ]\n", + "actions.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cate_4_type1cate_5_type1cate_6_type1cate_7_type1cate_8_type1cate_9_type1cate_10_type1cate_11_type1cate_4_type2cate_5_type2...cate_11_type5cate_4_type6cate_5_type6cate_6_type6cate_7_type6cate_8_type6cate_9_type6cate_10_type6cate_11_type6cate_action_sum
user_id
200002.016.04.00.04.04.00.00.00.00.00.0...0.020.06.00.03.012.00.00.00.069.0
200003.08.00.00.00.012.00.00.00.00.00.0...0.012.00.00.00.019.00.00.00.051.0
200008.00.00.00.08.00.00.00.00.00.00.0...0.00.00.00.020.00.00.00.00.028.0
200023.00.00.00.00.01.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.01.0
200030.08.00.00.00.00.00.00.00.00.00.0...0.017.00.00.00.00.00.00.00.025.0
\n", + "

5 rows × 49 columns

\n", + "
" + ], + "text/plain": [ + " cate_4_type1 cate_5_type1 cate_6_type1 cate_7_type1 \\\n", + "user_id \n", + "200002.0 16.0 4.0 0.0 4.0 \n", + "200003.0 8.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 8.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 8.0 0.0 0.0 0.0 \n", + "\n", + " cate_8_type1 cate_9_type1 cate_10_type1 cate_11_type1 \\\n", + "user_id \n", + "200002.0 4.0 0.0 0.0 0.0 \n", + "200003.0 12.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 0.0 \n", + "200023.0 1.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_4_type2 cate_5_type2 ... cate_11_type5 cate_4_type6 \\\n", + "user_id ... \n", + "200002.0 0.0 0.0 ... 0.0 20.0 \n", + "200003.0 0.0 0.0 ... 0.0 12.0 \n", + "200008.0 0.0 0.0 ... 0.0 0.0 \n", + "200023.0 0.0 0.0 ... 0.0 0.0 \n", + "200030.0 0.0 0.0 ... 0.0 17.0 \n", + "\n", + " cate_5_type6 cate_6_type6 cate_7_type6 cate_8_type6 \\\n", + "user_id \n", + "200002.0 6.0 0.0 3.0 12.0 \n", + "200003.0 0.0 0.0 0.0 19.0 \n", + "200008.0 0.0 0.0 20.0 0.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_9_type6 cate_10_type6 cate_11_type6 cate_action_sum \n", + "user_id \n", + "200002.0 0.0 0.0 0.0 69.0 \n", + "200003.0 0.0 0.0 0.0 51.0 \n", + "200008.0 0.0 0.0 0.0 28.0 \n", + "200023.0 0.0 0.0 0.0 1.0 \n", + "200030.0 0.0 0.0 0.0 25.0 \n", + "\n", + "[5 rows x 49 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions = actions.fillna(0)\n", + "actions['cate_action_sum'] = actions.sum(axis=1)\n", + "actions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cate_4_type1cate_5_type1cate_6_type1cate_7_type1cate_8_type1cate_9_type1cate_10_type1cate_11_type1cate_4_type2cate_5_type2...cate_4_type6cate_5_type6cate_6_type6cate_7_type6cate_8_type6cate_9_type6cate_10_type6cate_11_type6cate_action_sumcate8_percentage
user_id
200002.016.04.00.04.04.00.00.00.00.00.0...20.06.00.03.012.00.00.00.069.00.231884
200003.08.00.00.00.012.00.00.00.00.00.0...12.00.00.00.019.00.00.00.051.00.607843
200008.00.00.00.08.00.00.00.00.00.00.0...0.00.00.020.00.00.00.00.028.00.000000
200023.00.00.00.00.01.00.00.00.00.00.0...0.00.00.00.00.00.00.00.01.01.000000
200030.08.00.00.00.00.00.00.00.00.00.0...17.00.00.00.00.00.00.00.025.00.000000
\n", + "

5 rows × 50 columns

\n", + "
" + ], + "text/plain": [ + " cate_4_type1 cate_5_type1 cate_6_type1 cate_7_type1 \\\n", + "user_id \n", + "200002.0 16.0 4.0 0.0 4.0 \n", + "200003.0 8.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 8.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 8.0 0.0 0.0 0.0 \n", + "\n", + " cate_8_type1 cate_9_type1 cate_10_type1 cate_11_type1 \\\n", + "user_id \n", + "200002.0 4.0 0.0 0.0 0.0 \n", + "200003.0 12.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 0.0 \n", + "200023.0 1.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_4_type2 cate_5_type2 ... cate_4_type6 cate_5_type6 \\\n", + "user_id ... \n", + "200002.0 0.0 0.0 ... 20.0 6.0 \n", + "200003.0 0.0 0.0 ... 12.0 0.0 \n", + "200008.0 0.0 0.0 ... 0.0 0.0 \n", + "200023.0 0.0 0.0 ... 0.0 0.0 \n", + "200030.0 0.0 0.0 ... 17.0 0.0 \n", + "\n", + " cate_6_type6 cate_7_type6 cate_8_type6 cate_9_type6 \\\n", + "user_id \n", + "200002.0 0.0 3.0 12.0 0.0 \n", + "200003.0 0.0 0.0 19.0 0.0 \n", + "200008.0 0.0 20.0 0.0 0.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_10_type6 cate_11_type6 cate_action_sum cate8_percentage \n", + "user_id \n", + "200002.0 0.0 0.0 69.0 0.231884 \n", + "200003.0 0.0 0.0 51.0 0.607843 \n", + "200008.0 0.0 0.0 28.0 0.000000 \n", + "200023.0 0.0 0.0 1.0 1.000000 \n", + "200030.0 0.0 0.0 25.0 0.000000 \n", + "\n", + "[5 rows x 50 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions['cate8_percentage'] = (\n", + " actions['cate_8_type1'] + actions['cate_8_type2'] +\n", + " actions['cate_8_type3'] + actions['cate_8_type4'] +\n", + " actions['cate_8_type5'] + actions['cate_8_type6']\n", + " ) / actions['cate_action_sum']\n", + "actions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cate_4_type1cate_5_type1cate_6_type1cate_7_type1cate_8_type1cate_9_type1cate_10_type1cate_11_type1cate_4_type2cate_5_type2...cate_5_type6cate_6_type6cate_7_type6cate_8_type6cate_9_type6cate_10_type6cate_11_type6cate_action_sumcate8_percentagecate8_type1_percentage
user_id
200002.016.04.00.04.04.00.00.00.00.00.0...6.00.03.012.00.00.00.069.00.231884-1.757858
200003.08.00.00.00.012.00.00.00.00.00.0...0.00.00.019.00.00.00.051.00.607843-0.479573
200008.00.00.00.08.00.00.00.00.00.00.0...0.00.020.00.00.00.00.028.00.000000-2.197225
200023.00.00.00.00.01.00.00.00.00.00.0...0.00.00.00.00.00.00.01.01.0000000.000000
200030.08.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.025.00.000000-2.197225
\n", + "

5 rows × 51 columns

\n", + "
" + ], + "text/plain": [ + " cate_4_type1 cate_5_type1 cate_6_type1 cate_7_type1 \\\n", + "user_id \n", + "200002.0 16.0 4.0 0.0 4.0 \n", + "200003.0 8.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 8.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 8.0 0.0 0.0 0.0 \n", + "\n", + " cate_8_type1 cate_9_type1 cate_10_type1 cate_11_type1 \\\n", + "user_id \n", + "200002.0 4.0 0.0 0.0 0.0 \n", + "200003.0 12.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 0.0 \n", + "200023.0 1.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_4_type2 cate_5_type2 ... cate_5_type6 cate_6_type6 \\\n", + "user_id ... \n", + "200002.0 0.0 0.0 ... 6.0 0.0 \n", + "200003.0 0.0 0.0 ... 0.0 0.0 \n", + "200008.0 0.0 0.0 ... 0.0 0.0 \n", + "200023.0 0.0 0.0 ... 0.0 0.0 \n", + "200030.0 0.0 0.0 ... 0.0 0.0 \n", + "\n", + " cate_7_type6 cate_8_type6 cate_9_type6 cate_10_type6 \\\n", + "user_id \n", + "200002.0 3.0 12.0 0.0 0.0 \n", + "200003.0 0.0 19.0 0.0 0.0 \n", + "200008.0 20.0 0.0 0.0 0.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_11_type6 cate_action_sum cate8_percentage \\\n", + "user_id \n", + "200002.0 0.0 69.0 0.231884 \n", + "200003.0 0.0 51.0 0.607843 \n", + "200008.0 0.0 28.0 0.000000 \n", + "200023.0 0.0 1.0 1.000000 \n", + "200030.0 0.0 25.0 0.000000 \n", + "\n", + " cate8_type1_percentage \n", + "user_id \n", + "200002.0 -1.757858 \n", + "200003.0 -0.479573 \n", + "200008.0 -2.197225 \n", + "200023.0 0.000000 \n", + "200030.0 -2.197225 \n", + "\n", + "[5 rows x 51 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions['cate8_type1_percentage'] = np.log(\n", + " 1 + actions['cate_8_type1']) - np.log(\n", + " 1 + actions['cate_8_type1'] + actions['cate_4_type1'] +\n", + " actions['cate_5_type1'] + actions['cate_6_type1'] +\n", + " actions['cate_7_type1'] + actions['cate_9_type1'] +\n", + " actions['cate_10_type1'] + actions['cate_11_type1'])\n", + "actions.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 商品-行为\n", + "#### 累积商品特征\n", + "* 分时间段\n", + "* 针对商品的不同行为的\n", + " * 购买转化率\n", + " * 均值\n", + " * 标准差" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "def get_accumulate_product_feat(start_date, end_date, all_actions):\n", + " feature = [\n", + " 'sku_id', 'product_action_1', 'product_action_2',\n", + " 'product_action_3', 'product_action_4',\n", + " 'product_action_5', 'product_action_6',\n", + " 'product_action_1_ratio', 'product_action_2_ratio',\n", + " 'product_action_3_ratio', 'product_action_5_ratio',\n", + " 'product_action_6_ratio', 'product_action_1_mean',\n", + " 'product_action_2_mean', 'product_action_3_mean',\n", + " 'product_action_4_mean', 'product_action_5_mean',\n", + " 'product_action_6_mean', 'product_action_1_std',\n", + " 'product_action_2_std', 'product_action_3_std', 'product_action_4_std',\n", + " 'product_action_5_std', 'product_action_6_std'\n", + " ]\n", + "\n", + " actions = get_actions(start_date, end_date, all_actions)\n", + " df = pd.get_dummies(actions['type'], prefix='product_action')\n", + " # 按照商品-日期分组,计算某个时间段该商品的各项行为的标准差\n", + " actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())\n", + " actions = pd.concat([actions[['sku_id', 'date']], df], axis=1)\n", + " actions = actions.groupby(['sku_id'], as_index=False).sum()\n", + " days_interal = (datetime.strptime(end_date, '%Y-%m-%d') - datetime.strptime(start_date, 
 + " days_interval = (datetime.strptime(end_date, '%Y-%m-%d') - datetime.strptime(start_date, '%Y-%m-%d')).days\n", + " \n", + " # group by product and compute the purchase conversion rates\n", + " actions['product_action_1_ratio'] = np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_1.0'])\n", + " actions['product_action_2_ratio'] = np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_2.0'])\n", + " actions['product_action_3_ratio'] = np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_3.0'])\n", + " actions['product_action_5_ratio'] = np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_5.0'])\n", + " actions['product_action_6_ratio'] = np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_6.0'])\n", + " \n", + " # compute the mean of each action type\n", + " actions['product_action_1_mean'] = actions[\n", + " 'product_action_1.0'] / days_interval\n", + " actions['product_action_2_mean'] = actions[\n", + " 'product_action_2.0'] / days_interval\n", + " actions['product_action_3_mean'] = actions[\n", + " 'product_action_3.0'] / days_interval\n", + " actions['product_action_4_mean'] = actions[\n", + " 'product_action_4.0'] / days_interval\n", + " actions['product_action_5_mean'] = actions[\n", + " 'product_action_5.0'] / days_interval\n", + " actions['product_action_6_mean'] = actions[\n", + " 'product_action_6.0'] / days_interval\n", + " return actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Category Features\n", + "#### For each product category, per time window\n", + "* purchase conversion rate\n", + " * standard deviation\n", + " * mean" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "def get_accumulate_cate_feat(start_date, end_date, all_actions):\n", + " feature = ['cate','cate_action_1', 'cate_action_2', 'cate_action_3', 'cate_action_4', 'cate_action_5', \n", + " 'cate_action_6', 'cate_action_1_ratio', 'cate_action_2_ratio', \n", + " 'cate_action_3_ratio', 'cate_action_5_ratio', 'cate_action_6_ratio', 'cate_action_1_mean',\n", + " 'cate_action_2_mean', 'cate_action_3_mean', 'cate_action_4_mean', 'cate_action_5_mean',\n", + " 'cate_action_6_mean', 'cate_action_1_std', 'cate_action_2_std', 'cate_action_3_std',\n", + " 'cate_action_4_std', 'cate_action_5_std', 'cate_action_6_std']\n", + " actions = get_actions(start_date, end_date, all_actions)\n", + " actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())\n", + " df = pd.get_dummies(actions['type'], prefix='cate_action')\n", + " actions = pd.concat([actions[['cate','date']], df], axis=1)\n", + " \n", + " # group by category and compute each category's action conversion rates\n", + " actions = actions.groupby(['cate'], as_index=False).sum()\n", + " days_interval = (datetime.strptime(end_date, '%Y-%m-%d')-datetime.strptime(start_date, '%Y-%m-%d')).days\n", + " \n", + " actions['cate_action_1_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_1.0']))\n", + " actions['cate_action_2_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_2.0']))\n", + " actions['cate_action_3_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_3.0']))\n", + " actions['cate_action_5_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_5.0']))\n", + " actions['cate_action_6_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_6.0']))\n", + " \n", + " # group by category and compute the per-day mean of each action type over the window\n", + " actions['cate_action_1_mean'] = actions['cate_action_1.0'] / days_interval\n", + " actions['cate_action_2_mean'] = actions['cate_action_2.0'] / days_interval\n", + " actions['cate_action_3_mean'] = actions['cate_action_3.0'] / days_interval\n", + " actions['cate_action_4_mean'] = actions['cate_action_4.0'] / days_interval\n", + " actions['cate_action_5_mean'] = actions['cate_action_5.0'] / days_interval\n", + " actions['cate_action_6_mean'] = actions['cate_action_6.0'] / days_interval\n", + " return actions" + ] + },
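 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A minimal usage sketch (illustrative only, assuming `all_actions` has been loaded with `get_all_action()`): build the accumulated product and category features over a hypothetical one-month window." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# hypothetical window; any (start, end) pair inside the data range works the same way\n", + "prod_feat = get_accumulate_product_feat('2016-02-01', '2016-03-01', all_actions)\n", + "cate_feat = get_accumulate_cate_feat('2016-02-01', '2016-03-01', all_actions)\n", + "print(prod_feat.shape, cate_feat.shape)" + ] + },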
+    "    actions['cate_action_3_mean'] = actions['cate_action_3.0'] / days_interval\n",
+    "    actions['cate_action_4_mean'] = actions['cate_action_4.0'] / days_interval\n",
+    "    actions['cate_action_5_mean'] = actions['cate_action_5.0'] / days_interval\n",
+    "    actions['cate_action_6_mean'] = actions['cate_action_6.0'] / days_interval\n",
+    "    return actions"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Building the training/test sets\n",
+    "### Building the training/validation sets\n",
+    "Labels are generated with a sliding window: within each window, (user, sku) pairs that produce a purchase in the label window are marked 1.\n",
+    "All of the features above are then merged together; a small sketch of the window arithmetic appears a couple of cells below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_labels(start_date, end_date, all_actions):\n",
+    "    actions = get_actions(start_date, end_date, all_actions)\n",
+    "#     actions = actions[actions['type'] == 4]\n",
+    "    # restrict to purchases (type 4) of cate 8, matching the prediction target\n",
+    "    actions = actions[(actions['type'] == 4) & (actions['cate']==8)]\n",
+    "    \n",
+    "    actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum()\n",
+    "    actions['label'] = 1\n",
+    "    actions = actions[['user_id', 'sku_id', 'label']]\n",
+    "    return actions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "get all actions!\n"
+     ]
+    }
+   ],
+   "source": [
+    "train_start_date = '2016-03-01'\n",
+    "train_actions = None\n",
+    "all_actions = get_all_action()\n",
+    "print (\"get all actions!\")"
+   ]
+  },
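+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal, standalone sketch of the sliding-window date arithmetic used below (illustrative only; the real run slides 20 windows): each window uses 3 days of behaviour for the short-term features and the following 5 days for labels."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch of the sliding-window arithmetic (illustrative, not part of the pipeline)\n",
+    "from datetime import datetime, timedelta\n",
+    "\n",
+    "start = datetime.strptime('2016-02-01', '%Y-%m-%d')\n",
+    "for i in range(3):  # the real run below slides 20 times, one day per round\n",
+    "    feat_end = start + timedelta(days=3)      # feature window: [start, feat_end)\n",
+    "    label_end = feat_end + timedelta(days=5)  # label window:   [feat_end, label_end)\n",
+    "    print(start.date(), '->', feat_end.date(), '| labels:', feat_end.date(), '->', label_end.date())\n",
+    "    start += timedelta(days=1)"
+   ]
+  },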
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idsku_idtimemodel_idtypecatebrand
0266079.0138778.02016-01-31 23:59:02NaN1.08.0403.0
1266079.0138778.02016-01-31 23:59:030.06.08.0403.0
2200719.061226.02016-01-31 23:59:07NaN1.08.030.0
3200719.061226.02016-01-31 23:59:080.06.08.030.0
4263587.072348.02016-01-31 23:59:08NaN1.05.0159.0
\n", + "
" + ], + "text/plain": [ + " user_id sku_id time model_id type cate brand\n", + "0 266079.0 138778.0 2016-01-31 23:59:02 NaN 1.0 8.0 403.0\n", + "1 266079.0 138778.0 2016-01-31 23:59:03 0.0 6.0 8.0 403.0\n", + "2 200719.0 61226.0 2016-01-31 23:59:07 NaN 1.0 8.0 30.0\n", + "3 200719.0 61226.0 2016-01-31 23:59:08 0.0 6.0 8.0 30.0\n", + "4 263587.0 72348.0 2016-01-31 23:59:08 NaN 1.0 5.0 159.0" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_actions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 50601736 entries, 0 to 13199933\n", + "Data columns (total 7 columns):\n", + "user_id float32\n", + "sku_id float32\n", + "time object\n", + "model_id float32\n", + "type float32\n", + "cate float32\n", + "brand float32\n", + "dtypes: float32(6), object(1)\n", + "memory usage: 1.9+ GB\n" + ] + } + ], + "source": [ + "all_actions.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(50601736, 7)" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_actions.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "get_basic_user_feat finsihed\n" + ] + } + ], + "source": [ + "user = get_basic_user_feat()\n", + "print ('get_basic_user_feat finsihed')" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idage_0age_1age_2age_3age_4age_5age_6sex_0sex_1sex_2user_lv_cd_1user_lv_cd_2user_lv_cd_3user_lv_cd_4user_lv_cd_5
0200001.00.00.00.00.00.00.01.00.00.01.00.00.00.00.01.0
1200002.01.00.00.00.00.00.00.01.00.00.01.00.00.00.00.0
2200003.00.00.00.00.01.00.00.00.01.00.00.00.00.01.00.0
3200004.01.00.00.00.00.00.00.00.00.01.01.00.00.00.00.0
4200005.00.00.01.00.00.00.00.01.00.00.00.00.00.01.00.0
\n", + "
" + ], + "text/plain": [ + " user_id age_0 age_1 age_2 age_3 age_4 age_5 age_6 sex_0 sex_1 \\\n", + "0 200001.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 \n", + "1 200002.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 \n", + "2 200003.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n", + "3 200004.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "4 200005.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 \n", + "\n", + " sex_2 user_lv_cd_1 user_lv_cd_2 user_lv_cd_3 user_lv_cd_4 user_lv_cd_5 \n", + "0 1.0 0.0 0.0 0.0 0.0 1.0 \n", + "1 0.0 1.0 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 1.0 0.0 \n", + "3 1.0 1.0 0.0 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 1.0 0.0 " + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "get_basic_product_feat finsihed\n" + ] + } + ], + "source": [ + "product = get_basic_product_feat()\n", + "print ('get_basic_product_feat finsihed')" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sku_idcatebranda1_-1a1_1a1_2a1_3a2_-1a2_1a2_2a3_-1a3_1a3_2
01084890001010010
110000284890001001001
21000038300100100100
310000685450100001010
41000182441000010001
\n", + "
" + ], + "text/plain": [ + " sku_id cate brand a1_-1 a1_1 a1_2 a1_3 a2_-1 a2_1 a2_2 a3_-1 \\\n", + "0 10 8 489 0 0 0 1 0 1 0 0 \n", + "1 100002 8 489 0 0 0 1 0 0 1 0 \n", + "2 100003 8 30 0 1 0 0 1 0 0 1 \n", + "3 100006 8 545 0 1 0 0 0 0 1 0 \n", + "4 10001 8 244 1 0 0 0 0 1 0 0 \n", + "\n", + " a3_1 a3_2 \n", + "0 1 0 \n", + "1 0 1 \n", + "2 0 0 \n", + "3 1 0 \n", + "4 0 1 " + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "product.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "datetime.datetime(2016, 3, 4, 0, 0)" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_start_date = '2016-03-01'\n", + "train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)\n", + "train_end_date" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2016-03-04\n" + ] + } + ], + "source": [ + "train_end_date = train_end_date.strftime('%Y-%m-%d')\n", + "# 修正prod_acc,cate_acc的时间跨度\n", + "start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n", + "start_days = start_days.strftime('%Y-%m-%d')\n", + "print (train_end_date)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "def make_actions(user, product, all_actions, train_start_date):\n", + " train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)\n", + " train_end_date = train_end_date.strftime('%Y-%m-%d')\n", + " # 修正prod_acc,cate_acc的时间跨度\n", + " start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n", + " start_days = start_days.strftime('%Y-%m-%d')\n", + " print (train_end_date)\n", + " user_acc = get_recent_user_feat(train_end_date, all_actions)\n", + " print ('get_recent_user_feat finsihed')\n", + " \n", + " user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n", + " print ('get_user_cate_feature finished')\n", + " \n", + " product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n", + " print ('get_accumulate_product_feat finsihed')\n", + " cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n", + " print ('get_accumulate_cate_feat finsihed')\n", + " comment_acc = get_comments_product_feat(train_end_date)\n", + " print ('get_comments_product_feat finished')\n", + " # 标记\n", + " test_start_date = train_end_date\n", + " test_end_date = datetime.strptime(test_start_date, '%Y-%m-%d') + timedelta(days=5)\n", + " test_end_date = test_end_date.strftime('%Y-%m-%d')\n", + " labels = get_labels(test_start_date, test_end_date, all_actions)\n", + " print (\"get labels\")\n", + " \n", + " actions = None\n", + " for i in (3, 5, 7, 10, 15, 21, 30):\n", + " start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)\n", + " start_days = start_days.strftime('%Y-%m-%d')\n", + " if actions is None:\n", + " actions = get_action_feat(start_days, train_end_date, all_actions, i)\n", + " else:\n", + " # 注意这里的拼接key\n", + " actions = pd.merge(actions, get_action_feat(start_days, train_end_date, all_actions, i), how='left',\n", + " on=['user_id', 'sku_id', 'cate'])\n", + "\n", + " actions = pd.merge(actions, user, how='left', on='user_id')\n", + " actions = pd.merge(actions, user_acc, 
+    "    actions = pd.merge(actions, user_acc, how='left', on='user_id')\n",
+    "    # the original 'actions.append(user_cate)' was a no-op (its result was never\n",
+    "    # assigned); merge the user-category features instead. This assumes user_cate\n",
+    "    # exposes user_id as a column -- reset_index() it first if it is the index.\n",
+    "    actions = pd.merge(actions, user_cate, how='left', on='user_id')\n",
+    "    # note the merge keys here\n",
+    "    actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])\n",
+    "    actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n",
+    "    actions = pd.merge(actions, cate_acc, how='left', on='cate')\n",
+    "    actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n",
+    "    actions = pd.merge(actions, labels, how='left', on=['user_id', 'sku_id'])\n",
+    "    # fill the NaNs introduced by merging product features, comment features and labels\n",
+    "    actions = actions.fillna(0)\n",
+    "#     return actions\n",
+    "    # downsample: keep every positive and sample negatives at 10:1\n",
+    "    action_positive = actions[actions['label'] == 1]\n",
+    "    action_negative = actions[actions['label'] == 0]\n",
+    "    del actions\n",
+    "    neg_len = len(action_positive) * 10\n",
+    "    action_negative = action_negative.sample(n=neg_len)\n",
+    "    action_sample = pd.concat([action_positive, action_negative], ignore_index=True)\n",
+    "    \n",
+    "    return action_sample"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def make_train_set(train_start_date, setNums, f_path, all_actions):\n",
+    "    train_actions = None\n",
+    "    #all_actions = get_all_action()\n",
+    "    #print (\"get all actions!\")\n",
+    "    user = get_basic_user_feat()\n",
+    "    print ('get_basic_user_feat finished')\n",
+    "    product = get_basic_product_feat()\n",
+    "    print ('get_basic_product_feat finished')\n",
+    "    # slide the window to build several training/validation sets\n",
+    "    for i in range(setNums):\n",
+    "        print (train_start_date)\n",
+    "        if train_actions is None:\n",
+    "            train_actions = make_actions(user, product, all_actions, train_start_date)\n",
+    "        else:\n",
+    "            train_actions = pd.concat([train_actions, make_actions(user, product, all_actions, train_start_date)],\n",
+    "                                      ignore_index=True)\n",
+    "        # move the window forward one day each round\n",
+    "        train_start_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=1)\n",
+    "        train_start_date = train_start_date.strftime('%Y-%m-%d')\n",
+    "        print (\"round {0}/{1} over!\".format(i+1, setNums))\n",
+    "\n",
+    "    train_actions.to_csv(f_path, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "get_basic_user_feat finished\n",
+      "get_basic_product_feat finished\n",
+      "2016-02-01\n",
+      "2016-02-04\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 1/20 over!\n",
+      "2016-02-02\n",
+      "2016-02-05\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 2/20 over!\n",
+      "2016-02-03\n",
+      "2016-02-06\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 3/20 over!\n",
+      "2016-02-04\n",
+      "2016-02-07\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 4/20 over!\n",
+      "2016-02-05\n",
+      "2016-02-08\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 5/20 over!\n",
+      "2016-02-06\n",
+      "2016-02-09\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 6/20 over!\n",
+      "2016-02-07\n",
+      "2016-02-10\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 7/20 over!\n",
+      "2016-02-08\n",
+      "2016-02-11\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 8/20 over!\n",
+      "2016-02-09\n",
+      "2016-02-12\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 9/20 over!\n",
+      "2016-02-10\n",
+      "2016-02-13\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 10/20 over!\n",
+      "2016-02-11\n",
+      "2016-02-14\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 11/20 over!\n",
+      "2016-02-12\n",
+      "2016-02-15\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 12/20 over!\n",
+      "2016-02-13\n",
+      "2016-02-16\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 13/20 over!\n",
+      "2016-02-14\n",
+      "2016-02-17\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 14/20 over!\n",
+      "2016-02-15\n",
+      "2016-02-18\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 15/20 over!\n",
+      "2016-02-16\n",
+      "2016-02-19\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 16/20 over!\n",
+      "2016-02-17\n",
+      "2016-02-20\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 17/20 over!\n",
+      "2016-02-18\n",
+      "2016-02-21\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 18/20 over!\n",
+      "2016-02-19\n",
+      "2016-02-22\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 19/20 over!\n",
+      "2016-02-20\n",
+      "2016-02-23\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n",
+      "get labels\n",
+      "round 20/20 over!\n"
+     ]
+    }
+   ],
+   "source": [
+    "# training set\n",
+    "train_start_date = '2016-02-01'\n",
+    "make_train_set(train_start_date, 20, 'data/train_set.csv', all_actions)"
+   ]
+  },
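+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`make_actions` keeps every positive example and samples negatives at a 10:1 negative:positive ratio. An optional sanity check of the class balance of the file just written (assumes `data/train_set.csv` exists and carries the `label` column):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional check: verify the ~10:1 negative:positive ratio of the saved set\n",
+    "import pandas as pd\n",
+    "\n",
+    "train = pd.read_csv('data/train_set.csv')\n",
+    "print(train['label'].value_counts())\n",
+    "print('neg/pos ratio:', (train['label'] == 0).sum() / max((train['label'] == 1).sum(), 1))"
+   ]
+  },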
"get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 13/20 over!\n", + "2016-02-14\n", + "2016-02-17\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 14/20 over!\n", + "2016-02-15\n", + "2016-02-18\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 15/20 over!\n", + "2016-02-16\n", + "2016-02-19\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 16/20 over!\n", + "2016-02-17\n", + "2016-02-20\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 17/20 over!\n", + "2016-02-18\n", + "2016-02-21\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 18/20 over!\n", + "2016-02-19\n", + "2016-02-22\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 19/20 over!\n", + "2016-02-20\n", + "2016-02-23\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 20/20 over!\n" + ] + } + ], + "source": [ + "# 训练集\n", + "train_start_date = '2016-02-01'\n", + "make_train_set(train_start_date, 20, 'data/train_set.csv',all_actions)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 构造验证集(线下测试集)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "def make_val_answer(val_start_date, val_end_date, all_actions, label_val_s1_path):\n", + " actions = get_actions(val_start_date, val_end_date,all_actions)\n", + " actions = actions[(actions['type'] == 4) & (actions['cate'] == 8)]\n", + " actions = actions[['user_id', 'sku_id']]\n", + " actions = actions.drop_duplicates()\n", + " actions.to_csv(label_val_s1_path, index=False)\n", + "\n", + "def make_val_set(train_start_date, train_end_date, val_s1_path):\n", + " # 修改时间跨度\n", + " start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n", + " start_days = start_days.strftime('%Y-%m-%d')\n", + " all_actions = get_all_action()\n", + " print (\"get all actions!\")\n", + " user = get_basic_user_feat()\n", + " print ('get_basic_user_feat finsihed')\n", + " \n", + " product = get_basic_product_feat()\n", + " print ('get_basic_product_feat finsihed')\n", + " user_acc = get_recent_user_feat(train_end_date, all_actions)\n", + " print ('get_recent_user_feat finsihed')\n", + " user_cate = get_user_cate_feature(train_start_date, 
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "get all actions!\n",
+      "get_basic_user_feat finished\n",
+      "get_basic_product_feat finished\n",
+      "get_recent_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n",
+      "get_comments_product_feat finished\n"
+     ]
+    }
+   ],
+   "source": [
+    "# validation set\n",
+    "make_val_set('2016-02-23', '2016-02-26', 'data/val_set.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def make_test_set(train_start_date, train_end_date):\n",
+    "    start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n",
+    "    start_days = start_days.strftime('%Y-%m-%d')\n",
+    "    all_actions = get_all_action()\n",
+    "    print(\"get all actions!\")\n",
+    "    user = get_basic_user_feat()\n",
+    "    print('get_basic_user_feat finished')\n",
+    "    product = get_basic_product_feat()\n",
+    "    print('get_basic_product_feat finished')\n",
+    "    \n",
+    "    user_acc = get_recent_user_feat(train_end_date, all_actions)\n",
+    "    print('get_accumulate_user_feat finished')\n",
+    "    \n",
+    "    user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n",
+    "    print('get_user_cate_feature finished')\n",
+    "    \n",
+    "    product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n",
+    "    print('get_accumulate_product_feat finished')\n",
+    "    cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n",
+    "    print('get_accumulate_cate_feat finished')\n",
+    "    comment_acc = get_comments_product_feat(train_end_date)\n",
+    "\n",
+    "    actions = None\n",
+    "    for i in (3, 5, 7, 10, 15, 21, 30):\n",
+    "        start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)\n",
+    "        start_days = start_days.strftime('%Y-%m-%d')\n",
+    "        if actions is None:\n",
+    "            actions = get_action_feat(start_days, train_end_date, all_actions, i)\n",
+    "        else:\n",
+    "            actions = pd.merge(actions, get_action_feat(start_days, train_end_date, all_actions, i), how='left',\n",
+    "                               on=['user_id', 'sku_id', 'cate'])\n",
+    "\n",
+    "    actions = pd.merge(actions, user, how='left', on='user_id')\n",
+    "    actions = pd.merge(actions, user_acc, how='left', on='user_id')\n",
+    "    # as in make_actions, the original 'actions.append(user_cate)' was a no-op;\n",
+    "    # merge the user-category features instead (same user_id-column assumption)\n",
+    "    actions = pd.merge(actions, user_cate, how='left', on='user_id')\n",
+    "    # note the merge keys here\n",
+    "    actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])\n",
+    "    actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n",
+    "    actions = pd.merge(actions, cate_acc, how='left', on='cate')\n",
+    "    actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n",
+    "\n",
+    "    actions = actions.fillna(0)\n",
+    "\n",
+    "    actions.to_csv(\"data/test_set.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "get all actions!\n",
+      "get_basic_user_feat finished\n",
+      "get_basic_product_feat finished\n",
+      "get_accumulate_user_feat finished\n",
+      "get_user_cate_feature finished\n",
+      "get_accumulate_product_feat finished\n",
+      "get_accumulate_cate_feat finished\n"
+     ]
+    }
+   ],
+   "source": [
+    "# prediction (submission) set\n",
+    "sub_start_date = '2016-04-13'\n",
+    "sub_end_date = '2016-04-16'\n",
+    "make_test_set(sub_start_date, sub_end_date)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}