From 886ce60e40ad7a9474327f048fcd462b8595a404 Mon Sep 17 00:00:00 2001 From: benjas <909336740@qq.com> Date: Thu, 4 Feb 2021 15:35:59 +0800 Subject: [PATCH] Add. Feature engineering of Users and Commndities --- .../3-特征工程.ipynb | 2850 ++++++++++++++++- 1 file changed, 2823 insertions(+), 27 deletions(-) diff --git a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程.ipynb b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程.ipynb index 01cfe35..5b04afa 100644 --- a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程.ipynb +++ b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程.ipynb @@ -167,7 +167,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -273,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -303,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -317,7 +317,7 @@ "dtype: bool" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -329,7 +329,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -450,7 +450,7 @@ "67704 267705 NaN NaN 1 NaN" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -468,7 +468,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -482,7 +482,7 @@ "dtype: bool" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -503,7 +503,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -527,7 +527,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -557,7 +557,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": 
[], "source": [ @@ -572,7 +572,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -682,7 +682,7 @@ "4 0 1 0 " ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -729,7 +729,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -763,22 +763,2818 @@ " actions[before_date+'minus_mean_5'] = actions[before_date+'_5.0_x'] - (actions[before_date+'_5.0_x']/i)\n", " actions[before_date+'minus_mean_6'] = actions[before_date+'_6.0_x'] - (actions[before_date+'_6.0_x']/i)\n", " del actions['type']\n", - " # 保留cate特征\n", - "# del actions['cate']\n", - " \n", - " return actions\n", - " \n", " \n", + " return actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户-行为\n", + "#### 累积用户特征\n", + "* 分时间段\n", + "* 用户不同行为的\n", + " * 购买转化率\n", + " * 均值" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def get_accumulate_user_feat(end_date, all_actions, day):\n", + " start_date = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=day)\n", + " start_date = start_date.strftime('%Y-%m-%d')\n", + " before_date = 'user_action_%s' % day\n", + "\n", + " feature = [\n", + " 'user_id', before_date + '_1', before_date + '_2', before_date + '_3',\n", + " before_date + '_4', before_date + '_5', before_date + '_6',\n", + " before_date + '_1_ratio', before_date + '_2_ratio',\n", + " before_date + '_3_ratio', before_date + '_5_ratio',\n", + " before_date + '_6_ratio', before_date + '_1_mean',\n", + " before_date + '_2_mean', before_date + '_3_mean',\n", + " before_date + '_4_mean', before_date + '_5_mean',\n", + " before_date + '_6_mean', before_date + '_1_std',\n", + " before_date + '_2_std', before_date + '_3_std', before_date + '_4_std',\n", + " before_date + '_5_std', before_date + '_6_std'\n", + " ]\n", 
+ "\n", + " actions = get_actions(start_date, end_date, all_actions)\n", + " df = pd.get_dummies(actions['type'], prefix=before_date)\n", + "\n", + " actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())\n", + "\n", + " actions = pd.concat([actions[['user_id', 'date']], df], axis=1)\n", + " \n", + " # 分组统计,按用户分组,统计用户各项行为的转化率、均值\n", + " actions = actions.groupby(['user_id'], as_index=False).sum()\n", + "\n", + " actions[before_date + '_1_ratio'] = np.log(1 + actions[before_date + '_4.0']) - np.log(1 + actions[before_date +'_1.0'])\n", + " actions[before_date + '_2_ratio'] = np.log(1 + actions[before_date + '_4.0']) - np.log(1 + actions[before_date +'_2.0'])\n", + " actions[before_date + '_3_ratio'] = np.log(1 + actions[before_date + '_4.0']) - np.log(1 + actions[before_date +'_3.0'])\n", + " actions[before_date + '_5_ratio'] = np.log(1 + actions[before_date + '_4.0']) - np.log(1 + actions[before_date +'_5.0'])\n", + " actions[before_date + '_6_ratio'] = np.log(1 + actions[before_date + '_4.0']) - np.log(1 + actions[before_date +'_6.0'])\n", + " # 均值\n", + " actions[before_date + '_1_mean'] = actions[before_date + '_1.0'] / day\n", + " actions[before_date + '_2_mean'] = actions[before_date + '_2.0'] / day\n", + " actions[before_date + '_3_mean'] = actions[before_date + '_3.0'] / day\n", + " actions[before_date + '_4_mean'] = actions[before_date + '_4.0'] / day\n", + " actions[before_date + '_5_mean'] = actions[before_date + '_5.0'] / day\n", + " actions[before_date + '_6_mean'] = actions[before_date + '_6.0'] / day\n", + " #actions = pd.merge(actions, actions_date, how='left', on='user_id')\n", + " #actions = actions[feature]\n", + " return actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户近期行为特征\n", + "在上面针对用户进行累积特征提取的基础上,分别提取用户近一个月、近三天的特征,然后提取一个月内用户除去最近三天的行为占据一个月的行为的比重" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def 
get_recent_user_feat(end_date, all_actions):\n", + " actions_3 = get_accumulate_user_feat(end_date, all_actions, 3)\n", + " actions_30 = get_accumulate_user_feat(end_date, all_actions, 30)\n", + " actions = pd.merge(actions_3, actions_30, how ='left', on='user_id')\n", + " del actions_3\n", + " del actions_30\n", + " \n", + " actions['recent_action1'] = np.log(1 + actions['user_action_30_1.0']-actions['user_action_3_1.0']) - np.log(1 + actions['user_action_30_1.0'])\n", + " actions['recent_action2'] = np.log(1 + actions['user_action_30_2.0']-actions['user_action_3_2.0']) - np.log(1 + actions['user_action_30_2.0'])\n", + " actions['recent_action3'] = np.log(1 + actions['user_action_30_3.0']-actions['user_action_3_3.0']) - np.log(1 + actions['user_action_30_3.0'])\n", + " actions['recent_action4'] = np.log(1 + actions['user_action_30_4.0']-actions['user_action_3_4.0']) - np.log(1 + actions['user_action_30_4.0'])\n", + " actions['recent_action5'] = np.log(1 + actions['user_action_30_5.0']-actions['user_action_3_5.0']) - np.log(1 + actions['user_action_30_5.0'])\n", + " actions['recent_action6'] = np.log(1 + actions['user_action_30_6.0']-actions['user_action_3_6.0']) - np.log(1 + actions['user_action_30_6.0'])\n", + " \n", + " return actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户对同类别下各种商品的行为\n", + "* 用户对各个类别的各项行为操作统计\n", + "* 用户对各个类别操作行为统计占对所有类别操作行为统计的比重" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "#增加了用户对不同类别的交互特征\n", + "def get_user_cate_feature(start_date, end_date, all_actions):\n", + " actions = get_actions(start_date, end_date, all_actions)\n", + " actions = actions[['user_id', 'cate', 'type']]\n", + " df = pd.get_dummies(actions['type'], prefix='type')\n", + " actions = pd.concat([actions[['user_id', 'cate']], df], axis=1)\n", + " actions = actions.groupby(['user_id', 'cate']).sum()\n", + " actions = actions.unstack()\n", + " actions.columns = 
actions.columns.swaplevel(0, 1)\n", + " actions.columns = actions.columns.droplevel()\n", + " actions.columns = [\n", + " 'cate_4_type1', 'cate_5_type1', 'cate_6_type1', 'cate_7_type1',\n", + " 'cate_8_type1', 'cate_9_type1', 'cate_10_type1', 'cate_11_type1',\n", + " 'cate_4_type2', 'cate_5_type2', 'cate_6_type2', 'cate_7_type2',\n", + " 'cate_8_type2', 'cate_9_type2', 'cate_10_type2', 'cate_11_type2',\n", + " 'cate_4_type3', 'cate_5_type3', 'cate_6_type3', 'cate_7_type3',\n", + " 'cate_8_type3', 'cate_9_type3', 'cate_10_type3', 'cate_11_type3',\n", + " 'cate_4_type4', 'cate_5_type4', 'cate_6_type4', 'cate_7_type4',\n", + " 'cate_8_type4', 'cate_9_type4', 'cate_10_type4', 'cate_11_type4',\n", + " 'cate_4_type5', 'cate_5_type5', 'cate_6_type5', 'cate_7_type5',\n", + " 'cate_8_type5', 'cate_9_type5', 'cate_10_type5', 'cate_11_type5',\n", + " 'cate_4_type6', 'cate_5_type6', 'cate_6_type6', 'cate_7_type6',\n", + " 'cate_8_type6', 'cate_9_type6', 'cate_10_type6', 'cate_11_type6'\n", + " ]\n", + " actions = actions.fillna(0)\n", + " actions['cate_action_sum'] = actions.sum(axis=1)\n", + " actions['cate8_percentage'] = (\n", + " actions['cate_8_type1'] + actions['cate_8_type2'] +\n", + " actions['cate_8_type3'] + actions['cate_8_type4'] +\n", + " actions['cate_8_type5'] + actions['cate_8_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate4_percentage'] = (\n", + " actions['cate_4_type1'] + actions['cate_4_type2'] +\n", + " actions['cate_4_type3'] + actions['cate_4_type4'] +\n", + " actions['cate_4_type5'] + actions['cate_4_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate5_percentage'] = (\n", + " actions['cate_5_type1'] + actions['cate_5_type2'] +\n", + " actions['cate_5_type3'] + actions['cate_5_type4'] +\n", + " actions['cate_5_type5'] + actions['cate_5_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate6_percentage'] = (\n", + " actions['cate_6_type1'] + actions['cate_6_type2'] +\n", + " actions['cate_6_type3'] + 
actions['cate_6_type4'] +\n", + " actions['cate_6_type5'] + actions['cate_6_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate7_percentage'] = (\n", + " actions['cate_7_type1'] + actions['cate_7_type2'] +\n", + " actions['cate_7_type3'] + actions['cate_7_type4'] +\n", + " actions['cate_7_type5'] + actions['cate_7_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate9_percentage'] = (\n", + " actions['cate_9_type1'] + actions['cate_9_type2'] +\n", + " actions['cate_9_type3'] + actions['cate_9_type4'] +\n", + " actions['cate_9_type5'] + actions['cate_9_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate10_percentage'] = (\n", + " actions['cate_10_type1'] + actions['cate_10_type2'] +\n", + " actions['cate_10_type3'] + actions['cate_10_type4'] +\n", + " actions['cate_10_type5'] + actions['cate_10_type6']\n", + " ) / actions['cate_action_sum']\n", + " actions['cate11_percentage'] = (\n", + " actions['cate_11_type1'] + actions['cate_11_type2'] +\n", + " actions['cate_11_type3'] + actions['cate_11_type4'] +\n", + " actions['cate_11_type5'] + actions['cate_11_type6']\n", + " ) / actions['cate_action_sum']\n", + "\n", + " actions['cate8_type1_percentage'] = np.log(\n", + " 1 + actions['cate_8_type1']) - np.log(\n", + " 1 + actions['cate_8_type1'] + actions['cate_4_type1'] +\n", + " actions['cate_5_type1'] + actions['cate_6_type1'] +\n", + " actions['cate_7_type1'] + actions['cate_9_type1'] +\n", + " actions['cate_10_type1'] + actions['cate_11_type1'])\n", + "\n", + " actions['cate8_type2_percentage'] = np.log(\n", + " 1 + actions['cate_8_type2']) - np.log(\n", + " 1 + actions['cate_8_type2'] + actions['cate_4_type2'] +\n", + " actions['cate_5_type2'] + actions['cate_6_type2'] +\n", + " actions['cate_7_type2'] + actions['cate_9_type2'] +\n", + " actions['cate_10_type2'] + actions['cate_11_type2'])\n", + " actions['cate8_type3_percentage'] = np.log(\n", + " 1 + actions['cate_8_type3']) - np.log(\n", + " 1 + 
actions['cate_8_type3'] + actions['cate_4_type3'] +\n", + " actions['cate_5_type3'] + actions['cate_6_type3'] +\n", + " actions['cate_7_type3'] + actions['cate_9_type3'] +\n", + " actions['cate_10_type3'] + actions['cate_11_type3'])\n", + " actions['cate8_type4_percentage'] = np.log(\n", + " 1 + actions['cate_8_type4']) - np.log(\n", + " 1 + actions['cate_8_type4'] + actions['cate_4_type4'] +\n", + " actions['cate_5_type4'] + actions['cate_6_type4'] +\n", + " actions['cate_7_type4'] + actions['cate_9_type4'] +\n", + " actions['cate_10_type4'] + actions['cate_11_type4'])\n", + " actions['cate8_type5_percentage'] = np.log(\n", + " 1 + actions['cate_8_type5']) - np.log(\n", + " 1 + actions['cate_8_type5'] + actions['cate_4_type5'] +\n", + " actions['cate_5_type5'] + actions['cate_6_type5'] +\n", + " actions['cate_7_type5'] + actions['cate_9_type5'] +\n", + " actions['cate_10_type5'] + actions['cate_11_type5'])\n", + " actions['cate8_type6_percentage'] = np.log(\n", + " 1 + actions['cate_8_type6']) - np.log(\n", + " 1 + actions['cate_8_type6'] + actions['cate_4_type6'] +\n", + " actions['cate_5_type6'] + actions['cate_6_type6'] +\n", + " actions['cate_7_type6'] + actions['cate_9_type6'] +\n", + " actions['cate_10_type6'] + actions['cate_11_type6'])\n", + " actions['user_id'] = actions.index\n", + " actions = actions[[\n", + " 'user_id', 'cate8_percentage', 'cate4_percentage', 'cate5_percentage',\n", + " 'cate6_percentage', 'cate7_percentage', 'cate9_percentage',\n", + " 'cate10_percentage', 'cate11_percentage', 'cate8_type1_percentage',\n", + " 'cate8_type2_percentage', 'cate8_type3_percentage',\n", + " 'cate8_type4_percentage', 'cate8_type5_percentage',\n", + " 'cate8_type6_percentage'\n", + " ]]\n", + " return actions" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2016-02-01\n", + "2016-02-04\n" + ] + } + ], + "source": [ + "train_start_date = 
'2016-02-01'\n", + "train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)\n", + "train_end_date = train_end_date.strftime('%Y-%m-%d')\n", + "day = 3\n", + "\n", + "start_date = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=day)\n", + "start_date = start_date.strftime('%Y-%m-%d')\n", + "\n", + "print (start_date)\n", + "print (train_end_date)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "all_actions = get_all_action()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idcatetype
29272629.010.01.0
30272629.010.01.0
31272629.010.06.0
32272629.010.01.0
33272629.010.06.0
\n", + "
" + ], + "text/plain": [ + " user_id cate type\n", + "29 272629.0 10.0 1.0\n", + "30 272629.0 10.0 1.0\n", + "31 272629.0 10.0 6.0\n", + "32 272629.0 10.0 1.0\n", + "33 272629.0 10.0 6.0" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ "actions = get_actions(start_date, train_end_date, all_actions)\n", - "actions = actions[['user_id', 'sku_id', 'cate','type']]\n", - " # 不同时间累积的行为计数(3,5,7,10,15,21,30)\n", - "df = pd.get_dummies(actions['type'], prefix='action_before_%s' %3)\n", - "before_date = 'action_before_%s' %3\n", - "actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame\n", - " # 分组统计,用户-类别-商品,不同用户对不同类别下商品的行为计数\n", - "actions = actions.groupby(['user_id', 'sku_id','cate'], as_index=False).sum()\n", - "actions.head(20)" + "actions = actions[['user_id', 'cate', 'type']]\n", + "actions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
type_1.0type_2.0type_3.0type_4.0type_5.0type_6.0
user_idcate
200002.04.016.00.00.00.00.020.0
5.04.00.00.00.00.06.0
7.04.00.00.00.00.03.0
8.04.00.00.00.00.012.0
200003.04.08.00.00.00.00.012.0
\n", + "
" + ], + "text/plain": [ + " type_1.0 type_2.0 type_3.0 type_4.0 type_5.0 type_6.0\n", + "user_id cate \n", + "200002.0 4.0 16.0 0.0 0.0 0.0 0.0 20.0\n", + " 5.0 4.0 0.0 0.0 0.0 0.0 6.0\n", + " 7.0 4.0 0.0 0.0 0.0 0.0 3.0\n", + " 8.0 4.0 0.0 0.0 0.0 0.0 12.0\n", + "200003.0 4.0 8.0 0.0 0.0 0.0 0.0 12.0" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.get_dummies(actions['type'], prefix='type')\n", + "actions = pd.concat([actions[['user_id', 'cate']], df], axis=1)\n", + "actions = actions.groupby(['user_id', 'cate']).sum()\n", + "actions.head()" ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
type_1.0type_2.0...type_5.0type_6.0
cate4.05.06.07.08.09.010.011.04.05.0...10.011.04.05.06.07.08.09.010.011.0
user_id
200002.016.04.0NaN4.04.0NaNNaNNaN0.00.0...NaNNaN20.06.0NaN3.012.0NaNNaNNaN
200003.08.0NaNNaNNaN12.0NaNNaNNaN0.0NaN...NaNNaN12.0NaNNaNNaN19.0NaNNaNNaN
200008.0NaNNaNNaN8.0NaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaN20.0NaNNaNNaNNaN
200023.0NaNNaNNaNNaN1.0NaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaN0.0NaNNaNNaN
200030.08.0NaNNaNNaNNaNNaNNaNNaN0.0NaN...NaNNaN17.0NaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 48 columns

\n", + "
" + ], + "text/plain": [ + " type_1.0 type_2.0 ... \\\n", + "cate 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 4.0 5.0 ... \n", + "user_id ... \n", + "200002.0 16.0 4.0 NaN 4.0 4.0 NaN NaN NaN 0.0 0.0 ... \n", + "200003.0 8.0 NaN NaN NaN 12.0 NaN NaN NaN 0.0 NaN ... \n", + "200008.0 NaN NaN NaN 8.0 NaN NaN NaN NaN NaN NaN ... \n", + "200023.0 NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN ... \n", + "200030.0 8.0 NaN NaN NaN NaN NaN NaN NaN 0.0 NaN ... \n", + "\n", + " type_5.0 type_6.0 \n", + "cate 10.0 11.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 \n", + "user_id \n", + "200002.0 NaN NaN 20.0 6.0 NaN 3.0 12.0 NaN NaN NaN \n", + "200003.0 NaN NaN 12.0 NaN NaN NaN 19.0 NaN NaN NaN \n", + "200008.0 NaN NaN NaN NaN NaN 20.0 NaN NaN NaN NaN \n", + "200023.0 NaN NaN NaN NaN NaN NaN 0.0 NaN NaN NaN \n", + "200030.0 NaN NaN 17.0 NaN NaN NaN NaN NaN NaN NaN \n", + "\n", + "[5 rows x 48 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions = actions.unstack()\n", + "actions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MultiIndex(levels=[['type_1.0', 'type_2.0', 'type_3.0', 'type_4.0', 'type_5.0', 'type_6.0'], [4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]],\n", + " codes=[[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5], [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]],\n", + " names=[None, 'cate'])" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MultiIndex(levels=[[4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0], ['type_1.0', 'type_2.0', 
'type_3.0', 'type_4.0', 'type_5.0', 'type_6.0']],\n", + " codes=[[0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7], [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5]],\n", + " names=['cate', None])" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions.columns = actions.columns.swaplevel(0, 1)\n", + "actions.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['type_1.0', 'type_1.0', 'type_1.0', 'type_1.0', 'type_1.0', 'type_1.0',\n", + " 'type_1.0', 'type_1.0', 'type_2.0', 'type_2.0', 'type_2.0', 'type_2.0',\n", + " 'type_2.0', 'type_2.0', 'type_2.0', 'type_2.0', 'type_3.0', 'type_3.0',\n", + " 'type_3.0', 'type_3.0', 'type_3.0', 'type_3.0', 'type_3.0', 'type_3.0',\n", + " 'type_4.0', 'type_4.0', 'type_4.0', 'type_4.0', 'type_4.0', 'type_4.0',\n", + " 'type_4.0', 'type_4.0', 'type_5.0', 'type_5.0', 'type_5.0', 'type_5.0',\n", + " 'type_5.0', 'type_5.0', 'type_5.0', 'type_5.0', 'type_6.0', 'type_6.0',\n", + " 'type_6.0', 'type_6.0', 'type_6.0', 'type_6.0', 'type_6.0', 'type_6.0'],\n", + " dtype='object')" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions.columns = actions.columns.droplevel()\n", + "actions.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['cate_4_type1', 'cate_5_type1', 'cate_6_type1', 'cate_7_type1',\n", + " 'cate_8_type1', 'cate_9_type1', 'cate_10_type1', 'cate_11_type1',\n", + " 'cate_4_type2', 'cate_5_type2', 'cate_6_type2', 'cate_7_type2',\n", + " 'cate_8_type2', 'cate_9_type2', 'cate_10_type2', 'cate_11_type2',\n", + " 'cate_4_type3', 
'cate_5_type3', 'cate_6_type3', 'cate_7_type3',\n", + " 'cate_8_type3', 'cate_9_type3', 'cate_10_type3', 'cate_11_type3',\n", + " 'cate_4_type4', 'cate_5_type4', 'cate_6_type4', 'cate_7_type4',\n", + " 'cate_8_type4', 'cate_9_type4', 'cate_10_type4', 'cate_11_type4',\n", + " 'cate_4_type5', 'cate_5_type5', 'cate_6_type5', 'cate_7_type5',\n", + " 'cate_8_type5', 'cate_9_type5', 'cate_10_type5', 'cate_11_type5',\n", + " 'cate_4_type6', 'cate_5_type6', 'cate_6_type6', 'cate_7_type6',\n", + " 'cate_8_type6', 'cate_9_type6', 'cate_10_type6', 'cate_11_type6'],\n", + " dtype='object')" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions.columns = [\n", + " 'cate_4_type1', 'cate_5_type1', 'cate_6_type1', 'cate_7_type1',\n", + " 'cate_8_type1', 'cate_9_type1', 'cate_10_type1', 'cate_11_type1',\n", + " 'cate_4_type2', 'cate_5_type2', 'cate_6_type2', 'cate_7_type2',\n", + " 'cate_8_type2', 'cate_9_type2', 'cate_10_type2', 'cate_11_type2',\n", + " 'cate_4_type3', 'cate_5_type3', 'cate_6_type3', 'cate_7_type3',\n", + " 'cate_8_type3', 'cate_9_type3', 'cate_10_type3', 'cate_11_type3',\n", + " 'cate_4_type4', 'cate_5_type4', 'cate_6_type4', 'cate_7_type4',\n", + " 'cate_8_type4', 'cate_9_type4', 'cate_10_type4', 'cate_11_type4',\n", + " 'cate_4_type5', 'cate_5_type5', 'cate_6_type5', 'cate_7_type5',\n", + " 'cate_8_type5', 'cate_9_type5', 'cate_10_type5', 'cate_11_type5',\n", + " 'cate_4_type6', 'cate_5_type6', 'cate_6_type6', 'cate_7_type6',\n", + " 'cate_8_type6', 'cate_9_type6', 'cate_10_type6', 'cate_11_type6'\n", + " ]\n", + "actions.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cate_4_type1cate_5_type1cate_6_type1cate_7_type1cate_8_type1cate_9_type1cate_10_type1cate_11_type1cate_4_type2cate_5_type2...cate_11_type5cate_4_type6cate_5_type6cate_6_type6cate_7_type6cate_8_type6cate_9_type6cate_10_type6cate_11_type6cate_action_sum
user_id
200002.016.04.00.04.04.00.00.00.00.00.0...0.020.06.00.03.012.00.00.00.069.0
200003.08.00.00.00.012.00.00.00.00.00.0...0.012.00.00.00.019.00.00.00.051.0
200008.00.00.00.08.00.00.00.00.00.00.0...0.00.00.00.020.00.00.00.00.028.0
200023.00.00.00.00.01.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.01.0
200030.08.00.00.00.00.00.00.00.00.00.0...0.017.00.00.00.00.00.00.00.025.0
\n", + "

5 rows × 49 columns

\n", + "
" + ], + "text/plain": [ + " cate_4_type1 cate_5_type1 cate_6_type1 cate_7_type1 \\\n", + "user_id \n", + "200002.0 16.0 4.0 0.0 4.0 \n", + "200003.0 8.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 8.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 8.0 0.0 0.0 0.0 \n", + "\n", + " cate_8_type1 cate_9_type1 cate_10_type1 cate_11_type1 \\\n", + "user_id \n", + "200002.0 4.0 0.0 0.0 0.0 \n", + "200003.0 12.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 0.0 \n", + "200023.0 1.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_4_type2 cate_5_type2 ... cate_11_type5 cate_4_type6 \\\n", + "user_id ... \n", + "200002.0 0.0 0.0 ... 0.0 20.0 \n", + "200003.0 0.0 0.0 ... 0.0 12.0 \n", + "200008.0 0.0 0.0 ... 0.0 0.0 \n", + "200023.0 0.0 0.0 ... 0.0 0.0 \n", + "200030.0 0.0 0.0 ... 0.0 17.0 \n", + "\n", + " cate_5_type6 cate_6_type6 cate_7_type6 cate_8_type6 \\\n", + "user_id \n", + "200002.0 6.0 0.0 3.0 12.0 \n", + "200003.0 0.0 0.0 0.0 19.0 \n", + "200008.0 0.0 0.0 20.0 0.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_9_type6 cate_10_type6 cate_11_type6 cate_action_sum \n", + "user_id \n", + "200002.0 0.0 0.0 0.0 69.0 \n", + "200003.0 0.0 0.0 0.0 51.0 \n", + "200008.0 0.0 0.0 0.0 28.0 \n", + "200023.0 0.0 0.0 0.0 1.0 \n", + "200030.0 0.0 0.0 0.0 25.0 \n", + "\n", + "[5 rows x 49 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions = actions.fillna(0)\n", + "actions['cate_action_sum'] = actions.sum(axis=1)\n", + "actions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cate_4_type1cate_5_type1cate_6_type1cate_7_type1cate_8_type1cate_9_type1cate_10_type1cate_11_type1cate_4_type2cate_5_type2...cate_4_type6cate_5_type6cate_6_type6cate_7_type6cate_8_type6cate_9_type6cate_10_type6cate_11_type6cate_action_sumcate8_percentage
user_id
200002.016.04.00.04.04.00.00.00.00.00.0...20.06.00.03.012.00.00.00.069.00.231884
200003.08.00.00.00.012.00.00.00.00.00.0...12.00.00.00.019.00.00.00.051.00.607843
200008.00.00.00.08.00.00.00.00.00.00.0...0.00.00.020.00.00.00.00.028.00.000000
200023.00.00.00.00.01.00.00.00.00.00.0...0.00.00.00.00.00.00.00.01.01.000000
200030.08.00.00.00.00.00.00.00.00.00.0...17.00.00.00.00.00.00.00.025.00.000000
\n", + "

5 rows × 50 columns

\n", + "
" + ], + "text/plain": [ + " cate_4_type1 cate_5_type1 cate_6_type1 cate_7_type1 \\\n", + "user_id \n", + "200002.0 16.0 4.0 0.0 4.0 \n", + "200003.0 8.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 8.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 8.0 0.0 0.0 0.0 \n", + "\n", + " cate_8_type1 cate_9_type1 cate_10_type1 cate_11_type1 \\\n", + "user_id \n", + "200002.0 4.0 0.0 0.0 0.0 \n", + "200003.0 12.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 0.0 \n", + "200023.0 1.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_4_type2 cate_5_type2 ... cate_4_type6 cate_5_type6 \\\n", + "user_id ... \n", + "200002.0 0.0 0.0 ... 20.0 6.0 \n", + "200003.0 0.0 0.0 ... 12.0 0.0 \n", + "200008.0 0.0 0.0 ... 0.0 0.0 \n", + "200023.0 0.0 0.0 ... 0.0 0.0 \n", + "200030.0 0.0 0.0 ... 17.0 0.0 \n", + "\n", + " cate_6_type6 cate_7_type6 cate_8_type6 cate_9_type6 \\\n", + "user_id \n", + "200002.0 0.0 3.0 12.0 0.0 \n", + "200003.0 0.0 0.0 19.0 0.0 \n", + "200008.0 0.0 20.0 0.0 0.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_10_type6 cate_11_type6 cate_action_sum cate8_percentage \n", + "user_id \n", + "200002.0 0.0 0.0 69.0 0.231884 \n", + "200003.0 0.0 0.0 51.0 0.607843 \n", + "200008.0 0.0 0.0 28.0 0.000000 \n", + "200023.0 0.0 0.0 1.0 1.000000 \n", + "200030.0 0.0 0.0 25.0 0.000000 \n", + "\n", + "[5 rows x 50 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions['cate8_percentage'] = (\n", + " actions['cate_8_type1'] + actions['cate_8_type2'] +\n", + " actions['cate_8_type3'] + actions['cate_8_type4'] +\n", + " actions['cate_8_type5'] + actions['cate_8_type6']\n", + " ) / actions['cate_action_sum']\n", + "actions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cate_4_type1cate_5_type1cate_6_type1cate_7_type1cate_8_type1cate_9_type1cate_10_type1cate_11_type1cate_4_type2cate_5_type2...cate_5_type6cate_6_type6cate_7_type6cate_8_type6cate_9_type6cate_10_type6cate_11_type6cate_action_sumcate8_percentagecate8_type1_percentage
user_id
200002.016.04.00.04.04.00.00.00.00.00.0...6.00.03.012.00.00.00.069.00.231884-1.757858
200003.08.00.00.00.012.00.00.00.00.00.0...0.00.00.019.00.00.00.051.00.607843-0.479573
200008.00.00.00.08.00.00.00.00.00.00.0...0.00.020.00.00.00.00.028.00.000000-2.197225
200023.00.00.00.00.01.00.00.00.00.00.0...0.00.00.00.00.00.00.01.01.0000000.000000
200030.08.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.025.00.000000-2.197225
\n", + "

5 rows × 51 columns

\n", + "
" + ], + "text/plain": [ + " cate_4_type1 cate_5_type1 cate_6_type1 cate_7_type1 \\\n", + "user_id \n", + "200002.0 16.0 4.0 0.0 4.0 \n", + "200003.0 8.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 8.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 8.0 0.0 0.0 0.0 \n", + "\n", + " cate_8_type1 cate_9_type1 cate_10_type1 cate_11_type1 \\\n", + "user_id \n", + "200002.0 4.0 0.0 0.0 0.0 \n", + "200003.0 12.0 0.0 0.0 0.0 \n", + "200008.0 0.0 0.0 0.0 0.0 \n", + "200023.0 1.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_4_type2 cate_5_type2 ... cate_5_type6 cate_6_type6 \\\n", + "user_id ... \n", + "200002.0 0.0 0.0 ... 6.0 0.0 \n", + "200003.0 0.0 0.0 ... 0.0 0.0 \n", + "200008.0 0.0 0.0 ... 0.0 0.0 \n", + "200023.0 0.0 0.0 ... 0.0 0.0 \n", + "200030.0 0.0 0.0 ... 0.0 0.0 \n", + "\n", + " cate_7_type6 cate_8_type6 cate_9_type6 cate_10_type6 \\\n", + "user_id \n", + "200002.0 3.0 12.0 0.0 0.0 \n", + "200003.0 0.0 19.0 0.0 0.0 \n", + "200008.0 20.0 0.0 0.0 0.0 \n", + "200023.0 0.0 0.0 0.0 0.0 \n", + "200030.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cate_11_type6 cate_action_sum cate8_percentage \\\n", + "user_id \n", + "200002.0 0.0 69.0 0.231884 \n", + "200003.0 0.0 51.0 0.607843 \n", + "200008.0 0.0 28.0 0.000000 \n", + "200023.0 0.0 1.0 1.000000 \n", + "200030.0 0.0 25.0 0.000000 \n", + "\n", + " cate8_type1_percentage \n", + "user_id \n", + "200002.0 -1.757858 \n", + "200003.0 -0.479573 \n", + "200008.0 -2.197225 \n", + "200023.0 0.000000 \n", + "200030.0 -2.197225 \n", + "\n", + "[5 rows x 51 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions['cate8_type1_percentage'] = np.log(\n", + " 1 + actions['cate_8_type1']) - np.log(\n", + " 1 + actions['cate_8_type1'] + actions['cate_4_type1'] +\n", + " actions['cate_5_type1'] + actions['cate_6_type1'] +\n", + " actions['cate_7_type1'] + actions['cate_9_type1'] +\n", + " actions['cate_10_type1'] + actions['cate_11_type1'])\n", + 
"actions.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 商品-行为\n", + "#### 累积商品特征\n", + "* 分时间段\n", + "* 针对商品的不同行为的\n", + " * 购买转化率\n", + " * 均值\n", + " * 标准差" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "def get_accumulate_product_feat(start_date, end_date, all_actions):\n", + " feature = [\n", + " 'sku_id', 'product_action_1', 'product_action_2',\n", + " 'product_action_3', 'product_action_4',\n", + " 'product_action_5', 'product_action_6',\n", + " 'product_action_1_ratio', 'product_action_2_ratio',\n", + " 'product_action_3_ratio', 'product_action_5_ratio',\n", + " 'product_action_6_ratio', 'product_action_1_mean',\n", + " 'product_action_2_mean', 'product_action_3_mean',\n", + " 'product_action_4_mean', 'product_action_5_mean',\n", + " 'product_action_6_mean', 'product_action_1_std',\n", + " 'product_action_2_std', 'product_action_3_std', 'product_action_4_std',\n", + " 'product_action_5_std', 'product_action_6_std'\n", + " ]\n", + "\n", + " actions = get_actions(start_date, end_date, all_actions)\n", + " df = pd.get_dummies(actions['type'], prefix='product_action')\n", + " # 按照商品-日期分组,计算某个时间段该商品的各项行为的标准差\n", + " actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())\n", + " actions = pd.concat([actions[['sku_id', 'date']], df], axis=1)\n", + " actions = actions.groupby(['sku_id'], as_index=False).sum()\n", + " days_interal = (datetime.strptime(end_date, '%Y-%m-%d') - datetime.strptime(start_date, '%Y-%m-%d')).days\n", + " \n", + " # 针对商品分组,计算购买转化率\n", + " actions['product_action_1_ratio'] = np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_1.0'])\n", + " actions['product_action_2_ratio'] = np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_2.0'])\n", + " actions['product_action_3_ratio'] = np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_3.0'])\n", 
+ " actions['product_action_5_ratio'] = np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_5.0'])\n", + " actions['product_action_6_ratio'] = np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_6.0'])\n", + " \n", + " # 计算各种行为的均值\n", + " actions['product_action_1_mean'] = actions[\n", + " 'product_action_1.0'] / days_interal\n", + " actions['product_action_2_mean'] = actions[\n", + " 'product_action_2.0'] / days_interal\n", + " actions['product_action_3_mean'] = actions[\n", + " 'product_action_3.0'] / days_interal\n", + " actions['product_action_4_mean'] = actions[\n", + " 'product_action_4.0'] / days_interal\n", + " actions['product_action_5_mean'] = actions[\n", + " 'product_action_5.0'] / days_interal\n", + " actions['product_action_6_mean'] = actions[\n", + " 'product_action_6.0'] / days_interal\n", + " return actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 类别特征\n", + "#### 分时间段下各个商品类别的\n", + "* 购买转化率\n", + " * 标准差\n", + " * 均值" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "def get_accumulate_cate_feat(start_date, end_date, all_actions):\n", + " feature = ['cate','cate_action_1', 'cate_action_2', 'cate_action_3', 'cate_action_4', 'cate_action_5', \n", + " 'cate_action_6', 'cate_action_1_ratio', 'cate_action_2_ratio', \n", + " 'cate_action_3_ratio', 'cate_action_5_ratio', 'cate_action_6_ratio', 'cate_action_1_mean',\n", + " 'cate_action_2_mean', 'cate_action_3_mean', 'cate_action_4_mean', 'cate_action_5_mean',\n", + " 'cate_action_6_mean', 'cate_action_1_std', 'cate_action_2_std', 'cate_action_3_std',\n", + " 'cate_action_4_std', 'cate_action_5_std', 'cate_action_6_std']\n", + " actions = get_actions(start_date, end_date, all_actions)\n", + " actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())\n", + " df = pd.get_dummies(actions['type'], prefix='cate_action')\n", + " actions = 
pd.concat([actions[['cate','date']], df], axis=1)\n", + " \n", + " # 按照类别分组,统计各个商品类别下行为的转化率\n", + " actions = actions.groupby(['cate'], as_index=False).sum()\n", + " days_interal = (datetime.strptime(end_date, '%Y-%m-%d')-datetime.strptime(start_date, '%Y-%m-%d')).days\n", + " \n", + " actions['cate_action_1_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_1.0']))\n", + " actions['cate_action_2_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_2.0']))\n", + " actions['cate_action_3_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_3.0']))\n", + " actions['cate_action_5_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_5.0']))\n", + " actions['cate_action_6_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_6.0']))\n", + " \n", + " # 按照类别分组,统计各个商品类别下行为在一段时间的均值\n", + " actions['cate_action_1_mean'] = actions['cate_action_1.0'] / days_interal\n", + " actions['cate_action_2_mean'] = actions['cate_action_2.0'] / days_interal\n", + " actions['cate_action_3_mean'] = actions['cate_action_3.0'] / days_interal\n", + " actions['cate_action_4_mean'] = actions['cate_action_4.0'] / days_interal\n", + " actions['cate_action_5_mean'] = actions['cate_action_5.0'] / days_interal\n", + " actions['cate_action_6_mean'] = actions['cate_action_6.0'] / days_interal\n", + " return actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 构造训练集/测试集\n", + "### 构造训练集/验证集\n", + "标签,采用滑动窗口的方式,构造训练集的时候针对产生购买的行为标记为1\n", + "整合特征" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "def get_labels(start_date, end_date, all_actions):\n", + " actions = get_actions(start_date, end_date, all_actions)\n", + "# actions = actions[actions['type'] == 4]\n", + " # 修改为预测购买了商品8的用户预测\n", + " actions = actions[(actions['type'] == 4) & (actions['cate']==8)]\n", + 
" \n", + " actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum()\n", + " actions['label'] = 1\n", + " actions = actions[['user_id', 'sku_id', 'label']]\n", + " return actions" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "get all actions!\n" + ] + } + ], + "source": [ + "train_start_date = '2016-03-01'\n", + "train_actions = None\n", + "all_actions = get_all_action()\n", + "print (\"get all actions!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idsku_idtimemodel_idtypecatebrand
0266079.0138778.02016-01-31 23:59:02NaN1.08.0403.0
1266079.0138778.02016-01-31 23:59:030.06.08.0403.0
2200719.061226.02016-01-31 23:59:07NaN1.08.030.0
3200719.061226.02016-01-31 23:59:080.06.08.030.0
4263587.072348.02016-01-31 23:59:08NaN1.05.0159.0
\n", + "
" + ], + "text/plain": [ + " user_id sku_id time model_id type cate brand\n", + "0 266079.0 138778.0 2016-01-31 23:59:02 NaN 1.0 8.0 403.0\n", + "1 266079.0 138778.0 2016-01-31 23:59:03 0.0 6.0 8.0 403.0\n", + "2 200719.0 61226.0 2016-01-31 23:59:07 NaN 1.0 8.0 30.0\n", + "3 200719.0 61226.0 2016-01-31 23:59:08 0.0 6.0 8.0 30.0\n", + "4 263587.0 72348.0 2016-01-31 23:59:08 NaN 1.0 5.0 159.0" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_actions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 50601736 entries, 0 to 13199933\n", + "Data columns (total 7 columns):\n", + "user_id float32\n", + "sku_id float32\n", + "time object\n", + "model_id float32\n", + "type float32\n", + "cate float32\n", + "brand float32\n", + "dtypes: float32(6), object(1)\n", + "memory usage: 1.9+ GB\n" + ] + } + ], + "source": [ + "all_actions.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(50601736, 7)" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_actions.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "get_basic_user_feat finsihed\n" + ] + } + ], + "source": [ + "user = get_basic_user_feat()\n", + "print ('get_basic_user_feat finsihed')" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idage_0age_1age_2age_3age_4age_5age_6sex_0sex_1sex_2user_lv_cd_1user_lv_cd_2user_lv_cd_3user_lv_cd_4user_lv_cd_5
0200001.00.00.00.00.00.00.01.00.00.01.00.00.00.00.01.0
1200002.01.00.00.00.00.00.00.01.00.00.01.00.00.00.00.0
2200003.00.00.00.00.01.00.00.00.01.00.00.00.00.01.00.0
3200004.01.00.00.00.00.00.00.00.00.01.01.00.00.00.00.0
4200005.00.00.01.00.00.00.00.01.00.00.00.00.00.01.00.0
\n", + "
" + ], + "text/plain": [ + " user_id age_0 age_1 age_2 age_3 age_4 age_5 age_6 sex_0 sex_1 \\\n", + "0 200001.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 \n", + "1 200002.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 \n", + "2 200003.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n", + "3 200004.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "4 200005.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 \n", + "\n", + " sex_2 user_lv_cd_1 user_lv_cd_2 user_lv_cd_3 user_lv_cd_4 user_lv_cd_5 \n", + "0 1.0 0.0 0.0 0.0 0.0 1.0 \n", + "1 0.0 1.0 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 1.0 0.0 \n", + "3 1.0 1.0 0.0 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 1.0 0.0 " + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "get_basic_product_feat finsihed\n" + ] + } + ], + "source": [ + "product = get_basic_product_feat()\n", + "print ('get_basic_product_feat finsihed')" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sku_idcatebranda1_-1a1_1a1_2a1_3a2_-1a2_1a2_2a3_-1a3_1a3_2
01084890001010010
110000284890001001001
21000038300100100100
310000685450100001010
41000182441000010001
\n", + "
" + ], + "text/plain": [ + " sku_id cate brand a1_-1 a1_1 a1_2 a1_3 a2_-1 a2_1 a2_2 a3_-1 \\\n", + "0 10 8 489 0 0 0 1 0 1 0 0 \n", + "1 100002 8 489 0 0 0 1 0 0 1 0 \n", + "2 100003 8 30 0 1 0 0 1 0 0 1 \n", + "3 100006 8 545 0 1 0 0 0 0 1 0 \n", + "4 10001 8 244 1 0 0 0 0 1 0 0 \n", + "\n", + " a3_1 a3_2 \n", + "0 1 0 \n", + "1 0 1 \n", + "2 0 0 \n", + "3 1 0 \n", + "4 0 1 " + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "product.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "datetime.datetime(2016, 3, 4, 0, 0)" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_start_date = '2016-03-01'\n", + "train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)\n", + "train_end_date" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2016-03-04\n" + ] + } + ], + "source": [ + "train_end_date = train_end_date.strftime('%Y-%m-%d')\n", + "# 修正prod_acc,cate_acc的时间跨度\n", + "start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n", + "start_days = start_days.strftime('%Y-%m-%d')\n", + "print (train_end_date)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "def make_actions(user, product, all_actions, train_start_date):\n", + " train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=3)\n", + " train_end_date = train_end_date.strftime('%Y-%m-%d')\n", + " # 修正prod_acc,cate_acc的时间跨度\n", + " start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n", + " start_days = start_days.strftime('%Y-%m-%d')\n", + " print (train_end_date)\n", + " user_acc = get_recent_user_feat(train_end_date, 
all_actions)\n", + " print ('get_recent_user_feat finsihed')\n", + " \n", + " user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n", + " print ('get_user_cate_feature finished')\n", + " \n", + " product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n", + " print ('get_accumulate_product_feat finsihed')\n", + " cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n", + " print ('get_accumulate_cate_feat finsihed')\n", + " comment_acc = get_comments_product_feat(train_end_date)\n", + " print ('get_comments_product_feat finished')\n", + " # 标记\n", + " test_start_date = train_end_date\n", + " test_end_date = datetime.strptime(test_start_date, '%Y-%m-%d') + timedelta(days=5)\n", + " test_end_date = test_end_date.strftime('%Y-%m-%d')\n", + " labels = get_labels(test_start_date, test_end_date, all_actions)\n", + " print (\"get labels\")\n", + " \n", + " actions = None\n", + " for i in (3, 5, 7, 10, 15, 21, 30):\n", + " start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)\n", + " start_days = start_days.strftime('%Y-%m-%d')\n", + " if actions is None:\n", + " actions = get_action_feat(start_days, train_end_date, all_actions, i)\n", + " else:\n", + " # 注意这里的拼接key\n", + " actions = pd.merge(actions, get_action_feat(start_days, train_end_date, all_actions, i), how='left',\n", + " on=['user_id', 'sku_id', 'cate'])\n", + "\n", + " actions = pd.merge(actions, user, how='left', on='user_id')\n", + " actions = pd.merge(actions, user_acc, how='left', on='user_id')\n", + "# actions = pd.merge(actions, user_cate, how='left', on='user_id')\n", + " actions.append(user_cate)\n", + " # 注意这里的拼接key\n", + " actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])\n", + " actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n", + " actions = pd.merge(actions, cate_acc, how='left', on='cate')\n", + " actions = pd.merge(actions, comment_acc, how='left', 
on='sku_id')\n", + " actions = pd.merge(actions, labels, how='left', on=['user_id', 'sku_id'])\n", + " # 主要是填充拼接商品基本特征、评论特征、标签之后的空值\n", + " actions = actions.fillna(0)\n", + "# return actions\n", + " # 采样\n", + " action_postive = actions[actions['label'] == 1]\n", + " action_negative = actions[actions['label'] == 0]\n", + " del actions\n", + " neg_len = len(action_postive) * 10\n", + " action_negative = action_negative.sample(n=neg_len)\n", + " action_sample = pd.concat([action_postive, action_negative], ignore_index=True) \n", + " \n", + " return action_sample" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "def make_train_set(train_start_date, setNums ,f_path, all_actions):\n", + " train_actions = None\n", + " #all_actions = get_all_action()\n", + " #print (\"get all actions!\")\n", + " user = get_basic_user_feat()\n", + " print ('get_basic_user_feat finsihed')\n", + " product = get_basic_product_feat()\n", + " print ('get_basic_product_feat finsihed')\n", + " # 滑窗,构造多组训练集/验证集\n", + " for i in range(setNums):\n", + " print (train_start_date)\n", + " if train_actions is None:\n", + " train_actions = make_actions(user, product, all_actions, train_start_date)\n", + " else:\n", + " train_actions = pd.concat([train_actions, make_actions(user, product, all_actions, train_start_date)],\n", + " ignore_index=True)\n", + " # 接下来每次移动一天\n", + " train_start_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=1)\n", + " train_start_date = train_start_date.strftime('%Y-%m-%d')\n", + " print (\"round {0}/{1} over!\".format(i+1, setNums))\n", + "\n", + " train_actions.to_csv(f_path, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "get_basic_user_feat finsihed\n", + "get_basic_product_feat finsihed\n", + "2016-02-01\n", + "2016-02-04\n", + "get_recent_user_feat finsihed\n", 
+ "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 1/20 over!\n", + "2016-02-02\n", + "2016-02-05\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 2/20 over!\n", + "2016-02-03\n", + "2016-02-06\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 3/20 over!\n", + "2016-02-04\n", + "2016-02-07\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 4/20 over!\n", + "2016-02-05\n", + "2016-02-08\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 5/20 over!\n", + "2016-02-06\n", + "2016-02-09\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 6/20 over!\n", + "2016-02-07\n", + "2016-02-10\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 7/20 over!\n", + "2016-02-08\n", + "2016-02-11\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + 
"get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 8/20 over!\n", + "2016-02-09\n", + "2016-02-12\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 9/20 over!\n", + "2016-02-10\n", + "2016-02-13\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 10/20 over!\n", + "2016-02-11\n", + "2016-02-14\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 11/20 over!\n", + "2016-02-12\n", + "2016-02-15\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 12/20 over!\n", + "2016-02-13\n", + "2016-02-16\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 13/20 over!\n", + "2016-02-14\n", + "2016-02-17\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 14/20 over!\n", + "2016-02-15\n", + "2016-02-18\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat 
finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 15/20 over!\n", + "2016-02-16\n", + "2016-02-19\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 16/20 over!\n", + "2016-02-17\n", + "2016-02-20\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 17/20 over!\n", + "2016-02-18\n", + "2016-02-21\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 18/20 over!\n", + "2016-02-19\n", + "2016-02-22\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 19/20 over!\n", + "2016-02-20\n", + "2016-02-23\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n", + "get labels\n", + "round 20/20 over!\n" + ] + } + ], + "source": [ + "# 训练集\n", + "train_start_date = '2016-02-01'\n", + "make_train_set(train_start_date, 20, 'data/train_set.csv',all_actions)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 构造验证集(线下测试集)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "def make_val_answer(val_start_date, val_end_date, all_actions, label_val_s1_path):\n", + " 
actions = get_actions(val_start_date, val_end_date,all_actions)\n", + " actions = actions[(actions['type'] == 4) & (actions['cate'] == 8)]\n", + " actions = actions[['user_id', 'sku_id']]\n", + " actions = actions.drop_duplicates()\n", + " actions.to_csv(label_val_s1_path, index=False)\n", + "\n", + "def make_val_set(train_start_date, train_end_date, val_s1_path):\n", + " # 修改时间跨度\n", + " start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n", + " start_days = start_days.strftime('%Y-%m-%d')\n", + " all_actions = get_all_action()\n", + " print (\"get all actions!\")\n", + " user = get_basic_user_feat()\n", + " print ('get_basic_user_feat finsihed')\n", + " \n", + " product = get_basic_product_feat()\n", + " print ('get_basic_product_feat finsihed')\n", + " user_acc = get_recent_user_feat(train_end_date, all_actions)\n", + " print ('get_recent_user_feat finsihed')\n", + " user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n", + " print ('get_user_cate_feature finished')\n", + " \n", + " product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n", + " print ('get_accumulate_product_feat finsihed')\n", + " cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n", + " print ('get_accumulate_cate_feat finsihed')\n", + " comment_acc = get_comments_product_feat(train_end_date)\n", + " print ('get_comments_product_feat finished')\n", + " \n", + " actions = None\n", + " for i in (3, 5, 7, 10, 15, 21, 30):\n", + " start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)\n", + " start_days = start_days.strftime('%Y-%m-%d')\n", + " if actions is None:\n", + " actions = get_action_feat(start_days, train_end_date, all_actions,i)\n", + " else:\n", + " actions = pd.merge(actions, get_action_feat(start_days, train_end_date,all_actions,i), how='left',\n", + " on=['user_id', 'sku_id', 'cate'])\n", + "\n", + " actions = pd.merge(actions, user, how='left', 
on='user_id')\n", + " actions = pd.merge(actions, user_acc, how='left', on='user_id')\n", + "# actions = pd.merge(actions, user_cate, how='left', on='user_id')\n", + " actions.append(user_cate)\n", + " # 注意这里的拼接key\n", + " actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])\n", + " actions = pd.merge(actions, product_acc, how='left', on='sku_id')\n", + " actions = pd.merge(actions, cate_acc, how='left', on='cate')\n", + " actions = pd.merge(actions, comment_acc, how='left', on='sku_id')\n", + " actions = actions.fillna(0)\n", + " \n", + " \n", + "# print actions\n", + " # 构造真实用户购买情况作为后续验证\n", + " val_start_date = train_end_date\n", + " val_end_date = datetime.strptime(val_start_date, '%Y-%m-%d') + timedelta(days=5)\n", + " val_end_date = val_end_date.strftime('%Y-%m-%d')\n", + " make_val_answer(val_start_date, val_end_date, all_actions, 'label_'+val_s1_path)\n", + " \n", + " actions.to_csv(val_s1_path, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "get all actions!\n", + "get_basic_user_feat finsihed\n", + "get_basic_product_feat finsihed\n", + "get_recent_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n", + "get_comments_product_feat finished\n" + ] + }, + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'label_data/val_set.csv'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# 验证集\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m 
\u001b[0mmake_val_set\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'2016-02-23'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'2016-02-26'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'data/val_set.csv'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32m\u001b[0m in \u001b[0;36mmake_val_set\u001b[1;34m(train_start_date, train_end_date, val_s1_path)\u001b[0m\n\u001b[0;32m 56\u001b[0m \u001b[0mval_end_date\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstrptime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mval_start_date\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'%Y-%m-%d'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mtimedelta\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdays\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 57\u001b[0m \u001b[0mval_end_date\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mval_end_date\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstrftime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'%Y-%m-%d'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 58\u001b[1;33m \u001b[0mmake_val_answer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mval_start_date\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mval_end_date\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mall_actions\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'label_'\u001b[0m\u001b[1;33m+\u001b[0m\u001b[0mval_s1_path\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 59\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 60\u001b[0m \u001b[0mactions\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mval_s1_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + 
"\u001b[1;32m\u001b[0m in \u001b[0;36mmake_val_answer\u001b[1;34m(val_start_date, val_end_date, all_actions, label_val_s1_path)\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mactions\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mactions\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'user_id'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'sku_id'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mactions\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mactions\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrop_duplicates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mactions\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlabel_val_s1_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mmake_val_set\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtrain_start_date\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrain_end_date\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mval_s1_path\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36mto_csv\u001b[1;34m(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, compression, quoting, quotechar, line_terminator, chunksize, tupleize_cols, date_format, doublequote, escapechar, decimal)\u001b[0m\n\u001b[0;32m 3018\u001b[0m \u001b[0mdoublequote\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdoublequote\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 
3019\u001b[0m escapechar=escapechar, decimal=decimal)\n\u001b[1;32m-> 3020\u001b[1;33m \u001b[0mformatter\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msave\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3021\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3022\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mpath_or_buf\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\formats\\csvs.py\u001b[0m in \u001b[0;36msave\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 155\u001b[0m f, handles = _get_handle(self.path_or_buf, self.mode,\n\u001b[0;32m 156\u001b[0m \u001b[0mencoding\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mencoding\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 157\u001b[1;33m compression=self.compression)\n\u001b[0m\u001b[0;32m 158\u001b[0m \u001b[0mclose\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 159\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\common.py\u001b[0m in \u001b[0;36m_get_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text)\u001b[0m\n\u001b[0;32m 422\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mencoding\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 423\u001b[0m \u001b[1;31m# Python 3 and encoding\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 424\u001b[1;33m \u001b[0mf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpath_or_buf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[1;33m,\u001b[0m 
def make_test_set(train_start_date, train_end_date, test_s1_path="data/test_set.csv"):
    """Build the feature table used for the final prediction and save it as CSV.

    Args:
        train_start_date: 'YYYY-MM-DD' string; only forwarded to
            ``get_user_cate_feature``.
        train_end_date: 'YYYY-MM-DD' string; end of every feature window.
        test_s1_path: destination CSV path. Defaults to the location that
            used to be hard-coded, so existing two-argument calls behave as
            before.
    """
    date_fmt = '%Y-%m-%d'
    end_dt = datetime.strptime(train_end_date, date_fmt)
    # Product and category statistics use the 30 days before train_end_date.
    start_days = (end_dt - timedelta(days=30)).strftime(date_fmt)

    # print() calls, not Python 2 print statements, which are a SyntaxError on
    # the Python 3 kernel this notebook runs on.
    all_actions = get_all_action()
    print("get all actions!")
    user = get_basic_user_feat()
    print('get_basic_user_feat finsihed')
    product = get_basic_product_feat()
    print('get_basic_product_feat finsihed')

    user_acc = get_recent_user_feat(train_end_date, all_actions)
    # The message used to name get_accumulate_user_feat, which is not the
    # function called here.
    print('get_recent_user_feat finsihed')

    # NOTE(review): user_cate has never been part of the output. The merge on
    # 'user_id' was commented out and the replacement
    # `actions.append(user_cate)` discarded its result (DataFrame.append
    # returns a new frame and no longer exists in pandas >= 2.0). The call is
    # kept so the progress log is unchanged; confirm the join key before
    # merging it for real.
    user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)
    print('get_user_cate_feature finished')

    product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)
    print('get_accumulate_product_feat finsihed')
    cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)
    print('get_accumulate_cate_feat finsihed')
    comment_acc = get_comments_product_feat(train_end_date)

    # Sliding-window action features. The 3-day frame is the left side of
    # every join, so only its keys survive into the final table.
    actions = None
    for days in (3, 5, 7, 10, 15, 21, 30):
        # A dedicated name keeps the 30-day `start_days` above untouched.
        window_start = (end_dt - timedelta(days=days)).strftime(date_fmt)
        window_feat = get_action_feat(window_start, train_end_date, all_actions, days)
        if actions is None:
            actions = window_feat
        else:
            actions = pd.merge(actions, window_feat, how='left',
                               on=['user_id', 'sku_id', 'cate'])

    actions = pd.merge(actions, user, how='left', on='user_id')
    actions = pd.merge(actions, user_acc, how='left', on='user_id')
    # Mind the join key: product features are keyed on both sku_id and cate.
    actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])
    actions = pd.merge(actions, product_acc, how='left', on='sku_id')
    actions = pd.merge(actions, cate_acc, how='left', on='cate')
    actions = pd.merge(actions, comment_acc, how='left', on='sku_id')

    actions = actions.fillna(0)

    actions.to_csv(test_s1_path, index=False)