From 3901a1e776c8f87e8408ab699754b91e1f80655c Mon Sep 17 00:00:00 2001 From: benjas <909336740@qq.com> Date: Fri, 5 Feb 2021 13:39:47 +0800 Subject: [PATCH] Update. Feature engineering of Users and Commendities --- .../3-特征工程.ipynb | 162 ++++++++++-------- 1 file changed, 90 insertions(+), 72 deletions(-) diff --git a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程.ipynb b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程.ipynb index 5b04afa..035fe39 100644 --- a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程.ipynb +++ b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/3-特征工程.ipynb @@ -1021,7 +1021,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -1030,7 +1030,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -1103,7 +1103,7 @@ "33 272629.0 10.0 6.0" ] }, - "execution_count": 21, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1116,7 +1116,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1221,7 +1221,7 @@ "200003.0 4.0 8.0 0.0 0.0 0.0 0.0 12.0" ] }, - "execution_count": 22, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1235,7 +1235,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1466,7 +1466,7 @@ "[5 rows x 48 columns]" ] }, - "execution_count": 23, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1478,7 +1478,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1489,7 +1489,7 @@ " names=[None, 'cate'])" ] }, - "execution_count": 24, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1500,7 +1500,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1511,7 +1511,7 @@ " names=['cate', None])" ] }, - "execution_count": 25, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1523,7 +1523,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -1540,7 +1540,7 @@ " dtype='object')" ] }, - "execution_count": 26, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1552,7 +1552,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -1573,7 +1573,7 @@ " dtype='object')" ] }, - "execution_count": 27, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1598,7 +1598,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1839,7 +1839,7 @@ "[5 rows x 49 columns]" ] }, - "execution_count": 28, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1852,7 +1852,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -2093,7 +2093,7 @@ "[5 rows x 50 columns]" ] }, - "execution_count": 29, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -2109,7 +2109,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -2358,7 +2358,7 @@ "[5 rows x 51 columns]" ] }, - "execution_count": 30, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -2388,7 +2388,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -2451,7 +2451,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -2499,7 +2499,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -2517,7 +2517,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -2537,7 +2537,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -2634,7 +2634,7 @@ "4 263587.0 72348.0 2016-01-31 23:59:08 NaN 1.0 5.0 159.0" ] }, - "execution_count": 35, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -2645,7 +2645,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -2673,7 +2673,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -2682,7 +2682,7 @@ "(50601736, 7)" ] }, - "execution_count": 37, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -2693,7 +2693,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -2711,7 +2711,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -2869,7 +2869,7 @@ "4 0.0 0.0 0.0 0.0 1.0 0.0 " ] }, - "execution_count": 39, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -2880,7 +2880,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -2898,7 +2898,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -3038,7 +3038,7 @@ "4 0 1 " ] }, - "execution_count": 41, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -3049,7 +3049,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -3058,7 +3058,7 @@ "datetime.datetime(2016, 3, 4, 0, 0)" ] }, - "execution_count": 42, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -3071,7 +3071,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -3092,7 +3092,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -3159,7 +3159,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -3189,7 +3189,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 45, "metadata": {}, "outputs": [ { @@ -3205,7 +3205,27 @@ "get_accumulate_product_feat finsihed\n", "get_accumulate_cate_feat finsihed\n", "get_comments_product_feat finished\n", - "get labels\n", + "get labels\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "D:\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:6692: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " sort=sort)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "round 1/20 over!\n", "2016-02-02\n", "2016-02-05\n", @@ -3396,7 +3416,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -3457,14 +3477,14 @@ " val_start_date = train_end_date\n", " val_end_date = datetime.strptime(val_start_date, '%Y-%m-%d') + timedelta(days=5)\n", " val_end_date = val_end_date.strftime('%Y-%m-%d')\n", - " make_val_answer(val_start_date, val_end_date, all_actions, 'label_'+val_s1_path)\n", + " make_val_answer(val_start_date, val_end_date, all_actions, val_s1_path)\n", " \n", " actions.to_csv(val_s1_path, index=False)" ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 47, "metadata": {}, "outputs": [ { @@ -3480,22 +3500,6 @@ "get_accumulate_cate_feat finsihed\n", "get_comments_product_feat finished\n" ] - }, - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: 'label_data/val_set.csv'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# 验证集\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mmake_val_set\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'2016-02-23'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'2016-02-26'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'data/val_set.csv'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32m\u001b[0m in \u001b[0;36mmake_val_set\u001b[1;34m(train_start_date, train_end_date, val_s1_path)\u001b[0m\n\u001b[0;32m 56\u001b[0m \u001b[0mval_end_date\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstrptime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mval_start_date\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'%Y-%m-%d'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mtimedelta\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdays\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 57\u001b[0m \u001b[0mval_end_date\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mval_end_date\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstrftime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'%Y-%m-%d'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 58\u001b[1;33m \u001b[0mmake_val_answer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mval_start_date\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mval_end_date\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mall_actions\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'label_'\u001b[0m\u001b[1;33m+\u001b[0m\u001b[0mval_s1_path\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 59\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 60\u001b[0m \u001b[0mactions\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mval_s1_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m\u001b[0m in \u001b[0;36mmake_val_answer\u001b[1;34m(val_start_date, val_end_date, all_actions, label_val_s1_path)\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mactions\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mactions\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'user_id'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'sku_id'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mactions\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mactions\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrop_duplicates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mactions\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlabel_val_s1_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mmake_val_set\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtrain_start_date\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrain_end_date\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mval_s1_path\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36mto_csv\u001b[1;34m(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, compression, quoting, quotechar, line_terminator, chunksize, tupleize_cols, date_format, doublequote, escapechar, decimal)\u001b[0m\n\u001b[0;32m 3018\u001b[0m \u001b[0mdoublequote\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdoublequote\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3019\u001b[0m escapechar=escapechar, decimal=decimal)\n\u001b[1;32m-> 3020\u001b[1;33m \u001b[0mformatter\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msave\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3021\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3022\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mpath_or_buf\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\formats\\csvs.py\u001b[0m in \u001b[0;36msave\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 155\u001b[0m f, handles = _get_handle(self.path_or_buf, self.mode,\n\u001b[0;32m 156\u001b[0m \u001b[0mencoding\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mencoding\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 157\u001b[1;33m compression=self.compression)\n\u001b[0m\u001b[0;32m 158\u001b[0m \u001b[0mclose\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 159\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\common.py\u001b[0m in \u001b[0;36m_get_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text)\u001b[0m\n\u001b[0;32m 422\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mencoding\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 423\u001b[0m \u001b[1;31m# Python 3 and encoding\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 424\u001b[1;33m \u001b[0mf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpath_or_buf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mencoding\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnewline\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 425\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mis_text\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 426\u001b[0m \u001b[1;31m# Python 3 and no explicit encoding\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'label_data/val_set.csv'" - ] } ], "source": [ @@ -3505,7 +3509,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -3513,22 +3517,22 @@ " start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n", " start_days = start_days.strftime('%Y-%m-%d')\n", " all_actions = get_all_action()\n", - " print \"get all actions!\"\n", + " print(\"get all actions!\")\n", " user = get_basic_user_feat()\n", - " print 'get_basic_user_feat finsihed'\n", + " print('get_basic_user_feat finsihed')\n", " product = get_basic_product_feat()\n", - " print 'get_basic_product_feat finsihed'\n", + " print('get_basic_product_feat finsihed')\n", " \n", " user_acc = get_recent_user_feat(train_end_date, all_actions)\n", - " print 'get_accumulate_user_feat finsihed'\n", + " print('get_accumulate_user_feat finsihed')\n", " \n", " user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n", - " print 'get_user_cate_feature finished'\n", + " print('get_user_cate_feature finished')\n", " \n", " product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n", - " print 'get_accumulate_product_feat finsihed'\n", + " print('get_accumulate_product_feat finsihed')\n", " cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n", - " print 'get_accumulate_cate_feat finsihed'\n", + " print('get_accumulate_cate_feat finsihed')\n", " comment_acc = get_comments_product_feat(train_end_date)\n", "\n", " actions = None\n", @@ -3559,9 +3563,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "get all actions!\n", + "get_basic_user_feat finsihed\n", + "get_basic_product_feat finsihed\n", + "get_accumulate_user_feat finsihed\n", + "get_user_cate_feature finished\n", + "get_accumulate_product_feat finsihed\n", + "get_accumulate_cate_feat finsihed\n" + ] + } + ], "source": [ "# 预测结果\n", "sub_start_date = '2016-04-13'\n",