Update. Feature engineering of Users and Commendities

pull/2/head
benjas 5 years ago
parent adf64f3b73
commit 3901a1e776

@ -1021,7 +1021,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@ -1030,7 +1030,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 20,
"metadata": {},
"outputs": [
{
@ -1103,7 +1103,7 @@
"33 272629.0 10.0 6.0"
]
},
"execution_count": 21,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@ -1116,7 +1116,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 21,
"metadata": {},
"outputs": [
{
@ -1221,7 +1221,7 @@
"200003.0 4.0 8.0 0.0 0.0 0.0 0.0 12.0"
]
},
"execution_count": 22,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@ -1235,7 +1235,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 22,
"metadata": {},
"outputs": [
{
@ -1466,7 +1466,7 @@
"[5 rows x 48 columns]"
]
},
"execution_count": 23,
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
@ -1478,7 +1478,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 23,
"metadata": {},
"outputs": [
{
@ -1489,7 +1489,7 @@
" names=[None, 'cate'])"
]
},
"execution_count": 24,
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
@ -1500,7 +1500,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 24,
"metadata": {},
"outputs": [
{
@ -1511,7 +1511,7 @@
" names=['cate', None])"
]
},
"execution_count": 25,
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
@ -1523,7 +1523,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 25,
"metadata": {},
"outputs": [
{
@ -1540,7 +1540,7 @@
" dtype='object')"
]
},
"execution_count": 26,
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
@ -1552,7 +1552,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 26,
"metadata": {},
"outputs": [
{
@ -1573,7 +1573,7 @@
" dtype='object')"
]
},
"execution_count": 27,
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
@ -1598,7 +1598,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 27,
"metadata": {},
"outputs": [
{
@ -1839,7 +1839,7 @@
"[5 rows x 49 columns]"
]
},
"execution_count": 28,
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
@ -1852,7 +1852,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 28,
"metadata": {},
"outputs": [
{
@ -2093,7 +2093,7 @@
"[5 rows x 50 columns]"
]
},
"execution_count": 29,
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
@ -2109,7 +2109,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 29,
"metadata": {},
"outputs": [
{
@ -2358,7 +2358,7 @@
"[5 rows x 51 columns]"
]
},
"execution_count": 30,
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
@ -2388,7 +2388,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
@ -2451,7 +2451,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
@ -2499,7 +2499,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
@ -2517,7 +2517,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 33,
"metadata": {},
"outputs": [
{
@ -2537,7 +2537,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 34,
"metadata": {},
"outputs": [
{
@ -2634,7 +2634,7 @@
"4 263587.0 72348.0 2016-01-31 23:59:08 NaN 1.0 5.0 159.0"
]
},
"execution_count": 35,
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
@ -2645,7 +2645,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 35,
"metadata": {},
"outputs": [
{
@ -2673,7 +2673,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 36,
"metadata": {},
"outputs": [
{
@ -2682,7 +2682,7 @@
"(50601736, 7)"
]
},
"execution_count": 37,
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
@ -2693,7 +2693,7 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 37,
"metadata": {},
"outputs": [
{
@ -2711,7 +2711,7 @@
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 38,
"metadata": {},
"outputs": [
{
@ -2869,7 +2869,7 @@
"4 0.0 0.0 0.0 0.0 1.0 0.0 "
]
},
"execution_count": 39,
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
@ -2880,7 +2880,7 @@
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 39,
"metadata": {},
"outputs": [
{
@ -2898,7 +2898,7 @@
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 40,
"metadata": {},
"outputs": [
{
@ -3038,7 +3038,7 @@
"4 0 1 "
]
},
"execution_count": 41,
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
@ -3049,7 +3049,7 @@
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": 41,
"metadata": {},
"outputs": [
{
@ -3058,7 +3058,7 @@
"datetime.datetime(2016, 3, 4, 0, 0)"
]
},
"execution_count": 42,
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
@ -3071,7 +3071,7 @@
},
{
"cell_type": "code",
"execution_count": 43,
"execution_count": 42,
"metadata": {},
"outputs": [
{
@ -3092,7 +3092,7 @@
},
{
"cell_type": "code",
"execution_count": 53,
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
@ -3159,7 +3159,7 @@
},
{
"cell_type": "code",
"execution_count": 54,
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
@ -3189,7 +3189,7 @@
},
{
"cell_type": "code",
"execution_count": 56,
"execution_count": 45,
"metadata": {},
"outputs": [
{
@ -3205,7 +3205,27 @@
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n",
"get labels\n",
"get labels\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"D:\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:6692: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
"of pandas will change to not sort by default.\n",
"\n",
"To accept the future behavior, pass 'sort=False'.\n",
"\n",
"To retain the current behavior and silence the warning, pass 'sort=True'.\n",
"\n",
" sort=sort)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"round 1/20 over!\n",
"2016-02-02\n",
"2016-02-05\n",
@ -3396,7 +3416,7 @@
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
@ -3457,14 +3477,14 @@
" val_start_date = train_end_date\n",
" val_end_date = datetime.strptime(val_start_date, '%Y-%m-%d') + timedelta(days=5)\n",
" val_end_date = val_end_date.strftime('%Y-%m-%d')\n",
" make_val_answer(val_start_date, val_end_date, all_actions, 'label_'+val_s1_path)\n",
" make_val_answer(val_start_date, val_end_date, all_actions, val_s1_path)\n",
" \n",
" actions.to_csv(val_s1_path, index=False)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"execution_count": 47,
"metadata": {},
"outputs": [
{
@ -3480,22 +3500,6 @@
"get_accumulate_cate_feat finsihed\n",
"get_comments_product_feat finished\n"
]
},
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: 'label_data/val_set.csv'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-61-e7de241c1782>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# 验证集\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mmake_val_set\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'2016-02-23'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'2016-02-26'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'data/val_set.csv'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m<ipython-input-60-6f88f89bdf35>\u001b[0m in \u001b[0;36mmake_val_set\u001b[1;34m(train_start_date, train_end_date, val_s1_path)\u001b[0m\n\u001b[0;32m 56\u001b[0m \u001b[0mval_end_date\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstrptime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mval_start_date\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'%Y-%m-%d'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mtimedelta\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdays\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 57\u001b[0m \u001b[0mval_end_date\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mval_end_date\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstrftime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'%Y-%m-%d'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 58\u001b[1;33m \u001b[0mmake_val_answer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mval_start_date\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mval_end_date\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mall_actions\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'label_'\u001b[0m\u001b[1;33m+\u001b[0m\u001b[0mval_s1_path\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 59\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 60\u001b[0m \u001b[0mactions\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mval_s1_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m<ipython-input-60-6f88f89bdf35>\u001b[0m in \u001b[0;36mmake_val_answer\u001b[1;34m(val_start_date, val_end_date, all_actions, label_val_s1_path)\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mactions\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mactions\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'user_id'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'sku_id'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mactions\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mactions\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrop_duplicates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mactions\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlabel_val_s1_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mmake_val_set\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtrain_start_date\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrain_end_date\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mval_s1_path\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36mto_csv\u001b[1;34m(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, compression, quoting, quotechar, line_terminator, chunksize, tupleize_cols, date_format, doublequote, escapechar, decimal)\u001b[0m\n\u001b[0;32m 3018\u001b[0m \u001b[0mdoublequote\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdoublequote\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3019\u001b[0m escapechar=escapechar, decimal=decimal)\n\u001b[1;32m-> 3020\u001b[1;33m \u001b[0mformatter\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msave\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3021\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3022\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mpath_or_buf\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\formats\\csvs.py\u001b[0m in \u001b[0;36msave\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 155\u001b[0m f, handles = _get_handle(self.path_or_buf, self.mode,\n\u001b[0;32m 156\u001b[0m \u001b[0mencoding\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mencoding\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 157\u001b[1;33m compression=self.compression)\n\u001b[0m\u001b[0;32m 158\u001b[0m \u001b[0mclose\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 159\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\pandas\\io\\common.py\u001b[0m in \u001b[0;36m_get_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text)\u001b[0m\n\u001b[0;32m 422\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mencoding\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 423\u001b[0m \u001b[1;31m# Python 3 and encoding\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 424\u001b[1;33m \u001b[0mf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpath_or_buf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mencoding\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnewline\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 425\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mis_text\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 426\u001b[0m \u001b[1;31m# Python 3 and no explicit encoding\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'label_data/val_set.csv'"
]
}
],
"source": [
@ -3505,7 +3509,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
@ -3513,22 +3517,22 @@
" start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=30)\n",
" start_days = start_days.strftime('%Y-%m-%d')\n",
" all_actions = get_all_action()\n",
" print \"get all actions!\"\n",
" print(\"get all actions!\")\n",
" user = get_basic_user_feat()\n",
" print 'get_basic_user_feat finsihed'\n",
" print('get_basic_user_feat finsihed')\n",
" product = get_basic_product_feat()\n",
" print 'get_basic_product_feat finsihed'\n",
" print('get_basic_product_feat finsihed')\n",
" \n",
" user_acc = get_recent_user_feat(train_end_date, all_actions)\n",
" print 'get_accumulate_user_feat finsihed'\n",
" print('get_accumulate_user_feat finsihed')\n",
" \n",
" user_cate = get_user_cate_feature(train_start_date, train_end_date, all_actions)\n",
" print 'get_user_cate_feature finished'\n",
" print('get_user_cate_feature finished')\n",
" \n",
" product_acc = get_accumulate_product_feat(start_days, train_end_date, all_actions)\n",
" print 'get_accumulate_product_feat finsihed'\n",
" print('get_accumulate_product_feat finsihed')\n",
" cate_acc = get_accumulate_cate_feat(start_days, train_end_date, all_actions)\n",
" print 'get_accumulate_cate_feat finsihed'\n",
" print('get_accumulate_cate_feat finsihed')\n",
" comment_acc = get_comments_product_feat(train_end_date)\n",
"\n",
" actions = None\n",
@ -3559,9 +3563,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 50,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"get all actions!\n",
"get_basic_user_feat finsihed\n",
"get_basic_product_feat finsihed\n",
"get_accumulate_user_feat finsihed\n",
"get_user_cate_feature finished\n",
"get_accumulate_product_feat finsihed\n",
"get_accumulate_cate_feat finsihed\n"
]
}
],
"source": [
"# 预测结果\n",
"sub_start_date = '2016-04-13'\n",

Loading…
Cancel
Save