diff --git a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/道路通行时间预测-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/道路通行时间预测-checkpoint.ipynb index af3985e..213c66e 100644 --- a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/道路通行时间预测-checkpoint.ipynb +++ b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/道路通行时间预测-checkpoint.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -18,7 +18,7 @@ "from scipy.interpolate import UnivariateSpline\n", "from sklearn import linear_model\n", "import xgboost as xgb\n", - "# from ultis import *" + "from sklearn.utils import *" ] }, { @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -116,7 +116,7 @@ "4 10.4 " ] }, - "execution_count": 6, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -144,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -223,7 +223,7 @@ "4 4377906284422600514 55 12 1" ] }, - "execution_count": 8, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -249,7 +249,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -322,7 +322,7 @@ "4 4377906284422600514 2 1" ] }, - "execution_count": 11, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -339,7 +339,8 @@ "### 任务:预测未来一个月平均通行结果,每两分钟一次\n", "回归任务\n", "\n", - "构建时间序列,基于前几天或者前几十天的数据预测" + "构建时间序列,基于前几天或者前几十天的数据预测\n", + "https://tianchi.aliyun.com/competition/entrance/231598/information" ] }, { @@ -352,7 +353,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -438,7 +439,7 @@ "4 10.4 " ] }, - "execution_count": 12, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -449,7 +450,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -541,7 +542,7 @@ "4 10.4 2017-05-06 10:52:00 " ] }, - "execution_count": 13, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -563,7 +564,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -580,7 +581,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -594,7 +595,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -673,7 +674,7 @@ "4 3377906287934510514 2017-05-06 2.251292 2017-05-06 10:52:00" ] }, - "execution_count": 18, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -686,7 +687,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -696,7 +697,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -713,7 +714,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -792,7 +793,7 @@ "4 4377906283759500514 2017-05-06 2.140066 2017-05-06 13:24:00" ] }, - "execution_count": 25, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -811,7 +812,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -890,7 +891,7 @@ "4 4377906284422600514 55 12 1" ] }, - "execution_count": 26, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -901,7 +902,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -913,7 +914,7 @@ " dtype='datetime64[ns]', freq='2T')" ] }, - "execution_count": 27, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -925,7 +926,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -992,7 +993,7 @@ "4 4377906289869500514 2017-03-01 00:08:00" ] }, - "execution_count": 29, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1007,7 +1008,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1086,7 +1087,7 @@ "4 4377906289869500514 2017-03-01 00:08:00 NaN NaN" ] }, - "execution_count": 32, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1099,7 +1100,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -1117,7 +1118,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -1196,7 +1197,7 @@ "184 4377906289869500514 2017-03-01 06:08:00 2017-03-01 2.174752" ] }, - "execution_count": 42, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1207,7 +1208,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -1224,7 +1225,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -1316,7 +1317,7 @@ "184 2.174752 " ] }, - "execution_count": 44, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1336,7 +1337,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -1361,7 +1362,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1459,7 +1460,7 @@ "184 2.174752 2017-03-01-06 " ] }, - "execution_count": 46, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1482,7 +1483,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1601,7 +1602,7 @@ " 4 NaN NaN 2017-03-01-06 1.960745 " ] }, - "execution_count": 47, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1620,7 +1621,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1718,7 +1719,7 @@ "4 NaN 1.960745 " ] }, - "execution_count": 48, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1731,6 +1732,1171 @@ "df.head()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "日变化量(分钟)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "def minute_trend(group):\n", + " tmp = group.groupby('hour_minute').mean().reset_index()\n", + " #s的值越小,对数据的拟合越好,但是存在过拟合风险\n", + " spl = UnivariateSpline(tmp.index, tmp['travel_time'].values, s=0.5)\n", + " tmp['minute_trend'] = spl(tmp.index)\n", + " group = pd.merge(group, tmp[['minute_trend', 'hour_minute']], on='hour_minute', how='left')\n", + " return group" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDtime_interval_begindatetravel_timetravel_time2date_trendhour_minute
033779062800285105142017-03-01 06:00:002017-03-01NaNNaN1.96074506-00
133779062800285105142017-03-01 06:02:002017-03-01NaNNaN1.96074506-02
233779062800285105142017-03-01 06:04:002017-03-01NaNNaN1.96074506-04
333779062800285105142017-03-01 06:06:002017-03-01NaNNaN1.96074506-06
433779062800285105142017-03-01 06:08:002017-03-01NaNNaN1.96074506-08
\n", + "
" + ], + "text/plain": [ + " link_ID time_interval_begin date travel_time \\\n", + "0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n", + "1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n", + "2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n", + "3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n", + "4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n", + "\n", + " travel_time2 date_trend hour_minute \n", + "0 NaN 1.960745 06-00 \n", + "1 NaN 1.960745 06-02 \n", + "2 NaN 1.960745 06-04 \n", + "3 NaN 1.960745 06-06 \n", + "4 NaN 1.960745 06-08 " + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['hour_minute'] = df.time_interval_begin.map(lambda x: x.strftime('%H-%M'))\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDtime_interval_begindatetravel_timetravel_time2date_trendhour_minuteminute_trend
link_ID
3377906280028510514033779062800285105142017-03-01 06:00:002017-03-01NaNNaN1.96074506-00-0.252121
133779062800285105142017-03-01 06:02:002017-03-01NaNNaN1.96074506-02-0.246743
233779062800285105142017-03-01 06:04:002017-03-01NaNNaN1.96074506-04-0.241428
333779062800285105142017-03-01 06:06:002017-03-01NaNNaN1.96074506-06-0.236176
433779062800285105142017-03-01 06:08:002017-03-01NaNNaN1.96074506-08-0.230986
\n", + "
" + ], + "text/plain": [ + " link_ID time_interval_begin date \\\n", + "link_ID \n", + "3377906280028510514 0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 \n", + " 1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 \n", + " 2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 \n", + " 3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 \n", + " 4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 \n", + "\n", + " travel_time travel_time2 date_trend hour_minute \\\n", + "link_ID \n", + "3377906280028510514 0 NaN NaN 1.960745 06-00 \n", + " 1 NaN NaN 1.960745 06-02 \n", + " 2 NaN NaN 1.960745 06-04 \n", + " 3 NaN NaN 1.960745 06-06 \n", + " 4 NaN NaN 1.960745 06-08 \n", + "\n", + " minute_trend \n", + "link_ID \n", + "3377906280028510514 0 -0.252121 \n", + " 1 -0.246743 \n", + " 2 -0.241428 \n", + " 3 -0.236176 \n", + " 4 -0.230986 " + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df=df.groupby('link_ID').apply(minute_trend)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "与季节残差一样,回归得到的值存在 df['minute_trend']里,因此现在的travel_time再次更新为 df['travel_time]= df['travel_time']-df['minute_trend]" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.drop(['hour_minute', 'link_ID'], axis=1)\n", + "df = df.reset_index()\n", + "df = df.drop('level_1',axis=1)\n", + "df['travel_time'] = df['travel_time'] - df['minute_trend']" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDtime_interval_begindatetravel_timetravel_time2date_trendminute_trend
033779062800285105142017-03-01 06:00:002017-03-01NaNNaN1.960745-0.252121
133779062800285105142017-03-01 06:02:002017-03-01NaNNaN1.960745-0.246743
233779062800285105142017-03-01 06:04:002017-03-01NaNNaN1.960745-0.241428
333779062800285105142017-03-01 06:06:002017-03-01NaNNaN1.960745-0.236176
433779062800285105142017-03-01 06:08:002017-03-01NaNNaN1.960745-0.230986
\n", + "
" + ], + "text/plain": [ + " link_ID time_interval_begin date travel_time \\\n", + "0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n", + "1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n", + "2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n", + "3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n", + "4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n", + "\n", + " travel_time2 date_trend minute_trend \n", + "0 NaN 1.960745 -0.252121 \n", + "1 NaN 1.960745 -0.246743 \n", + "2 NaN 1.960745 -0.241428 \n", + "3 NaN 1.960745 -0.236176 \n", + "4 NaN 1.960745 -0.230986 " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "link_infos = pd.read_csv('gy_contest_link_info.txt',delimiter=';',dtype={'link_ID':object})\n", + "link_tops = pd.read_csv('gy_contest_link_top_update.txt',delimiter=',',dtype={'link_ID':object})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "基本上大概的走势已经被 date_trend和 hour_trend决定了,剩下就是建模得到这个travel_time如何围绕这两个trends上下变化的\n", + "\n", + "选择训练特征:" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDtime_interval_begindatetravel_timetravel_time2date_trendminute_trendlengthwidthlink_numarea
033779062800285105142017-03-01 06:00:002017-03-01NaNNaN1.960745-0.2521214832144
133779062800285105142017-03-01 06:02:002017-03-01NaNNaN1.960745-0.2467434832144
233779062800285105142017-03-01 06:04:002017-03-01NaNNaN1.960745-0.2414284832144
333779062800285105142017-03-01 06:06:002017-03-01NaNNaN1.960745-0.2361764832144
433779062800285105142017-03-01 06:08:002017-03-01NaNNaN1.960745-0.2309864832144
\n", + "
" + ], + "text/plain": [ + " link_ID time_interval_begin date travel_time \\\n", + "0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n", + "1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n", + "2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n", + "3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n", + "4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n", + "\n", + " travel_time2 date_trend minute_trend length width link_num area \n", + "0 NaN 1.960745 -0.252121 48 3 2 144 \n", + "1 NaN 1.960745 -0.246743 48 3 2 144 \n", + "2 NaN 1.960745 -0.241428 48 3 2 144 \n", + "3 NaN 1.960745 -0.236176 48 3 2 144 \n", + "4 NaN 1.960745 -0.230986 48 3 2 144 " + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "link_infos = pd.merge(link_infos, link_tops, on=['link_ID'], how='left')\n", + "link_infos['link_num'] = link_infos['in_links']+link_infos['out_links']\n", + "link_infos['area'] = link_infos['length'] * link_infos['width']\n", + "df = pd.merge(df, link_infos[['link_ID','length','width', 'link_num', 'area']], on=['link_ID'], how='left')\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDtime_interval_begindatetravel_timetravel_time2date_trendminute_trendlengthwidthlink_numareavacationminutehourdayweek_daymouth
033779062800285105142017-03-01 06:00:002017-03-01NaNNaN1.960745-0.25212148321440.006133
133779062800285105142017-03-01 06:02:002017-03-01NaNNaN1.960745-0.24674348321440.026133
233779062800285105142017-03-01 06:04:002017-03-01NaNNaN1.960745-0.24142848321440.046133
333779062800285105142017-03-01 06:06:002017-03-01NaNNaN1.960745-0.23617648321440.066133
433779062800285105142017-03-01 06:08:002017-03-01NaNNaN1.960745-0.23098648321440.086133
\n", + "
" + ], + "text/plain": [ + " link_ID time_interval_begin date travel_time \\\n", + "0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n", + "1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n", + "2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n", + "3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n", + "4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n", + "\n", + " travel_time2 date_trend minute_trend length width link_num area \\\n", + "0 NaN 1.960745 -0.252121 48 3 2 144 \n", + "1 NaN 1.960745 -0.246743 48 3 2 144 \n", + "2 NaN 1.960745 -0.241428 48 3 2 144 \n", + "3 NaN 1.960745 -0.236176 48 3 2 144 \n", + "4 NaN 1.960745 -0.230986 48 3 2 144 \n", + "\n", + " vacation minute hour day week_day mouth \n", + "0 0.0 0 6 1 3 3 \n", + "1 0.0 2 6 1 3 3 \n", + "2 0.0 4 6 1 3 3 \n", + "3 0.0 6 6 1 3 3 \n", + "4 0.0 8 6 1 3 3 " + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#时间相关特征\n", + "df.loc[df['date'].isin(['2017-04-02','2017-04-03','2017-04-04','2017-04-29','2017-04-30',\n", + " '2017-05-01','2017-05-28','2017-05-29','2017-05-30']),'vacation']=1\n", + "\n", + "df.loc[~df['date'].isin(['2017-04-02','2017-04-03','2017-04-04','2017-04-29','2017-04-30',\n", + " '2017-05-01','2017-05-28','2017-05-29','2017-05-30']),'vacation']=0\n", + "\n", + "df['minute'] = df['time_interval_begin'].dt.minute\n", + "df['hour'] = df['time_interval_begin'].dt.hour\n", + "df['day'] = df['time_interval_begin'].dt.day\n", + "df['week_day'] = df['time_interval_begin'].map(lambda x: x.weekday() + 1)\n", + "df['mouth'] = df['time_interval_begin'].dt.month\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "def mean_time(group):\n", + " group['link_ID_en'] = group['travel_time'].mean()\n", + " return group" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDtime_interval_begindatetravel_timetravel_time2date_trendminute_trendlengthwidthlink_numareavacationminutehourdayweek_daymouthlink_ID_en
033779062800285105142017-03-01 06:00:002017-03-01NaNNaN1.960745-0.25212148321440.0061330.000138
133779062800285105142017-03-01 06:02:002017-03-01NaNNaN1.960745-0.24674348321440.0261330.000138
233779062800285105142017-03-01 06:04:002017-03-01NaNNaN1.960745-0.24142848321440.0461330.000138
333779062800285105142017-03-01 06:06:002017-03-01NaNNaN1.960745-0.23617648321440.0661330.000138
433779062800285105142017-03-01 06:08:002017-03-01NaNNaN1.960745-0.23098648321440.0861330.000138
\n", + "
" + ], + "text/plain": [ + " link_ID time_interval_begin date travel_time \\\n", + "0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n", + "1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n", + "2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n", + "3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n", + "4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n", + "\n", + " travel_time2 date_trend minute_trend length width link_num area \\\n", + "0 NaN 1.960745 -0.252121 48 3 2 144 \n", + "1 NaN 1.960745 -0.246743 48 3 2 144 \n", + "2 NaN 1.960745 -0.241428 48 3 2 144 \n", + "3 NaN 1.960745 -0.236176 48 3 2 144 \n", + "4 NaN 1.960745 -0.230986 48 3 2 144 \n", + "\n", + " vacation minute hour day week_day mouth link_ID_en \n", + "0 0.0 0 6 1 3 3 0.000138 \n", + "1 0.0 2 6 1 3 3 0.000138 \n", + "2 0.0 4 6 1 3 3 0.000138 \n", + "3 0.0 6 6 1 3 3 0.000138 \n", + "4 0.0 8 6 1 3 3 0.000138 " + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.groupby('link_ID').apply(mean_time)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDtime_interval_begindatetravel_timetravel_time2date_trendminute_trendlengthwidthlink_numareavacationminutehourdayweek_daymouthlink_ID_en
033779062800285105142017-03-01 06:00:002017-03-01NaNNaN1.960745-0.25212148321440.00613375
133779062800285105142017-03-01 06:02:002017-03-01NaNNaN1.960745-0.24674348321440.02613375
233779062800285105142017-03-01 06:04:002017-03-01NaNNaN1.960745-0.24142848321440.04613375
333779062800285105142017-03-01 06:06:002017-03-01NaNNaN1.960745-0.23617648321440.06613375
433779062800285105142017-03-01 06:08:002017-03-01NaNNaN1.960745-0.23098648321440.08613375
\n", + "
" + ], + "text/plain": [ + " link_ID time_interval_begin date travel_time \\\n", + "0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n", + "1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n", + "2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n", + "3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n", + "4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n", + "\n", + " travel_time2 date_trend minute_trend length width link_num area \\\n", + "0 NaN 1.960745 -0.252121 48 3 2 144 \n", + "1 NaN 1.960745 -0.246743 48 3 2 144 \n", + "2 NaN 1.960745 -0.241428 48 3 2 144 \n", + "3 NaN 1.960745 -0.236176 48 3 2 144 \n", + "4 NaN 1.960745 -0.230986 48 3 2 144 \n", + "\n", + " vacation minute hour day week_day mouth link_ID_en \n", + "0 0.0 0 6 1 3 3 75 \n", + "1 0.0 2 6 1 3 3 75 \n", + "2 0.0 4 6 1 3 3 75 \n", + "3 0.0 6 6 1 3 3 75 \n", + "4 0.0 8 6 1 3 3 75 " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 通行时间长的编号大\n", + "sorted_link = np.sort(df['link_ID_en'].unique())\n", + "df['link_ID_en'] = df['link_ID_en'].map(lambda x: np.argmin(x >= sorted_link))\n", + "df.head()" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/道路通行时间预测.ipynb b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/道路通行时间预测.ipynb index 9cafb82..213c66e 100644 --- a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/道路通行时间预测.ipynb +++ b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/道路通行时间预测.ipynb @@ -2708,9 +2708,188 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDtime_interval_begindatetravel_timetravel_time2date_trendminute_trendlengthwidthlink_numareavacationminutehourdayweek_daymouthlink_ID_en
033779062800285105142017-03-01 06:00:002017-03-01NaNNaN1.960745-0.25212148321440.00613375
133779062800285105142017-03-01 06:02:002017-03-01NaNNaN1.960745-0.24674348321440.02613375
233779062800285105142017-03-01 06:04:002017-03-01NaNNaN1.960745-0.24142848321440.04613375
333779062800285105142017-03-01 06:06:002017-03-01NaNNaN1.960745-0.23617648321440.06613375
433779062800285105142017-03-01 06:08:002017-03-01NaNNaN1.960745-0.23098648321440.08613375
\n", + "
" + ], + "text/plain": [ + " link_ID time_interval_begin date travel_time \\\n", + "0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n", + "1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n", + "2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n", + "3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n", + "4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n", + "\n", + " travel_time2 date_trend minute_trend length width link_num area \\\n", + "0 NaN 1.960745 -0.252121 48 3 2 144 \n", + "1 NaN 1.960745 -0.246743 48 3 2 144 \n", + "2 NaN 1.960745 -0.241428 48 3 2 144 \n", + "3 NaN 1.960745 -0.236176 48 3 2 144 \n", + "4 NaN 1.960745 -0.230986 48 3 2 144 \n", + "\n", + " vacation minute hour day week_day mouth link_ID_en \n", + "0 0.0 0 6 1 3 3 75 \n", + "1 0.0 2 6 1 3 3 75 \n", + "2 0.0 4 6 1 3 3 75 \n", + "3 0.0 6 6 1 3 3 75 \n", + "4 0.0 8 6 1 3 3 75 " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# 通行时间长的编号大\n", "sorted_link = np.sort(df['link_ID_en'].unique())\n",