From ec67cad8a6d592f071cee81d8ceb4647fd92da20 Mon Sep 17 00:00:00 2001
From: benjas <909336740@qq.com>
Date: Sat, 5 Dec 2020 12:24:13 +0800
Subject: [PATCH] =?UTF-8?q?Update=20=E9=81=93=E8=B7=AF=E9=80=9A=E8=A1=8C?=
=?UTF-8?q?=E6=97=B6=E9=97=B4=E9=A2=84=E6=B5=8B.ipynb?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../道路通行时间预测.ipynb | 653 ++++++++++++++++--
1 file changed, 596 insertions(+), 57 deletions(-)
diff --git a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/道路通行时间预测.ipynb b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/道路通行时间预测.ipynb
index 213c66e..cbc96a1 100644
--- a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/道路通行时间预测.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/道路通行时间预测.ipynb
@@ -9,7 +9,7 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -1741,7 +1741,7 @@
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
@@ -1756,7 +1756,7 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 26,
"metadata": {},
"outputs": [
{
@@ -1860,7 +1860,7 @@
"4 NaN 1.960745 06-08 "
]
},
- "execution_count": 29,
+ "execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
@@ -1872,7 +1872,7 @@
},
{
"cell_type": "code",
- "execution_count": 33,
+ "execution_count": 27,
"metadata": {},
"outputs": [
{
@@ -2006,7 +2006,7 @@
" 4 -0.230986 "
]
},
- "execution_count": 33,
+ "execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
@@ -2025,7 +2025,7 @@
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
@@ -2037,7 +2037,7 @@
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 29,
"metadata": {},
"outputs": [
{
@@ -2141,7 +2141,7 @@
"4 NaN 1.960745 -0.230986 "
]
},
- "execution_count": 35,
+ "execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
@@ -2152,7 +2152,7 @@
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
@@ -2171,7 +2171,7 @@
},
{
"cell_type": "code",
- "execution_count": 40,
+ "execution_count": 31,
"metadata": {},
"outputs": [
{
@@ -2204,7 +2204,7 @@
"
minute_trend | \n",
" length | \n",
" width | \n",
- " link_num | \n",
+ " links_num | \n",
" area | \n",
" \n",
" \n",
@@ -2291,30 +2291,30 @@
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
- " travel_time2 date_trend minute_trend length width link_num area \n",
- "0 NaN 1.960745 -0.252121 48 3 2 144 \n",
- "1 NaN 1.960745 -0.246743 48 3 2 144 \n",
- "2 NaN 1.960745 -0.241428 48 3 2 144 \n",
- "3 NaN 1.960745 -0.236176 48 3 2 144 \n",
- "4 NaN 1.960745 -0.230986 48 3 2 144 "
+ " travel_time2 date_trend minute_trend length width links_num area \n",
+ "0 NaN 1.960745 -0.252121 48 3 2 144 \n",
+ "1 NaN 1.960745 -0.246743 48 3 2 144 \n",
+ "2 NaN 1.960745 -0.241428 48 3 2 144 \n",
+ "3 NaN 1.960745 -0.236176 48 3 2 144 \n",
+ "4 NaN 1.960745 -0.230986 48 3 2 144 "
]
},
- "execution_count": 40,
+ "execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"link_infos = pd.merge(link_infos, link_tops, on=['link_ID'], how='left')\n",
- "link_infos['link_num'] = link_infos['in_links']+link_infos['out_links']\n",
+ "link_infos['links_num'] = link_infos['in_links']+link_infos['out_links']\n",
"link_infos['area'] = link_infos['length'] * link_infos['width']\n",
- "df = pd.merge(df, link_infos[['link_ID','length','width', 'link_num', 'area']], on=['link_ID'], how='left')\n",
+ "df = pd.merge(df, link_infos[['link_ID','length','width', 'links_num', 'area']], on=['link_ID'], how='left')\n",
"df.head()"
]
},
{
"cell_type": "code",
- "execution_count": 41,
+ "execution_count": 32,
"metadata": {},
"outputs": [
{
@@ -2347,14 +2347,14 @@
" minute_trend | \n",
" length | \n",
" width | \n",
- " link_num | \n",
+ " links_num | \n",
" area | \n",
" vacation | \n",
" minute | \n",
" hour | \n",
" day | \n",
" week_day | \n",
- " mouth | \n",
+ " month | \n",
" \n",
" \n",
" \n",
@@ -2470,14 +2470,14 @@
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
- " travel_time2 date_trend minute_trend length width link_num area \\\n",
- "0 NaN 1.960745 -0.252121 48 3 2 144 \n",
- "1 NaN 1.960745 -0.246743 48 3 2 144 \n",
- "2 NaN 1.960745 -0.241428 48 3 2 144 \n",
- "3 NaN 1.960745 -0.236176 48 3 2 144 \n",
- "4 NaN 1.960745 -0.230986 48 3 2 144 \n",
+ " travel_time2 date_trend minute_trend length width links_num area \\\n",
+ "0 NaN 1.960745 -0.252121 48 3 2 144 \n",
+ "1 NaN 1.960745 -0.246743 48 3 2 144 \n",
+ "2 NaN 1.960745 -0.241428 48 3 2 144 \n",
+ "3 NaN 1.960745 -0.236176 48 3 2 144 \n",
+ "4 NaN 1.960745 -0.230986 48 3 2 144 \n",
"\n",
- " vacation minute hour day week_day mouth \n",
+ " vacation minute hour day week_day month \n",
"0 0.0 0 6 1 3 3 \n",
"1 0.0 2 6 1 3 3 \n",
"2 0.0 4 6 1 3 3 \n",
@@ -2485,7 +2485,7 @@
"4 0.0 8 6 1 3 3 "
]
},
- "execution_count": 41,
+ "execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
@@ -2502,13 +2502,13 @@
"df['hour'] = df['time_interval_begin'].dt.hour\n",
"df['day'] = df['time_interval_begin'].dt.day\n",
"df['week_day'] = df['time_interval_begin'].map(lambda x: x.weekday() + 1)\n",
- "df['mouth'] = df['time_interval_begin'].dt.month\n",
+ "df['month'] = df['time_interval_begin'].dt.month\n",
"df.head()"
]
},
{
"cell_type": "code",
- "execution_count": 42,
+ "execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
@@ -2519,7 +2519,7 @@
},
{
"cell_type": "code",
- "execution_count": 43,
+ "execution_count": 34,
"metadata": {},
"outputs": [
{
@@ -2552,14 +2552,14 @@
" minute_trend | \n",
" length | \n",
" width | \n",
- " link_num | \n",
+ " links_num | \n",
" area | \n",
" vacation | \n",
" minute | \n",
" hour | \n",
" day | \n",
" week_day | \n",
- " mouth | \n",
+ " month | \n",
" link_ID_en | \n",
" \n",
" \n",
@@ -2681,14 +2681,14 @@
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
- " travel_time2 date_trend minute_trend length width link_num area \\\n",
- "0 NaN 1.960745 -0.252121 48 3 2 144 \n",
- "1 NaN 1.960745 -0.246743 48 3 2 144 \n",
- "2 NaN 1.960745 -0.241428 48 3 2 144 \n",
- "3 NaN 1.960745 -0.236176 48 3 2 144 \n",
- "4 NaN 1.960745 -0.230986 48 3 2 144 \n",
+ " travel_time2 date_trend minute_trend length width links_num area \\\n",
+ "0 NaN 1.960745 -0.252121 48 3 2 144 \n",
+ "1 NaN 1.960745 -0.246743 48 3 2 144 \n",
+ "2 NaN 1.960745 -0.241428 48 3 2 144 \n",
+ "3 NaN 1.960745 -0.236176 48 3 2 144 \n",
+ "4 NaN 1.960745 -0.230986 48 3 2 144 \n",
"\n",
- " vacation minute hour day week_day mouth link_ID_en \n",
+ " vacation minute hour day week_day month link_ID_en \n",
"0 0.0 0 6 1 3 3 0.000138 \n",
"1 0.0 2 6 1 3 3 0.000138 \n",
"2 0.0 4 6 1 3 3 0.000138 \n",
@@ -2696,7 +2696,7 @@
"4 0.0 8 6 1 3 3 0.000138 "
]
},
- "execution_count": 43,
+ "execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
@@ -2708,7 +2708,7 @@
},
{
"cell_type": "code",
- "execution_count": 44,
+ "execution_count": 35,
"metadata": {},
"outputs": [
{
@@ -2741,14 +2741,14 @@
" minute_trend | \n",
" length | \n",
" width | \n",
- " link_num | \n",
+ " links_num | \n",
" area | \n",
" vacation | \n",
" minute | \n",
" hour | \n",
" day | \n",
" week_day | \n",
- " mouth | \n",
+ " month | \n",
" link_ID_en | \n",
" \n",
" \n",
@@ -2870,14 +2870,14 @@
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
- " travel_time2 date_trend minute_trend length width link_num area \\\n",
- "0 NaN 1.960745 -0.252121 48 3 2 144 \n",
- "1 NaN 1.960745 -0.246743 48 3 2 144 \n",
- "2 NaN 1.960745 -0.241428 48 3 2 144 \n",
- "3 NaN 1.960745 -0.236176 48 3 2 144 \n",
- "4 NaN 1.960745 -0.230986 48 3 2 144 \n",
+ " travel_time2 date_trend minute_trend length width links_num area \\\n",
+ "0 NaN 1.960745 -0.252121 48 3 2 144 \n",
+ "1 NaN 1.960745 -0.246743 48 3 2 144 \n",
+ "2 NaN 1.960745 -0.241428 48 3 2 144 \n",
+ "3 NaN 1.960745 -0.236176 48 3 2 144 \n",
+ "4 NaN 1.960745 -0.230986 48 3 2 144 \n",
"\n",
- " vacation minute hour day week_day mouth link_ID_en \n",
+ " vacation minute hour day week_day month link_ID_en \n",
"0 0.0 0 6 1 3 3 75 \n",
"1 0.0 2 6 1 3 3 75 \n",
"2 0.0 4 6 1 3 3 75 \n",
@@ -2885,7 +2885,7 @@
"4 0.0 8 6 1 3 3 75 "
]
},
- "execution_count": 44,
+ "execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
@@ -2897,12 +2897,551 @@
"df.head()"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "标准化:按 link_ID 分组,将 travel_time 除以该路段 travel_time 的标准差(只做缩放,不减去均值)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def std(group):\n",
+ "    '''Return a copy of `group` with a constant `travel_time_std` column holding np.std (ddof=0) of its `travel_time`; the input group is not mutated.'''\n",
+ "    return group.assign(travel_time_std=np.std(group['travel_time']))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link_ID | \n",
+ " time_interval_begin | \n",
+ " date | \n",
+ " travel_time | \n",
+ " travel_time2 | \n",
+ " date_trend | \n",
+ " minute_trend | \n",
+ " length | \n",
+ " width | \n",
+ " links_num | \n",
+ " area | \n",
+ " vacation | \n",
+ " minute | \n",
+ " hour | \n",
+ " day | \n",
+ " week_day | \n",
+ " month | \n",
+ " link_ID_en | \n",
+ " travel_time_std | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:00:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ " -0.252121 | \n",
+ " 48 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 144 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 75 | \n",
+ " 0.223232 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:02:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ " -0.246743 | \n",
+ " 48 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 144 | \n",
+ " 0.0 | \n",
+ " 2 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 75 | \n",
+ " 0.223232 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:04:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ " -0.241428 | \n",
+ " 48 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 144 | \n",
+ " 0.0 | \n",
+ " 4 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 75 | \n",
+ " 0.223232 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:06:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ " -0.236176 | \n",
+ " 48 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 144 | \n",
+ " 0.0 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 75 | \n",
+ " 0.223232 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:08:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ " -0.230986 | \n",
+ " 48 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 144 | \n",
+ " 0.0 | \n",
+ " 8 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 75 | \n",
+ " 0.223232 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID time_interval_begin date travel_time \\\n",
+ "0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n",
+ "1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n",
+ "2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n",
+ "3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
+ "4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
+ "\n",
+ " travel_time2 date_trend minute_trend length width links_num area \\\n",
+ "0 NaN 1.960745 -0.252121 48 3 2 144 \n",
+ "1 NaN 1.960745 -0.246743 48 3 2 144 \n",
+ "2 NaN 1.960745 -0.241428 48 3 2 144 \n",
+ "3 NaN 1.960745 -0.236176 48 3 2 144 \n",
+ "4 NaN 1.960745 -0.230986 48 3 2 144 \n",
+ "\n",
+ " vacation minute hour day week_day month link_ID_en travel_time_std \n",
+ "0 0.0 0 6 1 3 3 75 0.223232 \n",
+ "1 0.0 2 6 1 3 3 75 0.223232 \n",
+ "2 0.0 4 6 1 3 3 75 0.223232 \n",
+ "3 0.0 6 6 1 3 3 75 0.223232 \n",
+ "4 0.0 8 6 1 3 3 75 0.223232 "
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['travel_time_std'] = df.groupby('link_ID')['travel_time'].transform('std', ddof=0)  # per-link population std, broadcast to every row; keeps the original index on any pandas version\n",
+ "df['travel_time'] = df['travel_time'] / df['travel_time_std']  # scale only (no centring); NOTE: overwrites travel_time, so re-running this cell divides again\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "缺失时间预测"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = dict(  # hyper-parameters, presumably for the XGBoost regressor that imputes missing travel times\n",
+ "    learning_rate=0.2,\n",
+ "    n_estimators=30,\n",
+ "    subsample=0.8,\n",
+ "    colsample_bytree=0.6,\n",
+ "    max_depth=10,\n",
+ "    min_child_weight=1,\n",
+ "    reg_alpha=0,\n",
+ "    gamma=0,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link_ID | \n",
+ " time_interval_begin | \n",
+ " date | \n",
+ " travel_time | \n",
+ " travel_time2 | \n",
+ " date_trend | \n",
+ " minute_trend | \n",
+ " length | \n",
+ " area | \n",
+ " vacation | \n",
+ " ... | \n",
+ " day_27 | \n",
+ " day_28 | \n",
+ " day_29 | \n",
+ " day_30 | \n",
+ " day_31 | \n",
+ " month_3 | \n",
+ " month_4 | \n",
+ " month_5 | \n",
+ " month_6 | \n",
+ " month_7 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:00:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ " -0.252121 | \n",
+ " 48 | \n",
+ " 144 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:02:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ " -0.246743 | \n",
+ " 48 | \n",
+ " 144 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:04:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ " -0.241428 | \n",
+ " 48 | \n",
+ " 144 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:06:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ " -0.236176 | \n",
+ " 48 | \n",
+ " 144 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:08:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ " -0.230986 | \n",
+ " 48 | \n",
+ " 144 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 103 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID time_interval_begin date travel_time \\\n",
+ "0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n",
+ "1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n",
+ "2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n",
+ "3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
+ "4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
+ "\n",
+ " travel_time2 date_trend minute_trend length area vacation ... \\\n",
+ "0 NaN 1.960745 -0.252121 48 144 0.0 ... \n",
+ "1 NaN 1.960745 -0.246743 48 144 0.0 ... \n",
+ "2 NaN 1.960745 -0.241428 48 144 0.0 ... \n",
+ "3 NaN 1.960745 -0.236176 48 144 0.0 ... \n",
+ "4 NaN 1.960745 -0.230986 48 144 0.0 ... \n",
+ "\n",
+ " day_27 day_28 day_29 day_30 day_31 month_3 month_4 month_5 month_6 \\\n",
+ "0 0 0 0 0 0 1 0 0 0 \n",
+ "1 0 0 0 0 0 1 0 0 0 \n",
+ "2 0 0 0 0 0 1 0 0 0 \n",
+ "3 0 0 0 0 0 1 0 0 0 \n",
+ "4 0 0 0 0 0 1 0 0 0 \n",
+ "\n",
+ " month_7 \n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ "[5 rows x 103 columns]"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.get_dummies(df, columns=['links_num', 'width', 'minute', 'hour', 'week_day', 'day', 'month'],\n",
+ "                    dtype='uint8')  # pin 0/1 integers: pandas >= 2.0 defaults to bool, which would make the mixed-type .values matrix object-dtype\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "训练集 train_df 为 travel_time 非空的数据,待预测集 test_df 为 travel_time 为空的数据"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['length', 'area', 'vacation', 'link_ID_en', 'links_num_2', 'links_num_3', 'links_num_4', 'links_num_5', 'width_3', 'width_6', 'width_9', 'width_12', 'width_15', 'minute_0', 'minute_2', 'minute_4', 'minute_6', 'minute_8', 'minute_10', 'minute_12', 'minute_14', 'minute_16', 'minute_18', 'minute_20', 'minute_22', 'minute_24', 'minute_26', 'minute_28', 'minute_30', 'minute_32', 'minute_34', 'minute_36', 'minute_38', 'minute_40', 'minute_42', 'minute_44', 'minute_46', 'minute_48', 'minute_50', 'minute_52', 'minute_54', 'minute_56', 'minute_58', 'hour_6', 'hour_7', 'hour_8', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'week_day_1', 'week_day_2', 'week_day_3', 'week_day_4', 'week_day_5', 'week_day_6', 'week_day_7', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'day_7', 'day_8', 'day_9', 'day_10', 'day_11', 'day_12', 'day_13', 'day_14', 'day_15', 'day_16', 'day_17', 'day_18', 'day_19', 'day_20', 'day_21', 'day_22', 'day_23', 'day_24', 'day_25', 'day_26', 'day_27', 'day_28', 'day_29', 'day_30', 'day_31', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Identifier, timestamp, target and trend/scale helper columns are excluded from the model inputs.\n",
+ "non_feature_cols = ['link_ID', 'time_interval_begin', 'travel_time', 'date',\n",
+ "                    'travel_time2', 'minute_trend', 'travel_time_std', 'date_trend']\n",
+ "feature = df.columns.values.tolist()\n",
+ "train_feature = [x for x in feature if x not in non_feature_cols]\n",
+ "\n",
+ "missing = df['travel_time'].isnull()  # single mask (~ is boolean NOT), so train/test are exact complements\n",
+ "train_df, test_df = df.loc[~missing], df.loc[missing].copy()  # train: observed travel_time; test: rows to impute\n",
+ "print(train_feature)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(3165426, 103)\n",
+ "(1883574, 103)\n"
+ ]
+ }
+ ],
+ "source": [
+ "for subset in (train_df, test_df):  # rows with a known travel_time first, then the rows to impute\n",
+ "    print(subset.shape)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "训练数据切分"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "# Hold out 10% of the rows with a known travel_time (fixed seed) as a validation set.\n",
+ "X, y = train_df[train_feature].values, train_df['travel_time'].values\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)\n",
+ "\n",
+ "eval_set = [(X_test, y_test)]  # (features, target) pair list; presumably handed to the regressor's fit() for monitoring"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "训练回归模型来预测缺失值"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "regressor = xgb.XGBRegressor(**params)  # use the hyper-parameters from the params cell instead of library defaults"
+ ]
}
],
"metadata": {