Update 道路通行时间预测.ipynb

pull/2/head
benjas 5 years ago
parent 9165da8698
commit ec67cad8a6

@ -9,7 +9,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@ -1741,7 +1741,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
@ -1756,7 +1756,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 26,
"metadata": {},
"outputs": [
{
@ -1860,7 +1860,7 @@
"4 NaN 1.960745 06-08 "
]
},
"execution_count": 29,
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
@ -1872,7 +1872,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 27,
"metadata": {},
"outputs": [
{
@ -2006,7 +2006,7 @@
" 4 -0.230986 "
]
},
"execution_count": 33,
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
@ -2025,7 +2025,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
@ -2037,7 +2037,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 29,
"metadata": {},
"outputs": [
{
@ -2141,7 +2141,7 @@
"4 NaN 1.960745 -0.230986 "
]
},
"execution_count": 35,
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
@ -2152,7 +2152,7 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
@ -2171,7 +2171,7 @@
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 31,
"metadata": {},
"outputs": [
{
@ -2204,7 +2204,7 @@
" <th>minute_trend</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>link_num</th>\n",
" <th>links_num</th>\n",
" <th>area</th>\n",
" </tr>\n",
" </thead>\n",
@ -2291,30 +2291,30 @@
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
" travel_time2 date_trend minute_trend length width link_num area \n",
"0 NaN 1.960745 -0.252121 48 3 2 144 \n",
"1 NaN 1.960745 -0.246743 48 3 2 144 \n",
"2 NaN 1.960745 -0.241428 48 3 2 144 \n",
"3 NaN 1.960745 -0.236176 48 3 2 144 \n",
"4 NaN 1.960745 -0.230986 48 3 2 144 "
" travel_time2 date_trend minute_trend length width links_num area \n",
"0 NaN 1.960745 -0.252121 48 3 2 144 \n",
"1 NaN 1.960745 -0.246743 48 3 2 144 \n",
"2 NaN 1.960745 -0.241428 48 3 2 144 \n",
"3 NaN 1.960745 -0.236176 48 3 2 144 \n",
"4 NaN 1.960745 -0.230986 48 3 2 144 "
]
},
"execution_count": 40,
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"link_infos = pd.merge(link_infos, link_tops, on=['link_ID'], how='left')\n",
"link_infos['link_num'] = link_infos['in_links']+link_infos['out_links']\n",
"link_infos['links_num'] = link_infos['in_links']+link_infos['out_links']\n",
"link_infos['area'] = link_infos['length'] * link_infos['width']\n",
"df = pd.merge(df, link_infos[['link_ID','length','width', 'link_num', 'area']], on=['link_ID'], how='left')\n",
"df = pd.merge(df, link_infos[['link_ID','length','width', 'links_num', 'area']], on=['link_ID'], how='left')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 32,
"metadata": {},
"outputs": [
{
@ -2347,14 +2347,14 @@
" <th>minute_trend</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>link_num</th>\n",
" <th>links_num</th>\n",
" <th>area</th>\n",
" <th>vacation</th>\n",
" <th>minute</th>\n",
" <th>hour</th>\n",
" <th>day</th>\n",
" <th>week_day</th>\n",
" <th>mouth</th>\n",
" <th>month</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
@ -2470,14 +2470,14 @@
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
" travel_time2 date_trend minute_trend length width link_num area \\\n",
"0 NaN 1.960745 -0.252121 48 3 2 144 \n",
"1 NaN 1.960745 -0.246743 48 3 2 144 \n",
"2 NaN 1.960745 -0.241428 48 3 2 144 \n",
"3 NaN 1.960745 -0.236176 48 3 2 144 \n",
"4 NaN 1.960745 -0.230986 48 3 2 144 \n",
" travel_time2 date_trend minute_trend length width links_num area \\\n",
"0 NaN 1.960745 -0.252121 48 3 2 144 \n",
"1 NaN 1.960745 -0.246743 48 3 2 144 \n",
"2 NaN 1.960745 -0.241428 48 3 2 144 \n",
"3 NaN 1.960745 -0.236176 48 3 2 144 \n",
"4 NaN 1.960745 -0.230986 48 3 2 144 \n",
"\n",
" vacation minute hour day week_day mouth \n",
" vacation minute hour day week_day month \n",
"0 0.0 0 6 1 3 3 \n",
"1 0.0 2 6 1 3 3 \n",
"2 0.0 4 6 1 3 3 \n",
@ -2485,7 +2485,7 @@
"4 0.0 8 6 1 3 3 "
]
},
"execution_count": 41,
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
@ -2502,13 +2502,13 @@
"df['hour'] = df['time_interval_begin'].dt.hour\n",
"df['day'] = df['time_interval_begin'].dt.day\n",
"df['week_day'] = df['time_interval_begin'].map(lambda x: x.weekday() + 1)\n",
"df['mouth'] = df['time_interval_begin'].dt.month\n",
"df['month'] = df['time_interval_begin'].dt.month\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
@ -2519,7 +2519,7 @@
},
{
"cell_type": "code",
"execution_count": 43,
"execution_count": 34,
"metadata": {},
"outputs": [
{
@ -2552,14 +2552,14 @@
" <th>minute_trend</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>link_num</th>\n",
" <th>links_num</th>\n",
" <th>area</th>\n",
" <th>vacation</th>\n",
" <th>minute</th>\n",
" <th>hour</th>\n",
" <th>day</th>\n",
" <th>week_day</th>\n",
" <th>mouth</th>\n",
" <th>month</th>\n",
" <th>link_ID_en</th>\n",
" </tr>\n",
" </thead>\n",
@ -2681,14 +2681,14 @@
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
" travel_time2 date_trend minute_trend length width link_num area \\\n",
"0 NaN 1.960745 -0.252121 48 3 2 144 \n",
"1 NaN 1.960745 -0.246743 48 3 2 144 \n",
"2 NaN 1.960745 -0.241428 48 3 2 144 \n",
"3 NaN 1.960745 -0.236176 48 3 2 144 \n",
"4 NaN 1.960745 -0.230986 48 3 2 144 \n",
" travel_time2 date_trend minute_trend length width links_num area \\\n",
"0 NaN 1.960745 -0.252121 48 3 2 144 \n",
"1 NaN 1.960745 -0.246743 48 3 2 144 \n",
"2 NaN 1.960745 -0.241428 48 3 2 144 \n",
"3 NaN 1.960745 -0.236176 48 3 2 144 \n",
"4 NaN 1.960745 -0.230986 48 3 2 144 \n",
"\n",
" vacation minute hour day week_day mouth link_ID_en \n",
" vacation minute hour day week_day month link_ID_en \n",
"0 0.0 0 6 1 3 3 0.000138 \n",
"1 0.0 2 6 1 3 3 0.000138 \n",
"2 0.0 4 6 1 3 3 0.000138 \n",
@ -2696,7 +2696,7 @@
"4 0.0 8 6 1 3 3 0.000138 "
]
},
"execution_count": 43,
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
@ -2708,7 +2708,7 @@
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": 35,
"metadata": {},
"outputs": [
{
@ -2741,14 +2741,14 @@
" <th>minute_trend</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>link_num</th>\n",
" <th>links_num</th>\n",
" <th>area</th>\n",
" <th>vacation</th>\n",
" <th>minute</th>\n",
" <th>hour</th>\n",
" <th>day</th>\n",
" <th>week_day</th>\n",
" <th>mouth</th>\n",
" <th>month</th>\n",
" <th>link_ID_en</th>\n",
" </tr>\n",
" </thead>\n",
@ -2870,14 +2870,14 @@
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
" travel_time2 date_trend minute_trend length width link_num area \\\n",
"0 NaN 1.960745 -0.252121 48 3 2 144 \n",
"1 NaN 1.960745 -0.246743 48 3 2 144 \n",
"2 NaN 1.960745 -0.241428 48 3 2 144 \n",
"3 NaN 1.960745 -0.236176 48 3 2 144 \n",
"4 NaN 1.960745 -0.230986 48 3 2 144 \n",
" travel_time2 date_trend minute_trend length width links_num area \\\n",
"0 NaN 1.960745 -0.252121 48 3 2 144 \n",
"1 NaN 1.960745 -0.246743 48 3 2 144 \n",
"2 NaN 1.960745 -0.241428 48 3 2 144 \n",
"3 NaN 1.960745 -0.236176 48 3 2 144 \n",
"4 NaN 1.960745 -0.230986 48 3 2 144 \n",
"\n",
" vacation minute hour day week_day mouth link_ID_en \n",
" vacation minute hour day week_day month link_ID_en \n",
"0 0.0 0 6 1 3 3 75 \n",
"1 0.0 2 6 1 3 3 75 \n",
"2 0.0 4 6 1 3 3 75 \n",
@ -2885,7 +2885,7 @@
"4 0.0 8 6 1 3 3 75 "
]
},
"execution_count": 44,
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
@ -2897,12 +2897,551 @@
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"标准化"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"def std(group):\n",
" group['travel_time_std'] = np.std(group['travel_time'])\n",
" return group"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_trend</th>\n",
" <th>minute_trend</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>links_num</th>\n",
" <th>area</th>\n",
" <th>vacation</th>\n",
" <th>minute</th>\n",
" <th>hour</th>\n",
" <th>day</th>\n",
" <th>week_day</th>\n",
" <th>month</th>\n",
" <th>link_ID_en</th>\n",
" <th>travel_time_std</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.252121</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>75</td>\n",
" <td>0.223232</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.246743</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>75</td>\n",
" <td>0.223232</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.241428</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>75</td>\n",
" <td>0.223232</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.236176</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>75</td>\n",
" <td>0.223232</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.230986</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>75</td>\n",
" <td>0.223232</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time \\\n",
"0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n",
"1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n",
"2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n",
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
" travel_time2 date_trend minute_trend length width links_num area \\\n",
"0 NaN 1.960745 -0.252121 48 3 2 144 \n",
"1 NaN 1.960745 -0.246743 48 3 2 144 \n",
"2 NaN 1.960745 -0.241428 48 3 2 144 \n",
"3 NaN 1.960745 -0.236176 48 3 2 144 \n",
"4 NaN 1.960745 -0.230986 48 3 2 144 \n",
"\n",
" vacation minute hour day week_day month link_ID_en travel_time_std \n",
"0 0.0 0 6 1 3 3 75 0.223232 \n",
"1 0.0 2 6 1 3 3 75 0.223232 \n",
"2 0.0 4 6 1 3 3 75 0.223232 \n",
"3 0.0 6 6 1 3 3 75 0.223232 \n",
"4 0.0 8 6 1 3 3 75 0.223232 "
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df.groupby('link_ID').apply(std)\n",
"df['travel_time'] = df['travel_time'] / df['travel_time_std']\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"缺失时间预测"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"params = {\n",
" 'learning_rate':0.2,\n",
" 'n_estimators':30,\n",
" 'subsample':0.8,\n",
" 'colsample_bytree':0.6,\n",
" 'max_depth':10,\n",
" 'min_child_weight':1,\n",
" 'reg_alpha':0,\n",
" 'gamma':0\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_trend</th>\n",
" <th>minute_trend</th>\n",
" <th>length</th>\n",
" <th>area</th>\n",
" <th>vacation</th>\n",
" <th>...</th>\n",
" <th>day_27</th>\n",
" <th>day_28</th>\n",
" <th>day_29</th>\n",
" <th>day_30</th>\n",
" <th>day_31</th>\n",
" <th>month_3</th>\n",
" <th>month_4</th>\n",
" <th>month_5</th>\n",
" <th>month_6</th>\n",
" <th>month_7</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.252121</td>\n",
" <td>48</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.246743</td>\n",
" <td>48</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.241428</td>\n",
" <td>48</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.236176</td>\n",
" <td>48</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.230986</td>\n",
" <td>48</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 103 columns</p>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time \\\n",
"0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n",
"1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n",
"2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n",
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
" travel_time2 date_trend minute_trend length area vacation ... \\\n",
"0 NaN 1.960745 -0.252121 48 144 0.0 ... \n",
"1 NaN 1.960745 -0.246743 48 144 0.0 ... \n",
"2 NaN 1.960745 -0.241428 48 144 0.0 ... \n",
"3 NaN 1.960745 -0.236176 48 144 0.0 ... \n",
"4 NaN 1.960745 -0.230986 48 144 0.0 ... \n",
"\n",
" day_27 day_28 day_29 day_30 day_31 month_3 month_4 month_5 month_6 \\\n",
"0 0 0 0 0 0 1 0 0 0 \n",
"1 0 0 0 0 0 1 0 0 0 \n",
"2 0 0 0 0 0 1 0 0 0 \n",
"3 0 0 0 0 0 1 0 0 0 \n",
"4 0 0 0 0 0 1 0 0 0 \n",
"\n",
" month_7 \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
"[5 rows x 103 columns]"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.get_dummies(df, columns=['links_num','width','minute','hour',\n",
" 'week_day','day','month'])\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"训练的数据train_df为travel_time非空的数据而数据集test_df为travel_time空的数据"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['length', 'area', 'vacation', 'link_ID_en', 'links_num_2', 'links_num_3', 'links_num_4', 'links_num_5', 'width_3', 'width_6', 'width_9', 'width_12', 'width_15', 'minute_0', 'minute_2', 'minute_4', 'minute_6', 'minute_8', 'minute_10', 'minute_12', 'minute_14', 'minute_16', 'minute_18', 'minute_20', 'minute_22', 'minute_24', 'minute_26', 'minute_28', 'minute_30', 'minute_32', 'minute_34', 'minute_36', 'minute_38', 'minute_40', 'minute_42', 'minute_44', 'minute_46', 'minute_48', 'minute_50', 'minute_52', 'minute_54', 'minute_56', 'minute_58', 'hour_6', 'hour_7', 'hour_8', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'week_day_1', 'week_day_2', 'week_day_3', 'week_day_4', 'week_day_5', 'week_day_6', 'week_day_7', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'day_7', 'day_8', 'day_9', 'day_10', 'day_11', 'day_12', 'day_13', 'day_14', 'day_15', 'day_16', 'day_17', 'day_18', 'day_19', 'day_20', 'day_21', 'day_22', 'day_23', 'day_24', 'day_25', 'day_26', 'day_27', 'day_28', 'day_29', 'day_30', 'day_31', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7']\n"
]
}
],
"source": [
"feature = df.columns.values.tolist()\n",
"train_feature = [x for x in feature if \n",
" x not in ['link_ID', 'time_interval_begin', 'travel_time', 'date',\n",
" 'travel_time2', 'minute_trend', 'travel_time_std', 'date_trend']]\n",
"\n",
"train_df = df.loc[~df['travel_time'].isnull()] # 获取非空的值,~是非空意思\n",
"test_df = df.loc[df['travel_time2'].isnull()].copy()\n",
"\n",
"print(train_feature)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(3165426, 103)\n",
"(1883574, 103)\n"
]
}
],
"source": [
"print(train_df.shape)\n",
"print(test_df.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"训练数据切分"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"X = train_df[train_feature].values\n",
"y = train_df['travel_time'].values\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)\n",
"\n",
"eval_set = [(X_test, y_test)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"训练回归模型来预测缺失值"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"regressor = xgb.XGBRegressor()"
]
}
],
"metadata": {

Loading…
Cancel
Save