|
|
|
@ -0,0 +1,560 @@
|
|
|
|
|
{
|
|
|
|
|
"cells": [
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 1,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"import pandas as pd\n",
|
|
|
|
|
"import numpy as np\n",
|
|
|
|
|
"from scipy.interpolate import UnivariateSpline\n",
|
|
|
|
|
"from sklearn import linear_model\n",
|
|
|
|
|
"import xgboost as xgb\n",
|
|
|
|
|
"from sklearn.utils import *"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 2,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Raise pandas' display limits for rows, columns and line width.\n",
"for option_name, option_value in (('display.max_rows', 150),\n",
"                                  ('display.max_columns', 500),\n",
"                                  ('display.width', 1000)):\n",
"    pd.set_option(option_name, option_value)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 4,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>link_ID</th>\n",
|
|
|
|
|
" <th>date</th>\n",
|
|
|
|
|
" <th>time_interval_begin</th>\n",
|
|
|
|
|
" <th>travel_time</th>\n",
|
|
|
|
|
" <th>imputationl</th>\n",
|
|
|
|
|
" <th>lagging1</th>\n",
|
|
|
|
|
" <th>lagging2</th>\n",
|
|
|
|
|
" <th>lagging3</th>\n",
|
|
|
|
|
" <th>lagging4</th>\n",
|
|
|
|
|
" <th>lagging5</th>\n",
|
|
|
|
|
" <th>length</th>\n",
|
|
|
|
|
" <th>area</th>\n",
|
|
|
|
|
" <th>vacation</th>\n",
|
|
|
|
|
" <th>minute_series</th>\n",
|
|
|
|
|
" <th>day_of_week</th>\n",
|
|
|
|
|
" <th>day_of_week_en</th>\n",
|
|
|
|
|
" <th>hour_en</th>\n",
|
|
|
|
|
" <th>week_hour_1.0,1.0</th>\n",
|
|
|
|
|
" <th>week_hour_1.0,2.0</th>\n",
|
|
|
|
|
" <th>week_hour_1.0,3.0</th>\n",
|
|
|
|
|
" <th>week_hour_2.0,1.0</th>\n",
|
|
|
|
|
" <th>week_hour_2.0,2.0</th>\n",
|
|
|
|
|
" <th>week_hour_2.0,3.0</th>\n",
|
|
|
|
|
" <th>week_hour_3.0,1.0</th>\n",
|
|
|
|
|
" <th>week_hour_3.0,2.0</th>\n",
|
|
|
|
|
" <th>week_hour_3.0,3.0</th>\n",
|
|
|
|
|
" <th>links_num_2</th>\n",
|
|
|
|
|
" <th>links_num_3</th>\n",
|
|
|
|
|
" <th>links_num_4</th>\n",
|
|
|
|
|
" <th>links_num_5</th>\n",
|
|
|
|
|
" <th>width_3</th>\n",
|
|
|
|
|
" <th>width_6</th>\n",
|
|
|
|
|
" <th>width_9</th>\n",
|
|
|
|
|
" <th>width_12</th>\n",
|
|
|
|
|
" <th>width_15</th>\n",
|
|
|
|
|
" <th>link_ID_en</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <td>3377906280028510514</td>\n",
|
|
|
|
|
" <td>2017-03-01</td>\n",
|
|
|
|
|
" <td>2017-03-01 06:00:00</td>\n",
|
|
|
|
|
" <td>1.659311</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>48</td>\n",
|
|
|
|
|
" <td>144</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>3</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>47</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <td>3377906280028510514</td>\n",
|
|
|
|
|
" <td>2017-03-01</td>\n",
|
|
|
|
|
" <td>2017-03-01 06:02:00</td>\n",
|
|
|
|
|
" <td>1.664941</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>1.659311</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>48</td>\n",
|
|
|
|
|
" <td>144</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>2.0</td>\n",
|
|
|
|
|
" <td>3</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>47</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>3377906280028510514</td>\n",
|
|
|
|
|
" <td>2017-03-01</td>\n",
|
|
|
|
|
" <td>2017-03-01 06:04:00</td>\n",
|
|
|
|
|
" <td>1.671675</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>1.664941</td>\n",
|
|
|
|
|
" <td>1.659311</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>48</td>\n",
|
|
|
|
|
" <td>144</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>3</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>47</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>3377906280028510514</td>\n",
|
|
|
|
|
" <td>2017-03-01</td>\n",
|
|
|
|
|
" <td>2017-03-01 06:06:00</td>\n",
|
|
|
|
|
" <td>1.676886</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>1.671675</td>\n",
|
|
|
|
|
" <td>1.664941</td>\n",
|
|
|
|
|
" <td>1.659311</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>48</td>\n",
|
|
|
|
|
" <td>144</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>6.0</td>\n",
|
|
|
|
|
" <td>3</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>47</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
" <td>3377906280028510514</td>\n",
|
|
|
|
|
" <td>2017-03-01</td>\n",
|
|
|
|
|
" <td>2017-03-01 06:08:00</td>\n",
|
|
|
|
|
" <td>1.682314</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>1.676886</td>\n",
|
|
|
|
|
" <td>1.671675</td>\n",
|
|
|
|
|
" <td>1.664941</td>\n",
|
|
|
|
|
" <td>1.659311</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>48</td>\n",
|
|
|
|
|
" <td>144</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>8.0</td>\n",
|
|
|
|
|
" <td>3</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>47</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" link_ID date time_interval_begin travel_time imputationl lagging1 lagging2 lagging3 lagging4 lagging5 length area vacation minute_series day_of_week day_of_week_en hour_en week_hour_1.0,1.0 week_hour_1.0,2.0 week_hour_1.0,3.0 week_hour_2.0,1.0 week_hour_2.0,2.0 week_hour_2.0,3.0 week_hour_3.0,1.0 week_hour_3.0,2.0 week_hour_3.0,3.0 links_num_2 links_num_3 links_num_4 links_num_5 width_3 width_6 width_9 width_12 width_15 link_ID_en\n",
|
|
|
|
|
"0 3377906280028510514 2017-03-01 2017-03-01 06:00:00 1.659311 True NaN NaN NaN NaN NaN 48 144 0.0 0.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47\n",
|
|
|
|
|
"1 3377906280028510514 2017-03-01 2017-03-01 06:02:00 1.664941 True 1.659311 NaN NaN NaN NaN 48 144 0.0 2.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47\n",
|
|
|
|
|
"2 3377906280028510514 2017-03-01 2017-03-01 06:04:00 1.671675 True 1.664941 1.659311 NaN NaN NaN 48 144 0.0 4.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47\n",
|
|
|
|
|
"3 3377906280028510514 2017-03-01 2017-03-01 06:06:00 1.676886 True 1.671675 1.664941 1.659311 NaN NaN 48 144 0.0 6.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47\n",
|
|
|
|
|
"4 3377906280028510514 2017-03-01 2017-03-01 06:08:00 1.682314 True 1.676886 1.671675 1.664941 1.659311 NaN 48 144 0.0 8.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 4,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Load the preprocessed feature table.\n",
"df = pd.read_csv(\n",
"    'com_trainning.txt',\n",
"    sep=';',\n",
"    parse_dates=['time_interval_begin'],\n",
"    dtype={'link_ID': object},\n",
")\n",
"df.head()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 5,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"['lagging5', 'lagging4', 'lagging3', 'lagging2', 'lagging1']"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 5,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Time-series features: names of the lagging columns, largest lag first.\n",
"lagging = 5\n",
"lagging_feature = ['lagging{:01d}'.format(step)\n",
"                   for step in reversed(range(1, lagging + 1))]\n",
"lagging_feature"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 6,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Columns that are left out of base_feature.\n",
"non_feature_columns = [\n",
"    'time_interval_begin', 'link_ID', 'link_ID_int', 'date', 'travel_time',\n",
"    'imputationl', 'minute_series', 'area', 'hour_en', 'day_of_week',\n",
"]\n",
"base_feature = [column for column in df.columns.values.tolist()\n",
"                if column not in non_feature_columns]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 7,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Drop the lagging columns from the base list, keeping the remaining order.\n",
"base_feature = [name for name in base_feature if name not in lagging_feature]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 8,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"['length', 'vacation', 'day_of_week_en', 'week_hour_1.0,1.0', 'week_hour_1.0,2.0', 'week_hour_1.0,3.0', 'week_hour_2.0,1.0', 'week_hour_2.0,2.0', 'week_hour_2.0,3.0', 'week_hour_3.0,1.0', 'week_hour_3.0,2.0', 'week_hour_3.0,3.0', 'links_num_2', 'links_num_3', 'links_num_4', 'links_num_5', 'width_3', 'width_6', 'width_9', 'width_12', 'width_15', 'link_ID_en', 'lagging5', 'lagging4', 'lagging3', 'lagging2', 'lagging1']\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Training columns: base features followed by the lagging columns.\n",
"train_feature = base_feature + lagging_feature\n",
"# Validation columns: base features plus minute_series and travel_time.\n",
"valid_feature = base_feature + ['minute_series', 'travel_time']\n",
"print(train_feature)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"xgboost训练参数:"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 9,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# XGBoost search space: every key maps to a list of candidate values.\n",
"params_grid = dict(\n",
"    learning_rate=[0.05],\n",
"    n_estimators=[100],\n",
"    subsample=[0.6],\n",
"    colsample_bytree=[0.6],\n",
"    max_depth=[7],\n",
"    min_child_weight=[1],\n",
"    reg_alpha=[2],\n",
"    gamma=[0],\n",
")"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 11,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"from sklearn.model_selection import ParameterGrid\n",
|
|
|
|
|
"# Expand the search space into an iterable of parameter combinations.\n",
"grid = ParameterGrid(param_grid=params_grid)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"训练模块"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 12,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"from sklearn.model_selection import cross_validate\n",
|
|
|
|
|
"def fit_evaluate(df, df_test, params):\n",
"    \"\"\"Fit an XGBoost regressor on ``df`` and evaluate it on ``df_test``.\n",
"\n",
"    Relies on the notebook-level names ``train_feature``, ``valid_feature``,\n",
"    ``lagging``, ``bucket_data`` and ``cross_validate``.\n",
"\n",
"    Args:\n",
"        df: Training frame holding every column in ``train_feature`` plus\n",
"            ``travel_time``. Rows containing any NaN are dropped first.\n",
"        df_test: Hold-out frame holding every column in ``valid_feature``.\n",
"        params: Mapping with the keys ``learning_rate``, ``n_estimators``,\n",
"            ``subsample``, ``colsample_bytree``, ``max_depth``, ``gamma``,\n",
"            ``min_child_weight`` and ``reg_alpha``.\n",
"\n",
"    Returns:\n",
"        Tuple ``(regressor, validation_result, best_iteration, best_score)``\n",
"        where ``validation_result`` is whatever\n",
"        ``cross_validate(regressor, valid_data, lagging=lagging)`` returns.\n",
"    \"\"\"\n",
"    # Local import: no other cell brings train_test_split into scope, so the\n",
"    # call below raised NameError on a fresh kernel.\n",
"    from sklearn.model_selection import train_test_split\n",
"\n",
"    df = df.dropna()\n",
"    X = df[train_feature].values\n",
"    y = df['travel_time'].values\n",
"\n",
"    # Hold back 10% of the rows (fixed seed); they only serve as the\n",
"    # early-stopping evaluation set below.\n",
"    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)\n",
"\n",
"    df_test = df_test[valid_feature].values\n",
"    # NOTE(review): bucket_data is not defined in any visible cell - confirm\n",
"    # where it comes from before running this on a fresh kernel.\n",
"    valid_data = bucket_data(df_test)\n",
"\n",
"    eval_set = [(X_test, y_test)]\n",
"    regressor = xgb.XGBRegressor(learning_rate=params['learning_rate'],\n",
"                                 n_estimators=params['n_estimators'],\n",
"                                 booster='gbtree', objective='reg:linear',\n",
"                                 n_jobs=-1, subsample=params['subsample'],\n",
"                                 colsample_bytree=params['colsample_bytree'],\n",
"                                 random_state=0, max_depth=params['max_depth'],\n",
"                                 gamma=params['gamma'],\n",
"                                 min_child_weight=params['min_child_weight'],\n",
"                                 reg_alpha=params['reg_alpha'])\n",
"    regressor.fit(X_train, y_train, verbose=False, early_stopping_rounds=10, eval_set=eval_set)\n",
"    # NOTE(review): the cross_validate imported above is scikit-learn's, whose\n",
"    # signature is (estimator, X, y, ...) without a ``lagging`` keyword. This\n",
"    # call presumably targets a project-specific helper - confirm and rename.\n",
"    return regressor, cross_validate(regressor, valid_data, lagging=lagging), regressor.best_iteration, regressor.best_score"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 15,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"def train(df, params, best, vis=False):\n",
"    \"\"\"Evaluate one parameter set with five date-blocked validation folds.\n",
"\n",
"    ``df`` is cut into five consecutive windows on ``time_interval_begin``\n",
"    (upper bounds 2017-03-24, 04-18, 05-12, 06-06 and 06-30, each inclusive).\n",
"    Every window is held out once, latest window first, while the other four\n",
"    are concatenated in chronological order and handed to ``fit_evaluate``.\n",
"\n",
"    Args:\n",
"        df: Feature frame with a ``time_interval_begin`` column that can be\n",
"            compared against timestamps.\n",
"        params: Parameter dict forwarded to ``fit_evaluate``. It is modified\n",
"            in place: ``loss_std``, ``loss``, ``mean_loss`` and ``best_score``\n",
"            are added, and ``n_estimators`` is overwritten with the\n",
"            stringified list of per-fold best iterations.\n",
"        best: Lowest mean loss seen so far.\n",
"        vis: Not used by this function.\n",
"\n",
"    Returns:\n",
"        The mean fold loss when it is ``<= best``, otherwise ``best``.\n",
"    \"\"\"\n",
"    window_ends = ['2017-03-24', '2017-04-18', '2017-05-12', '2017-06-06', '2017-06-30']\n",
"    stamps = df['time_interval_begin']\n",
"\n",
"    # Slice the frame into consecutive (previous_end, end] windows; the first\n",
"    # window has no lower bound.\n",
"    windows = []\n",
"    previous_end = None\n",
"    for end_text in window_ends:\n",
"        end = pd.to_datetime(end_text)\n",
"        if previous_end is None:\n",
"            selector = stamps <= end\n",
"        else:\n",
"            selector = (stamps > previous_end) & (stamps <= end)\n",
"        windows.append(df.loc[selector])\n",
"        previous_end = end\n",
"\n",
"    # Hold out the latest window first, then walk backwards in time.\n",
"    loss, iterations, scores = [], [], []\n",
"    for held_out in reversed(range(len(windows))):\n",
"        fit_frame = pd.concat([window for position, window in enumerate(windows)\n",
"                               if position != held_out])\n",
"        _, fold_loss, fold_iteration, fold_score = fit_evaluate(fit_frame, windows[held_out], params)\n",
"        print(fold_iteration, fold_score, fold_loss)\n",
"        loss.append(fold_loss)\n",
"        iterations.append(fold_iteration)\n",
"        scores.append(fold_score)\n",
"\n",
"    # Record the fold statistics on the caller's dict (key order matters for\n",
"    # the printed representation).\n",
"    params['loss_std'] = np.std(loss)\n",
"    params['loss'] = str(loss)\n",
"    mean_loss = np.mean(loss)\n",
"    params['mean_loss'] = mean_loss\n",
"    params['n_estimators'] = str(iterations)\n",
"    params['best_score'] = str(scores)\n",
"\n",
"    print(str(params))\n",
"    if mean_loss <= best:\n",
"        best = mean_loss\n",
"        print('best with:' + str(params))\n",
"    return best"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": []
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": []
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
"display_name": "Python 3",
|
|
|
|
|
"language": "python",
|
|
|
|
|
"name": "python3"
|
|
|
|
|
},
|
|
|
|
|
"language_info": {
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
"name": "ipython",
|
|
|
|
|
"version": 3
|
|
|
|
|
},
|
|
|
|
|
"file_extension": ".py",
|
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
|
"name": "python",
|
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
|
"version": "3.7.3"
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
"nbformat_minor": 2
|
|
|
|
|
}
|