You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

5771 lines
284 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 任务:预测道路某段时间的通行时间\n",
"\n",
"### 数据展示"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from scipy.interpolate import UnivariateSpline\n",
"from sklearn import linear_model\n",
"import xgboost as xgb\n",
"from sklearn.utils import *"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"道路通行时间:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>time_interval</th>\n",
" <th>travel_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906283422600514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 11:04:00,2017-05-06 11:06:00)</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906289434510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 10:42:00,2017-05-06 10:44:00)</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 11:56:00,2017-05-06 11:58:00)</td>\n",
" <td>35.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 17:46:00,2017-05-06 17:48:00)</td>\n",
" <td>26.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906287934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 10:52:00,2017-05-06 10:54:00)</td>\n",
" <td>10.4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date time_interval \\\n",
"0 4377906283422600514 2017-05-06 [2017-05-06 11:04:00,2017-05-06 11:06:00) \n",
"1 3377906289434510514 2017-05-06 [2017-05-06 10:42:00,2017-05-06 10:44:00) \n",
"2 3377906285934510514 2017-05-06 [2017-05-06 11:56:00,2017-05-06 11:58:00) \n",
"3 3377906285934510514 2017-05-06 [2017-05-06 17:46:00,2017-05-06 17:48:00) \n",
"4 3377906287934510514 2017-05-06 [2017-05-06 10:52:00,2017-05-06 10:54:00) \n",
"\n",
" travel_time \n",
"0 3.0 \n",
"1 1.0 \n",
"2 35.2 \n",
"3 26.2 \n",
"4 10.4 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the raw travel-time records; link_ID is read with object dtype (kept as text).\n",
"df = pd.read_csv(\n",
"    'data/new_gy_contest_traveltime_training_data_second.txt',\n",
"    sep=';',\n",
"    dtype={'link_ID': object},\n",
")\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"time_interval时间间隔两分钟为单位\n",
"\n",
"travel_time平均通行时间"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"道路长宽情况:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>link_class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906289869500514</td>\n",
" <td>57</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4377906284594800514</td>\n",
" <td>247</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4377906289425800514</td>\n",
" <td>194</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4377906284525800514</td>\n",
" <td>839</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4377906284422600514</td>\n",
" <td>55</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID length width link_class\n",
"0 4377906289869500514 57 3 1\n",
"1 4377906284594800514 247 9 1\n",
"2 4377906289425800514 194 3 1\n",
"3 4377906284525800514 839 3 1\n",
"4 4377906284422600514 55 12 1"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the static link attributes (length, width, link_class); link_ID is kept as text.\n",
"link_df = pd.read_csv(\n",
"    'data/gy_contest_link_info.txt',\n",
"    sep=';',\n",
"    dtype={'link_ID': object},\n",
")\n",
"link_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"length长度 width宽度 link_class类别"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"道路之间连接情况:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>in_links</th>\n",
" <th>out_links</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906289869500514</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4377906284594800514</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4377906289425800514</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4377906284525800514</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4377906284422600514</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID in_links out_links\n",
"0 4377906289869500514 1 1\n",
"1 4377906284594800514 1 1\n",
"2 4377906289425800514 1 1\n",
"3 4377906284525800514 1 1\n",
"4 4377906284422600514 2 1"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the link connectivity table (in_links / out_links); this file is comma-separated.\n",
"link_tops = pd.read_csv(\n",
"    'data/gy_contest_link_top_update.txt',\n",
"    sep=',',\n",
"    dtype={'link_ID': object},\n",
")\n",
"link_tops.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 任务:预测未来一个月平均通行结果,每两分钟一次\n",
"回归任务\n",
"\n",
"构建时间序列,基于前几天或者前几十天的数据预测\n",
"https://tianchi.aliyun.com/competition/entrance/231598/information"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 数据集筛选与标签转换\n",
"数据集中有些数据可能由于异常情况导致不适合建模(堵车,维修等)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>time_interval</th>\n",
" <th>travel_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906283422600514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 11:04:00,2017-05-06 11:06:00)</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906289434510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 10:42:00,2017-05-06 10:44:00)</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 11:56:00,2017-05-06 11:58:00)</td>\n",
" <td>35.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 17:46:00,2017-05-06 17:48:00)</td>\n",
" <td>26.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906287934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 10:52:00,2017-05-06 10:54:00)</td>\n",
" <td>10.4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date time_interval \\\n",
"0 4377906283422600514 2017-05-06 [2017-05-06 11:04:00,2017-05-06 11:06:00) \n",
"1 3377906289434510514 2017-05-06 [2017-05-06 10:42:00,2017-05-06 10:44:00) \n",
"2 3377906285934510514 2017-05-06 [2017-05-06 11:56:00,2017-05-06 11:58:00) \n",
"3 3377906285934510514 2017-05-06 [2017-05-06 17:46:00,2017-05-06 17:48:00) \n",
"4 3377906287934510514 2017-05-06 [2017-05-06 10:52:00,2017-05-06 10:54:00) \n",
"\n",
" travel_time \n",
"0 3.0 \n",
"1 1.0 \n",
"2 35.2 \n",
"3 26.2 \n",
"4 10.4 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>time_interval</th>\n",
" <th>travel_time</th>\n",
" <th>time_interval_begin</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906283422600514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 11:04:00,2017-05-06 11:06:00)</td>\n",
" <td>3.0</td>\n",
" <td>2017-05-06 11:04:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906289434510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 10:42:00,2017-05-06 10:44:00)</td>\n",
" <td>1.0</td>\n",
" <td>2017-05-06 10:42:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 11:56:00,2017-05-06 11:58:00)</td>\n",
" <td>35.2</td>\n",
" <td>2017-05-06 11:56:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 17:46:00,2017-05-06 17:48:00)</td>\n",
" <td>26.2</td>\n",
" <td>2017-05-06 17:46:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906287934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 10:52:00,2017-05-06 10:54:00)</td>\n",
" <td>10.4</td>\n",
" <td>2017-05-06 10:52:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date time_interval \\\n",
"0 4377906283422600514 2017-05-06 [2017-05-06 11:04:00,2017-05-06 11:06:00) \n",
"1 3377906289434510514 2017-05-06 [2017-05-06 10:42:00,2017-05-06 10:44:00) \n",
"2 3377906285934510514 2017-05-06 [2017-05-06 11:56:00,2017-05-06 11:58:00) \n",
"3 3377906285934510514 2017-05-06 [2017-05-06 17:46:00,2017-05-06 17:48:00) \n",
"4 3377906287934510514 2017-05-06 [2017-05-06 10:52:00,2017-05-06 10:54:00) \n",
"\n",
" travel_time time_interval_begin \n",
"0 3.0 2017-05-06 11:04:00 \n",
"1 1.0 2017-05-06 10:42:00 \n",
"2 35.2 2017-05-06 11:56:00 \n",
"3 26.2 2017-05-06 17:46:00 \n",
"4 10.4 2017-05-06 10:52:00 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Extract the interval start: characters 1-19 of '[YYYY-MM-DD HH:MM:SS,...)' hold the opening timestamp.\n",
"# The vectorised .str slice replaces a per-row Python lambda and lets missing values pass through as NaT\n",
"# instead of raising inside the lambda.\n",
"df['time_interval_begin'] = pd.to_datetime(df['time_interval'].str[1:20])\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"标签转换\n",
"<img src=\"assets/20201202211044.png\" width=\"100%\">\n",
"对于 travel_time,我们希望其分布接近右图:越接近正态分布,越容易预测。"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Drop the raw interval string (its start is already stored in time_interval_begin)\n",
"# and move the target onto the log(1 + x) scale.\n",
"df = (\n",
"    df.drop(columns=['time_interval'])\n",
"      .assign(travel_time=lambda frame: np.log1p(frame['travel_time']))\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"筛选方法"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def quantile_clip(group, lower=.05, upper=.95):\n",
"    \"\"\"Limit a numeric Series to its own quantile band.\n",
"\n",
"    Used to damp outliers, e.g. travel times that suddenly jump because of an\n",
"    accident, road works or a recording error.\n",
"\n",
"    Both bounds are taken from the untouched input before any value is replaced\n",
"    (computing the upper bound after the lower clip shifts it for small groups),\n",
"    and a new Series is returned instead of writing into ``group``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    group : pandas.Series\n",
"        Numeric values of one group.\n",
"    lower, upper : float, default 0.05 and 0.95\n",
"        Quantile levels used as the lower and upper clipping bounds.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.Series\n",
"        Same index as ``group``; values below / above the band are set to the\n",
"        respective bound, missing values stay missing.\n",
"    \"\"\"\n",
"    low_bound = group.quantile(lower)\n",
"    high_bound = group.quantile(upper)\n",
"    return group.clip(lower=low_bound, upper=high_bound)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>time_interval_begin</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906283422600514</td>\n",
" <td>2017-05-06</td>\n",
" <td>1.386294</td>\n",
" <td>2017-05-06 11:04:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906289434510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>0.693147</td>\n",
" <td>2017-05-06 10:42:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>3.589059</td>\n",
" <td>2017-05-06 11:56:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>3.303217</td>\n",
" <td>2017-05-06 17:46:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906287934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>2.251292</td>\n",
" <td>2017-05-06 10:52:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date travel_time time_interval_begin\n",
"0 4377906283422600514 2017-05-06 1.386294 2017-05-06 11:04:00\n",
"1 3377906289434510514 2017-05-06 0.693147 2017-05-06 10:42:00\n",
"2 3377906285934510514 2017-05-06 3.589059 2017-05-06 11:56:00\n",
"3 3377906285934510514 2017-05-06 3.303217 2017-05-06 17:46:00\n",
"4 3377906287934510514 2017-05-06 2.251292 2017-05-06 10:52:00"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Clip separately for every (link_ID, date) pair, i.e. per road and per day.\n",
"travel_time_by_link_day = df.groupby(['link_ID', 'date'])['travel_time']\n",
"df['travel_time'] = travel_time_by_link_day.transform(quantile_clip)\n",
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the hours of interest, e.g. the morning (6-8) and afternoon (13-18) periods.\n",
"selected_hours = [6, 7, 8, 13, 14, 15, 16, 17, 18]\n",
"in_selected_hours = df['time_interval_begin'].dt.hour.isin(selected_hours)\n",
"df = df.loc[in_selected_hours]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Persist the cleaned observations.\n",
"df.to_csv(\n",
"    'data/raw_data.txt',\n",
"    sep=';',\n",
"    header=True,\n",
"    index=None,\n",
"    mode='w',\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 缺失值预处理"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>time_interval_begin</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>3.303217</td>\n",
" <td>2017-05-06 17:46:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906287934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>1.887070</td>\n",
" <td>2017-05-06 14:36:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906287674510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>1.931521</td>\n",
" <td>2017-05-06 06:30:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906287886510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>3.616309</td>\n",
" <td>2017-05-06 07:32:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4377906283759500514</td>\n",
" <td>2017-05-06</td>\n",
" <td>2.140066</td>\n",
" <td>2017-05-06 13:24:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date travel_time time_interval_begin\n",
"0 3377906285934510514 2017-05-06 3.303217 2017-05-06 17:46:00\n",
"1 3377906287934510514 2017-05-06 1.887070 2017-05-06 14:36:00\n",
"2 3377906287674510514 2017-05-06 1.931521 2017-05-06 06:30:00\n",
"3 3377906287886510514 2017-05-06 3.616309 2017-05-06 07:32:00\n",
"4 4377906283759500514 2017-05-06 2.140066 2017-05-06 13:24:00"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Reload the cleaned observations, parsing the interval start back into datetimes.\n",
"df = pd.read_csv(\n",
"    'data/raw_data.txt',\n",
"    sep=';',\n",
"    parse_dates=['time_interval_begin'],\n",
"    dtype={'link_ID': object},\n",
")\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"如上表第一行只有 2017-05-06 17:46:00 的记录,后续的 17:48、17:50 等时刻可能没有数据,因此需要把缺失的时间点补全。"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>link_class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906289869500514</td>\n",
" <td>57</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4377906284594800514</td>\n",
" <td>247</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4377906289425800514</td>\n",
" <td>194</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4377906284525800514</td>\n",
" <td>839</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4377906284422600514</td>\n",
" <td>55</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID length width link_class\n",
"0 4377906289869500514 57 3 1\n",
"1 4377906284594800514 247 9 1\n",
"2 4377906289425800514 194 3 1\n",
"3 4377906284525800514 839 3 1\n",
"4 4377906284422600514 55 12 1"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"link_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DatetimeIndex(['2017-03-01 00:00:00', '2017-03-01 00:02:00',\n",
" '2017-03-01 00:04:00', '2017-03-01 00:06:00',\n",
" '2017-03-01 00:08:00'],\n",
" dtype='datetime64[ns]', freq='2T')"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Complete 2-minute grid from 2017-03-01 00:00 through 2017-07-31 23:58.\n",
"date_range = pd.date_range(\n",
"    start='2017-03-01 00:00:00',\n",
"    end='2017-07-31 23:58:00',\n",
"    freq='2min',\n",
")\n",
"date_range[:5]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:04:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:06:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:08:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin\n",
"0 4377906289869500514 2017-03-01 00:00:00\n",
"1 4377906289869500514 2017-03-01 00:02:00\n",
"2 4377906289869500514 2017-03-01 00:04:00\n",
"3 4377906289869500514 2017-03-01 00:06:00\n",
"4 4377906289869500514 2017-03-01 00:08:00"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build one row for every (link_ID, timestamp) combination.\n",
"link_ids = link_df['link_ID'].unique()\n",
"new_index = pd.MultiIndex.from_product(\n",
"    [link_ids, date_range],\n",
"    names=['link_ID', 'time_interval_begin'],\n",
")\n",
"new_df = pd.DataFrame(index=new_index).reset_index()\n",
"# Every link now spans 2017-03-01 00:00:00 through 2017-07-31 23:58:00 in 2-minute steps.\n",
"new_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:00:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:04:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:06:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:08:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time\n",
"0 4377906289869500514 2017-03-01 00:00:00 NaN NaN\n",
"1 4377906289869500514 2017-03-01 00:02:00 NaN NaN\n",
"2 4377906289869500514 2017-03-01 00:04:00 NaN NaN\n",
"3 4377906289869500514 2017-03-01 00:06:00 NaN NaN\n",
"4 4377906289869500514 2017-03-01 00:08:00 NaN NaN"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Left-join the observed values onto the full grid; grid points without an observation become NaN.\n",
"df2 = new_df.merge(df, how='left', on=['link_ID', 'time_interval_begin'])\n",
"df2.head()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Restrict the grid to the modelled time windows.\n",
"begin = df2['time_interval_begin']\n",
"in_model_hours = begin.dt.hour.isin([6, 7, 8, 13, 14, 15, 16, 17, 18])\n",
"# July 2017 rows at hours 8, 15 and 18 are removed (presumably the slots to be predicted; confirm against the task description).\n",
"july_excluded = (begin.dt.year == 2017) & (begin.dt.month == 7) & begin.dt.hour.isin([8, 15, 18])\n",
"# 2017-03-31 is removed entirely.\n",
"march_31 = (begin.dt.year == 2017) & (begin.dt.month == 3) & (begin.dt.day == 31)\n",
"\n",
"# One combined mask selects the same rows as three successive filters; the explicit copy makes\n",
"# the column assignment below operate on an independent frame (no SettingWithCopyWarning).\n",
"df2 = df2.loc[in_model_hours & ~july_excluded & ~march_31].copy()\n",
"\n",
"# The merge left 'date' empty on grid points without an observation, so rebuild it from the timestamp.\n",
"df2['date'] = df2['time_interval_begin'].dt.strftime('%Y-%m-%d')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>180</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>181</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>182</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>183</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>184</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time\n",
"180 4377906289869500514 2017-03-01 06:00:00 2017-03-01 2.174752\n",
"181 4377906289869500514 2017-03-01 06:02:00 2017-03-01 2.174752\n",
"182 4377906289869500514 2017-03-01 06:04:00 2017-03-01 2.174752\n",
"183 4377906289869500514 2017-03-01 06:06:00 2017-03-01 2.174752\n",
"184 4377906289869500514 2017-03-01 06:08:00 2017-03-01 2.174752"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Persist the intermediate, grid-aligned table.\n",
"df2.to_csv(\n",
"    'data/pre_trainning.txt',\n",
"    sep=';',\n",
"    header=True,\n",
"    index=None,\n",
"    mode='w',\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 补全时间序列"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>180</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>181</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>182</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>183</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>184</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time \\\n",
"180 4377906289869500514 2017-03-01 06:00:00 2017-03-01 2.174752 \n",
"181 4377906289869500514 2017-03-01 06:02:00 2017-03-01 2.174752 \n",
"182 4377906289869500514 2017-03-01 06:04:00 2017-03-01 2.174752 \n",
"183 4377906289869500514 2017-03-01 06:06:00 2017-03-01 2.174752 \n",
"184 4377906289869500514 2017-03-01 06:08:00 2017-03-01 2.174752 \n",
"\n",
" travel_time2 \n",
"180 2.174752 \n",
"181 2.174752 \n",
"182 2.174752 \n",
"183 2.174752 \n",
"184 2.174752 "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df2\n",
"df['travel_time2'] = df['travel_time']\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"多个月统计-季节性变化"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"def date_trend(group, median_link_ids=('3377906282328510514', '3377906283328510514',\n",
"                                       '4377906280784800514', '9377906281555510514')):\n",
"    \"\"\"Attach a per-hour trend value (column ``date_trend``) to the rows of one link.\n",
"\n",
"    ``travel_time`` is averaged per ``date_hour``. A straight line is fitted to the\n",
"    observed hourly means against their position in sorted ``date_hour`` order and\n",
"    evaluated at every hour, including hours whose mean is missing. Links listed in\n",
"    ``median_link_ids``, and links without any observed hour (where no line can be\n",
"    fitted), receive the median ``travel_time`` of the group instead.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    group : pandas.DataFrame\n",
"        Rows of a single link; must contain ``link_ID``, ``date_hour`` and\n",
"        ``travel_time``.\n",
"    median_link_ids : collection of str, optional\n",
"        Link IDs for which the median is used rather than the regression line.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        ``group`` left-joined with ``date_trend`` on ``date_hour``.\n",
"    \"\"\"\n",
"    # Average only the target column: a frame-wide .mean() raises on the string\n",
"    # columns (link_ID, date) in pandas >= 2.0.\n",
"    hourly = group.groupby('date_hour')['travel_time'].mean().reset_index()\n",
"\n",
"    hourly_mean = hourly['travel_time'].values\n",
"    observed = ~np.isnan(hourly_mean)\n",
"\n",
"    if group['link_ID'].values[0] in median_link_ids or not observed.any():\n",
"        hourly['date_trend'] = group['travel_time'].median()\n",
"    else:\n",
"        # Row position in the hourly table is the regressor.\n",
"        positions = np.arange(len(hourly)).reshape(-1, 1)\n",
"        regr = linear_model.LinearRegression()\n",
"        regr.fit(positions[observed], hourly_mean[observed])\n",
"        hourly['date_trend'] = regr.predict(positions)\n",
"\n",
"    return pd.merge(group, hourly[['date_trend', 'date_hour']], on='date_hour', how='left')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_hour</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>180</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" <td>2017-03-01-06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>181</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" <td>2017-03-01-06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>182</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" <td>2017-03-01-06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>183</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" <td>2017-03-01-06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>184</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" <td>2017-03-01-06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time \\\n",
"180 4377906289869500514 2017-03-01 06:00:00 2017-03-01 2.174752 \n",
"181 4377906289869500514 2017-03-01 06:02:00 2017-03-01 2.174752 \n",
"182 4377906289869500514 2017-03-01 06:04:00 2017-03-01 2.174752 \n",
"183 4377906289869500514 2017-03-01 06:06:00 2017-03-01 2.174752 \n",
"184 4377906289869500514 2017-03-01 06:08:00 2017-03-01 2.174752 \n",
"\n",
" travel_time2 date_hour \n",
"180 2.174752 2017-03-01-06 \n",
"181 2.174752 2017-03-01-06 \n",
"182 2.174752 2017-03-01-06 \n",
"183 2.174752 2017-03-01-06 \n",
"184 2.174752 2017-03-01-06 "
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 基于小时进行预测,如果基于整体预测,结果可能不准确定,我们先算小时对结果的影响\n",
"df['date_hour'] = df.time_interval_begin.map(lambda x: x.strftime('%Y-%m-%d-%H'))\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"道路每小时通行时间的回归结果\n",
"<img src=\"assets/20201202221436.png\" width=\"100%\">\n",
"左图:回归预测,蓝色线是回归线,红色是时间\n",
"右图:对某几个道路,直接用中位数预测"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_hour</th>\n",
" <th>date_trend</th>\n",
" </tr>\n",
" <tr>\n",
" <th>link_ID</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">3377906280028510514</th>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2017-03-01-06</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2017-03-01-06</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2017-03-01-06</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2017-03-01-06</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2017-03-01-06</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date \\\n",
"link_ID \n",
"3377906280028510514 0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 \n",
" 1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 \n",
" 2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 \n",
" 3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 \n",
" 4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 \n",
"\n",
" travel_time travel_time2 date_hour date_trend \n",
"link_ID \n",
"3377906280028510514 0 NaN NaN 2017-03-01-06 1.960745 \n",
" 1 NaN NaN 2017-03-01-06 1.960745 \n",
" 2 NaN NaN 2017-03-01-06 1.960745 \n",
" 3 NaN NaN 2017-03-01-06 1.960745 \n",
" 4 NaN NaN 2017-03-01-06 1.960745 "
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df.groupby('link_ID').apply(date_trend)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"蓝线回归得到的值存在 df['date trend']里,此时 travel_time 就更新为 df['travel_time']= df['travel_time']-df['date_trend'],表示date_trend作为大的趋势已经被线性回归决定了,剩下的就是研究这个残差了,之后训练和预测都是基于残差,最后用预测出来的残差加上相应的date_trend即可得到需要的预测值"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_trend</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time \\\n",
"0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n",
"1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n",
"2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n",
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
" travel_time2 date_trend \n",
"0 NaN 1.960745 \n",
"1 NaN 1.960745 \n",
"2 NaN 1.960745 \n",
"3 NaN 1.960745 \n",
"4 NaN 1.960745 "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df.drop(['date_hour','link_ID'],axis=1)\n",
"df = df.reset_index()\n",
"df = df.drop('level_1',axis=1)\n",
"df['travel_time'] = df['travel_time'] - df['date_trend']\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"日变化量(分钟)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"def minute_trend(group):\n",
" tmp = group.groupby('hour_minute').mean().reset_index()\n",
" #s的值越小对数据的拟合越好但是存在过拟合风险\n",
" spl = UnivariateSpline(tmp.index, tmp['travel_time'].values, s=0.5)\n",
" tmp['minute_trend'] = spl(tmp.index)\n",
" group = pd.merge(group, tmp[['minute_trend', 'hour_minute']], on='hour_minute', how='left')\n",
" return group"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_trend</th>\n",
" <th>hour_minute</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>06-00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>06-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>06-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>06-06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>06-08</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time \\\n",
"0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n",
"1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n",
"2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n",
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
" travel_time2 date_trend hour_minute \n",
"0 NaN 1.960745 06-00 \n",
"1 NaN 1.960745 06-02 \n",
"2 NaN 1.960745 06-04 \n",
"3 NaN 1.960745 06-06 \n",
"4 NaN 1.960745 06-08 "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['hour_minute'] = df.time_interval_begin.map(lambda x: x.strftime('%H-%M'))\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_trend</th>\n",
" <th>hour_minute</th>\n",
" <th>minute_trend</th>\n",
" </tr>\n",
" <tr>\n",
" <th>link_ID</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">3377906280028510514</th>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>06-00</td>\n",
" <td>-0.252121</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>06-02</td>\n",
" <td>-0.246743</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>06-04</td>\n",
" <td>-0.241428</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>06-06</td>\n",
" <td>-0.236176</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>06-08</td>\n",
" <td>-0.230986</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date \\\n",
"link_ID \n",
"3377906280028510514 0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 \n",
" 1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 \n",
" 2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 \n",
" 3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 \n",
" 4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 \n",
"\n",
" travel_time travel_time2 date_trend hour_minute \\\n",
"link_ID \n",
"3377906280028510514 0 NaN NaN 1.960745 06-00 \n",
" 1 NaN NaN 1.960745 06-02 \n",
" 2 NaN NaN 1.960745 06-04 \n",
" 3 NaN NaN 1.960745 06-06 \n",
" 4 NaN NaN 1.960745 06-08 \n",
"\n",
" minute_trend \n",
"link_ID \n",
"3377906280028510514 0 -0.252121 \n",
" 1 -0.246743 \n",
" 2 -0.241428 \n",
" 3 -0.236176 \n",
" 4 -0.230986 "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df=df.groupby('link_ID').apply(minute_trend)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"与季节残差一样,回归得到的值存在 df['minute_trend']里,因此现在的travel_time再次更新为 df['travel_time]= df['travel_time']-df['minute_trend]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"df = df.drop(['hour_minute', 'link_ID'], axis=1)\n",
"df = df.reset_index()\n",
"df = df.drop('level_1',axis=1)\n",
"df['travel_time'] = df['travel_time'] - df['minute_trend']"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_trend</th>\n",
" <th>minute_trend</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.252121</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.246743</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.241428</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.236176</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.230986</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time \\\n",
"0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n",
"1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n",
"2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n",
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
" travel_time2 date_trend minute_trend \n",
"0 NaN 1.960745 -0.252121 \n",
"1 NaN 1.960745 -0.246743 \n",
"2 NaN 1.960745 -0.241428 \n",
"3 NaN 1.960745 -0.236176 \n",
"4 NaN 1.960745 -0.230986 "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"link_infos = pd.read_csv('data/gy_contest_link_info.txt',delimiter=';',dtype={'link_ID':object})\n",
"link_tops = pd.read_csv('data/gy_contest_link_top_update.txt',delimiter=',',dtype={'link_ID':object})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"基本上大概的走势已经被 date_trend和 hour_trend决定了,剩下就是建模得到这个travel_time如何围绕这两个trends上下变化的\n",
"\n",
"选择训练特征:"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_trend</th>\n",
" <th>minute_trend</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>links_num</th>\n",
" <th>area</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.252121</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.246743</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.241428</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.236176</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.230986</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time \\\n",
"0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n",
"1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n",
"2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n",
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
" travel_time2 date_trend minute_trend length width links_num area \n",
"0 NaN 1.960745 -0.252121 48 3 2 144 \n",
"1 NaN 1.960745 -0.246743 48 3 2 144 \n",
"2 NaN 1.960745 -0.241428 48 3 2 144 \n",
"3 NaN 1.960745 -0.236176 48 3 2 144 \n",
"4 NaN 1.960745 -0.230986 48 3 2 144 "
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"link_infos = pd.merge(link_infos, link_tops, on=['link_ID'], how='left')\n",
"link_infos['links_num'] = link_infos['in_links']+link_infos['out_links']\n",
"link_infos['area'] = link_infos['length'] * link_infos['width']\n",
"df = pd.merge(df, link_infos[['link_ID','length','width', 'links_num', 'area']], on=['link_ID'], how='left')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_trend</th>\n",
" <th>minute_trend</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>links_num</th>\n",
" <th>area</th>\n",
" <th>vacation</th>\n",
" <th>minute</th>\n",
" <th>hour</th>\n",
" <th>day</th>\n",
" <th>week_day</th>\n",
" <th>month</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.252121</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.246743</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.241428</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.236176</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.230986</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time \\\n",
"0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n",
"1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n",
"2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n",
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
" travel_time2 date_trend minute_trend length width links_num area \\\n",
"0 NaN 1.960745 -0.252121 48 3 2 144 \n",
"1 NaN 1.960745 -0.246743 48 3 2 144 \n",
"2 NaN 1.960745 -0.241428 48 3 2 144 \n",
"3 NaN 1.960745 -0.236176 48 3 2 144 \n",
"4 NaN 1.960745 -0.230986 48 3 2 144 \n",
"\n",
" vacation minute hour day week_day month \n",
"0 0.0 0 6 1 3 3 \n",
"1 0.0 2 6 1 3 3 \n",
"2 0.0 4 6 1 3 3 \n",
"3 0.0 6 6 1 3 3 \n",
"4 0.0 8 6 1 3 3 "
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#时间相关特征\n",
"df.loc[df['date'].isin(['2017-04-02','2017-04-03','2017-04-04','2017-04-29','2017-04-30',\n",
" '2017-05-01','2017-05-28','2017-05-29','2017-05-30']),'vacation']=1\n",
"\n",
"df.loc[~df['date'].isin(['2017-04-02','2017-04-03','2017-04-04','2017-04-29','2017-04-30',\n",
" '2017-05-01','2017-05-28','2017-05-29','2017-05-30']),'vacation']=0\n",
"\n",
"df['minute'] = df['time_interval_begin'].dt.minute\n",
"df['hour'] = df['time_interval_begin'].dt.hour\n",
"df['day'] = df['time_interval_begin'].dt.day\n",
"df['week_day'] = df['time_interval_begin'].map(lambda x: x.weekday() + 1)\n",
"df['month'] = df['time_interval_begin'].dt.month\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"def mean_time(group):\n",
" group['link_ID_en'] = group['travel_time'].mean()\n",
" return group"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_trend</th>\n",
" <th>minute_trend</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>links_num</th>\n",
" <th>area</th>\n",
" <th>vacation</th>\n",
" <th>minute</th>\n",
" <th>hour</th>\n",
" <th>day</th>\n",
" <th>week_day</th>\n",
" <th>month</th>\n",
" <th>link_ID_en</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.252121</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>0.000138</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.246743</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>0.000138</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.241428</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>0.000138</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.236176</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>0.000138</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.230986</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>0.000138</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time \\\n",
"0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n",
"1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n",
"2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n",
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
" travel_time2 date_trend minute_trend length width links_num area \\\n",
"0 NaN 1.960745 -0.252121 48 3 2 144 \n",
"1 NaN 1.960745 -0.246743 48 3 2 144 \n",
"2 NaN 1.960745 -0.241428 48 3 2 144 \n",
"3 NaN 1.960745 -0.236176 48 3 2 144 \n",
"4 NaN 1.960745 -0.230986 48 3 2 144 \n",
"\n",
" vacation minute hour day week_day month link_ID_en \n",
"0 0.0 0 6 1 3 3 0.000138 \n",
"1 0.0 2 6 1 3 3 0.000138 \n",
"2 0.0 4 6 1 3 3 0.000138 \n",
"3 0.0 6 6 1 3 3 0.000138 \n",
"4 0.0 8 6 1 3 3 0.000138 "
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df.groupby('link_ID').apply(mean_time)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_trend</th>\n",
" <th>minute_trend</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>links_num</th>\n",
" <th>area</th>\n",
" <th>vacation</th>\n",
" <th>minute</th>\n",
" <th>hour</th>\n",
" <th>day</th>\n",
" <th>week_day</th>\n",
" <th>month</th>\n",
" <th>link_ID_en</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.252121</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.246743</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.241428</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.236176</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.230986</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>75</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time \\\n",
"0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n",
"1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n",
"2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n",
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
" travel_time2 date_trend minute_trend length width links_num area \\\n",
"0 NaN 1.960745 -0.252121 48 3 2 144 \n",
"1 NaN 1.960745 -0.246743 48 3 2 144 \n",
"2 NaN 1.960745 -0.241428 48 3 2 144 \n",
"3 NaN 1.960745 -0.236176 48 3 2 144 \n",
"4 NaN 1.960745 -0.230986 48 3 2 144 \n",
"\n",
" vacation minute hour day week_day month link_ID_en \n",
"0 0.0 0 6 1 3 3 75 \n",
"1 0.0 2 6 1 3 3 75 \n",
"2 0.0 4 6 1 3 3 75 \n",
"3 0.0 6 6 1 3 3 75 \n",
"4 0.0 8 6 1 3 3 75 "
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Re-encode link_ID_en as its 1-based rank among the sorted unique values, so a\n",
"# larger original value always gets a larger code (original note: links with a\n",
"# longer travel time get a larger number).\n",
"# The previous np.argmin(x >= sorted_link) wrapped the maximum value to 0: the\n",
"# comparison is all-True for the largest element, so argmin returned index 0.\n",
"# searchsorted(side='right') counts the unique values <= x, which matches the old\n",
"# result for every other value and is vectorized instead of a per-row lambda.\n",
"sorted_link = np.sort(df['link_ID_en'].unique())\n",
"df['link_ID_en'] = np.searchsorted(sorted_link, df['link_ID_en'].values, side='right')\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"标准化"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"def std(group):\n",
" group['travel_time_std'] = np.std(group['travel_time'])\n",
" return group"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_trend</th>\n",
" <th>minute_trend</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>links_num</th>\n",
" <th>area</th>\n",
" <th>vacation</th>\n",
" <th>minute</th>\n",
" <th>hour</th>\n",
" <th>day</th>\n",
" <th>week_day</th>\n",
" <th>month</th>\n",
" <th>link_ID_en</th>\n",
" <th>travel_time_std</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.252121</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>75</td>\n",
" <td>0.223232</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.246743</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>75</td>\n",
" <td>0.223232</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.241428</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>75</td>\n",
" <td>0.223232</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.236176</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>75</td>\n",
" <td>0.223232</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.230986</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>75</td>\n",
" <td>0.223232</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time \\\n",
"0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n",
"1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n",
"2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n",
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
" travel_time2 date_trend minute_trend length width links_num area \\\n",
"0 NaN 1.960745 -0.252121 48 3 2 144 \n",
"1 NaN 1.960745 -0.246743 48 3 2 144 \n",
"2 NaN 1.960745 -0.241428 48 3 2 144 \n",
"3 NaN 1.960745 -0.236176 48 3 2 144 \n",
"4 NaN 1.960745 -0.230986 48 3 2 144 \n",
"\n",
" vacation minute hour day week_day month link_ID_en travel_time_std \n",
"0 0.0 0 6 1 3 3 75 0.223232 \n",
"1 0.0 2 6 1 3 3 75 0.223232 \n",
"2 0.0 4 6 1 3 3 75 0.223232 \n",
"3 0.0 6 6 1 3 3 75 0.223232 \n",
"4 0.0 8 6 1 3 3 75 0.223232 "
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# group_keys=False keeps the original row index. Without it, pandas >= 2.0 prepends\n",
"# link_ID to the index for this kind of apply, which makes link_ID both an index\n",
"# level and a column and breaks the later merge on link_ID.\n",
"df = df.groupby('link_ID', group_keys=False).apply(std)\n",
"# Scale travel_time by the per-link standard deviation added by std().\n",
"df['travel_time'] = df['travel_time'] / df['travel_time_std']\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"缺失时间预测"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"params = {\n",
" 'learning_rate':0.2,\n",
" 'n_estimators':30,\n",
" 'subsample':0.8,\n",
" 'colsample_bytree':0.6,\n",
" 'max_depth':10,\n",
" 'min_child_weight':1,\n",
" 'reg_alpha':0,\n",
" 'gamma':0\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_trend</th>\n",
" <th>minute_trend</th>\n",
" <th>length</th>\n",
" <th>area</th>\n",
" <th>vacation</th>\n",
" <th>...</th>\n",
" <th>day_27</th>\n",
" <th>day_28</th>\n",
" <th>day_29</th>\n",
" <th>day_30</th>\n",
" <th>day_31</th>\n",
" <th>month_3</th>\n",
" <th>month_4</th>\n",
" <th>month_5</th>\n",
" <th>month_6</th>\n",
" <th>month_7</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.252121</td>\n",
" <td>48</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.246743</td>\n",
" <td>48</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.241428</td>\n",
" <td>48</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.236176</td>\n",
" <td>48</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.230986</td>\n",
" <td>48</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 103 columns</p>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time \\\n",
"0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n",
"1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n",
"2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n",
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
" travel_time2 date_trend minute_trend length area vacation ... \\\n",
"0 NaN 1.960745 -0.252121 48 144 0.0 ... \n",
"1 NaN 1.960745 -0.246743 48 144 0.0 ... \n",
"2 NaN 1.960745 -0.241428 48 144 0.0 ... \n",
"3 NaN 1.960745 -0.236176 48 144 0.0 ... \n",
"4 NaN 1.960745 -0.230986 48 144 0.0 ... \n",
"\n",
" day_27 day_28 day_29 day_30 day_31 month_3 month_4 month_5 month_6 \\\n",
"0 0 0 0 0 0 1 0 0 0 \n",
"1 0 0 0 0 0 1 0 0 0 \n",
"2 0 0 0 0 0 1 0 0 0 \n",
"3 0 0 0 0 0 1 0 0 0 \n",
"4 0 0 0 0 0 1 0 0 0 \n",
"\n",
" month_7 \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
"[5 rows x 103 columns]"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.get_dummies(df, columns=['links_num','width','minute','hour',\n",
" 'week_day','day','month'])\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"训练数据 train_df 为 travel_time 非空的行;待预测数据集 test_df 为 travel_time2 为空(即缺失通行时间)的行。"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['length', 'area', 'vacation', 'link_ID_en', 'links_num_2', 'links_num_3', 'links_num_4', 'links_num_5', 'width_3', 'width_6', 'width_9', 'width_12', 'width_15', 'minute_0', 'minute_2', 'minute_4', 'minute_6', 'minute_8', 'minute_10', 'minute_12', 'minute_14', 'minute_16', 'minute_18', 'minute_20', 'minute_22', 'minute_24', 'minute_26', 'minute_28', 'minute_30', 'minute_32', 'minute_34', 'minute_36', 'minute_38', 'minute_40', 'minute_42', 'minute_44', 'minute_46', 'minute_48', 'minute_50', 'minute_52', 'minute_54', 'minute_56', 'minute_58', 'hour_6', 'hour_7', 'hour_8', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'week_day_1', 'week_day_2', 'week_day_3', 'week_day_4', 'week_day_5', 'week_day_6', 'week_day_7', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'day_7', 'day_8', 'day_9', 'day_10', 'day_11', 'day_12', 'day_13', 'day_14', 'day_15', 'day_16', 'day_17', 'day_18', 'day_19', 'day_20', 'day_21', 'day_22', 'day_23', 'day_24', 'day_25', 'day_26', 'day_27', 'day_28', 'day_29', 'day_30', 'day_31', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7']\n"
]
}
],
"source": [
"feature = df.columns.values.tolist()\n",
"train_feature = [x for x in feature if \n",
" x not in ['link_ID', 'time_interval_begin', 'travel_time', 'date',\n",
" 'travel_time2', 'minute_trend', 'travel_time_std', 'date_trend']]\n",
"\n",
"train_df = df.loc[~df['travel_time'].isnull()] # 获取非空的值,~是非空意思\n",
"test_df = df.loc[df['travel_time2'].isnull()].copy()\n",
"\n",
"print(train_feature)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(3165426, 103)\n",
"(1883574, 103)\n"
]
}
],
"source": [
"# Row/column counts of the training rows and of the rows to be predicted, one per line.\n",
"print(train_df.shape, test_df.shape, sep='\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"训练数据切分"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"X = train_df[train_feature].values\n",
"y = train_df['travel_time'].values\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)\n",
"\n",
"eval_set = [(X_test, y_test)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"训练回归模型来预测缺失值"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[09:21:48] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n",
"[0]\tvalidation_0-rmse:1.07239\n",
"Will train until validation_0-rmse hasn't improved in 10 rounds.\n",
"[1]\tvalidation_0-rmse:1.04051\n",
"[2]\tvalidation_0-rmse:1.0181\n",
"[3]\tvalidation_0-rmse:1.00329\n",
"[4]\tvalidation_0-rmse:0.994137\n",
"[5]\tvalidation_0-rmse:0.986071\n",
"[6]\tvalidation_0-rmse:0.980012\n",
"[7]\tvalidation_0-rmse:0.975637\n",
"[8]\tvalidation_0-rmse:0.972596\n",
"[9]\tvalidation_0-rmse:0.970479\n",
"[10]\tvalidation_0-rmse:0.968498\n",
"[11]\tvalidation_0-rmse:0.967078\n",
"[12]\tvalidation_0-rmse:0.966369\n",
"[13]\tvalidation_0-rmse:0.965318\n",
"[14]\tvalidation_0-rmse:0.964468\n",
"[15]\tvalidation_0-rmse:0.96396\n",
"[16]\tvalidation_0-rmse:0.962917\n",
"[17]\tvalidation_0-rmse:0.962312\n",
"[18]\tvalidation_0-rmse:0.961388\n",
"[19]\tvalidation_0-rmse:0.960701\n",
"[20]\tvalidation_0-rmse:0.960348\n",
"[21]\tvalidation_0-rmse:0.959666\n",
"[22]\tvalidation_0-rmse:0.959202\n",
"[23]\tvalidation_0-rmse:0.958588\n",
"[24]\tvalidation_0-rmse:0.958098\n",
"[25]\tvalidation_0-rmse:0.956764\n",
"[26]\tvalidation_0-rmse:0.956351\n",
"[27]\tvalidation_0-rmse:0.955532\n",
"[28]\tvalidation_0-rmse:0.955296\n",
"[29]\tvalidation_0-rmse:0.954658\n",
" length area vacation link_ID_en links_num_2 links_num_3 links_num_4 \\\n",
"0 48 144 0.0 75 1 0 0 \n",
"1 48 144 0.0 75 1 0 0 \n",
"2 48 144 0.0 75 1 0 0 \n",
"3 48 144 0.0 75 1 0 0 \n",
"4 48 144 0.0 75 1 0 0 \n",
"\n",
" links_num_5 width_3 width_6 ... day_27 day_28 day_29 day_30 day_31 \\\n",
"0 0 1 0 ... 0 0 0 0 0 \n",
"1 0 1 0 ... 0 0 0 0 0 \n",
"2 0 1 0 ... 0 0 0 0 0 \n",
"3 0 1 0 ... 0 0 0 0 0 \n",
"4 0 1 0 ... 0 0 0 0 0 \n",
"\n",
" month_3 month_4 month_5 month_6 month_7 \n",
"0 1 0 0 0 0 \n",
"1 1 0 0 0 0 \n",
"2 1 0 0 0 0 \n",
"3 1 0 0 0 0 \n",
"4 1 0 0 0 0 \n",
"\n",
"[5 rows x 95 columns]\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 1883574 entries, 0 to 5048999\n",
"Data columns (total 95 columns):\n",
"length int64\n",
"area int64\n",
"vacation float64\n",
"link_ID_en int64\n",
"links_num_2 uint8\n",
"links_num_3 uint8\n",
"links_num_4 uint8\n",
"links_num_5 uint8\n",
"width_3 uint8\n",
"width_6 uint8\n",
"width_9 uint8\n",
"width_12 uint8\n",
"width_15 uint8\n",
"minute_0 uint8\n",
"minute_2 uint8\n",
"minute_4 uint8\n",
"minute_6 uint8\n",
"minute_8 uint8\n",
"minute_10 uint8\n",
"minute_12 uint8\n",
"minute_14 uint8\n",
"minute_16 uint8\n",
"minute_18 uint8\n",
"minute_20 uint8\n",
"minute_22 uint8\n",
"minute_24 uint8\n",
"minute_26 uint8\n",
"minute_28 uint8\n",
"minute_30 uint8\n",
"minute_32 uint8\n",
"minute_34 uint8\n",
"minute_36 uint8\n",
"minute_38 uint8\n",
"minute_40 uint8\n",
"minute_42 uint8\n",
"minute_44 uint8\n",
"minute_46 uint8\n",
"minute_48 uint8\n",
"minute_50 uint8\n",
"minute_52 uint8\n",
"minute_54 uint8\n",
"minute_56 uint8\n",
"minute_58 uint8\n",
"hour_6 uint8\n",
"hour_7 uint8\n",
"hour_8 uint8\n",
"hour_13 uint8\n",
"hour_14 uint8\n",
"hour_15 uint8\n",
"hour_16 uint8\n",
"hour_17 uint8\n",
"hour_18 uint8\n",
"week_day_1 uint8\n",
"week_day_2 uint8\n",
"week_day_3 uint8\n",
"week_day_4 uint8\n",
"week_day_5 uint8\n",
"week_day_6 uint8\n",
"week_day_7 uint8\n",
"day_1 uint8\n",
"day_2 uint8\n",
"day_3 uint8\n",
"day_4 uint8\n",
"day_5 uint8\n",
"day_6 uint8\n",
"day_7 uint8\n",
"day_8 uint8\n",
"day_9 uint8\n",
"day_10 uint8\n",
"day_11 uint8\n",
"day_12 uint8\n",
"day_13 uint8\n",
"day_14 uint8\n",
"day_15 uint8\n",
"day_16 uint8\n",
"day_17 uint8\n",
"day_18 uint8\n",
"day_19 uint8\n",
"day_20 uint8\n",
"day_21 uint8\n",
"day_22 uint8\n",
"day_23 uint8\n",
"day_24 uint8\n",
"day_25 uint8\n",
"day_26 uint8\n",
"day_27 uint8\n",
"day_28 uint8\n",
"day_29 uint8\n",
"day_30 uint8\n",
"day_31 uint8\n",
"month_3 uint8\n",
"month_4 uint8\n",
"month_5 uint8\n",
"month_6 uint8\n",
"month_7 uint8\n",
"dtypes: float64(1), int64(3), uint8(91)\n",
"memory usage: 235.3 MB\n",
"None\n"
]
}
],
"source": [
"# Gradient-boosted regressor used to predict the missing travel_time values.\n",
"# objective: 'reg:linear' is deprecated in favor of 'reg:squarederror' (same loss).\n",
"regressor = xgb.XGBRegressor(learning_rate=params['learning_rate'],\n",
"                             n_estimators=params['n_estimators'],\n",
"                             booster='gbtree', objective='reg:squarederror',\n",
"                             n_jobs=-1,subsample=params['subsample'],\n",
"                             colsample_bytree=params['colsample_bytree'],\n",
"                             random_state=0,max_depth=params['max_depth'],\n",
"                             gamma=params['gamma'],\n",
"                             min_child_weight=params['min_child_weight'],\n",
"                             reg_alpha=params['reg_alpha'])\n",
"# Stop early when the validation RMSE has not improved for 10 rounds.\n",
"regressor.fit(X_train,y_train,verbose=True,early_stopping_rounds=10,eval_set=eval_set)\n",
"print(test_df[train_feature].head())\n",
"# DataFrame.info() prints its report itself and returns None, so it is not wrapped\n",
"# in print() (which only added a stray 'None' line to the output).\n",
"test_df[train_feature].info()\n",
"test_df['prediction'] = regressor.predict(test_df[train_feature].values)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"# Left-join the predictions onto df by (link_ID, time_interval_begin); rows that\n",
"# have no match in test_df get NaN in 'prediction'.\n",
"df = df.merge(test_df[['link_ID','time_interval_begin','prediction']],\n",
"              how='left', on=['link_ID','time_interval_begin'])"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd # 数据处理包\n",
"import numpy as np # 数据处理包\n",
"import matplotlib.pyplot as plt # 图形处理包\n",
"\n",
"\n",
"def importance_plt(feats_list, feats_importance, png_savename=0):\n",
"    \"\"\"Draw a horizontal bar chart of feature importances, sorted ascending.\n",
"\n",
"    The chart shows which features matter most (candidates for derived features)\n",
"    and which are never used (candidates for removal to avoid redundancy).\n",
"\n",
"    Args:\n",
"        feats_list: Feature names, one per importance value (list or array),\n",
"            e.g. ``X.columns.values``.\n",
"        feats_importance: Importances of an already fitted model (list or array),\n",
"            e.g. ``clf.feature_importances_`` for xgboost / lightgbm.\n",
"        png_savename: Controls saving. A falsy value (default) does not save.\n",
"            ``True`` or a non-zero int saves to the legacy default file\n",
"            '特征重要性.png'. Any other truthy value (typically a path string)\n",
"            is used as the output file name.\n",
"\n",
"    Returns:\n",
"        None. The figure is displayed with ``plt.show()``.\n",
"    \"\"\"\n",
"    # Accept plain lists as well as arrays: fancy indexing below needs ndarrays.\n",
"    feats_importance = np.asarray(feats_importance)\n",
"    feats_list = np.asarray(feats_list)\n",
"    sorted_idx = np.argsort(feats_importance)\n",
"    positions = range(len(sorted_idx))\n",
"\n",
"    plt.figure(figsize=(10, 55))\n",
"    plt.barh(positions, feats_importance[sorted_idx], align='center')\n",
"    plt.yticks(positions, feats_list[sorted_idx])\n",
"    plt.xlabel(\"Importance\")\n",
"    plt.title(\"Feature importances\")\n",
"    if png_savename:\n",
"        # Honor the caller-supplied name; it used to be ignored and the file was\n",
"        # always written to the hard-coded default.\n",
"        if isinstance(png_savename, (bool, int)):\n",
"            save_path = \"特征重要性.png\"\n",
"        else:\n",
"            save_path = png_savename\n",
"        # High dpi and a tight bounding box so the very tall figure is saved in full.\n",
"        plt.savefig(save_path, dpi=500, bbox_inches='tight')\n",
"    plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 720x3960 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"importance_plt(train_feature, regressor.feature_importances_)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"从特征重要性图可以看出,最重要的特征是 vacation:是否为假期对预测结果的影响最大。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"还原预测结果"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"df['imputation1'] = df['travel_time'].isnull()\n",
"df['travel_time'] = df['travel_time'].fillna(value=df['prediction'])\n",
"df['travel_time'] = (df['travel_time']\n",
" * np.array(df['travel_time_std'])\n",
" + np.array(df['minute_trend'])\n",
" + np.array(df['date_trend']))"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_trend</th>\n",
" <th>minute_trend</th>\n",
" <th>length</th>\n",
" <th>area</th>\n",
" <th>vacation</th>\n",
" <th>...</th>\n",
" <th>day_29</th>\n",
" <th>day_30</th>\n",
" <th>day_31</th>\n",
" <th>month_3</th>\n",
" <th>month_4</th>\n",
" <th>month_5</th>\n",
" <th>month_6</th>\n",
" <th>month_7</th>\n",
" <th>prediction</th>\n",
" <th>imputation1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.252121</td>\n",
" <td>48</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>-0.220903</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>1.664941</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.246743</td>\n",
" <td>48</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>-0.219772</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>1.671675</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.241428</td>\n",
" <td>48</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>-0.213418</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>1.676886</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.236176</td>\n",
" <td>48</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>-0.213602</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>1.682314</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" <td>-0.230986</td>\n",
" <td>48</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>-0.212535</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 105 columns</p>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time \\\n",
"0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 1.659311 \n",
"1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 1.664941 \n",
"2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 1.671675 \n",
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 1.676886 \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 1.682314 \n",
"\n",
" travel_time2 date_trend minute_trend length area vacation ... \\\n",
"0 NaN 1.960745 -0.252121 48 144 0.0 ... \n",
"1 NaN 1.960745 -0.246743 48 144 0.0 ... \n",
"2 NaN 1.960745 -0.241428 48 144 0.0 ... \n",
"3 NaN 1.960745 -0.236176 48 144 0.0 ... \n",
"4 NaN 1.960745 -0.230986 48 144 0.0 ... \n",
"\n",
" day_29 day_30 day_31 month_3 month_4 month_5 month_6 month_7 \\\n",
"0 0 0 0 1 0 0 0 0 \n",
"1 0 0 0 1 0 0 0 0 \n",
"2 0 0 0 1 0 0 0 0 \n",
"3 0 0 0 1 0 0 0 0 \n",
"4 0 0 0 1 0 0 0 0 \n",
"\n",
" prediction imputation1 \n",
"0 -0.220903 True \n",
"1 -0.219772 True \n",
"2 -0.213418 True \n",
"3 -0.213602 True \n",
"4 -0.212535 True \n",
"\n",
"[5 rows x 105 columns]"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" travel_time prediction travel_time2\n",
"count 5.049000e+06 1.883574e+06 3.165426e+06\n",
"mean 2.395459e+00 7.011203e-02 2.416565e+00\n",
"std 9.192798e-01 1.871515e-01 9.521757e-01\n",
"min 3.364722e-01 -1.735234e+00 3.364722e-01\n",
"25% 1.673816e+00 -3.902411e-02 1.648659e+00\n",
"50% 2.379546e+00 7.028064e-02 2.388763e+00\n",
"75% 3.068053e+00 1.804217e-01 3.113515e+00\n",
"max 5.913699e+00 3.253515e+00 5.913699e+00\n"
]
}
],
"source": [
"print(df[['travel_time','prediction', 'travel_time2']].describe())\n",
"df[['link_ID','date','time_interval_begin','travel_time','imputation1']].to_csv('data/com_trainning.txt',\n",
" header=True,\n",
" index=None,\n",
" sep=';',mode='w')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 构建特征"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('data/com_trainning.txt',\n",
" delimiter=';',\n",
" parse_dates=['time_interval_begin'],\n",
" dtype={'link_ID':object})"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>time_interval_begin</th>\n",
" <th>travel_time</th>\n",
" <th>imputation1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>1.659311</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>1.664941</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>1.671675</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>1.676886</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>1.682314</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date time_interval_begin travel_time \\\n",
"0 3377906280028510514 2017-03-01 2017-03-01 06:00:00 1.659311 \n",
"1 3377906280028510514 2017-03-01 2017-03-01 06:02:00 1.664941 \n",
"2 3377906280028510514 2017-03-01 2017-03-01 06:04:00 1.671675 \n",
"3 3377906280028510514 2017-03-01 2017-03-01 06:06:00 1.676886 \n",
"4 3377906280028510514 2017-03-01 2017-03-01 06:08:00 1.682314 \n",
"\n",
" imputation1 \n",
"0 True \n",
"1 True \n",
"2 True \n",
"3 True \n",
"4 True "
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1 = df.copy()\n",
"df1.head() # imputation1如果是False表示真实值True则是预测"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>time_interval_begin</th>\n",
" <th>travel_time</th>\n",
" <th>imputation1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:10:00</td>\n",
" <td>1.659311</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:12:00</td>\n",
" <td>1.664941</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:14:00</td>\n",
" <td>1.671675</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:16:00</td>\n",
" <td>1.676886</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:18:00</td>\n",
" <td>1.682314</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date time_interval_begin travel_time \\\n",
"0 3377906280028510514 2017-03-01 2017-03-01 06:10:00 1.659311 \n",
"1 3377906280028510514 2017-03-01 2017-03-01 06:12:00 1.664941 \n",
"2 3377906280028510514 2017-03-01 2017-03-01 06:14:00 1.671675 \n",
"3 3377906280028510514 2017-03-01 2017-03-01 06:16:00 1.676886 \n",
"4 3377906280028510514 2017-03-01 2017-03-01 06:18:00 1.682314 \n",
"\n",
" imputation1 \n",
"0 True \n",
"1 True \n",
"2 True \n",
"3 True \n",
"4 True "
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#平移5格\n",
"df1['time_interval_begin'] = df1['time_interval_begin'] + pd.DateOffset(minutes=5*2)\n",
"df1.head()"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>time_interval_begin</th>\n",
" <th>lagging5</th>\n",
" <th>imputation1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:10:00</td>\n",
" <td>1.659311</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:12:00</td>\n",
" <td>1.664941</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:14:00</td>\n",
" <td>1.671675</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:16:00</td>\n",
" <td>1.676886</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:18:00</td>\n",
" <td>1.682314</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date time_interval_begin lagging5 imputation1\n",
"0 3377906280028510514 2017-03-01 2017-03-01 06:10:00 1.659311 True\n",
"1 3377906280028510514 2017-03-01 2017-03-01 06:12:00 1.664941 True\n",
"2 3377906280028510514 2017-03-01 2017-03-01 06:14:00 1.671675 True\n",
"3 3377906280028510514 2017-03-01 2017-03-01 06:16:00 1.676886 True\n",
"4 3377906280028510514 2017-03-01 2017-03-01 06:18:00 1.682314 True"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rename the shifted value column so it can be merged back as the 5-step lag feature.\n",
"lag_column = 'lagging' + str(5)\n",
"df1 = df1.rename(columns={'travel_time': lag_column})\n",
"df1.head()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>time_interval_begin</th>\n",
" <th>travel_time</th>\n",
" <th>imputation1</th>\n",
" <th>lagging5</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>1.659311</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>1.664941</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>1.671675</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>1.676886</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>1.682314</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:10:00</td>\n",
" <td>1.629241</td>\n",
" <td>False</td>\n",
" <td>1.659311</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:12:00</td>\n",
" <td>1.629241</td>\n",
" <td>False</td>\n",
" <td>1.664941</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:14:00</td>\n",
" <td>1.629241</td>\n",
" <td>False</td>\n",
" <td>1.671675</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date time_interval_begin travel_time \\\n",
"0 3377906280028510514 2017-03-01 2017-03-01 06:00:00 1.659311 \n",
"1 3377906280028510514 2017-03-01 2017-03-01 06:02:00 1.664941 \n",
"2 3377906280028510514 2017-03-01 2017-03-01 06:04:00 1.671675 \n",
"3 3377906280028510514 2017-03-01 2017-03-01 06:06:00 1.676886 \n",
"4 3377906280028510514 2017-03-01 2017-03-01 06:08:00 1.682314 \n",
"5 3377906280028510514 2017-03-01 2017-03-01 06:10:00 1.629241 \n",
"6 3377906280028510514 2017-03-01 2017-03-01 06:12:00 1.629241 \n",
"7 3377906280028510514 2017-03-01 2017-03-01 06:14:00 1.629241 \n",
"\n",
" imputation1 lagging5 \n",
"0 True NaN \n",
"1 True NaN \n",
"2 True NaN \n",
"3 True NaN \n",
"4 True NaN \n",
"5 False 1.659311 \n",
"6 False 1.664941 \n",
"7 False 1.671675 "
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Left-join the shifted values onto the original frame on (link_ID, time_interval_begin);\n",
"# rows with no matching shifted timestamp get NaN in lagging5.\n",
"lag_values = df1[['link_ID', 'time_interval_begin', 'lagging' + str(5)]]\n",
"df2 = df.merge(lag_values, on=['link_ID', 'time_interval_begin'], how='left')\n",
"df2.head(8)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"第5行的 lagging5 与第0行的 travel_time 是同一个值(1.659311),只是在时间上向后平移了10分钟(5个2分钟时间片)。"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"def create_lagging(df, df_original, i, interval_minutes=2):\n",
"    \"\"\"Add a ``lagging<i>`` column holding the travel time observed ``i`` intervals earlier.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : DataFrame\n",
"        Frame to extend; must contain ``link_ID`` and ``time_interval_begin``.\n",
"    df_original : DataFrame\n",
"        Source of the lagged values; must contain ``link_ID``, ``time_interval_begin``\n",
"        and ``travel_time``. It is not modified.\n",
"    i : int\n",
"        Number of intervals to look back.\n",
"    interval_minutes : int, default 2\n",
"        Width of one interval in minutes.\n",
"\n",
"    Returns\n",
"    -------\n",
"    DataFrame\n",
"        ``df`` left-joined with the new column ``lagging<i>``; rows that have no\n",
"        observation ``i`` intervals earlier for the same link get NaN.\n",
"    \"\"\"\n",
"    lag_col = 'lagging' + str(i)\n",
"    # Only the join keys and the value are needed, so copy just those columns.\n",
"    shifted = df_original[['link_ID', 'time_interval_begin', 'travel_time']].copy()\n",
"    # Moving the timestamps forward by i intervals makes the value recorded at time t\n",
"    # line up with the row at t + i * interval_minutes in the merge below.\n",
"    shifted['time_interval_begin'] = (shifted['time_interval_begin']\n",
"                                      + pd.DateOffset(minutes=i * interval_minutes))\n",
"    shifted = shifted.rename(columns={'travel_time': lag_col})\n",
"    return pd.merge(df, shifted, on=['link_ID', 'time_interval_begin'], how='left')"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"df1 = create_lagging(df, df, 1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"构建时间序列特征"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"# Append lagging2 .. lagging<lagging> to df1, one left-merge per lag step.\n",
"lagging = 5\n",
"for lag_step in range(2, lagging + 1):\n",
"    df1 = create_lagging(df1, df, lag_step)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>time_interval_begin</th>\n",
" <th>travel_time</th>\n",
" <th>imputation1</th>\n",
" <th>lagging1</th>\n",
" <th>lagging2</th>\n",
" <th>lagging3</th>\n",
" <th>lagging4</th>\n",
" <th>lagging5</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>1.659311</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>1.664941</td>\n",
" <td>True</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>1.671675</td>\n",
" <td>True</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>1.676886</td>\n",
" <td>True</td>\n",
" <td>1.671675</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>1.682314</td>\n",
" <td>True</td>\n",
" <td>1.676886</td>\n",
" <td>1.671675</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:10:00</td>\n",
" <td>1.629241</td>\n",
" <td>False</td>\n",
" <td>1.682314</td>\n",
" <td>1.676886</td>\n",
" <td>1.671675</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:12:00</td>\n",
" <td>1.629241</td>\n",
" <td>False</td>\n",
" <td>1.629241</td>\n",
" <td>1.682314</td>\n",
" <td>1.676886</td>\n",
" <td>1.671675</td>\n",
" <td>1.664941</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:14:00</td>\n",
" <td>1.629241</td>\n",
" <td>False</td>\n",
" <td>1.629241</td>\n",
" <td>1.629241</td>\n",
" <td>1.682314</td>\n",
" <td>1.676886</td>\n",
" <td>1.671675</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date time_interval_begin travel_time \\\n",
"0 3377906280028510514 2017-03-01 2017-03-01 06:00:00 1.659311 \n",
"1 3377906280028510514 2017-03-01 2017-03-01 06:02:00 1.664941 \n",
"2 3377906280028510514 2017-03-01 2017-03-01 06:04:00 1.671675 \n",
"3 3377906280028510514 2017-03-01 2017-03-01 06:06:00 1.676886 \n",
"4 3377906280028510514 2017-03-01 2017-03-01 06:08:00 1.682314 \n",
"5 3377906280028510514 2017-03-01 2017-03-01 06:10:00 1.629241 \n",
"6 3377906280028510514 2017-03-01 2017-03-01 06:12:00 1.629241 \n",
"7 3377906280028510514 2017-03-01 2017-03-01 06:14:00 1.629241 \n",
"\n",
" imputation1 lagging1 lagging2 lagging3 lagging4 lagging5 \n",
"0 True NaN NaN NaN NaN NaN \n",
"1 True 1.659311 NaN NaN NaN NaN \n",
"2 True 1.664941 1.659311 NaN NaN NaN \n",
"3 True 1.671675 1.664941 1.659311 NaN NaN \n",
"4 True 1.676886 1.671675 1.664941 1.659311 NaN \n",
"5 False 1.682314 1.676886 1.671675 1.664941 1.659311 \n",
"6 False 1.629241 1.682314 1.676886 1.671675 1.664941 \n",
"7 False 1.629241 1.629241 1.682314 1.676886 1.671675 "
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1.head(8)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"# Load the two link tables (';'-separated and ','-separated); link_ID is kept as object, not parsed as a number.\n",
"link_infos = pd.read_csv('data/gy_contest_link_info.txt', sep=';', dtype={'link_ID': object})\n",
"link_tops = pd.read_csv('data/gy_contest_link_top_update.txt', sep=',', dtype={'link_ID': object})"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"# Replace missing values in the topology table with 0, then attach it to the link attributes.\n",
"link_tops = link_tops.fillna(0)\n",
"link_infos = link_infos.merge(link_tops, on='link_ID', how='left')\n",
"# links_num = in_links + out_links\n",
"link_infos['links_num'] = link_infos['in_links'].add(link_infos['out_links'])"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"# area = length * width, then attach the static link attributes to every observation row.\n",
"link_infos['area'] = link_infos['length'].mul(link_infos['width'])\n",
"static_cols = ['link_ID', 'length', 'width', 'links_num', 'area']\n",
"df2 = df1.merge(link_infos[static_cols], on='link_ID', how='left')"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>time_interval_begin</th>\n",
" <th>travel_time</th>\n",
" <th>imputation1</th>\n",
" <th>lagging1</th>\n",
" <th>lagging2</th>\n",
" <th>lagging3</th>\n",
" <th>lagging4</th>\n",
" <th>lagging5</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>links_num</th>\n",
" <th>area</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>1.659311</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>1.664941</td>\n",
" <td>True</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>1.671675</td>\n",
" <td>True</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>1.676886</td>\n",
" <td>True</td>\n",
" <td>1.671675</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>1.682314</td>\n",
" <td>True</td>\n",
" <td>1.676886</td>\n",
" <td>1.671675</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date time_interval_begin travel_time \\\n",
"0 3377906280028510514 2017-03-01 2017-03-01 06:00:00 1.659311 \n",
"1 3377906280028510514 2017-03-01 2017-03-01 06:02:00 1.664941 \n",
"2 3377906280028510514 2017-03-01 2017-03-01 06:04:00 1.671675 \n",
"3 3377906280028510514 2017-03-01 2017-03-01 06:06:00 1.676886 \n",
"4 3377906280028510514 2017-03-01 2017-03-01 06:08:00 1.682314 \n",
"\n",
" imputation1 lagging1 lagging2 lagging3 lagging4 lagging5 length \\\n",
"0 True NaN NaN NaN NaN NaN 48 \n",
"1 True 1.659311 NaN NaN NaN NaN 48 \n",
"2 True 1.664941 1.659311 NaN NaN NaN 48 \n",
"3 True 1.671675 1.664941 1.659311 NaN NaN 48 \n",
"4 True 1.676886 1.671675 1.664941 1.659311 NaN 48 \n",
"\n",
" width links_num area \n",
"0 3 2 144 \n",
"1 3 2 144 \n",
"2 3 2 144 \n",
"3 3 2 144 \n",
"4 3 2 144 "
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.head()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"# 假期特征\n",
"df2.loc[df2['date'].isin(['2017-04-02','2017-04-03','2017-04-04','2017-04-29','2017-04-30',\n",
" '2017-05-01','2017-05-28','2017-05-29','2017-05-30']),'vacation']=1\n",
"\n",
"df2.loc[~df2['date'].isin(['2017-04-02','2017-04-03','2017-04-04','2017-04-29','2017-04-30',\n",
" '2017-05-01','2017-05-28','2017-05-29','2017-05-30']),'vacation']=0"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>time_interval_begin</th>\n",
" <th>travel_time</th>\n",
" <th>imputation1</th>\n",
" <th>lagging1</th>\n",
" <th>lagging2</th>\n",
" <th>lagging3</th>\n",
" <th>lagging4</th>\n",
" <th>lagging5</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>links_num</th>\n",
" <th>area</th>\n",
" <th>vacation</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>1.659311</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>1.664941</td>\n",
" <td>True</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>1.671675</td>\n",
" <td>True</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>1.676886</td>\n",
" <td>True</td>\n",
" <td>1.671675</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>1.682314</td>\n",
" <td>True</td>\n",
" <td>1.676886</td>\n",
" <td>1.671675</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date time_interval_begin travel_time \\\n",
"0 3377906280028510514 2017-03-01 2017-03-01 06:00:00 1.659311 \n",
"1 3377906280028510514 2017-03-01 2017-03-01 06:02:00 1.664941 \n",
"2 3377906280028510514 2017-03-01 2017-03-01 06:04:00 1.671675 \n",
"3 3377906280028510514 2017-03-01 2017-03-01 06:06:00 1.676886 \n",
"4 3377906280028510514 2017-03-01 2017-03-01 06:08:00 1.682314 \n",
"\n",
" imputation1 lagging1 lagging2 lagging3 lagging4 lagging5 length \\\n",
"0 True NaN NaN NaN NaN NaN 48 \n",
"1 True 1.659311 NaN NaN NaN NaN 48 \n",
"2 True 1.664941 1.659311 NaN NaN NaN 48 \n",
"3 True 1.671675 1.664941 1.659311 NaN NaN 48 \n",
"4 True 1.676886 1.671675 1.664941 1.659311 NaN 48 \n",
"\n",
" width links_num area vacation \n",
"0 3 2 144 0.0 \n",
"1 3 2 144 0.0 \n",
"2 3 2 144 0.0 \n",
"3 3 2 144 0.0 \n",
"4 3 2 144 0.0 "
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.head()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"#起始分钟特征\n",
"df2.loc[df2['time_interval_begin'].dt.hour.isin([6,7,8]),'minute_series']=\\\n",
" df2['time_interval_begin'].dt.minute+(df2['time_interval_begin'].dt.hour-6)*60\n",
"\n",
"df2.loc[df2['time_interval_begin'].dt.hour.isin([13,14,15]),'minute_series']=\\\n",
" df2['time_interval_begin'].dt.minute+(df2['time_interval_begin'].dt.hour-13)*60\n",
"\n",
"df2.loc[df2['time_interval_begin'].dt.hour.isin([16,17,18]),'minute_series']=\\\n",
" df2['time_interval_begin'].dt.minute+(df2['time_interval_begin'].dt.hour-16)*60"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>time_interval_begin</th>\n",
" <th>travel_time</th>\n",
" <th>imputation1</th>\n",
" <th>lagging1</th>\n",
" <th>lagging2</th>\n",
" <th>lagging3</th>\n",
" <th>lagging4</th>\n",
" <th>lagging5</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>links_num</th>\n",
" <th>area</th>\n",
" <th>vacation</th>\n",
" <th>minute_series</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>1.659311</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>1.664941</td>\n",
" <td>True</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>1.671675</td>\n",
" <td>True</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>1.676886</td>\n",
" <td>True</td>\n",
" <td>1.671675</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>6.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>1.682314</td>\n",
" <td>True</td>\n",
" <td>1.676886</td>\n",
" <td>1.671675</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>8.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date time_interval_begin travel_time \\\n",
"0 3377906280028510514 2017-03-01 2017-03-01 06:00:00 1.659311 \n",
"1 3377906280028510514 2017-03-01 2017-03-01 06:02:00 1.664941 \n",
"2 3377906280028510514 2017-03-01 2017-03-01 06:04:00 1.671675 \n",
"3 3377906280028510514 2017-03-01 2017-03-01 06:06:00 1.676886 \n",
"4 3377906280028510514 2017-03-01 2017-03-01 06:08:00 1.682314 \n",
"\n",
" imputation1 lagging1 lagging2 lagging3 lagging4 lagging5 length \\\n",
"0 True NaN NaN NaN NaN NaN 48 \n",
"1 True 1.659311 NaN NaN NaN NaN 48 \n",
"2 True 1.664941 1.659311 NaN NaN NaN 48 \n",
"3 True 1.671675 1.664941 1.659311 NaN NaN 48 \n",
"4 True 1.676886 1.671675 1.664941 1.659311 NaN 48 \n",
"\n",
" width links_num area vacation minute_series \n",
"0 3 2 144 0.0 0.0 \n",
"1 3 2 144 0.0 2.0 \n",
"2 3 2 144 0.0 4.0 \n",
"3 3 2 144 0.0 6.0 \n",
"4 3 2 144 0.0 8.0 "
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.head()"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"# 星期特征\n",
"df2['day_of_week'] = df2['time_interval_begin'].map(lambda x: x.weekday()+1)\n",
"df2.loc[df2['day_of_week'].isin([1,2,3]),'day_of_week_en'] = 1\n",
"df2.loc[df2['day_of_week'].isin([4,5]),'day_of_week_en'] = 2\n",
"df2.loc[df2['day_of_week'].isin([6,7]),'day_of_week_en'] = 3"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"# 时间段特征\n",
"df2.loc[df['time_interval_begin'].dt.hour.isin([6,7,8]), 'hour_en']=1\n",
"df2.loc[df['time_interval_begin'].dt.hour.isin([13,14,15]), 'hour_en']=2\n",
"df2.loc[df['time_interval_begin'].dt.hour.isin([16,17,18]), 'hour_en']=3"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"# 星期,时间段合并特征\n",
"df2['week_hour'] = df2['day_of_week_en'].astype('str') + ','+df2['hour_en'].astype('str')"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>time_interval_begin</th>\n",
" <th>travel_time</th>\n",
" <th>imputation1</th>\n",
" <th>lagging1</th>\n",
" <th>lagging2</th>\n",
" <th>lagging3</th>\n",
" <th>lagging4</th>\n",
" <th>lagging5</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>links_num</th>\n",
" <th>area</th>\n",
" <th>vacation</th>\n",
" <th>minute_series</th>\n",
" <th>day_of_week</th>\n",
" <th>day_of_week_en</th>\n",
" <th>hour_en</th>\n",
" <th>week_hour</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>1.659311</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0,1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>1.664941</td>\n",
" <td>True</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0,1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>1.671675</td>\n",
" <td>True</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0,1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>1.676886</td>\n",
" <td>True</td>\n",
" <td>1.671675</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>6.0</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0,1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>1.682314</td>\n",
" <td>True</td>\n",
" <td>1.676886</td>\n",
" <td>1.671675</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>48</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>144</td>\n",
" <td>0.0</td>\n",
" <td>8.0</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0,1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date time_interval_begin travel_time \\\n",
"0 3377906280028510514 2017-03-01 2017-03-01 06:00:00 1.659311 \n",
"1 3377906280028510514 2017-03-01 2017-03-01 06:02:00 1.664941 \n",
"2 3377906280028510514 2017-03-01 2017-03-01 06:04:00 1.671675 \n",
"3 3377906280028510514 2017-03-01 2017-03-01 06:06:00 1.676886 \n",
"4 3377906280028510514 2017-03-01 2017-03-01 06:08:00 1.682314 \n",
"\n",
" imputation1 lagging1 lagging2 lagging3 lagging4 lagging5 length \\\n",
"0 True NaN NaN NaN NaN NaN 48 \n",
"1 True 1.659311 NaN NaN NaN NaN 48 \n",
"2 True 1.664941 1.659311 NaN NaN NaN 48 \n",
"3 True 1.671675 1.664941 1.659311 NaN NaN 48 \n",
"4 True 1.676886 1.671675 1.664941 1.659311 NaN 48 \n",
"\n",
" width links_num area vacation minute_series day_of_week \\\n",
"0 3 2 144 0.0 0.0 3 \n",
"1 3 2 144 0.0 2.0 3 \n",
"2 3 2 144 0.0 4.0 3 \n",
"3 3 2 144 0.0 6.0 3 \n",
"4 3 2 144 0.0 8.0 3 \n",
"\n",
" day_of_week_en hour_en week_hour \n",
"0 1.0 1.0 1.0,1.0 \n",
"1 1.0 1.0 1.0,1.0 \n",
"2 1.0 1.0 1.0,1.0 \n",
"3 1.0 1.0 1.0,1.0 \n",
"4 1.0 1.0 1.0,1.0 "
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.head()"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"# One-hot encode the categorical features. dtype is pinned to uint8 so the indicators are\n",
"# 0/1 on every pandas version (pandas 2.0+ would otherwise default to True/False).\n",
"df2 = pd.get_dummies(df2, columns=['week_hour','links_num','width'], dtype=np.uint8)"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"def mean_time(group):\n",
"    \"\"\"Return a copy of ``group`` with ``link_ID_en`` set to the group's mean ``travel_time``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    group : DataFrame\n",
"        Rows of one group; must contain a ``travel_time`` column.\n",
"\n",
"    Returns\n",
"    -------\n",
"    DataFrame\n",
"        Same rows and index as ``group`` plus a constant ``link_ID_en`` column.\n",
"    \"\"\"\n",
"    # assign() builds a new frame, so the sub-frame handed in by groupby is not mutated.\n",
"    return group.assign(link_ID_en=group['travel_time'].mean())"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"# Mean travel_time per link_ID, broadcast back to every row of that link.\n",
"# transform() keeps df2's row order, index and columns on every pandas version, whereas\n",
"# groupby.apply with a frame-returning function changes index/order between versions.\n",
"df2['link_ID_en'] = df2.groupby('link_ID')['travel_time'].transform('mean')"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"# Ordinal-encode each link by the rank of its mean travel time (1 = smallest mean).\n",
"sorted_link = np.sort(df2['link_ID_en'].unique())\n",
"# searchsorted(side='right') counts how many sorted means are <= x, giving ranks 1..n.\n",
"# np.argmin(x >= sorted_link) yields the same ranks except for the largest mean, where the\n",
"# mask is all True and argmin falls back to 0.\n",
"df2['link_ID_en'] = np.searchsorted(sorted_link, df2['link_ID_en'].values, side='right')"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>time_interval_begin</th>\n",
" <th>travel_time</th>\n",
" <th>imputation1</th>\n",
" <th>lagging1</th>\n",
" <th>lagging2</th>\n",
" <th>lagging3</th>\n",
" <th>lagging4</th>\n",
" <th>lagging5</th>\n",
" <th>...</th>\n",
" <th>links_num_2</th>\n",
" <th>links_num_3</th>\n",
" <th>links_num_4</th>\n",
" <th>links_num_5</th>\n",
" <th>width_3</th>\n",
" <th>width_6</th>\n",
" <th>width_9</th>\n",
" <th>width_12</th>\n",
" <th>width_15</th>\n",
" <th>link_ID_en</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>1.659311</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>47</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>1.664941</td>\n",
" <td>True</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>47</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>1.671675</td>\n",
" <td>True</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>47</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>1.676886</td>\n",
" <td>True</td>\n",
" <td>1.671675</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>47</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>1.682314</td>\n",
" <td>True</td>\n",
" <td>1.676886</td>\n",
" <td>1.671675</td>\n",
" <td>1.664941</td>\n",
" <td>1.659311</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>47</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 36 columns</p>\n",
"</div>"
],
"text/plain": [
" link_ID date time_interval_begin travel_time \\\n",
"0 3377906280028510514 2017-03-01 2017-03-01 06:00:00 1.659311 \n",
"1 3377906280028510514 2017-03-01 2017-03-01 06:02:00 1.664941 \n",
"2 3377906280028510514 2017-03-01 2017-03-01 06:04:00 1.671675 \n",
"3 3377906280028510514 2017-03-01 2017-03-01 06:06:00 1.676886 \n",
"4 3377906280028510514 2017-03-01 2017-03-01 06:08:00 1.682314 \n",
"\n",
" imputation1 lagging1 lagging2 lagging3 lagging4 lagging5 ... \\\n",
"0 True NaN NaN NaN NaN NaN ... \n",
"1 True 1.659311 NaN NaN NaN NaN ... \n",
"2 True 1.664941 1.659311 NaN NaN NaN ... \n",
"3 True 1.671675 1.664941 1.659311 NaN NaN ... \n",
"4 True 1.676886 1.671675 1.664941 1.659311 NaN ... \n",
"\n",
" links_num_2 links_num_3 links_num_4 links_num_5 width_3 width_6 \\\n",
"0 1 0 0 0 1 0 \n",
"1 1 0 0 0 1 0 \n",
"2 1 0 0 0 1 0 \n",
"3 1 0 0 0 1 0 \n",
"4 1 0 0 0 1 0 \n",
"\n",
" width_9 width_12 width_15 link_ID_en \n",
"0 0 0 0 47 \n",
"1 0 0 0 47 \n",
"2 0 0 0 47 \n",
"3 0 0 0 47 \n",
"4 0 0 0 47 \n",
"\n",
"[5 rows x 36 columns]"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.head()"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"# Persist the feature table shown above as a ';'-separated text file:\n",
"# header row included, DataFrame index omitted, any existing file overwritten.\n",
"# NOTE(review): 'trainning' is misspelled, but the path is kept unchanged because\n",
"# other code may read this exact file name -- confirm before renaming.\n",
"df2.to_csv('data/trainning.txt', sep=';', header=True, index=False, mode='w')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}