Update 道路通行时间预测

pull/2/head
benjas 5 years ago
parent 87704aa1fb
commit e13dc7087d

@ -1,6 +1,374 @@
{ {
"cells": [], "cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 数据展示"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from scipy.interpolate import UnivariateSpline\n",
"from sklearn import linear_model\n",
"import xgboost as xgb\n",
"# from ultis import *"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"道路通行时间:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>time_interval</th>\n",
" <th>travel_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906283422600514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 11:04:00,2017-05-06 11:06:00)</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906289434510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 10:42:00,2017-05-06 10:44:00)</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 11:56:00,2017-05-06 11:58:00)</td>\n",
" <td>35.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 17:46:00,2017-05-06 17:48:00)</td>\n",
" <td>26.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906287934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 10:52:00,2017-05-06 10:54:00)</td>\n",
" <td>10.4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date time_interval \\\n",
"0 4377906283422600514 2017-05-06 [2017-05-06 11:04:00,2017-05-06 11:06:00) \n",
"1 3377906289434510514 2017-05-06 [2017-05-06 10:42:00,2017-05-06 10:44:00) \n",
"2 3377906285934510514 2017-05-06 [2017-05-06 11:56:00,2017-05-06 11:58:00) \n",
"3 3377906285934510514 2017-05-06 [2017-05-06 17:46:00,2017-05-06 17:48:00) \n",
"4 3377906287934510514 2017-05-06 [2017-05-06 10:52:00,2017-05-06 10:54:00) \n",
"\n",
" travel_time \n",
"0 3.0 \n",
"1 1.0 \n",
"2 35.2 \n",
"3 26.2 \n",
"4 10.4 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('new_gy_contest_traveltime_training_data_second.txt',delimiter=';',dtype={'link_ID':object})\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"time_interval时间间隔两分钟为单位\n",
"\n",
"travel_time平均通行时间"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"道理长宽情况:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>link_class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906289869500514</td>\n",
" <td>57</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4377906284594800514</td>\n",
" <td>247</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4377906289425800514</td>\n",
" <td>194</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4377906284525800514</td>\n",
" <td>839</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4377906284422600514</td>\n",
" <td>55</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID length width link_class\n",
"0 4377906289869500514 57 3 1\n",
"1 4377906284594800514 247 9 1\n",
"2 4377906289425800514 194 3 1\n",
"3 4377906284525800514 839 3 1\n",
"4 4377906284422600514 55 12 1"
]
},
"execution_count": 8,
"metadata": {}, "metadata": {},
"output_type": "execute_result"
}
],
"source": [
"link_df = pd.read_csv('gy_contest_link_info.txt',delimiter=';',dtype={'link_ID':object})\n",
"link_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"length长度 width宽度 link_class类别"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"道路之间连接情况:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>in_links</th>\n",
" <th>out_links</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906289869500514</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4377906284594800514</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4377906289425800514</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4377906284525800514</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4377906284422600514</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID in_links out_links\n",
"0 4377906289869500514 1 1\n",
"1 4377906284594800514 1 1\n",
"2 4377906289425800514 1 1\n",
"3 4377906284525800514 1 1\n",
"4 4377906284422600514 2 1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"link_tops = pd.read_csv('gy_contest_link_top_update.txt',delimiter=',',dtype={'link_ID':object})\n",
"link_tops.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 任务:预测未来一个月平均通行结果,每两分钟一次\n",
"回归任务\n",
"\n",
"构建时间序列,基于前几天或者前几十天的数据预测"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4, "nbformat": 4,
"nbformat_minor": 2 "nbformat_minor": 2
} }

@ -342,6 +342,879 @@
"构建时间序列,基于前几天或者前几十天的数据预测" "构建时间序列,基于前几天或者前几十天的数据预测"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 数据集筛选与标签转换\n",
"数据集中有些数据可能由于异常情况导致不适合建模(堵车,维修等)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>time_interval</th>\n",
" <th>travel_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906283422600514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 11:04:00,2017-05-06 11:06:00)</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906289434510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 10:42:00,2017-05-06 10:44:00)</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 11:56:00,2017-05-06 11:58:00)</td>\n",
" <td>35.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 17:46:00,2017-05-06 17:48:00)</td>\n",
" <td>26.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906287934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 10:52:00,2017-05-06 10:54:00)</td>\n",
" <td>10.4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date time_interval \\\n",
"0 4377906283422600514 2017-05-06 [2017-05-06 11:04:00,2017-05-06 11:06:00) \n",
"1 3377906289434510514 2017-05-06 [2017-05-06 10:42:00,2017-05-06 10:44:00) \n",
"2 3377906285934510514 2017-05-06 [2017-05-06 11:56:00,2017-05-06 11:58:00) \n",
"3 3377906285934510514 2017-05-06 [2017-05-06 17:46:00,2017-05-06 17:48:00) \n",
"4 3377906287934510514 2017-05-06 [2017-05-06 10:52:00,2017-05-06 10:54:00) \n",
"\n",
" travel_time \n",
"0 3.0 \n",
"1 1.0 \n",
"2 35.2 \n",
"3 26.2 \n",
"4 10.4 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>time_interval</th>\n",
" <th>travel_time</th>\n",
" <th>time_interval_begin</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906283422600514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 11:04:00,2017-05-06 11:06:00)</td>\n",
" <td>3.0</td>\n",
" <td>2017-05-06 11:04:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906289434510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 10:42:00,2017-05-06 10:44:00)</td>\n",
" <td>1.0</td>\n",
" <td>2017-05-06 10:42:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 11:56:00,2017-05-06 11:58:00)</td>\n",
" <td>35.2</td>\n",
" <td>2017-05-06 11:56:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 17:46:00,2017-05-06 17:48:00)</td>\n",
" <td>26.2</td>\n",
" <td>2017-05-06 17:46:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906287934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>[2017-05-06 10:52:00,2017-05-06 10:54:00)</td>\n",
" <td>10.4</td>\n",
" <td>2017-05-06 10:52:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date time_interval \\\n",
"0 4377906283422600514 2017-05-06 [2017-05-06 11:04:00,2017-05-06 11:06:00) \n",
"1 3377906289434510514 2017-05-06 [2017-05-06 10:42:00,2017-05-06 10:44:00) \n",
"2 3377906285934510514 2017-05-06 [2017-05-06 11:56:00,2017-05-06 11:58:00) \n",
"3 3377906285934510514 2017-05-06 [2017-05-06 17:46:00,2017-05-06 17:48:00) \n",
"4 3377906287934510514 2017-05-06 [2017-05-06 10:52:00,2017-05-06 10:54:00) \n",
"\n",
" travel_time time_interval_begin \n",
"0 3.0 2017-05-06 11:04:00 \n",
"1 1.0 2017-05-06 10:42:00 \n",
"2 35.2 2017-05-06 11:56:00 \n",
"3 26.2 2017-05-06 17:46:00 \n",
"4 10.4 2017-05-06 10:52:00 "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#截取开始时间\n",
"df['time_interval_begin'] = pd.to_datetime(df['time_interval'].map(lambda x: x[1:20]))\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"标签转换\n",
"<img src=\"assets/20201202211044.png\" width=\"100%\">\n",
"我们希望是右边的图,越是正态分布,越好预测"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"df = df.drop(['time_interval'],axis=1)\n",
"df['travel_time'] = np.log1p(df['travel_time'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"筛选方法"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"#剔除一些离群点如travel_time突然有几百分钟的时间可能是意外、道路维修或者统计错误\n",
"def quantile_clip(group):\n",
" # 选择一定的百分比过滤\n",
" group[group < group.quantile(.05)] = group.quantile(.05)\n",
" group[group > group.quantile(.95)] = group.quantile(.95)\n",
" return group"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>time_interval_begin</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906283422600514</td>\n",
" <td>2017-05-06</td>\n",
" <td>1.386294</td>\n",
" <td>2017-05-06 11:04:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906289434510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>0.693147</td>\n",
" <td>2017-05-06 10:42:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>3.589059</td>\n",
" <td>2017-05-06 11:56:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>3.303217</td>\n",
" <td>2017-05-06 17:46:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906287934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>2.251292</td>\n",
" <td>2017-05-06 10:52:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date travel_time time_interval_begin\n",
"0 4377906283422600514 2017-05-06 1.386294 2017-05-06 11:04:00\n",
"1 3377906289434510514 2017-05-06 0.693147 2017-05-06 10:42:00\n",
"2 3377906285934510514 2017-05-06 3.589059 2017-05-06 11:56:00\n",
"3 3377906285934510514 2017-05-06 3.303217 2017-05-06 17:46:00\n",
"4 3377906287934510514 2017-05-06 2.251292 2017-05-06 10:52:00"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#对每条道路(link_ID),每天执行(date)\n",
"df['travel_time'] = df.groupby(['link_ID','date'])['travel_time'].transform(quantile_clip)\n",
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"#根据需求选择样本数据比如预测高峰时刻如早上6-8、中午下午13-18\n",
"df = df.loc[(df['time_interval_begin'].dt.hour.isin([6,7,8,13,14,15,16,17,18]))]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"#保存处理结果\n",
"df.to_csv('raw_data.txt',header=True,index=None,sep=';',mode='w')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 缺失值预处理"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>time_interval_begin</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906285934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>3.303217</td>\n",
" <td>2017-05-06 17:46:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906287934510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>1.887070</td>\n",
" <td>2017-05-06 14:36:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906287674510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>1.931521</td>\n",
" <td>2017-05-06 06:30:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906287886510514</td>\n",
" <td>2017-05-06</td>\n",
" <td>3.616309</td>\n",
" <td>2017-05-06 07:32:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4377906283759500514</td>\n",
" <td>2017-05-06</td>\n",
" <td>2.140066</td>\n",
" <td>2017-05-06 13:24:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID date travel_time time_interval_begin\n",
"0 3377906285934510514 2017-05-06 3.303217 2017-05-06 17:46:00\n",
"1 3377906287934510514 2017-05-06 1.887070 2017-05-06 14:36:00\n",
"2 3377906287674510514 2017-05-06 1.931521 2017-05-06 06:30:00\n",
"3 3377906287886510514 2017-05-06 3.616309 2017-05-06 07:32:00\n",
"4 4377906283759500514 2017-05-06 2.140066 2017-05-06 13:24:00"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('raw_data.txt',delimiter=';',parse_dates=['time_interval_begin'],dtype={'link_ID':object})\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"如上第一行中2017-05-06 17:46:00那么是不是没有17:48、17:50所以我们需要补充"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>link_class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906289869500514</td>\n",
" <td>57</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4377906284594800514</td>\n",
" <td>247</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4377906289425800514</td>\n",
" <td>194</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4377906284525800514</td>\n",
" <td>839</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4377906284422600514</td>\n",
" <td>55</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID length width link_class\n",
"0 4377906289869500514 57 3 1\n",
"1 4377906284594800514 247 9 1\n",
"2 4377906289425800514 194 3 1\n",
"3 4377906284525800514 839 3 1\n",
"4 4377906284422600514 55 12 1"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"link_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DatetimeIndex(['2017-03-01 00:00:00', '2017-03-01 00:02:00',\n",
" '2017-03-01 00:04:00', '2017-03-01 00:06:00',\n",
" '2017-03-01 00:08:00'],\n",
" dtype='datetime64[ns]', freq='2T')"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"date_range = pd.date_range('2017-03-01 00:00:00','2017-07-31 23:58:00',freq='2min')\n",
"date_range[:5]"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:04:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:06:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:08:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin\n",
"0 4377906289869500514 2017-03-01 00:00:00\n",
"1 4377906289869500514 2017-03-01 00:02:00\n",
"2 4377906289869500514 2017-03-01 00:04:00\n",
"3 4377906289869500514 2017-03-01 00:06:00\n",
"4 4377906289869500514 2017-03-01 00:08:00"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#给每个ID做出每一个时刻\n",
"new_index = pd.MultiIndex.from_product([link_df['link_ID'].unique(),date_range],\n",
" names=['link_ID', 'time_interval_begin'])\n",
"new_df = pd.DataFrame(index=new_index).reset_index()\n",
"new_df.head() # 此时每个ID都有从2017-03-01 00:00:00到2017-03-71 23:58:00的时间间隔"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:00:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:04:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:06:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 00:08:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time\n",
"0 4377906289869500514 2017-03-01 00:00:00 NaN NaN\n",
"1 4377906289869500514 2017-03-01 00:02:00 NaN NaN\n",
"2 4377906289869500514 2017-03-01 00:04:00 NaN NaN\n",
"3 4377906289869500514 2017-03-01 00:06:00 NaN NaN\n",
"4 4377906289869500514 2017-03-01 00:08:00 NaN NaN"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#原来的表里也有部分值,进行合并,出现大量缺失值\n",
"df2 = pd.merge(new_df, df,on=['link_ID','time_interval_begin'],how='left')\n",
"df2.head()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"#筛选时间段数据\n",
"df2 = df2.loc[(df2['time_interval_begin'].dt.hour.isin([6,7,8,13,14,15,16,17,18]))]\n",
"df2 = df2.loc[~((df2['time_interval_begin'].dt.year == 2017) & \n",
" (df2['time_interval_begin'].dt.month == 7) & \n",
" (df2['time_interval_begin'].dt.hour.isin([8,15,18])))]\n",
"df2 = df2.loc[~((df2['time_interval_begin'].dt.year == 2017) & \n",
" (df2['time_interval_begin'].dt.month == 3) & \n",
" (df2['time_interval_begin'].dt.day == 31))]\n",
"\n",
"df2['date'] = df2['time_interval_begin'].dt.strftime('%Y-%m-%d')"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>180</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>181</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>182</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>183</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>184</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time\n",
"180 4377906289869500514 2017-03-01 06:00:00 2017-03-01 2.174752\n",
"181 4377906289869500514 2017-03-01 06:02:00 2017-03-01 2.174752\n",
"182 4377906289869500514 2017-03-01 06:04:00 2017-03-01 2.174752\n",
"183 4377906289869500514 2017-03-01 06:06:00 2017-03-01 2.174752\n",
"184 4377906289869500514 2017-03-01 06:08:00 2017-03-01 2.174752"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#保存中间结果\n",
"df2.to_csv('pre_trainning.txt',header=True,index=None,sep=';',mode='w')"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,

Loading…
Cancel
Save