Add 时间序的补全方法

pull/2/head
benjas 5 years ago
parent e13dc7087d
commit 2fae7985e4

@ -558,7 +558,7 @@
"source": [
"标签转换\n",
"<img src=\"assets/20201202211044.png\" width=\"100%\">\n",
"我们希望是右边的图,越是正态分布,越好预测"
"对于travel_time我们希望是右边的图,越是正态分布,越好预测"
]
},
{
@ -1207,7 +1207,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
@ -1215,6 +1215,522 @@
"df2.to_csv('pre_trainning.txt',header=True,index=None,sep=';',mode='w')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 补全时间序列"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>180</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>181</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>182</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>183</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>184</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time \\\n",
"180 4377906289869500514 2017-03-01 06:00:00 2017-03-01 2.174752 \n",
"181 4377906289869500514 2017-03-01 06:02:00 2017-03-01 2.174752 \n",
"182 4377906289869500514 2017-03-01 06:04:00 2017-03-01 2.174752 \n",
"183 4377906289869500514 2017-03-01 06:06:00 2017-03-01 2.174752 \n",
"184 4377906289869500514 2017-03-01 06:08:00 2017-03-01 2.174752 \n",
"\n",
" travel_time2 \n",
"180 2.174752 \n",
"181 2.174752 \n",
"182 2.174752 \n",
"183 2.174752 \n",
"184 2.174752 "
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df2\n",
"df['travel_time2'] = df['travel_time']\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"多个月统计-季节性变化"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"def date_trend(group, median_link_ids=('3377906282328510514', '3377906283328510514',\n",
"                                       '4377906280784800514', '9377906281555510514')):\n",
"    \"\"\"Attach a per-hour ``date_trend`` baseline to the rows of one link.\n",
"\n",
"    The mean ``travel_time`` of every ``date_hour`` bucket is computed, a straight\n",
"    line is fitted through the buckets that have data (x = position of the bucket\n",
"    in sorted ``date_hour`` order), and the fitted value of each bucket is merged\n",
"    back onto the rows. Links listed in ``median_link_ids`` skip the regression\n",
"    and use the median ``travel_time`` of the whole group instead.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    group : pandas.DataFrame\n",
"        Rows of a single link; must contain ``link_ID``, ``date_hour`` and\n",
"        ``travel_time``.\n",
"    median_link_ids : iterable of str, optional\n",
"        Link IDs that get the constant median trend instead of a fitted line.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        ``group`` left-merged with the new ``date_trend`` column (the merge\n",
"        produces a fresh integer row index).\n",
"    \"\"\"\n",
"    # Aggregate only travel_time: averaging every column breaks on the\n",
"    # non-numeric ones in recent pandas, and none of them is used afterwards.\n",
"    hourly = group.groupby('date_hour')['travel_time'].mean().reset_index()\n",
"\n",
"    y = hourly['travel_time'].values\n",
"    observed = ~np.isnan(y)\n",
"    positions = np.arange(len(hourly))\n",
"\n",
"    # Compare as text so the check also works when link_ID is stored as an integer.\n",
"    link_id = str(group.link_ID.values[0])\n",
"    if link_id in median_link_ids or not observed.any():\n",
"        # Constant trend; also the fallback when no hour has data to fit on\n",
"        # (the median is then NaN instead of the fit raising).\n",
"        hourly['date_trend'] = group['travel_time'].median()\n",
"    else:\n",
"        regr = linear_model.LinearRegression()\n",
"        regr.fit(positions[observed].reshape(-1, 1), y[observed].reshape(-1, 1))\n",
"        hourly['date_trend'] = regr.predict(positions.reshape(-1, 1)).ravel()\n",
"    return pd.merge(group, hourly[['date_trend', 'date_hour']], on='date_hour', how='left')"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_hour</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>180</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" <td>2017-03-01-06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>181</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" <td>2017-03-01-06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>182</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" <td>2017-03-01-06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>183</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" <td>2017-03-01-06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>184</th>\n",
" <td>4377906289869500514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>2.174752</td>\n",
" <td>2.174752</td>\n",
" <td>2017-03-01-06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time \\\n",
"180 4377906289869500514 2017-03-01 06:00:00 2017-03-01 2.174752 \n",
"181 4377906289869500514 2017-03-01 06:02:00 2017-03-01 2.174752 \n",
"182 4377906289869500514 2017-03-01 06:04:00 2017-03-01 2.174752 \n",
"183 4377906289869500514 2017-03-01 06:06:00 2017-03-01 2.174752 \n",
"184 4377906289869500514 2017-03-01 06:08:00 2017-03-01 2.174752 \n",
"\n",
" travel_time2 date_hour \n",
"180 2.174752 2017-03-01-06 \n",
"181 2.174752 2017-03-01-06 \n",
"182 2.174752 2017-03-01-06 \n",
"183 2.174752 2017-03-01-06 \n",
"184 2.174752 2017-03-01-06 "
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 基于小时进行预测,如果基于整体预测,结果可能不准确定,我们先算小时对结果的影响\n",
"df['date_hour'] = df.time_interval_begin.map(lambda x: x.strftime('%Y-%m-%d-%H'))\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"道路每小时通行时间的回归结果\n",
"<img src=\"assets/20201202221436.png\" width=\"100%\">\n",
"左图:回归预测,蓝色线是回归线,红色是时间\n",
"右图:对某几个道路,直接用中位数预测"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_hour</th>\n",
" <th>date_trend</th>\n",
" </tr>\n",
" <tr>\n",
" <th>link_ID</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">3377906280028510514</th>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2017-03-01-06</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2017-03-01-06</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2017-03-01-06</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2017-03-01-06</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2017-03-01-06</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date \\\n",
"link_ID \n",
"3377906280028510514 0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 \n",
" 1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 \n",
" 2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 \n",
" 3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 \n",
" 4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 \n",
"\n",
" travel_time travel_time2 date_hour date_trend \n",
"link_ID \n",
"3377906280028510514 0 NaN NaN 2017-03-01-06 1.960745 \n",
" 1 NaN NaN 2017-03-01-06 1.960745 \n",
" 2 NaN NaN 2017-03-01-06 1.960745 \n",
" 3 NaN NaN 2017-03-01-06 1.960745 \n",
" 4 NaN NaN 2017-03-01-06 1.960745 "
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df.groupby('link_ID').apply(date_trend)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"蓝线回归得到的值存在 df['date_trend'] 里,此时 travel_time 更新为 df['travel_time'] = df['travel_time'] - df['date_trend']。这表示 date_trend 作为大的趋势已经由线性回归决定,剩下要研究的就是残差:之后的训练和预测都基于残差,最后用预测出来的残差加上相应的 date_trend,即可得到需要的预测值。"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link_ID</th>\n",
" <th>time_interval_begin</th>\n",
" <th>date</th>\n",
" <th>travel_time</th>\n",
" <th>travel_time2</th>\n",
" <th>date_trend</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:00:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:02:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:04:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:06:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3377906280028510514</td>\n",
" <td>2017-03-01 06:08:00</td>\n",
" <td>2017-03-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.960745</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" link_ID time_interval_begin date travel_time \\\n",
"0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n",
"1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n",
"2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n",
"3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
"4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
"\n",
" travel_time2 date_trend \n",
"0 NaN 1.960745 \n",
"1 NaN 1.960745 \n",
"2 NaN 1.960745 \n",
"3 NaN 1.960745 \n",
"4 NaN 1.960745 "
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Drop the helper column and the link_ID column, then restore link_ID from the\n",
"# index; the second index level comes back as 'level_1' and is discarded.\n",
"df = (\n",
"    df.drop(columns=['date_hour', 'link_ID'])\n",
"      .reset_index()\n",
"      .drop(columns='level_1')\n",
")\n",
"# Keep only the residual: travel_time minus the fitted trend.\n",
"df['travel_time'] = df['travel_time'].sub(df['date_trend'])\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,

Loading…
Cancel
Save