diff --git a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/道路通行时间预测-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/道路通行时间预测-checkpoint.ipynb
index bfa7f57..af3985e 100644
--- a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/道路通行时间预测-checkpoint.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/道路通行时间预测-checkpoint.ipynb
@@ -342,6 +342,1395 @@
"构建时间序列,基于前几天或者前几十天的数据预测"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 数据集筛选与标签转换\n",
+ "数据集中有些数据可能由于异常情况导致不适合建模(堵车,维修等)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link_ID | \n",
+ " date | \n",
+ " time_interval | \n",
+ " travel_time | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 4377906283422600514 | \n",
+ " 2017-05-06 | \n",
+ " [2017-05-06 11:04:00,2017-05-06 11:06:00) | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3377906289434510514 | \n",
+ " 2017-05-06 | \n",
+ " [2017-05-06 10:42:00,2017-05-06 10:44:00) | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3377906285934510514 | \n",
+ " 2017-05-06 | \n",
+ " [2017-05-06 11:56:00,2017-05-06 11:58:00) | \n",
+ " 35.2 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3377906285934510514 | \n",
+ " 2017-05-06 | \n",
+ " [2017-05-06 17:46:00,2017-05-06 17:48:00) | \n",
+ " 26.2 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3377906287934510514 | \n",
+ " 2017-05-06 | \n",
+ " [2017-05-06 10:52:00,2017-05-06 10:54:00) | \n",
+ " 10.4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID date time_interval \\\n",
+ "0 4377906283422600514 2017-05-06 [2017-05-06 11:04:00,2017-05-06 11:06:00) \n",
+ "1 3377906289434510514 2017-05-06 [2017-05-06 10:42:00,2017-05-06 10:44:00) \n",
+ "2 3377906285934510514 2017-05-06 [2017-05-06 11:56:00,2017-05-06 11:58:00) \n",
+ "3 3377906285934510514 2017-05-06 [2017-05-06 17:46:00,2017-05-06 17:48:00) \n",
+ "4 3377906287934510514 2017-05-06 [2017-05-06 10:52:00,2017-05-06 10:54:00) \n",
+ "\n",
+ " travel_time \n",
+ "0 3.0 \n",
+ "1 1.0 \n",
+ "2 35.2 \n",
+ "3 26.2 \n",
+ "4 10.4 "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link_ID | \n",
+ " date | \n",
+ " time_interval | \n",
+ " travel_time | \n",
+ " time_interval_begin | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 4377906283422600514 | \n",
+ " 2017-05-06 | \n",
+ " [2017-05-06 11:04:00,2017-05-06 11:06:00) | \n",
+ " 3.0 | \n",
+ " 2017-05-06 11:04:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3377906289434510514 | \n",
+ " 2017-05-06 | \n",
+ " [2017-05-06 10:42:00,2017-05-06 10:44:00) | \n",
+ " 1.0 | \n",
+ " 2017-05-06 10:42:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3377906285934510514 | \n",
+ " 2017-05-06 | \n",
+ " [2017-05-06 11:56:00,2017-05-06 11:58:00) | \n",
+ " 35.2 | \n",
+ " 2017-05-06 11:56:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3377906285934510514 | \n",
+ " 2017-05-06 | \n",
+ " [2017-05-06 17:46:00,2017-05-06 17:48:00) | \n",
+ " 26.2 | \n",
+ " 2017-05-06 17:46:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3377906287934510514 | \n",
+ " 2017-05-06 | \n",
+ " [2017-05-06 10:52:00,2017-05-06 10:54:00) | \n",
+ " 10.4 | \n",
+ " 2017-05-06 10:52:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID date time_interval \\\n",
+ "0 4377906283422600514 2017-05-06 [2017-05-06 11:04:00,2017-05-06 11:06:00) \n",
+ "1 3377906289434510514 2017-05-06 [2017-05-06 10:42:00,2017-05-06 10:44:00) \n",
+ "2 3377906285934510514 2017-05-06 [2017-05-06 11:56:00,2017-05-06 11:58:00) \n",
+ "3 3377906285934510514 2017-05-06 [2017-05-06 17:46:00,2017-05-06 17:48:00) \n",
+ "4 3377906287934510514 2017-05-06 [2017-05-06 10:52:00,2017-05-06 10:54:00) \n",
+ "\n",
+ " travel_time time_interval_begin \n",
+ "0 3.0 2017-05-06 11:04:00 \n",
+ "1 1.0 2017-05-06 10:42:00 \n",
+ "2 35.2 2017-05-06 11:56:00 \n",
+ "3 26.2 2017-05-06 17:46:00 \n",
+ "4 10.4 2017-05-06 10:52:00 "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#截取开始时间\n",
+ "df['time_interval_begin'] = pd.to_datetime(df['time_interval'].map(lambda x: x[1:20]))\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "标签转换\n",
+ "
\n",
+ "对于travel_time,我们希望是右边的图,越是正态分布,越好预测"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = df.drop(['time_interval'],axis=1)\n",
+ "df['travel_time'] = np.log1p(df['travel_time'])"
+ ]
+ },
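+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A minimal sketch (assuming `matplotlib` is available and that `np` and `df` come from the cells above): plot the distribution on the original scale next to the `log1p` scale to confirm the transform pushes it closer to a normal shape."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: travel_time distribution on the original scale (via expm1) vs. after log1p\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "fig, axes = plt.subplots(1, 2, figsize=(12, 4))\n",
+ "np.expm1(df['travel_time']).hist(bins=50, ax=axes[0])\n",
+ "axes[0].set_title('original travel_time')\n",
+ "df['travel_time'].hist(bins=50, ax=axes[1])\n",
+ "axes[1].set_title('log1p(travel_time)')\n",
+ "plt.show()"
+ ]
+ },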
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "筛选方法"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#剔除一些离群点:如travel_time突然有几百分钟的时间,可能是意外、道路维修或者统计错误\n",
+ "def quantile_clip(group):\n",
+ " # 选择一定的百分比过滤\n",
+ " group[group < group.quantile(.05)] = group.quantile(.05)\n",
+ " group[group > group.quantile(.95)] = group.quantile(.95)\n",
+ " return group"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link_ID | \n",
+ " date | \n",
+ " travel_time | \n",
+ " time_interval_begin | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 4377906283422600514 | \n",
+ " 2017-05-06 | \n",
+ " 1.386294 | \n",
+ " 2017-05-06 11:04:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3377906289434510514 | \n",
+ " 2017-05-06 | \n",
+ " 0.693147 | \n",
+ " 2017-05-06 10:42:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3377906285934510514 | \n",
+ " 2017-05-06 | \n",
+ " 3.589059 | \n",
+ " 2017-05-06 11:56:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3377906285934510514 | \n",
+ " 2017-05-06 | \n",
+ " 3.303217 | \n",
+ " 2017-05-06 17:46:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3377906287934510514 | \n",
+ " 2017-05-06 | \n",
+ " 2.251292 | \n",
+ " 2017-05-06 10:52:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID date travel_time time_interval_begin\n",
+ "0 4377906283422600514 2017-05-06 1.386294 2017-05-06 11:04:00\n",
+ "1 3377906289434510514 2017-05-06 0.693147 2017-05-06 10:42:00\n",
+ "2 3377906285934510514 2017-05-06 3.589059 2017-05-06 11:56:00\n",
+ "3 3377906285934510514 2017-05-06 3.303217 2017-05-06 17:46:00\n",
+ "4 3377906287934510514 2017-05-06 2.251292 2017-05-06 10:52:00"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#对每条道路(link_ID),每天执行(date)\n",
+ "df['travel_time'] = df.groupby(['link_ID','date'])['travel_time'].transform(quantile_clip)\n",
+ "df.head(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#根据需求选择样本数据,比如预测高峰时刻,如早上6-8、中午下午13-18\n",
+ "df = df.loc[(df['time_interval_begin'].dt.hour.isin([6,7,8,13,14,15,16,17,18]))]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#保存处理结果\n",
+ "df.to_csv('raw_data.txt',header=True,index=None,sep=';',mode='w')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 缺失值预处理"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link_ID | \n",
+ " date | \n",
+ " travel_time | \n",
+ " time_interval_begin | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 3377906285934510514 | \n",
+ " 2017-05-06 | \n",
+ " 3.303217 | \n",
+ " 2017-05-06 17:46:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3377906287934510514 | \n",
+ " 2017-05-06 | \n",
+ " 1.887070 | \n",
+ " 2017-05-06 14:36:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3377906287674510514 | \n",
+ " 2017-05-06 | \n",
+ " 1.931521 | \n",
+ " 2017-05-06 06:30:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3377906287886510514 | \n",
+ " 2017-05-06 | \n",
+ " 3.616309 | \n",
+ " 2017-05-06 07:32:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4377906283759500514 | \n",
+ " 2017-05-06 | \n",
+ " 2.140066 | \n",
+ " 2017-05-06 13:24:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID date travel_time time_interval_begin\n",
+ "0 3377906285934510514 2017-05-06 3.303217 2017-05-06 17:46:00\n",
+ "1 3377906287934510514 2017-05-06 1.887070 2017-05-06 14:36:00\n",
+ "2 3377906287674510514 2017-05-06 1.931521 2017-05-06 06:30:00\n",
+ "3 3377906287886510514 2017-05-06 3.616309 2017-05-06 07:32:00\n",
+ "4 4377906283759500514 2017-05-06 2.140066 2017-05-06 13:24:00"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.read_csv('raw_data.txt',delimiter=';',parse_dates=['time_interval_begin'],dtype={'link_ID':object})\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "如上第一行中,2017-05-06 17:46:00,那么是不是没有17:48、17:50,所以我们需要补充"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link_ID | \n",
+ " length | \n",
+ " width | \n",
+ " link_class | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 4377906289869500514 | \n",
+ " 57 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 4377906284594800514 | \n",
+ " 247 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 4377906289425800514 | \n",
+ " 194 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4377906284525800514 | \n",
+ " 839 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4377906284422600514 | \n",
+ " 55 | \n",
+ " 12 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID length width link_class\n",
+ "0 4377906289869500514 57 3 1\n",
+ "1 4377906284594800514 247 9 1\n",
+ "2 4377906289425800514 194 3 1\n",
+ "3 4377906284525800514 839 3 1\n",
+ "4 4377906284422600514 55 12 1"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "link_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "DatetimeIndex(['2017-03-01 00:00:00', '2017-03-01 00:02:00',\n",
+ " '2017-03-01 00:04:00', '2017-03-01 00:06:00',\n",
+ " '2017-03-01 00:08:00'],\n",
+ " dtype='datetime64[ns]', freq='2T')"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "date_range = pd.date_range('2017-03-01 00:00:00','2017-07-31 23:58:00',freq='2min')\n",
+ "date_range[:5]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link_ID | \n",
+ " time_interval_begin | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 00:00:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 00:02:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 00:04:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 00:06:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 00:08:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID time_interval_begin\n",
+ "0 4377906289869500514 2017-03-01 00:00:00\n",
+ "1 4377906289869500514 2017-03-01 00:02:00\n",
+ "2 4377906289869500514 2017-03-01 00:04:00\n",
+ "3 4377906289869500514 2017-03-01 00:06:00\n",
+ "4 4377906289869500514 2017-03-01 00:08:00"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#给每个ID,做出每一个时刻\n",
+ "new_index = pd.MultiIndex.from_product([link_df['link_ID'].unique(),date_range],\n",
+ " names=['link_ID', 'time_interval_begin'])\n",
+ "new_df = pd.DataFrame(index=new_index).reset_index()\n",
+ "new_df.head() # 此时每个ID都有从2017-03-01 00:00:00到2017-03-71 23:58:00的时间间隔"
+ ]
+ },
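+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick sketch to check the cross product: `new_df` should have exactly one row per (link_ID, 2-minute slot) pair."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: row count should equal number of links x number of 2-minute slots\n",
+ "print(len(new_df), link_df['link_ID'].nunique() * len(date_range))"
+ ]
+ },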
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link_ID | \n",
+ " time_interval_begin | \n",
+ " date | \n",
+ " travel_time | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 00:00:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 00:02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 00:04:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 00:06:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 00:08:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID time_interval_begin date travel_time\n",
+ "0 4377906289869500514 2017-03-01 00:00:00 NaN NaN\n",
+ "1 4377906289869500514 2017-03-01 00:02:00 NaN NaN\n",
+ "2 4377906289869500514 2017-03-01 00:04:00 NaN NaN\n",
+ "3 4377906289869500514 2017-03-01 00:06:00 NaN NaN\n",
+ "4 4377906289869500514 2017-03-01 00:08:00 NaN NaN"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#原来的表里也有部分值,进行合并,出现大量缺失值\n",
+ "df2 = pd.merge(new_df, df,on=['link_ID','time_interval_begin'],how='left')\n",
+ "df2.head()"
+ ]
+ },
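+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A small sketch to quantify how sparse the merged table is: the fraction of 2-minute slots that have no observed travel_time."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: share of missing travel_time values after the left join\n",
+ "df2['travel_time'].isnull().mean()"
+ ]
+ },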
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#筛选时间段数据\n",
+ "df2 = df2.loc[(df2['time_interval_begin'].dt.hour.isin([6,7,8,13,14,15,16,17,18]))]\n",
+ "df2 = df2.loc[~((df2['time_interval_begin'].dt.year == 2017) & \n",
+ " (df2['time_interval_begin'].dt.month == 7) & \n",
+ " (df2['time_interval_begin'].dt.hour.isin([8,15,18])))]\n",
+ "df2 = df2.loc[~((df2['time_interval_begin'].dt.year == 2017) & \n",
+ " (df2['time_interval_begin'].dt.month == 3) & \n",
+ " (df2['time_interval_begin'].dt.day == 31))]\n",
+ "\n",
+ "df2['date'] = df2['time_interval_begin'].dt.strftime('%Y-%m-%d')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link_ID | \n",
+ " time_interval_begin | \n",
+ " date | \n",
+ " travel_time | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 180 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:00:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ "
\n",
+ " \n",
+ " 181 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:02:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ "
\n",
+ " \n",
+ " 182 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:04:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ "
\n",
+ " \n",
+ " 183 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:06:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ "
\n",
+ " \n",
+ " 184 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:08:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID time_interval_begin date travel_time\n",
+ "180 4377906289869500514 2017-03-01 06:00:00 2017-03-01 2.174752\n",
+ "181 4377906289869500514 2017-03-01 06:02:00 2017-03-01 2.174752\n",
+ "182 4377906289869500514 2017-03-01 06:04:00 2017-03-01 2.174752\n",
+ "183 4377906289869500514 2017-03-01 06:06:00 2017-03-01 2.174752\n",
+ "184 4377906289869500514 2017-03-01 06:08:00 2017-03-01 2.174752"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df2.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#保存中间结果\n",
+ "df2.to_csv('pre_trainning.txt',header=True,index=None,sep=';',mode='w')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 补全时间序列"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link_ID | \n",
+ " time_interval_begin | \n",
+ " date | \n",
+ " travel_time | \n",
+ " travel_time2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 180 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:00:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ "
\n",
+ " \n",
+ " 181 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:02:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ "
\n",
+ " \n",
+ " 182 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:04:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ "
\n",
+ " \n",
+ " 183 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:06:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ "
\n",
+ " \n",
+ " 184 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:08:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID time_interval_begin date travel_time \\\n",
+ "180 4377906289869500514 2017-03-01 06:00:00 2017-03-01 2.174752 \n",
+ "181 4377906289869500514 2017-03-01 06:02:00 2017-03-01 2.174752 \n",
+ "182 4377906289869500514 2017-03-01 06:04:00 2017-03-01 2.174752 \n",
+ "183 4377906289869500514 2017-03-01 06:06:00 2017-03-01 2.174752 \n",
+ "184 4377906289869500514 2017-03-01 06:08:00 2017-03-01 2.174752 \n",
+ "\n",
+ " travel_time2 \n",
+ "180 2.174752 \n",
+ "181 2.174752 \n",
+ "182 2.174752 \n",
+ "183 2.174752 \n",
+ "184 2.174752 "
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = df2\n",
+ "df['travel_time2'] = df['travel_time']\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "多个月统计-季节性变化"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def date_trend(group):\n",
+ " tmp = group.groupby('date_hour').mean().reset_index()\n",
+ " \n",
+ " def nan_helper(y):\n",
+ " return np.isnan(y), lambda z: z.nonzero()[0]\n",
+ " \n",
+ " y = tmp['travel_time'].values\n",
+ " nans, x = nan_helper(y)\n",
+ " if group.link_ID.values[0] in ['3377906282328510514','3377906283328510514',\n",
+ " '4377906280784800514','9377906281555510514']:\n",
+ " tmp['date_trend'] = group['travel_time'].median()\n",
+ " else:\n",
+ " regr = linear_model.LinearRegression()\n",
+ " regr.fit(x(~nans).reshape(-1,1), y[~nans].reshape(-1,1))\n",
+ " tmp['date_trend'] = regr.predict(tmp.index.values.reshape(-1,1)).ravel()\n",
+ " group = pd.merge(group,tmp[['date_trend','date_hour']], on='date_hour',how='left')\n",
+ " return group"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link_ID | \n",
+ " time_interval_begin | \n",
+ " date | \n",
+ " travel_time | \n",
+ " travel_time2 | \n",
+ " date_hour | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 180 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:00:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ " 2017-03-01-06 | \n",
+ "
\n",
+ " \n",
+ " 181 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:02:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ " 2017-03-01-06 | \n",
+ "
\n",
+ " \n",
+ " 182 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:04:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ " 2017-03-01-06 | \n",
+ "
\n",
+ " \n",
+ " 183 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:06:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ " 2017-03-01-06 | \n",
+ "
\n",
+ " \n",
+ " 184 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:08:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ " 2017-03-01-06 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID time_interval_begin date travel_time \\\n",
+ "180 4377906289869500514 2017-03-01 06:00:00 2017-03-01 2.174752 \n",
+ "181 4377906289869500514 2017-03-01 06:02:00 2017-03-01 2.174752 \n",
+ "182 4377906289869500514 2017-03-01 06:04:00 2017-03-01 2.174752 \n",
+ "183 4377906289869500514 2017-03-01 06:06:00 2017-03-01 2.174752 \n",
+ "184 4377906289869500514 2017-03-01 06:08:00 2017-03-01 2.174752 \n",
+ "\n",
+ " travel_time2 date_hour \n",
+ "180 2.174752 2017-03-01-06 \n",
+ "181 2.174752 2017-03-01-06 \n",
+ "182 2.174752 2017-03-01-06 \n",
+ "183 2.174752 2017-03-01-06 \n",
+ "184 2.174752 2017-03-01-06 "
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 基于小时进行预测,如果基于整体预测,结果可能不准确定,我们先算小时对结果的影响\n",
+ "df['date_hour'] = df.time_interval_begin.map(lambda x: x.strftime('%Y-%m-%d-%H'))\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "道路每小时通行时间的回归结果\n",
+ "
\n",
+ "左图:回归预测,蓝色线是回归线,红色是时间\n",
+ "右图:对某几个道路,直接用中位数预测"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " link_ID | \n",
+ " time_interval_begin | \n",
+ " date | \n",
+ " travel_time | \n",
+ " travel_time2 | \n",
+ " date_hour | \n",
+ " date_trend | \n",
+ "
\n",
+ " \n",
+ " link_ID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 3377906280028510514 | \n",
+ " 0 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:00:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2017-03-01-06 | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:02:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2017-03-01-06 | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:04:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2017-03-01-06 | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:06:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2017-03-01-06 | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:08:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2017-03-01-06 | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID time_interval_begin date \\\n",
+ "link_ID \n",
+ "3377906280028510514 0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 \n",
+ " 1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 \n",
+ " 2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 \n",
+ " 3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 \n",
+ " 4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 \n",
+ "\n",
+ " travel_time travel_time2 date_hour date_trend \n",
+ "link_ID \n",
+ "3377906280028510514 0 NaN NaN 2017-03-01-06 1.960745 \n",
+ " 1 NaN NaN 2017-03-01-06 1.960745 \n",
+ " 2 NaN NaN 2017-03-01-06 1.960745 \n",
+ " 3 NaN NaN 2017-03-01-06 1.960745 \n",
+ " 4 NaN NaN 2017-03-01-06 1.960745 "
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = df.groupby('link_ID').apply(date_trend)\n",
+ "df.head()"
+ ]
+ },
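+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A minimal sketch (matplotlib assumed) of how the plot described above could be reproduced for a single link: red dots are the hourly mean travel_time, the blue line is the fitted date_trend."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: hourly mean travel_time vs. fitted date_trend for one example link\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "link = df['link_ID'].iloc[0]\n",
+ "one = df[df['link_ID'] == link]\n",
+ "hourly = one.groupby('date_hour')[['travel_time', 'date_trend']].mean().reset_index()\n",
+ "\n",
+ "plt.figure(figsize=(12, 4))\n",
+ "plt.plot(hourly.index, hourly['travel_time'], 'r.', label='hourly mean travel_time')\n",
+ "plt.plot(hourly.index, hourly['date_trend'], 'b-', label='fitted date_trend')\n",
+ "plt.legend()\n",
+ "plt.show()"
+ ]
+ },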
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "蓝线回归得到的值存在 df['date trend']里,此时 travel_time 就更新为 df['travel_time']= df['travel_time']-df['date_trend'],表示date_trend作为大的趋势已经被线性回归决定了,剩下的就是研究这个残差了,之后训练和预测都是基于残差,最后用预测出来的残差加上相应的date_trend即可得到需要的预测值"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link_ID | \n",
+ " time_interval_begin | \n",
+ " date | \n",
+ " travel_time | \n",
+ " travel_time2 | \n",
+ " date_trend | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:00:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:02:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:04:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:06:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:08:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID time_interval_begin date travel_time \\\n",
+ "0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n",
+ "1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n",
+ "2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n",
+ "3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
+ "4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
+ "\n",
+ " travel_time2 date_trend \n",
+ "0 NaN 1.960745 \n",
+ "1 NaN 1.960745 \n",
+ "2 NaN 1.960745 \n",
+ "3 NaN 1.960745 \n",
+ "4 NaN 1.960745 "
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = df.drop(['date_hour','link_ID'],axis=1)\n",
+ "df = df.reset_index()\n",
+ "df = df.drop('level_1',axis=1)\n",
+ "df['travel_time'] = df['travel_time'] - df['date_trend']\n",
+ "df.head()"
+ ]
+ },
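+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As noted above, the model is trained on the residual. A minimal sketch of how a final prediction would be rebuilt on the original scale; `pred_residual` is a placeholder standing in for real model output."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: rebuild predictions on the original scale from predicted residuals\n",
+ "pred_residual = np.zeros(len(df))  # placeholder; in practice this comes from the trained model\n",
+ "pred_log = pred_residual + df['date_trend'].values  # add the fitted trend back\n",
+ "pred_travel_time = np.expm1(pred_log)  # invert the earlier log1p transform\n",
+ "pred_travel_time[:5]"
+ ]
+ },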
{
"cell_type": "code",
"execution_count": null,
diff --git a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/assets/20201202221436.png b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/assets/20201202221436.png
new file mode 100644
index 0000000..8a5f4f1
Binary files /dev/null and b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/assets/20201202221436.png differ
diff --git a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/道路通行时间预测.ipynb b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/道路通行时间预测.ipynb
index e425e2f..af3985e 100644
--- a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/道路通行时间预测.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/道路通行时间预测.ipynb
@@ -558,7 +558,7 @@
"source": [
"标签转换\n",
"
\n",
- "我们希望是右边的图,越是正态分布,越好预测"
+ "对于travel_time,我们希望是右边的图,越是正态分布,越好预测"
]
},
{
@@ -1207,7 +1207,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
@@ -1215,6 +1215,522 @@
"df2.to_csv('pre_trainning.txt',header=True,index=None,sep=';',mode='w')"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 补全时间序列"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link_ID | \n",
+ " time_interval_begin | \n",
+ " date | \n",
+ " travel_time | \n",
+ " travel_time2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 180 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:00:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ "
\n",
+ " \n",
+ " 181 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:02:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ "
\n",
+ " \n",
+ " 182 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:04:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ "
\n",
+ " \n",
+ " 183 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:06:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ "
\n",
+ " \n",
+ " 184 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:08:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID time_interval_begin date travel_time \\\n",
+ "180 4377906289869500514 2017-03-01 06:00:00 2017-03-01 2.174752 \n",
+ "181 4377906289869500514 2017-03-01 06:02:00 2017-03-01 2.174752 \n",
+ "182 4377906289869500514 2017-03-01 06:04:00 2017-03-01 2.174752 \n",
+ "183 4377906289869500514 2017-03-01 06:06:00 2017-03-01 2.174752 \n",
+ "184 4377906289869500514 2017-03-01 06:08:00 2017-03-01 2.174752 \n",
+ "\n",
+ " travel_time2 \n",
+ "180 2.174752 \n",
+ "181 2.174752 \n",
+ "182 2.174752 \n",
+ "183 2.174752 \n",
+ "184 2.174752 "
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = df2\n",
+ "df['travel_time2'] = df['travel_time']\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "多个月统计-季节性变化"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def date_trend(group):\n",
+ " tmp = group.groupby('date_hour').mean().reset_index()\n",
+ " \n",
+ " def nan_helper(y):\n",
+ " return np.isnan(y), lambda z: z.nonzero()[0]\n",
+ " \n",
+ " y = tmp['travel_time'].values\n",
+ " nans, x = nan_helper(y)\n",
+ " if group.link_ID.values[0] in ['3377906282328510514','3377906283328510514',\n",
+ " '4377906280784800514','9377906281555510514']:\n",
+ " tmp['date_trend'] = group['travel_time'].median()\n",
+ " else:\n",
+ " regr = linear_model.LinearRegression()\n",
+ " regr.fit(x(~nans).reshape(-1,1), y[~nans].reshape(-1,1))\n",
+ " tmp['date_trend'] = regr.predict(tmp.index.values.reshape(-1,1)).ravel()\n",
+ " group = pd.merge(group,tmp[['date_trend','date_hour']], on='date_hour',how='left')\n",
+ " return group"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link_ID | \n",
+ " time_interval_begin | \n",
+ " date | \n",
+ " travel_time | \n",
+ " travel_time2 | \n",
+ " date_hour | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 180 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:00:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ " 2017-03-01-06 | \n",
+ "
\n",
+ " \n",
+ " 181 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:02:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ " 2017-03-01-06 | \n",
+ "
\n",
+ " \n",
+ " 182 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:04:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ " 2017-03-01-06 | \n",
+ "
\n",
+ " \n",
+ " 183 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:06:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ " 2017-03-01-06 | \n",
+ "
\n",
+ " \n",
+ " 184 | \n",
+ " 4377906289869500514 | \n",
+ " 2017-03-01 06:08:00 | \n",
+ " 2017-03-01 | \n",
+ " 2.174752 | \n",
+ " 2.174752 | \n",
+ " 2017-03-01-06 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID time_interval_begin date travel_time \\\n",
+ "180 4377906289869500514 2017-03-01 06:00:00 2017-03-01 2.174752 \n",
+ "181 4377906289869500514 2017-03-01 06:02:00 2017-03-01 2.174752 \n",
+ "182 4377906289869500514 2017-03-01 06:04:00 2017-03-01 2.174752 \n",
+ "183 4377906289869500514 2017-03-01 06:06:00 2017-03-01 2.174752 \n",
+ "184 4377906289869500514 2017-03-01 06:08:00 2017-03-01 2.174752 \n",
+ "\n",
+ " travel_time2 date_hour \n",
+ "180 2.174752 2017-03-01-06 \n",
+ "181 2.174752 2017-03-01-06 \n",
+ "182 2.174752 2017-03-01-06 \n",
+ "183 2.174752 2017-03-01-06 \n",
+ "184 2.174752 2017-03-01-06 "
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 基于小时进行预测,如果基于整体预测,结果可能不准确定,我们先算小时对结果的影响\n",
+ "df['date_hour'] = df.time_interval_begin.map(lambda x: x.strftime('%Y-%m-%d-%H'))\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "道路每小时通行时间的回归结果\n",
+ "
\n",
+ "左图:回归预测,蓝色线是回归线,红色是时间\n",
+ "右图:对某几个道路,直接用中位数预测"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " link_ID | \n",
+ " time_interval_begin | \n",
+ " date | \n",
+ " travel_time | \n",
+ " travel_time2 | \n",
+ " date_hour | \n",
+ " date_trend | \n",
+ "
\n",
+ " \n",
+ " link_ID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 3377906280028510514 | \n",
+ " 0 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:00:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2017-03-01-06 | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:02:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2017-03-01-06 | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:04:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2017-03-01-06 | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:06:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2017-03-01-06 | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:08:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2017-03-01-06 | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID time_interval_begin date \\\n",
+ "link_ID \n",
+ "3377906280028510514 0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 \n",
+ " 1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 \n",
+ " 2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 \n",
+ " 3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 \n",
+ " 4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 \n",
+ "\n",
+ " travel_time travel_time2 date_hour date_trend \n",
+ "link_ID \n",
+ "3377906280028510514 0 NaN NaN 2017-03-01-06 1.960745 \n",
+ " 1 NaN NaN 2017-03-01-06 1.960745 \n",
+ " 2 NaN NaN 2017-03-01-06 1.960745 \n",
+ " 3 NaN NaN 2017-03-01-06 1.960745 \n",
+ " 4 NaN NaN 2017-03-01-06 1.960745 "
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = df.groupby('link_ID').apply(date_trend)\n",
+ "df.head()"
+ ]
+ },
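+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A minimal sketch (matplotlib assumed) of how the plot described above could be reproduced for a single link: red dots are the hourly mean travel_time, the blue line is the fitted date_trend."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: hourly mean travel_time vs. fitted date_trend for one example link\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "link = df['link_ID'].iloc[0]\n",
+ "one = df[df['link_ID'] == link]\n",
+ "hourly = one.groupby('date_hour')[['travel_time', 'date_trend']].mean().reset_index()\n",
+ "\n",
+ "plt.figure(figsize=(12, 4))\n",
+ "plt.plot(hourly.index, hourly['travel_time'], 'r.', label='hourly mean travel_time')\n",
+ "plt.plot(hourly.index, hourly['date_trend'], 'b-', label='fitted date_trend')\n",
+ "plt.legend()\n",
+ "plt.show()"
+ ]
+ },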
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "蓝线回归得到的值存在 df['date trend']里,此时 travel_time 就更新为 df['travel_time']= df['travel_time']-df['date_trend'],表示date_trend作为大的趋势已经被线性回归决定了,剩下的就是研究这个残差了,之后训练和预测都是基于残差,最后用预测出来的残差加上相应的date_trend即可得到需要的预测值"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link_ID | \n",
+ " time_interval_begin | \n",
+ " date | \n",
+ " travel_time | \n",
+ " travel_time2 | \n",
+ " date_trend | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:00:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:02:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:04:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:06:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3377906280028510514 | \n",
+ " 2017-03-01 06:08:00 | \n",
+ " 2017-03-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.960745 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link_ID time_interval_begin date travel_time \\\n",
+ "0 3377906280028510514 2017-03-01 06:00:00 2017-03-01 NaN \n",
+ "1 3377906280028510514 2017-03-01 06:02:00 2017-03-01 NaN \n",
+ "2 3377906280028510514 2017-03-01 06:04:00 2017-03-01 NaN \n",
+ "3 3377906280028510514 2017-03-01 06:06:00 2017-03-01 NaN \n",
+ "4 3377906280028510514 2017-03-01 06:08:00 2017-03-01 NaN \n",
+ "\n",
+ " travel_time2 date_trend \n",
+ "0 NaN 1.960745 \n",
+ "1 NaN 1.960745 \n",
+ "2 NaN 1.960745 \n",
+ "3 NaN 1.960745 \n",
+ "4 NaN 1.960745 "
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = df.drop(['date_hour','link_ID'],axis=1)\n",
+ "df = df.reset_index()\n",
+ "df = df.drop('level_1',axis=1)\n",
+ "df['travel_time'] = df['travel_time'] - df['date_trend']\n",
+ "df.head()"
+ ]
+ },
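+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As noted above, the model is trained on the residual. A minimal sketch of how a final prediction would be rebuilt on the original scale; `pred_residual` is a placeholder standing in for real model output."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: rebuild predictions on the original scale from predicted residuals\n",
+ "pred_residual = np.zeros(len(df))  # placeholder; in practice this comes from the trained model\n",
+ "pred_log = pred_residual + df['date_trend'].values  # add the fitted trend back\n",
+ "pred_travel_time = np.expm1(pred_log)  # invert the earlier log1p transform\n",
+ "pred_travel_time[:5]"
+ ]
+ },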
{
"cell_type": "code",
"execution_count": null,