diff --git a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/道路通行时间预测-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/道路通行时间预测-checkpoint.ipynb index 2fd6442..bfa7f57 100644 --- a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/道路通行时间预测-checkpoint.ipynb +++ b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/道路通行时间预测-checkpoint.ipynb @@ -1,6 +1,374 @@ { - "cells": [], - "metadata": {}, + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 数据展示" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from scipy.interpolate import UnivariateSpline\n", + "from sklearn import linear_model\n", + "import xgboost as xgb\n", + "# from ultis import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "道路通行时间:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDdatetime_intervaltravel_time
043779062834226005142017-05-06[2017-05-06 11:04:00,2017-05-06 11:06:00)3.0
133779062894345105142017-05-06[2017-05-06 10:42:00,2017-05-06 10:44:00)1.0
233779062859345105142017-05-06[2017-05-06 11:56:00,2017-05-06 11:58:00)35.2
333779062859345105142017-05-06[2017-05-06 17:46:00,2017-05-06 17:48:00)26.2
433779062879345105142017-05-06[2017-05-06 10:52:00,2017-05-06 10:54:00)10.4
\n", + "
" + ], + "text/plain": [ + " link_ID date time_interval \\\n", + "0 4377906283422600514 2017-05-06 [2017-05-06 11:04:00,2017-05-06 11:06:00) \n", + "1 3377906289434510514 2017-05-06 [2017-05-06 10:42:00,2017-05-06 10:44:00) \n", + "2 3377906285934510514 2017-05-06 [2017-05-06 11:56:00,2017-05-06 11:58:00) \n", + "3 3377906285934510514 2017-05-06 [2017-05-06 17:46:00,2017-05-06 17:48:00) \n", + "4 3377906287934510514 2017-05-06 [2017-05-06 10:52:00,2017-05-06 10:54:00) \n", + "\n", + " travel_time \n", + "0 3.0 \n", + "1 1.0 \n", + "2 35.2 \n", + "3 26.2 \n", + "4 10.4 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('new_gy_contest_traveltime_training_data_second.txt',delimiter=';',dtype={'link_ID':object})\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "time_interval时间间隔,两分钟为单位\n", + "\n", + "travel_time平均通行时间" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "道理长宽情况:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDlengthwidthlink_class
043779062898695005145731
1437790628459480051424791
2437790628942580051419431
3437790628452580051483931
4437790628442260051455121
\n", + "
" + ], + "text/plain": [ + " link_ID length width link_class\n", + "0 4377906289869500514 57 3 1\n", + "1 4377906284594800514 247 9 1\n", + "2 4377906289425800514 194 3 1\n", + "3 4377906284525800514 839 3 1\n", + "4 4377906284422600514 55 12 1" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "link_df = pd.read_csv('gy_contest_link_info.txt',delimiter=';',dtype={'link_ID':object})\n", + "link_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "length长度 width宽度 link_class类别" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "道路之间连接情况:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDin_linksout_links
0437790628986950051411
1437790628459480051411
2437790628942580051411
3437790628452580051411
4437790628442260051421
\n", + "
" + ], + "text/plain": [ + " link_ID in_links out_links\n", + "0 4377906289869500514 1 1\n", + "1 4377906284594800514 1 1\n", + "2 4377906289425800514 1 1\n", + "3 4377906284525800514 1 1\n", + "4 4377906284422600514 2 1" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "link_tops = pd.read_csv('gy_contest_link_top_update.txt',delimiter=',',dtype={'link_ID':object})\n", + "link_tops.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 任务:预测未来一个月平均通行结果,每两分钟一次\n", + "回归任务\n", + "\n", + "构建时间序列,基于前几天或者前几十天的数据预测" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, "nbformat": 4, "nbformat_minor": 2 } diff --git a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/assets/20201202211044.png b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/assets/20201202211044.png new file mode 100644 index 0000000..b94a263 Binary files /dev/null and b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/assets/20201202211044.png differ diff --git a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/道路通行时间预测.ipynb b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/道路通行时间预测.ipynb index bfa7f57..e425e2f 100644 --- a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/道路通行时间预测.ipynb +++ b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/道路通行时间预测.ipynb @@ -342,6 +342,879 @@ "构建时间序列,基于前几天或者前几十天的数据预测" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 数据集筛选与标签转换\n", + "数据集中有些数据可能由于异常情况导致不适合建模(堵车,维修等)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDdatetime_intervaltravel_time
043779062834226005142017-05-06[2017-05-06 11:04:00,2017-05-06 11:06:00)3.0
133779062894345105142017-05-06[2017-05-06 10:42:00,2017-05-06 10:44:00)1.0
233779062859345105142017-05-06[2017-05-06 11:56:00,2017-05-06 11:58:00)35.2
333779062859345105142017-05-06[2017-05-06 17:46:00,2017-05-06 17:48:00)26.2
433779062879345105142017-05-06[2017-05-06 10:52:00,2017-05-06 10:54:00)10.4
\n", + "
" + ], + "text/plain": [ + " link_ID date time_interval \\\n", + "0 4377906283422600514 2017-05-06 [2017-05-06 11:04:00,2017-05-06 11:06:00) \n", + "1 3377906289434510514 2017-05-06 [2017-05-06 10:42:00,2017-05-06 10:44:00) \n", + "2 3377906285934510514 2017-05-06 [2017-05-06 11:56:00,2017-05-06 11:58:00) \n", + "3 3377906285934510514 2017-05-06 [2017-05-06 17:46:00,2017-05-06 17:48:00) \n", + "4 3377906287934510514 2017-05-06 [2017-05-06 10:52:00,2017-05-06 10:54:00) \n", + "\n", + " travel_time \n", + "0 3.0 \n", + "1 1.0 \n", + "2 35.2 \n", + "3 26.2 \n", + "4 10.4 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDdatetime_intervaltravel_timetime_interval_begin
043779062834226005142017-05-06[2017-05-06 11:04:00,2017-05-06 11:06:00)3.02017-05-06 11:04:00
133779062894345105142017-05-06[2017-05-06 10:42:00,2017-05-06 10:44:00)1.02017-05-06 10:42:00
233779062859345105142017-05-06[2017-05-06 11:56:00,2017-05-06 11:58:00)35.22017-05-06 11:56:00
333779062859345105142017-05-06[2017-05-06 17:46:00,2017-05-06 17:48:00)26.22017-05-06 17:46:00
433779062879345105142017-05-06[2017-05-06 10:52:00,2017-05-06 10:54:00)10.42017-05-06 10:52:00
\n", + "
" + ], + "text/plain": [ + " link_ID date time_interval \\\n", + "0 4377906283422600514 2017-05-06 [2017-05-06 11:04:00,2017-05-06 11:06:00) \n", + "1 3377906289434510514 2017-05-06 [2017-05-06 10:42:00,2017-05-06 10:44:00) \n", + "2 3377906285934510514 2017-05-06 [2017-05-06 11:56:00,2017-05-06 11:58:00) \n", + "3 3377906285934510514 2017-05-06 [2017-05-06 17:46:00,2017-05-06 17:48:00) \n", + "4 3377906287934510514 2017-05-06 [2017-05-06 10:52:00,2017-05-06 10:54:00) \n", + "\n", + " travel_time time_interval_begin \n", + "0 3.0 2017-05-06 11:04:00 \n", + "1 1.0 2017-05-06 10:42:00 \n", + "2 35.2 2017-05-06 11:56:00 \n", + "3 26.2 2017-05-06 17:46:00 \n", + "4 10.4 2017-05-06 10:52:00 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#截取开始时间\n", + "df['time_interval_begin'] = pd.to_datetime(df['time_interval'].map(lambda x: x[1:20]))\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "标签转换\n", + "\n", + "我们希望是右边的图,越是正态分布,越好预测" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.drop(['time_interval'],axis=1)\n", + "df['travel_time'] = np.log1p(df['travel_time'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "筛选方法" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "#剔除一些离群点:如travel_time突然有几百分钟的时间,可能是意外、道路维修或者统计错误\n", + "def quantile_clip(group):\n", + " # 选择一定的百分比过滤\n", + " group[group < group.quantile(.05)] = group.quantile(.05)\n", + " group[group > group.quantile(.95)] = group.quantile(.95)\n", + " return group" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDdatetravel_timetime_interval_begin
043779062834226005142017-05-061.3862942017-05-06 11:04:00
133779062894345105142017-05-060.6931472017-05-06 10:42:00
233779062859345105142017-05-063.5890592017-05-06 11:56:00
333779062859345105142017-05-063.3032172017-05-06 17:46:00
433779062879345105142017-05-062.2512922017-05-06 10:52:00
\n", + "
" + ], + "text/plain": [ + " link_ID date travel_time time_interval_begin\n", + "0 4377906283422600514 2017-05-06 1.386294 2017-05-06 11:04:00\n", + "1 3377906289434510514 2017-05-06 0.693147 2017-05-06 10:42:00\n", + "2 3377906285934510514 2017-05-06 3.589059 2017-05-06 11:56:00\n", + "3 3377906285934510514 2017-05-06 3.303217 2017-05-06 17:46:00\n", + "4 3377906287934510514 2017-05-06 2.251292 2017-05-06 10:52:00" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#对每条道路(link_ID),每天执行(date)\n", + "df['travel_time'] = df.groupby(['link_ID','date'])['travel_time'].transform(quantile_clip)\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "#根据需求选择样本数据,比如预测高峰时刻,如早上6-8、中午下午13-18\n", + "df = df.loc[(df['time_interval_begin'].dt.hour.isin([6,7,8,13,14,15,16,17,18]))]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "#保存处理结果\n", + "df.to_csv('raw_data.txt',header=True,index=None,sep=';',mode='w')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 缺失值预处理" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDdatetravel_timetime_interval_begin
033779062859345105142017-05-063.3032172017-05-06 17:46:00
133779062879345105142017-05-061.8870702017-05-06 14:36:00
233779062876745105142017-05-061.9315212017-05-06 06:30:00
333779062878865105142017-05-063.6163092017-05-06 07:32:00
443779062837595005142017-05-062.1400662017-05-06 13:24:00
\n", + "
" + ], + "text/plain": [ + " link_ID date travel_time time_interval_begin\n", + "0 3377906285934510514 2017-05-06 3.303217 2017-05-06 17:46:00\n", + "1 3377906287934510514 2017-05-06 1.887070 2017-05-06 14:36:00\n", + "2 3377906287674510514 2017-05-06 1.931521 2017-05-06 06:30:00\n", + "3 3377906287886510514 2017-05-06 3.616309 2017-05-06 07:32:00\n", + "4 4377906283759500514 2017-05-06 2.140066 2017-05-06 13:24:00" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('raw_data.txt',delimiter=';',parse_dates=['time_interval_begin'],dtype={'link_ID':object})\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "如上第一行中,2017-05-06 17:46:00,那么是不是没有17:48、17:50,所以我们需要补充" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDlengthwidthlink_class
043779062898695005145731
1437790628459480051424791
2437790628942580051419431
3437790628452580051483931
4437790628442260051455121
\n", + "
" + ], + "text/plain": [ + " link_ID length width link_class\n", + "0 4377906289869500514 57 3 1\n", + "1 4377906284594800514 247 9 1\n", + "2 4377906289425800514 194 3 1\n", + "3 4377906284525800514 839 3 1\n", + "4 4377906284422600514 55 12 1" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "link_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatetimeIndex(['2017-03-01 00:00:00', '2017-03-01 00:02:00',\n", + " '2017-03-01 00:04:00', '2017-03-01 00:06:00',\n", + " '2017-03-01 00:08:00'],\n", + " dtype='datetime64[ns]', freq='2T')" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "date_range = pd.date_range('2017-03-01 00:00:00','2017-07-31 23:58:00',freq='2min')\n", + "date_range[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDtime_interval_begin
043779062898695005142017-03-01 00:00:00
143779062898695005142017-03-01 00:02:00
243779062898695005142017-03-01 00:04:00
343779062898695005142017-03-01 00:06:00
443779062898695005142017-03-01 00:08:00
\n", + "
" + ], + "text/plain": [ + " link_ID time_interval_begin\n", + "0 4377906289869500514 2017-03-01 00:00:00\n", + "1 4377906289869500514 2017-03-01 00:02:00\n", + "2 4377906289869500514 2017-03-01 00:04:00\n", + "3 4377906289869500514 2017-03-01 00:06:00\n", + "4 4377906289869500514 2017-03-01 00:08:00" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#给每个ID,做出每一个时刻\n", + "new_index = pd.MultiIndex.from_product([link_df['link_ID'].unique(),date_range],\n", + " names=['link_ID', 'time_interval_begin'])\n", + "new_df = pd.DataFrame(index=new_index).reset_index()\n", + "new_df.head() # 此时每个ID都有从2017-03-01 00:00:00到2017-03-71 23:58:00的时间间隔" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDtime_interval_begindatetravel_time
043779062898695005142017-03-01 00:00:00NaNNaN
143779062898695005142017-03-01 00:02:00NaNNaN
243779062898695005142017-03-01 00:04:00NaNNaN
343779062898695005142017-03-01 00:06:00NaNNaN
443779062898695005142017-03-01 00:08:00NaNNaN
\n", + "
" + ], + "text/plain": [ + " link_ID time_interval_begin date travel_time\n", + "0 4377906289869500514 2017-03-01 00:00:00 NaN NaN\n", + "1 4377906289869500514 2017-03-01 00:02:00 NaN NaN\n", + "2 4377906289869500514 2017-03-01 00:04:00 NaN NaN\n", + "3 4377906289869500514 2017-03-01 00:06:00 NaN NaN\n", + "4 4377906289869500514 2017-03-01 00:08:00 NaN NaN" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#原来的表里也有部分值,进行合并,出现大量缺失值\n", + "df2 = pd.merge(new_df, df,on=['link_ID','time_interval_begin'],how='left')\n", + "df2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "#筛选时间段数据\n", + "df2 = df2.loc[(df2['time_interval_begin'].dt.hour.isin([6,7,8,13,14,15,16,17,18]))]\n", + "df2 = df2.loc[~((df2['time_interval_begin'].dt.year == 2017) & \n", + " (df2['time_interval_begin'].dt.month == 7) & \n", + " (df2['time_interval_begin'].dt.hour.isin([8,15,18])))]\n", + "df2 = df2.loc[~((df2['time_interval_begin'].dt.year == 2017) & \n", + " (df2['time_interval_begin'].dt.month == 3) & \n", + " (df2['time_interval_begin'].dt.day == 31))]\n", + "\n", + "df2['date'] = df2['time_interval_begin'].dt.strftime('%Y-%m-%d')" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDtime_interval_begindatetravel_time
18043779062898695005142017-03-01 06:00:002017-03-012.174752
18143779062898695005142017-03-01 06:02:002017-03-012.174752
18243779062898695005142017-03-01 06:04:002017-03-012.174752
18343779062898695005142017-03-01 06:06:002017-03-012.174752
18443779062898695005142017-03-01 06:08:002017-03-012.174752
\n", + "
" + ], + "text/plain": [ + " link_ID time_interval_begin date travel_time\n", + "180 4377906289869500514 2017-03-01 06:00:00 2017-03-01 2.174752\n", + "181 4377906289869500514 2017-03-01 06:02:00 2017-03-01 2.174752\n", + "182 4377906289869500514 2017-03-01 06:04:00 2017-03-01 2.174752\n", + "183 4377906289869500514 2017-03-01 06:06:00 2017-03-01 2.174752\n", + "184 4377906289869500514 2017-03-01 06:08:00 2017-03-01 2.174752" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#保存中间结果\n", + "df2.to_csv('pre_trainning.txt',header=True,index=None,sep=';',mode='w')" + ] + }, { "cell_type": "code", "execution_count": null,