From a95987af5147a0f082a35d7008acde8b1239000d Mon Sep 17 00:00:00 2001 From: benjas <909336740@qq.com> Date: Mon, 7 Dec 2020 22:47:25 +0800 Subject: [PATCH] =?UTF-8?q?Create=20=E5=BB=BA=E6=A8=A1=E9=A2=84=E6=B5=8B-c?= =?UTF-8?q?heckpoint.ipynb?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../建模预测-checkpoint.ipynb | 560 ++++++++++++++++++ 1 file changed, 560 insertions(+) create mode 100644 机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/建模预测-checkpoint.ipynb diff --git a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/建模预测-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/建模预测-checkpoint.ipynb new file mode 100644 index 0000000..9f83756 --- /dev/null +++ b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/建模预测-checkpoint.ipynb @@ -0,0 +1,560 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from scipy.interpolate import UnivariateSpline\n", + "from sklearn import linear_model\n", + "import xgboost as xgb\n", + "from sklearn.utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_rows',150)\n", + "pd.set_option('display.max_columns',500)\n", + "pd.set_option('display.width',1000)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
link_IDdatetime_interval_begintravel_timeimputationllagging1lagging2lagging3lagging4lagging5lengthareavacationminute_seriesday_of_weekday_of_week_enhour_enweek_hour_1.0,1.0week_hour_1.0,2.0week_hour_1.0,3.0week_hour_2.0,1.0week_hour_2.0,2.0week_hour_2.0,3.0week_hour_3.0,1.0week_hour_3.0,2.0week_hour_3.0,3.0links_num_2links_num_3links_num_4links_num_5width_3width_6width_9width_12width_15link_ID_en
033779062800285105142017-03-012017-03-01 06:00:001.659311TrueNaNNaNNaNNaNNaN481440.00.031.01.010000000010001000047
133779062800285105142017-03-012017-03-01 06:02:001.664941True1.659311NaNNaNNaNNaN481440.02.031.01.010000000010001000047
233779062800285105142017-03-012017-03-01 06:04:001.671675True1.6649411.659311NaNNaNNaN481440.04.031.01.010000000010001000047
333779062800285105142017-03-012017-03-01 06:06:001.676886True1.6716751.6649411.659311NaNNaN481440.06.031.01.010000000010001000047
433779062800285105142017-03-012017-03-01 06:08:001.682314True1.6768861.6716751.6649411.659311NaN481440.08.031.01.010000000010001000047
\n", + "
" + ], + "text/plain": [ + " link_ID date time_interval_begin travel_time imputationl lagging1 lagging2 lagging3 lagging4 lagging5 length area vacation minute_series day_of_week day_of_week_en hour_en week_hour_1.0,1.0 week_hour_1.0,2.0 week_hour_1.0,3.0 week_hour_2.0,1.0 week_hour_2.0,2.0 week_hour_2.0,3.0 week_hour_3.0,1.0 week_hour_3.0,2.0 week_hour_3.0,3.0 links_num_2 links_num_3 links_num_4 links_num_5 width_3 width_6 width_9 width_12 width_15 link_ID_en\n", + "0 3377906280028510514 2017-03-01 2017-03-01 06:00:00 1.659311 True NaN NaN NaN NaN NaN 48 144 0.0 0.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47\n", + "1 3377906280028510514 2017-03-01 2017-03-01 06:02:00 1.664941 True 1.659311 NaN NaN NaN NaN 48 144 0.0 2.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47\n", + "2 3377906280028510514 2017-03-01 2017-03-01 06:04:00 1.671675 True 1.664941 1.659311 NaN NaN NaN 48 144 0.0 4.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47\n", + "3 3377906280028510514 2017-03-01 2017-03-01 06:06:00 1.676886 True 1.671675 1.664941 1.659311 NaN NaN 48 144 0.0 6.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47\n", + "4 3377906280028510514 2017-03-01 2017-03-01 06:08:00 1.682314 True 1.676886 1.671675 1.664941 1.659311 NaN 48 144 0.0 8.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 读取处理好的特征数据\n", + "df = pd.read_csv('com_trainning.txt', delimiter=';',parse_dates=['time_interval_begin'],dtype={'link_ID':object})\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['lagging5', 'lagging4', 'lagging3', 'lagging2', 'lagging1']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 时间序列特征\n", + "lagging = 5\n", + "lagging_feature = ['lagging%01d' % e for e in range(lagging, 0, -1)]\n", + "lagging_feature" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "base_feature = [x for x in df.columns.values.tolist() if x not in ['time_interval_begin',\n", + " 'link_ID','link_ID_int',\n", + " 'date','travel_time',\n", + " 'imputationl','minute_series',\n", + " 'area','hour_en',\n", + " 'day_of_week']]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "base_feature = [x for x in base_feature if x not in lagging_feature]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['length', 'vacation', 'day_of_week_en', 'week_hour_1.0,1.0', 'week_hour_1.0,2.0', 'week_hour_1.0,3.0', 'week_hour_2.0,1.0', 'week_hour_2.0,2.0', 'week_hour_2.0,3.0', 'week_hour_3.0,1.0', 'week_hour_3.0,2.0', 'week_hour_3.0,3.0', 'links_num_2', 'links_num_3', 'links_num_4', 'links_num_5', 'width_3', 'width_6', 'width_9', 'width_12', 'width_15', 'link_ID_en', 'lagging5', 'lagging4', 'lagging3', 'lagging2', 'lagging1']\n" + ] + } + ], + "source": [ + "train_feature = list(base_feature)\n", + "train_feature.extend(lagging_feature)\n", + "valid_feature = list(base_feature)\n", + "valid_feature.extend(['minute_series', 'travel_time'])\n", + "print(train_feature)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "xgboost训练参数:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + 
"source": [ + "params_grid = {\n", + " 'learning_rate':[0.05],\n", + " 'n_estimators':[100],\n", + " 'subsample':[0.6],\n", + " 'colsample_bytree':[0.6],\n", + " 'max_depth':[7],\n", + " 'min_child_weight':[1],\n", + " 'reg_alpha':[2],\n", + " 'gamma':[0]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import ParameterGrid\n", + "grid = ParameterGrid(params_grid)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "训练模块" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import cross_validate\n", + "def fit_evaluate(df, df_test, params):\n", + " df = df.dropna()\n", + " X = df[train_feature].values\n", + " y = df['travel_time'].values\n", + " \n", + " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)\n", + " \n", + " df_test = df_test[valid_feature].values\n", + " valid_data = bucket_data(df_test)\n", + " \n", + " eval_set = [(X_test, y_test)]\n", + " regressor = xgb.XGBRegressor(learning_rate=params['learning_rate'],\n", + " n_estimators=params['n_estimators'],\n", + " booster='gbtree', objective='reg:linear',\n", + " n_jobs=-1,subsample=params['subsample'],\n", + " colsample_bytree=params['colsample_bytree'],\n", + " random_state=0,max_depth=params['max_depth'],\n", + " gamma=params['gamma'],\n", + " min_child_weight=params['min_child_weight'],\n", + " reg_alpha=params['reg_alpha'])\n", + " regressor.fit(X_train,y_train,verbose=False,early_stopping_rounds=10,eval_set=eval_set)\n", + " return regressor, cross_validate(regressor, valid_data, lagging=lagging), regressor.best_iteration,regressor.best_score" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def train(df, params, best, vis=False):\n", + " train1 = df.loc[df['time_interval_begin'] <= pd.to_datetime('2017-03-24')]\n", + " train2 = df.loc[\n", + " (df['time_interval_begin']>pd.to_datetime('2017-03-24'))&(\n", + " df['time_interval_begin'] <= pd.to_datetime('2017-04-18'))]\n", + " train3 = df.loc[\n", + " (df['time_interval_begin']>pd.to_datetime('2017-04-18'))&(\n", + " df['time_interval_begin'] <= pd.to_datetime('2017-05-12'))]\n", + " train4 = df.loc[\n", + " (df['time_interval_begin']>pd.to_datetime('2017-05-12'))&(\n", + " df['time_interval_begin'] <= pd.to_datetime('2017-06-06'))]\n", + " train5 = df.loc[\n", + " (df['time_interval_begin']>pd.to_datetime('2017-06-06'))&(\n", + " df['time_interval_begin'] <= pd.to_datetime('2017-06-30'))]\n", + " \n", + " regressor, loss1, best_iteration1,best_score1 = fit_evaluate(pd.concat([train1,\n", + " train2,\n", + " train3,\n", + " train4]),train5,\n", + " params)\n", + " print(best_iteration1,best_score1,loss1)\n", + " \n", + " regressor, loss2, best_iteration2,best_score2 = fit_evaluate(pd.concat([train1,\n", + " train2,\n", + " train3,\n", + " train5]),train4,\n", + " params) \n", + " print(best_iteration2,best_score2,loss2)\n", + " \n", + " regressor, loss3, best_iteration3,best_score3 = fit_evaluate(pd.concat([train1,\n", + " train2,\n", + " train4,\n", + " train5]),train3,\n", + " params) \n", + " print(best_iteration3,best_score3,loss3) \n", + "\n", + " regressor, loss4, best_iteration4,best_score4 = fit_evaluate(pd.concat([train1,\n", + " train3,\n", + " train4,\n", + " train5]),train2,\n", + " params) \n", + " print(best_iteration4,best_score4,loss4) \n", + 
"\n", + " regressor, loss5, best_iteration5,best_score5 = fit_evaluate(pd.concat([train2,\n", + " train3,\n", + " train4,\n", + " train5]),train1,\n", + " params)\n", + " print(best_iteration5,best_score5,loss5) \n", + " \n", + " loss = [loss1,loss2, loss3, loss4, loss5]\n", + " params['loss_std'] = np.std(loss)\n", + " params['loss'] = str(loss)\n", + " params['mean_loss'] = np.mean(loss)\n", + " params['n_estimators'] = str([best_iteration1, best_iteration2, best_iteration3,\n", + " best_iteration4, best_iteration5])\n", + " params['best_score'] = str([best_score1, best_score2, best_score3,\n", + " best_score4, best_score5])\n", + " \n", + " print(str(params))\n", + " if np.mean(loss) <= best:\n", + " best = np.mean(loss)\n", + " print('best with:' + str(params))\n", + " return best" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}