diff --git a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/建模预测.ipynb b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/建模预测.ipynb deleted file mode 100644 index 79189bf..0000000 --- a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/建模预测.ipynb +++ /dev/null @@ -1,871 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "D:\\Anaconda3\\lib\\site-packages\\sklearn\\externals\\joblib\\__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n", - " warnings.warn(msg, category=DeprecationWarning)\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import os\n", - "from scipy.interpolate import UnivariateSpline\n", - "from sklearn import linear_model\n", - "import xgboost as xgb\n", - "from sklearn.externals import joblib\n", - "from sklearn.utils import *" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "pd.set_option('display.max_rows',150)\n", - "pd.set_option('display.max_columns',500)\n", - "pd.set_option('display.width',1000)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
(garbled HTML rendering of df.head() removed; the identical text/plain output follows)
" - ], - "text/plain": [ - " link_ID date time_interval_begin travel_time imputation1 lagging1 lagging2 lagging3 lagging4 lagging5 length area vacation minute_series day_of_week day_of_week_en hour_en week_hour_1.0,1.0 week_hour_1.0,2.0 week_hour_1.0,3.0 week_hour_2.0,1.0 week_hour_2.0,2.0 week_hour_2.0,3.0 week_hour_3.0,1.0 week_hour_3.0,2.0 week_hour_3.0,3.0 links_num_2 links_num_3 links_num_4 links_num_5 width_3 width_6 width_9 width_12 width_15 link_ID_en\n", - "0 3377906280028510514 2017-03-01 2017-03-01 06:00:00 1.659311 True NaN NaN NaN NaN NaN 48 144 0.0 0.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47\n", - "1 3377906280028510514 2017-03-01 2017-03-01 06:02:00 1.664941 True 1.659311 NaN NaN NaN NaN 48 144 0.0 2.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47\n", - "2 3377906280028510514 2017-03-01 2017-03-01 06:04:00 1.671675 True 1.664941 1.659311 NaN NaN NaN 48 144 0.0 4.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47\n", - "3 3377906280028510514 2017-03-01 2017-03-01 06:06:00 1.676886 True 1.671675 1.664941 1.659311 NaN NaN 48 144 0.0 6.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47\n", - "4 3377906280028510514 2017-03-01 2017-03-01 06:08:00 1.682314 True 1.676886 1.671675 1.664941 1.659311 NaN 48 144 0.0 8.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Load the preprocessed feature data\n", - "df = pd.read_csv('data/trainning.txt', delimiter=';',parse_dates=['time_interval_begin'],dtype={'link_ID':object})\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['lagging5', 'lagging4', 'lagging3', 'lagging2', 'lagging1']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Lagged time-series features (travel time of the previous 5 intervals)\n", - "lagging = 5\n", - "lagging_feature = ['lagging%01d' % e for e in range(lagging, 0, -1)]\n", - "lagging_feature" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# NOTE: 'imputationl' is a typo for 'imputation1', so that flag stays in the\n", - "# feature set ('link_ID_int' likewise matches no column); see print(train_feature) below\n", - "base_feature = [x for x in df.columns.values.tolist() if x not in ['time_interval_begin',\n", - " 'link_ID','link_ID_int',\n", - " 'date','travel_time',\n", - " 'imputationl','minute_series',\n", - " 'area','hour_en',\n", - " 'day_of_week']]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "base_feature = [x for x in base_feature if x not in lagging_feature]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['imputation1', 'length', 'vacation', 'day_of_week_en', 'week_hour_1.0,1.0', 'week_hour_1.0,2.0', 'week_hour_1.0,3.0', 'week_hour_2.0,1.0', 'week_hour_2.0,2.0', 'week_hour_2.0,3.0', 'week_hour_3.0,1.0', 'week_hour_3.0,2.0', 'week_hour_3.0,3.0', 'links_num_2', 'links_num_3', 'links_num_4', 'links_num_5', 'width_3', 'width_6', 'width_9', 'width_12', 'width_15', 'link_ID_en', 'lagging5', 'lagging4', 'lagging3', 'lagging2', 'lagging1']\n" - ] - } - ], - "source": [ - "train_feature = list(base_feature)\n", - "train_feature.extend(lagging_feature)\n", - "valid_feature = list(base_feature)\n", - "valid_feature.extend(['minute_series', 'travel_time'])\n", - "print(train_feature)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "XGBoost training parameters:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - 
"outputs": [], - "source": [ - "params_grid = {\n", - " 'learning_rate':[0.05],\n", - " 'n_estimators':[100],\n", - " 'subsample':[0.6],\n", - " 'colsample_bytree':[0.6],\n", - " 'max_depth':[7],\n", - " 'min_child_weight':[1],\n", - " 'reg_alpha':[2],\n", - " 'gamma':[0]\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import ParameterGrid\n", - "grid = ParameterGrid(params_grid)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "def bucket_data(lines):\n", - " bucket = {}\n", - " for line in lines:\n", - " time_series = line[-2]\n", - " bucket[time_series] = []\n", - " for line in lines:\n", - " time_series, y1 = line[-2:]\n", - " line = np.delete(line, -2, axis=0)\n", - " bucket[time_series].append(line)\n", - " return bucket\n", - "\n", - "\n", - "def cross_valid(regressor, bucket, lagging):\n", - " valid_loss = []\n", - " last = [[] for i in range(len(bucket[list(bucket.keys())[0]]))]\n", - " for time_series in sorted(bucket.keys(), key=float):\n", - " if time_series >= 120:\n", - " if int(time_series) in range(120,120+lagging*2,2):\n", - " last = np.concatenate((last, np.array(bucket[time_series], dtype=float)[:, -1].reshape(-1,1)),axis=1)\n", - " else:\n", - " batch = np.array(bucket[time_series], dtype=float)\n", - " y = batch[:,-1]\n", - " batch = np.delete(batch, -1, axis=1)\n", - " batch = np.concatenate((batch, last), axis=1)\n", - " y_pre = regressor.predict(batch)\n", - " last = np.delete(last, 0, axis=1)\n", - " last = np.concatenate((last, y_pre.reshape(-1,1)),axis=1)\n", - " loss = np.mean(abs(np.expm1(y) - np.expm1(y_pre))/np.expm1(y))\n", - " valid_loss.append(loss)\n", - " return np.mean(valid_loss)\n", - "\n", - "\n", - "def mape_ln(y, d):\n", - " c = d.get_label()\n", - " result = np.sum(np.abs((np.expm1(y)-np.expm1(c))/np.expm1(c)))/len(c)\n", - " return 'mape', result\n", - "\n", - "\n", - "def submission(train_feature, regressor,df, file1,file2,file3,file4):\n", - " test_df = df.loc[((df['time_interval_begin'].dt.year==2017)&(df['time_interval_begin'].dt.month==7)\n", - " &(df['time_interval_begin'].dt.hour.isin([7,14,17]))\n", - " &(df['time_interval_begin'].dt.minute==58))].copy()\n", - " test_df['lagging5'] = test_df['lagging4']\n", - " test_df['lagging4'] = test_df['lagging3']\n", - " test_df['lagging3'] = test_df['lagging2']\n", - " test_df['lagging2'] = test_df['lagging1']\n", - " test_df['lagging1'] = test_df['travel_time']\n", - " with open(file1, 'w'):\n", - " pass\n", - " with open(file2, 'w'):\n", - " pass\n", - " with open(file3, 'w'):\n", - " pass\n", - " with open(file4, 'w'):\n", - " pass\n", - " for i in range(30):\n", - " test_X = test_df[train_feature]\n", - " y_prediction = regressor.predict(test_X.values)\n", - " test_df['lagging5'] = test_df['lagging4']\n", - " test_df['lagging4'] = test_df['lagging3']\n", - " test_df['lagging3'] = test_df['lagging2']\n", - " test_df['lagging2'] = test_df['lagging1']\n", - " test_df['lagging1'] = y_prediction\n", - " \n", - " test_df['prediction'] = np.expm1(y_prediction)\n", - " test_df['time_interval_begin'] = test_df['time_interval_begin']+pd.DateOffset(minutes=2)\n", - " test_df['time_interval'] = test_df['time_interval_begin'].map(\n", - " lambda x: '[' + str(x)+','+str(x+pd.DateOffset(minutes=2))+')')\n", - " test_df.time_interval = test_df.time_interval.astype(object)\n", - " if i < 7:\n", - " 
test_df[['link_ID','date','time_interval','prediction']].to_csv(file1,mode='a',\n", - " header=False,\n", - " index=False,\n", - " sep=';')\n", - " elif (7 <= i) and (i < 14):\n", - " test_df[['link_ID','date','time_interval','prediction']].to_csv(file2,mode='a',\n", - " header=False,\n", - " index=False,\n", - " sep=';')\n", - " elif (14 <= i) and (i < 22):\n", - " # prediction steps 14-21 append to file3\n", - " test_df[['link_ID','date','time_interval','prediction']].to_csv(file3,mode='a',\n", - " header=False,\n", - " index=False,\n", - " sep=';')\n", - " else:\n", - " test_df[['link_ID','date','time_interval','prediction']].to_csv(file4,mode='a',\n", - " header=False,\n", - " index=False,\n", - " sep=';')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Training module" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "# Fit on the training blocks with early stopping on a random 10% split, then\n", - "# score by rolling prediction (cross_valid) over the held-out date block\n", - "def fit_evaluate(df, df_test, params):\n", - " df = df.dropna()\n", - " X = df[train_feature].values\n", - " y = df['travel_time'].values\n", - " \n", - " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)\n", - " \n", - " df_test = df_test[valid_feature].values\n", - " valid_data = bucket_data(df_test)\n", - " \n", - " eval_set = [(X_test, y_test)]\n", - " regressor = xgb.XGBRegressor(learning_rate=params['learning_rate'],\n", - " n_estimators=params['n_estimators'],\n", - " booster='gbtree', objective='reg:linear', # deprecated alias of 'reg:squarederror'\n", - " n_jobs=-1,subsample=params['subsample'],\n", - " colsample_bytree=params['colsample_bytree'],\n", - " random_state=0,max_depth=params['max_depth'],\n", - " gamma=params['gamma'],\n", - " min_child_weight=params['min_child_weight'],\n", - " reg_alpha=params['reg_alpha'])\n", - " regressor.fit(X_train,y_train,verbose=False,early_stopping_rounds=10,eval_set=eval_set)\n", - " return regressor, cross_valid(regressor, valid_data, lagging=lagging), regressor.best_iteration,regressor.best_score" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "# Blocked 5-fold cross-validation: March-June is cut into five consecutive\n", - "# date ranges; each fold trains on four blocks and validates on the fifth\n", - "def train(df, params, best, vis=False):\n", - " train1 = df.loc[df['time_interval_begin'] <= pd.to_datetime('2017-03-24')]\n", - " train2 = df.loc[\n", - " (df['time_interval_begin']>pd.to_datetime('2017-03-24'))&(\n", - " df['time_interval_begin'] <= pd.to_datetime('2017-04-18'))]\n", - " train3 = df.loc[\n", - " (df['time_interval_begin']>pd.to_datetime('2017-04-18'))&(\n", - " df['time_interval_begin'] <= pd.to_datetime('2017-05-12'))]\n", - " train4 = df.loc[\n", - " (df['time_interval_begin']>pd.to_datetime('2017-05-12'))&(\n", - " df['time_interval_begin'] <= pd.to_datetime('2017-06-06'))]\n", - " train5 = df.loc[\n", - " (df['time_interval_begin']>pd.to_datetime('2017-06-06'))&(\n", - " df['time_interval_begin'] <= pd.to_datetime('2017-06-30'))]\n", - " \n", - " regressor, loss1, best_iteration1,best_score1 = fit_evaluate(pd.concat([train1,\n", - " train2,\n", - " train3,\n", - " train4]),train5,\n", - " params)\n", - " print(best_iteration1,best_score1,loss1)\n", - " \n", - " regressor, loss2, best_iteration2,best_score2 = fit_evaluate(pd.concat([train1,\n", - " train2,\n", - " train3,\n", - " train5]),train4,\n", - " params) \n", - " print(best_iteration2,best_score2,loss2)\n", - " \n", - " regressor, loss3, best_iteration3,best_score3 = fit_evaluate(pd.concat([train1,\n", - " train2,\n", - " train4,\n", - " train5]),train3,\n", - " params) \n", - " print(best_iteration3,best_score3,loss3) \n", - "\n", - " regressor, 
loss4, best_iteration4,best_score4 = fit_evaluate(pd.concat([train1,\n", - " train3,\n", - " train4,\n", - " train5]),train2,\n", - " params) \n", - " print(best_iteration4,best_score4,loss4) \n", - "\n", - " regressor, loss5, best_iteration5,best_score5 = fit_evaluate(pd.concat([train2,\n", - " train3,\n", - " train4,\n", - " train5]),train1,\n", - " params)\n", - " print(best_iteration5,best_score5,loss5) \n", - " \n", - " loss = [loss1,loss2, loss3, loss4, loss5]\n", - " params['loss_std'] = np.std(loss)\n", - " params['loss'] = str(loss)\n", - " params['mean_loss'] = np.mean(loss)\n", - " params['n_estimators'] = str([best_iteration1, best_iteration2, best_iteration3,\n", - " best_iteration4, best_iteration5])\n", - " params['best_score'] = str([best_score1, best_score2, best_score3,\n", - " best_score4, best_score5])\n", - " \n", - " print(str(params))\n", - " if np.mean(loss) <= best:\n", - " best = np.mean(loss)\n", - " print('best with:' + str(params))\n", - " return best" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[22:07:01] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n", - "99 0.231729 0.09787323564628972\n", - "[22:12:48] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n", - "99 0.211948 0.22588986922596394\n", - "[22:18:32] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n", - "99 0.207832 0.269828138777363\n", - "[22:24:17] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n", - "99 0.205743 0.27878690843594917\n", - "[22:29:46] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n", - "99 0.206546 0.2825731100341743\n", - "{'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': '[99, 99, 99, 99, 99]', 'reg_alpha': 2, 'subsample': 0.6, 'loss_std': 0.06956988861011186, 'loss': '[0.09787323564628972, 0.22588986922596394, 0.269828138777363, 0.27878690843594917, 0.2825731100341743]', 'mean_loss': 0.23099025242394805, 'best_score': '[0.231729, 0.211948, 0.207832, 0.205743, 0.206546]'}\n", - "best with:{'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': '[99, 99, 99, 99, 99]', 'reg_alpha': 2, 'subsample': 0.6, 'loss_std': 0.06956988861011186, 'loss': '[0.09787323564628972, 0.22588986922596394, 0.269828138777363, 0.27878690843594917, 0.2825731100341743]', 'mean_loss': 0.23099025242394805, 'best_score': '[0.231729, 0.211948, 0.207832, 0.205743, 0.206546]'}\n" - ] - } - ], - "source": [ - "best = 1\n", - "for params in grid:\n", - " best = train(df, params, best)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "submit_params = {\n", - " 'learning_rate':0.05,\n", - " 'n_estimators':100,\n", - " 'subsample':0.6,\n", - " 'colsample_bytree':0.6,\n", - " 'max_depth':7,\n", - " 'min_child_weight':1,\n", - " 'reg_alpha':2,\n", - " 'gamma':0\n", - "}" - ] - }, - { - "cell_type": 
"code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "def xgboost_submit(df, params):\n", - " train_df = df.loc[df['time_interval_begin']