diff --git a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/建模预测-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/建模预测-checkpoint.ipynb index 9f83756..dc38cc5 100644 --- a/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/建模预测-checkpoint.ipynb +++ b/机器学习竞赛实战_优胜解决方案/智慧城市-道路通行时间预测/.ipynb_checkpoints/建模预测-checkpoint.ipynb @@ -2,15 +2,25 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "D:\\Anaconda3\\lib\\site-packages\\sklearn\\externals\\joblib\\__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n", + " warnings.warn(msg, category=DeprecationWarning)\n" + ] + } + ], "source": [ "import pandas as pd\n", "import numpy as np\n", "from scipy.interpolate import UnivariateSpline\n", "from sklearn import linear_model\n", "import xgboost as xgb\n", + "from sklearn.externals import joblib\n", "from sklearn.utils import *" ] }, @@ -27,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -55,7 +65,7 @@ " date\n", " time_interval_begin\n", " travel_time\n", - " imputationl\n", + " imputation1\n", " lagging1\n", " lagging2\n", " lagging3\n", @@ -290,7 +300,7 @@ "" ], "text/plain": [ - " link_ID date time_interval_begin travel_time imputationl lagging1 lagging2 lagging3 lagging4 lagging5 length area vacation minute_series day_of_week day_of_week_en hour_en week_hour_1.0,1.0 week_hour_1.0,2.0 week_hour_1.0,3.0 week_hour_2.0,1.0 week_hour_2.0,2.0 week_hour_2.0,3.0 week_hour_3.0,1.0 week_hour_3.0,2.0 week_hour_3.0,3.0 links_num_2 links_num_3 links_num_4 links_num_5 width_3 width_6 width_9 width_12 width_15 link_ID_en\n", + " link_ID date time_interval_begin travel_time imputation1 lagging1 lagging2 lagging3 lagging4 lagging5 length area vacation minute_series day_of_week day_of_week_en hour_en week_hour_1.0,1.0 week_hour_1.0,2.0 week_hour_1.0,3.0 week_hour_2.0,1.0 week_hour_2.0,2.0 week_hour_2.0,3.0 week_hour_3.0,1.0 week_hour_3.0,2.0 week_hour_3.0,3.0 links_num_2 links_num_3 links_num_4 links_num_5 width_3 width_6 width_9 width_12 width_15 link_ID_en\n", "0 3377906280028510514 2017-03-01 2017-03-01 06:00:00 1.659311 True NaN NaN NaN NaN NaN 48 144 0.0 0.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47\n", "1 3377906280028510514 2017-03-01 2017-03-01 06:02:00 1.664941 True 1.659311 NaN NaN NaN NaN 48 144 0.0 2.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47\n", "2 3377906280028510514 2017-03-01 2017-03-01 06:04:00 1.671675 True 1.664941 1.659311 NaN NaN NaN 48 144 0.0 4.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47\n", @@ -298,20 +308,20 @@ "4 3377906280028510514 2017-03-01 2017-03-01 06:08:00 1.682314 True 1.676886 1.671675 1.664941 1.659311 NaN 48 144 0.0 8.0 3 1.0 1.0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 47" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 读取处理好的特征数据\n", - "df = pd.read_csv('com_trainning.txt', delimiter=';',parse_dates=['time_interval_begin'],dtype={'link_ID':object})\n", + "df = pd.read_csv('trainning.txt', delimiter=';',parse_dates=['time_interval_begin'],dtype={'link_ID':object})\n", 
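The training file loaded above already contains the lag features (`lagging1` … `lagging5`) together with the encoded calendar and link attributes, so this notebook only consumes them. For orientation, here is a minimal sketch of how such per-link lags are typically derived from `travel_time` with a grouped shift; the column names match the `df.head()` output above, but the helper itself (`add_lagging_features`) is illustrative and not the project's actual preprocessing code, which lives in the feature-engineering notebook:

```python
import pandas as pd

def add_lagging_features(df: pd.DataFrame, n_lags: int = 5) -> pd.DataFrame:
    """Illustrative sketch: lagging1 = previous 2-minute interval, lagging2 = the one before, ..."""
    df = df.sort_values(['link_ID', 'time_interval_begin']).copy()
    for k in range(1, n_lags + 1):
        # shift within each link so one road segment never borrows history from another
        df[f'lagging{k}'] = df.groupby('link_ID')['travel_time'].shift(k)
    return df
```

A plain `shift` also crosses gaps between the modelled time windows, so the real pipeline may additionally reset lags at window boundaries; the NaN lag values visible in the first rows above are simply the warm-up rows of each series.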
"df.head()" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -320,7 +330,7 @@ "['lagging5', 'lagging4', 'lagging3', 'lagging2', 'lagging1']" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -334,7 +344,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -348,7 +358,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -357,14 +367,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['length', 'vacation', 'day_of_week_en', 'week_hour_1.0,1.0', 'week_hour_1.0,2.0', 'week_hour_1.0,3.0', 'week_hour_2.0,1.0', 'week_hour_2.0,2.0', 'week_hour_2.0,3.0', 'week_hour_3.0,1.0', 'week_hour_3.0,2.0', 'week_hour_3.0,3.0', 'links_num_2', 'links_num_3', 'links_num_4', 'links_num_5', 'width_3', 'width_6', 'width_9', 'width_12', 'width_15', 'link_ID_en', 'lagging5', 'lagging4', 'lagging3', 'lagging2', 'lagging1']\n" + "['imputation1', 'length', 'vacation', 'day_of_week_en', 'week_hour_1.0,1.0', 'week_hour_1.0,2.0', 'week_hour_1.0,3.0', 'week_hour_2.0,1.0', 'week_hour_2.0,2.0', 'week_hour_2.0,3.0', 'week_hour_3.0,1.0', 'week_hour_3.0,2.0', 'week_hour_3.0,3.0', 'links_num_2', 'links_num_3', 'links_num_4', 'links_num_5', 'width_3', 'width_6', 'width_9', 'width_12', 'width_15', 'link_ID_en', 'lagging5', 'lagging4', 'lagging3', 'lagging2', 'lagging1']\n" ] } ], @@ -385,7 +395,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -403,7 +413,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -411,6 +421,106 @@ "grid = ParameterGrid(params_grid)" ] }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "def bucket_data(lines):\n", + " bucket = {}\n", + " for line in lines:\n", + " time_series = line[-2]\n", + " bucket[time_series] = []\n", + " for line in lines:\n", + " time_series, y1 = line[-2:]\n", + " line = np.delete(line, -2, axis=0)\n", + " bucket[time_series].append(line)\n", + " return bucket\n", + "\n", + "\n", + "def cross_valid(regressor, bucket, lagging):\n", + " valid_loss = []\n", + " last = [[] for i in range(len(bucket[list(bucket.keys())[0]]))]\n", + " for time_series in sorted(bucket.keys(), key=float):\n", + " if time_series >= 120:\n", + " if int(time_series) in range(120,120+lagging*2,2):\n", + " last = np.concatenate((last, np.array(bucket[time_series], dtype=float)[:, -1].reshape(-1,1)),axis=1)\n", + " else:\n", + " batch = np.array(bucket[time_series], dtype=float)\n", + " y = batch[:,-1]\n", + " batch = np.delete(batch, -1, axis=1)\n", + " print(\"====================================\")\n", + " print(batch.shape, last.shape, type(time_series))\n", + " print(\"====================================\")\n", + " batch = np.concatenate((batch, last), axis=1)\n", + " y_pre = regressor.predict(batch)\n", + " last = np.delete(last, 0, axis=1)\n", + " last = np.concatenate((last, y_pre.reshape(-1,1)),axis=1)\n", + " loss = np.mean(abs(np.expm1(y) - np.expm1(y_pre))/np.expm1(y))\n", + " valid_loss.append(loss)\n", + " return np.mean(valid_loss)\n", + "\n", + "\n", + "def mape_ln(y, d):\n", + " c = d.get_label()\n", + " result = 
np.sum(np.abs((np.expm1(y)-np.expm1(c))/np.expm1(c)))/len(c)\n",
+    "    return 'mape', result\n",
+    "\n",
+    "\n",
+    "def submission(train_feature, regressor, df, file1, file2, file3, file4):\n",
+    "    test_df = df.loc[((df['time_interval_begin'].dt.year==2017)&(df['time_interval_begin'].dt.month==7)\n",
+    "                      &(df['time_interval_begin'].dt.hour.isin([7,14,17]))\n",
+    "                      &(df['time_interval_begin'].dt.minute==58))].copy()\n",
+    "    test_df['lagging5'] = test_df['lagging4']\n",
+    "    test_df['lagging4'] = test_df['lagging3']\n",
+    "    test_df['lagging3'] = test_df['lagging2']\n",
+    "    test_df['lagging2'] = test_df['lagging1']\n",
+    "    test_df['lagging1'] = test_df['travel_time']\n",
+    "    with open(file1, 'w'):\n",
+    "        pass\n",
+    "    with open(file2, 'w'):\n",
+    "        pass\n",
+    "    with open(file3, 'w'):\n",
+    "        pass\n",
+    "    with open(file4, 'w'):\n",
+    "        pass\n",
+    "    for i in range(30):\n",
+    "        test_X = test_df[train_feature]\n",
+    "        y_prediction = regressor.predict(test_X.values)\n",
+    "        test_df['lagging5'] = test_df['lagging4']\n",
+    "        test_df['lagging4'] = test_df['lagging3']\n",
+    "        test_df['lagging3'] = test_df['lagging2']\n",
+    "        test_df['lagging2'] = test_df['lagging1']\n",
+    "        test_df['lagging1'] = y_prediction\n",
+    "\n",
+    "        test_df['prediction'] = np.expm1(y_prediction)\n",
+    "        test_df['time_interval_begin'] = test_df['time_interval_begin'] + pd.DateOffset(minutes=2)\n",
+    "        test_df['time_interval'] = test_df['time_interval_begin'].map(\n",
+    "            lambda x: '[' + str(x) + ',' + str(x + pd.DateOffset(minutes=2)) + ')')\n",
+    "        test_df.time_interval = test_df.time_interval.astype(object)\n",
+    "        if i < 7:\n",
+    "            test_df[['link_ID','date','time_interval','prediction']].to_csv(\n",
+    "                file1, mode='a', header=False, index=False, sep=';')\n",
+    "        elif (7 <= i) and (i < 14):\n",
+    "            test_df[['link_ID','date','time_interval','prediction']].to_csv(\n",
+    "                file2, mode='a', header=False, index=False, sep=';')\n",
+    "        elif (14 <= i) and (i < 22):\n",
+    "            test_df[['link_ID','date','time_interval','prediction']].to_csv(\n",
+    "                file3, mode='a', header=False, index=False, sep=';')\n",
+    "        else:\n",
+    "            test_df[['link_ID','date','time_interval','prediction']].to_csv(\n",
+    "                file4, mode='a', header=False, index=False, sep=';')"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -420,11 +530,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from sklearn.model_selection import cross_validate\n",
+    "from sklearn.model_selection import train_test_split\n",
     "def fit_evaluate(df, df_test, params):\n",
     "    df = df.dropna()\n",
     "    X = df[train_feature].values\n",
@@ -446,12 +556,12 @@
     "                                 min_child_weight=params['min_child_weight'],\n",
     "                                 reg_alpha=params['reg_alpha'])\n",
     "    regressor.fit(X_train,y_train,verbose=False,early_stopping_rounds=10,eval_set=eval_set)\n",
-    "    return regressor, cross_validate(regressor, valid_data, lagging=lagging), regressor.best_iteration,regressor.best_score"
+    "    return regressor, cross_valid(regressor, valid_data, lagging=lagging), regressor.best_iteration,regressor.best_score"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
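Both `cross_valid` above and `submission` implement the same walk-forward idea: the model predicts one 2-minute step, the prediction is pushed back into the lag window (it becomes the new `lagging1` while `lagging5` drops out), and the next step is predicted from the updated window; errors and submitted values are mapped back with `np.expm1` because the target is modelled on a `log1p` scale. A compact, self-contained sketch of that loop — the function and argument names here are illustrative, not taken from the notebook, and the feature layout (static columns first, then `lagging5 … lagging1`) follows the `train_feature` list printed earlier:

```python
import numpy as np

def rolling_forecast(regressor, static_X, init_lags, n_steps):
    """Walk-forward multi-step prediction (illustrative sketch).

    static_X  : (n_links, n_static) non-lag features of the window being predicted
    init_lags : (n_links, 5) last observed log1p travel times, oldest first
                (i.e. ordered like lagging5 ... lagging1)
    """
    lags = init_lags.copy()
    preds = []
    for _ in range(n_steps):
        X = np.concatenate([static_X, lags], axis=1)   # same column order as train_feature
        y_pred = regressor.predict(X)                  # still on the log1p scale
        preds.append(np.expm1(y_pred))                 # back to the original travel-time scale
        # drop the oldest lag, append the fresh prediction as the newest lag
        lags = np.concatenate([lags[:, 1:], y_pred.reshape(-1, 1)], axis=1)
    return np.stack(preds, axis=1)                     # (n_links, n_steps)
```

Because every later step is predicted from earlier predictions, errors compound along the horizon (30 steps in `submission`), which is why `cross_valid` replays exactly the same feedback loop instead of scoring single-step predictions.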
@@ -523,17 +633,593 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[16:02:58] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n",
[... repeated per-step logging from cross_valid omitted: for every validation fold and every predicted 2-minute step the notebook prints the same "====" separator block with batch.shape, last.shape and type(time_series), e.g. (9504, 23) (9504, 5); the five folds have 9504, 9900, 9504, 9504 and 9108 rows against the 5-column lag window. Each fold then ends with one summary line: best_iteration, best_score, cross_valid MAPE ...]
+      "99 0.231729 0.09787323564628972\n",
+      "[16:08:54] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n",
+      "99 0.211948 0.22588986922596394\n",
+      "[16:14:56] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n",
+      "99 0.207832 0.269828138777363\n",
+      "[16:21:05] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n",
+      "99 0.205743 0.27878690843594917\n",
+      "[16:27:05] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n",
"====================================\n", + "(9108, 23) (9108, 5) \n", + "====================================\n", + "====================================\n", + "(9108, 23) (9108, 5) \n", + "====================================\n", + "====================================\n", + "(9108, 23) (9108, 5) \n", + "====================================\n", + "====================================\n", + "(9108, 23) (9108, 5) \n", + "====================================\n", + "====================================\n", + "(9108, 23) (9108, 5) \n", + "====================================\n", + "====================================\n", + "(9108, 23) (9108, 5) \n", + "====================================\n", + "====================================\n", + "(9108, 23) (9108, 5) \n", + "====================================\n", + "====================================\n", + "(9108, 23) (9108, 5) \n", + "====================================\n", + "====================================\n", + "(9108, 23) (9108, 5) \n", + "====================================\n", + "====================================\n", + "(9108, 23) (9108, 5) \n", + "====================================\n", + "====================================\n", + "(9108, 23) (9108, 5) \n", + "====================================\n", + "====================================\n", + "(9108, 23) (9108, 5) \n", + "====================================\n", + "====================================\n", + "(9108, 23) (9108, 5) \n", + "====================================\n", + "99 0.206546 0.2825731100341743\n", + "{'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': '[99, 99, 99, 99, 99]', 'reg_alpha': 2, 'subsample': 0.6, 'loss_std': 0.06956988861011186, 'loss': '[0.09787323564628972, 0.22588986922596394, 0.269828138777363, 0.27878690843594917, 0.2825731100341743]', 'mean_loss': 0.23099025242394805, 'best_score': '[0.231729, 0.211948, 0.207832, 0.205743, 0.206546]'}\n", + "best with:{'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': '[99, 99, 99, 99, 99]', 'reg_alpha': 2, 'subsample': 0.6, 'loss_std': 0.06956988861011186, 'loss': '[0.09787323564628972, 0.22588986922596394, 0.269828138777363, 0.27878690843594917, 0.2825731100341743]', 'mean_loss': 0.23099025242394805, 'best_score': '[0.231729, 0.211948, 0.207832, 0.205743, 0.206546]'}\n" + ] + } + ], + "source": [ + "best = 1\n", + "for params in grid:\n", + " best = train(df, params, best)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "submit_params = {\n", + " 'learning_rate':0.05,\n", + " 'n_estimators':100,\n", + " 'subsample':0.6,\n", + " 'colsample_bytree':0.6,\n", + " 'max_depth':7,\n", + " 'min_child_weight':1,\n", + " 'reg_alpha':2,\n", + " 'gamma':0\n", + "}" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "def xgboost_submit(df, params):\n", + " train_df = df.loc[df['time_interval_begin']