From a05e45295ec3f1431cf741951decc66105cb1564 Mon Sep 17 00:00:00 2001 From: benjas <909336740@qq.com> Date: Fri, 25 Dec 2020 11:05:11 +0800 Subject: [PATCH] =?UTF-8?q?Add=20=E5=9F=BA=E7=A1=80=E6=A8=A1=E5=9E=8B?= =?UTF-8?q?=E6=AF=94=E8=BE=83?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...分析_建筑能源利用率预测.ipynb | 239 +++++++++++++++++- 1 file changed, 237 insertions(+), 2 deletions(-) diff --git a/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/建模与分析_建筑能源利用率预测.ipynb b/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/建模与分析_建筑能源利用率预测.ipynb index 28335ad..11a7735 100644 --- a/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/建模与分析_建筑能源利用率预测.ipynb +++ b/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/建模与分析_建筑能源利用率预测.ipynb @@ -909,14 +909,249 @@ "print(np.where(~np.isfinite(X_test)))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 特征标准化与归一化" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "## 特征" + "# Create the scaler object with a range of 0-1\n", + "scaler = MinMaxScaler(feature_range=(0, 1))\n", + "\n", + "# Fit on the training data\n", + "scaler.fit(X)\n", + "\n", + "# Transform both training data and testing data\n", + "X = scaler.transform(X)\n", + "X_test = scaler.transform(X_test)" ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Convert y to one-dimensional array (vector)\n", + "y = np.array(train_labels).reshape((-1, ))\n", + "y_test = np.array(test_labels).reshape((-1, ))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 选择的机器学习算法(回归问题)\n", + " 1. Linear Regression\n", + " 2. Support Vector Machine Regression\n", + " 3. Random Forest Regression\n", + " 4. Gradient Boosting Regression\n", + " 5. 
K-Nearest Neighbors Regression\n", + "\n", + "这里大部分模型先使用默认参数(SVR 的 C、gamma 和 KNN 的 n_neighbors 为手动指定),后续再调参" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Function to calculate mean absolute error\n", + "def mae (y_true,y_pred):\n", + " return np.mean(abs(y_true - y_pred))\n", + "\n", + "# Takes in a model, trains the model, and evaluates the model on the test set\n", + "def fit_and_evaluate(model):\n", + " \n", + " # Train the model\n", + " model.fit(X,y)\n", + " \n", + " # Make predictions and evaluate\n", + " model_pred = model.predict(X_test)\n", + " model_mae = mae(y_test,model_pred)\n", + " \n", + " # Return the performance metric\n", + " return model_mae" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Linear Regression Performance on the test set: MAE = 13.4651\n" + ] + } + ], + "source": [ + "lr = LinearRegression()\n", + "lr_mae = fit_and_evaluate(lr)\n", + "\n", + "print('Linear Regression Performance on the test set: MAE = %0.4f'% lr_mae)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Support Vector Machine Regression Performance on the test set: MAE = 10.9337\n" + ] + } + ], + "source": [ + "svm = SVR(C=1000,gamma =0.1)\n", + "svm_mae = fit_and_evaluate(svm)\n", + "\n", + "print('Support Vector Machine Regression Performance on the test set: MAE = %0.4f' % svm_mae)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "D:\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n", + " \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Random Forest Regression Performance on the test set: MAE = 10.0073\n" + ] + } + ], + "source": [ + "random_forest = RandomForestRegressor(random_state = 60)\n", + "random_forest_mae = fit_and_evaluate(random_forest)\n", + "\n", + "print('Random Forest Regression Performance on the test set: MAE = %0.4f' % random_forest_mae)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Gradient Boosted Regression Performance on the test set: MAE = 10.0144\n" + ] + } + ], + "source": [ + "gradient_boosted = GradientBoostingRegressor(random_state=60)\n", + "gradient_boosted_mae = fit_and_evaluate(gradient_boosted)\n", + "\n", + "print('Gradient Boosted Regression Performance on the test set: MAE = %0.4f' % gradient_boosted_mae)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "K-Nearest Neighbors Regression Performance on the test set: MAE = 13.0131\n" + ] + } + ], + "source": [ + "knn = KNeighborsRegressor(n_neighbors=10)\n", + "knn_mae = fit_and_evaluate(knn)\n", + "\n", + "print('K-Nearest Neighbors Regression Performance on the test set: MAE = %0.4f' % knn_mae)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.style.use('fivethirtyeight')\n", + "figsize(8, 4)\n", + "\n", + "# Dataframe to hold the results\n", + "model_comparison = pd.DataFrame({'model':['Linear Regression',\n", + " 'Support Vector Machine',\n", + " 'Random Forest',\n", + " 'Gradient Boosted',\n", + " 'K-Nearest Neighbors'],\n", + " 'mae':[lr_mae,\n", + " svm_mae,\n", + " random_forest_mae, \n", + " gradient_boosted_mae, \n", + " knn_mae]})\n", + "# Horizontal bar chart of test mae\n", + "model_comparison.sort_values('mae',ascending = False).plot(x = 'model',\n", + " y = 'mae',\n", + " kind = 'barh',\n", + " color = 'red', \n", + " edgecolor = 'black')\n", + "# Plot formatting\n", + "plt.ylabel('');plt.yticks(size = 14);plt.xlabel('Mean Absolute Error');plt.xticks(size = 14)\n", + "plt.title('Model Comparison on Test MAE', size = 20);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "集成算法的效果更好,这里由于参数只使用默认的,对SVM等这种参数影响较大的模型不太公平。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 调参" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {