diff --git a/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/.ipynb_checkpoints/2_建模_建筑能源利用率预测-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/.ipynb_checkpoints/2_建模_建筑能源利用率预测-checkpoint.ipynb
new file mode 100644
index 0000000..407832c
--- /dev/null
+++ b/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/.ipynb_checkpoints/2_建模_建筑能源利用率预测-checkpoint.ipynb
@@ -0,0 +1,2257 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 载入工具包"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "pd.options.mode.chained_assignment = None # 关闭 pandas 的链式赋值警告(SettingWithCopyWarning)\n",
+ "\n",
+ "pd.set_option('display.max_columns', 60) # 设置最大显示列为60\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "%matplotlib inline\n",
+ "\n",
+ "plt.rcParams['font.size'] = 24 # 设置字体大小\n",
+ "\n",
+ "from IPython.core.pylabtools import figsize # 设置画图大小\n",
+ "\n",
+ "import seaborn as sns # 画图工具\n",
+ "sns.set(font_scale=2)\n",
+ "\n",
+ "# 缺失值填充与特征缩放\n",
+ "from sklearn.preprocessing import Imputer, MinMaxScaler\n",
+ "\n",
+ "# 机器学习模型\n",
+ "from sklearn.linear_model import LinearRegression\n",
+ "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
+ "from sklearn.svm import SVR\n",
+ "from sklearn.neighbors import KNeighborsRegressor\n",
+ "\n",
+ "# 超参数调整\n",
+ "from sklearn.model_selection import RandomizedSearchCV, GridSearchCV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training Feature Size: (6622, 64)\n",
+ "Testing Feature Size: (2839, 64)\n",
+ "Training Labels Size: (6622, 1)\n",
+ "Testing Labels Size: (2839, 1)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Read in data into dataframes \n",
+ "train_features = pd.read_csv('data/training_features.csv')\n",
+ "test_features = pd.read_csv('data/testing_features.csv')\n",
+ "train_labels = pd.read_csv('data/training_labels.csv')\n",
+ "test_labels = pd.read_csv('data/testing_labels.csv')\n",
+ "\n",
+ "# Display sizes of data\n",
+ "print('Training Feature Size: ', train_features.shape)\n",
+ "print('Testing Feature Size: ', test_features.shape)\n",
+ "print('Training Labels Size: ', train_labels.shape)\n",
+ "print('Testing Labels Size: ', test_labels.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Order | \n",
+ " Property Id | \n",
+ " DOF Gross Floor Area | \n",
+ " Year Built | \n",
+ " Number of Buildings - Self-reported | \n",
+ " Occupancy | \n",
+ " Site EUI (kBtu/ft²) | \n",
+ " Weather Normalized Site Electricity Intensity (kWh/ft²) | \n",
+ " Weather Normalized Site Natural Gas Intensity (therms/ft²) | \n",
+ " Water Intensity (All Water Sources) (gal/ft²) | \n",
+ " Latitude | \n",
+ " Longitude | \n",
+ " Community Board | \n",
+ " Census Tract | \n",
+ " log_Direct GHG Emissions (Metric Tons CO2e) | \n",
+ " log_Water Intensity (All Water Sources) (gal/ft²) | \n",
+ " Borough_Staten Island | \n",
+ " Largest Property Use Type_Adult Education | \n",
+ " Largest Property Use Type_Automobile Dealership | \n",
+ " Largest Property Use Type_Bank Branch | \n",
+ " Largest Property Use Type_College/University | \n",
+ " Largest Property Use Type_Convenience Store without Gas Station | \n",
+ " Largest Property Use Type_Courthouse | \n",
+ " Largest Property Use Type_Distribution Center | \n",
+ " Largest Property Use Type_Enclosed Mall | \n",
+ " Largest Property Use Type_Financial Office | \n",
+ " Largest Property Use Type_Hospital (General Medical & Surgical) | \n",
+ " Largest Property Use Type_Hotel | \n",
+ " Largest Property Use Type_K-12 School | \n",
+ " Largest Property Use Type_Library | \n",
+ " ... | \n",
+ " Largest Property Use Type_Multifamily Housing | \n",
+ " Largest Property Use Type_Museum | \n",
+ " Largest Property Use Type_Non-Refrigerated Warehouse | \n",
+ " Largest Property Use Type_Other | \n",
+ " Largest Property Use Type_Other - Education | \n",
+ " Largest Property Use Type_Other - Entertainment/Public Assembly | \n",
+ " Largest Property Use Type_Other - Lodging/Residential | \n",
+ " Largest Property Use Type_Other - Mall | \n",
+ " Largest Property Use Type_Other - Public Services | \n",
+ " Largest Property Use Type_Other - Recreation | \n",
+ " Largest Property Use Type_Other - Services | \n",
+ " Largest Property Use Type_Other - Specialty Hospital | \n",
+ " Largest Property Use Type_Outpatient Rehabilitation/Physical Therapy | \n",
+ " Largest Property Use Type_Parking | \n",
+ " Largest Property Use Type_Performing Arts | \n",
+ " Largest Property Use Type_Pre-school/Daycare | \n",
+ " Largest Property Use Type_Refrigerated Warehouse | \n",
+ " Largest Property Use Type_Repair Services (Vehicle, Shoe, Locksmith, etc.) | \n",
+ " Largest Property Use Type_Residence Hall/Dormitory | \n",
+ " Largest Property Use Type_Residential Care Facility | \n",
+ " Largest Property Use Type_Restaurant | \n",
+ " Largest Property Use Type_Retail Store | \n",
+ " Largest Property Use Type_Self-Storage Facility | \n",
+ " Largest Property Use Type_Senior Care Community | \n",
+ " Largest Property Use Type_Social/Meeting Hall | \n",
+ " Largest Property Use Type_Strip Mall | \n",
+ " Largest Property Use Type_Supermarket/Grocery Store | \n",
+ " Largest Property Use Type_Urgent Care/Clinic/Other Outpatient | \n",
+ " Largest Property Use Type_Wholesale Club/Supercenter | \n",
+ " Largest Property Use Type_Worship Facility | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 13276 | \n",
+ " 5849784 | \n",
+ " 90300.0 | \n",
+ " 1950 | \n",
+ " 1 | \n",
+ " 100 | \n",
+ " 126.0 | \n",
+ " 5.2 | \n",
+ " 1.2 | \n",
+ " 99.41 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 6.088818 | \n",
+ " 4.599253 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 7377 | \n",
+ " 4398442 | \n",
+ " 52000.0 | \n",
+ " 1926 | \n",
+ " 1 | \n",
+ " 100 | \n",
+ " 95.4 | \n",
+ " 4.7 | \n",
+ " 0.9 | \n",
+ " NaN | \n",
+ " 40.835496 | \n",
+ " -73.887745 | \n",
+ " 3.0 | \n",
+ " 161.0 | \n",
+ " 5.384036 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 9479 | \n",
+ " 4665374 | \n",
+ " 104700.0 | \n",
+ " 1954 | \n",
+ " 1 | \n",
+ " 100 | \n",
+ " 40.4 | \n",
+ " 3.8 | \n",
+ " 0.3 | \n",
+ " NaN | \n",
+ " 40.663206 | \n",
+ " -73.949469 | \n",
+ " 9.0 | \n",
+ " 329.0 | \n",
+ " 5.017280 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 14774 | \n",
+ " 3393340 | \n",
+ " 129333.0 | \n",
+ " 1992 | \n",
+ " 1 | \n",
+ " 100 | \n",
+ " 157.1 | \n",
+ " 16.9 | \n",
+ " 1.1 | \n",
+ " NaN | \n",
+ " 40.622968 | \n",
+ " -74.078742 | \n",
+ " 1.0 | \n",
+ " 27.0 | \n",
+ " 6.510853 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3286 | \n",
+ " 2704325 | \n",
+ " 109896.0 | \n",
+ " 1927 | \n",
+ " 1 | \n",
+ " 100 | \n",
+ " 62.3 | \n",
+ " 3.5 | \n",
+ " 0.0 | \n",
+ " 28.65 | \n",
+ " 40.782421 | \n",
+ " -73.972622 | \n",
+ " 7.0 | \n",
+ " 165.0 | \n",
+ " 6.123589 | \n",
+ " 3.355153 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 64 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Order Property Id DOF Gross Floor Area Year Built \\\n",
+ "0 13276 5849784 90300.0 1950 \n",
+ "1 7377 4398442 52000.0 1926 \n",
+ "2 9479 4665374 104700.0 1954 \n",
+ "3 14774 3393340 129333.0 1992 \n",
+ "4 3286 2704325 109896.0 1927 \n",
+ "\n",
+ " Number of Buildings - Self-reported Occupancy Site EUI (kBtu/ft²) \\\n",
+ "0 1 100 126.0 \n",
+ "1 1 100 95.4 \n",
+ "2 1 100 40.4 \n",
+ "3 1 100 157.1 \n",
+ "4 1 100 62.3 \n",
+ "\n",
+ " Weather Normalized Site Electricity Intensity (kWh/ft²) \\\n",
+ "0 5.2 \n",
+ "1 4.7 \n",
+ "2 3.8 \n",
+ "3 16.9 \n",
+ "4 3.5 \n",
+ "\n",
+ " Weather Normalized Site Natural Gas Intensity (therms/ft²) \\\n",
+ "0 1.2 \n",
+ "1 0.9 \n",
+ "2 0.3 \n",
+ "3 1.1 \n",
+ "4 0.0 \n",
+ "\n",
+ " Water Intensity (All Water Sources) (gal/ft²) Latitude Longitude \\\n",
+ "0 99.41 NaN NaN \n",
+ "1 NaN 40.835496 -73.887745 \n",
+ "2 NaN 40.663206 -73.949469 \n",
+ "3 NaN 40.622968 -74.078742 \n",
+ "4 28.65 40.782421 -73.972622 \n",
+ "\n",
+ " Community Board Census Tract log_Direct GHG Emissions (Metric Tons CO2e) \\\n",
+ "0 NaN NaN 6.088818 \n",
+ "1 3.0 161.0 5.384036 \n",
+ "2 9.0 329.0 5.017280 \n",
+ "3 1.0 27.0 6.510853 \n",
+ "4 7.0 165.0 6.123589 \n",
+ "\n",
+ " log_Water Intensity (All Water Sources) (gal/ft²) Borough_Staten Island \\\n",
+ "0 4.599253 0 \n",
+ "1 NaN 0 \n",
+ "2 NaN 0 \n",
+ "3 NaN 1 \n",
+ "4 3.355153 0 \n",
+ "\n",
+ " Largest Property Use Type_Adult Education \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Automobile Dealership \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Bank Branch \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_College/University \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Convenience Store without Gas Station \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Courthouse \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Distribution Center \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Enclosed Mall \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Financial Office \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Hospital (General Medical & Surgical) \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Hotel Largest Property Use Type_K-12 School \\\n",
+ "0 0 0 \n",
+ "1 0 0 \n",
+ "2 0 0 \n",
+ "3 0 0 \n",
+ "4 0 0 \n",
+ "\n",
+ " Largest Property Use Type_Library ... \\\n",
+ "0 0 ... \n",
+ "1 0 ... \n",
+ "2 0 ... \n",
+ "3 0 ... \n",
+ "4 0 ... \n",
+ "\n",
+ " Largest Property Use Type_Multifamily Housing \\\n",
+ "0 1 \n",
+ "1 1 \n",
+ "2 1 \n",
+ "3 0 \n",
+ "4 1 \n",
+ "\n",
+ " Largest Property Use Type_Museum \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Non-Refrigerated Warehouse \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Other \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Other - Education \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Other - Entertainment/Public Assembly \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Other - Lodging/Residential \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Other - Mall \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Other - Public Services \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Other - Recreation \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Other - Services \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Other - Specialty Hospital \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Outpatient Rehabilitation/Physical Therapy \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Parking \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Performing Arts \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Pre-school/Daycare \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Refrigerated Warehouse \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Repair Services (Vehicle, Shoe, Locksmith, etc.) \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Residence Hall/Dormitory \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Residential Care Facility \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Restaurant \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Retail Store \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Self-Storage Facility \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Senior Care Community \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 1 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Social/Meeting Hall \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Strip Mall \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Supermarket/Grocery Store \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Urgent Care/Clinic/Other Outpatient \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Wholesale Club/Supercenter \\\n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " Largest Property Use Type_Worship Facility \n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ "[5 rows x 64 columns]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_features.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 缺失值填充\n",
+ "\n",
+ "利用 sklearn 的 Imputer 对象进行缺失值填充:只在训练集上拟合(计算各列中位数),再用训练集得到的统计量去填充测试集。不要使用测试集自身的数据对测试集做加工,因为在真实场景中测试数据事先是未知的,否则会造成数据泄露,可参考[Data Leakage](https://www.kaggle.com/dansbecker/data-leakage)。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create an imputer object with a median filling strategy\n",
+ "imputer = Imputer(strategy = 'median')\n",
+ "\n",
+ "# Train on the training features\n",
+ "imputer.fit(train_features)\n",
+ "\n",
+ "# Transform both training data and testing data\n",
+ "X = imputer.transform(train_features)\n",
+ "X_test = imputer.transform(test_features)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Missing values in training features: 0\n",
+ "Missing values in testing features: 0\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('Missing values in training features:', np.sum(np.isnan(X)))\n",
+ "print('Missing values in testing features:',np.sum(np.isnan(X_test)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(array([], dtype=int64), array([], dtype=int64))\n",
+ "(array([], dtype=int64), array([], dtype=int64))\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Make sure all values are finite\n",
+ "print(np.where(~np.isfinite(X)))\n",
+ "print(np.where(~np.isfinite(X_test)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 特征标准化与归一化"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create the scaler object with a range of 0-1\n",
+ "scaler = MinMaxScaler(feature_range=(0, 1))\n",
+ "\n",
+ "# Fit on the training data\n",
+ "scaler.fit(X)\n",
+ "\n",
+ "# Transform both training data and testing data\n",
+ "X = scaler.transform(X)\n",
+ "X_test = scaler.transform(X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Convert y to one-dimensional array (vector)\n",
+ "y = np.array(train_labels).reshape((-1, ))\n",
+ "y_test = np.array(test_labels).reshape((-1, ))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 选择的机器学习算法(回归问题)\n",
+ " 1. Linear Regression\n",
+ " 2. Support Vector Machine Regression\n",
+ " 3. Random Forest Regression\n",
+ " 4. Gradient Boosting Regression\n",
+ " 5. K-Nearest Neighbors Regression\n",
+ "\n",
+ "这里先基本使用默认参数(SVM 和 KNN 手动指定了少量参数),后续再调参"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Function to calculate mean absolute error\n",
+ "def mae (y_true,y_pred):\n",
+ " return np.mean(abs(y_true - y_pred))\n",
+ "\n",
+ "# Takes in a model, trains the model, and evaluates the model on the test set\n",
+ "def fit_and_evaluate(model):\n",
+ " \n",
+ " # Train the model\n",
+ " model.fit(X,y)\n",
+ " \n",
+ "    # Make predictions and evaluate\n",
+ " model_pred = model.predict(X_test)\n",
+ " model_mae = mae(y_test,model_pred)\n",
+ " \n",
+ " # Return the performance metric\n",
+ " return model_mae"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Linear Regression Performance on the test set: MAE = 13.4651\n"
+ ]
+ }
+ ],
+ "source": [
+ "lr = LinearRegression()\n",
+ "lr_mae = fit_and_evaluate(lr)\n",
+ "\n",
+ "print('Linear Regression Performance on the test set: MAE = %0.4f'% lr_mae)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Support Vector Machine Regression Performance on the test set: MAE = 10.9337\n"
+ ]
+ }
+ ],
+ "source": [
+ "svm = SVR(C=1000,gamma =0.1)\n",
+ "svm_mae = fit_and_evaluate(svm)\n",
+ "\n",
+ "print('Support Vector Machine Regression Performance on the test set: MAE = %0.4f' % svm_mae)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "D:\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
+ " \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Random Forest Regression Performance on the test set: MAE = 10.0073\n"
+ ]
+ }
+ ],
+ "source": [
+ "random_forest = RandomForestRegressor(random_state = 60)\n",
+ "random_forest_mae = fit_and_evaluate(random_forest)\n",
+ "\n",
+ "print('Random Forest Regression Performance on the test set: MAE = %0.4f' % random_forest_mae)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Gradient Boosted Regression Performance on the test set: MAE = 10.0144\n"
+ ]
+ }
+ ],
+ "source": [
+ "gradient_boosted = GradientBoostingRegressor(random_state=60)\n",
+ "gradient_boosted_mae = fit_and_evaluate(gradient_boosted)\n",
+ "\n",
+ "print('Gradient Boosted Regression Performance on the test set: MAE = %0.4f' % gradient_boosted_mae)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "K-Nearest Neighbors Regression Performance on the test set: MAE = 13.0131\n"
+ ]
+ }
+ ],
+ "source": [
+ "knn = KNeighborsRegressor(n_neighbors=10)\n",
+ "knn_mae = fit_and_evaluate(knn)\n",
+ "\n",
+ "print('K-Nearest Neighbors Regression Performance on the test set: MAE = %0.4f' % knn_mae)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.style.use('fivethirtyeight')\n",
+ "figsize(8, 4)\n",
+ "\n",
+ "# Dataframe to hold the results\n",
+ "model_comparison = pd.DataFrame({'model':['Linear Regression',\n",
+ " 'Support Vector Machine',\n",
+ " 'Random Forest',\n",
+ " 'Gradient Boosted',\n",
+ " 'K-Nearest Neighbors'],\n",
+ " 'mae':[lr_mae,\n",
+ " svm_mae,\n",
+ " random_forest_mae, \n",
+ " gradient_boosted_mae, \n",
+ " knn_mae]})\n",
+ "# Horizontal bar chart of test mae\n",
+ "model_comparison.sort_values('mae',ascending = False).plot(x = 'model',\n",
+ " y = 'mae',\n",
+ " kind = 'barh',\n",
+ " color = 'red', \n",
+ " edgecolor = 'black')\n",
+ "# Plot formatting\n",
+ "plt.ylabel('');plt.yticks(size = 14);plt.xlabel('Mean Absolute Error');plt.xticks(size = 14)\n",
+ "plt.title('Model Comparison on Test MAE', size = 20);"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "集成算法的效果更好,这里由于参数只使用默认的,对SVM等这种参数影响较大的模型不太公平。"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 调参"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Cross Validation\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 设置相关参数\n",
+ "# 要优化的损失函数\n",
+ "loss = ['ls','lad','huber']\n",
+ "\n",
+ "# 梯度提升过程中使用的树的数量\n",
+ "n_estimators = [100,500, 900, 1100,1500]\n",
+ "\n",
+ "# 树的最大深度\n",
+ "max_depth = [2,3,5,10,15]\n",
+ "\n",
+ "# 每片叶子的最小样本数\n",
+ "min_samples_leaf = [1,2,4,6,8]\n",
+ "\n",
+ "# 拆分节点的最小样本数\n",
+ "min_samples_split = [2, 4, 6, 10]\n",
+ "\n",
+ "# 进行拆分时要考虑的最大特征数\n",
+ "max_features = ['auto', 'sqrt', 'log2', None]\n",
+ "\n",
+ "# 定义要进行搜索的超参数网格\n",
+ "hyperparameter_grid = {'loss': loss,\n",
+ " 'n_estimators': n_estimators,\n",
+ " 'max_depth': max_depth,\n",
+ " 'min_samples_leaf': min_samples_leaf,\n",
+ " 'min_samples_split': min_samples_split,\n",
+ " 'max_features': max_features} "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 随机搜索"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create the model to use for hyperparameter tuning\n",
+ "model = GradientBoostingRegressor(random_state = 42)\n",
+ "\n",
+ "# Set up the random search with 4-fold cross validation\n",
+ "random_cv = RandomizedSearchCV(estimator=model,\n",
+ " param_distributions=hyperparameter_grid,\n",
+ " cv=4, n_iter=25, \n",
+ " scoring = 'neg_mean_absolute_error',\n",
+ " n_jobs = -1, verbose = 1, \n",
+ " return_train_score = True,\n",
+ " random_state=42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Fitting 4 folds for each of 25 candidates, totalling 100 fits\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n",
+ "[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 5.6min\n",
+ "[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 8.8min finished\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "RandomizedSearchCV(cv=4, error_score='raise-deprecating',\n",
+ " estimator=GradientBoostingRegressor(alpha=0.9,\n",
+ " criterion='friedman_mse',\n",
+ " init=None,\n",
+ " learning_rate=0.1,\n",
+ " loss='ls', max_depth=3,\n",
+ " max_features=None,\n",
+ " max_leaf_nodes=None,\n",
+ " min_impurity_decrease=0.0,\n",
+ " min_impurity_split=None,\n",
+ " min_samples_leaf=1,\n",
+ " min_samples_split=2,\n",
+ " min_weight_fraction_leaf=0.0,\n",
+ " n_estimators=100,...\n",
+ " iid='warn', n_iter=25, n_jobs=-1,\n",
+ " param_distributions={'loss': ['ls', 'lad', 'huber'],\n",
+ " 'max_depth': [2, 3, 5, 10, 15],\n",
+ " 'max_features': ['auto', 'sqrt', 'log2',\n",
+ " None],\n",
+ " 'min_samples_leaf': [1, 2, 4, 6, 8],\n",
+ " 'min_samples_split': [2, 4, 6, 10],\n",
+ " 'n_estimators': [100, 500, 900, 1100,\n",
+ " 1500]},\n",
+ " pre_dispatch='2*n_jobs', random_state=42, refit=True,\n",
+ " return_train_score=True, scoring='neg_mean_absolute_error',\n",
+ " verbose=1)"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Fit on the training data\n",
+ "random_cv.fit(X, y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " mean_fit_time | \n",
+ " std_fit_time | \n",
+ " mean_score_time | \n",
+ " std_score_time | \n",
+ " param_n_estimators | \n",
+ " param_min_samples_split | \n",
+ " param_min_samples_leaf | \n",
+ " param_max_features | \n",
+ " param_max_depth | \n",
+ " param_loss | \n",
+ " params | \n",
+ " split0_test_score | \n",
+ " split1_test_score | \n",
+ " split2_test_score | \n",
+ " split3_test_score | \n",
+ " mean_test_score | \n",
+ " std_test_score | \n",
+ " rank_test_score | \n",
+ " split0_train_score | \n",
+ " split1_train_score | \n",
+ " split2_train_score | \n",
+ " split3_train_score | \n",
+ " mean_train_score | \n",
+ " std_train_score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 12 | \n",
+ " 10.512508 | \n",
+ " 0.161444 | \n",
+ " 0.021770 | \n",
+ " 0.001280 | \n",
+ " 500 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " None | \n",
+ " 5 | \n",
+ " lad | \n",
+ " {'n_estimators': 500, 'min_samples_split': 6, ... | \n",
+ " -8.924621 | \n",
+ " -8.775078 | \n",
+ " -9.325044 | \n",
+ " -9.037550 | \n",
+ " -9.015523 | \n",
+ " 0.201467 | \n",
+ " 1 | \n",
+ " -6.934705 | \n",
+ " -6.837958 | \n",
+ " -6.869084 | \n",
+ " -6.840749 | \n",
+ " -6.870624 | \n",
+ " 0.038950 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 11.610687 | \n",
+ " 0.557825 | \n",
+ " 0.034470 | \n",
+ " 0.010364 | \n",
+ " 500 | \n",
+ " 6 | \n",
+ " 8 | \n",
+ " None | \n",
+ " 5 | \n",
+ " huber | \n",
+ " {'n_estimators': 500, 'min_samples_split': 6, ... | \n",
+ " -8.872382 | \n",
+ " -8.903982 | \n",
+ " -9.317120 | \n",
+ " -9.075047 | \n",
+ " -9.042086 | \n",
+ " 0.176489 | \n",
+ " 2 | \n",
+ " -4.384343 | \n",
+ " -4.326121 | \n",
+ " -4.823009 | \n",
+ " -4.229809 | \n",
+ " -4.440820 | \n",
+ " 0.227453 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 9.308119 | \n",
+ " 0.115381 | \n",
+ " 0.019372 | \n",
+ " 0.002762 | \n",
+ " 500 | \n",
+ " 4 | \n",
+ " 2 | \n",
+ " auto | \n",
+ " 3 | \n",
+ " huber | \n",
+ " {'n_estimators': 500, 'min_samples_split': 4, ... | \n",
+ " -9.062297 | \n",
+ " -9.042221 | \n",
+ " -9.439618 | \n",
+ " -9.153004 | \n",
+ " -9.174248 | \n",
+ " 0.158764 | \n",
+ " 3 | \n",
+ " -6.955777 | \n",
+ " -7.088986 | \n",
+ " -6.913108 | \n",
+ " -6.940327 | \n",
+ " -6.974550 | \n",
+ " 0.067813 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " 2.768650 | \n",
+ " 0.022904 | \n",
+ " 0.010735 | \n",
+ " 0.006010 | \n",
+ " 100 | \n",
+ " 2 | \n",
+ " 6 | \n",
+ " auto | \n",
+ " 5 | \n",
+ " ls | \n",
+ " {'n_estimators': 100, 'min_samples_split': 2, ... | \n",
+ " -9.100775 | \n",
+ " -9.026372 | \n",
+ " -9.457063 | \n",
+ " -9.201235 | \n",
+ " -9.196321 | \n",
+ " 0.162799 | \n",
+ " 4 | \n",
+ " -7.302003 | \n",
+ " -7.312645 | \n",
+ " -7.173397 | \n",
+ " -7.386171 | \n",
+ " -7.293554 | \n",
+ " 0.076569 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 5.052495 | \n",
+ " 0.146989 | \n",
+ " 0.016706 | \n",
+ " 0.003104 | \n",
+ " 500 | \n",
+ " 4 | \n",
+ " 6 | \n",
+ " auto | \n",
+ " 3 | \n",
+ " ls | \n",
+ " {'n_estimators': 500, 'min_samples_split': 4, ... | \n",
+ " -9.147703 | \n",
+ " -9.199053 | \n",
+ " -9.698781 | \n",
+ " -9.358627 | \n",
+ " -9.350987 | \n",
+ " 0.215296 | \n",
+ " 5 | \n",
+ " -7.089709 | \n",
+ " -7.103749 | \n",
+ " -7.021418 | \n",
+ " -7.080088 | \n",
+ " -7.073741 | \n",
+ " 0.031358 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 12.648192 | \n",
+ " 0.497953 | \n",
+ " 0.025433 | \n",
+ " 0.002286 | \n",
+ " 1100 | \n",
+ " 6 | \n",
+ " 2 | \n",
+ " auto | \n",
+ " 3 | \n",
+ " lad | \n",
+ " {'n_estimators': 1100, 'min_samples_split': 6,... | \n",
+ " -9.350800 | \n",
+ " -9.382812 | \n",
+ " -9.475236 | \n",
+ " -9.291753 | \n",
+ " -9.375148 | \n",
+ " 0.066368 | \n",
+ " 6 | \n",
+ " -8.513004 | \n",
+ " -8.658903 | \n",
+ " -7.820534 | \n",
+ " -8.017995 | \n",
+ " -8.252609 | \n",
+ " 0.344461 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 62.647066 | \n",
+ " 2.078449 | \n",
+ " 0.127230 | \n",
+ " 0.011548 | \n",
+ " 1100 | \n",
+ " 10 | \n",
+ " 6 | \n",
+ " None | \n",
+ " 10 | \n",
+ " huber | \n",
+ " {'n_estimators': 1100, 'min_samples_split': 10... | \n",
+ " -9.274382 | \n",
+ " -9.242743 | \n",
+ " -9.530568 | \n",
+ " -9.475944 | \n",
+ " -9.380872 | \n",
+ " 0.124366 | \n",
+ " 7 | \n",
+ " -0.409811 | \n",
+ " -0.459465 | \n",
+ " -0.393491 | \n",
+ " -0.370483 | \n",
+ " -0.408312 | \n",
+ " 0.032671 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 39.648134 | \n",
+ " 1.619497 | \n",
+ " 0.063250 | \n",
+ " 0.002862 | \n",
+ " 500 | \n",
+ " 4 | \n",
+ " 2 | \n",
+ " auto | \n",
+ " 10 | \n",
+ " huber | \n",
+ " {'n_estimators': 500, 'min_samples_split': 4, ... | \n",
+ " -9.415934 | \n",
+ " -9.291198 | \n",
+ " -9.575363 | \n",
+ " -9.429866 | \n",
+ " -9.428068 | \n",
+ " 0.100721 | \n",
+ " 8 | \n",
+ " -0.205702 | \n",
+ " -0.193613 | \n",
+ " -0.164415 | \n",
+ " -0.141062 | \n",
+ " -0.176198 | \n",
+ " 0.025235 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 17.596702 | \n",
+ " 0.263915 | \n",
+ " 0.041574 | \n",
+ " 0.001576 | \n",
+ " 1500 | \n",
+ " 4 | \n",
+ " 6 | \n",
+ " None | \n",
+ " 3 | \n",
+ " ls | \n",
+ " {'n_estimators': 1500, 'min_samples_split': 4,... | \n",
+ " -9.248892 | \n",
+ " -9.316306 | \n",
+ " -9.798843 | \n",
+ " -9.427920 | \n",
+ " -9.447940 | \n",
+ " 0.212403 | \n",
+ " 9 | \n",
+ " -4.821677 | \n",
+ " -4.876202 | \n",
+ " -4.776650 | \n",
+ " -4.729506 | \n",
+ " -4.801009 | \n",
+ " 0.054284 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " 3.955678 | \n",
+ " 0.036891 | \n",
+ " 0.033909 | \n",
+ " 0.007227 | \n",
+ " 500 | \n",
+ " 6 | \n",
+ " 4 | \n",
+ " log2 | \n",
+ " 5 | \n",
+ " huber | \n",
+ " {'n_estimators': 500, 'min_samples_split': 6, ... | \n",
+ " -9.365194 | \n",
+ " -9.286988 | \n",
+ " -9.751767 | \n",
+ " -9.409179 | \n",
+ " -9.453243 | \n",
+ " 0.177788 | \n",
+ " 10 | \n",
+ " -5.733003 | \n",
+ " -5.742441 | \n",
+ " -5.571781 | \n",
+ " -5.666476 | \n",
+ " -5.678425 | \n",
+ " 0.068177 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " mean_fit_time std_fit_time mean_score_time std_score_time \\\n",
+ "12 10.512508 0.161444 0.021770 0.001280 \n",
+ "3 11.610687 0.557825 0.034470 0.010364 \n",
+ "9 9.308119 0.115381 0.019372 0.002762 \n",
+ "0 2.768650 0.022904 0.010735 0.006010 \n",
+ "7 5.052495 0.146989 0.016706 0.003104 \n",
+ "19 12.648192 0.497953 0.025433 0.002286 \n",
+ "10 62.647066 2.078449 0.127230 0.011548 \n",
+ "2 39.648134 1.619497 0.063250 0.002862 \n",
+ "16 17.596702 0.263915 0.041574 0.001576 \n",
+ "21 3.955678 0.036891 0.033909 0.007227 \n",
+ "\n",
+ " param_n_estimators param_min_samples_split param_min_samples_leaf \\\n",
+ "12 500 6 6 \n",
+ "3 500 6 8 \n",
+ "9 500 4 2 \n",
+ "0 100 2 6 \n",
+ "7 500 4 6 \n",
+ "19 1100 6 2 \n",
+ "10 1100 10 6 \n",
+ "2 500 4 2 \n",
+ "16 1500 4 6 \n",
+ "21 500 6 4 \n",
+ "\n",
+ " param_max_features param_max_depth param_loss \\\n",
+ "12 None 5 lad \n",
+ "3 None 5 huber \n",
+ "9 auto 3 huber \n",
+ "0 auto 5 ls \n",
+ "7 auto 3 ls \n",
+ "19 auto 3 lad \n",
+ "10 None 10 huber \n",
+ "2 auto 10 huber \n",
+ "16 None 3 ls \n",
+ "21 log2 5 huber \n",
+ "\n",
+ " params split0_test_score \\\n",
+ "12 {'n_estimators': 500, 'min_samples_split': 6, ... -8.924621 \n",
+ "3 {'n_estimators': 500, 'min_samples_split': 6, ... -8.872382 \n",
+ "9 {'n_estimators': 500, 'min_samples_split': 4, ... -9.062297 \n",
+ "0 {'n_estimators': 100, 'min_samples_split': 2, ... -9.100775 \n",
+ "7 {'n_estimators': 500, 'min_samples_split': 4, ... -9.147703 \n",
+ "19 {'n_estimators': 1100, 'min_samples_split': 6,... -9.350800 \n",
+ "10 {'n_estimators': 1100, 'min_samples_split': 10... -9.274382 \n",
+ "2 {'n_estimators': 500, 'min_samples_split': 4, ... -9.415934 \n",
+ "16 {'n_estimators': 1500, 'min_samples_split': 4,... -9.248892 \n",
+ "21 {'n_estimators': 500, 'min_samples_split': 6, ... -9.365194 \n",
+ "\n",
+ " split1_test_score split2_test_score split3_test_score mean_test_score \\\n",
+ "12 -8.775078 -9.325044 -9.037550 -9.015523 \n",
+ "3 -8.903982 -9.317120 -9.075047 -9.042086 \n",
+ "9 -9.042221 -9.439618 -9.153004 -9.174248 \n",
+ "0 -9.026372 -9.457063 -9.201235 -9.196321 \n",
+ "7 -9.199053 -9.698781 -9.358627 -9.350987 \n",
+ "19 -9.382812 -9.475236 -9.291753 -9.375148 \n",
+ "10 -9.242743 -9.530568 -9.475944 -9.380872 \n",
+ "2 -9.291198 -9.575363 -9.429866 -9.428068 \n",
+ "16 -9.316306 -9.798843 -9.427920 -9.447940 \n",
+ "21 -9.286988 -9.751767 -9.409179 -9.453243 \n",
+ "\n",
+ " std_test_score rank_test_score split0_train_score split1_train_score \\\n",
+ "12 0.201467 1 -6.934705 -6.837958 \n",
+ "3 0.176489 2 -4.384343 -4.326121 \n",
+ "9 0.158764 3 -6.955777 -7.088986 \n",
+ "0 0.162799 4 -7.302003 -7.312645 \n",
+ "7 0.215296 5 -7.089709 -7.103749 \n",
+ "19 0.066368 6 -8.513004 -8.658903 \n",
+ "10 0.124366 7 -0.409811 -0.459465 \n",
+ "2 0.100721 8 -0.205702 -0.193613 \n",
+ "16 0.212403 9 -4.821677 -4.876202 \n",
+ "21 0.177788 10 -5.733003 -5.742441 \n",
+ "\n",
+ " split2_train_score split3_train_score mean_train_score std_train_score \n",
+ "12 -6.869084 -6.840749 -6.870624 0.038950 \n",
+ "3 -4.823009 -4.229809 -4.440820 0.227453 \n",
+ "9 -6.913108 -6.940327 -6.974550 0.067813 \n",
+ "0 -7.173397 -7.386171 -7.293554 0.076569 \n",
+ "7 -7.021418 -7.080088 -7.073741 0.031358 \n",
+ "19 -7.820534 -8.017995 -8.252609 0.344461 \n",
+ "10 -0.393491 -0.370483 -0.408312 0.032671 \n",
+ "2 -0.164415 -0.141062 -0.176198 0.025235 \n",
+ "16 -4.776650 -4.729506 -4.801009 0.054284 \n",
+ "21 -5.571781 -5.666476 -5.678425 0.068177 "
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# Gather every CV result in a DataFrame, ranked best-first by mean test score\n",
+    "random_results = (\n",
+    "    pd.DataFrame(random_cv.cv_results_)\n",
+    "    .sort_values(by='mean_test_score', ascending=False)\n",
+    ")\n",
+    "\n",
+    "random_results.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n",
+ " learning_rate=0.1, loss='lad', max_depth=5,\n",
+ " max_features=None, max_leaf_nodes=None,\n",
+ " min_impurity_decrease=0.0, min_impurity_split=None,\n",
+ " min_samples_leaf=6, min_samples_split=6,\n",
+ " min_weight_fraction_leaf=0.0, n_estimators=500,\n",
+ " n_iter_no_change=None, presort='auto',\n",
+ " random_state=42, subsample=1.0, tol=0.0001,\n",
+ " validation_fraction=0.1, verbose=0, warm_start=False)"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "random_cv.best_estimator_"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 网格搜索\n",
+ "\n",
+ "随机搜索找到一个大概的值,再用网格搜索更加精确的查找。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 创建一系列要评估的树\n",
+ "trees_grid = {'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800]}\n",
+ "\n",
+ "model = GradientBoostingRegressor(loss = 'lad', max_depth = 5,\n",
+ " min_samples_leaf = 6,\n",
+ " min_samples_split = 6,\n",
+ " max_features = None,\n",
+ " random_state = 42)\n",
+ "\n",
+ "# 使用树的范围和随机森林模型的网格搜索对象\n",
+ "grid_search = GridSearchCV(estimator = model, \n",
+ " param_grid=trees_grid, \n",
+ " cv = 4, \n",
+ " scoring = 'neg_mean_absolute_error', \n",
+ " verbose = 1,\n",
+ " n_jobs = -1, \n",
+ " return_train_score = True)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Fitting 4 folds for each of 15 candidates, totalling 60 fits\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n",
+ "[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 1.3min\n",
+ "[Parallel(n_jobs=-1)]: Done 60 out of 60 | elapsed: 2.2min finished\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "GridSearchCV(cv=4, error_score='raise-deprecating',\n",
+ " estimator=GradientBoostingRegressor(alpha=0.9,\n",
+ " criterion='friedman_mse',\n",
+ " init=None, learning_rate=0.1,\n",
+ " loss='lad', max_depth=5,\n",
+ " max_features=None,\n",
+ " max_leaf_nodes=None,\n",
+ " min_impurity_decrease=0.0,\n",
+ " min_impurity_split=None,\n",
+ " min_samples_leaf=6,\n",
+ " min_samples_split=6,\n",
+ " min_weight_fraction_leaf=0.0,\n",
+ " n_estimators=100,\n",
+ " n_iter_no_change=None,\n",
+ " presort='auto',\n",
+ " random_state=42, subsample=1.0,\n",
+ " tol=0.0001,\n",
+ " validation_fraction=0.1,\n",
+ " verbose=0, warm_start=False),\n",
+ " iid='warn', n_jobs=-1,\n",
+ " param_grid={'n_estimators': [100, 150, 200, 250, 300, 350, 400,\n",
+ " 450, 500, 550, 600, 650, 700, 750,\n",
+ " 800]},\n",
+ " pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
+ " scoring='neg_mean_absolute_error', verbose=1)"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Fit the grid search\n",
+ "grid_search.fit(X, y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+    "# Load the grid-search results into a DataFrame\n",
+    "results = pd.DataFrame(grid_search.cv_results_)\n",
+    "\n",
+    "# Plot cross-validated test error and training error against the number of trees;\n",
+    "# scores are negated because the search was scored with neg_mean_absolute_error\n",
+    "figsize(8, 8)\n",
+    "plt.style.use('fivethirtyeight')\n",
+    "plt.plot(results['param_n_estimators'], -1 * results['mean_test_score'], label = 'Testing Error')\n",
+    "plt.plot(results['param_n_estimators'], -1 * results['mean_train_score'], label = 'Training Error')\n",
+    "plt.xlabel('Number of Trees'); plt.ylabel('Mean Absolute Error'); plt.legend();\n",
+    "plt.title('Performance vs Number of Trees');"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " mean_fit_time | \n",
+ " std_fit_time | \n",
+ " mean_score_time | \n",
+ " std_score_time | \n",
+ " param_n_estimators | \n",
+ " params | \n",
+ " split0_test_score | \n",
+ " split1_test_score | \n",
+ " split2_test_score | \n",
+ " split3_test_score | \n",
+ " mean_test_score | \n",
+ " std_test_score | \n",
+ " rank_test_score | \n",
+ " split0_train_score | \n",
+ " split1_train_score | \n",
+ " split2_train_score | \n",
+ " split3_train_score | \n",
+ " mean_train_score | \n",
+ " std_train_score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 13 | \n",
+ " 14.369840 | \n",
+ " 0.281917 | \n",
+ " 0.032413 | \n",
+ " 0.004375 | \n",
+ " 750 | \n",
+ " {'n_estimators': 750} | \n",
+ " -8.901092 | \n",
+ " -8.751699 | \n",
+ " -9.298411 | \n",
+ " -9.049730 | \n",
+ " -9.000181 | \n",
+ " 0.201836 | \n",
+ " 1 | \n",
+ " -6.716494 | \n",
+ " -6.660453 | \n",
+ " -6.520818 | \n",
+ " -6.579989 | \n",
+ " -6.619438 | \n",
+ " 0.074807 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 14.957021 | \n",
+ " 0.327790 | \n",
+ " 0.030169 | \n",
+ " 0.001295 | \n",
+ " 800 | \n",
+ " {'n_estimators': 800} | \n",
+ " -8.903857 | \n",
+ " -8.758299 | \n",
+ " -9.296941 | \n",
+ " -9.047485 | \n",
+ " -9.001594 | \n",
+ " 0.198793 | \n",
+ " 2 | \n",
+ " -6.689643 | \n",
+ " -6.648109 | \n",
+ " -6.486269 | \n",
+ " -6.530287 | \n",
+ " -6.588577 | \n",
+ " 0.083101 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 13.406416 | \n",
+ " 0.316643 | \n",
+ " 0.028174 | \n",
+ " 0.001915 | \n",
+ " 700 | \n",
+ " {'n_estimators': 700} | \n",
+ " -8.904269 | \n",
+ " -8.755862 | \n",
+ " -9.312990 | \n",
+ " -9.053755 | \n",
+ " -9.006666 | \n",
+ " 0.205811 | \n",
+ " 3 | \n",
+ " -6.743886 | \n",
+ " -6.695770 | \n",
+ " -6.611240 | \n",
+ " -6.624919 | \n",
+ " -6.668953 | \n",
+ " 0.053861 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 12.764133 | \n",
+ " 0.196318 | \n",
+ " 0.026181 | \n",
+ " 0.000824 | \n",
+ " 650 | \n",
+ " {'n_estimators': 650} | \n",
+ " -8.906208 | \n",
+ " -8.755401 | \n",
+ " -9.325190 | \n",
+ " -9.052215 | \n",
+ " -9.009699 | \n",
+ " 0.210186 | \n",
+ " 4 | \n",
+ " -6.778068 | \n",
+ " -6.721452 | \n",
+ " -6.708191 | \n",
+ " -6.666758 | \n",
+ " -6.718617 | \n",
+ " 0.039814 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 10.654021 | \n",
+ " 0.313924 | \n",
+ " 0.029671 | \n",
+ " 0.008415 | \n",
+ " 550 | \n",
+ " {'n_estimators': 550} | \n",
+ " -8.918502 | \n",
+ " -8.756938 | \n",
+ " -9.325492 | \n",
+ " -9.042721 | \n",
+ " -9.010861 | \n",
+ " 0.207970 | \n",
+ " 5 | \n",
+ " -6.874737 | \n",
+ " -6.781878 | \n",
+ " -6.809121 | \n",
+ " -6.745712 | \n",
+ " -6.802862 | \n",
+ " 0.047201 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " mean_fit_time std_fit_time mean_score_time std_score_time \\\n",
+ "13 14.369840 0.281917 0.032413 0.004375 \n",
+ "14 14.957021 0.327790 0.030169 0.001295 \n",
+ "12 13.406416 0.316643 0.028174 0.001915 \n",
+ "11 12.764133 0.196318 0.026181 0.000824 \n",
+ "9 10.654021 0.313924 0.029671 0.008415 \n",
+ "\n",
+ " param_n_estimators params split0_test_score \\\n",
+ "13 750 {'n_estimators': 750} -8.901092 \n",
+ "14 800 {'n_estimators': 800} -8.903857 \n",
+ "12 700 {'n_estimators': 700} -8.904269 \n",
+ "11 650 {'n_estimators': 650} -8.906208 \n",
+ "9 550 {'n_estimators': 550} -8.918502 \n",
+ "\n",
+ " split1_test_score split2_test_score split3_test_score mean_test_score \\\n",
+ "13 -8.751699 -9.298411 -9.049730 -9.000181 \n",
+ "14 -8.758299 -9.296941 -9.047485 -9.001594 \n",
+ "12 -8.755862 -9.312990 -9.053755 -9.006666 \n",
+ "11 -8.755401 -9.325190 -9.052215 -9.009699 \n",
+ "9 -8.756938 -9.325492 -9.042721 -9.010861 \n",
+ "\n",
+ " std_test_score rank_test_score split0_train_score split1_train_score \\\n",
+ "13 0.201836 1 -6.716494 -6.660453 \n",
+ "14 0.198793 2 -6.689643 -6.648109 \n",
+ "12 0.205811 3 -6.743886 -6.695770 \n",
+ "11 0.210186 4 -6.778068 -6.721452 \n",
+ "9 0.207970 5 -6.874737 -6.781878 \n",
+ "\n",
+ " split2_train_score split3_train_score mean_train_score std_train_score \n",
+ "13 -6.520818 -6.579989 -6.619438 0.074807 \n",
+ "14 -6.486269 -6.530287 -6.588577 0.083101 \n",
+ "12 -6.611240 -6.624919 -6.668953 0.053861 \n",
+ "11 -6.708191 -6.666758 -6.718617 0.039814 \n",
+ "9 -6.809121 -6.745712 -6.802862 0.047201 "
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "results.sort_values('mean_test_score', ascending = False).head(5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 测试模型"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n",
+ " learning_rate=0.1, loss='lad', max_depth=5,\n",
+ " max_features=None, max_leaf_nodes=None,\n",
+ " min_impurity_decrease=0.0, min_impurity_split=None,\n",
+ " min_samples_leaf=6, min_samples_split=6,\n",
+ " min_weight_fraction_leaf=0.0, n_estimators=750,\n",
+ " n_iter_no_change=None, presort='auto',\n",
+ " random_state=42, subsample=1.0, tol=0.0001,\n",
+ " validation_fraction=0.1, verbose=0, warm_start=False)"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 默认模型\n",
+ "default_model = GradientBoostingRegressor(random_state = 42)\n",
+ "\n",
+ "# 选择最佳模型\n",
+ "final_model = grid_search.best_estimator_\n",
+ "\n",
+ "final_model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1.19 s ± 44.9 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)\n"
+ ]
+ }
+ ],
+ "source": [
+    "%%timeit -n 1 -r 5\n",
+    "# The %%timeit cell magic must be the first line of the cell, so this note sits below it.\n",
+    "# -n: number of loops per run; -r: number of runs (repeats)\n",
+    "default_model.fit(X, y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "14.9 s ± 271 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%timeit -n 1 -r 5\n",
+ "final_model.fit(X, y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Default model performance on the test set: MAE = 10.0130.\n",
+ "Final model performance on the test set: MAE = 9.0474.\n"
+ ]
+ }
+ ],
+ "source": [
+ "default_pred = default_model.predict(X_test)\n",
+ "final_pred = final_model.predict(X_test)\n",
+ "\n",
+ "print('Default model performance on the test set: MAE = %0.4f.' % mae(y_test, default_pred))\n",
+ "print('Final model performance on the test set: MAE = %0.4f.' % mae(y_test, final_pred))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "对比测试结果:最终模型的训练时间明显更长(约 14.9 s 对比 1.19 s,约 12 倍),但测试集 MAE 从 10.01 降到 9.05,模型得到差不多10%的提升。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "figsize(8, 8)\n",
+ "\n",
+ "# Density plot of the final predictions and the test values\n",
+ "sns.kdeplot(final_pred, label = 'Predictions')\n",
+ "sns.kdeplot(y_test, label = 'Values')\n",
+ "\n",
+ "# Label the plot\n",
+ "plt.xlabel('Energy Star Score'); plt.ylabel('Density');\n",
+ "plt.title('Test Values and Predictions');"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+    "# Call figsize() instead of assigning to it: `figsize = (6, 6)` would rebind the\n",
+    "# imported helper to a tuple, leaving the size unset and breaking later figsize(...) calls\n",
+    "figsize(6, 6)\n",
+    "\n",
+    "# Residuals: prediction minus true value\n",
+    "residuals = final_pred - y_test\n",
+    "\n",
+    "# Histogram of the residual distribution\n",
+    "plt.hist(residuals, color = 'red', bins = 20,\n",
+    "         edgecolor = 'black')\n",
+    "plt.xlabel('Error'); plt.ylabel('Count')\n",
+    "plt.title('Distribution of Residuals');"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}