From c67515eb6bfa6edc504d1f154fb43004b2bf60e4 Mon Sep 17 00:00:00 2001 From: benjas <909336740@qq.com> Date: Fri, 25 Dec 2020 10:42:31 +0800 Subject: [PATCH] =?UTF-8?q?Create=20=E5=BB=BA=E6=A8=A1=E4=B8=8E=E5=88=86?= =?UTF-8?q?=E6=9E=90=5F=E5=BB=BA=E7=AD=91=E8=83=BD=E6=BA=90=E5=88=A9?= =?UTF-8?q?=E7=94=A8=E7=8E=87=E9=A2=84=E6=B5=8B.ipynb?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...分析_建筑能源利用率预测.ipynb | 943 ++++++++++++++++++ 1 file changed, 943 insertions(+) create mode 100644 机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/建模与分析_建筑能源利用率预测.ipynb diff --git a/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/建模与分析_建筑能源利用率预测.ipynb b/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/建模与分析_建筑能源利用率预测.ipynb new file mode 100644 index 0000000..28335ad --- /dev/null +++ b/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/建模与分析_建筑能源利用率预测.ipynb @@ -0,0 +1,943 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 载入工具包" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "pd.options.mode.chained_assignment = None # 消除警告,比如说提示版本升级之类的\n", + "\n", + "pd.set_option('display.max_columns', 60) # 设置最大显示列为60\n", + "\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "plt.rcParams['font.size'] = 24 # 设置字体大小\n", + "\n", + "from IPython.core.pylabtools import figsize # 设置画图大小\n", + "\n", + "import seaborn as sns # 画图工具\n", + "sns.set(font_scale=2)\n", + "\n", + "# 输入缺失值和缩放值\n", + "from sklearn.preprocessing import Imputer, MinMaxScaler\n", + "\n", + "# 机器学习模型\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n", + "from sklearn.svm import SVR\n", + "from sklearn.neighbors import KNeighborsRegressor\n", + "\n", + "# 超参数调整\n", + "from sklearn.model_selection import RandomizedSearchCV, GridSearchCV" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training Feature Size: (6622, 64)\n", + "Testing Feature Size: (2839, 64)\n", + "Training Labels Size: (6622, 1)\n", + "Testing Labels Size: (2839, 1)\n" + ] + } + ], + "source": [ + "# Read in data into dataframes \n", + "train_features = pd.read_csv('data/training_features.csv')\n", + "test_features = pd.read_csv('data/testing_features.csv')\n", + "train_labels = pd.read_csv('data/training_labels.csv')\n", + "test_labels = pd.read_csv('data/testing_labels.csv')\n", + "\n", + "# Display sizes of data\n", + "print('Training Feature Size: ', train_features.shape)\n", + "print('Testing Feature Size: ', test_features.shape)\n", + "print('Training Labels Size: ', train_labels.shape)\n", + "print('Testing Labels Size: ', test_labels.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
OrderProperty IdDOF Gross Floor AreaYear BuiltNumber of Buildings - Self-reportedOccupancySite EUI (kBtu/ft²)Weather Normalized Site Electricity Intensity (kWh/ft²)Weather Normalized Site Natural Gas Intensity (therms/ft²)Water Intensity (All Water Sources) (gal/ft²)LatitudeLongitudeCommunity BoardCensus Tractlog_Direct GHG Emissions (Metric Tons CO2e)log_Water Intensity (All Water Sources) (gal/ft²)Borough_Staten IslandLargest Property Use Type_Adult EducationLargest Property Use Type_Automobile DealershipLargest Property Use Type_Bank BranchLargest Property Use Type_College/UniversityLargest Property Use Type_Convenience Store without Gas StationLargest Property Use Type_CourthouseLargest Property Use Type_Distribution CenterLargest Property Use Type_Enclosed MallLargest Property Use Type_Financial OfficeLargest Property Use Type_Hospital (General Medical & Surgical)Largest Property Use Type_HotelLargest Property Use Type_K-12 SchoolLargest Property Use Type_Library...Largest Property Use Type_Multifamily HousingLargest Property Use Type_MuseumLargest Property Use Type_Non-Refrigerated WarehouseLargest Property Use Type_OtherLargest Property Use Type_Other - EducationLargest Property Use Type_Other - Entertainment/Public AssemblyLargest Property Use Type_Other - Lodging/ResidentialLargest Property Use Type_Other - MallLargest Property Use Type_Other - Public ServicesLargest Property Use Type_Other - RecreationLargest Property Use Type_Other - ServicesLargest Property Use Type_Other - Specialty HospitalLargest Property Use Type_Outpatient Rehabilitation/Physical TherapyLargest Property Use Type_ParkingLargest Property Use Type_Performing ArtsLargest Property Use Type_Pre-school/DaycareLargest Property Use Type_Refrigerated WarehouseLargest Property Use Type_Repair Services (Vehicle, Shoe, Locksmith, etc.)Largest Property Use Type_Residence Hall/DormitoryLargest Property Use Type_Residential Care FacilityLargest Property Use Type_RestaurantLargest Property Use 
Type_Retail StoreLargest Property Use Type_Self-Storage FacilityLargest Property Use Type_Senior Care CommunityLargest Property Use Type_Social/Meeting HallLargest Property Use Type_Strip MallLargest Property Use Type_Supermarket/Grocery StoreLargest Property Use Type_Urgent Care/Clinic/Other OutpatientLargest Property Use Type_Wholesale Club/SupercenterLargest Property Use Type_Worship Facility
013276584978490300.019501100126.05.21.299.41NaNNaNNaNNaN6.0888184.59925300000000000000...100000000000000000000000000000
17377439844252000.01926110095.44.70.9NaN40.835496-73.8877453.0161.05.384036NaN00000000000000...100000000000000000000000000000
294794665374104700.01954110040.43.80.3NaN40.663206-73.9494699.0329.05.017280NaN00000000000000...100000000000000000000000000000
3147743393340129333.019921100157.116.91.1NaN40.622968-74.0787421.027.06.510853NaN10000000000000...000000000000000000000001000000
432862704325109896.01927110062.33.50.028.6540.782421-73.9726227.0165.06.1235893.35515300000000000000...100000000000000000000000000000
\n", + "

5 rows × 64 columns

\n", + "
" + ], + "text/plain": [ + " Order Property Id DOF Gross Floor Area Year Built \\\n", + "0 13276 5849784 90300.0 1950 \n", + "1 7377 4398442 52000.0 1926 \n", + "2 9479 4665374 104700.0 1954 \n", + "3 14774 3393340 129333.0 1992 \n", + "4 3286 2704325 109896.0 1927 \n", + "\n", + " Number of Buildings - Self-reported Occupancy Site EUI (kBtu/ft²) \\\n", + "0 1 100 126.0 \n", + "1 1 100 95.4 \n", + "2 1 100 40.4 \n", + "3 1 100 157.1 \n", + "4 1 100 62.3 \n", + "\n", + " Weather Normalized Site Electricity Intensity (kWh/ft²) \\\n", + "0 5.2 \n", + "1 4.7 \n", + "2 3.8 \n", + "3 16.9 \n", + "4 3.5 \n", + "\n", + " Weather Normalized Site Natural Gas Intensity (therms/ft²) \\\n", + "0 1.2 \n", + "1 0.9 \n", + "2 0.3 \n", + "3 1.1 \n", + "4 0.0 \n", + "\n", + " Water Intensity (All Water Sources) (gal/ft²) Latitude Longitude \\\n", + "0 99.41 NaN NaN \n", + "1 NaN 40.835496 -73.887745 \n", + "2 NaN 40.663206 -73.949469 \n", + "3 NaN 40.622968 -74.078742 \n", + "4 28.65 40.782421 -73.972622 \n", + "\n", + " Community Board Census Tract log_Direct GHG Emissions (Metric Tons CO2e) \\\n", + "0 NaN NaN 6.088818 \n", + "1 3.0 161.0 5.384036 \n", + "2 9.0 329.0 5.017280 \n", + "3 1.0 27.0 6.510853 \n", + "4 7.0 165.0 6.123589 \n", + "\n", + " log_Water Intensity (All Water Sources) (gal/ft²) Borough_Staten Island \\\n", + "0 4.599253 0 \n", + "1 NaN 0 \n", + "2 NaN 0 \n", + "3 NaN 1 \n", + "4 3.355153 0 \n", + "\n", + " Largest Property Use Type_Adult Education \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Automobile Dealership \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Bank Branch \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_College/University \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Convenience Store without Gas Station \\\n", + "0 
0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Courthouse \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Distribution Center \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Enclosed Mall \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Financial Office \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Hospital (General Medical & Surgical) \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Hotel Largest Property Use Type_K-12 School \\\n", + "0 0 0 \n", + "1 0 0 \n", + "2 0 0 \n", + "3 0 0 \n", + "4 0 0 \n", + "\n", + " Largest Property Use Type_Library ... \\\n", + "0 0 ... \n", + "1 0 ... \n", + "2 0 ... \n", + "3 0 ... \n", + "4 0 ... \n", + "\n", + " Largest Property Use Type_Multifamily Housing \\\n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 0 \n", + "4 1 \n", + "\n", + " Largest Property Use Type_Museum \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Non-Refrigerated Warehouse \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Other \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Other - Education \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Other - Entertainment/Public Assembly \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Other - Lodging/Residential \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Other - Mall \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest 
Property Use Type_Other - Public Services \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Other - Recreation \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Other - Services \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Other - Specialty Hospital \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Outpatient Rehabilitation/Physical Therapy \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Parking \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Performing Arts \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Pre-school/Daycare \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Refrigerated Warehouse \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Repair Services (Vehicle, Shoe, Locksmith, etc.) 
\\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Residence Hall/Dormitory \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Residential Care Facility \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Restaurant \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Retail Store \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Self-Storage Facility \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Senior Care Community \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 1 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Social/Meeting Hall \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Strip Mall \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Supermarket/Grocery Store \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Urgent Care/Clinic/Other Outpatient \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Wholesale Club/Supercenter \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + " Largest Property Use Type_Worship Facility \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + "[5 rows x 64 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_features.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 缺失值填充\n", + "\n", + "利用sklearn的 Imputer object来进行缺失值填充,测试集则使用数据集中的结果进行填充,尽可能的不要利用测试集的数据对测试集加工,因为一开始我们也是不知道的,可参考[Data 
Leakage](https://www.kaggle.com/dansbecker/data-leakage)。" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Create an imputer object with a median filling strategy\n", + "imputer = Imputer(strategy = 'median')\n", + "\n", + "# Train on the training features\n", + "imputer.fit(train_features)\n", + "\n", + "# Transform both training data and testing data\n", + "X = imputer.transform(train_features)\n", + "X_test = imputer.transform(test_features)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Missing values in training features: 0\n", + "Missing values in testing features: 0\n" + ] + } + ], + "source": [ + "print('Missing values in training features:', np.sum(np.isnan(X)))\n", + "print('Missing values in testing features:',np.sum(np.isnan(X_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(array([], dtype=int64), array([], dtype=int64))\n", + "(array([], dtype=int64), array([], dtype=int64))\n" + ] + } + ], + "source": [ + "# Make sure all values are finite\n", + "print(np.where(~np.isfinite(X)))\n", + "print(np.where(~np.isfinite(X_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## 特征" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}