Create 建模与分析_建筑能源利用率预测.ipynb

pull/2/head
benjas 4 years ago
parent 0e9bc19a00
commit c67515eb6b

@ -0,0 +1,943 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 载入工具包"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# --- Imports ---\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns  # plotting helpers\n",
"from IPython.core.pylabtools import figsize  # helper for setting figure size\n",
"\n",
"# Missing-value imputation and feature scaling.\n",
"# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and\n",
"# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement. The\n",
"# replacement is bound to the old name so that later cells calling\n",
"# `Imputer(strategy=...)` work on both old and new scikit-learn releases.\n",
"try:\n",
"    from sklearn.impute import SimpleImputer as Imputer\n",
"except ImportError:  # scikit-learn < 0.20\n",
"    from sklearn.preprocessing import Imputer\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"# Machine learning models\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
"from sklearn.svm import SVR\n",
"from sklearn.neighbors import KNeighborsRegressor\n",
"\n",
"# Hyperparameter tuning\n",
"from sklearn.model_selection import RandomizedSearchCV, GridSearchCV\n",
"\n",
"# --- Display configuration ---\n",
"# Silence pandas' SettingWithCopyWarning on chained assignment\n",
"pd.options.mode.chained_assignment = None\n",
"\n",
"# Show up to 60 columns when a DataFrame is displayed\n",
"pd.set_option('display.max_columns', 60)\n",
"\n",
"%matplotlib inline\n",
"\n",
"# Default font size for matplotlib figures\n",
"plt.rcParams['font.size'] = 24\n",
"\n",
"# Apply seaborn's default theme with fonts scaled 2x\n",
"sns.set(font_scale=2)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training Feature Size: (6622, 64)\n",
"Testing Feature Size: (2839, 64)\n",
"Training Labels Size: (6622, 1)\n",
"Testing Labels Size: (2839, 1)\n"
]
}
],
"source": [
"# Read in data into dataframes \n",
"train_features = pd.read_csv('data/training_features.csv')\n",
"test_features = pd.read_csv('data/testing_features.csv')\n",
"train_labels = pd.read_csv('data/training_labels.csv')\n",
"test_labels = pd.read_csv('data/testing_labels.csv')\n",
"\n",
"# Display sizes of data\n",
"print('Training Feature Size: ', train_features.shape)\n",
"print('Testing Feature Size: ', test_features.shape)\n",
"print('Training Labels Size: ', train_labels.shape)\n",
"print('Testing Labels Size: ', test_labels.shape)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Order</th>\n",
" <th>Property Id</th>\n",
" <th>DOF Gross Floor Area</th>\n",
" <th>Year Built</th>\n",
" <th>Number of Buildings - Self-reported</th>\n",
" <th>Occupancy</th>\n",
" <th>Site EUI (kBtu/ft²)</th>\n",
" <th>Weather Normalized Site Electricity Intensity (kWh/ft²)</th>\n",
" <th>Weather Normalized Site Natural Gas Intensity (therms/ft²)</th>\n",
" <th>Water Intensity (All Water Sources) (gal/ft²)</th>\n",
" <th>Latitude</th>\n",
" <th>Longitude</th>\n",
" <th>Community Board</th>\n",
" <th>Census Tract</th>\n",
" <th>log_Direct GHG Emissions (Metric Tons CO2e)</th>\n",
" <th>log_Water Intensity (All Water Sources) (gal/ft²)</th>\n",
" <th>Borough_Staten Island</th>\n",
" <th>Largest Property Use Type_Adult Education</th>\n",
" <th>Largest Property Use Type_Automobile Dealership</th>\n",
" <th>Largest Property Use Type_Bank Branch</th>\n",
" <th>Largest Property Use Type_College/University</th>\n",
" <th>Largest Property Use Type_Convenience Store without Gas Station</th>\n",
" <th>Largest Property Use Type_Courthouse</th>\n",
" <th>Largest Property Use Type_Distribution Center</th>\n",
" <th>Largest Property Use Type_Enclosed Mall</th>\n",
" <th>Largest Property Use Type_Financial Office</th>\n",
" <th>Largest Property Use Type_Hospital (General Medical &amp; Surgical)</th>\n",
" <th>Largest Property Use Type_Hotel</th>\n",
" <th>Largest Property Use Type_K-12 School</th>\n",
" <th>Largest Property Use Type_Library</th>\n",
" <th>...</th>\n",
" <th>Largest Property Use Type_Multifamily Housing</th>\n",
" <th>Largest Property Use Type_Museum</th>\n",
" <th>Largest Property Use Type_Non-Refrigerated Warehouse</th>\n",
" <th>Largest Property Use Type_Other</th>\n",
" <th>Largest Property Use Type_Other - Education</th>\n",
" <th>Largest Property Use Type_Other - Entertainment/Public Assembly</th>\n",
" <th>Largest Property Use Type_Other - Lodging/Residential</th>\n",
" <th>Largest Property Use Type_Other - Mall</th>\n",
" <th>Largest Property Use Type_Other - Public Services</th>\n",
" <th>Largest Property Use Type_Other - Recreation</th>\n",
" <th>Largest Property Use Type_Other - Services</th>\n",
" <th>Largest Property Use Type_Other - Specialty Hospital</th>\n",
" <th>Largest Property Use Type_Outpatient Rehabilitation/Physical Therapy</th>\n",
" <th>Largest Property Use Type_Parking</th>\n",
" <th>Largest Property Use Type_Performing Arts</th>\n",
" <th>Largest Property Use Type_Pre-school/Daycare</th>\n",
" <th>Largest Property Use Type_Refrigerated Warehouse</th>\n",
" <th>Largest Property Use Type_Repair Services (Vehicle, Shoe, Locksmith, etc.)</th>\n",
" <th>Largest Property Use Type_Residence Hall/Dormitory</th>\n",
" <th>Largest Property Use Type_Residential Care Facility</th>\n",
" <th>Largest Property Use Type_Restaurant</th>\n",
" <th>Largest Property Use Type_Retail Store</th>\n",
" <th>Largest Property Use Type_Self-Storage Facility</th>\n",
" <th>Largest Property Use Type_Senior Care Community</th>\n",
" <th>Largest Property Use Type_Social/Meeting Hall</th>\n",
" <th>Largest Property Use Type_Strip Mall</th>\n",
" <th>Largest Property Use Type_Supermarket/Grocery Store</th>\n",
" <th>Largest Property Use Type_Urgent Care/Clinic/Other Outpatient</th>\n",
" <th>Largest Property Use Type_Wholesale Club/Supercenter</th>\n",
" <th>Largest Property Use Type_Worship Facility</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>13276</td>\n",
" <td>5849784</td>\n",
" <td>90300.0</td>\n",
" <td>1950</td>\n",
" <td>1</td>\n",
" <td>100</td>\n",
" <td>126.0</td>\n",
" <td>5.2</td>\n",
" <td>1.2</td>\n",
" <td>99.41</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6.088818</td>\n",
" <td>4.599253</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>7377</td>\n",
" <td>4398442</td>\n",
" <td>52000.0</td>\n",
" <td>1926</td>\n",
" <td>1</td>\n",
" <td>100</td>\n",
" <td>95.4</td>\n",
" <td>4.7</td>\n",
" <td>0.9</td>\n",
" <td>NaN</td>\n",
" <td>40.835496</td>\n",
" <td>-73.887745</td>\n",
" <td>3.0</td>\n",
" <td>161.0</td>\n",
" <td>5.384036</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>9479</td>\n",
" <td>4665374</td>\n",
" <td>104700.0</td>\n",
" <td>1954</td>\n",
" <td>1</td>\n",
" <td>100</td>\n",
" <td>40.4</td>\n",
" <td>3.8</td>\n",
" <td>0.3</td>\n",
" <td>NaN</td>\n",
" <td>40.663206</td>\n",
" <td>-73.949469</td>\n",
" <td>9.0</td>\n",
" <td>329.0</td>\n",
" <td>5.017280</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>14774</td>\n",
" <td>3393340</td>\n",
" <td>129333.0</td>\n",
" <td>1992</td>\n",
" <td>1</td>\n",
" <td>100</td>\n",
" <td>157.1</td>\n",
" <td>16.9</td>\n",
" <td>1.1</td>\n",
" <td>NaN</td>\n",
" <td>40.622968</td>\n",
" <td>-74.078742</td>\n",
" <td>1.0</td>\n",
" <td>27.0</td>\n",
" <td>6.510853</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3286</td>\n",
" <td>2704325</td>\n",
" <td>109896.0</td>\n",
" <td>1927</td>\n",
" <td>1</td>\n",
" <td>100</td>\n",
" <td>62.3</td>\n",
" <td>3.5</td>\n",
" <td>0.0</td>\n",
" <td>28.65</td>\n",
" <td>40.782421</td>\n",
" <td>-73.972622</td>\n",
" <td>7.0</td>\n",
" <td>165.0</td>\n",
" <td>6.123589</td>\n",
" <td>3.355153</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 64 columns</p>\n",
"</div>"
],
"text/plain": [
" Order Property Id DOF Gross Floor Area Year Built \\\n",
"0 13276 5849784 90300.0 1950 \n",
"1 7377 4398442 52000.0 1926 \n",
"2 9479 4665374 104700.0 1954 \n",
"3 14774 3393340 129333.0 1992 \n",
"4 3286 2704325 109896.0 1927 \n",
"\n",
" Number of Buildings - Self-reported Occupancy Site EUI (kBtu/ft²) \\\n",
"0 1 100 126.0 \n",
"1 1 100 95.4 \n",
"2 1 100 40.4 \n",
"3 1 100 157.1 \n",
"4 1 100 62.3 \n",
"\n",
" Weather Normalized Site Electricity Intensity (kWh/ft²) \\\n",
"0 5.2 \n",
"1 4.7 \n",
"2 3.8 \n",
"3 16.9 \n",
"4 3.5 \n",
"\n",
" Weather Normalized Site Natural Gas Intensity (therms/ft²) \\\n",
"0 1.2 \n",
"1 0.9 \n",
"2 0.3 \n",
"3 1.1 \n",
"4 0.0 \n",
"\n",
" Water Intensity (All Water Sources) (gal/ft²) Latitude Longitude \\\n",
"0 99.41 NaN NaN \n",
"1 NaN 40.835496 -73.887745 \n",
"2 NaN 40.663206 -73.949469 \n",
"3 NaN 40.622968 -74.078742 \n",
"4 28.65 40.782421 -73.972622 \n",
"\n",
" Community Board Census Tract log_Direct GHG Emissions (Metric Tons CO2e) \\\n",
"0 NaN NaN 6.088818 \n",
"1 3.0 161.0 5.384036 \n",
"2 9.0 329.0 5.017280 \n",
"3 1.0 27.0 6.510853 \n",
"4 7.0 165.0 6.123589 \n",
"\n",
" log_Water Intensity (All Water Sources) (gal/ft²) Borough_Staten Island \\\n",
"0 4.599253 0 \n",
"1 NaN 0 \n",
"2 NaN 0 \n",
"3 NaN 1 \n",
"4 3.355153 0 \n",
"\n",
" Largest Property Use Type_Adult Education \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Automobile Dealership \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Bank Branch \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_College/University \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Convenience Store without Gas Station \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Courthouse \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Distribution Center \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Enclosed Mall \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Financial Office \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Hospital (General Medical & Surgical) \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Hotel Largest Property Use Type_K-12 School \\\n",
"0 0 0 \n",
"1 0 0 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 0 \n",
"\n",
" Largest Property Use Type_Library ... \\\n",
"0 0 ... \n",
"1 0 ... \n",
"2 0 ... \n",
"3 0 ... \n",
"4 0 ... \n",
"\n",
" Largest Property Use Type_Multifamily Housing \\\n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 0 \n",
"4 1 \n",
"\n",
" Largest Property Use Type_Museum \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Non-Refrigerated Warehouse \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Other \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Other - Education \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Other - Entertainment/Public Assembly \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Other - Lodging/Residential \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Other - Mall \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Other - Public Services \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Other - Recreation \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Other - Services \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Other - Specialty Hospital \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Outpatient Rehabilitation/Physical Therapy \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Parking \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Performing Arts \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Pre-school/Daycare \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Refrigerated Warehouse \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Repair Services (Vehicle, Shoe, Locksmith, etc.) \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Residence Hall/Dormitory \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Residential Care Facility \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Restaurant \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Retail Store \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Self-Storage Facility \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Senior Care Community \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 1 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Social/Meeting Hall \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Strip Mall \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Supermarket/Grocery Store \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Urgent Care/Clinic/Other Outpatient \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Wholesale Club/Supercenter \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" Largest Property Use Type_Worship Facility \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
"[5 rows x 64 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_features.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 缺失值填充\n",
"\n",
"利用 sklearn 的 Imputer 对象进行缺失值填充。测试集的缺失值同样使用训练集上得到的统计结果(中位数)来填充,尽量不要利用测试集自身的数据对测试集进行加工,因为在实际建模时我们事先并不知道测试集的信息。可参考 [Data Leakage](https://www.kaggle.com/dansbecker/data-leakage)。"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Build a median-strategy imputer and fit it on the training features only\n",
"imputer = Imputer(strategy='median').fit(train_features)\n",
"\n",
"# Fill missing values in both splits using the imputer fitted above\n",
"X = imputer.transform(train_features)\n",
"X_test = imputer.transform(test_features)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Missing values in training features: 0\n",
"Missing values in testing features: 0\n"
]
}
],
"source": [
"print('Missing values in training features:', np.sum(np.isnan(X)))\n",
"print('Missing values in testing features:',np.sum(np.isnan(X_test)))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(array([], dtype=int64), array([], dtype=int64))\n",
"(array([], dtype=int64), array([], dtype=int64))\n"
]
}
],
"source": [
"# Make sure all values are finite\n",
"print(np.where(~np.isfinite(X)))\n",
"print(np.where(~np.isfinite(X_test)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 特征"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading…
Cancel
Save