From 9cd0cc6063305a8220f28ee38012672fdd1ba647 Mon Sep 17 00:00:00 2001
From: benjas <909336740@qq.com>
Date: Mon, 21 Dec 2020 21:33:57 +0800
Subject: [PATCH] =?UTF-8?q?Add=20=E5=A4=84=E7=90=86=E7=BC=BA=E5=A4=B1?=
=?UTF-8?q?=E5=80=BC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
...筑能源利用率预测-checkpoint.ipynb | 1118 ++++++++++++++++-
1 file changed, 1102 insertions(+), 16 deletions(-)
diff --git a/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/.ipynb_checkpoints/建筑能源利用率预测-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/.ipynb_checkpoints/建筑能源利用率预测-checkpoint.ipynb
index ad03c1d..4194fc4 100644
--- a/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/.ipynb_checkpoints/建筑能源利用率预测-checkpoint.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/.ipynb_checkpoints/建筑能源利用率预测-checkpoint.ipynb
@@ -36,18 +36,9 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 1,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "D:\\Anaconda3\\lib\\importlib\\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n",
- " return f(*args, **kwds)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
@@ -78,7 +69,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 2,
"metadata": {
"scrolled": true
},
@@ -719,7 +710,7 @@
"4 Washington Heights South ... "
]
},
- "execution_count": 4,
+ "execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -749,7 +740,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 3,
"metadata": {
"scrolled": true
},
@@ -839,12 +830,1107 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Replace all occurrences of Not Available with numpy not a number\n",
- "data = data.replace({'Not Available':np.nan})"
+ "data = data.replace({'Not Available':np.nan})\n",
+ "\n",
+ "# Iterate through the columns\n",
+ "for col in list(data.columns):\n",
+ " if ('ft²' in col or 'kBtu' in col or 'Metric Tons CO2e' in col or 'kWh' in\n",
+ " col or 'therms' in col or 'gal' in col or 'Score' in col):\n",
+ " # Convert the data type to float\n",
+ " data[col] = data[col].astype(float)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Order | \n",
+ " Property Id | \n",
+ " DOF Gross Floor Area | \n",
+ " Largest Property Use Type - Gross Floor Area (ft²) | \n",
+ " 2nd Largest Property Use - Gross Floor Area (ft²) | \n",
+ " 3rd Largest Property Use Type - Gross Floor Area (ft²) | \n",
+ " Year Built | \n",
+ " Number of Buildings - Self-reported | \n",
+ " Occupancy | \n",
+ " ENERGY STAR Score | \n",
+ " Site EUI (kBtu/ft²) | \n",
+ " Weather Normalized Site EUI (kBtu/ft²) | \n",
+ " Weather Normalized Site Electricity Intensity (kWh/ft²) | \n",
+ " Weather Normalized Site Natural Gas Intensity (therms/ft²) | \n",
+ " Weather Normalized Source EUI (kBtu/ft²) | \n",
+ " Fuel Oil #1 Use (kBtu) | \n",
+ " Fuel Oil #2 Use (kBtu) | \n",
+ " Fuel Oil #4 Use (kBtu) | \n",
+ " Fuel Oil #5 & 6 Use (kBtu) | \n",
+ " Diesel #2 Use (kBtu) | \n",
+ " District Steam Use (kBtu) | \n",
+ " Natural Gas Use (kBtu) | \n",
+ " Weather Normalized Site Natural Gas Use (therms) | \n",
+ " Electricity Use - Grid Purchase (kBtu) | \n",
+ " Weather Normalized Site Electricity (kWh) | \n",
+ " Total GHG Emissions (Metric Tons CO2e) | \n",
+ " Direct GHG Emissions (Metric Tons CO2e) | \n",
+ " Indirect GHG Emissions (Metric Tons CO2e) | \n",
+ " Property GFA - Self-Reported (ft²) | \n",
+ " Water Use (All Water Sources) (kgal) | \n",
+ " Water Intensity (All Water Sources) (gal/ft²) | \n",
+ " Source EUI (kBtu/ft²) | \n",
+ " Latitude | \n",
+ " Longitude | \n",
+ " Community Board | \n",
+ " Council District | \n",
+ " Census Tract | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 11746.000000 | \n",
+ " 1.174600e+04 | \n",
+ " 1.162800e+04 | \n",
+ " 1.174400e+04 | \n",
+ " 3741.000000 | \n",
+ " 1484.000000 | \n",
+ " 11746.000000 | \n",
+ " 11746.000000 | \n",
+ " 11746.000000 | \n",
+ " 9642.000000 | \n",
+ " 11583.000000 | \n",
+ " 10281.000000 | \n",
+ " 10959.000000 | \n",
+ " 9783.000000 | \n",
+ " 10281.000000 | \n",
+ " 9.000000e+00 | \n",
+ " 2.581000e+03 | \n",
+ " 1.321000e+03 | \n",
+ " 5.940000e+02 | \n",
+ " 1.600000e+01 | \n",
+ " 9.360000e+02 | \n",
+ " 1.030400e+04 | \n",
+ " 9.784000e+03 | \n",
+ " 1.150200e+04 | \n",
+ " 1.096000e+04 | \n",
+ " 1.167200e+04 | \n",
+ " 1.166300e+04 | \n",
+ " 1.168100e+04 | \n",
+ " 1.174600e+04 | \n",
+ " 7.762000e+03 | \n",
+ " 7762.000000 | \n",
+ " 11583.000000 | \n",
+ " 9483.000000 | \n",
+ " 9483.000000 | \n",
+ " 9483.000000 | \n",
+ " 9483.000000 | \n",
+ " 9483.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 7185.759578 | \n",
+ " 3.642958e+06 | \n",
+ " 1.732695e+05 | \n",
+ " 1.605524e+05 | \n",
+ " 22778.682010 | \n",
+ " 12016.825270 | \n",
+ " 1948.738379 | \n",
+ " 1.289971 | \n",
+ " 98.762557 | \n",
+ " 59.854594 | \n",
+ " 280.071484 | \n",
+ " 309.747466 | \n",
+ " 11.072643 | \n",
+ " 1.901441 | \n",
+ " 417.915709 | \n",
+ " 3.395398e+06 | \n",
+ " 3.186882e+06 | \n",
+ " 5.294367e+06 | \n",
+ " 2.429105e+06 | \n",
+ " 1.193594e+06 | \n",
+ " 2.868907e+08 | \n",
+ " 5.048545e+07 | \n",
+ " 5.364578e+05 | \n",
+ " 5.965472e+06 | \n",
+ " 1.768752e+06 | \n",
+ " 4.553657e+03 | \n",
+ " 2.477937e+03 | \n",
+ " 2.076339e+03 | \n",
+ " 1.673739e+05 | \n",
+ " 1.591798e+04 | \n",
+ " 136.172432 | \n",
+ " 385.908029 | \n",
+ " 40.754379 | \n",
+ " -73.957057 | \n",
+ " 7.140673 | \n",
+ " 15.771275 | \n",
+ " 4977.596647 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 4323.859984 | \n",
+ " 1.049070e+06 | \n",
+ " 3.367055e+05 | \n",
+ " 3.095746e+05 | \n",
+ " 55094.441422 | \n",
+ " 27959.755486 | \n",
+ " 30.576386 | \n",
+ " 4.017484 | \n",
+ " 7.501603 | \n",
+ " 29.993586 | \n",
+ " 8607.178877 | \n",
+ " 9784.731207 | \n",
+ " 127.733868 | \n",
+ " 97.204587 | \n",
+ " 10530.524339 | \n",
+ " 2.213237e+06 | \n",
+ " 5.497154e+06 | \n",
+ " 5.881863e+06 | \n",
+ " 4.442946e+06 | \n",
+ " 3.558178e+06 | \n",
+ " 3.124603e+09 | \n",
+ " 3.914719e+09 | \n",
+ " 4.022606e+07 | \n",
+ " 3.154430e+07 | \n",
+ " 9.389154e+06 | \n",
+ " 2.041639e+05 | \n",
+ " 1.954498e+05 | \n",
+ " 5.931295e+04 | \n",
+ " 3.189238e+05 | \n",
+ " 1.529524e+05 | \n",
+ " 1730.726938 | \n",
+ " 9312.736225 | \n",
+ " 0.080120 | \n",
+ " 0.046337 | \n",
+ " 3.954129 | \n",
+ " 15.674375 | \n",
+ " 13520.422990 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 1.000000 | \n",
+ " 7.365000e+03 | \n",
+ " 5.002800e+04 | \n",
+ " 5.400000e+01 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1600.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 2.085973e+05 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " -4.690797e+08 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " -2.313430e+04 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 40.516065 | \n",
+ " -74.243582 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 3428.250000 | \n",
+ " 2.747222e+06 | \n",
+ " 6.524000e+04 | \n",
+ " 6.520100e+04 | \n",
+ " 4000.000000 | \n",
+ " 1720.750000 | \n",
+ " 1927.000000 | \n",
+ " 1.000000 | \n",
+ " 100.000000 | \n",
+ " 37.000000 | \n",
+ " 61.800000 | \n",
+ " 65.100000 | \n",
+ " 3.800000 | \n",
+ " 0.100000 | \n",
+ " 103.500000 | \n",
+ " 1.663594e+06 | \n",
+ " 2.550378e+05 | \n",
+ " 2.128213e+06 | \n",
+ " 0.000000e+00 | \n",
+ " 5.698020e+04 | \n",
+ " 4.320254e+06 | \n",
+ " 1.098251e+06 | \n",
+ " 1.176952e+04 | \n",
+ " 1.043673e+06 | \n",
+ " 3.019974e+05 | \n",
+ " 3.287000e+02 | \n",
+ " 1.474500e+02 | \n",
+ " 9.480000e+01 | \n",
+ " 6.699400e+04 | \n",
+ " 2.595400e+03 | \n",
+ " 27.150000 | \n",
+ " 99.400000 | \n",
+ " 40.707226 | \n",
+ " -73.984662 | \n",
+ " 4.000000 | \n",
+ " 4.000000 | \n",
+ " 100.000000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 6986.500000 | \n",
+ " 3.236404e+06 | \n",
+ " 9.313850e+04 | \n",
+ " 9.132400e+04 | \n",
+ " 8654.000000 | \n",
+ " 5000.000000 | \n",
+ " 1941.000000 | \n",
+ " 1.000000 | \n",
+ " 100.000000 | \n",
+ " 65.000000 | \n",
+ " 78.500000 | \n",
+ " 82.500000 | \n",
+ " 5.300000 | \n",
+ " 0.500000 | \n",
+ " 129.400000 | \n",
+ " 4.328815e+06 | \n",
+ " 1.380138e+06 | \n",
+ " 4.312984e+06 | \n",
+ " 0.000000e+00 | \n",
+ " 2.070020e+05 | \n",
+ " 9.931240e+06 | \n",
+ " 4.103962e+06 | \n",
+ " 4.445525e+04 | \n",
+ " 1.855196e+06 | \n",
+ " 5.416312e+05 | \n",
+ " 5.002500e+02 | \n",
+ " 2.726000e+02 | \n",
+ " 1.718000e+02 | \n",
+ " 9.408000e+04 | \n",
+ " 4.692500e+03 | \n",
+ " 45.095000 | \n",
+ " 124.900000 | \n",
+ " 40.759130 | \n",
+ " -73.962810 | \n",
+ " 7.000000 | \n",
+ " 9.000000 | \n",
+ " 201.000000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 11054.500000 | \n",
+ " 4.409092e+06 | \n",
+ " 1.596140e+05 | \n",
+ " 1.532550e+05 | \n",
+ " 20000.000000 | \n",
+ " 12000.000000 | \n",
+ " 1966.000000 | \n",
+ " 1.000000 | \n",
+ " 100.000000 | \n",
+ " 85.000000 | \n",
+ " 97.600000 | \n",
+ " 102.500000 | \n",
+ " 9.200000 | \n",
+ " 0.700000 | \n",
+ " 167.200000 | \n",
+ " 4.938947e+06 | \n",
+ " 4.445808e+06 | \n",
+ " 6.514520e+06 | \n",
+ " 4.293825e+06 | \n",
+ " 2.918332e+05 | \n",
+ " 2.064497e+07 | \n",
+ " 6.855070e+06 | \n",
+ " 7.348107e+04 | \n",
+ " 4.370302e+06 | \n",
+ " 1.284677e+06 | \n",
+ " 9.084250e+02 | \n",
+ " 4.475000e+02 | \n",
+ " 4.249000e+02 | \n",
+ " 1.584140e+05 | \n",
+ " 8.031875e+03 | \n",
+ " 70.805000 | \n",
+ " 162.750000 | \n",
+ " 40.817623 | \n",
+ " -73.932443 | \n",
+ " 9.000000 | \n",
+ " 33.000000 | \n",
+ " 531.500000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 14993.000000 | \n",
+ " 5.991312e+06 | \n",
+ " 1.354011e+07 | \n",
+ " 1.421712e+07 | \n",
+ " 962428.000000 | \n",
+ " 591640.000000 | \n",
+ " 2019.000000 | \n",
+ " 161.000000 | \n",
+ " 100.000000 | \n",
+ " 100.000000 | \n",
+ " 869265.000000 | \n",
+ " 939329.000000 | \n",
+ " 6259.400000 | \n",
+ " 9393.000000 | \n",
+ " 986366.000000 | \n",
+ " 6.275850e+06 | \n",
+ " 1.046849e+08 | \n",
+ " 7.907464e+07 | \n",
+ " 4.410378e+07 | \n",
+ " 1.435178e+07 | \n",
+ " 7.163518e+10 | \n",
+ " 3.942852e+11 | \n",
+ " 3.942852e+09 | \n",
+ " 1.691763e+09 | \n",
+ " 4.958273e+08 | \n",
+ " 2.094340e+07 | \n",
+ " 2.094340e+07 | \n",
+ " 4.764375e+06 | \n",
+ " 1.421712e+07 | \n",
+ " 6.594604e+06 | \n",
+ " 96305.690000 | \n",
+ " 912801.100000 | \n",
+ " 40.912869 | \n",
+ " -73.715543 | \n",
+ " 56.000000 | \n",
+ " 51.000000 | \n",
+ " 155101.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Order Property Id DOF Gross Floor Area \\\n",
+ "count 11746.000000 1.174600e+04 1.162800e+04 \n",
+ "mean 7185.759578 3.642958e+06 1.732695e+05 \n",
+ "std 4323.859984 1.049070e+06 3.367055e+05 \n",
+ "min 1.000000 7.365000e+03 5.002800e+04 \n",
+ "25% 3428.250000 2.747222e+06 6.524000e+04 \n",
+ "50% 6986.500000 3.236404e+06 9.313850e+04 \n",
+ "75% 11054.500000 4.409092e+06 1.596140e+05 \n",
+ "max 14993.000000 5.991312e+06 1.354011e+07 \n",
+ "\n",
+ " Largest Property Use Type - Gross Floor Area (ft²) \\\n",
+ "count 1.174400e+04 \n",
+ "mean 1.605524e+05 \n",
+ "std 3.095746e+05 \n",
+ "min 5.400000e+01 \n",
+ "25% 6.520100e+04 \n",
+ "50% 9.132400e+04 \n",
+ "75% 1.532550e+05 \n",
+ "max 1.421712e+07 \n",
+ "\n",
+ " 2nd Largest Property Use - Gross Floor Area (ft²) \\\n",
+ "count 3741.000000 \n",
+ "mean 22778.682010 \n",
+ "std 55094.441422 \n",
+ "min 0.000000 \n",
+ "25% 4000.000000 \n",
+ "50% 8654.000000 \n",
+ "75% 20000.000000 \n",
+ "max 962428.000000 \n",
+ "\n",
+ " 3rd Largest Property Use Type - Gross Floor Area (ft²) Year Built \\\n",
+ "count 1484.000000 11746.000000 \n",
+ "mean 12016.825270 1948.738379 \n",
+ "std 27959.755486 30.576386 \n",
+ "min 0.000000 1600.000000 \n",
+ "25% 1720.750000 1927.000000 \n",
+ "50% 5000.000000 1941.000000 \n",
+ "75% 12000.000000 1966.000000 \n",
+ "max 591640.000000 2019.000000 \n",
+ "\n",
+ " Number of Buildings - Self-reported Occupancy ENERGY STAR Score \\\n",
+ "count 11746.000000 11746.000000 9642.000000 \n",
+ "mean 1.289971 98.762557 59.854594 \n",
+ "std 4.017484 7.501603 29.993586 \n",
+ "min 0.000000 0.000000 1.000000 \n",
+ "25% 1.000000 100.000000 37.000000 \n",
+ "50% 1.000000 100.000000 65.000000 \n",
+ "75% 1.000000 100.000000 85.000000 \n",
+ "max 161.000000 100.000000 100.000000 \n",
+ "\n",
+ " Site EUI (kBtu/ft²) Weather Normalized Site EUI (kBtu/ft²) \\\n",
+ "count 11583.000000 10281.000000 \n",
+ "mean 280.071484 309.747466 \n",
+ "std 8607.178877 9784.731207 \n",
+ "min 0.000000 0.000000 \n",
+ "25% 61.800000 65.100000 \n",
+ "50% 78.500000 82.500000 \n",
+ "75% 97.600000 102.500000 \n",
+ "max 869265.000000 939329.000000 \n",
+ "\n",
+ " Weather Normalized Site Electricity Intensity (kWh/ft²) \\\n",
+ "count 10959.000000 \n",
+ "mean 11.072643 \n",
+ "std 127.733868 \n",
+ "min 0.000000 \n",
+ "25% 3.800000 \n",
+ "50% 5.300000 \n",
+ "75% 9.200000 \n",
+ "max 6259.400000 \n",
+ "\n",
+ " Weather Normalized Site Natural Gas Intensity (therms/ft²) \\\n",
+ "count 9783.000000 \n",
+ "mean 1.901441 \n",
+ "std 97.204587 \n",
+ "min 0.000000 \n",
+ "25% 0.100000 \n",
+ "50% 0.500000 \n",
+ "75% 0.700000 \n",
+ "max 9393.000000 \n",
+ "\n",
+ " Weather Normalized Source EUI (kBtu/ft²) Fuel Oil #1 Use (kBtu) \\\n",
+ "count 10281.000000 9.000000e+00 \n",
+ "mean 417.915709 3.395398e+06 \n",
+ "std 10530.524339 2.213237e+06 \n",
+ "min 0.000000 2.085973e+05 \n",
+ "25% 103.500000 1.663594e+06 \n",
+ "50% 129.400000 4.328815e+06 \n",
+ "75% 167.200000 4.938947e+06 \n",
+ "max 986366.000000 6.275850e+06 \n",
+ "\n",
+ " Fuel Oil #2 Use (kBtu) Fuel Oil #4 Use (kBtu) \\\n",
+ "count 2.581000e+03 1.321000e+03 \n",
+ "mean 3.186882e+06 5.294367e+06 \n",
+ "std 5.497154e+06 5.881863e+06 \n",
+ "min 0.000000e+00 0.000000e+00 \n",
+ "25% 2.550378e+05 2.128213e+06 \n",
+ "50% 1.380138e+06 4.312984e+06 \n",
+ "75% 4.445808e+06 6.514520e+06 \n",
+ "max 1.046849e+08 7.907464e+07 \n",
+ "\n",
+ " Fuel Oil #5 & 6 Use (kBtu) Diesel #2 Use (kBtu) \\\n",
+ "count 5.940000e+02 1.600000e+01 \n",
+ "mean 2.429105e+06 1.193594e+06 \n",
+ "std 4.442946e+06 3.558178e+06 \n",
+ "min 0.000000e+00 0.000000e+00 \n",
+ "25% 0.000000e+00 5.698020e+04 \n",
+ "50% 0.000000e+00 2.070020e+05 \n",
+ "75% 4.293825e+06 2.918332e+05 \n",
+ "max 4.410378e+07 1.435178e+07 \n",
+ "\n",
+ " District Steam Use (kBtu) Natural Gas Use (kBtu) \\\n",
+ "count 9.360000e+02 1.030400e+04 \n",
+ "mean 2.868907e+08 5.048545e+07 \n",
+ "std 3.124603e+09 3.914719e+09 \n",
+ "min -4.690797e+08 0.000000e+00 \n",
+ "25% 4.320254e+06 1.098251e+06 \n",
+ "50% 9.931240e+06 4.103962e+06 \n",
+ "75% 2.064497e+07 6.855070e+06 \n",
+ "max 7.163518e+10 3.942852e+11 \n",
+ "\n",
+ " Weather Normalized Site Natural Gas Use (therms) \\\n",
+ "count 9.784000e+03 \n",
+ "mean 5.364578e+05 \n",
+ "std 4.022606e+07 \n",
+ "min 0.000000e+00 \n",
+ "25% 1.176952e+04 \n",
+ "50% 4.445525e+04 \n",
+ "75% 7.348107e+04 \n",
+ "max 3.942852e+09 \n",
+ "\n",
+ " Electricity Use - Grid Purchase (kBtu) \\\n",
+ "count 1.150200e+04 \n",
+ "mean 5.965472e+06 \n",
+ "std 3.154430e+07 \n",
+ "min 0.000000e+00 \n",
+ "25% 1.043673e+06 \n",
+ "50% 1.855196e+06 \n",
+ "75% 4.370302e+06 \n",
+ "max 1.691763e+09 \n",
+ "\n",
+ " Weather Normalized Site Electricity (kWh) \\\n",
+ "count 1.096000e+04 \n",
+ "mean 1.768752e+06 \n",
+ "std 9.389154e+06 \n",
+ "min 0.000000e+00 \n",
+ "25% 3.019974e+05 \n",
+ "50% 5.416312e+05 \n",
+ "75% 1.284677e+06 \n",
+ "max 4.958273e+08 \n",
+ "\n",
+ " Total GHG Emissions (Metric Tons CO2e) \\\n",
+ "count 1.167200e+04 \n",
+ "mean 4.553657e+03 \n",
+ "std 2.041639e+05 \n",
+ "min 0.000000e+00 \n",
+ "25% 3.287000e+02 \n",
+ "50% 5.002500e+02 \n",
+ "75% 9.084250e+02 \n",
+ "max 2.094340e+07 \n",
+ "\n",
+ " Direct GHG Emissions (Metric Tons CO2e) \\\n",
+ "count 1.166300e+04 \n",
+ "mean 2.477937e+03 \n",
+ "std 1.954498e+05 \n",
+ "min 0.000000e+00 \n",
+ "25% 1.474500e+02 \n",
+ "50% 2.726000e+02 \n",
+ "75% 4.475000e+02 \n",
+ "max 2.094340e+07 \n",
+ "\n",
+ " Indirect GHG Emissions (Metric Tons CO2e) \\\n",
+ "count 1.168100e+04 \n",
+ "mean 2.076339e+03 \n",
+ "std 5.931295e+04 \n",
+ "min -2.313430e+04 \n",
+ "25% 9.480000e+01 \n",
+ "50% 1.718000e+02 \n",
+ "75% 4.249000e+02 \n",
+ "max 4.764375e+06 \n",
+ "\n",
+ " Property GFA - Self-Reported (ft²) \\\n",
+ "count 1.174600e+04 \n",
+ "mean 1.673739e+05 \n",
+ "std 3.189238e+05 \n",
+ "min 0.000000e+00 \n",
+ "25% 6.699400e+04 \n",
+ "50% 9.408000e+04 \n",
+ "75% 1.584140e+05 \n",
+ "max 1.421712e+07 \n",
+ "\n",
+ " Water Use (All Water Sources) (kgal) \\\n",
+ "count 7.762000e+03 \n",
+ "mean 1.591798e+04 \n",
+ "std 1.529524e+05 \n",
+ "min 0.000000e+00 \n",
+ "25% 2.595400e+03 \n",
+ "50% 4.692500e+03 \n",
+ "75% 8.031875e+03 \n",
+ "max 6.594604e+06 \n",
+ "\n",
+ " Water Intensity (All Water Sources) (gal/ft²) Source EUI (kBtu/ft²) \\\n",
+ "count 7762.000000 11583.000000 \n",
+ "mean 136.172432 385.908029 \n",
+ "std 1730.726938 9312.736225 \n",
+ "min 0.000000 0.000000 \n",
+ "25% 27.150000 99.400000 \n",
+ "50% 45.095000 124.900000 \n",
+ "75% 70.805000 162.750000 \n",
+ "max 96305.690000 912801.100000 \n",
+ "\n",
+ " Latitude Longitude Community Board Council District \\\n",
+ "count 9483.000000 9483.000000 9483.000000 9483.000000 \n",
+ "mean 40.754379 -73.957057 7.140673 15.771275 \n",
+ "std 0.080120 0.046337 3.954129 15.674375 \n",
+ "min 40.516065 -74.243582 1.000000 1.000000 \n",
+ "25% 40.707226 -73.984662 4.000000 4.000000 \n",
+ "50% 40.759130 -73.962810 7.000000 9.000000 \n",
+ "75% 40.817623 -73.932443 9.000000 33.000000 \n",
+ "max 40.912869 -73.715543 56.000000 51.000000 \n",
+ "\n",
+ " Census Tract \n",
+ "count 9483.000000 \n",
+ "mean 4977.596647 \n",
+ "std 13520.422990 \n",
+ "min 1.000000 \n",
+ "25% 100.000000 \n",
+ "50% 201.000000 \n",
+ "75% 531.500000 \n",
+ "max 155101.000000 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Statistics for each column\n",
+ "data.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 缺失值处理\n",
+ "下面提供一个函数,用于统计每一列缺失值的数量及其所占比例。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Function to calculate missing values by column\n",
+ "def missing_values_table(df):\n",
+ " # Total missing values\n",
+ " mis_val = df.isnull().sum()\n",
+ " \n",
+ " # Percentage of missing values\n",
+ " mis_val_percent = 100 * df.isnull().sum() / len(df)\n",
+ " \n",
+ " # Make a table with the results\n",
+ " mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)\n",
+ " \n",
+ " # Rename the columns\n",
+ " mis_val_table_ren_columns = mis_val_table.rename(\n",
+ " columns = {0 : 'Missing Values', 1 : '% of Total Values'})\n",
+ " \n",
+ " # Sort the table by percentage of missing descending\n",
+ " mis_val_table_ren_columns = mis_val_table_ren_columns[\n",
+ " mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(\n",
+ " '% of Total Values', ascending=False).round(1)\n",
+ " \n",
+ " # Print some summary information\n",
+ " print (\"Your selected dataframe has \" + str(df.shape[1]) + \" columns.\\n\" \n",
+ " \"There are \" + str(mis_val_table_ren_columns.shape[0]) +\n",
+ " \" columns that have missing values.\")\n",
+ " \n",
+ " \n",
+ " # Return the dataframe with missing information\n",
+ " return mis_val_table_ren_columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Your selected dataframe has 60 columns.\n",
+ "There are 46 columns that have missing values.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Missing Values | \n",
+ " % of Total Values | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Fuel Oil #1 Use (kBtu) | \n",
+ " 11737 | \n",
+ " 99.9 | \n",
+ "
\n",
+ " \n",
+ " Diesel #2 Use (kBtu) | \n",
+ " 11730 | \n",
+ " 99.9 | \n",
+ "
\n",
+ " \n",
+ " Address 2 | \n",
+ " 11539 | \n",
+ " 98.2 | \n",
+ "
\n",
+ " \n",
+ " Fuel Oil #5 & 6 Use (kBtu) | \n",
+ " 11152 | \n",
+ " 94.9 | \n",
+ "
\n",
+ " \n",
+ " District Steam Use (kBtu) | \n",
+ " 10810 | \n",
+ " 92.0 | \n",
+ "
\n",
+ " \n",
+ " Fuel Oil #4 Use (kBtu) | \n",
+ " 10425 | \n",
+ " 88.8 | \n",
+ "
\n",
+ " \n",
+ " 3rd Largest Property Use Type - Gross Floor Area (ft²) | \n",
+ " 10262 | \n",
+ " 87.4 | \n",
+ "
\n",
+ " \n",
+ " 3rd Largest Property Use Type | \n",
+ " 10262 | \n",
+ " 87.4 | \n",
+ "
\n",
+ " \n",
+ " Fuel Oil #2 Use (kBtu) | \n",
+ " 9165 | \n",
+ " 78.0 | \n",
+ "
\n",
+ " \n",
+ " 2nd Largest Property Use Type | \n",
+ " 8005 | \n",
+ " 68.2 | \n",
+ "
\n",
+ " \n",
+ " 2nd Largest Property Use - Gross Floor Area (ft²) | \n",
+ " 8005 | \n",
+ " 68.2 | \n",
+ "
\n",
+ " \n",
+ " Metered Areas (Water) | \n",
+ " 4609 | \n",
+ " 39.2 | \n",
+ "
\n",
+ " \n",
+ " Water Intensity (All Water Sources) (gal/ft²) | \n",
+ " 3984 | \n",
+ " 33.9 | \n",
+ "
\n",
+ " \n",
+ " Water Use (All Water Sources) (kgal) | \n",
+ " 3984 | \n",
+ " 33.9 | \n",
+ "
\n",
+ " \n",
+ " Latitude | \n",
+ " 2263 | \n",
+ " 19.3 | \n",
+ "
\n",
+ " \n",
+ " Longitude | \n",
+ " 2263 | \n",
+ " 19.3 | \n",
+ "
\n",
+ " \n",
+ " Community Board | \n",
+ " 2263 | \n",
+ " 19.3 | \n",
+ "
\n",
+ " \n",
+ " Council District | \n",
+ " 2263 | \n",
+ " 19.3 | \n",
+ "
\n",
+ " \n",
+ " Census Tract | \n",
+ " 2263 | \n",
+ " 19.3 | \n",
+ "
\n",
+ " \n",
+ " NTA | \n",
+ " 2263 | \n",
+ " 19.3 | \n",
+ "
\n",
+ " \n",
+ " ENERGY STAR Score | \n",
+ " 2104 | \n",
+ " 17.9 | \n",
+ "
\n",
+ " \n",
+ " Weather Normalized Site Natural Gas Intensity (therms/ft²) | \n",
+ " 1963 | \n",
+ " 16.7 | \n",
+ "
\n",
+ " \n",
+ " Weather Normalized Site Natural Gas Use (therms) | \n",
+ " 1962 | \n",
+ " 16.7 | \n",
+ "
\n",
+ " \n",
+ " Weather Normalized Source EUI (kBtu/ft²) | \n",
+ " 1465 | \n",
+ " 12.5 | \n",
+ "
\n",
+ " \n",
+ " Weather Normalized Site EUI (kBtu/ft²) | \n",
+ " 1465 | \n",
+ " 12.5 | \n",
+ "
\n",
+ " \n",
+ " Natural Gas Use (kBtu) | \n",
+ " 1442 | \n",
+ " 12.3 | \n",
+ "
\n",
+ " \n",
+ " Weather Normalized Site Electricity Intensity (kWh/ft²) | \n",
+ " 787 | \n",
+ " 6.7 | \n",
+ "
\n",
+ " \n",
+ " Weather Normalized Site Electricity (kWh) | \n",
+ " 786 | \n",
+ " 6.7 | \n",
+ "
\n",
+ " \n",
+ " Electricity Use - Grid Purchase (kBtu) | \n",
+ " 244 | \n",
+ " 2.1 | \n",
+ "
\n",
+ " \n",
+ " Site EUI (kBtu/ft²) | \n",
+ " 163 | \n",
+ " 1.4 | \n",
+ "
\n",
+ " \n",
+ " Source EUI (kBtu/ft²) | \n",
+ " 163 | \n",
+ " 1.4 | \n",
+ "
\n",
+ " \n",
+ " NYC Building Identification Number (BIN) | \n",
+ " 162 | \n",
+ " 1.4 | \n",
+ "
\n",
+ " \n",
+ " Street Number | \n",
+ " 124 | \n",
+ " 1.1 | \n",
+ "
\n",
+ " \n",
+ " Street Name | \n",
+ " 122 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " Borough | \n",
+ " 118 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " DOF Gross Floor Area | \n",
+ " 118 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " Water Required? | \n",
+ " 118 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " Direct GHG Emissions (Metric Tons CO2e) | \n",
+ " 83 | \n",
+ " 0.7 | \n",
+ "
\n",
+ " \n",
+ " Total GHG Emissions (Metric Tons CO2e) | \n",
+ " 74 | \n",
+ " 0.6 | \n",
+ "
\n",
+ " \n",
+ " Indirect GHG Emissions (Metric Tons CO2e) | \n",
+ " 65 | \n",
+ " 0.6 | \n",
+ "
\n",
+ " \n",
+ " Metered Areas (Energy) | \n",
+ " 57 | \n",
+ " 0.5 | \n",
+ "
\n",
+ " \n",
+ " DOF Benchmarking Submission Status | \n",
+ " 30 | \n",
+ " 0.3 | \n",
+ "
\n",
+ " \n",
+ " NYC Borough, Block and Lot (BBL) self-reported | \n",
+ " 11 | \n",
+ " 0.1 | \n",
+ "
\n",
+ " \n",
+ " BBL - 10 digits | \n",
+ " 11 | \n",
+ " 0.1 | \n",
+ "
\n",
+ " \n",
+ " Largest Property Use Type | \n",
+ " 2 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " Largest Property Use Type - Gross Floor Area (ft²) | \n",
+ " 2 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Missing Values \\\n",
+ "Fuel Oil #1 Use (kBtu) 11737 \n",
+ "Diesel #2 Use (kBtu) 11730 \n",
+ "Address 2 11539 \n",
+ "Fuel Oil #5 & 6 Use (kBtu) 11152 \n",
+ "District Steam Use (kBtu) 10810 \n",
+ "Fuel Oil #4 Use (kBtu) 10425 \n",
+ "3rd Largest Property Use Type - Gross Floor Are... 10262 \n",
+ "3rd Largest Property Use Type 10262 \n",
+ "Fuel Oil #2 Use (kBtu) 9165 \n",
+ "2nd Largest Property Use Type 8005 \n",
+ "2nd Largest Property Use - Gross Floor Area (ft²) 8005 \n",
+ "Metered Areas (Water) 4609 \n",
+ "Water Intensity (All Water Sources) (gal/ft²) 3984 \n",
+ "Water Use (All Water Sources) (kgal) 3984 \n",
+ "Latitude 2263 \n",
+ "Longitude 2263 \n",
+ "Community Board 2263 \n",
+ "Council District 2263 \n",
+ "Census Tract 2263 \n",
+ "NTA 2263 \n",
+ "ENERGY STAR Score 2104 \n",
+ "Weather Normalized Site Natural Gas Intensity (... 1963 \n",
+ "Weather Normalized Site Natural Gas Use (therms) 1962 \n",
+ "Weather Normalized Source EUI (kBtu/ft²) 1465 \n",
+ "Weather Normalized Site EUI (kBtu/ft²) 1465 \n",
+ "Natural Gas Use (kBtu) 1442 \n",
+ "Weather Normalized Site Electricity Intensity (... 787 \n",
+ "Weather Normalized Site Electricity (kWh) 786 \n",
+ "Electricity Use - Grid Purchase (kBtu) 244 \n",
+ "Site EUI (kBtu/ft²) 163 \n",
+ "Source EUI (kBtu/ft²) 163 \n",
+ "NYC Building Identification Number (BIN) 162 \n",
+ "Street Number 124 \n",
+ "Street Name 122 \n",
+ "Borough 118 \n",
+ "DOF Gross Floor Area 118 \n",
+ "Water Required? 118 \n",
+ "Direct GHG Emissions (Metric Tons CO2e) 83 \n",
+ "Total GHG Emissions (Metric Tons CO2e) 74 \n",
+ "Indirect GHG Emissions (Metric Tons CO2e) 65 \n",
+ "Metered Areas (Energy) 57 \n",
+ "DOF Benchmarking Submission Status 30 \n",
+ "NYC Borough, Block and Lot (BBL) self-reported 11 \n",
+ "BBL - 10 digits 11 \n",
+ "Largest Property Use Type 2 \n",
+ "Largest Property Use Type - Gross Floor Area (ft²) 2 \n",
+ "\n",
+ " % of Total Values \n",
+ "Fuel Oil #1 Use (kBtu) 99.9 \n",
+ "Diesel #2 Use (kBtu) 99.9 \n",
+ "Address 2 98.2 \n",
+ "Fuel Oil #5 & 6 Use (kBtu) 94.9 \n",
+ "District Steam Use (kBtu) 92.0 \n",
+ "Fuel Oil #4 Use (kBtu) 88.8 \n",
+ "3rd Largest Property Use Type - Gross Floor Are... 87.4 \n",
+ "3rd Largest Property Use Type 87.4 \n",
+ "Fuel Oil #2 Use (kBtu) 78.0 \n",
+ "2nd Largest Property Use Type 68.2 \n",
+ "2nd Largest Property Use - Gross Floor Area (ft²) 68.2 \n",
+ "Metered Areas (Water) 39.2 \n",
+ "Water Intensity (All Water Sources) (gal/ft²) 33.9 \n",
+ "Water Use (All Water Sources) (kgal) 33.9 \n",
+ "Latitude 19.3 \n",
+ "Longitude 19.3 \n",
+ "Community Board 19.3 \n",
+ "Council District 19.3 \n",
+ "Census Tract 19.3 \n",
+ "NTA 19.3 \n",
+ "ENERGY STAR Score 17.9 \n",
+ "Weather Normalized Site Natural Gas Intensity (... 16.7 \n",
+ "Weather Normalized Site Natural Gas Use (therms) 16.7 \n",
+ "Weather Normalized Source EUI (kBtu/ft²) 12.5 \n",
+ "Weather Normalized Site EUI (kBtu/ft²) 12.5 \n",
+ "Natural Gas Use (kBtu) 12.3 \n",
+ "Weather Normalized Site Electricity Intensity (... 6.7 \n",
+ "Weather Normalized Site Electricity (kWh) 6.7 \n",
+ "Electricity Use - Grid Purchase (kBtu) 2.1 \n",
+ "Site EUI (kBtu/ft²) 1.4 \n",
+ "Source EUI (kBtu/ft²) 1.4 \n",
+ "NYC Building Identification Number (BIN) 1.4 \n",
+ "Street Number 1.1 \n",
+ "Street Name 1.0 \n",
+ "Borough 1.0 \n",
+ "DOF Gross Floor Area 1.0 \n",
+ "Water Required? 1.0 \n",
+ "Direct GHG Emissions (Metric Tons CO2e) 0.7 \n",
+ "Total GHG Emissions (Metric Tons CO2e) 0.6 \n",
+ "Indirect GHG Emissions (Metric Tons CO2e) 0.6 \n",
+ "Metered Areas (Energy) 0.5 \n",
+ "DOF Benchmarking Submission Status 0.3 \n",
+ "NYC Borough, Block and Lot (BBL) self-reported 0.1 \n",
+ "BBL - 10 digits 0.1 \n",
+ "Largest Property Use Type 0.0 \n",
+ "Largest Property Use Type - Gross Floor Area (ft²) 0.0 "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "missing_values_table(data)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Your selected dataframe has 60 columns.\n",
+ "There are 46 columns that have missing values.\n",
+ "We will remove 11 columns.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Get the columns with > 50% missing\n",
+ "missing_df = missing_values_table(data);\n",
+ "missing_columns = list(missing_df[missing_df['% of Total Values']> 50].index)\n",
+ "print('We will remove %d columns.'% len(missing_columns))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop the columns\n",
+ "data = data.drop(columns = list(missing_columns))"
]
},
{