From 9cd0cc6063305a8220f28ee38012672fdd1ba647 Mon Sep 17 00:00:00 2001 From: benjas <909336740@qq.com> Date: Mon, 21 Dec 2020 21:33:57 +0800 Subject: [PATCH] =?UTF-8?q?Add=20=E5=A4=84=E7=90=86=E7=BC=BA=E5=A4=B1?= =?UTF-8?q?=E5=80=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...筑能源利用率预测-checkpoint.ipynb | 1118 ++++++++++++++++- 1 file changed, 1102 insertions(+), 16 deletions(-) diff --git a/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/.ipynb_checkpoints/建筑能源利用率预测-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/.ipynb_checkpoints/建筑能源利用率预测-checkpoint.ipynb index ad03c1d..4194fc4 100644 --- a/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/.ipynb_checkpoints/建筑能源利用率预测-checkpoint.ipynb +++ b/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/.ipynb_checkpoints/建筑能源利用率预测-checkpoint.ipynb @@ -36,18 +36,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "D:\\Anaconda3\\lib\\importlib\\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n", - " return f(*args, **kwds)\n" - ] - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", @@ -78,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": { "scrolled": true }, @@ -719,7 +710,7 @@ "4 Washington Heights South ... " ] }, - "execution_count": 4, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -749,7 +740,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": { "scrolled": true }, @@ -839,12 +830,1107 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Replace all occurrences of Not Available with numpy not a number\n", - "data = data.replace({'Not Available':np.nan})" + "data = data.replace({'Not Available':np.nan})\n", + "\n", + "# Iterate through the columns\n", + "for col in list(data.columns):\n", + " if ('ft²' in col or 'kBtu' in col or 'Metric Tons CO2e' in col or 'kWh' in\n", + " col or 'therms' in col or 'gal' in col or 'Score' in col):\n", + " # Convert the data type to float\n", + " data[col] = data[col].astype(float)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
OrderProperty IdDOF Gross Floor AreaLargest Property Use Type - Gross Floor Area (ft²)2nd Largest Property Use - Gross Floor Area (ft²)3rd Largest Property Use Type - Gross Floor Area (ft²)Year BuiltNumber of Buildings - Self-reportedOccupancyENERGY STAR ScoreSite EUI (kBtu/ft²)Weather Normalized Site EUI (kBtu/ft²)Weather Normalized Site Electricity Intensity (kWh/ft²)Weather Normalized Site Natural Gas Intensity (therms/ft²)Weather Normalized Source EUI (kBtu/ft²)Fuel Oil #1 Use (kBtu)Fuel Oil #2 Use (kBtu)Fuel Oil #4 Use (kBtu)Fuel Oil #5 & 6 Use (kBtu)Diesel #2 Use (kBtu)District Steam Use (kBtu)Natural Gas Use (kBtu)Weather Normalized Site Natural Gas Use (therms)Electricity Use - Grid Purchase (kBtu)Weather Normalized Site Electricity (kWh)Total GHG Emissions (Metric Tons CO2e)Direct GHG Emissions (Metric Tons CO2e)Indirect GHG Emissions (Metric Tons CO2e)Property GFA - Self-Reported (ft²)Water Use (All Water Sources) (kgal)Water Intensity (All Water Sources) (gal/ft²)Source EUI (kBtu/ft²)LatitudeLongitudeCommunity BoardCouncil DistrictCensus Tract
count11746.0000001.174600e+041.162800e+041.174400e+043741.0000001484.00000011746.00000011746.00000011746.0000009642.00000011583.00000010281.00000010959.0000009783.00000010281.0000009.000000e+002.581000e+031.321000e+035.940000e+021.600000e+019.360000e+021.030400e+049.784000e+031.150200e+041.096000e+041.167200e+041.166300e+041.168100e+041.174600e+047.762000e+037762.00000011583.0000009483.0000009483.0000009483.0000009483.0000009483.000000
mean7185.7595783.642958e+061.732695e+051.605524e+0522778.68201012016.8252701948.7383791.28997198.76255759.854594280.071484309.74746611.0726431.901441417.9157093.395398e+063.186882e+065.294367e+062.429105e+061.193594e+062.868907e+085.048545e+075.364578e+055.965472e+061.768752e+064.553657e+032.477937e+032.076339e+031.673739e+051.591798e+04136.172432385.90802940.754379-73.9570577.14067315.7712754977.596647
std4323.8599841.049070e+063.367055e+053.095746e+0555094.44142227959.75548630.5763864.0174847.50160329.9935868607.1788779784.731207127.73386897.20458710530.5243392.213237e+065.497154e+065.881863e+064.442946e+063.558178e+063.124603e+093.914719e+094.022606e+073.154430e+079.389154e+062.041639e+051.954498e+055.931295e+043.189238e+051.529524e+051730.7269389312.7362250.0801200.0463373.95412915.67437513520.422990
min1.0000007.365000e+035.002800e+045.400000e+010.0000000.0000001600.0000000.0000000.0000001.0000000.0000000.0000000.0000000.0000000.0000002.085973e+050.000000e+000.000000e+000.000000e+000.000000e+00-4.690797e+080.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00-2.313430e+040.000000e+000.000000e+000.0000000.00000040.516065-74.2435821.0000001.0000001.000000
25%3428.2500002.747222e+066.524000e+046.520100e+044000.0000001720.7500001927.0000001.000000100.00000037.00000061.80000065.1000003.8000000.100000103.5000001.663594e+062.550378e+052.128213e+060.000000e+005.698020e+044.320254e+061.098251e+061.176952e+041.043673e+063.019974e+053.287000e+021.474500e+029.480000e+016.699400e+042.595400e+0327.15000099.40000040.707226-73.9846624.0000004.000000100.000000
50%6986.5000003.236404e+069.313850e+049.132400e+048654.0000005000.0000001941.0000001.000000100.00000065.00000078.50000082.5000005.3000000.500000129.4000004.328815e+061.380138e+064.312984e+060.000000e+002.070020e+059.931240e+064.103962e+064.445525e+041.855196e+065.416312e+055.002500e+022.726000e+021.718000e+029.408000e+044.692500e+0345.095000124.90000040.759130-73.9628107.0000009.000000201.000000
75%11054.5000004.409092e+061.596140e+051.532550e+0520000.00000012000.0000001966.0000001.000000100.00000085.00000097.600000102.5000009.2000000.700000167.2000004.938947e+064.445808e+066.514520e+064.293825e+062.918332e+052.064497e+076.855070e+067.348107e+044.370302e+061.284677e+069.084250e+024.475000e+024.249000e+021.584140e+058.031875e+0370.805000162.75000040.817623-73.9324439.00000033.000000531.500000
max14993.0000005.991312e+061.354011e+071.421712e+07962428.000000591640.0000002019.000000161.000000100.000000100.000000869265.000000939329.0000006259.4000009393.000000986366.0000006.275850e+061.046849e+087.907464e+074.410378e+071.435178e+077.163518e+103.942852e+113.942852e+091.691763e+094.958273e+082.094340e+072.094340e+074.764375e+061.421712e+076.594604e+0696305.690000912801.10000040.912869-73.71554356.00000051.000000155101.000000
\n", + "
" + ], + "text/plain": [ + " Order Property Id DOF Gross Floor Area \\\n", + "count 11746.000000 1.174600e+04 1.162800e+04 \n", + "mean 7185.759578 3.642958e+06 1.732695e+05 \n", + "std 4323.859984 1.049070e+06 3.367055e+05 \n", + "min 1.000000 7.365000e+03 5.002800e+04 \n", + "25% 3428.250000 2.747222e+06 6.524000e+04 \n", + "50% 6986.500000 3.236404e+06 9.313850e+04 \n", + "75% 11054.500000 4.409092e+06 1.596140e+05 \n", + "max 14993.000000 5.991312e+06 1.354011e+07 \n", + "\n", + " Largest Property Use Type - Gross Floor Area (ft²) \\\n", + "count 1.174400e+04 \n", + "mean 1.605524e+05 \n", + "std 3.095746e+05 \n", + "min 5.400000e+01 \n", + "25% 6.520100e+04 \n", + "50% 9.132400e+04 \n", + "75% 1.532550e+05 \n", + "max 1.421712e+07 \n", + "\n", + " 2nd Largest Property Use - Gross Floor Area (ft²) \\\n", + "count 3741.000000 \n", + "mean 22778.682010 \n", + "std 55094.441422 \n", + "min 0.000000 \n", + "25% 4000.000000 \n", + "50% 8654.000000 \n", + "75% 20000.000000 \n", + "max 962428.000000 \n", + "\n", + " 3rd Largest Property Use Type - Gross Floor Area (ft²) Year Built \\\n", + "count 1484.000000 11746.000000 \n", + "mean 12016.825270 1948.738379 \n", + "std 27959.755486 30.576386 \n", + "min 0.000000 1600.000000 \n", + "25% 1720.750000 1927.000000 \n", + "50% 5000.000000 1941.000000 \n", + "75% 12000.000000 1966.000000 \n", + "max 591640.000000 2019.000000 \n", + "\n", + " Number of Buildings - Self-reported Occupancy ENERGY STAR Score \\\n", + "count 11746.000000 11746.000000 9642.000000 \n", + "mean 1.289971 98.762557 59.854594 \n", + "std 4.017484 7.501603 29.993586 \n", + "min 0.000000 0.000000 1.000000 \n", + "25% 1.000000 100.000000 37.000000 \n", + "50% 1.000000 100.000000 65.000000 \n", + "75% 1.000000 100.000000 85.000000 \n", + "max 161.000000 100.000000 100.000000 \n", + "\n", + " Site EUI (kBtu/ft²) Weather Normalized Site EUI (kBtu/ft²) \\\n", + "count 11583.000000 10281.000000 \n", + "mean 280.071484 309.747466 \n", + "std 8607.178877 9784.731207 \n", + "min 0.000000 0.000000 \n", + "25% 61.800000 65.100000 \n", + "50% 78.500000 82.500000 \n", + "75% 97.600000 102.500000 \n", + "max 869265.000000 939329.000000 \n", + "\n", + " Weather Normalized Site Electricity Intensity (kWh/ft²) \\\n", + "count 10959.000000 \n", + "mean 11.072643 \n", + "std 127.733868 \n", + "min 0.000000 \n", + "25% 3.800000 \n", + "50% 5.300000 \n", + "75% 9.200000 \n", + "max 6259.400000 \n", + "\n", + " Weather Normalized Site Natural Gas Intensity (therms/ft²) \\\n", + "count 9783.000000 \n", + "mean 1.901441 \n", + "std 97.204587 \n", + "min 0.000000 \n", + "25% 0.100000 \n", + "50% 0.500000 \n", + "75% 0.700000 \n", + "max 9393.000000 \n", + "\n", + " Weather Normalized Source EUI (kBtu/ft²) Fuel Oil #1 Use (kBtu) \\\n", + "count 10281.000000 9.000000e+00 \n", + "mean 417.915709 3.395398e+06 \n", + "std 10530.524339 2.213237e+06 \n", + "min 0.000000 2.085973e+05 \n", + "25% 103.500000 1.663594e+06 \n", + "50% 129.400000 4.328815e+06 \n", + "75% 167.200000 4.938947e+06 \n", + "max 986366.000000 6.275850e+06 \n", + "\n", + " Fuel Oil #2 Use (kBtu) Fuel Oil #4 Use (kBtu) \\\n", + "count 2.581000e+03 1.321000e+03 \n", + "mean 3.186882e+06 5.294367e+06 \n", + "std 5.497154e+06 5.881863e+06 \n", + "min 0.000000e+00 0.000000e+00 \n", + "25% 2.550378e+05 2.128213e+06 \n", + "50% 1.380138e+06 4.312984e+06 \n", + "75% 4.445808e+06 6.514520e+06 \n", + "max 1.046849e+08 7.907464e+07 \n", + "\n", + " Fuel Oil #5 & 6 Use (kBtu) Diesel #2 Use (kBtu) \\\n", + "count 5.940000e+02 1.600000e+01 \n", + "mean 2.429105e+06 1.193594e+06 \n", + "std 4.442946e+06 3.558178e+06 \n", + "min 0.000000e+00 0.000000e+00 \n", + "25% 0.000000e+00 5.698020e+04 \n", + "50% 0.000000e+00 2.070020e+05 \n", + "75% 4.293825e+06 2.918332e+05 \n", + "max 4.410378e+07 1.435178e+07 \n", + "\n", + " District Steam Use (kBtu) Natural Gas Use (kBtu) \\\n", + "count 9.360000e+02 1.030400e+04 \n", + "mean 2.868907e+08 5.048545e+07 \n", + "std 3.124603e+09 3.914719e+09 \n", + "min -4.690797e+08 0.000000e+00 \n", + "25% 4.320254e+06 1.098251e+06 \n", + "50% 9.931240e+06 4.103962e+06 \n", + "75% 2.064497e+07 6.855070e+06 \n", + "max 7.163518e+10 3.942852e+11 \n", + "\n", + " Weather Normalized Site Natural Gas Use (therms) \\\n", + "count 9.784000e+03 \n", + "mean 5.364578e+05 \n", + "std 4.022606e+07 \n", + "min 0.000000e+00 \n", + "25% 1.176952e+04 \n", + "50% 4.445525e+04 \n", + "75% 7.348107e+04 \n", + "max 3.942852e+09 \n", + "\n", + " Electricity Use - Grid Purchase (kBtu) \\\n", + "count 1.150200e+04 \n", + "mean 5.965472e+06 \n", + "std 3.154430e+07 \n", + "min 0.000000e+00 \n", + "25% 1.043673e+06 \n", + "50% 1.855196e+06 \n", + "75% 4.370302e+06 \n", + "max 1.691763e+09 \n", + "\n", + " Weather Normalized Site Electricity (kWh) \\\n", + "count 1.096000e+04 \n", + "mean 1.768752e+06 \n", + "std 9.389154e+06 \n", + "min 0.000000e+00 \n", + "25% 3.019974e+05 \n", + "50% 5.416312e+05 \n", + "75% 1.284677e+06 \n", + "max 4.958273e+08 \n", + "\n", + " Total GHG Emissions (Metric Tons CO2e) \\\n", + "count 1.167200e+04 \n", + "mean 4.553657e+03 \n", + "std 2.041639e+05 \n", + "min 0.000000e+00 \n", + "25% 3.287000e+02 \n", + "50% 5.002500e+02 \n", + "75% 9.084250e+02 \n", + "max 2.094340e+07 \n", + "\n", + " Direct GHG Emissions (Metric Tons CO2e) \\\n", + "count 1.166300e+04 \n", + "mean 2.477937e+03 \n", + "std 1.954498e+05 \n", + "min 0.000000e+00 \n", + "25% 1.474500e+02 \n", + "50% 2.726000e+02 \n", + "75% 4.475000e+02 \n", + "max 2.094340e+07 \n", + "\n", + " Indirect GHG Emissions (Metric Tons CO2e) \\\n", + "count 1.168100e+04 \n", + "mean 2.076339e+03 \n", + "std 5.931295e+04 \n", + "min -2.313430e+04 \n", + "25% 9.480000e+01 \n", + "50% 1.718000e+02 \n", + "75% 4.249000e+02 \n", + "max 4.764375e+06 \n", + "\n", + " Property GFA - Self-Reported (ft²) \\\n", + "count 1.174600e+04 \n", + "mean 1.673739e+05 \n", + "std 3.189238e+05 \n", + "min 0.000000e+00 \n", + "25% 6.699400e+04 \n", + "50% 9.408000e+04 \n", + "75% 1.584140e+05 \n", + "max 1.421712e+07 \n", + "\n", + " Water Use (All Water Sources) (kgal) \\\n", + "count 7.762000e+03 \n", + "mean 1.591798e+04 \n", + "std 1.529524e+05 \n", + "min 0.000000e+00 \n", + "25% 2.595400e+03 \n", + "50% 4.692500e+03 \n", + "75% 8.031875e+03 \n", + "max 6.594604e+06 \n", + "\n", + " Water Intensity (All Water Sources) (gal/ft²) Source EUI (kBtu/ft²) \\\n", + "count 7762.000000 11583.000000 \n", + "mean 136.172432 385.908029 \n", + "std 1730.726938 9312.736225 \n", + "min 0.000000 0.000000 \n", + "25% 27.150000 99.400000 \n", + "50% 45.095000 124.900000 \n", + "75% 70.805000 162.750000 \n", + "max 96305.690000 912801.100000 \n", + "\n", + " Latitude Longitude Community Board Council District \\\n", + "count 9483.000000 9483.000000 9483.000000 9483.000000 \n", + "mean 40.754379 -73.957057 7.140673 15.771275 \n", + "std 0.080120 0.046337 3.954129 15.674375 \n", + "min 40.516065 -74.243582 1.000000 1.000000 \n", + "25% 40.707226 -73.984662 4.000000 4.000000 \n", + "50% 40.759130 -73.962810 7.000000 9.000000 \n", + "75% 40.817623 -73.932443 9.000000 33.000000 \n", + "max 40.912869 -73.715543 56.000000 51.000000 \n", + "\n", + " Census Tract \n", + "count 9483.000000 \n", + "mean 4977.596647 \n", + "std 13520.422990 \n", + "min 1.000000 \n", + "25% 100.000000 \n", + "50% 201.000000 \n", + "75% 531.500000 \n", + "max 155101.000000 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Statistics for each column\n", + "data.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 缺失值处理\n", + "每个列缺失的比例,这里提供一个函数。" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Function to calculate missing values by column\n", + "def missing_values_table(df):\n", + " # Total missing values\n", + " mis_val = df.isnull().sum()\n", + " \n", + " # Percentage of missing values\n", + " mis_val_percent = 100 * df.isnull().sum() / len(df)\n", + " \n", + " # Make a table with the results\n", + " mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)\n", + " \n", + " # Rename the columns\n", + " mis_val_table_ren_columns = mis_val_table.rename(\n", + " columns = {0 : 'Missing Values', 1 : '% of Total Values'})\n", + " \n", + " # Sort the table by percentage of missing descending\n", + " mis_val_table_ren_columns = mis_val_table_ren_columns[\n", + " mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(\n", + " '% of Total Values', ascending=False).round(1)\n", + " \n", + " # Print some summary information\n", + " print (\"Your selected dataframe has \" + str(df.shape[1]) + \" columns.\\n\" \n", + " \"There are \" + str(mis_val_table_ren_columns.shape[0]) +\n", + " \" columns that have missing values.\")\n", + " \n", + " \n", + " # Return the dataframe with missing information\n", + " return mis_val_table_ren_columns" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Your selected dataframe has 60 columns.\n", + "There are 46 columns that have missing values.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Missing Values% of Total Values
Fuel Oil #1 Use (kBtu)1173799.9
Diesel #2 Use (kBtu)1173099.9
Address 21153998.2
Fuel Oil #5 & 6 Use (kBtu)1115294.9
District Steam Use (kBtu)1081092.0
Fuel Oil #4 Use (kBtu)1042588.8
3rd Largest Property Use Type - Gross Floor Area (ft²)1026287.4
3rd Largest Property Use Type1026287.4
Fuel Oil #2 Use (kBtu)916578.0
2nd Largest Property Use Type800568.2
2nd Largest Property Use - Gross Floor Area (ft²)800568.2
Metered Areas (Water)460939.2
Water Intensity (All Water Sources) (gal/ft²)398433.9
Water Use (All Water Sources) (kgal)398433.9
Latitude226319.3
Longitude226319.3
Community Board226319.3
Council District226319.3
Census Tract226319.3
NTA226319.3
ENERGY STAR Score210417.9
Weather Normalized Site Natural Gas Intensity (therms/ft²)196316.7
Weather Normalized Site Natural Gas Use (therms)196216.7
Weather Normalized Source EUI (kBtu/ft²)146512.5
Weather Normalized Site EUI (kBtu/ft²)146512.5
Natural Gas Use (kBtu)144212.3
Weather Normalized Site Electricity Intensity (kWh/ft²)7876.7
Weather Normalized Site Electricity (kWh)7866.7
Electricity Use - Grid Purchase (kBtu)2442.1
Site EUI (kBtu/ft²)1631.4
Source EUI (kBtu/ft²)1631.4
NYC Building Identification Number (BIN)1621.4
Street Number1241.1
Street Name1221.0
Borough1181.0
DOF Gross Floor Area1181.0
Water Required?1181.0
Direct GHG Emissions (Metric Tons CO2e)830.7
Total GHG Emissions (Metric Tons CO2e)740.6
Indirect GHG Emissions (Metric Tons CO2e)650.6
Metered Areas (Energy)570.5
DOF Benchmarking Submission Status300.3
NYC Borough, Block and Lot (BBL) self-reported110.1
BBL - 10 digits110.1
Largest Property Use Type20.0
Largest Property Use Type - Gross Floor Area (ft²)20.0
\n", + "
" + ], + "text/plain": [ + " Missing Values \\\n", + "Fuel Oil #1 Use (kBtu) 11737 \n", + "Diesel #2 Use (kBtu) 11730 \n", + "Address 2 11539 \n", + "Fuel Oil #5 & 6 Use (kBtu) 11152 \n", + "District Steam Use (kBtu) 10810 \n", + "Fuel Oil #4 Use (kBtu) 10425 \n", + "3rd Largest Property Use Type - Gross Floor Are... 10262 \n", + "3rd Largest Property Use Type 10262 \n", + "Fuel Oil #2 Use (kBtu) 9165 \n", + "2nd Largest Property Use Type 8005 \n", + "2nd Largest Property Use - Gross Floor Area (ft²) 8005 \n", + "Metered Areas (Water) 4609 \n", + "Water Intensity (All Water Sources) (gal/ft²) 3984 \n", + "Water Use (All Water Sources) (kgal) 3984 \n", + "Latitude 2263 \n", + "Longitude 2263 \n", + "Community Board 2263 \n", + "Council District 2263 \n", + "Census Tract 2263 \n", + "NTA 2263 \n", + "ENERGY STAR Score 2104 \n", + "Weather Normalized Site Natural Gas Intensity (... 1963 \n", + "Weather Normalized Site Natural Gas Use (therms) 1962 \n", + "Weather Normalized Source EUI (kBtu/ft²) 1465 \n", + "Weather Normalized Site EUI (kBtu/ft²) 1465 \n", + "Natural Gas Use (kBtu) 1442 \n", + "Weather Normalized Site Electricity Intensity (... 787 \n", + "Weather Normalized Site Electricity (kWh) 786 \n", + "Electricity Use - Grid Purchase (kBtu) 244 \n", + "Site EUI (kBtu/ft²) 163 \n", + "Source EUI (kBtu/ft²) 163 \n", + "NYC Building Identification Number (BIN) 162 \n", + "Street Number 124 \n", + "Street Name 122 \n", + "Borough 118 \n", + "DOF Gross Floor Area 118 \n", + "Water Required? 118 \n", + "Direct GHG Emissions (Metric Tons CO2e) 83 \n", + "Total GHG Emissions (Metric Tons CO2e) 74 \n", + "Indirect GHG Emissions (Metric Tons CO2e) 65 \n", + "Metered Areas (Energy) 57 \n", + "DOF Benchmarking Submission Status 30 \n", + "NYC Borough, Block and Lot (BBL) self-reported 11 \n", + "BBL - 10 digits 11 \n", + "Largest Property Use Type 2 \n", + "Largest Property Use Type - Gross Floor Area (ft²) 2 \n", + "\n", + " % of Total Values \n", + "Fuel Oil #1 Use (kBtu) 99.9 \n", + "Diesel #2 Use (kBtu) 99.9 \n", + "Address 2 98.2 \n", + "Fuel Oil #5 & 6 Use (kBtu) 94.9 \n", + "District Steam Use (kBtu) 92.0 \n", + "Fuel Oil #4 Use (kBtu) 88.8 \n", + "3rd Largest Property Use Type - Gross Floor Are... 87.4 \n", + "3rd Largest Property Use Type 87.4 \n", + "Fuel Oil #2 Use (kBtu) 78.0 \n", + "2nd Largest Property Use Type 68.2 \n", + "2nd Largest Property Use - Gross Floor Area (ft²) 68.2 \n", + "Metered Areas (Water) 39.2 \n", + "Water Intensity (All Water Sources) (gal/ft²) 33.9 \n", + "Water Use (All Water Sources) (kgal) 33.9 \n", + "Latitude 19.3 \n", + "Longitude 19.3 \n", + "Community Board 19.3 \n", + "Council District 19.3 \n", + "Census Tract 19.3 \n", + "NTA 19.3 \n", + "ENERGY STAR Score 17.9 \n", + "Weather Normalized Site Natural Gas Intensity (... 16.7 \n", + "Weather Normalized Site Natural Gas Use (therms) 16.7 \n", + "Weather Normalized Source EUI (kBtu/ft²) 12.5 \n", + "Weather Normalized Site EUI (kBtu/ft²) 12.5 \n", + "Natural Gas Use (kBtu) 12.3 \n", + "Weather Normalized Site Electricity Intensity (... 6.7 \n", + "Weather Normalized Site Electricity (kWh) 6.7 \n", + "Electricity Use - Grid Purchase (kBtu) 2.1 \n", + "Site EUI (kBtu/ft²) 1.4 \n", + "Source EUI (kBtu/ft²) 1.4 \n", + "NYC Building Identification Number (BIN) 1.4 \n", + "Street Number 1.1 \n", + "Street Name 1.0 \n", + "Borough 1.0 \n", + "DOF Gross Floor Area 1.0 \n", + "Water Required? 1.0 \n", + "Direct GHG Emissions (Metric Tons CO2e) 0.7 \n", + "Total GHG Emissions (Metric Tons CO2e) 0.6 \n", + "Indirect GHG Emissions (Metric Tons CO2e) 0.6 \n", + "Metered Areas (Energy) 0.5 \n", + "DOF Benchmarking Submission Status 0.3 \n", + "NYC Borough, Block and Lot (BBL) self-reported 0.1 \n", + "BBL - 10 digits 0.1 \n", + "Largest Property Use Type 0.0 \n", + "Largest Property Use Type - Gross Floor Area (ft²) 0.0 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "missing_values_table(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Your selected dataframe has 60 columns.\n", + "There are 46 columns that have missing values.\n", + "We will remove 11 columns.\n" + ] + } + ], + "source": [ + "# Get the columns with > 50% missing\n", + "missing_df = missing_values_table(data);\n", + "missing_columns = list(missing_df[missing_df['% of Total Values']> 50].index)\n", + "print('We will remove %d columns.'% len(missing_columns))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop the columns\n", + "data = data.drop(columns = list(missing_columns))" ] }, {