From a60aa03c487119f1ba64ab97bfc560ec1d87fe93 Mon Sep 17 00:00:00 2001
From: Nirmalya Misra <39618712+nirmalya8@users.noreply.github.com>
Date: Wed, 6 Oct 2021 00:06:56 +0530
Subject: [PATCH] Added Encoding

---
 .../08-data-preparation/notebook.ipynb        | 1710 ++++++++++++++---
 1 file changed, 1492 insertions(+), 218 deletions(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index b5a6bac..71b076e 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -79,7 +79,7 @@
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
-        "outputId": "4641a412-8abb-4e2f-d1ec-ff9b5004e361"
+        "outputId": "70e0d7dd-fb30-45c4-a5af-7dc85cd89342"
       },
       "source": [
         "iris_df.shape"
@@ -126,7 +126,7 @@
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
-        "outputId": "0f9c41ea-d480-4245-d7e2-56d514ac7724"
+        "outputId": "85e6ab39-174f-4dc7-fee6-a18f3ba14a7d"
       },
       "source": [
         "iris_df.columns"
@@ -174,7 +174,7 @@
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
-        "outputId": "94d5e48a-746c-4e58-b08f-c63b377a61b1"
+        "outputId": "2a2bb81a-257c-4410-f826-99402b75ce14"
       },
       "source": [
         "iris_df.info()"
@@ -230,7 +230,7 @@
           "base_uri": "https://localhost:8080/",
           "height": 297
         },
-        "outputId": "b01322a1-4296-4ad0-f990-6e0dcba668f6"
+        "outputId": "e5015299-163f-42c7-aaa1-9bc3a67788bf"
       },
       "source": [
         "iris_df.describe()"
@@ -373,7 +373,7 @@
           "base_uri": "https://localhost:8080/",
           "height": 204
         },
-        "outputId": "14b1e3cd-54ac-47dc-f7b2-231d51d93741"
+        "outputId": "5ff975df-45f0-4efd-f884-2580909c6e67"
       },
       "source": [
         "iris_df.head()"
@@ -492,7 +492,7 @@
       "source": [
         "# Hint: Consult the documentation by using iris_df.head?"
       ],
-      "execution_count": 7,
+      "execution_count": null,
       "outputs": []
     },
     {
@@ -514,12 +514,12 @@
           "base_uri": "https://localhost:8080/",
           "height": 204
         },
-        "outputId": "d4e22b38-ba5d-4dd1-bbd2-b9cd9ad7b150"
+        "outputId": "1726a2e0-82d7-4491-8dbc-637f28a11d26"
       },
       "source": [
         "iris_df.tail()"
       ],
-      "execution_count": 8,
+      "execution_count": 7,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -599,7 +599,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 8
+          "execution_count": 7
         }
       ]
     },
@@ -657,7 +657,7 @@
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
-        "outputId": "e2ea93a4-b967-4319-904b-85479c36b169"
+        "outputId": "20e2d43a-2053-4037-c736-8ec2c28b67e5"
       },
       "source": [
         "import numpy as np\n",
@@ -665,7 +665,7 @@
         "example1 = np.array([2, None, 6, 8])\n",
         "example1"
       ],
-      "execution_count": 9,
+      "execution_count": 8,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -675,7 +675,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 9
+          "execution_count": 8
         }
       ]
     },
@@ -699,12 +699,12 @@
           "base_uri": "https://localhost:8080/",
           "height": 292
         },
-        "outputId": "ff2a899b-5419-4a5c-b054-bc1e6ab906c5"
+        "outputId": "ab3b1799-504f-480d-851b-85b19f62d8b7"
       },
       "source": [
         "example1.sum()"
       ],
-      "execution_count": 10,
+      "execution_count": 9,
       "outputs": [
         {
           "output_type": "error",
@@ -713,7 +713,7 @@
           "traceback": [
             "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
             "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-            "\u001b[0;32m<ipython-input-10-ce9901ad18bd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mexample1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+            "\u001b[0;32m<ipython-input-9-ce9901ad18bd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mexample1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
             "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/numpy/core/_methods.py\u001b[0m in \u001b[0;36m_sum\u001b[0;34m(a, axis, dtype, out, keepdims, initial, where)\u001b[0m\n\u001b[1;32m     45\u001b[0m def _sum(a, axis=None, dtype=None, out=None, keepdims=False,\n\u001b[1;32m     46\u001b[0m          initial=_NoValue, where=True):\n\u001b[0;32m---> 47\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mumr_sum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkeepdims\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minitial\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwhere\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     48\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     49\u001b[0m def _prod(a, axis=None, dtype=None, out=None, keepdims=False,\n",
             "\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for +: 'int' and 'NoneType'"
           ]
@@ -748,12 +748,12 @@
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
-        "outputId": "a452b675-2131-47a7-ff38-2b4d6e923d50"
+        "outputId": "3744a812-6daf-472e-e933-388c722ab2b4"
       },
       "source": [
         "np.nan + 1"
       ],
-      "execution_count": 11,
+      "execution_count": 10,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -763,7 +763,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 11
+          "execution_count": 10
         }
       ]
     },
@@ -775,12 +775,12 @@
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
-        "outputId": "6956b57f-8ae7-4880-cc1d-0cf54edfe6ee"
+        "outputId": "4a304a47-c5a0-4814-92b0-c4b5ab193358"
       },
       "source": [
         "np.nan * 0"
       ],
-      "execution_count": 12,
+      "execution_count": 11,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -790,7 +790,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 12
+          "execution_count": 11
         }
       ]
     },
@@ -811,13 +811,13 @@
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
-        "outputId": "57ad3201-3958-48c6-924b-d46b61d4aeba"
+        "outputId": "a41b57bf-1c2a-4219-9ee5-0a1a1499e74d"
       },
       "source": [
         "example2 = np.array([2, np.nan, 6, 8]) \n",
         "example2.sum(), example2.min(), example2.max()"
       ],
-      "execution_count": 13,
+      "execution_count": 12,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -827,7 +827,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 13
+          "execution_count": 12
         }
       ]
     },
@@ -850,7 +850,7 @@
       "source": [
         "# What happens if you add np.nan and None together?\n"
       ],
-      "execution_count": 14,
+      "execution_count": 13,
       "outputs": []
     },
     {
@@ -881,13 +881,13 @@
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
-        "outputId": "8dbdf129-cd8b-40b5-96ba-21a7f3fa0044"
+        "outputId": "5f3389e0-4b54-4d6b-a305-a269df869235"
       },
       "source": [
         "int_series = pd.Series([1, 2, 3], dtype=int)\n",
         "int_series"
       ],
-      "execution_count": 15,
+      "execution_count": 14,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -900,7 +900,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 15
+          "execution_count": 14
         }
       ]
     },
@@ -925,7 +925,7 @@
         "# How does that element show up in the Series?\n",
         "# What is the dtype of the Series?\n"
       ],
-      "execution_count": 16,
+      "execution_count": 15,
       "outputs": []
     },
     {
@@ -966,7 +966,7 @@
       "source": [
         "example3 = pd.Series([0, np.nan, '', None])"
       ],
-      "execution_count": 17,
+      "execution_count": 16,
       "outputs": []
     },
     {
@@ -977,12 +977,12 @@
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
-        "outputId": "1fd6c6af-19e0-4568-e837-985d571604f4"
+        "outputId": "88a14e60-392a-42ad-d767-a4055580f523"
       },
       "source": [
         "example3.isnull()"
       ],
-      "execution_count": 18,
+      "execution_count": 17,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -996,7 +996,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 18
+          "execution_count": 17
         }
       ]
     },
@@ -1020,12 +1020,12 @@
           "base_uri": "https://localhost:8080/"
         },
         "id": "JCcQVoPkHDUv",
-        "outputId": "c0002689-f529-4e3e-c73b-41ac513c59d3"
+        "outputId": "042418f0-981b-4c5e-cdf8-c42912f7e4fe"
       },
       "source": [
         "example3.isnull().sum()"
       ],
-      "execution_count": 19,
+      "execution_count": 18,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -1035,7 +1035,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 19
+          "execution_count": 18
         }
       ]
     },
@@ -1059,7 +1059,7 @@
         "# Try running example3[example3.notnull()].\n",
         "# Before you do so, what do you expect to see?\n"
       ],
-      "execution_count": 20,
+      "execution_count": 19,
       "outputs": []
     },
     {
@@ -1118,13 +1118,13 @@
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
-        "outputId": "3d2d43e7-99ca-45ca-adc4-cef2c737e5bf"
+        "outputId": "782b0526-a1bb-4757-ac1f-a16267d9eb4f"
       },
       "source": [
         "example3 = example3.dropna()\n",
         "example3"
       ],
-      "execution_count": 21,
+      "execution_count": 20,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -1136,7 +1136,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 21
+          "execution_count": 20
         }
       ]
     },
@@ -1160,7 +1160,7 @@
           "base_uri": "https://localhost:8080/",
           "height": 142
         },
-        "outputId": "961427aa-9bce-445b-d230-61d02bc16c92"
+        "outputId": "3d19e787-896d-4ba4-8662-811d2e191d3b"
       },
       "source": [
         "example4 = pd.DataFrame([[1,      np.nan, 7], \n",
@@ -1168,7 +1168,7 @@
         "                         [np.nan, 6,      9]])\n",
         "example4"
       ],
-      "execution_count": 22,
+      "execution_count": 21,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -1228,7 +1228,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 22
+          "execution_count": 21
         }
       ]
     },
@@ -1252,12 +1252,12 @@
           "base_uri": "https://localhost:8080/",
           "height": 80
         },
-        "outputId": "aaeac6bc-ca6f-4eda-de0c-119e0c50ba83"
+        "outputId": "6bdb7658-8a64-401f-d2b2-bd0f8bc17325"
       },
       "source": [
         "example4.dropna()"
       ],
-      "execution_count": 23,
+      "execution_count": 22,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -1303,7 +1303,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 23
+          "execution_count": 22
         }
       ]
     },
@@ -1325,12 +1325,12 @@
           "base_uri": "https://localhost:8080/",
           "height": 142
         },
-        "outputId": "89fee273-d71b-4400-9484-b4bf93b69ee5"
+        "outputId": "0071a8bb-9fe5-4ed5-a3af-d0209485515a"
       },
       "source": [
         "example4.dropna(axis='columns')"
       ],
-      "execution_count": 24,
+      "execution_count": 23,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -1382,7 +1382,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 24
+          "execution_count": 23
         }
       ]
     },
@@ -1406,13 +1406,13 @@
           "base_uri": "https://localhost:8080/",
           "height": 142
         },
-        "outputId": "07e8f4eb-18c8-4e5d-9317-6a9a3db38b73"
+        "outputId": "a26b5362-0d17-49c2-d902-10832f9bf9a0"
       },
       "source": [
         "example4[3] = np.nan\n",
         "example4"
       ],
-      "execution_count": 25,
+      "execution_count": 24,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -1476,7 +1476,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 25
+          "execution_count": 24
         }
       ]
     },
@@ -1513,7 +1513,7 @@
         "# How might you go about dropping just column 3?\n",
         "# Hint: remember that you will need to supply both the axis parameter and the how parameter.\n"
       ],
-      "execution_count": 26,
+      "execution_count": 25,
       "outputs": []
     },
     {
@@ -1534,12 +1534,12 @@
           "base_uri": "https://localhost:8080/",
           "height": 80
         },
-        "outputId": "b2c00415-95a6-4a5c-e3f9-781ff5cc8625"
+        "outputId": "ee2d3a60-a694-4a11-ef37-28d00a8d956c"
       },
       "source": [
         "example4.dropna(axis='rows', thresh=3)"
       ],
-      "execution_count": 27,
+      "execution_count": 26,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -1587,7 +1587,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 27
+          "execution_count": 26
         }
       ]
     },
@@ -1620,6 +1620,7 @@
         "id": "CE8S7louLezV"
       },
       "source": [
+        "### Categorical Data (Non-numeric)\n",
         "First let us consider non-numeric data. In datasets, we have columns with categorical data. Eg. Gender, True or False etc.\n",
         "\n",
         "In most of these cases, we replace missing values with the `mode` of the column. Say, we have 100 data points and 90 have said True, 8 have said False and 2 have not filled. Then, we can will the 2 with True, considering the full column. \n",
@@ -1635,7 +1636,7 @@
           "height": 204
         },
         "id": "MY5faq4yLdpQ",
-        "outputId": "c3838b07-0d15-471e-8dad-370de91d4bdc"
+        "outputId": "49350e22-4ee9-43c1-9d6c-e5f837b24ae8"
       },
       "source": [
         "fill_with_mode = pd.DataFrame([[1,2,\"True\"],\n",
@@ -1646,7 +1647,7 @@
         "\n",
         "fill_with_mode"
       ],
-      "execution_count": 28,
+      "execution_count": 27,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -1720,7 +1721,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 28
+          "execution_count": 27
         }
       ]
     },
@@ -1740,12 +1741,12 @@
           "base_uri": "https://localhost:8080/"
         },
         "id": "WKy-9Y2tN5jv",
-        "outputId": "41f5064e-502d-4aec-dc2d-86f885068b4f"
+        "outputId": "d0c045f2-218c-45aa-951c-f3feed98510a"
       },
       "source": [
         "fill_with_mode[2].value_counts()"
       ],
-      "execution_count": 29,
+      "execution_count": 28,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -1757,7 +1758,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 29
+          "execution_count": 28
         }
       ]
     },
@@ -1778,7 +1779,7 @@
       "source": [
         "fill_with_mode[2].fillna('True',inplace=True)"
       ],
-      "execution_count": 30,
+      "execution_count": 29,
       "outputs": []
     },
     {
@@ -1789,12 +1790,12 @@
           "height": 204
         },
         "id": "tvas7c9_OPWE",
-        "outputId": "7282c4f7-0e59-4398-b4f2-5919baf61164"
+        "outputId": "c45890f5-8c76-4a3c-87f0-b831c2199750"
       },
       "source": [
         "fill_with_mode"
       ],
-      "execution_count": 31,
+      "execution_count": 30,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -1868,7 +1869,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 31
+          "execution_count": 30
         }
       ]
     },
@@ -1887,6 +1888,7 @@
         "id": "heYe1I0dOmQ_"
       },
       "source": [
+        "### Numeric Data\n",
         "Now, coming to numeric data. Here, we have a two common ways of replacing missing values:\n",
         "\n",
         "1. Replace with Median of the row\n",
@@ -1907,7 +1909,7 @@
           "height": 204
         },
         "id": "09HM_2feOj5Y",
-        "outputId": "ade42fec-dc40-45d0-e22c-974849ea8664"
+        "outputId": "44330273-5709-4af9-99c7-7a3a8e28c7b0"
       },
       "source": [
         "fill_with_mean = pd.DataFrame([[-2,0,1],\n",
@@ -1918,7 +1920,7 @@
         "\n",
         "fill_with_mean"
       ],
-      "execution_count": 33,
+      "execution_count": 31,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -1992,7 +1994,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 33
+          "execution_count": 31
         }
       ]
     },
@@ -2009,15 +2011,15 @@
       "cell_type": "code",
       "metadata": {
         "id": "XYtYEf5BSxFL",
-        "outputId": "1e79aeea-6baf-4572-dcd1-23e5ec742036",
         "colab": {
           "base_uri": "https://localhost:8080/"
-        }
+        },
+        "outputId": "7240075c-c3a7-4ac3-e08d-be6d60573d38"
       },
       "source": [
         "np.mean(fill_with_mean[0])"
       ],
-      "execution_count": 34,
+      "execution_count": 32,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -2027,7 +2029,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 34
+          "execution_count": 32
         }
       ]
     },
@@ -2044,17 +2046,17 @@
       "cell_type": "code",
       "metadata": {
         "id": "FzncQLmuS5jh",
-        "outputId": "75f33b25-e6b3-41bb-8049-1ed2e085efe2",
         "colab": {
           "base_uri": "https://localhost:8080/",
           "height": 204
-        }
+        },
+        "outputId": "733bfa87-b099-4c11-db2e-1dea88b977ac"
       },
       "source": [
         "fill_with_mean[0].fillna(np.mean(fill_with_mean[0]),inplace=True)\n",
         "fill_with_mean"
       ],
-      "execution_count": 35,
+      "execution_count": 33,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -2128,7 +2130,7 @@
             ]
           },
           "metadata": {},
-          "execution_count": 35
+          "execution_count": 33
         }
       ]
     },
@@ -2141,201 +2143,1261 @@
         "As we can see, the missing value has been replaced with its mean."
       ]
     },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "trusted": false,
-        "id": "0ybtWLDdgRsG"
-      },
-      "source": [
-        "example5 = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))\n",
-        "example5"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
     {
       "cell_type": "markdown",
       "metadata": {
-        "id": "yrsigxRggRsH"
+        "id": "jIvF13a1i00Z"
       },
       "source": [
-        "You can fill all of the null entries with a single value, such as `0`:"
+        "Now let us try another dataframe, and this time we will replace the missing (NaN) value with the median of the column."
       ]
     },
     {
       "cell_type": "code",
       "metadata": {
-        "trusted": false,
-        "id": "KXMIPsQdgRsH"
+        "id": "DA59Bqo3jBYZ",
+        "outputId": "4338adf5-081c-46ce-aca1-85bcaebf9838",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 204
+        }
       },
       "source": [
-        "example5.fillna(0)"
+        "fill_with_median = pd.DataFrame([[-2,0,1],\n",
+        "                               [-1,2,3],\n",
+        "                               [0,np.nan,5],\n",
+        "                               [1,6,7],\n",
+        "                               [2,8,9]])\n",
+        "\n",
+        "fill_with_median"
       ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "FI9MmqFJgRsH"
-      },
-      "source": [
-        "### Exercise:"
+      "execution_count": 39,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>0</th>\n",
+              "      <th>1</th>\n",
+              "      <th>2</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>-2</td>\n",
+              "      <td>0.0</td>\n",
+              "      <td>1</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>-1</td>\n",
+              "      <td>2.0</td>\n",
+              "      <td>3</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>0</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>5</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3</th>\n",
+              "      <td>1</td>\n",
+              "      <td>6.0</td>\n",
+              "      <td>7</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>4</th>\n",
+              "      <td>2</td>\n",
+              "      <td>8.0</td>\n",
+              "      <td>9</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "   0    1  2\n",
+              "0 -2  0.0  1\n",
+              "1 -1  2.0  3\n",
+              "2  0  NaN  5\n",
+              "3  1  6.0  7\n",
+              "4  2  8.0  9"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 39
+        }
       ]
     },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "collapsed": true,
-        "trusted": false,
-        "id": "af-ezpXdgRsH"
-      },
-      "source": [
-        "# What happens if you try to fill null values with a string, like ''?\n"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
     {
       "cell_type": "markdown",
       "metadata": {
-        "id": "kq3hw1kLgRsI"
+        "id": "mM1GpXYmjHnc"
       },
       "source": [
-        "You can **forward-fill** null values, which is to use the last valid value to fill a null:"
+        "The median of the second column is:"
       ]
     },
     {
       "cell_type": "code",
       "metadata": {
-        "trusted": false,
-        "id": "vO3BuNrggRsI"
+        "id": "uiDy5v3xjHHX",
+        "outputId": "2028aa4b-8bec-4b76-ea2f-fcaa7b362e9d",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
       },
       "source": [
-        "example5.fillna(method='ffill')"
+        "fill_with_median[1].median()"
       ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "nDXeYuHzgRsI"
-      },
-      "source": [
-        "You can also **back-fill** to propagate the next valid value backward to fill a null:"
+      "execution_count": 40,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "4.0"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 40
+        }
       ]
     },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "trusted": false,
-        "id": "4M5onHcEgRsI"
-      },
-      "source": [
-        "example5.fillna(method='bfill')"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
     {
       "cell_type": "markdown",
       "metadata": {
-        "collapsed": true,
-        "id": "MbBzTom5gRsI"
+        "id": "z9PLF75Jj_1s"
       },
       "source": [
-        "As you might guess, this works the same with `DataFrame`s, but you can also specify an `axis` along which to fill null values:"
+        "Now we fill the missing value with the median:"
       ]
     },
     {
       "cell_type": "code",
       "metadata": {
-        "trusted": false,
-        "id": "aRpIvo4ZgRsI"
-      },
-      "source": [
-        "example4"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "trusted": false,
-        "id": "VM1qtACAgRsI"
+        "id": "lFKbOxCMkBbg",
+        "outputId": "61bf2b0e-c68d-4b54-9724-f496c8c2ea94",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 204
+        }
       },
       "source": [
-        "example4.fillna(method='ffill', axis=1)"
+        "fill_with_median[1].fillna(fill_with_median[1].median(),inplace=True)\n",
+        "fill_with_median"
       ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "ZeMc-I1EgRsI"
+      "execution_count": 41,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>0</th>\n",
+              "      <th>1</th>\n",
+              "      <th>2</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>-2</td>\n",
+              "      <td>0.0</td>\n",
+              "      <td>1</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>-1</td>\n",
+              "      <td>2.0</td>\n",
+              "      <td>3</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>0</td>\n",
+              "      <td>4.0</td>\n",
+              "      <td>5</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3</th>\n",
+              "      <td>1</td>\n",
+              "      <td>6.0</td>\n",
+              "      <td>7</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>4</th>\n",
+              "      <td>2</td>\n",
+              "      <td>8.0</td>\n",
+              "      <td>9</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "   0    1  2\n",
+              "0 -2  0.0  1\n",
+              "1 -1  2.0  3\n",
+              "2  0  4.0  5\n",
+              "3  1  6.0  7\n",
+              "4  2  8.0  9"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 41
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "8JtQ53GSkKWC"
+      },
+      "source": [
+        "As we can see, the NaN value has been replaced by the median of the column."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "trusted": false,
+        "id": "0ybtWLDdgRsG",
+        "outputId": "ee2e547a-bf98-40a5-ddc4-b11357efb898",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "source": [
+        "example5 = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))\n",
+        "example5"
+      ],
+      "execution_count": 42,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "a    1.0\n",
+              "b    NaN\n",
+              "c    2.0\n",
+              "d    NaN\n",
+              "e    3.0\n",
+              "dtype: float64"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 42
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "yrsigxRggRsH"
+      },
+      "source": [
+        "You can fill all of the null entries with a single value, such as `0`:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "trusted": false,
+        "id": "KXMIPsQdgRsH",
+        "outputId": "f88a0095-9742-4f1e-fdf4-43fc14cbc4c0",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "source": [
+        "example5.fillna(0)"
+      ],
+      "execution_count": 43,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "a    1.0\n",
+              "b    0.0\n",
+              "c    2.0\n",
+              "d    0.0\n",
+              "e    3.0\n",
+              "dtype: float64"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 43
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "RRlI5f_hkfKe"
+      },
+      "source": [
+        "> Key takeaways:\n",
+        "1. Filling in missing values should be done either when there is little data (so dropping rows would be costly) or when there is a strategy to fill in the missing data.\n",
+        "2. Domain knowledge can be used to fill in missing values by approximating them.\n",
+        "3. For Categorical data, mostly, missing values are substituted with the mode of the column. \n",
+        "4. For numeric data, missing values are usually filled in with the mean (for normalized datasets) or the median of the column. "
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "FI9MmqFJgRsH"
+      },
+      "source": [
+        "### Exercise:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "collapsed": true,
+        "trusted": false,
+        "id": "af-ezpXdgRsH"
+      },
+      "source": [
+        "# What happens if you try to fill null values with a string, like ''?\n"
+      ],
+      "execution_count": 44,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "kq3hw1kLgRsI"
+      },
+      "source": [
+        "You can **forward-fill** null values, which is to use the last valid value to fill a null:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "trusted": false,
+        "id": "vO3BuNrggRsI",
+        "outputId": "aff7d7de-20b9-42bf-fe06-932677314b37",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "source": [
+        "example5.fillna(method='ffill')"
+      ],
+      "execution_count": 45,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "a    1.0\n",
+              "b    1.0\n",
+              "c    2.0\n",
+              "d    2.0\n",
+              "e    3.0\n",
+              "dtype: float64"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 45
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "nDXeYuHzgRsI"
+      },
+      "source": [
+        "You can also **back-fill** to propagate the next valid value backward to fill a null:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "trusted": false,
+        "id": "4M5onHcEgRsI",
+        "outputId": "c20c283d-76d7-4f75-c443-5c55fbdb3541",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "source": [
+        "example5.fillna(method='bfill')"
+      ],
+      "execution_count": 46,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "a    1.0\n",
+              "b    2.0\n",
+              "c    2.0\n",
+              "d    3.0\n",
+              "e    3.0\n",
+              "dtype: float64"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 46
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "collapsed": true,
+        "id": "MbBzTom5gRsI"
+      },
+      "source": [
+        "As you might guess, this works the same with DataFrames, but you can also specify an `axis` along which to fill null values:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "trusted": false,
+        "id": "aRpIvo4ZgRsI",
+        "outputId": "ea9c5e3d-a23d-4314-cff4-e5a0e46043d1",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 142
+        }
+      },
+      "source": [
+        "example4"
+      ],
+      "execution_count": 47,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>0</th>\n",
+              "      <th>1</th>\n",
+              "      <th>2</th>\n",
+              "      <th>3</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>1.0</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>7</td>\n",
+              "      <td>NaN</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>2.0</td>\n",
+              "      <td>5.0</td>\n",
+              "      <td>8</td>\n",
+              "      <td>NaN</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>NaN</td>\n",
+              "      <td>6.0</td>\n",
+              "      <td>9</td>\n",
+              "      <td>NaN</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "     0    1  2   3\n",
+              "0  1.0  NaN  7 NaN\n",
+              "1  2.0  5.0  8 NaN\n",
+              "2  NaN  6.0  9 NaN"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 47
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "trusted": false,
+        "id": "VM1qtACAgRsI",
+        "outputId": "2cd3360a-ac87-41fb-d362-9d8c981f573f",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 142
+        }
+      },
+      "source": [
+        "example4.fillna(method='ffill', axis=1)"
+      ],
+      "execution_count": 48,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>0</th>\n",
+              "      <th>1</th>\n",
+              "      <th>2</th>\n",
+              "      <th>3</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>1.0</td>\n",
+              "      <td>1.0</td>\n",
+              "      <td>7.0</td>\n",
+              "      <td>7.0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>2.0</td>\n",
+              "      <td>5.0</td>\n",
+              "      <td>8.0</td>\n",
+              "      <td>8.0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>NaN</td>\n",
+              "      <td>6.0</td>\n",
+              "      <td>9.0</td>\n",
+              "      <td>9.0</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "     0    1    2    3\n",
+              "0  1.0  1.0  7.0  7.0\n",
+              "1  2.0  5.0  8.0  8.0\n",
+              "2  NaN  6.0  9.0  9.0"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 48
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ZeMc-I1EgRsI"
+      },
+      "source": [
+        "Notice that when a previous value is not available for forward-filling, the null value remains."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "eeAoOU0RgRsJ"
+      },
+      "source": [
+        "### Exercise:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "collapsed": true,
+        "trusted": false,
+        "id": "e8S-CjW8gRsJ"
+      },
+      "source": [
+        "# What output does example4.fillna(method='bfill', axis=1) produce?\n",
+        "# What about example4.fillna(method='ffill') or example4.fillna(method='bfill')?\n",
+        "# Can you think of a longer code snippet to write that can fill all of the null values in example4?\n"
+      ],
+      "execution_count": 49,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "YHgy0lIrgRsJ"
+      },
+      "source": [
+        "You can be creative about how you use `fillna`. For example, let's look at `example4` again, but this time let's fill the missing values in each column with the average of the values in that column:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "trusted": false,
+        "id": "OtYVErEygRsJ",
+        "outputId": "ad5f4520-cf88-4e3e-fa16-54bda5efa417",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 142
+        }
+      },
+      "source": [
+        "example4.fillna(example4.mean())"
+      ],
+      "execution_count": 50,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>0</th>\n",
+              "      <th>1</th>\n",
+              "      <th>2</th>\n",
+              "      <th>3</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>1.0</td>\n",
+              "      <td>5.5</td>\n",
+              "      <td>7</td>\n",
+              "      <td>NaN</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>2.0</td>\n",
+              "      <td>5.0</td>\n",
+              "      <td>8</td>\n",
+              "      <td>NaN</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>1.5</td>\n",
+              "      <td>6.0</td>\n",
+              "      <td>9</td>\n",
+              "      <td>NaN</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "     0    1  2   3\n",
+              "0  1.0  5.5  7 NaN\n",
+              "1  2.0  5.0  8 NaN\n",
+              "2  1.5  6.0  9 NaN"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 50
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "zpMvCkLSgRsJ"
+      },
+      "source": [
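+        "Notice that column 3 is still valueless: `example4.mean()` computes a separate mean for each column (the default direction), and column 3 has no values to average, so its mean is itself `NaN`.\n",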
+        "\n",
+        "> **Takeaway:** There are multiple ways to deal with missing values in your datasets. The specific strategy you use (removing them, replacing them, or even how you replace them) should be dictated by the particulars of that data. You will develop a better sense of how to deal with missing values the more you handle and interact with datasets."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "bauDnESIl9FH"
+      },
+      "source": [
+        "### Encoding Categorical Data\n",
+        "\n",
+        "Machine learning models only deal with numbers and other forms of numeric data. A model won't be able to tell the difference between a Yes and a No, but it would be able to distinguish between 0 and 1. So, after filling in the missing values, we need to encode the categorical data to some numeric form for the model to understand.\n",
+        "\n",
+        "Encoding can be done in two ways. We will be discussing them next.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "uDq9SxB7mu5i"
+      },
+      "source": [
+        "**LABEL ENCODING**\n",
+        "\n",
+        "\n",
+        "Label encoding is basically converting each category to a number. For example, say we have a dataset of airline passengers and there is a column containing their class among the following ['business class', 'economy class','first class']. If Label encoding is done on this, this would be transformed to [0,1,2]. Let us see an example via code. As we would be learning `scikit-learn` in the upcoming notebooks, we won't use it here."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "1vGz7uZyoWHL",
+        "outputId": "5003c8cd-ff07-4399-a5b2-621b45184511",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 235
+        }
+      },
+      "source": [
+        "label = pd.DataFrame([\n",
+        "                      [10,'business class'],\n",
+        "                      [20,'first class'],\n",
+        "                      [30, 'economy class'],\n",
+        "                      [40, 'economy class'],\n",
+        "                      [50, 'economy class'],\n",
+        "                      [60, 'business class']\n",
+        "],columns=['ID','class'])\n",
+        "label"
+      ],
+      "execution_count": 70,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>ID</th>\n",
+              "      <th>class</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>10</td>\n",
+              "      <td>business class</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>20</td>\n",
+              "      <td>first class</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>30</td>\n",
+              "      <td>economy class</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3</th>\n",
+              "      <td>40</td>\n",
+              "      <td>economy class</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>4</th>\n",
+              "      <td>50</td>\n",
+              "      <td>economy class</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>5</th>\n",
+              "      <td>60</td>\n",
+              "      <td>business class</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "   ID           class\n",
+              "0  10  business class\n",
+              "1  20     first class\n",
+              "2  30   economy class\n",
+              "3  40   economy class\n",
+              "4  50   economy class\n",
+              "5  60  business class"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 70
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "IDHnkwTYov-h"
       },
       "source": [
-        "Notice that when a previous value is not available for forward-filling, the null value remains."
+        "To perform label encoding on the `class` column, we first have to define a mapping from each class to a number, and then replace each class with its number."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ZC5URJG3o1ES",
+        "outputId": "c75465b2-169e-417c-8769-680aaf1cd268",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 235
+        }
+      },
+      "source": [
+        "class_labels = {'business class':0,'economy class':1,'first class':2}\n",
+        "label['class'] = label['class'].replace(class_labels)\n",
+        "label"
+      ],
+      "execution_count": 71,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>ID</th>\n",
+              "      <th>class</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>10</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>20</td>\n",
+              "      <td>2</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>30</td>\n",
+              "      <td>1</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3</th>\n",
+              "      <td>40</td>\n",
+              "      <td>1</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>4</th>\n",
+              "      <td>50</td>\n",
+              "      <td>1</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>5</th>\n",
+              "      <td>60</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "   ID  class\n",
+              "0  10      0\n",
+              "1  20      2\n",
+              "2  30      1\n",
+              "3  40      1\n",
+              "4  50      1\n",
+              "5  60      0"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 71
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ftnF-TyapOPt"
+      },
+      "source": [
+        "As we can see, the output matches what we thought would happen. So, when do we use label encoding? Label encoding is used in either or both of the following cases :\n",
+        "1. When the number of categories is large\n",
+        "2. When the categories are in order. "
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "eQPAPVwsqWT7"
+      },
+      "source": [
+        "**ONE HOT ENCODING**\n",
+        "\n",
+        "Another type of encoding is One Hot Encoding. In this type of encoding, each category of the column gets added as a separate column and each datapoint will get a 0 or a 1 based on whether it contains that category. So, if there are n different categories, n columns will be appended to the dataframe.\n",
+        "\n",
+        "For example, let us take the same aeroplane class example. The categories were: ['business class', 'economy class','first class'] . So, if we perform one hot encoding, the following three columns will be added to the dataset: ['class_business class','class_economy class','class_first class']."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ZM0eVh0ArKUL",
+        "outputId": "cba4258f-a6c3-45e0-dd69-32b73b2cd735",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 235
+        }
+      },
+      "source": [
+        "one_hot = pd.DataFrame([\n",
+        "                      [10,'business class'],\n",
+        "                      [20,'first class'],\n",
+        "                      [30, 'economy class'],\n",
+        "                      [40, 'economy class'],\n",
+        "                      [50, 'economy class'],\n",
+        "                      [60, 'business class']\n",
+        "],columns=['ID','class'])\n",
+        "one_hot"
+      ],
+      "execution_count": 67,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>ID</th>\n",
+              "      <th>class</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>10</td>\n",
+              "      <td>business class</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>20</td>\n",
+              "      <td>first class</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>30</td>\n",
+              "      <td>economy class</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3</th>\n",
+              "      <td>40</td>\n",
+              "      <td>economy class</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>4</th>\n",
+              "      <td>50</td>\n",
+              "      <td>economy class</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>5</th>\n",
+              "      <td>60</td>\n",
+              "      <td>business class</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "   ID           class\n",
+              "0  10  business class\n",
+              "1  20     first class\n",
+              "2  30   economy class\n",
+              "3  40   economy class\n",
+              "4  50   economy class\n",
+              "5  60  business class"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 67
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "aVnZ7paDrWmb"
+      },
+      "source": [
+        "Let us perform one hot encoding on the `class` column:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "RUPxf7egrYKr"
+      },
+      "source": [
+        "one_hot_data = pd.get_dummies(one_hot,columns=['class'])"
+      ],
+      "execution_count": 68,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "TM37pHsFr4ge",
+        "outputId": "4f9cdbec-5ea6-4613-b14f-5b8b66b85894",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 235
+        }
+      },
+      "source": [
+        "one_hot_data"
+      ],
+      "execution_count": 69,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>ID</th>\n",
+              "      <th>class_business class</th>\n",
+              "      <th>class_economy class</th>\n",
+              "      <th>class_first class</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>10</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>20</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>30</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3</th>\n",
+              "      <td>40</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>4</th>\n",
+              "      <td>50</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>5</th>\n",
+              "      <td>60</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "   ID  class_business class  class_economy class  class_first class\n",
+              "0  10                     1                    0                  0\n",
+              "1  20                     0                    0                  1\n",
+              "2  30                     0                    1                  0\n",
+              "3  40                     0                    1                  0\n",
+              "4  50                     0                    1                  0\n",
+              "5  60                     1                    0                  0"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 69
+        }
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {
-        "id": "eeAoOU0RgRsJ"
+        "id": "_zXRLOjXujdA"
       },
       "source": [
-        "### Exercise:"
+        "Each one hot encoded column contains 0 or 1, which specifies whether that category exists for that datapoint."
       ]
     },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "collapsed": true,
-        "trusted": false,
-        "id": "e8S-CjW8gRsJ"
-      },
-      "source": [
-        "# What output does example4.fillna(method='bfill', axis=1) produce?\n",
-        "# What about example4.fillna(method='ffill') or example4.fillna(method='bfill')?\n",
-        "# Can you think of a longer code snippet to write that can fill all of the null values in example4?\n"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
     {
       "cell_type": "markdown",
       "metadata": {
-        "id": "YHgy0lIrgRsJ"
+        "id": "bDnC4NQOu0qr"
       },
       "source": [
-        "You can be creative about how you use `fillna`. For example, let's look at `example4` again, but this time let's fill the missing values with the average of all of the values in the `DataFrame`:"
+        "When do we use one hot encoding? One hot encoding is used in either or both of the following cases :\n",
+        "\n",
+        "1. When the number of categories and the size of the dataset are small.\n",
+        "2. When the categories follow no particular order."
       ]
     },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "trusted": false,
-        "id": "OtYVErEygRsJ"
-      },
-      "source": [
-        "example4.fillna(example4.mean())"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
     {
       "cell_type": "markdown",
       "metadata": {
-        "id": "zpMvCkLSgRsJ"
+        "id": "XnUmci_4uvyu"
       },
       "source": [
-        "Notice that column 3 is still valueless: the default direction is to fill values row-wise.\n",
-        "\n",
-        "> **Takeaway:** There are multiple ways to deal with missing values in your datasets. The specific strategy you use (removing them, replacing them, or even how you replace them) should be dictated by the particulars of that data. You will develop a better sense of how to deal with missing values the more you handle and interact with datasets."
+        "> Key Takeaways:\n",
+        "1. Encoding is done to convert non-numeric data to numeric data.\n",
+        "2. There are two types of encoding: Label encoding and One Hot encoding, both of which can be performed based on the demands of the dataset. "
       ]
     },
     {
@@ -2366,27 +3428,121 @@
       "cell_type": "code",
       "metadata": {
         "trusted": false,
-        "id": "ZLu6FEnZgRsJ"
+        "id": "ZLu6FEnZgRsJ",
+        "outputId": "d62ede23-a8ba-412b-f666-6fc1a43af424",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 204
+        }
       },
       "source": [
         "example6 = pd.DataFrame({'letters': ['A','B'] * 2 + ['B'],\n",
         "                         'numbers': [1, 2, 1, 3, 3]})\n",
         "example6"
       ],
-      "execution_count": null,
-      "outputs": []
+      "execution_count": 72,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>letters</th>\n",
+              "      <th>numbers</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>A</td>\n",
+              "      <td>1</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>B</td>\n",
+              "      <td>2</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>A</td>\n",
+              "      <td>1</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3</th>\n",
+              "      <td>B</td>\n",
+              "      <td>3</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>4</th>\n",
+              "      <td>B</td>\n",
+              "      <td>3</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "  letters  numbers\n",
+              "0       A        1\n",
+              "1       B        2\n",
+              "2       A        1\n",
+              "3       B        3\n",
+              "4       B        3"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 72
+        }
+      ]
     },
     {
       "cell_type": "code",
       "metadata": {
         "trusted": false,
-        "id": "cIduB5oBgRsK"
+        "id": "cIduB5oBgRsK",
+        "outputId": "061ff212-4cba-4f49-ae20-a7bde21b54a3",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
       },
       "source": [
         "example6.duplicated()"
       ],
-      "execution_count": null,
-      "outputs": []
+      "execution_count": 73,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "0    False\n",
+              "1    False\n",
+              "2     True\n",
+              "3    False\n",
+              "4     True\n",
+              "dtype: bool"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 73
+        }
+      ]
     },
     {
       "cell_type": "markdown",
@@ -2402,13 +3558,75 @@
       "cell_type": "code",
       "metadata": {
         "trusted": false,
-        "id": "w_YPpqIqgRsK"
+        "id": "w_YPpqIqgRsK",
+        "outputId": "5081cf87-9e65-493f-c867-c73f3833b775",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 142
+        }
       },
       "source": [
         "example6.drop_duplicates()"
       ],
-      "execution_count": null,
-      "outputs": []
+      "execution_count": 74,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>letters</th>\n",
+              "      <th>numbers</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>A</td>\n",
+              "      <td>1</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>B</td>\n",
+              "      <td>2</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3</th>\n",
+              "      <td>B</td>\n",
+              "      <td>3</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "  letters  numbers\n",
+              "0       A        1\n",
+              "1       B        2\n",
+              "3       B        3"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 74
+        }
+      ]
     },
     {
       "cell_type": "markdown",
@@ -2423,13 +3641,69 @@
       "cell_type": "code",
       "metadata": {
         "trusted": false,
-        "id": "BILjDs67gRsK"
+        "id": "BILjDs67gRsK",
+        "outputId": "1087142d-5a36-4667-8b70-45824de07d64",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 111
+        }
       },
       "source": [
         "example6.drop_duplicates(['letters'])"
       ],
-      "execution_count": null,
-      "outputs": []
+      "execution_count": 75,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>letters</th>\n",
+              "      <th>numbers</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>A</td>\n",
+              "      <td>1</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>B</td>\n",
+              "      <td>2</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "  letters  numbers\n",
+              "0       A        1\n",
+              "1       B        2"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 75
+        }
+      ]
     },
     {
       "cell_type": "markdown",