From a60aa03c487119f1ba64ab97bfc560ec1d87fe93 Mon Sep 17 00:00:00 2001 From: Nirmalya Misra <39618712+nirmalya8@users.noreply.github.com> Date: Wed, 6 Oct 2021 00:06:56 +0530 Subject: [PATCH] Added Encoding --- .../08-data-preparation/notebook.ipynb | 1710 ++++++++++++++--- 1 file changed, 1492 insertions(+), 218 deletions(-) diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb index b5a6bac..71b076e 100644 --- a/2-Working-With-Data/08-data-preparation/notebook.ipynb +++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb @@ -79,7 +79,7 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "4641a412-8abb-4e2f-d1ec-ff9b5004e361" + "outputId": "70e0d7dd-fb30-45c4-a5af-7dc85cd89342" }, "source": [ "iris_df.shape" @@ -126,7 +126,7 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "0f9c41ea-d480-4245-d7e2-56d514ac7724" + "outputId": "85e6ab39-174f-4dc7-fee6-a18f3ba14a7d" }, "source": [ "iris_df.columns" @@ -174,7 +174,7 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "94d5e48a-746c-4e58-b08f-c63b377a61b1" + "outputId": "2a2bb81a-257c-4410-f826-99402b75ce14" }, "source": [ "iris_df.info()" @@ -230,7 +230,7 @@ "base_uri": "https://localhost:8080/", "height": 297 }, - "outputId": "b01322a1-4296-4ad0-f990-6e0dcba668f6" + "outputId": "e5015299-163f-42c7-aaa1-9bc3a67788bf" }, "source": [ "iris_df.describe()" @@ -373,7 +373,7 @@ "base_uri": "https://localhost:8080/", "height": 204 }, - "outputId": "14b1e3cd-54ac-47dc-f7b2-231d51d93741" + "outputId": "5ff975df-45f0-4efd-f884-2580909c6e67" }, "source": [ "iris_df.head()" @@ -492,7 +492,7 @@ "source": [ "# Hint: Consult the documentation by using iris_df.head?" 
], - "execution_count": 7, + "execution_count": null, "outputs": [] }, { @@ -514,12 +514,12 @@ "base_uri": "https://localhost:8080/", "height": 204 }, - "outputId": "d4e22b38-ba5d-4dd1-bbd2-b9cd9ad7b150" + "outputId": "1726a2e0-82d7-4491-8dbc-637f28a11d26" }, "source": [ "iris_df.tail()" ], - "execution_count": 8, + "execution_count": 7, "outputs": [ { "output_type": "execute_result", @@ -599,7 +599,7 @@ ] }, "metadata": {}, - "execution_count": 8 + "execution_count": 7 } ] }, @@ -657,7 +657,7 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "e2ea93a4-b967-4319-904b-85479c36b169" + "outputId": "20e2d43a-2053-4037-c736-8ec2c28b67e5" }, "source": [ "import numpy as np\n", @@ -665,7 +665,7 @@ "example1 = np.array([2, None, 6, 8])\n", "example1" ], - "execution_count": 9, + "execution_count": 8, "outputs": [ { "output_type": "execute_result", @@ -675,7 +675,7 @@ ] }, "metadata": {}, - "execution_count": 9 + "execution_count": 8 } ] }, @@ -699,12 +699,12 @@ "base_uri": "https://localhost:8080/", "height": 292 }, - "outputId": "ff2a899b-5419-4a5c-b054-bc1e6ab906c5" + "outputId": "ab3b1799-504f-480d-851b-85b19f62d8b7" }, "source": [ "example1.sum()" ], - "execution_count": 10, + "execution_count": 9, "outputs": [ { "output_type": "error", @@ -713,7 +713,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mexample1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m 
\u001b[0mexample1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/numpy/core/_methods.py\u001b[0m in \u001b[0;36m_sum\u001b[0;34m(a, axis, dtype, out, keepdims, initial, where)\u001b[0m\n\u001b[1;32m 45\u001b[0m def _sum(a, axis=None, dtype=None, out=None, keepdims=False,\n\u001b[1;32m 46\u001b[0m initial=_NoValue, where=True):\n\u001b[0;32m---> 47\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mumr_sum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkeepdims\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minitial\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwhere\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 48\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 49\u001b[0m def _prod(a, axis=None, dtype=None, out=None, keepdims=False,\n", "\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for +: 'int' and 'NoneType'" ] @@ -748,12 +748,12 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "a452b675-2131-47a7-ff38-2b4d6e923d50" + "outputId": "3744a812-6daf-472e-e933-388c722ab2b4" }, "source": [ "np.nan + 1" ], - "execution_count": 11, + "execution_count": 10, "outputs": [ { "output_type": "execute_result", @@ -763,7 +763,7 @@ ] }, "metadata": {}, - "execution_count": 11 + "execution_count": 10 } ] }, @@ -775,12 +775,12 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "6956b57f-8ae7-4880-cc1d-0cf54edfe6ee" + "outputId": "4a304a47-c5a0-4814-92b0-c4b5ab193358" }, "source": [ "np.nan * 0" ], - "execution_count": 12, + "execution_count": 11, "outputs": [ { "output_type": "execute_result", @@ -790,7 +790,7 @@ ] }, "metadata": {}, - "execution_count": 
12 + "execution_count": 11 } ] }, @@ -811,13 +811,13 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "57ad3201-3958-48c6-924b-d46b61d4aeba" + "outputId": "a41b57bf-1c2a-4219-9ee5-0a1a1499e74d" }, "source": [ "example2 = np.array([2, np.nan, 6, 8]) \n", "example2.sum(), example2.min(), example2.max()" ], - "execution_count": 13, + "execution_count": 12, "outputs": [ { "output_type": "execute_result", @@ -827,7 +827,7 @@ ] }, "metadata": {}, - "execution_count": 13 + "execution_count": 12 } ] }, @@ -850,7 +850,7 @@ "source": [ "# What happens if you add np.nan and None together?\n" ], - "execution_count": 14, + "execution_count": 13, "outputs": [] }, { @@ -881,13 +881,13 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "8dbdf129-cd8b-40b5-96ba-21a7f3fa0044" + "outputId": "5f3389e0-4b54-4d6b-a305-a269df869235" }, "source": [ "int_series = pd.Series([1, 2, 3], dtype=int)\n", "int_series" ], - "execution_count": 15, + "execution_count": 14, "outputs": [ { "output_type": "execute_result", @@ -900,7 +900,7 @@ ] }, "metadata": {}, - "execution_count": 15 + "execution_count": 14 } ] }, @@ -925,7 +925,7 @@ "# How does that element show up in the Series?\n", "# What is the dtype of the Series?\n" ], - "execution_count": 16, + "execution_count": 15, "outputs": [] }, { @@ -966,7 +966,7 @@ "source": [ "example3 = pd.Series([0, np.nan, '', None])" ], - "execution_count": 17, + "execution_count": 16, "outputs": [] }, { @@ -977,12 +977,12 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "1fd6c6af-19e0-4568-e837-985d571604f4" + "outputId": "88a14e60-392a-42ad-d767-a4055580f523" }, "source": [ "example3.isnull()" ], - "execution_count": 18, + "execution_count": 17, "outputs": [ { "output_type": "execute_result", @@ -996,7 +996,7 @@ ] }, "metadata": {}, - "execution_count": 18 + "execution_count": 17 } ] }, @@ -1020,12 +1020,12 @@ "base_uri": "https://localhost:8080/" }, "id": "JCcQVoPkHDUv", - "outputId": 
"c0002689-f529-4e3e-c73b-41ac513c59d3" + "outputId": "042418f0-981b-4c5e-cdf8-c42912f7e4fe" }, "source": [ "example3.isnull().sum()" ], - "execution_count": 19, + "execution_count": 18, "outputs": [ { "output_type": "execute_result", @@ -1035,7 +1035,7 @@ ] }, "metadata": {}, - "execution_count": 19 + "execution_count": 18 } ] }, @@ -1059,7 +1059,7 @@ "# Try running example3[example3.notnull()].\n", "# Before you do so, what do you expect to see?\n" ], - "execution_count": 20, + "execution_count": 19, "outputs": [] }, { @@ -1118,13 +1118,13 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "3d2d43e7-99ca-45ca-adc4-cef2c737e5bf" + "outputId": "782b0526-a1bb-4757-ac1f-a16267d9eb4f" }, "source": [ "example3 = example3.dropna()\n", "example3" ], - "execution_count": 21, + "execution_count": 20, "outputs": [ { "output_type": "execute_result", @@ -1136,7 +1136,7 @@ ] }, "metadata": {}, - "execution_count": 21 + "execution_count": 20 } ] }, @@ -1160,7 +1160,7 @@ "base_uri": "https://localhost:8080/", "height": 142 }, - "outputId": "961427aa-9bce-445b-d230-61d02bc16c92" + "outputId": "3d19e787-896d-4ba4-8662-811d2e191d3b" }, "source": [ "example4 = pd.DataFrame([[1, np.nan, 7], \n", @@ -1168,7 +1168,7 @@ " [np.nan, 6, 9]])\n", "example4" ], - "execution_count": 22, + "execution_count": 21, "outputs": [ { "output_type": "execute_result", @@ -1228,7 +1228,7 @@ ] }, "metadata": {}, - "execution_count": 22 + "execution_count": 21 } ] }, @@ -1252,12 +1252,12 @@ "base_uri": "https://localhost:8080/", "height": 80 }, - "outputId": "aaeac6bc-ca6f-4eda-de0c-119e0c50ba83" + "outputId": "6bdb7658-8a64-401f-d2b2-bd0f8bc17325" }, "source": [ "example4.dropna()" ], - "execution_count": 23, + "execution_count": 22, "outputs": [ { "output_type": "execute_result", @@ -1303,7 +1303,7 @@ ] }, "metadata": {}, - "execution_count": 23 + "execution_count": 22 } ] }, @@ -1325,12 +1325,12 @@ "base_uri": "https://localhost:8080/", "height": 142 }, - "outputId": 
"89fee273-d71b-4400-9484-b4bf93b69ee5" + "outputId": "0071a8bb-9fe5-4ed5-a3af-d0209485515a" }, "source": [ "example4.dropna(axis='columns')" ], - "execution_count": 24, + "execution_count": 23, "outputs": [ { "output_type": "execute_result", @@ -1382,7 +1382,7 @@ ] }, "metadata": {}, - "execution_count": 24 + "execution_count": 23 } ] }, @@ -1406,13 +1406,13 @@ "base_uri": "https://localhost:8080/", "height": 142 }, - "outputId": "07e8f4eb-18c8-4e5d-9317-6a9a3db38b73" + "outputId": "a26b5362-0d17-49c2-d902-10832f9bf9a0" }, "source": [ "example4[3] = np.nan\n", "example4" ], - "execution_count": 25, + "execution_count": 24, "outputs": [ { "output_type": "execute_result", @@ -1476,7 +1476,7 @@ ] }, "metadata": {}, - "execution_count": 25 + "execution_count": 24 } ] }, @@ -1513,7 +1513,7 @@ "# How might you go about dropping just column 3?\n", "# Hint: remember that you will need to supply both the axis parameter and the how parameter.\n" ], - "execution_count": 26, + "execution_count": 25, "outputs": [] }, { @@ -1534,12 +1534,12 @@ "base_uri": "https://localhost:8080/", "height": 80 }, - "outputId": "b2c00415-95a6-4a5c-e3f9-781ff5cc8625" + "outputId": "ee2d3a60-a694-4a11-ef37-28d00a8d956c" }, "source": [ "example4.dropna(axis='rows', thresh=3)" ], - "execution_count": 27, + "execution_count": 26, "outputs": [ { "output_type": "execute_result", @@ -1587,7 +1587,7 @@ ] }, "metadata": {}, - "execution_count": 27 + "execution_count": 26 } ] }, @@ -1620,6 +1620,7 @@ "id": "CE8S7louLezV" }, "source": [ + "### Categorical Data(Non-numeric)\n", "First let us consider non-numeric data. In datasets, we have columns with categorical data. Eg. Gender, True or False etc.\n", "\n", "In most of these cases, we replace missing values with the `mode` of the column. Say, we have 100 data points and 90 have said True, 8 have said False and 2 have not filled. Then, we can will the 2 with True, considering the full column. 
\n", @@ -1635,7 +1636,7 @@ "height": 204 }, "id": "MY5faq4yLdpQ", - "outputId": "c3838b07-0d15-471e-8dad-370de91d4bdc" + "outputId": "49350e22-4ee9-43c1-9d6c-e5f837b24ae8" }, "source": [ "fill_with_mode = pd.DataFrame([[1,2,\"True\"],\n", @@ -1646,7 +1647,7 @@ "\n", "fill_with_mode" ], - "execution_count": 28, + "execution_count": 27, "outputs": [ { "output_type": "execute_result", @@ -1720,7 +1721,7 @@ ] }, "metadata": {}, - "execution_count": 28 + "execution_count": 27 } ] }, @@ -1740,12 +1741,12 @@ "base_uri": "https://localhost:8080/" }, "id": "WKy-9Y2tN5jv", - "outputId": "41f5064e-502d-4aec-dc2d-86f885068b4f" + "outputId": "d0c045f2-218c-45aa-951c-f3feed98510a" }, "source": [ "fill_with_mode[2].value_counts()" ], - "execution_count": 29, + "execution_count": 28, "outputs": [ { "output_type": "execute_result", @@ -1757,7 +1758,7 @@ ] }, "metadata": {}, - "execution_count": 29 + "execution_count": 28 } ] }, @@ -1778,7 +1779,7 @@ "source": [ "fill_with_mode[2].fillna('True',inplace=True)" ], - "execution_count": 30, + "execution_count": 29, "outputs": [] }, { @@ -1789,12 +1790,12 @@ "height": 204 }, "id": "tvas7c9_OPWE", - "outputId": "7282c4f7-0e59-4398-b4f2-5919baf61164" + "outputId": "c45890f5-8c76-4a3c-87f0-b831c2199750" }, "source": [ "fill_with_mode" ], - "execution_count": 31, + "execution_count": 30, "outputs": [ { "output_type": "execute_result", @@ -1868,7 +1869,7 @@ ] }, "metadata": {}, - "execution_count": 31 + "execution_count": 30 } ] }, @@ -1887,6 +1888,7 @@ "id": "heYe1I0dOmQ_" }, "source": [ + "### Numeric Data\n", "Now, coming to numeric data. Here, we have a two common ways of replacing missing values:\n", "\n", "1. 
Replace with Median of the row\n", @@ -1907,7 +1909,7 @@ "height": 204 }, "id": "09HM_2feOj5Y", - "outputId": "ade42fec-dc40-45d0-e22c-974849ea8664" + "outputId": "44330273-5709-4af9-99c7-7a3a8e28c7b0" }, "source": [ "fill_with_mean = pd.DataFrame([[-2,0,1],\n", @@ -1918,7 +1920,7 @@ "\n", "fill_with_mean" ], - "execution_count": 33, + "execution_count": 31, "outputs": [ { "output_type": "execute_result", @@ -1992,7 +1994,7 @@ ] }, "metadata": {}, - "execution_count": 33 + "execution_count": 31 } ] }, @@ -2009,15 +2011,15 @@ "cell_type": "code", "metadata": { "id": "XYtYEf5BSxFL", - "outputId": "1e79aeea-6baf-4572-dcd1-23e5ec742036", "colab": { "base_uri": "https://localhost:8080/" - } + }, + "outputId": "7240075c-c3a7-4ac3-e08d-be6d60573d38" }, "source": [ "np.mean(fill_with_mean[0])" ], - "execution_count": 34, + "execution_count": 32, "outputs": [ { "output_type": "execute_result", @@ -2027,7 +2029,7 @@ ] }, "metadata": {}, - "execution_count": 34 + "execution_count": 32 } ] }, @@ -2044,17 +2046,17 @@ "cell_type": "code", "metadata": { "id": "FzncQLmuS5jh", - "outputId": "75f33b25-e6b3-41bb-8049-1ed2e085efe2", "colab": { "base_uri": "https://localhost:8080/", "height": 204 - } + }, + "outputId": "733bfa87-b099-4c11-db2e-1dea88b977ac" }, "source": [ "fill_with_mean[0].fillna(np.mean(fill_with_mean[0]),inplace=True)\n", "fill_with_mean" ], - "execution_count": 35, + "execution_count": 33, "outputs": [ { "output_type": "execute_result", @@ -2128,7 +2130,7 @@ ] }, "metadata": {}, - "execution_count": 35 + "execution_count": 33 } ] }, @@ -2141,201 +2143,1261 @@ "As we can see, the missing value has been replaced with its mean." 
] }, - { - "cell_type": "code", - "metadata": { - "trusted": false, - "id": "0ybtWLDdgRsG" - }, - "source": [ - "example5 = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))\n", - "example5" - ], - "execution_count": null, - "outputs": [] - }, { "cell_type": "markdown", "metadata": { - "id": "yrsigxRggRsH" + "id": "jIvF13a1i00Z" }, "source": [ - "You can fill all of the null entries with a single value, such as `0`:" + "Now let us try another dataframe, and this time we will replace the None values with the median of the column." ] }, { "cell_type": "code", "metadata": { - "trusted": false, - "id": "KXMIPsQdgRsH" + "id": "DA59Bqo3jBYZ", + "outputId": "4338adf5-081c-46ce-aca1-85bcaebf9838", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + } }, "source": [ - "example5.fillna(0)" + "fill_with_median = pd.DataFrame([[-2,0,1],\n", + " [-1,2,3],\n", + " [0,np.nan,5],\n", + " [1,6,7],\n", + " [2,8,9]])\n", + "\n", + "fill_with_median" ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FI9MmqFJgRsH" - }, - "source": [ - "### Exercise:" + "execution_count": 39, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
0-20.01
1-12.03
20NaN5
316.07
428.09
\n", + "
" + ], + "text/plain": [ + " 0 1 2\n", + "0 -2 0.0 1\n", + "1 -1 2.0 3\n", + "2 0 NaN 5\n", + "3 1 6.0 7\n", + "4 2 8.0 9" + ] + }, + "metadata": {}, + "execution_count": 39 + } ] }, - { - "cell_type": "code", - "metadata": { - "collapsed": true, - "trusted": false, - "id": "af-ezpXdgRsH" - }, - "source": [ - "# What happens if you try to fill null values with a string, like ''?\n" - ], - "execution_count": null, - "outputs": [] - }, { "cell_type": "markdown", "metadata": { - "id": "kq3hw1kLgRsI" + "id": "mM1GpXYmjHnc" }, "source": [ - "You can **forward-fill** null values, which is to use the last valid value to fill a null:" + "The median of the second column is" ] }, { "cell_type": "code", "metadata": { - "trusted": false, - "id": "vO3BuNrggRsI" + "id": "uiDy5v3xjHHX", + "outputId": "2028aa4b-8bec-4b76-ea2f-fcaa7b362e9d", + "colab": { + "base_uri": "https://localhost:8080/" + } }, "source": [ - "example5.fillna(method='ffill')" + "fill_with_median[1].median()" ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nDXeYuHzgRsI" - }, - "source": [ - "You can also **back-fill** to propagate the next valid value backward to fill a null:" + "execution_count": 40, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "4.0" + ] + }, + "metadata": {}, + "execution_count": 40 + } ] }, - { - "cell_type": "code", - "metadata": { - "trusted": false, - "id": "4M5onHcEgRsI" - }, - "source": [ - "example5.fillna(method='bfill')" - ], - "execution_count": null, - "outputs": [] - }, { "cell_type": "markdown", "metadata": { - "collapsed": true, - "id": "MbBzTom5gRsI" + "id": "z9PLF75Jj_1s" }, "source": [ - "As you might guess, this works the same with `DataFrame`s, but you can also specify an `axis` along which to fill null values:" + "Filling with median" ] }, { "cell_type": "code", "metadata": { - "trusted": false, - "id": "aRpIvo4ZgRsI" - }, - "source": [ - "example4" - ], - "execution_count": 
null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "trusted": false, - "id": "VM1qtACAgRsI" + "id": "lFKbOxCMkBbg", + "outputId": "61bf2b0e-c68d-4b54-9724-f496c8c2ea94", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + } }, "source": [ - "example4.fillna(method='ffill', axis=1)" + "fill_with_median[1].fillna(fill_with_median[1].median(),inplace=True)\n", + "fill_with_median" ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZeMc-I1EgRsI" + "execution_count": 41, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
0-20.01
1-12.03
204.05
316.07
428.09
\n", + "
" + ], + "text/plain": [ + " 0 1 2\n", + "0 -2 0.0 1\n", + "1 -1 2.0 3\n", + "2 0 4.0 5\n", + "3 1 6.0 7\n", + "4 2 8.0 9" + ] + }, + "metadata": {}, + "execution_count": 41 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8JtQ53GSkKWC" + }, + "source": [ + "As we can see, the NaN value has been replaced by the median of the column" + ] + }, + { + "cell_type": "code", + "metadata": { + "trusted": false, + "id": "0ybtWLDdgRsG", + "outputId": "ee2e547a-bf98-40a5-ddc4-b11357efb898", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "example5 = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))\n", + "example5" + ], + "execution_count": 42, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "a 1.0\n", + "b NaN\n", + "c 2.0\n", + "d NaN\n", + "e 3.0\n", + "dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 42 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yrsigxRggRsH" + }, + "source": [ + "You can fill all of the null entries with a single value, such as `0`:" + ] + }, + { + "cell_type": "code", + "metadata": { + "trusted": false, + "id": "KXMIPsQdgRsH", + "outputId": "f88a0095-9742-4f1e-fdf4-43fc14cbc4c0", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "example5.fillna(0)" + ], + "execution_count": 43, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "a 1.0\n", + "b 0.0\n", + "c 2.0\n", + "d 0.0\n", + "e 3.0\n", + "dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 43 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RRlI5f_hkfKe" + }, + "source": [ + "> Key takeaways:\n", + "1. Filling in missing values should be done when either there is less data or there is a strategy to fill in the missing data.\n", + "2. Domain knowledge can be used to fill in missing values by approximating them.\n", + "3. 
For Categorical data, mostly, missing values are substituted with the mode of the column. \n", + "4. For numeric data, missing values are usually filled in with the mean(for normalized datasets) or the median of the columns. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FI9MmqFJgRsH" + }, + "source": [ + "### Exercise:" + ] + }, + { + "cell_type": "code", + "metadata": { + "collapsed": true, + "trusted": false, + "id": "af-ezpXdgRsH" + }, + "source": [ + "# What happens if you try to fill null values with a string, like ''?\n" + ], + "execution_count": 44, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kq3hw1kLgRsI" + }, + "source": [ + "You can **forward-fill** null values, which is to use the last valid value to fill a null:" + ] + }, + { + "cell_type": "code", + "metadata": { + "trusted": false, + "id": "vO3BuNrggRsI", + "outputId": "aff7d7de-20b9-42bf-fe06-932677314b37", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "example5.fillna(method='ffill')" + ], + "execution_count": 45, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "a 1.0\n", + "b 1.0\n", + "c 2.0\n", + "d 2.0\n", + "e 3.0\n", + "dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 45 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nDXeYuHzgRsI" + }, + "source": [ + "You can also **back-fill** to propagate the next valid value backward to fill a null:" + ] + }, + { + "cell_type": "code", + "metadata": { + "trusted": false, + "id": "4M5onHcEgRsI", + "outputId": "c20c283d-76d7-4f75-c443-5c55fbdb3541", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "example5.fillna(method='bfill')" + ], + "execution_count": 46, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "a 1.0\n", + "b 2.0\n", + "c 2.0\n", + "d 3.0\n", + "e 3.0\n", + "dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 
46 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "id": "MbBzTom5gRsI" + }, + "source": [ + "As you might guess, this works the same with DataFrames, but you can also specify an `axis` along which to fill null values:" + ] + }, + { + "cell_type": "code", + "metadata": { + "trusted": false, + "id": "aRpIvo4ZgRsI", + "outputId": "ea9c5e3d-a23d-4314-cff4-e5a0e46043d1", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 142 + } + }, + "source": [ + "example4" + ], + "execution_count": 47, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123
01.0NaN7NaN
12.05.08NaN
2NaN6.09NaN
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3\n", + "0 1.0 NaN 7 NaN\n", + "1 2.0 5.0 8 NaN\n", + "2 NaN 6.0 9 NaN" + ] + }, + "metadata": {}, + "execution_count": 47 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "trusted": false, + "id": "VM1qtACAgRsI", + "outputId": "2cd3360a-ac87-41fb-d362-9d8c981f573f", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 142 + } + }, + "source": [ + "example4.fillna(method='ffill', axis=1)" + ], + "execution_count": 48, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123
01.01.07.07.0
12.05.08.08.0
2NaN6.09.09.0
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3\n", + "0 1.0 1.0 7.0 7.0\n", + "1 2.0 5.0 8.0 8.0\n", + "2 NaN 6.0 9.0 9.0" + ] + }, + "metadata": {}, + "execution_count": 48 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZeMc-I1EgRsI" + }, + "source": [ + "Notice that when a previous value is not available for forward-filling, the null value remains." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eeAoOU0RgRsJ" + }, + "source": [ + "### Exercise:" + ] + }, + { + "cell_type": "code", + "metadata": { + "collapsed": true, + "trusted": false, + "id": "e8S-CjW8gRsJ" + }, + "source": [ + "# What output does example4.fillna(method='bfill', axis=1) produce?\n", + "# What about example4.fillna(method='ffill') or example4.fillna(method='bfill')?\n", + "# Can you think of a longer code snippet to write that can fill all of the null values in example4?\n" + ], + "execution_count": 49, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YHgy0lIrgRsJ" + }, + "source": [ + "You can be creative about how you use `fillna`. For example, let's look at `example4` again, but this time let's fill the missing values with the average of all of the values in the `DataFrame`:" + ] + }, + { + "cell_type": "code", + "metadata": { + "trusted": false, + "id": "OtYVErEygRsJ", + "outputId": "ad5f4520-cf88-4e3e-fa16-54bda5efa417", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 142 + } + }, + "source": [ + "example4.fillna(example4.mean())" + ], + "execution_count": 50, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123
01.05.57NaN
12.05.08NaN
21.56.09NaN
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3\n", + "0 1.0 5.5 7 NaN\n", + "1 2.0 5.0 8 NaN\n", + "2 1.5 6.0 9 NaN" + ] + }, + "metadata": {}, + "execution_count": 50 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zpMvCkLSgRsJ" + }, + "source": [ + "Notice that column 3 is still valueless: all of its entries are missing, so its mean is also NaN and there is no value to fill it with.\n", + "\n", + "> **Takeaway:** There are multiple ways to deal with missing values in your datasets. The specific strategy you use (removing them, replacing them, or even how you replace them) should be dictated by the particulars of that data. You will develop a better sense of how to deal with missing values the more you handle and interact with datasets." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bauDnESIl9FH" + }, + "source": [ + "### Encoding Categorical Data\n", + "\n", + "Machine learning models only deal with numbers and any form of numeric data. They won't be able to tell the difference between a Yes and a No, but they would be able to distinguish between 0 and 1. So, after filling in the missing values, we need to encode the categorical data to some numeric form for the model to understand.\n", + "\n", + "Encoding can be done in two ways. We will be discussing them next.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uDq9SxB7mu5i" + }, + "source": [ + "**LABEL ENCODING**\n", + "\n", + "\n", + "Label encoding is basically converting each category to a number. For example, say we have a dataset of airline passengers and there is a column containing their class among the following ['business class', 'economy class','first class']. If Label encoding is done on this, this would be transformed to [0,1,2]. Let us see an example via code. As we will be learning `scikit-learn` in the upcoming notebooks, we won't use it here."
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "1vGz7uZyoWHL", + "outputId": "5003c8cd-ff07-4399-a5b2-621b45184511", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + } + }, + "source": [ + "label = pd.DataFrame([\n", + " [10,'business class'],\n", + " [20,'first class'],\n", + " [30, 'economy class'],\n", + " [40, 'economy class'],\n", + " [50, 'economy class'],\n", + " [60, 'business class']\n", + "],columns=['ID','class'])\n", + "label" + ], + "execution_count": 70, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDclass
010business class
120first class
230economy class
340economy class
450economy class
560business class
\n", + "
" + ], + "text/plain": [ + " ID class\n", + "0 10 business class\n", + "1 20 first class\n", + "2 30 economy class\n", + "3 40 economy class\n", + "4 50 economy class\n", + "5 60 business class" + ] + }, + "metadata": {}, + "execution_count": 70 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IDHnkwTYov-h" }, "source": [ - "Notice that when a previous value is not available for forward-filling, the null value remains." + "To perform label encoding on the `class` column, we first have to describe a mapping from each class to a number, before replacing the values." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZC5URJG3o1ES", + "outputId": "c75465b2-169e-417c-8769-680aaf1cd268", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + } + }, + "source": [ + "class_labels = {'business class':0,'economy class':1,'first class':2}\n", + "label['class'] = label['class'].replace(class_labels)\n", + "label" + ], + "execution_count": 71, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDclass
0100
1202
2301
3401
4501
5600
\n", + "
" + ], + "text/plain": [ + " ID class\n", + "0 10 0\n", + "1 20 2\n", + "2 30 1\n", + "3 40 1\n", + "4 50 1\n", + "5 60 0" + ] + }, + "metadata": {}, + "execution_count": 71 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ftnF-TyapOPt" + }, + "source": [ + "As we can see, the output matches what we thought would happen. So, when do we use label encoding? Label encoding is used in either or both of the following cases :\n", + "1. When the number of categories is large\n", + "2. When the categories are in order. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eQPAPVwsqWT7" + }, + "source": [ + "**ONE HOT ENCODING**\n", + "\n", + "Another type of encoding is One Hot Encoding. In this type of encoding, each category of the column gets added as a separate column and each datapoint will get a 0 or a 1 based on whether it contains that category. So, if there are n different categories, n columns will be appended to the dataframe.\n", + "\n", + "For example, let us take the same aeroplane class example. The categories were: ['business class', 'economy class','first class'] . So, if we perform one hot encoding, the following three columns will be added to the dataset: ['class_business class','class_economy class','class_first class']." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZM0eVh0ArKUL", + "outputId": "cba4258f-a6c3-45e0-dd69-32b73b2cd735", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + } + }, + "source": [ + "one_hot = pd.DataFrame([\n", + " [10,'business class'],\n", + " [20,'first class'],\n", + " [30, 'economy class'],\n", + " [40, 'economy class'],\n", + " [50, 'economy class'],\n", + " [60, 'business class']\n", + "],columns=['ID','class'])\n", + "one_hot" + ], + "execution_count": 67, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDclass
010business class
120first class
230economy class
340economy class
450economy class
560business class
\n", + "
" + ], + "text/plain": [ + " ID class\n", + "0 10 business class\n", + "1 20 first class\n", + "2 30 economy class\n", + "3 40 economy class\n", + "4 50 economy class\n", + "5 60 business class" + ] + }, + "metadata": {}, + "execution_count": 67 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aVnZ7paDrWmb" + }, + "source": [ + "Let us perform one hot encoding on the 1st column" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "RUPxf7egrYKr" + }, + "source": [ + "one_hot_data = pd.get_dummies(one_hot,columns=['class'])" + ], + "execution_count": 68, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "TM37pHsFr4ge", + "outputId": "4f9cdbec-5ea6-4613-b14f-5b8b66b85894", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + } + }, + "source": [ + "one_hot_data" + ], + "execution_count": 69, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDclass_business classclass_economy classclass_first class
010100
120001
230010
340010
450010
560100
\n", + "
" + ], + "text/plain": [ + " ID class_business class class_economy class class_first class\n", + "0 10 1 0 0\n", + "1 20 0 0 1\n", + "2 30 0 1 0\n", + "3 40 0 1 0\n", + "4 50 0 1 0\n", + "5 60 1 0 0" + ] + }, + "metadata": {}, + "execution_count": 69 + } ] }, { "cell_type": "markdown", "metadata": { - "id": "eeAoOU0RgRsJ" + "id": "_zXRLOjXujdA" }, "source": [ - "### Exercise:" + "Each one hot encoded column contains 0 or 1, which specifies whether that category exists for that datapoint." ] }, - { - "cell_type": "code", - "metadata": { - "collapsed": true, - "trusted": false, - "id": "e8S-CjW8gRsJ" - }, - "source": [ - "# What output does example4.fillna(method='bfill', axis=1) produce?\n", - "# What about example4.fillna(method='ffill') or example4.fillna(method='bfill')?\n", - "# Can you think of a longer code snippet to write that can fill all of the null values in example4?\n" - ], - "execution_count": null, - "outputs": [] - }, { "cell_type": "markdown", "metadata": { - "id": "YHgy0lIrgRsJ" + "id": "bDnC4NQOu0qr" }, "source": [ - "You can be creative about how you use `fillna`. For example, let's look at `example4` again, but this time let's fill the missing values with the average of all of the values in the `DataFrame`:" + "When do we use one hot encoding? One hot encoding is used in either or both of the following cases :\n", + "\n", + "1. When the number of categories and the size of the dataset is smaller.\n", + "2. When the categories follow no particular order." ] }, - { - "cell_type": "code", - "metadata": { - "trusted": false, - "id": "OtYVErEygRsJ" - }, - "source": [ - "example4.fillna(example4.mean())" - ], - "execution_count": null, - "outputs": [] - }, { "cell_type": "markdown", "metadata": { - "id": "zpMvCkLSgRsJ" + "id": "XnUmci_4uvyu" }, "source": [ - "Notice that column 3 is still valueless: the default direction is to fill values row-wise.\n", - "\n", - "> **Takeaway:** There are multiple ways to deal with missing values in your datasets. 
The specific strategy you use (removing them, replacing them, or even how you replace them) should be dictated by the particulars of that data. You will develop a better sense of how to deal with missing values the more you handle and interact with datasets." + "> Key Takeaways:\n", + "1. Encoding is done to convert non-numeric data to numeric data.\n", + "2. There are two types of encoding: Label encoding and One Hot encoding, both of which can be performed based on the demands of the dataset. " ] }, { @@ -2366,27 +3428,121 @@ "cell_type": "code", "metadata": { "trusted": false, - "id": "ZLu6FEnZgRsJ" + "id": "ZLu6FEnZgRsJ", + "outputId": "d62ede23-a8ba-412b-f666-6fc1a43af424", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + } }, "source": [ "example6 = pd.DataFrame({'letters': ['A','B'] * 2 + ['B'],\n", " 'numbers': [1, 2, 1, 3, 3]})\n", "example6" ], - "execution_count": null, - "outputs": [] + "execution_count": 72, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lettersnumbers
0A1
1B2
2A1
3B3
4B3
\n", + "
" + ], + "text/plain": [ + " letters numbers\n", + "0 A 1\n", + "1 B 2\n", + "2 A 1\n", + "3 B 3\n", + "4 B 3" + ] + }, + "metadata": {}, + "execution_count": 72 + } + ] }, { "cell_type": "code", "metadata": { "trusted": false, - "id": "cIduB5oBgRsK" + "id": "cIduB5oBgRsK", + "outputId": "061ff212-4cba-4f49-ae20-a7bde21b54a3", + "colab": { + "base_uri": "https://localhost:8080/" + } }, "source": [ "example6.duplicated()" ], - "execution_count": null, - "outputs": [] + "execution_count": 73, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 False\n", + "1 False\n", + "2 True\n", + "3 False\n", + "4 True\n", + "dtype: bool" + ] + }, + "metadata": {}, + "execution_count": 73 + } + ] }, { "cell_type": "markdown", @@ -2402,13 +3558,75 @@ "cell_type": "code", "metadata": { "trusted": false, - "id": "w_YPpqIqgRsK" + "id": "w_YPpqIqgRsK", + "outputId": "5081cf87-9e65-493f-c867-c73f3833b775", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 142 + } }, "source": [ "example6.drop_duplicates()" ], - "execution_count": null, - "outputs": [] + "execution_count": 74, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lettersnumbers
0A1
1B2
3B3
\n", + "
" + ], + "text/plain": [ + " letters numbers\n", + "0 A 1\n", + "1 B 2\n", + "3 B 3" + ] + }, + "metadata": {}, + "execution_count": 74 + } + ] }, { "cell_type": "markdown", @@ -2423,13 +3641,69 @@ "cell_type": "code", "metadata": { "trusted": false, - "id": "BILjDs67gRsK" + "id": "BILjDs67gRsK", + "outputId": "1087142d-5a36-4667-8b70-45824de07d64", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 111 + } }, "source": [ "example6.drop_duplicates(['letters'])" ], - "execution_count": null, - "outputs": [] + "execution_count": 75, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lettersnumbers
0A1
1B2
\n", + "
" + ], + "text/plain": [ + " letters numbers\n", + "0 A 1\n", + "1 B 2" + ] + }, + "metadata": {}, + "execution_count": 75 + } + ] }, { "cell_type": "markdown",