diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb index e45a5cb..93c37c3 100644 --- a/2-Working-With-Data/08-data-preparation/notebook.ipynb +++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb @@ -3,28 +3,28 @@ { "cell_type": "markdown", "source": [ - "# Data Preparation\r\n", - "\r\n", - "[Original Notebook source from *Data Science: Introduction to Machine Learning for Data Science Python and Machine Learning Studio by Lee Stott*](https://github.com/leestott/intro-Datascience/blob/master/Course%20Materials/4-Cleaning_and_Manipulating-Reference.ipynb)\r\n", - "\r\n", - "## Exploring `DataFrame` information\r\n", - "\r\n", - "> **Learning goal:** By the end of this subsection, you should be comfortable finding general information about the data stored in pandas DataFrames.\r\n", - "\r\n", - "Once you have loaded your data into pandas, it will more likely than not be in a `DataFrame`. However, if the data set in your `DataFrame` has 60,000 rows and 400 columns, how do you even begin to get a sense of what you're working with? 
Fortunately, pandas provides some convenient tools to quickly look at overall information about a `DataFrame` in addition to the first few and last few rows.\r\n", - "\r\n", - "In order to explore this functionality, we will import the Python scikit-learn library and use an iconic dataset that every data scientist has seen hundreds of times: British biologist Ronald Fisher's *Iris* data set used in his 1936 paper \"The use of multiple measurements in taxonomic problems\":" + "# Data Preparation\n", + "\n", + "[Original Notebook source from *Data Science: Introduction to Machine Learning for Data Science Python and Machine Learning Studio by Lee Stott*](https://github.com/leestott/intro-Datascience/blob/master/Course%20Materials/4-Cleaning_and_Manipulating-Reference.ipynb)\n", + "\n", + "## Exploring `DataFrame` information\n", + "\n", + "> **Learning goal:** By the end of this subsection, you should be comfortable finding general information about the data stored in pandas DataFrames.\n", + "\n", + "Once you have loaded your data into pandas, it will more likely than not be in a `DataFrame`. 
\n", + "\n", + "In order to explore our `DataFrame`, we will import the Python `scikit-learn` library and use an iconic dataset that every data scientist has seen hundreds of times: British biologist Ronald Fisher's **Iris data set** used in his 1936 paper \"*The use of multiple measurements in taxonomic problems*\":" ], "metadata": {} }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "source": [ - "import pandas as pd\r\n", - "from sklearn.datasets import load_iris\r\n", - "\r\n", - "iris = load_iris()\r\n", + "import pandas as pd\n", + "from sklearn.datasets import load_iris\n", + "\n", + "iris = load_iris()\n", "iris_df = pd.DataFrame(data=iris['data'], columns=iris['feature_names'])" ], "outputs": [], @@ -43,11 +43,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "source": [ "iris_df.info()" ], - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 150 entries, 0 to 149\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 sepal length (cm) 150 non-null float64\n", + " 1 sepal width (cm) 150 non-null float64\n", + " 2 petal length (cm) 150 non-null float64\n", + " 3 petal width (cm) 150 non-null float64\n", + "dtypes: float64(4)\n", + "memory usage: 4.8 KB\n" + ] + } + ], "metadata": { "trusted": false } @@ -69,11 +87,92 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "source": [ "iris_df.head()" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)
05.13.51.40.2
14.93.01.40.2
24.73.21.30.2
34.63.11.50.2
45.03.61.40.2
\n", + "
" + ], + "text/plain": [ + " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)\n", + "0 5.1 3.5 1.4 0.2\n", + "1 4.9 3.0 1.4 0.2\n", + "2 4.7 3.2 1.3 0.2\n", + "3 4.6 3.1 1.5 0.2\n", + "4 5.0 3.6 1.4 0.2" + ] + }, + "metadata": {}, + "execution_count": 3 + } + ], "metadata": { "trusted": false } @@ -89,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "source": [ "# Hint: Consult the documentation by using iris_df.head?" ], @@ -109,11 +208,92 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "source": [ "iris_df.tail()" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)
1456.73.05.22.3
1466.32.55.01.9
1476.53.05.22.0
1486.23.45.42.3
1495.93.05.11.8
\n", + "
" + ], + "text/plain": [ + " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)\n", + "145 6.7 3.0 5.2 2.3\n", + "146 6.3 2.5 5.0 1.9\n", + "147 6.5 3.0 5.2 2.0\n", + "148 6.2 3.4 5.4 2.3\n", + "149 5.9 3.0 5.1 1.8" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ], "metadata": { "trusted": false } @@ -154,14 +334,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "source": [ - "import numpy as np\r\n", - "\r\n", - "example1 = np.array([2, None, 6, 8])\r\n", + "import numpy as np\n", + "\n", + "example1 = np.array([2, None, 6, 8])\n", "example1" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([2, None, 6, 8], dtype=object)" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ], "metadata": { "trusted": false } @@ -177,11 +368,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "source": [ "example1.sum()" ], - "outputs": [], + "outputs": [ + { + "output_type": "error", + "ename": "TypeError", + "evalue": "unsupported operand type(s) for +: 'int' and 'NoneType'", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mexample1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/numpy/core/_methods.py\u001b[0m in \u001b[0;36m_sum\u001b[0;34m(a, axis, dtype, out, keepdims, initial, where)\u001b[0m\n\u001b[1;32m 45\u001b[0m def _sum(a, axis=None, dtype=None, out=None, keepdims=False,\n\u001b[1;32m 46\u001b[0m initial=_NoValue, where=True):\n\u001b[0;32m---> 47\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mumr_sum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkeepdims\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minitial\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwhere\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 48\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 49\u001b[0m def _prod(a, axis=None, dtype=None, out=None, keepdims=False,\n", + "\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for +: 'int' and 'NoneType'" + ] + } + ], "metadata": { "trusted": false } @@ -204,22 +408,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "source": [ "np.nan + 1" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "nan" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ], "metadata": { "trusted": false } }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "source": [ "np.nan * 0" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "nan" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ], "metadata": { "trusted": false } @@ -233,12 +459,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "source": [ - "example2 = np.array([2, np.nan, 6, 8]) \r\n", + "example2 = np.array([2, np.nan, 6, 8]) \n", "example2.sum(), example2.min(), example2.max()" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(nan, nan, nan)" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ], "metadata": { "trusted": false } @@ -252,9 +489,9 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "source": [ - "# What happens if you add np.nan and None together?\r\n" + "# 
What happens if you add np.nan and None together?\n" ], "outputs": [], "metadata": { @@ -280,12 +517,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "source": [ - "int_series = pd.Series([1, 2, 3], dtype=int)\r\n", + "int_series = pd.Series([1, 2, 3], dtype=int)\n", "int_series" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 1\n", + "1 2\n", + "2 3\n", + "dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ], "metadata": { "trusted": false } @@ -299,11 +550,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "source": [ - "# Now set an element of int_series equal to None.\r\n", - "# How does that element show up in the Series?\r\n", - "# What is the dtype of the Series?\r\n" + "# Now set an element of int_series equal to None.\n", + "# How does that element show up in the Series?\n", + "# What is the dtype of the Series?\n" ], "outputs": [], "metadata": { @@ -335,7 +586,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "source": [ "example3 = pd.Series([0, np.nan, '', None])" ], @@ -347,11 +598,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "source": [ "example3.isnull()" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 False\n", + "1 True\n", + "2 False\n", + "3 True\n", + "dtype: bool" + ] + }, + "metadata": {}, + "execution_count": 15 + } + ], "metadata": { "trusted": false } @@ -374,10 +640,10 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "source": [ - "# Try running example3[example3.notnull()].\r\n", - "# Before you do so, what do you expect to see?\r\n" + "# Try running example3[example3.notnull()].\n", + "# Before you do so, what do you expect to see?\n" ], "outputs": [], "metadata": { @@ -403,12 +669,25 @@ }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 17, "source": [ - "example3 = example3.dropna()\r\n", + "example3 = example3.dropna()\n", "example3" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 0\n", + "2 \n", + "dtype: object" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ], "metadata": { "trusted": false } @@ -424,14 +703,75 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "source": [ - "example4 = pd.DataFrame([[1, np.nan, 7], \r\n", - " [2, 5, 8], \r\n", - " [np.nan, 6, 9]])\r\n", + "example4 = pd.DataFrame([[1, np.nan, 7], \n", + " [2, 5, 8], \n", + " [np.nan, 6, 9]])\n", "example4" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
01.0NaN7
12.05.08
2NaN6.09
\n", + "
" + ], + "text/plain": [ + " 0 1 2\n", + "0 1.0 NaN 7\n", + "1 2.0 5.0 8\n", + "2 NaN 6.0 9" + ] + }, + "metadata": {}, + "execution_count": 18 + } + ], "metadata": { "trusted": false } @@ -447,11 +787,58 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "source": [ "example4.dropna()" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
12.05.08
\n", + "
" + ], + "text/plain": [ + " 0 1 2\n", + "1 2.0 5.0 8" + ] + }, + "metadata": {}, + "execution_count": 19 + } + ], "metadata": { "trusted": false } @@ -465,11 +852,64 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "source": [ "example4.dropna(axis='columns')" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
2
07
18
29
\n", + "
" + ], + "text/plain": [ + " 2\n", + "0 7\n", + "1 8\n", + "2 9" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ], "metadata": { "trusted": false } @@ -485,12 +925,77 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "source": [ - "example4[3] = np.nan\r\n", + "example4[3] = np.nan\n", "example4" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123
01.0NaN7NaN
12.05.08NaN
2NaN6.09NaN
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3\n", + "0 1.0 NaN 7 NaN\n", + "1 2.0 5.0 8 NaN\n", + "2 NaN 6.0 9 NaN" + ] + }, + "metadata": {}, + "execution_count": 21 + } + ], "metadata": { "trusted": false } @@ -504,10 +1009,10 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "source": [ - "# How might you go about dropping just column 3?\r\n", - "# Hint: remember that you will need to supply both the axis parameter and the how parameter.\r\n" + "# How might you go about dropping just column 3?\n", + "# Hint: remember that you will need to supply both the axis parameter and the how parameter.\n" ], "outputs": [], "metadata": { @@ -524,11 +1029,60 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "source": [ "example4.dropna(axis='rows', thresh=3)" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123
12.05.08NaN
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3\n", + "1 2.0 5.0 8 NaN" + ] + }, + "metadata": {}, + "execution_count": 23 + } + ], "metadata": { "trusted": false } @@ -551,12 +1105,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "source": [ - "example5 = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))\r\n", + "example5 = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))\n", "example5" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "a 1.0\n", + "b NaN\n", + "c 2.0\n", + "d NaN\n", + "e 3.0\n", + "dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 24 + } + ], "metadata": { "trusted": false } @@ -570,11 +1140,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "source": [ "example5.fillna(0)" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "a 1.0\n", + "b 0.0\n", + "c 2.0\n", + "d 0.0\n", + "e 3.0\n", + "dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 25 + } + ], "metadata": { "trusted": false } @@ -588,9 +1174,9 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "source": [ - "# What happens if you try to fill null values with a string, like ''?\r\n" + "# What happens if you try to fill null values with a string, like ''?\n" ], "outputs": [], "metadata": { @@ -607,11 +1193,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "source": [ "example5.fillna(method='ffill')" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "a 1.0\n", + "b 1.0\n", + "c 2.0\n", + "d 2.0\n", + "e 3.0\n", + "dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 27 + } + ], "metadata": { "trusted": false } @@ -625,11 +1227,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "source": [ "example5.fillna(method='bfill')" ], 
- "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "a 1.0\n", + "b 2.0\n", + "c 2.0\n", + "d 3.0\n", + "e 3.0\n", + "dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 28 + } + ], "metadata": { "trusted": false } @@ -645,22 +1263,152 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "source": [ "example4" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123
01.0NaN7NaN
12.05.08NaN
2NaN6.09NaN
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3\n", + "0 1.0 NaN 7 NaN\n", + "1 2.0 5.0 8 NaN\n", + "2 NaN 6.0 9 NaN" + ] + }, + "metadata": {}, + "execution_count": 29 + } + ], "metadata": { "trusted": false } }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "source": [ "example4.fillna(method='ffill', axis=1)" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123
01.01.07.07.0
12.05.08.08.0
2NaN6.09.09.0
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3\n", + "0 1.0 1.0 7.0 7.0\n", + "1 2.0 5.0 8.0 8.0\n", + "2 NaN 6.0 9.0 9.0" + ] + }, + "metadata": {}, + "execution_count": 30 + } + ], "metadata": { "trusted": false } @@ -681,11 +1429,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "source": [ - "# What output does example4.fillna(method='bfill', axis=1) produce?\r\n", - "# What about example4.fillna(method='ffill') or example4.fillna(method='bfill')?\r\n", - "# Can you think of a longer code snippet to write that can fill all of the null values in example4?\r\n" + "# What output does example4.fillna(method='bfill', axis=1) produce?\n", + "# What about example4.fillna(method='ffill') or example4.fillna(method='bfill')?\n", + "# Can you think of a longer code snippet to write that can fill all of the null values in example4?\n" ], "outputs": [], "metadata": { @@ -702,11 +1450,76 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "source": [ "example4.fillna(example4.mean())" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123
01.05.57NaN
12.05.08NaN
21.56.09NaN
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3\n", + "0 1.0 5.5 7 NaN\n", + "1 2.0 5.0 8 NaN\n", + "2 1.5 6.0 9 NaN" + ] + }, + "metadata": {}, + "execution_count": 32 + } + ], "metadata": { "trusted": false } @@ -742,24 +1555,109 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "source": [ - "example6 = pd.DataFrame({'letters': ['A','B'] * 2 + ['B'],\r\n", - " 'numbers': [1, 2, 1, 3, 3]})\r\n", + "example6 = pd.DataFrame({'letters': ['A','B'] * 2 + ['B'],\n", + " 'numbers': [1, 2, 1, 3, 3]})\n", "example6" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lettersnumbers
0A1
1B2
2A1
3B3
4B3
\n", + "
" + ], + "text/plain": [ + " letters numbers\n", + "0 A 1\n", + "1 B 2\n", + "2 A 1\n", + "3 B 3\n", + "4 B 3" + ] + }, + "metadata": {}, + "execution_count": 33 + } + ], "metadata": { "trusted": false } }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "source": [ "example6.duplicated()" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 False\n", + "1 False\n", + "2 True\n", + "3 False\n", + "4 True\n", + "dtype: bool" + ] + }, + "metadata": {}, + "execution_count": 34 + } + ], "metadata": { "trusted": false } @@ -774,11 +1672,68 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "source": [ "example6.drop_duplicates()" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lettersnumbers
0A1
1B2
3B3
\n", + "
" + ], + "text/plain": [ + " letters numbers\n", + "0 A 1\n", + "1 B 2\n", + "3 B 3" + ] + }, + "metadata": {}, + "execution_count": 35 + } + ], "metadata": { "trusted": false } @@ -792,11 +1747,62 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "source": [ "example6.drop_duplicates(['letters'])" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lettersnumbers
0A1
1B2
\n", + "
" + ], + "text/plain": [ + " letters numbers\n", + "0 A 1\n", + "1 B 2" + ] + }, + "metadata": {}, + "execution_count": 36 + } + ], "metadata": { "trusted": false } @@ -813,20 +1819,22 @@ "anaconda-cloud": {}, "kernelspec": { "name": "python3", - "display_name": "Python 3", - "language": "python" + "display_name": "Python 3.8.8 64-bit ('base': conda)" }, "language_info": { "mimetype": "text/x-python", "nbconvert_exporter": "python", "name": "python", "file_extension": ".py", - "version": "3.5.4", + "version": "3.8.8", "pygments_lexer": "ipython3", "codemirror_mode": { "version": 3, "name": "ipython" } + }, + "interpreter": { + "hash": "ac36fb7022a775f2750f61e1a6104d2d5a9eb3fb9bd004b80f1c771537b93945" } }, "nbformat": 4,