{ "cells": [ { "cell_type": "markdown", "source": [ "# Analyzing Data\r\n", "Examples of the pandas functions covered in the [lesson](README.md): `describe`, `sample`, and `query`, applied to the `emails.csv` dataset." ], "metadata": {} }, { "cell_type": "code", "execution_count": 1, "source": [ "import pandas as pd\r\n", "import glob\r\n", "\r\n", "# Load the dataset\r\n", "path = '../../data/emails.csv'\r\n", "email_df = pd.read_csv(path)" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 2, "source": [ "# Use describe() to summarize the numeric columns of the email dataset\r\n", "print(email_df.describe())" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " the to ect and for of \\\n", "count 406.000000 406.000000 406.000000 406.000000 406.000000 406.000000 \n", "mean 7.022167 6.519704 4.948276 3.059113 3.502463 2.662562 \n", "std 10.945522 9.801907 9.293820 6.267806 4.901372 5.443939 \n", "min 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 \n", "25% 1.000000 1.000000 1.000000 0.000000 1.000000 0.000000 \n", "50% 3.000000 3.000000 2.000000 1.000000 2.000000 1.000000 \n", "75% 9.000000 7.750000 4.000000 3.000000 4.750000 3.000000 \n", "max 99.000000 88.000000 79.000000 69.000000 39.000000 57.000000 \n", "\n", " a you in on is this \\\n", "count 406.000000 406.000000 406.000000 406.000000 406.000000 406.000000 \n", "mean 57.017241 2.394089 10.817734 11.591133 5.901478 1.485222 \n", "std 78.868243 4.067015 19.050972 16.407175 8.793103 2.912473 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 15.000000 0.000000 1.250000 3.000000 1.000000 0.000000 \n", "50% 29.000000 1.000000 5.000000 6.000000 3.000000 0.000000 \n", "75% 61.000000 3.000000 12.000000 13.000000 7.000000 2.000000 \n", "max 843.000000 31.000000 223.000000 125.000000 61.000000 24.000000 \n", "\n", " i be that will \n", "count 406.000000 406.000000 406.000000 406.000000 \n", "mean 47.155172 2.950739 1.034483 0.955665 \n", "std 71.043009 4.297865 1.904846 2.042271 \n", "min 0.000000 0.000000 0.000000 0.000000 \n", "25% 
11.000000 1.000000 0.000000 0.000000 \n", "50% 24.000000 1.000000 0.000000 0.000000 \n", "75% 50.750000 3.000000 1.000000 1.000000 \n", "max 754.000000 40.000000 14.000000 24.000000 \n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": 5, "source": [ "# Sampling 10 emails\r\n", "print(email_df.sample(10))" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " Email No. the to ect and for of a you in on is this i \\\n", "150 Email 151 0 1 2 0 3 0 15 0 0 5 0 0 7 \n", "380 Email 5147 0 3 2 0 0 0 7 0 1 1 0 0 3 \n", "19 Email 20 3 4 11 0 4 2 32 1 1 3 9 5 25 \n", "300 Email 301 2 1 1 0 1 1 15 2 2 3 2 0 8 \n", "307 Email 308 0 0 1 0 0 0 1 0 1 0 0 0 2 \n", "167 Email 168 2 2 2 1 5 1 24 2 5 6 4 0 30 \n", "320 Email 321 10 12 4 6 8 6 187 5 26 28 23 2 171 \n", "61 Email 62 0 1 1 0 4 1 15 4 4 3 3 0 19 \n", "26 Email 27 5 4 1 1 4 4 51 0 8 6 6 2 44 \n", "73 Email 74 0 0 1 0 0 0 7 0 4 3 0 0 6 \n", "\n", " be that will \n", "150 1 0 0 \n", "380 0 0 0 \n", "19 3 0 1 \n", "300 0 0 0 \n", "307 0 0 0 \n", "167 2 0 0 \n", "320 5 1 1 \n", "61 2 0 0 \n", "26 6 0 0 \n", "73 0 0 0 \n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": 14, "source": [ "# Returns rows where there are more occurrences of \"to\" than \"the\"\r\n", "print(email_df.query('the < to'))" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " Email No. the to ect and for of a you in on is this i \\\n", "1 Email 2 8 13 24 6 6 2 102 1 18 21 13 0 61 \n", "3 Email 4 0 5 22 0 5 1 51 2 1 5 9 2 16 \n", "5 Email 6 4 5 1 4 2 3 45 1 16 12 8 1 52 \n", "7 Email 8 0 2 2 3 1 2 21 6 2 6 2 0 28 \n", "13 Email 14 4 5 7 1 5 1 37 1 8 8 6 1 43 \n", ".. ... ... .. ... ... ... .. ... ... .. .. .. ... .. 
\n", "390 Email 5157 4 13 1 0 3 1 48 2 8 26 9 1 45 \n", "393 Email 5160 2 13 1 0 2 1 38 2 7 24 6 1 34 \n", "396 Email 5163 2 3 1 2 1 2 32 0 7 3 2 0 26 \n", "404 Email 5171 2 7 1 0 2 1 28 2 8 11 7 1 39 \n", "405 Email 5172 22 24 5 1 6 5 148 8 23 13 5 4 99 \n", "\n", " be that will \n", "1 4 2 0 \n", "3 2 0 0 \n", "5 2 0 0 \n", "7 1 0 1 \n", "13 1 0 1 \n", ".. .. ... ... \n", "390 1 0 0 \n", "393 1 0 0 \n", "396 3 0 0 \n", "404 1 0 0 \n", "405 6 4 1 \n", "\n", "[169 rows x 17 columns]\n" ] } ], "metadata": {} } ], "metadata": { "orig_nbformat": 4, "language_info": { "name": "python", "version": "3.9.7", "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "file_extension": ".py" }, "kernelspec": { "name": "python3", "display_name": "Python 3.9.7 64-bit ('venv': venv)" }, "interpreter": { "hash": "6b9b57232c4b57163d057191678da2030059e733b8becc68f245de5a75abe84e" } }, "nbformat": 4, "nbformat_minor": 2 }