{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# Analyzing Data\r\n",
    "Examples of the pandas functions (`describe`, `sample`, and `query`) mentioned in the [lesson](README.md)."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "source": [
    "import pandas as pd\r\n",
    "import glob\r\n",
    "\r\n",
    "#Loading the dataset\r\n",
    "path = '../../data/emails.csv'\r\n",
    "email_df = pd.read_csv(path)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "source": [
    "# Using Describe on the email dataset\r\n",
    "print(email_df.describe())"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "              the          to         ect         and         for          of  \\\n",
      "count  406.000000  406.000000  406.000000  406.000000  406.000000  406.000000   \n",
      "mean     7.022167    6.519704    4.948276    3.059113    3.502463    2.662562   \n",
      "std     10.945522    9.801907    9.293820    6.267806    4.901372    5.443939   \n",
      "min      0.000000    0.000000    1.000000    0.000000    0.000000    0.000000   \n",
      "25%      1.000000    1.000000    1.000000    0.000000    1.000000    0.000000   \n",
      "50%      3.000000    3.000000    2.000000    1.000000    2.000000    1.000000   \n",
      "75%      9.000000    7.750000    4.000000    3.000000    4.750000    3.000000   \n",
      "max     99.000000   88.000000   79.000000   69.000000   39.000000   57.000000   \n",
      "\n",
      "                a         you          in          on          is        this  \\\n",
      "count  406.000000  406.000000  406.000000  406.000000  406.000000  406.000000   \n",
      "mean    57.017241    2.394089   10.817734   11.591133    5.901478    1.485222   \n",
      "std     78.868243    4.067015   19.050972   16.407175    8.793103    2.912473   \n",
      "min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   \n",
      "25%     15.000000    0.000000    1.250000    3.000000    1.000000    0.000000   \n",
      "50%     29.000000    1.000000    5.000000    6.000000    3.000000    0.000000   \n",
      "75%     61.000000    3.000000   12.000000   13.000000    7.000000    2.000000   \n",
      "max    843.000000   31.000000  223.000000  125.000000   61.000000   24.000000   \n",
      "\n",
      "                i          be        that        will  \n",
      "count  406.000000  406.000000  406.000000  406.000000  \n",
      "mean    47.155172    2.950739    1.034483    0.955665  \n",
      "std     71.043009    4.297865    1.904846    2.042271  \n",
      "min      0.000000    0.000000    0.000000    0.000000  \n",
      "25%     11.000000    1.000000    0.000000    0.000000  \n",
      "50%     24.000000    1.000000    0.000000    0.000000  \n",
      "75%     50.750000    3.000000    1.000000    1.000000  \n",
      "max    754.000000   40.000000   14.000000   24.000000  \n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "source": [
    "# Sampling 10 emails\r\n",
    "print(email_df.sample(10))"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "      Email No.  the  to  ect  and  for  of    a  you  in  on  is  this    i  \\\n",
      "150   Email 151    0   1    2    0    3   0   15    0   0   5   0     0    7   \n",
      "380  Email 5147    0   3    2    0    0   0    7    0   1   1   0     0    3   \n",
      "19     Email 20    3   4   11    0    4   2   32    1   1   3   9     5   25   \n",
      "300   Email 301    2   1    1    0    1   1   15    2   2   3   2     0    8   \n",
      "307   Email 308    0   0    1    0    0   0    1    0   1   0   0     0    2   \n",
      "167   Email 168    2   2    2    1    5   1   24    2   5   6   4     0   30   \n",
      "320   Email 321   10  12    4    6    8   6  187    5  26  28  23     2  171   \n",
      "61     Email 62    0   1    1    0    4   1   15    4   4   3   3     0   19   \n",
      "26     Email 27    5   4    1    1    4   4   51    0   8   6   6     2   44   \n",
      "73     Email 74    0   0    1    0    0   0    7    0   4   3   0     0    6   \n",
      "\n",
      "     be  that  will  \n",
      "150   1     0     0  \n",
      "380   0     0     0  \n",
      "19    3     0     1  \n",
      "300   0     0     0  \n",
      "307   0     0     0  \n",
      "167   2     0     0  \n",
      "320   5     1     1  \n",
      "61    2     0     0  \n",
      "26    6     0     0  \n",
      "73    0     0     0  \n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "source": [
    "# Returns rows where there are more occurrences of \"to\" than \"the\"\r\n",
    "print(email_df.query('the < to'))"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "      Email No.  the  to  ect  and  for  of    a  you  in  on  is  this   i  \\\n",
      "1       Email 2    8  13   24    6    6   2  102    1  18  21  13     0  61   \n",
      "3       Email 4    0   5   22    0    5   1   51    2   1   5   9     2  16   \n",
      "5       Email 6    4   5    1    4    2   3   45    1  16  12   8     1  52   \n",
      "7       Email 8    0   2    2    3    1   2   21    6   2   6   2     0  28   \n",
      "13     Email 14    4   5    7    1    5   1   37    1   8   8   6     1  43   \n",
      "..          ...  ...  ..  ...  ...  ...  ..  ...  ...  ..  ..  ..   ...  ..   \n",
      "390  Email 5157    4  13    1    0    3   1   48    2   8  26   9     1  45   \n",
      "393  Email 5160    2  13    1    0    2   1   38    2   7  24   6     1  34   \n",
      "396  Email 5163    2   3    1    2    1   2   32    0   7   3   2     0  26   \n",
      "404  Email 5171    2   7    1    0    2   1   28    2   8  11   7     1  39   \n",
      "405  Email 5172   22  24    5    1    6   5  148    8  23  13   5     4  99   \n",
      "\n",
      "     be  that  will  \n",
      "1     4     2     0  \n",
      "3     2     0     0  \n",
      "5     2     0     0  \n",
      "7     1     0     1  \n",
      "13    1     0     1  \n",
      "..   ..   ...   ...  \n",
      "390   1     0     0  \n",
      "393   1     0     0  \n",
      "396   3     0     0  \n",
      "404   1     0     0  \n",
      "405   6     4     1  \n",
      "\n",
      "[169 rows x 17 columns]\n"
     ]
    }
   ],
   "metadata": {}
  }
 ],
 "metadata": {
  "orig_nbformat": 4,
  "language_info": {
   "name": "python",
   "version": "3.9.7",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.9.7 64-bit ('venv': venv)"
  },
  "interpreter": {
   "hash": "6b9b57232c4b57163d057191678da2030059e733b8becc68f245de5a75abe84e"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}