You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
180 lines
7.0 KiB
180 lines
7.0 KiB
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"source": [
|
|
"# Analyzing Data\r\n",
|
|
"Examples of the pandas functions mentioned in the [lesson](README.md): `describe`, `sample`, and `query`."
|
|
],
|
|
"metadata": {}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"source": [
|
|
"import pandas as pd\r\n",
|
|
"import glob\r\n",
|
|
"\r\n",
|
|
"# Load the email dataset from the CSV file into a DataFrame\r\n",
|
|
"path = '../../data/emails.csv'\r\n",
|
|
"email_df = pd.read_csv(path)"
|
|
],
|
|
"outputs": [],
|
|
"metadata": {}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"source": [
|
|
"# Use describe() to get summary statistics (count, mean, std, min, quartiles, max) for each numeric column\r\n",
|
|
"print(email_df.describe())"
|
|
],
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"name": "stdout",
|
|
"text": [
|
|
" the to ect and for of \\\n",
|
|
"count 406.000000 406.000000 406.000000 406.000000 406.000000 406.000000 \n",
|
|
"mean 7.022167 6.519704 4.948276 3.059113 3.502463 2.662562 \n",
|
|
"std 10.945522 9.801907 9.293820 6.267806 4.901372 5.443939 \n",
|
|
"min 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 \n",
|
|
"25% 1.000000 1.000000 1.000000 0.000000 1.000000 0.000000 \n",
|
|
"50% 3.000000 3.000000 2.000000 1.000000 2.000000 1.000000 \n",
|
|
"75% 9.000000 7.750000 4.000000 3.000000 4.750000 3.000000 \n",
|
|
"max 99.000000 88.000000 79.000000 69.000000 39.000000 57.000000 \n",
|
|
"\n",
|
|
" a you in on is this \\\n",
|
|
"count 406.000000 406.000000 406.000000 406.000000 406.000000 406.000000 \n",
|
|
"mean 57.017241 2.394089 10.817734 11.591133 5.901478 1.485222 \n",
|
|
"std 78.868243 4.067015 19.050972 16.407175 8.793103 2.912473 \n",
|
|
"min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
|
"25% 15.000000 0.000000 1.250000 3.000000 1.000000 0.000000 \n",
|
|
"50% 29.000000 1.000000 5.000000 6.000000 3.000000 0.000000 \n",
|
|
"75% 61.000000 3.000000 12.000000 13.000000 7.000000 2.000000 \n",
|
|
"max 843.000000 31.000000 223.000000 125.000000 61.000000 24.000000 \n",
|
|
"\n",
|
|
" i be that will \n",
|
|
"count 406.000000 406.000000 406.000000 406.000000 \n",
|
|
"mean 47.155172 2.950739 1.034483 0.955665 \n",
|
|
"std 71.043009 4.297865 1.904846 2.042271 \n",
|
|
"min 0.000000 0.000000 0.000000 0.000000 \n",
|
|
"25% 11.000000 1.000000 0.000000 0.000000 \n",
|
|
"50% 24.000000 1.000000 0.000000 0.000000 \n",
|
|
"75% 50.750000 3.000000 1.000000 1.000000 \n",
|
|
"max 754.000000 40.000000 14.000000 24.000000 \n"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"source": [
|
|
"# Draw a random sample of 10 emails (no random_state is set, so the rows change on every run)\r\n",
|
|
"print(email_df.sample(10))"
|
|
],
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"name": "stdout",
|
|
"text": [
|
|
" Email No. the to ect and for of a you in on is this i \\\n",
|
|
"150 Email 151 0 1 2 0 3 0 15 0 0 5 0 0 7 \n",
|
|
"380 Email 5147 0 3 2 0 0 0 7 0 1 1 0 0 3 \n",
|
|
"19 Email 20 3 4 11 0 4 2 32 1 1 3 9 5 25 \n",
|
|
"300 Email 301 2 1 1 0 1 1 15 2 2 3 2 0 8 \n",
|
|
"307 Email 308 0 0 1 0 0 0 1 0 1 0 0 0 2 \n",
|
|
"167 Email 168 2 2 2 1 5 1 24 2 5 6 4 0 30 \n",
|
|
"320 Email 321 10 12 4 6 8 6 187 5 26 28 23 2 171 \n",
|
|
"61 Email 62 0 1 1 0 4 1 15 4 4 3 3 0 19 \n",
|
|
"26 Email 27 5 4 1 1 4 4 51 0 8 6 6 2 44 \n",
|
|
"73 Email 74 0 0 1 0 0 0 7 0 4 3 0 0 6 \n",
|
|
"\n",
|
|
" be that will \n",
|
|
"150 1 0 0 \n",
|
|
"380 0 0 0 \n",
|
|
"19 3 0 1 \n",
|
|
"300 0 0 0 \n",
|
|
"307 0 0 0 \n",
|
|
"167 2 0 0 \n",
|
|
"320 5 1 1 \n",
|
|
"61 2 0 0 \n",
|
|
"26 6 0 0 \n",
|
|
"73 0 0 0 \n"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"source": [
|
|
"# Select the rows where \"to\" occurs more often than \"the\"\r\n",
|
|
"print(email_df.query('the < to'))"
|
|
],
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"name": "stdout",
|
|
"text": [
|
|
" Email No. the to ect and for of a you in on is this i \\\n",
|
|
"1 Email 2 8 13 24 6 6 2 102 1 18 21 13 0 61 \n",
|
|
"3 Email 4 0 5 22 0 5 1 51 2 1 5 9 2 16 \n",
|
|
"5 Email 6 4 5 1 4 2 3 45 1 16 12 8 1 52 \n",
|
|
"7 Email 8 0 2 2 3 1 2 21 6 2 6 2 0 28 \n",
|
|
"13 Email 14 4 5 7 1 5 1 37 1 8 8 6 1 43 \n",
|
|
".. ... ... .. ... ... ... .. ... ... .. .. .. ... .. \n",
|
|
"390 Email 5157 4 13 1 0 3 1 48 2 8 26 9 1 45 \n",
|
|
"393 Email 5160 2 13 1 0 2 1 38 2 7 24 6 1 34 \n",
|
|
"396 Email 5163 2 3 1 2 1 2 32 0 7 3 2 0 26 \n",
|
|
"404 Email 5171 2 7 1 0 2 1 28 2 8 11 7 1 39 \n",
|
|
"405 Email 5172 22 24 5 1 6 5 148 8 23 13 5 4 99 \n",
|
|
"\n",
|
|
" be that will \n",
|
|
"1 4 2 0 \n",
|
|
"3 2 0 0 \n",
|
|
"5 2 0 0 \n",
|
|
"7 1 0 1 \n",
|
|
"13 1 0 1 \n",
|
|
".. .. ... ... \n",
|
|
"390 1 0 0 \n",
|
|
"393 1 0 0 \n",
|
|
"396 3 0 0 \n",
|
|
"404 1 0 0 \n",
|
|
"405 6 4 1 \n",
|
|
"\n",
|
|
"[169 rows x 17 columns]\n"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {}
|
|
}
|
|
],
|
|
"metadata": {
|
|
"orig_nbformat": 4,
|
|
"language_info": {
|
|
"name": "python",
|
|
"version": "3.9.7",
|
|
"mimetype": "text/x-python",
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"pygments_lexer": "ipython3",
|
|
"nbconvert_exporter": "python",
|
|
"file_extension": ".py"
|
|
},
|
|
"kernelspec": {
|
|
"name": "python3",
|
|
"display_name": "Python 3.9.7 64-bit ('venv': venv)"
|
|
},
|
|
"interpreter": {
|
|
"hash": "6b9b57232c4b57163d057191678da2030059e733b8becc68f245de5a75abe84e"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
} |