{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# 데이터 분석\n",
    "[수업](README.md)에서 언급된 Pandas 함수의 예제.\n"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "source": [
    "import pandas as pd\r\n",
    "import glob\r\n",
    "\r\n",
    "#Loading the dataset\r\n",
    "path = '../../data/emails.csv'\r\n",
    "email_df = pd.read_csv(path)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "source": [
    "# Using Describe on the email dataset\r\n",
    "print(email_df.describe())"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "              the          to         ect         and         for          of  \\\n",
      "count  406.000000  406.000000  406.000000  406.000000  406.000000  406.000000   \n",
      "mean     7.022167    6.519704    4.948276    3.059113    3.502463    2.662562   \n",
      "std     10.945522    9.801907    9.293820    6.267806    4.901372    5.443939   \n",
      "min      0.000000    0.000000    1.000000    0.000000    0.000000    0.000000   \n",
      "25%      1.000000    1.000000    1.000000    0.000000    1.000000    0.000000   \n",
      "50%      3.000000    3.000000    2.000000    1.000000    2.000000    1.000000   \n",
      "75%      9.000000    7.750000    4.000000    3.000000    4.750000    3.000000   \n",
      "max     99.000000   88.000000   79.000000   69.000000   39.000000   57.000000   \n",
      "\n",
      "                a         you          in          on          is        this  \\\n",
      "count  406.000000  406.000000  406.000000  406.000000  406.000000  406.000000   \n",
      "mean    57.017241    2.394089   10.817734   11.591133    5.901478    1.485222   \n",
      "std     78.868243    4.067015   19.050972   16.407175    8.793103    2.912473   \n",
      "min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   \n",
      "25%     15.000000    0.000000    1.250000    3.000000    1.000000    0.000000   \n",
      "50%     29.000000    1.000000    5.000000    6.000000    3.000000    0.000000   \n",
      "75%     61.000000    3.000000   12.000000   13.000000    7.000000    2.000000   \n",
      "max    843.000000   31.000000  223.000000  125.000000   61.000000   24.000000   \n",
      "\n",
      "                i          be        that        will  \n",
      "count  406.000000  406.000000  406.000000  406.000000  \n",
      "mean    47.155172    2.950739    1.034483    0.955665  \n",
      "std     71.043009    4.297865    1.904846    2.042271  \n",
      "min      0.000000    0.000000    0.000000    0.000000  \n",
      "25%     11.000000    1.000000    0.000000    0.000000  \n",
      "50%     24.000000    1.000000    0.000000    0.000000  \n",
      "75%     50.750000    3.000000    1.000000    1.000000  \n",
      "max    754.000000   40.000000   14.000000   24.000000  \n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "source": [
    "# Sampling 10 emails\r\n",
    "print(email_df.sample(10))"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "      Email No.  the  to  ect  and  for  of    a  you  in  on  is  this    i  \\\n",
      "150   Email 151    0   1    2    0    3   0   15    0   0   5   0     0    7   \n",
      "380  Email 5147    0   3    2    0    0   0    7    0   1   1   0     0    3   \n",
      "19     Email 20    3   4   11    0    4   2   32    1   1   3   9     5   25   \n",
      "300   Email 301    2   1    1    0    1   1   15    2   2   3   2     0    8   \n",
      "307   Email 308    0   0    1    0    0   0    1    0   1   0   0     0    2   \n",
      "167   Email 168    2   2    2    1    5   1   24    2   5   6   4     0   30   \n",
      "320   Email 321   10  12    4    6    8   6  187    5  26  28  23     2  171   \n",
      "61     Email 62    0   1    1    0    4   1   15    4   4   3   3     0   19   \n",
      "26     Email 27    5   4    1    1    4   4   51    0   8   6   6     2   44   \n",
      "73     Email 74    0   0    1    0    0   0    7    0   4   3   0     0    6   \n",
      "\n",
      "     be  that  will  \n",
      "150   1     0     0  \n",
      "380   0     0     0  \n",
      "19    3     0     1  \n",
      "300   0     0     0  \n",
      "307   0     0     0  \n",
      "167   2     0     0  \n",
      "320   5     1     1  \n",
      "61    2     0     0  \n",
      "26    6     0     0  \n",
      "73    0     0     0  \n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "source": [
    "# Returns rows where there are more occurrences of \"to\" than \"the\"\r\n",
    "print(email_df.query('the < to'))"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "      Email No.  the  to  ect  and  for  of    a  you  in  on  is  this   i  \\\n",
      "1       Email 2    8  13   24    6    6   2  102    1  18  21  13     0  61   \n",
      "3       Email 4    0   5   22    0    5   1   51    2   1   5   9     2  16   \n",
      "5       Email 6    4   5    1    4    2   3   45    1  16  12   8     1  52   \n",
      "7       Email 8    0   2    2    3    1   2   21    6   2   6   2     0  28   \n",
      "13     Email 14    4   5    7    1    5   1   37    1   8   8   6     1  43   \n",
      "..          ...  ...  ..  ...  ...  ...  ..  ...  ...  ..  ..  ..   ...  ..   \n",
      "390  Email 5157    4  13    1    0    3   1   48    2   8  26   9     1  45   \n",
      "393  Email 5160    2  13    1    0    2   1   38    2   7  24   6     1  34   \n",
      "396  Email 5163    2   3    1    2    1   2   32    0   7   3   2     0  26   \n",
      "404  Email 5171    2   7    1    0    2   1   28    2   8  11   7     1  39   \n",
      "405  Email 5172   22  24    5    1    6   5  148    8  23  13   5     4  99   \n",
      "\n",
      "     be  that  will  \n",
      "1     4     2     0  \n",
      "3     2     0     0  \n",
      "5     2     0     0  \n",
      "7     1     0     1  \n",
      "13    1     0     1  \n",
      "..   ..   ...   ...  \n",
      "390   1     0     0  \n",
      "393   1     0     0  \n",
      "396   3     0     0  \n",
      "404   1     0     0  \n",
      "405   6     4     1  \n",
      "\n",
      "[169 rows x 17 columns]\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n---\n\n**면책 조항**:  \n이 문서는 AI 번역 서비스 [Co-op Translator](https://github.com/Azure/co-op-translator)를 사용하여 번역되었습니다. 정확성을 위해 최선을 다하고 있지만, 자동 번역에는 오류나 부정확성이 포함될 수 있습니다. 원본 문서의 원어 버전을 권위 있는 출처로 간주해야 합니다. 중요한 정보의 경우, 전문적인 인간 번역을 권장합니다. 이 번역 사용으로 인해 발생하는 오해나 잘못된 해석에 대해 책임을 지지 않습니다.\n"
   ]
  }
 ],
 "metadata": {
  "orig_nbformat": 4,
  "language_info": {
   "name": "python",
   "version": "3.9.7",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.9.7 64-bit ('venv': venv)"
  },
  "interpreter": {
   "hash": "6b9b57232c4b57163d057191678da2030059e733b8becc68f245de5a75abe84e"
  },
  "coopTranslator": {
   "original_hash": "9d102c8c3cdbc8ea4e92fc32593462c6",
   "translation_date": "2025-09-01T22:20:40+00:00",
   "source_file": "4-Data-Science-Lifecycle/15-analyzing/notebook.ipynb",
   "language_code": "ko"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}