Data-Science-For-Beginners/translations/ja/4-Data-Science-Lifecycle/15-analyzing/notebook.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# データの分析\n",
    "[レッスン](README.md)で紹介されたPandas関数の例。\n"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "source": [
    "import glob\r\n",
    "from pathlib import Path\r\n",
    "\r\n",
    "import pandas as pd\r\n",
    "\r\n",
    "# Loading the dataset\r\n",
    "# NOTE: this translated notebook sits two directories deeper than the original lesson,\r\n",
    "# so the original relative path '../../data/emails.csv' points inside translations/<lang>/\r\n",
    "# rather than at the repository-level data/ folder (assumed location of emails.csv).\r\n",
    "# Search the working directory and each of its parents for data/emails.csv instead.\r\n",
    "dataset_relpath = Path('data') / 'emails.csv'\r\n",
    "search_roots = [Path.cwd(), *Path.cwd().parents]\r\n",
    "path = next(\r\n",
    "    (root / dataset_relpath for root in search_roots if (root / dataset_relpath).is_file()),\r\n",
    "    Path('../../data/emails.csv'),  # fall back to the original path so read_csv reports a clear error\r\n",
    ")\r\n",
    "email_df = pd.read_csv(path)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "source": [
    "# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column\r\n",
    "summary_stats = email_df.describe()\r\n",
    "print(summary_stats)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "              the          to         ect         and         for          of  \\\n",
      "count  406.000000  406.000000  406.000000  406.000000  406.000000  406.000000   \n",
      "mean     7.022167    6.519704    4.948276    3.059113    3.502463    2.662562   \n",
      "std     10.945522    9.801907    9.293820    6.267806    4.901372    5.443939   \n",
      "min      0.000000    0.000000    1.000000    0.000000    0.000000    0.000000   \n",
      "25%      1.000000    1.000000    1.000000    0.000000    1.000000    0.000000   \n",
      "50%      3.000000    3.000000    2.000000    1.000000    2.000000    1.000000   \n",
      "75%      9.000000    7.750000    4.000000    3.000000    4.750000    3.000000   \n",
      "max     99.000000   88.000000   79.000000   69.000000   39.000000   57.000000   \n",
      "\n",
      "                a         you          in          on          is        this  \\\n",
      "count  406.000000  406.000000  406.000000  406.000000  406.000000  406.000000   \n",
      "mean    57.017241    2.394089   10.817734   11.591133    5.901478    1.485222   \n",
      "std     78.868243    4.067015   19.050972   16.407175    8.793103    2.912473   \n",
      "min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   \n",
      "25%     15.000000    0.000000    1.250000    3.000000    1.000000    0.000000   \n",
      "50%     29.000000    1.000000    5.000000    6.000000    3.000000    0.000000   \n",
      "75%     61.000000    3.000000   12.000000   13.000000    7.000000    2.000000   \n",
      "max    843.000000   31.000000  223.000000  125.000000   61.000000   24.000000   \n",
      "\n",
      "                i          be        that        will  \n",
      "count  406.000000  406.000000  406.000000  406.000000  \n",
      "mean    47.155172    2.950739    1.034483    0.955665  \n",
      "std     71.043009    4.297865    1.904846    2.042271  \n",
      "min      0.000000    0.000000    0.000000    0.000000  \n",
      "25%     11.000000    1.000000    0.000000    0.000000  \n",
      "50%     24.000000    1.000000    0.000000    0.000000  \n",
      "75%     50.750000    3.000000    1.000000    1.000000  \n",
      "max    754.000000   40.000000   14.000000   24.000000  \n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "source": [
    "# Draw 10 randomly selected emails (rows) from the dataset\r\n",
    "random_emails = email_df.sample(n=10)\r\n",
    "print(random_emails)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "      Email No.  the  to  ect  and  for  of    a  you  in  on  is  this    i  \\\n",
      "150   Email 151    0   1    2    0    3   0   15    0   0   5   0     0    7   \n",
      "380  Email 5147    0   3    2    0    0   0    7    0   1   1   0     0    3   \n",
      "19     Email 20    3   4   11    0    4   2   32    1   1   3   9     5   25   \n",
      "300   Email 301    2   1    1    0    1   1   15    2   2   3   2     0    8   \n",
      "307   Email 308    0   0    1    0    0   0    1    0   1   0   0     0    2   \n",
      "167   Email 168    2   2    2    1    5   1   24    2   5   6   4     0   30   \n",
      "320   Email 321   10  12    4    6    8   6  187    5  26  28  23     2  171   \n",
      "61     Email 62    0   1    1    0    4   1   15    4   4   3   3     0   19   \n",
      "26     Email 27    5   4    1    1    4   4   51    0   8   6   6     2   44   \n",
      "73     Email 74    0   0    1    0    0   0    7    0   4   3   0     0    6   \n",
      "\n",
      "     be  that  will  \n",
      "150   1     0     0  \n",
      "380   0     0     0  \n",
      "19    3     0     1  \n",
      "300   0     0     0  \n",
      "307   0     0     0  \n",
      "167   2     0     0  \n",
      "320   5     1     1  \n",
      "61    2     0     0  \n",
      "26    6     0     0  \n",
      "73    0     0     0  \n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "source": [
    "# Keep only the emails in which \"to\" occurs more often than \"the\"\r\n",
    "more_to_than_the = email_df.query('the < to')\r\n",
    "print(more_to_than_the)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "      Email No.  the  to  ect  and  for  of    a  you  in  on  is  this   i  \\\n",
      "1       Email 2    8  13   24    6    6   2  102    1  18  21  13     0  61   \n",
      "3       Email 4    0   5   22    0    5   1   51    2   1   5   9     2  16   \n",
      "5       Email 6    4   5    1    4    2   3   45    1  16  12   8     1  52   \n",
      "7       Email 8    0   2    2    3    1   2   21    6   2   6   2     0  28   \n",
      "13     Email 14    4   5    7    1    5   1   37    1   8   8   6     1  43   \n",
      "..          ...  ...  ..  ...  ...  ...  ..  ...  ...  ..  ..  ..   ...  ..   \n",
      "390  Email 5157    4  13    1    0    3   1   48    2   8  26   9     1  45   \n",
      "393  Email 5160    2  13    1    0    2   1   38    2   7  24   6     1  34   \n",
      "396  Email 5163    2   3    1    2    1   2   32    0   7   3   2     0  26   \n",
      "404  Email 5171    2   7    1    0    2   1   28    2   8  11   7     1  39   \n",
      "405  Email 5172   22  24    5    1    6   5  148    8  23  13   5     4  99   \n",
      "\n",
      "     be  that  will  \n",
      "1     4     2     0  \n",
      "3     2     0     0  \n",
      "5     2     0     0  \n",
      "7     1     0     1  \n",
      "13    1     0     1  \n",
      "..   ..   ...   ...  \n",
      "390   1     0     0  \n",
      "393   1     0     0  \n",
      "396   3     0     0  \n",
      "404   1     0     0  \n",
      "405   6     4     1  \n",
      "\n",
      "[169 rows x 17 columns]\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n---\n\n**免責事項**:  \nこの文書は、AI翻訳サービス [Co-op Translator](https://github.com/Azure/co-op-translator) を使用して翻訳されています。正確性を追求しておりますが、自動翻訳には誤りや不正確な部分が含まれる可能性があることをご承知ください。元の言語で記載された文書が正式な情報源とみなされるべきです。重要な情報については、専門の人間による翻訳を推奨します。この翻訳の使用に起因する誤解や誤解釈について、当方は一切の責任を負いません。\n"
   ]
  }
 ],
 "metadata": {
  "orig_nbformat": 4,
  "language_info": {
   "name": "python",
   "version": "3.9.7",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.9.7 64-bit ('venv': venv)"
  },
  "interpreter": {
   "hash": "6b9b57232c4b57163d057191678da2030059e733b8becc68f245de5a75abe84e"
  },
  "coopTranslator": {
   "original_hash": "9d102c8c3cdbc8ea4e92fc32593462c6",
   "translation_date": "2025-09-01T22:20:37+00:00",
   "source_file": "4-Data-Science-Lifecycle/15-analyzing/notebook.ipynb",
   "language_code": "ja"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}