AiLearning-Theory-Applying/notebook_必备数学基础/假设检验章节/python卡方检验实例.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 白人和黑人在求职路上会有种族的歧视吗？"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy import stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>ad</th>\n",
       "      <th>education</th>\n",
       "      <th>ofjobs</th>\n",
       "      <th>yearsexp</th>\n",
       "      <th>honors</th>\n",
       "      <th>volunteer</th>\n",
       "      <th>military</th>\n",
       "      <th>empholes</th>\n",
       "      <th>occupspecific</th>\n",
       "      <th>...</th>\n",
       "      <th>compreq</th>\n",
       "      <th>orgreq</th>\n",
       "      <th>manuf</th>\n",
       "      <th>transcom</th>\n",
       "      <th>bankreal</th>\n",
       "      <th>trade</th>\n",
       "      <th>busservice</th>\n",
       "      <th>othservice</th>\n",
       "      <th>missind</th>\n",
       "      <th>ownership</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>b</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>b</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>316</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>b</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>19</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>b</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>313</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>b</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>22</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>313</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Nonprofit</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 65 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  id ad  education  ofjobs  yearsexp  honors  volunteer  military  empholes  \\\n",
       "0  b  1          4       2         6       0          0         0         1   \n",
       "1  b  1          3       3         6       0          1         1         0   \n",
       "2  b  1          4       1         6       0          0         0         0   \n",
       "3  b  1          3       4         6       0          1         0         1   \n",
       "4  b  1          3       3        22       0          0         0         0   \n",
       "\n",
       "   occupspecific  ...  compreq  orgreq  manuf  transcom  bankreal trade  \\\n",
       "0             17  ...      1.0     0.0    1.0       0.0       0.0   0.0   \n",
       "1            316  ...      1.0     0.0    1.0       0.0       0.0   0.0   \n",
       "2             19  ...      1.0     0.0    1.0       0.0       0.0   0.0   \n",
       "3            313  ...      1.0     0.0    1.0       0.0       0.0   0.0   \n",
       "4            313  ...      1.0     1.0    0.0       0.0       0.0   0.0   \n",
       "\n",
       "  busservice othservice  missind  ownership  \n",
       "0        0.0        0.0      0.0             \n",
       "1        0.0        0.0      0.0             \n",
       "2        0.0        0.0      0.0             \n",
       "3        0.0        0.0      0.0             \n",
       "4        0.0        1.0      0.0  Nonprofit  \n",
       "\n",
       "[5 rows x 65 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data  = pd.io.stata.read_stata('us_job_market_discrimination.dta')\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "blacks = data[data.race=='b']  # 黑人\n",
    "whites = data[data.race=='w']  # 白人"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    2435.000000\n",
       "mean        0.064476\n",
       "std         0.245649\n",
       "min         0.000000\n",
       "25%         0.000000\n",
       "50%         0.000000\n",
       "75%         0.000000\n",
       "max         1.000000\n",
       "Name: call, dtype: float64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "blacks.call.describe()  # call为1则是被录取，为0则是不录取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    2435.000000\n",
       "mean        0.096509\n",
       "std         0.295346\n",
       "min         0.000000\n",
       "25%         0.000000\n",
       "50%         0.000000\n",
       "75%         0.000000\n",
       "max         1.000000\n",
       "Name: call, dtype: float64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "whites.call.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "白人的录取均值更高，我们再查看其它的指标，确认是到底是不是因为肤色"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 卡方检验\n",
    "<ul>\n",
    "    <li>白人获得职位\n",
    "    <li>白人被拒绝\n",
    "    <li>黑人获得职位\n",
    "    <li>黑人被拒绝\n",
    "</ul>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 假设检验\n",
    "<ul>\n",
    "    <li>H0：种族对求职结果没有显著影响\n",
    "    <li>H1：种族对求职结果有显著影响"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "blacks_called = len(blacks[blacks['call'] == True])\n",
    "blacks_not_called = len(blacks[blacks['call'] == False])\n",
    "whites_called = len(whites[whites['call'] == True])\n",
    "whites_not_called = len(whites[whites['call'] == False])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "observed = pd.DataFrame({\n",
    "    'blacks':{'called':blacks_called, 'not_called':blacks_not_called},\n",
    "    'whites':{'called':whites_called, 'not_called':whites_not_called}\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>blacks</th>\n",
       "      <th>whites</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>called</th>\n",
       "      <td>157</td>\n",
       "      <td>235</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>not_called</th>\n",
       "      <td>2278</td>\n",
       "      <td>2200</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            blacks  whites\n",
       "called         157     235\n",
       "not_called    2278    2200"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "observed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "392\n",
      "4478\n"
     ]
    }
   ],
   "source": [
    "num_called_back = blacks_called +whites_called\n",
    "num_not_called = blacks_not_called + whites_not_called\n",
    "\n",
    "print(num_called_back)\n",
    "print(num_not_called)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 得到期望值\n",
    "rate_of_callbacks = num_called_back / (num_not_called+num_called_back)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.08049281314168377"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rate_of_callbacks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "391.99999999999994\n",
      "4478.0\n"
     ]
    }
   ],
   "source": [
    "expected_called = len(data) * rate_of_callbacks\n",
    "expected_not_called = len(data) * (1-rate_of_callbacks)\n",
    "\n",
    "print(expected_called)  # 获得的期望人数\n",
    "print(expected_not_called)  # 没获得的期望人数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Power_divergenceResult(statistic=16.87905041427022, pvalue=0.0007483959441097264)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import scipy.stats as stats\n",
    "observed_frequencies = [blacks_not_called, whites_not_called,\n",
    "                        whites_called,blacks_called]  # 观测值\n",
    "expected_frequencies = [expected_not_called/2,expected_not_called/2,\n",
    "                        expected_called/2,expected_called/2]  # 期望值\n",
    "\n",
    "stats.chisquare(f_obs = observed_frequencies,\n",
    "               f_exp = expected_frequencies)  #算出卡方检验值"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "p值非常非常小, 小于0.05，我们认为在这份数据中，种族歧视是存在的"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}