Add python卡方检验实例

pull/2/head
benjas 5 years ago
parent b01af8a92f
commit 4dc4e8627b

@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}

@ -0,0 +1,548 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 白人和黑人在求职过程中会受到种族歧视吗?"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from scipy import stats"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>ad</th>\n",
" <th>education</th>\n",
" <th>ofjobs</th>\n",
" <th>yearsexp</th>\n",
" <th>honors</th>\n",
" <th>volunteer</th>\n",
" <th>military</th>\n",
" <th>empholes</th>\n",
" <th>occupspecific</th>\n",
" <th>...</th>\n",
" <th>compreq</th>\n",
" <th>orgreq</th>\n",
" <th>manuf</th>\n",
" <th>transcom</th>\n",
" <th>bankreal</th>\n",
" <th>trade</th>\n",
" <th>busservice</th>\n",
" <th>othservice</th>\n",
" <th>missind</th>\n",
" <th>ownership</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>b</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>b</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>316</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>b</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>19</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>b</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>313</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>b</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>22</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>313</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>Nonprofit</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 65 columns</p>\n",
"</div>"
],
"text/plain": [
" id ad education ofjobs yearsexp honors volunteer military empholes \\\n",
"0 b 1 4 2 6 0 0 0 1 \n",
"1 b 1 3 3 6 0 1 1 0 \n",
"2 b 1 4 1 6 0 0 0 0 \n",
"3 b 1 3 4 6 0 1 0 1 \n",
"4 b 1 3 3 22 0 0 0 0 \n",
"\n",
" occupspecific ... compreq orgreq manuf transcom bankreal trade \\\n",
"0 17 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n",
"1 316 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n",
"2 19 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n",
"3 313 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n",
"4 313 ... 1.0 1.0 0.0 0.0 0.0 0.0 \n",
"\n",
" busservice othservice missind ownership \n",
"0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 \n",
"4 0.0 1.0 0.0 Nonprofit \n",
"\n",
"[5 rows x 65 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the Stata file through pandas' documented top-level reader.\n",
"data = pd.read_stata('us_job_market_discrimination.dta')\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Partition the applicants on the race column.\n",
"blacks = data.loc[data['race'] == 'b']  # Black applicants\n",
"whites = data.loc[data['race'] == 'w']  # white applicants"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 2435.000000\n",
"mean 0.064476\n",
"std 0.245649\n",
"min 0.000000\n",
"25% 0.000000\n",
"50% 0.000000\n",
"75% 0.000000\n",
"max 1.000000\n",
"Name: call, dtype: float64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# call is 1 when the applicant was accepted and 0 when not (author's note).\n",
"blacks['call'].describe()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 2435.000000\n",
"mean 0.096509\n",
"std 0.295346\n",
"min 0.000000\n",
"25% 0.000000\n",
"50% 0.000000\n",
"75% 0.000000\n",
"max 1.000000\n",
"Name: call, dtype: float64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Same summary of the call column, this time for the white applicants.\n",
"whites['call'].describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"白人的录取均值更高,我们再查看其它的指标,确认这到底是不是因为肤色。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 卡方检验\n",
"<ul>\n",
" <li>白人获得职位\n",
" <li>白人被拒绝\n",
" <li>黑人获得职位\n",
" <li>黑人被拒绝\n",
"</ul>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 假设检验\n",
"<ul>\n",
" <li>H0:种族对求职结果没有显著影响</li>\n",
" <li>H1:种族对求职结果有显著影响</li>\n",
"</ul>"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Count each outcome per group by summing a boolean mask; int() keeps the\n",
"# results as plain Python integers.\n",
"blacks_called = int((blacks['call'] == True).sum())\n",
"blacks_not_called = int((blacks['call'] == False).sum())\n",
"whites_called = int((whites['call'] == True).sum())\n",
"whites_not_called = int((whites['call'] == False).sum())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# 2x2 table of observed counts: one row per outcome, one column per group.\n",
"observed = pd.DataFrame(\n",
"    [[blacks_called, whites_called],\n",
"     [blacks_not_called, whites_not_called]],\n",
"    index=['called', 'not_called'],\n",
"    columns=['blacks', 'whites'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>blacks</th>\n",
" <th>whites</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>called</th>\n",
" <td>157</td>\n",
" <td>235</td>\n",
" </tr>\n",
" <tr>\n",
" <th>not_called</th>\n",
" <td>2278</td>\n",
" <td>2200</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" blacks whites\n",
"called 157 235\n",
"not_called 2278 2200"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"observed"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"392\n",
"4478\n"
]
}
],
"source": [
"# Pool both groups to get the overall totals for each outcome.\n",
"num_called_back = whites_called + blacks_called\n",
"num_not_called = whites_not_called + blacks_not_called\n",
"\n",
"print(num_called_back, num_not_called, sep='\\n')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Pooled rate across both groups; the expected counts are derived from it.\n",
"total_applicants = num_not_called + num_called_back\n",
"rate_of_callbacks = num_called_back / total_applicants"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.08049281314168377"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rate_of_callbacks"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"391.99999999999994\n",
"4478.0\n"
]
}
],
"source": [
"# Expected totals when the pooled rate is applied to every row of data.\n",
"n_rows = len(data)\n",
"expected_called = n_rows * rate_of_callbacks\n",
"expected_not_called = n_rows * (1 - rate_of_callbacks)\n",
"\n",
"# First line: expected number called; second line: expected number not called.\n",
"print(expected_called, expected_not_called, sep='\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Chi-square test of independence on the 2x2 table (group x outcome).\n",
"# A 2x2 table has (2-1)*(2-1) = 1 degree of freedom. Passing the four\n",
"# flattened cells to stats.chisquare with its default ddof=0 evaluates the\n",
"# statistic against 4-1 = 3 degrees of freedom and overstates the p-value.\n",
"# `stats` comes from the import cell at the top of the notebook.\n",
"observed_frequencies = [[blacks_called, whites_called],\n",
"                        [blacks_not_called, whites_not_called]]  # observed counts\n",
"\n",
"# chi2_contingency derives the expected counts from the row/column totals,\n",
"# so it stays correct even if the two groups differ in size.\n",
"# correction=False turns off Yates' continuity correction to keep the plain\n",
"# Pearson statistic.\n",
"chi2_stat, p_value, dof, expected_frequencies = stats.chi2_contingency(\n",
"    observed_frequencies, correction=False)\n",
"\n",
"chi2_stat, p_value, dof  # chi-square statistic, p-value, degrees of freedom"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"p值远小于0.05,因此拒绝H0:我们认为在这份数据中,种族对求职结果有显著影响,即种族歧视是存在的。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading…
Cancel
Save