### 白人和黑人在求职路上会有种族的歧视吗？

In [2]:
import pandas as pd
import numpy as np
from scipy import stats

In [3]:
data  = pd.io.stata.read_stata('us_job_market_discrimination.dta')
data.head()

Unnamed: 0,id,ad,education,ofjobs,yearsexp,honors,volunteer,military,empholes,occupspecific,...,compreq,orgreq,manuf,transcom,bankreal,trade,busservice,othservice,missind,ownership
0,b,1,4,2,6,0,0,0,1,17,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,b,1,3,3,6,0,1,1,0,316,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,b,1,4,1,6,0,0,0,0,19,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,b,1,3,4,6,0,1,0,1,313,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,b,1,3,3,22,0,0,0,0,313,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Nonprofit


In [4]:
blacks = data[data.race=='b']  # 黑人
whites = data[data.race=='w']  # 白人

In [5]:
blacks.call.describe()  # call为1则是被录取，为0则是不录取

count    2435.000000
mean        0.064476
std         0.245649
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: call, dtype: float64

In [7]:
whites.call.describe()

count    2435.000000
mean        0.096509
std         0.295346
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: call, dtype: float64

白人的录取均值更高，我们再查看其它的指标，确认是到底是不是因为肤色

## 卡方检验
<ul>
    <li>白人获得职位
    <li>白人被拒绝
    <li>黑人获得职位
    <li>黑人被拒绝
</ul>

## 假设检验
<ul>
    <li>H0：种族对求职结果没有显著影响
    <li>H1：种族对求职结果有显著影响

In [8]:
blacks_called = len(blacks[blacks['call'] == True])
blacks_not_called = len(blacks[blacks['call'] == False])
whites_called = len(whites[whites['call'] == True])
whites_not_called = len(whites[whites['call'] == False])

In [14]:
observed = pd.DataFrame({
    'blacks':{'called':blacks_called, 'not_called':blacks_not_called},
    'whites':{'called':whites_called, 'not_called':whites_not_called}
})

In [15]:
observed

Unnamed: 0,blacks,whites
called,157,235
not_called,2278,2200


In [16]:
num_called_back = blacks_called +whites_called
num_not_called = blacks_not_called + whites_not_called

print(num_called_back)
print(num_not_called)

392
4478


In [22]:
# 得到期望值
rate_of_callbacks = num_called_back / (num_not_called+num_called_back)

In [23]:
rate_of_callbacks

0.08049281314168377

In [24]:
expected_called = len(data) * rate_of_callbacks
expected_not_called = len(data) * (1-rate_of_callbacks)

print(expected_called)  # 获得的期望人数
print(expected_not_called)  # 没获得的期望人数

391.99999999999994
4478.0


In [25]:
import scipy.stats as stats
observed_frequencies = [blacks_not_called, whites_not_called,
                        whites_called,blacks_called]  # 观测值
expected_frequencies = [expected_not_called/2,expected_not_called/2,
                        expected_called/2,expected_called/2]  # 期望值

stats.chisquare(f_obs = observed_frequencies,
               f_exp = expected_frequencies)  #算出卡方检验值

Power_divergenceResult(statistic=16.87905041427022, pvalue=0.0007483959441097264)

p值非常非常小, 小于0.05，我们认为在这份数据中，种族歧视是存在的