diff --git a/notebook_必备数学基础/假设检验章节/.ipynb_checkpoints/python卡方检验实例-checkpoint.ipynb b/notebook_必备数学基础/假设检验章节/.ipynb_checkpoints/python卡方检验实例-checkpoint.ipynb
new file mode 100644
index 0000000..2fd6442
--- /dev/null
+++ b/notebook_必备数学基础/假设检验章节/.ipynb_checkpoints/python卡方检验实例-checkpoint.ipynb
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebook_必备数学基础/假设检验章节/python卡方检验实例.ipynb b/notebook_必备数学基础/假设检验章节/python卡方检验实例.ipynb
new file mode 100644
index 0000000..5f9b7c1
--- /dev/null
+++ b/notebook_必备数学基础/假设检验章节/python卡方检验实例.ipynb
@@ -0,0 +1,548 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 白人和黑人在求职路上会有种族的歧视吗?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from scipy import stats"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " ad | \n",
+ " education | \n",
+ " ofjobs | \n",
+ " yearsexp | \n",
+ " honors | \n",
+ " volunteer | \n",
+ " military | \n",
+ " empholes | \n",
+ " occupspecific | \n",
+ " ... | \n",
+ " compreq | \n",
+ " orgreq | \n",
+ " manuf | \n",
+ " transcom | \n",
+ " bankreal | \n",
+ " trade | \n",
+ " busservice | \n",
+ " othservice | \n",
+ " missind | \n",
+ " ownership | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " b | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 2 | \n",
+ " 6 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 17 | \n",
+ " ... | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " b | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 6 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 316 | \n",
+ " ... | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " b | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 6 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 19 | \n",
+ " ... | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " b | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 6 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 313 | \n",
+ " ... | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " b | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 22 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 313 | \n",
+ " ... | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " Nonprofit | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 65 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id ad education ofjobs yearsexp honors volunteer military empholes \\\n",
+ "0 b 1 4 2 6 0 0 0 1 \n",
+ "1 b 1 3 3 6 0 1 1 0 \n",
+ "2 b 1 4 1 6 0 0 0 0 \n",
+ "3 b 1 3 4 6 0 1 0 1 \n",
+ "4 b 1 3 3 22 0 0 0 0 \n",
+ "\n",
+ " occupspecific ... compreq orgreq manuf transcom bankreal trade \\\n",
+ "0 17 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n",
+ "1 316 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n",
+ "2 19 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n",
+ "3 313 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n",
+ "4 313 ... 1.0 1.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " busservice othservice missind ownership \n",
+ "0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 \n",
+ "4 0.0 1.0 0.0 Nonprofit \n",
+ "\n",
+ "[5 rows x 65 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Read the Stata file (expected next to this notebook) via pandas' top-level reader\n",
+ "# and preview the first rows.\n",
+ "data = pd.read_stata('us_job_market_discrimination.dta')\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split the applicants into two groups using the race column.\n",
+ "blacks = data.loc[data['race'].eq('b')]  # rows labelled 'b' (Black applicants)\n",
+ "whites = data.loc[data['race'].eq('w')]  # rows labelled 'w' (white applicants)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "count 2435.000000\n",
+ "mean 0.064476\n",
+ "std 0.245649\n",
+ "min 0.000000\n",
+ "25% 0.000000\n",
+ "50% 0.000000\n",
+ "75% 0.000000\n",
+ "max 1.000000\n",
+ "Name: call, dtype: float64"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Summary statistics of the call indicator for Black applicants\n",
+ "# (call == 1: received a callback, call == 0: did not).\n",
+ "blacks['call'].describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "count 2435.000000\n",
+ "mean 0.096509\n",
+ "std 0.295346\n",
+ "min 0.000000\n",
+ "25% 0.000000\n",
+ "50% 0.000000\n",
+ "75% 0.000000\n",
+ "max 1.000000\n",
+ "Name: call, dtype: float64"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Summary statistics of the call indicator for white applicants.\n",
+ "whites['call'].describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "白人的录取均值更高,我们再查看其它的指标,确认到底是不是因为肤色"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 卡方检验\n",
+ "\n",
+ " - 白人获得职位\n",
+ " - 白人被拒绝\n",
+ " - 黑人获得职位\n",
+ " - 黑人被拒绝"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 假设检验\n",
+ "\n",
+ " - H0:种族对求职结果没有显著影响\n",
+ " - H1:种族对求职结果有显著影响"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Count callbacks (call == 1) and non-callbacks (call == 0) within each group.\n",
+ "# The boolean-mask sums are cast to plain Python ints.\n",
+ "blacks_called = int(blacks['call'].eq(1).sum())\n",
+ "blacks_not_called = int(blacks['call'].eq(0).sum())\n",
+ "whites_called = int(whites['call'].eq(1).sum())\n",
+ "whites_not_called = int(whites['call'].eq(0).sum())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 2x2 table of observed counts: rows are outcomes, columns are groups.\n",
+ "observed = pd.DataFrame(\n",
+ "    {\n",
+ "        'blacks': [blacks_called, blacks_not_called],\n",
+ "        'whites': [whites_called, whites_not_called],\n",
+ "    },\n",
+ "    index=['called', 'not_called'],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " blacks | \n",
+ " whites | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " called | \n",
+ " 157 | \n",
+ " 235 | \n",
+ "
\n",
+ " \n",
+ " not_called | \n",
+ " 2278 | \n",
+ " 2200 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " blacks whites\n",
+ "called 157 235\n",
+ "not_called 2278 2200"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "observed"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "392\n",
+ "4478\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Pool both groups to get the overall callback / no-callback totals.\n",
+ "num_called_back = sum([blacks_called, whites_called])\n",
+ "num_not_called = sum([blacks_not_called, whites_not_called])\n",
+ "\n",
+ "# One total per line: callbacks first, then non-callbacks.\n",
+ "print(num_called_back, num_not_called, sep='\\n')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pooled callback rate over both groups.\n",
+ "total_applications = num_called_back + num_not_called\n",
+ "rate_of_callbacks = num_called_back / total_applications"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.08049281314168377"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rate_of_callbacks"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "391.99999999999994\n",
+ "4478.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Expected counts when the pooled callback rate is applied to every row of data.\n",
+ "n_applicants = len(data)\n",
+ "expected_called = n_applicants * rate_of_callbacks  # expected number of callbacks\n",
+ "expected_not_called = n_applicants * (1 - rate_of_callbacks)  # expected number without a callback\n",
+ "\n",
+ "for expected_count in (expected_called, expected_not_called):\n",
+ "    print(expected_count)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Power_divergenceResult(statistic=16.87905041427022, pvalue=0.0007483959441097264)"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Pearson chi-square test over the four cells of the 2x2 table (race x outcome).\n",
+ "# Observed and expected counts are listed in the same cell order.\n",
+ "observed_frequencies = [blacks_not_called, whites_not_called,\n",
+ "                        whites_called, blacks_called]  # observed counts\n",
+ "# Under H0 each outcome is split evenly between the two groups; the even split\n",
+ "# is only valid because both groups contain the same number of applicants.\n",
+ "expected_frequencies = [expected_not_called / 2, expected_not_called / 2,\n",
+ "                        expected_called / 2, expected_called / 2]  # expected counts\n",
+ "\n",
+ "# chisquare() uses k - 1 - ddof degrees of freedom, i.e. 3 by default for four\n",
+ "# cells, but a 2x2 contingency table has (2 - 1) * (2 - 1) = 1. ddof=2 corrects\n",
+ "# the p-value; the test statistic itself is unaffected.\n",
+ "stats.chisquare(f_obs=observed_frequencies,\n",
+ "                f_exp=expected_frequencies,\n",
+ "                ddof=2)  # chi-square statistic and p-value"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "p值非常非常小,远小于0.05,因此拒绝H0:我们认为在这份数据中,种族对求职结果有显著影响,种族歧视是存在的"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebook_必备数学基础/假设检验章节/us_job_market_discrimination.dta b/notebook_必备数学基础/假设检验章节/us_job_market_discrimination.dta
new file mode 100644
index 0000000..08c8bc0
Binary files /dev/null and b/notebook_必备数学基础/假设检验章节/us_job_market_discrimination.dta differ