diff --git a/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/.ipynb_checkpoints/逻辑回归-信用卡欺诈检测-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/.ipynb_checkpoints/逻辑回归-信用卡欺诈检测-checkpoint.ipynb
index b2653f6..ba5ed51 100644
--- a/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/.ipynb_checkpoints/逻辑回归-信用卡欺诈检测-checkpoint.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/.ipynb_checkpoints/逻辑回归-信用卡欺诈检测-checkpoint.ipynb
@@ -47,30 +47,22 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 3,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "D:\\Anaconda3\\lib\\importlib\\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n",
- " return f(*args, **kwds)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# 导入工具包\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
-    "%matplotlib inline # embed plots directly in this notebook"
+    "# Embed plots directly in this notebook\n",
+ "%matplotlib inline"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -268,7 +260,7 @@
"[5 rows x 31 columns]"
]
},
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -289,7 +281,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -341,6 +333,332 @@
"* 第二种方式则会减少真实数据,使得模型可学的数据变少,能力也会减弱。"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "### Data standardization\n",
+    "\n",
+    "The Amount column is still on its original scale, which is much larger than the other columns. Left as is, this biases the model into treating Amount as far more important than it really is (see the earlier chapter on regression analysis). We therefore standardize it: values that are large within the column remain relatively large and small values remain relatively small, so it can be understood as a rescaling.\n",
+    "\n",
+    "For logistic regression, all training features should be standardized."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " V1 V2 V3 V4 V5 V6 V7 \\\n",
+ "0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n",
+ "1 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n",
+ "2 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n",
+ "3 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n",
+ "4 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n",
+ "\n",
+ " V8 V9 V10 ... V21 V22 V23 V24 \\\n",
+ "0 0.098698 0.363787 0.090794 ... -0.018307 0.277838 -0.110474 0.066928 \n",
+ "1 0.085102 -0.255425 -0.166974 ... -0.225775 -0.638672 0.101288 -0.339846 \n",
+ "2 0.247676 -1.514654 0.207643 ... 0.247998 0.771679 0.909412 -0.689281 \n",
+ "3 0.377436 -1.387024 -0.054952 ... -0.108300 0.005274 -0.190321 -1.175575 \n",
+ "4 -0.270533 0.817739 0.753074 ... -0.009431 0.798278 -0.137458 0.141267 \n",
+ "\n",
+ " V25 V26 V27 V28 Class normAmount \n",
+ "0 0.128539 -0.189115 0.133558 -0.021053 0 0.244964 \n",
+ "1 0.167170 0.125895 -0.008983 0.014724 0 -0.342475 \n",
+ "2 -0.327642 -0.139097 -0.055353 -0.059752 0 1.160686 \n",
+ "3 0.647376 -0.221929 0.062723 0.061458 0 0.140534 \n",
+ "4 -0.206010 0.502292 0.219422 0.215153 0 -0.073403 \n",
+ "\n",
+ "[5 rows x 30 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.preprocessing import StandardScaler\n",
+ "\n",
+    "# X = (x - μ) / σ, so the transformed column has mean 0 and variance 1\n",
+    "# fit_transform estimates μ and σ from data['Amount'] and applies the transform to that same column\n",
+    "data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1,1))\n",
+    "data = data.drop(['Time', 'Amount'], axis=1) # Time is not needed here, so drop it along with Amount\n",
+ "data.head()"
+ ]
+ },
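+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal sanity-check sketch (not run here): the manual X = (x - μ) / σ formula described above should give the same result as `StandardScaler`. The toy `amounts` values are made up for illustration and the check does not touch `data`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: verify that manual standardization (x - mu) / sigma matches\n",
+    "# sklearn's StandardScaler on a few toy amount-like values.\n",
+    "import numpy as np\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "\n",
+    "amounts = np.array([[149.62], [2.69], [378.66], [123.50], [69.99]])  # toy values\n",
+    "\n",
+    "manual = (amounts - amounts.mean()) / amounts.std()  # population std, same as StandardScaler\n",
+    "scaled = StandardScaler().fit_transform(amounts)\n",
+    "\n",
+    "print(np.allclose(manual, scaled))  # expected: True\n",
+    "print(scaled.mean(), scaled.std())  # expected: ~0 and ~1"
+   ]
+  },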
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "### Undersampling approach"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+      "Proportion of normal samples: 0.5\n",
+      "Proportion of fraud samples: 0.5\n",
+      "Total sample count after undersampling: 984\n"
+ ]
+ }
+ ],
+ "source": [
+    "X = data.loc[:,data.columns != \"Class\"] # features\n",
+    "y = data.loc[:,data.columns == \"Class\"] # labels\n",
+ " \n",
+    "# Count the positive (fraud) samples and get their indices\n",
+ "number_records_fraud=len(data[data.Class==1])\n",
+ "fraud_indices=np.array(data[data.Class==1].index)\n",
+ "\n",
+    "# Get the indices of the negative (normal) samples\n",
+ "normal_indices=data[data.Class==0].index\n",
+ " \n",
+    "# Randomly sample the same number of normal samples and take their indices\n",
+ "random_normal_indices=np.random.choice(normal_indices, number_records_fraud,replace=False) \n",
+ "random_normal_indices=np.array(random_normal_indices)\n",
+ "\n",
+    "# Combine the fraud and sampled normal indices\n",
+ "under_sample_indices=np.concatenate([fraud_indices,random_normal_indices])\n",
+ "\n",
+    "# Select all undersampled records by these indices\n",
+ "under_sample_data=data.iloc[under_sample_indices,:]\n",
+ "\n",
+ "X_under_sample=under_sample_data.iloc[:,under_sample_data.columns != \"Class\"]\n",
+ "Y_under_sample=under_sample_data.iloc[:,under_sample_data.columns == \"Class\"]\n",
+ "\n",
+    "print(\"Proportion of normal samples:\", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))\n",
+    "print(\"Proportion of fraud samples:\", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))\n",
+    "print(\"Total sample count after undersampling:\", len(under_sample_data))"
+ ]
+ },
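+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The cell above draws the normal samples with `np.random.choice` and no fixed seed, so the undersampled set changes on every run. Below is a minimal, equivalent sketch using `DataFrame.sample` with a fixed `random_state`; the seed value 42 and the variable names `fraud`, `normal_sampled` and `under_sample_v2` are illustrative only."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: reproducible undersampling with pandas, assuming `data` and `pd`\n",
+    "# from the cells above (fraud is marked by Class == 1).\n",
+    "fraud = data[data.Class == 1]\n",
+    "normal_sampled = data[data.Class == 0].sample(n=len(fraud), random_state=42)\n",
+    "\n",
+    "under_sample_v2 = pd.concat([fraud, normal_sampled])\n",
+    "print(under_sample_v2.Class.value_counts())  # expected: equal counts for both classes"
+   ]
+  },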
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "### Cross-validation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+      "Original training set size: 199364\n",
+      "Original test set size: 85443\n",
+      "Total original samples: 284807\n",
+      "\n",
+      "\n",
+      "Undersampled training set size: 688\n",
+      "Undersampled test set size: 296\n",
+      "Total undersampled samples: 984\n"
+ ]
+ }
+ ],
+ "source": [
+    "from sklearn.model_selection import train_test_split # for splitting the dataset\n",
+ "\n",
+    "# Split the data into 70% training and 30% test\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)\n",
+ "\n",
+    "print(\"Original training set size:\", len(X_train))\n",
+    "print(\"Original test set size:\", len(X_test))\n",
+    "print(\"Total original samples:\", len(X_train)+len(X_test))\n",
+ "\n",
+    "# Split the undersampled dataset in the same way\n",
+ "X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_under_sample\n",
+ " ,Y_under_sample\n",
+ " ,test_size = 0.3\n",
+ " ,random_state = 0)\n",
+ " \n",
+ "print(\"\\n\")\n",
+    "print(\"Undersampled training set size:\", len(X_train_undersample))\n",
+    "print(\"Undersampled test set size:\", len(X_test_undersample))\n",
+    "print(\"Total undersampled samples:\", len(X_train_undersample)+len(X_test_undersample))"
+ ]
+ },
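+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The cell above only performs the train/test split. A minimal sketch of the cross-validation itself is shown below, using `cross_val_score` with a plain `LogisticRegression` on the undersampled training data; the regularization strength `C=0.01`, the `liblinear` solver and the recall scoring are placeholder choices for illustration, not tuned values."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: 5-fold cross-validation of a plain logistic regression on the\n",
+    "# undersampled training data. Recall is used because fraud detection cares\n",
+    "# most about catching the positive (fraud) class.\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.model_selection import cross_val_score\n",
+    "\n",
+    "lr = LogisticRegression(C=0.01, penalty='l2', solver='liblinear')\n",
+    "scores = cross_val_score(lr,\n",
+    "                         X_train_undersample,\n",
+    "                         y_train_undersample.values.ravel(),\n",
+    "                         cv=5,\n",
+    "                         scoring='recall')\n",
+    "print('recall per fold:', scores)\n",
+    "print('mean recall:', scores.mean())"
+   ]
+  },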
{
"cell_type": "code",
"execution_count": null,
diff --git a/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/逻辑回归-信用卡欺诈检测.ipynb b/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/逻辑回归-信用卡欺诈检测.ipynb
index 76ecfb3..ba5ed51 100644
--- a/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/逻辑回归-信用卡欺诈检测.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/逻辑回归-信用卡欺诈检测.ipynb
@@ -610,6 +610,55 @@
"print(\"下采样策略总体样本量:\", len(under_sample_data))"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "### Cross-validation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+      "Original training set size: 199364\n",
+      "Original test set size: 85443\n",
+      "Total original samples: 284807\n",
+      "\n",
+      "\n",
+      "Undersampled training set size: 688\n",
+      "Undersampled test set size: 296\n",
+      "Total undersampled samples: 984\n"
+ ]
+ }
+ ],
+ "source": [
+    "from sklearn.model_selection import train_test_split # for splitting the dataset\n",
+ "\n",
+    "# Split the data into 70% training and 30% test\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)\n",
+ "\n",
+    "print(\"Original training set size:\", len(X_train))\n",
+    "print(\"Original test set size:\", len(X_test))\n",
+    "print(\"Total original samples:\", len(X_train)+len(X_test))\n",
+ "\n",
+    "# Split the undersampled dataset in the same way\n",
+ "X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_under_sample\n",
+ " ,Y_under_sample\n",
+ " ,test_size = 0.3\n",
+ " ,random_state = 0)\n",
+ " \n",
+ "print(\"\\n\")\n",
+    "print(\"Undersampled training set size:\", len(X_train_undersample))\n",
+    "print(\"Undersampled test set size:\", len(X_test_undersample))\n",
+    "print(\"Total undersampled samples:\", len(X_train_undersample)+len(X_test_undersample))"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,