Add. Unsubsample

5 years ago · 69c4007bcf
parent 70973c2a7e
commit 69c4007bcf
1 changed files with 47 additions and 1 deletions
--- a/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/逻辑回归-信用卡欺诈检测.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/逻辑回归-信用卡欺诈检测.ipynb
@ -339,7 +339,9 @@
   "source": [
    "### 数据标准化处理\n",
    "\n",
-    "上面Amount列的值还是原值，相比其它列的值过大，会导致模型结果出现偏差，认为Amount列是非常重要的，具体可参考前面讲过的回归分析章节，需要对其标准化，大的值在区间内依然是大的，小的值在区间内依然是小的，可以理解为一种缩放。"
+    "上面Amount列的值还是原值，相比其它列的值过大，会使模型误认为Amount列非常重要，从而导致结果出现偏差（具体可参考前面讲过的回归分析章节）。因此需要对其标准化：大的值在区间内依然是大的，小的值在区间内依然是小的，可以理解为一种缩放。\n",
    "\n",
    "对逻辑回归来说，所有的训练数据都需要进行标准化。"
   ]
  },
  {
@ -564,6 +566,50 @@
    "### 下采样方案"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "正常样本所占整体比例： 0.5\n",
      "异常样本所占整体比例： 0.5\n",
      "下采样策略总体样本量： 984\n"
     ]
    }
   ],
   "source": [
    "X = data.loc[:,data.columns != \"Class\"]  # 特征\n",
    "y = data.loc[:,data.columns == \"Class\"]  # 标签\n",
    " \n",
    "# 得到正样本（异常样本）的索引\n",
    "number_records_fraud=len(data[data.Class==1])\n",
    "fraud_indices=np.array(data[data.Class==1].index)\n",
    "\n",
    "# 得到负样本（正常样本）的索引\n",
    "normal_indices=data[data.Class==0].index\n",
    " \n",
    "# 从正常样本中随机采样指定个数的样本，并取索引\n",
    "random_normal_indices=np.random.choice(normal_indices, number_records_fraud,replace=False) \n",
    "random_normal_indices=np.array(random_normal_indices)\n",
    "\n",
    "# 有了正常样本和异常样本的所以\n",
    "under_sample_indices=np.concatenate([fraud_indices,random_normal_indices])\n",
    "\n",
    "# 根据索引得到下采样的所有样本点\n",
    "under_sample_data=data.iloc[under_sample_indices,:]\n",
    "\n",
    "X_under_sample=under_sample_data.iloc[:,under_sample_data.columns != \"Class\"]\n",
    "Y_under_sample=under_sample_data.iloc[:,under_sample_data.columns == \"Class\"]\n",
    "\n",
    "print(\"正常样本所占整体比例：\", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))\n",
    "print(\"异常样本所占整体比例：\", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))\n",
    "print(\"下采样策略总体样本量：\", len(under_sample_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,