diff --git a/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/逻辑回归-信用卡欺诈检测.ipynb b/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/逻辑回归-信用卡欺诈检测.ipynb index 95aff30..76ecfb3 100644 --- a/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/逻辑回归-信用卡欺诈检测.ipynb +++ b/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/逻辑回归-信用卡欺诈检测.ipynb @@ -339,7 +339,9 @@ "source": [ "### 数据标准化处理\n", "\n", - "上面Amount列的值还是原值,相比其它列的值过大,会导致模型结果出现偏差,认为Amount列是非常重要的,具体可参考前面讲过的回归分析章节,需要对其标准化,大的值在区间内依然是大的,小的值在区间内依然是小的,可以理解为一种缩放。" + "上面Amount列的值还是原值,相比其它列的值过大,会导致模型结果出现偏差,认为Amount列是非常重要的,具体可参考前面讲过的回归分析章节,需要对其标准化,大的值在区间内依然是大的,小的值在区间内依然是小的,可以理解为一种缩放。\n", + "\n", + "对逻辑回归来说,所有的训练数据都需要进行标准化。" ] }, { @@ -564,6 +566,50 @@ "### 下采样方案" ] }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "正常样本所占整体比例: 0.5\n", + "异常样本所占整体比例: 0.5\n", + "下采样策略总体样本量: 984\n" + ] + } + ], + "source": [ + "X = data.loc[:,data.columns != \"Class\"] # 特征\n", + "y = data.loc[:,data.columns == \"Class\"] # 标签\n", + " \n", + "# 得到正样本(异常样本)的索引\n", + "number_records_fraud=len(data[data.Class==1])\n", + "fraud_indices=np.array(data[data.Class==1].index)\n", + "\n", + "# 得到负样本(正常样本)的索引\n", + "normal_indices=data[data.Class==0].index\n", + " \n", + "# 从正常样本中随机采样指定个数的样本,并取索引\n", + "random_normal_indices=np.random.choice(normal_indices, number_records_fraud,replace=False) \n", + "random_normal_indices=np.array(random_normal_indices)\n", + "\n", + "# 有了正常样本和异常样本的索引\n", + "under_sample_indices=np.concatenate([fraud_indices,random_normal_indices])\n", + "\n", + "# 根据索引得到下采样的所有样本点\n", + "under_sample_data=data.iloc[under_sample_indices,:]\n", + "\n", + "X_under_sample=under_sample_data.iloc[:,under_sample_data.columns != \"Class\"]\n", + "Y_under_sample=under_sample_data.iloc[:,under_sample_data.columns == \"Class\"]\n", + "\n", + "print(\"正常样本所占整体比例:\", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))\n", + "print(\"异常样本所占整体比例:\", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))\n", + 
"print(\"下采样策略总体样本量:\", len(under_sample_data))" + ] + }, { "cell_type": "code", "execution_count": null,