|
|
@ -339,7 +339,9 @@
|
|
|
|
"source": [
|
|
|
|
"source": [
|
|
|
|
"### 数据标准化处理\n",
|
|
|
|
"### 数据标准化处理\n",
|
|
|
|
"\n",
|
|
|
|
"\n",
|
|
|
|
"上面Amount列的值还是原值,相比其它列的值过大,会导致模型结果出现偏差,认为Amount列是非常重要的,具体可参考前面讲过的回归分析章节,需要对其标准化,大的值在区间内依然是大的,小的值在区间内依然是小的,可以理解为一种缩放。"
|
|
|
|
"上面Amount列的值还是原值,相比其它列的值过大,会导致模型结果出现偏差,认为Amount列是非常重要的,具体可参考前面讲过的回归分析章节,需要对其标准化,大的值在区间内依然是大的,小的值在区间内依然是小的,可以理解为一种缩放。\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"对逻辑回归来说,所有的训练数据都需要进行标准化。"
|
|
|
|
]
|
|
|
|
]
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
@ -564,6 +566,50 @@
|
|
|
|
"### 下采样方案"
|
|
|
|
"### 下采样方案"
|
|
|
|
]
|
|
|
|
]
|
|
|
|
},
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 7,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"正常样本所占整体比例: 0.5\n",
|
|
|
|
|
|
|
|
"异常样本所占整体比例: 0.5\n",
|
|
|
|
|
|
|
|
"下采样策略总体样本量: 984\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"X = data.loc[:,data.columns != \"Class\"] # 特征\n",
|
|
|
|
|
|
|
|
"y = data.loc[:,data.columns == \"Class\"] # 标签\n",
|
|
|
|
|
|
|
|
" \n",
|
|
|
|
|
|
|
|
"# 得到正样本(异常样本)的索引\n",
|
|
|
|
|
|
|
|
"number_records_fraud=len(data[data.Class==1])\n",
|
|
|
|
|
|
|
|
"fraud_indices=np.array(data[data.Class==1].index)\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"# 得到负样本(正常样本)的索引\n",
|
|
|
|
|
|
|
|
"normal_indices=data[data.Class==0].index\n",
|
|
|
|
|
|
|
|
" \n",
|
|
|
|
|
|
|
|
"# 从正常样本中随机采样指定个数的样本,并取索引\n",
|
|
|
|
|
|
|
|
"random_normal_indices=np.random.choice(normal_indices, number_records_fraud,replace=False) \n",
|
|
|
|
|
|
|
|
"random_normal_indices=np.array(random_normal_indices)\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"# 有了正常样本和异常样本的所以\n",
|
|
|
|
|
|
|
|
"under_sample_indices=np.concatenate([fraud_indices,random_normal_indices])\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"# 根据索引得到下采样的所有样本点\n",
|
|
|
|
|
|
|
|
"under_sample_data=data.iloc[under_sample_indices,:]\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"X_under_sample=under_sample_data.iloc[:,under_sample_data.columns != \"Class\"]\n",
|
|
|
|
|
|
|
|
"Y_under_sample=under_sample_data.iloc[:,under_sample_data.columns == \"Class\"]\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"print(\"正常样本所占整体比例:\", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))\n",
|
|
|
|
|
|
|
|
"print(\"异常样本所占整体比例:\", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))\n",
|
|
|
|
|
|
|
|
"print(\"下采样策略总体样本量:\", len(under_sample_data))"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"execution_count": null,
|
|
|
|