@@ -659,6 +659,76 @@
"print(\"Total number of undersampled samples: \", len(X_train_undersample)+len(X_test_undersample))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Evaluation metric: recall\n",
"Because the positive and negative classes are extremely imbalanced, accuracy is misleading: as noted above, a model that labels every sample as normal already reaches 99.99% accuracy. We therefore use recall, i.e. how many of the fraudulent samples are actually found.\n",
"\n",
"Recall = TP/(TP+FN)\n",
"\n",
"* TP, i.e. True Positive = correctly predicted as positive\n",
"* TN, i.e. True Negative = correctly predicted as negative\n",
"* FP, i.e. False Positive = incorrectly predicted as positive\n",
"* FN, i.e. False Negative = incorrectly predicted as negative"
]
},
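{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell is an illustrative sketch, not part of the original analysis: it computes recall on a small hand-made set of labels with sklearn's recall_score and confusion_matrix, just to make the TP/(TP+FN) definition concrete. The y_true and y_pred arrays are made-up placeholders."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: toy labels, not data from this notebook\n",
"from sklearn.metrics import recall_score, confusion_matrix\n",
"\n",
"y_true = [1, 1, 1, 0, 0, 0, 0, 0]   # 3 actual positives (fraud)\n",
"y_pred = [1, 0, 1, 0, 0, 1, 0, 0]   # the model finds 2 of them\n",
"\n",
"tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()\n",
"print(\"recall =\", tp / (tp + fn))                 # 2 true positives / 3 actual positives\n",
"print(\"recall =\", recall_score(y_true, y_pred))   # same value via sklearn\n"
]
},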
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# From sklearn's linear model module, import logistic regression\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"# From the model selection module, use K-fold cross-validation\n",
"# cross_val_score returns the evaluation scores obtained with cross-validation.\n",
"from sklearn.model_selection import KFold, cross_val_score\n",
"\n",
"# Confusion matrix and recall\n",
"# sklearn's classification_report prints a text report of the main classification metrics: per-class precision, recall, F1 score, etc.\n",
"from sklearn.metrics import confusion_matrix, recall_score, classification_report\n",
"\n",
"# cross_val_predict is used in the same way as cross_val_score, but it returns the cross-validated predictions rather than the scores.\n",
"from sklearn.model_selection import cross_val_predict"
]
},
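{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a small illustration of the difference described in the comments above (not part of the original notebook): cross_val_score returns one score per fold, while cross_val_predict returns one out-of-fold prediction per sample. The tiny synthetic dataset below is only a placeholder."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: synthetic imbalanced data, default logistic regression settings\n",
"from sklearn.datasets import make_classification\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import cross_val_score, cross_val_predict\n",
"from sklearn.metrics import recall_score\n",
"\n",
"X_demo, y_demo = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=0)\n",
"lr_demo = LogisticRegression(max_iter=1000)\n",
"\n",
"print(cross_val_score(lr_demo, X_demo, y_demo, cv=5, scoring='recall'))  # 5 fold-level recall scores\n",
"y_oof = cross_val_predict(lr_demo, X_demo, y_demo, cv=5)                 # one prediction per sample\n",
"print(recall_score(y_demo, y_oof))                                       # recall computed from those predictions\n"
]
},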
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Regularization penalty: improving the model's generalization\n",
"Overfitting typically occurs when the data set is small but the feature space is large, which is exactly the situation here: very few fraudulent samples and 28 feature dimensions."
]
},
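{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of the regularization knob in sklearn (not part of the original notebook): LogisticRegression exposes the penalty through the C parameter, where C is the inverse of the regularization strength, so a smaller C means a stronger penalty. The dataset below is a synthetic placeholder used only to show the effect on the coefficients."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: shrinking C shrinks the learned coefficients\n",
"import numpy as np\n",
"from sklearn.datasets import make_classification\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"X_demo, y_demo = make_classification(n_samples=300, n_features=28, random_state=0)\n",
"for C in [0.01, 0.1, 1, 10, 100]:\n",
"    lr = LogisticRegression(C=C, penalty='l2', solver='lbfgs', max_iter=1000).fit(X_demo, y_demo)\n",
"    print(C, np.linalg.norm(lr.coef_))  # stronger penalty (small C) -> smaller coefficient norm\n"
]
},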
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Define a K-fold helper function, printing_Kfold_scores, which we can simply call later\n",
"def printing_Kfold_scores(x_train_data, y_train_data):\n",
"    fold = KFold(n_splits=5, shuffle=False)  # shuffle=False means the data is not shuffled before splitting\n",
"\n",
"    # Candidate regularization parameters; in sklearn C is the inverse of the regularization strength, so a smaller C means a stronger penalty\n",
"    c_param_range = [0.01, 0.1, 1, 10, 100]\n",
"    # Table for displaying the results\n",
"    results_table = pd.DataFrame(index = range(len(c_param_range)), columns = ['C_parameter', 'Mean recall score'])\n",
"    results_table['C_parameter'] = c_param_range\n",
"\n",
"    # K-fold cross-validation: each split yields two index sets, training = indices[0], validation = indices[1]\n",
"    j = 0\n",
"    # Loop over the candidate regularization parameters (c_param_range has 5 values; each is evaluated with 5-fold cross-validation)\n",
"    for c_param in c_param_range:\n",
"        print('-------------------------------------------')\n",
"        print('Regularization parameter C: ', c_param)\n",
"        print('-------------------------------------------')\n",
"        print('\\n')"
]
},
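{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of the KFold behaviour referred to in the comment above (not part of the original notebook): split() yields one (train indices, validation indices) pair per fold, shown here on a small placeholder array."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: 10 samples, 5 folds of 2 validation samples each\n",
"import numpy as np\n",
"from sklearn.model_selection import KFold\n",
"\n",
"for train_idx, val_idx in KFold(n_splits=5, shuffle=False).split(np.arange(10).reshape(-1, 1)):\n",
"    print('train =', train_idx, ' validation =', val_idx)\n"
]
},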
{
"cell_type": "code",
"execution_count": null,