@@ -47,30 +47,22 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"D:\\Anaconda3\\lib\\importlib\\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n",
" return f(*args, **kwds)\n"
]
}
],
"outputs": [],
"source": [
"# 导入工具包\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"%matplotlib inline # 把图轻松的镶嵌到这个notebook中"
"# 把图轻松的镶嵌到这个notebook中\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -268,7 +260,7 @@
"[5 rows x 31 columns]"
]
},
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -289,7 +281,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -341,6 +333,332 @@
"* 第二种方式则会减少真实数据,使得模型可学的数据变少,能力也会减弱。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 数据标准化处理\n",
"\n",
"上面Amount列的值还是原值, 相比其它列的值过大, 会导致模型结果出现偏差, 认为Amount列是非常重要的, 具体可参考前面讲过的回归分析章节, 需要对其标准化, 大的值在区间内依然是大的, 小的值在区间内依然是小的, 可以理解为一种缩放。\n",
"\n",
"对逻辑回归来说,所有的训练数据都需要进行标准化。"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>V1</th>\n",
" <th>V2</th>\n",
" <th>V3</th>\n",
" <th>V4</th>\n",
" <th>V5</th>\n",
" <th>V6</th>\n",
" <th>V7</th>\n",
" <th>V8</th>\n",
" <th>V9</th>\n",
" <th>V10</th>\n",
" <th>...</th>\n",
" <th>V21</th>\n",
" <th>V22</th>\n",
" <th>V23</th>\n",
" <th>V24</th>\n",
" <th>V25</th>\n",
" <th>V26</th>\n",
" <th>V27</th>\n",
" <th>V28</th>\n",
" <th>Class</th>\n",
" <th>normAmount</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-1.359807</td>\n",
" <td>-0.072781</td>\n",
" <td>2.536347</td>\n",
" <td>1.378155</td>\n",
" <td>-0.338321</td>\n",
" <td>0.462388</td>\n",
" <td>0.239599</td>\n",
" <td>0.098698</td>\n",
" <td>0.363787</td>\n",
" <td>0.090794</td>\n",
" <td>...</td>\n",
" <td>-0.018307</td>\n",
" <td>0.277838</td>\n",
" <td>-0.110474</td>\n",
" <td>0.066928</td>\n",
" <td>0.128539</td>\n",
" <td>-0.189115</td>\n",
" <td>0.133558</td>\n",
" <td>-0.021053</td>\n",
" <td>0</td>\n",
" <td>0.244964</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.191857</td>\n",
" <td>0.266151</td>\n",
" <td>0.166480</td>\n",
" <td>0.448154</td>\n",
" <td>0.060018</td>\n",
" <td>-0.082361</td>\n",
" <td>-0.078803</td>\n",
" <td>0.085102</td>\n",
" <td>-0.255425</td>\n",
" <td>-0.166974</td>\n",
" <td>...</td>\n",
" <td>-0.225775</td>\n",
" <td>-0.638672</td>\n",
" <td>0.101288</td>\n",
" <td>-0.339846</td>\n",
" <td>0.167170</td>\n",
" <td>0.125895</td>\n",
" <td>-0.008983</td>\n",
" <td>0.014724</td>\n",
" <td>0</td>\n",
" <td>-0.342475</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-1.358354</td>\n",
" <td>-1.340163</td>\n",
" <td>1.773209</td>\n",
" <td>0.379780</td>\n",
" <td>-0.503198</td>\n",
" <td>1.800499</td>\n",
" <td>0.791461</td>\n",
" <td>0.247676</td>\n",
" <td>-1.514654</td>\n",
" <td>0.207643</td>\n",
" <td>...</td>\n",
" <td>0.247998</td>\n",
" <td>0.771679</td>\n",
" <td>0.909412</td>\n",
" <td>-0.689281</td>\n",
" <td>-0.327642</td>\n",
" <td>-0.139097</td>\n",
" <td>-0.055353</td>\n",
" <td>-0.059752</td>\n",
" <td>0</td>\n",
" <td>1.160686</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-0.966272</td>\n",
" <td>-0.185226</td>\n",
" <td>1.792993</td>\n",
" <td>-0.863291</td>\n",
" <td>-0.010309</td>\n",
" <td>1.247203</td>\n",
" <td>0.237609</td>\n",
" <td>0.377436</td>\n",
" <td>-1.387024</td>\n",
" <td>-0.054952</td>\n",
" <td>...</td>\n",
" <td>-0.108300</td>\n",
" <td>0.005274</td>\n",
" <td>-0.190321</td>\n",
" <td>-1.175575</td>\n",
" <td>0.647376</td>\n",
" <td>-0.221929</td>\n",
" <td>0.062723</td>\n",
" <td>0.061458</td>\n",
" <td>0</td>\n",
" <td>0.140534</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-1.158233</td>\n",
" <td>0.877737</td>\n",
" <td>1.548718</td>\n",
" <td>0.403034</td>\n",
" <td>-0.407193</td>\n",
" <td>0.095921</td>\n",
" <td>0.592941</td>\n",
" <td>-0.270533</td>\n",
" <td>0.817739</td>\n",
" <td>0.753074</td>\n",
" <td>...</td>\n",
" <td>-0.009431</td>\n",
" <td>0.798278</td>\n",
" <td>-0.137458</td>\n",
" <td>0.141267</td>\n",
" <td>-0.206010</td>\n",
" <td>0.502292</td>\n",
" <td>0.219422</td>\n",
" <td>0.215153</td>\n",
" <td>0</td>\n",
" <td>-0.073403</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 30 columns</p>\n",
"</div>"
],
"text/plain": [
" V1 V2 V3 V4 V5 V6 V7 \\\n",
"0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n",
"1 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n",
"2 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n",
"3 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n",
"4 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n",
"\n",
" V8 V9 V10 ... V21 V22 V23 V24 \\\n",
"0 0.098698 0.363787 0.090794 ... -0.018307 0.277838 -0.110474 0.066928 \n",
"1 0.085102 -0.255425 -0.166974 ... -0.225775 -0.638672 0.101288 -0.339846 \n",
"2 0.247676 -1.514654 0.207643 ... 0.247998 0.771679 0.909412 -0.689281 \n",
"3 0.377436 -1.387024 -0.054952 ... -0.108300 0.005274 -0.190321 -1.175575 \n",
"4 -0.270533 0.817739 0.753074 ... -0.009431 0.798278 -0.137458 0.141267 \n",
"\n",
" V25 V26 V27 V28 Class normAmount \n",
"0 0.128539 -0.189115 0.133558 -0.021053 0 0.244964 \n",
"1 0.167170 0.125895 -0.008983 0.014724 0 -0.342475 \n",
"2 -0.327642 -0.139097 -0.055353 -0.059752 0 1.160686 \n",
"3 0.647376 -0.221929 0.062723 0.061458 0 0.140534 \n",
"4 -0.206010 0.502292 0.219422 0.215153 0 -0.073403 \n",
"\n",
"[5 rows x 30 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# X = (x-μ)/σ ,使得新的X数据集方差为1, 均值为0\n",
"# fit_transform(data['Amount']) 意思是找出data['Amount']的μ和σ , 并应用在data['Amount']上。\n",
"data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1,1))\n",
"data = data.drop(['Time', 'Amount'], axis=1) # Time这里用不上也去掉\n",
"data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 下采样方案"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"正常样本所占整体比例: 0.5\n",
"异常样本所占整体比例: 0.5\n",
"下采样策略总体样本量: 984\n"
]
}
],
"source": [
"X = data.loc[:,data.columns != \"Class\"] # 特征\n",
"y = data.loc[:,data.columns == \"Class\"] # 标签\n",
" \n",
"# 得到正样本(异常样本)的索引\n",
"number_records_fraud=len(data[data.Class==1])\n",
"fraud_indices=np.array(data[data.Class==1].index)\n",
"\n",
"# 得到负样本(正常样本)的索引\n",
"normal_indices=data[data.Class==0].index\n",
" \n",
"# 从正常样本中随机采样指定个数的样本,并取索引\n",
"random_normal_indices=np.random.choice(normal_indices, number_records_fraud,replace=False) \n",
"random_normal_indices=np.array(random_normal_indices)\n",
"\n",
"# 合并正常样本和异常样本的索引\n",
"under_sample_indices=np.concatenate([fraud_indices,random_normal_indices])\n",
"\n",
"# 根据索引得到下采样的所有样本点\n",
"under_sample_data=data.iloc[under_sample_indices,:]\n",
"\n",
"X_under_sample=under_sample_data.iloc[:,under_sample_data.columns != \"Class\"]\n",
"Y_under_sample=under_sample_data.iloc[:,under_sample_data.columns == \"Class\"]\n",
"\n",
"print(\"正常样本所占整体比例:\", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))\n",
"print(\"异常样本所占整体比例:\", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))\n",
"print(\"下采样策略总体样本量:\", len(under_sample_data))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 交叉验证"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"原始训练集包含样本量: 199364\n",
"原始测试集包含样本量: 85443\n",
"原始样本总数: 284807\n",
"\n",
"\n",
"下采样训练集包含样本数量: 688\n",
"下采样测试集包含样本数量: 296\n",
"下采样样本总数: 984\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split # 切分数据集\n",
"\n",
"# 将数据切割成训练集0.7 和测试集 0.3\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)\n",
"\n",
"print(\"原始训练集包含样本量:\", len(X_train))\n",
"print(\"原始测试集包含样本量:\", len(X_test))\n",
"print(\"原始样本总数:\", len(X_train)+len(X_test))\n",
"\n",
"# 下采样数据集进行划分\n",
"X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_under_sample\n",
" ,Y_under_sample\n",
" ,test_size = 0.3\n",
" ,random_state = 0)\n",
" \n",
"print(\"\\n\")\n",
"print(\"下采样训练集包含样本数量: \", len(X_train_undersample))\n",
"print(\"下采样测试集包含样本数量: \", len(X_test_undersample))\n",
"print(\"下采样样本总数: \", len(X_train_undersample)+len(X_test_undersample))"
]
},
{
"cell_type": "code",
"execution_count": null,