diff --git a/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/.ipynb_checkpoints/逻辑回归-信用卡欺诈检测-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/.ipynb_checkpoints/逻辑回归-信用卡欺诈检测-checkpoint.ipynb
index b2653f6..ba5ed51 100644
--- a/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/.ipynb_checkpoints/逻辑回归-信用卡欺诈检测-checkpoint.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/.ipynb_checkpoints/逻辑回归-信用卡欺诈检测-checkpoint.ipynb
@@ -47,30 +47,22 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": 3,
   "metadata": {},
-  "outputs": [
-   {
-    "name": "stderr",
-    "output_type": "stream",
-    "text": [
-     "D:\\Anaconda3\\lib\\importlib\\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n",
-     "  return f(*args, **kwds)\n"
-    ]
-   }
-  ],
+  "outputs": [],
   "source": [
    "# Import the required packages\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
-   "%matplotlib inline # Render plots inline in this notebook"
+   "# Render plots inline in this notebook\n",
+   "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": 3,
+  "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
@@ -268,7 +260,7 @@
    "[5 rows x 31 columns]"
   ]
  },
-  "execution_count": 3,
+  "execution_count": 4,
   "metadata": {},
   "output_type": "execute_result"
  }
@@ -289,7 +281,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 6,
+  "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
@@ -341,6 +333,332 @@
    "* The second approach, by contrast, throws away real data, so the model has less to learn from and its performance weakens."
   ]
  },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "### Data Standardization\n",
+   "\n",
+   "The Amount column above is still on its original scale. Its values are much larger than those of the other columns, which would bias the model into treating Amount as especially important (see the earlier chapter on regression analysis), so it needs to be standardized: values that are large stay relatively large within the new range, and small values stay relatively small. You can think of it as a rescaling.\n",
+   "\n",
+   "For logistic regression, all of the training data needs to be standardized."
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 6,
+  "metadata": {},
+  "outputs": [
+   {
+    "data": {
+     "text/html": [
+      "(HTML rendering of data.head() omitted; identical to the text/plain output below)"
" + ], + "text/plain": [ + " V1 V2 V3 V4 V5 V6 V7 \\\n", + "0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n", + "1 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n", + "2 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n", + "3 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n", + "4 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n", + "\n", + " V8 V9 V10 ... V21 V22 V23 V24 \\\n", + "0 0.098698 0.363787 0.090794 ... -0.018307 0.277838 -0.110474 0.066928 \n", + "1 0.085102 -0.255425 -0.166974 ... -0.225775 -0.638672 0.101288 -0.339846 \n", + "2 0.247676 -1.514654 0.207643 ... 0.247998 0.771679 0.909412 -0.689281 \n", + "3 0.377436 -1.387024 -0.054952 ... -0.108300 0.005274 -0.190321 -1.175575 \n", + "4 -0.270533 0.817739 0.753074 ... -0.009431 0.798278 -0.137458 0.141267 \n", + "\n", + " V25 V26 V27 V28 Class normAmount \n", + "0 0.128539 -0.189115 0.133558 -0.021053 0 0.244964 \n", + "1 0.167170 0.125895 -0.008983 0.014724 0 -0.342475 \n", + "2 -0.327642 -0.139097 -0.055353 -0.059752 0 1.160686 \n", + "3 0.647376 -0.221929 0.062723 0.061458 0 0.140534 \n", + "4 -0.206010 0.502292 0.219422 0.215153 0 -0.073403 \n", + "\n", + "[5 rows x 30 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "# X = (x-μ)/σ,使得新的X数据集方差为1,均值为0\n", + "# fit_transform(data['Amount']) 意思是找出data['Amount']的μ和σ,并应用在data['Amount']上。\n", + "data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1,1))\n", + "data = data.drop(['Time', 'Amount'], axis=1) # Time这里用不上也去掉\n", + "data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 下采样方案" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "正常样本所占整体比例: 0.5\n", + "异常样本所占整体比例: 0.5\n", + "下采样策略总体样本量: 984\n" + ] + } + ], + "source": [ + "X = data.loc[:,data.columns != \"Class\"] # 特征\n", + "y = data.loc[:,data.columns == \"Class\"] # 标签\n", + " \n", + "# 得到正样本(异常样本)的索引\n", + "number_records_fraud=len(data[data.Class==1])\n", + "fraud_indices=np.array(data[data.Class==1].index)\n", + "\n", + "# 得到负样本(正常样本)的索引\n", + "normal_indices=data[data.Class==0].index\n", + " \n", + "# 从正常样本中随机采样指定个数的样本,并取索引\n", + "random_normal_indices=np.random.choice(normal_indices, number_records_fraud,replace=False) \n", + "random_normal_indices=np.array(random_normal_indices)\n", + "\n", + "# 有了正常样本和异常样本的所以\n", + "under_sample_indices=np.concatenate([fraud_indices,random_normal_indices])\n", + "\n", + "# 根据索引得到下采样的所有样本点\n", + "under_sample_data=data.iloc[under_sample_indices,:]\n", + "\n", + "X_under_sample=under_sample_data.iloc[:,under_sample_data.columns != \"Class\"]\n", + "Y_under_sample=under_sample_data.iloc[:,under_sample_data.columns == \"Class\"]\n", + "\n", + "print(\"正常样本所占整体比例:\", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))\n", + "print(\"异常样本所占整体比例:\", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))\n", + "print(\"下采样策略总体样本量:\", len(under_sample_data))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 交叉验证" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "原始训练集包含样本量: 199364\n", + "原始测试集包含样本量: 
85443\n",
+     "Total original samples: 284807\n",
+     "\n",
+     "\n",
+     "Samples in the undersampled training set: 688\n",
+     "Samples in the undersampled test set: 296\n",
+     "Total undersampled samples: 984\n"
+    ]
+   }
+  ],
+  "source": [
+   "from sklearn.model_selection import train_test_split # dataset splitting\n",
+   "\n",
+   "# Split the data into a 70% training set and a 30% test set\n",
+   "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)\n",
+   "\n",
+   "print(\"Samples in the original training set:\", len(X_train))\n",
+   "print(\"Samples in the original test set:\", len(X_test))\n",
+   "print(\"Total original samples:\", len(X_train)+len(X_test))\n",
+   "\n",
+   "# Split the undersampled dataset the same way\n",
+   "X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_under_sample,\n",
+   "                                                                                                    Y_under_sample,\n",
+   "                                                                                                    test_size = 0.3,\n",
+   "                                                                                                    random_state = 0)\n",
+   "\n",
+   "print(\"\\n\")\n",
+   "print(\"Samples in the undersampled training set: \", len(X_train_undersample))\n",
+   "print(\"Samples in the undersampled test set: \", len(X_test_undersample))\n",
+   "print(\"Total undersampled samples: \", len(X_train_undersample)+len(X_test_undersample))"
+  ]
+ },
 {
  "cell_type": "code",
  "execution_count": null,
diff --git a/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/逻辑回归-信用卡欺诈检测.ipynb b/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/逻辑回归-信用卡欺诈检测.ipynb
index 76ecfb3..ba5ed51 100644
--- a/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/逻辑回归-信用卡欺诈检测.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/信用卡欺诈检测/逻辑回归-信用卡欺诈检测.ipynb
@@ -610,6 +610,55 @@
    "print(\"Total samples after undersampling:\", len(under_sample_data))"
   ]
  },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "### Cross-Validation"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 12,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "Samples in the original training set: 199364\n",
+     "Samples in the original test set: 85443\n",
+     "Total original samples: 284807\n",
+     "\n",
+     "\n",
+     "Samples in the undersampled training set: 688\n",
+     "Samples in the undersampled test set: 296\n",
+     "Total undersampled samples: 984\n"
+    ]
+   }
+  ],
+  "source": [
+   "from sklearn.model_selection import train_test_split # dataset splitting\n",
+   "\n",
+   "# Split the data into a 70% training set and a 30% test set\n",
+   "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)\n",
+   "\n",
+   "print(\"Samples in the original training set:\", len(X_train))\n",
+   "print(\"Samples in the original test set:\", len(X_test))\n",
+   "print(\"Total original samples:\", len(X_train)+len(X_test))\n",
+   "\n",
+   "# Split the undersampled dataset the same way\n",
+   "X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_under_sample,\n",
+   "                                                                                                    Y_under_sample,\n",
+   "                                                                                                    test_size = 0.3,\n",
+   "                                                                                                    random_state = 0)\n",
+   "\n",
+   "print(\"\\n\")\n",
+   "print(\"Samples in the undersampled training set: \", len(X_train_undersample))\n",
+   "print(\"Samples in the undersampled test set: \", len(X_test_undersample))\n",
+   "print(\"Total undersampled samples: \", len(X_train_undersample)+len(X_test_undersample))"
+  ]
+ },
 {
  "cell_type": "code",
  "execution_count": null,
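
Note on the "Cross-Validation" cells added above: they only perform a single train/test split; the fold-by-fold training loop is not part of this diff. Below is a minimal sketch of how K-fold cross-validation could be run on the undersampled training set, assuming the X_train_undersample and y_train_undersample variables created above are in scope; the LogisticRegression settings are illustrative placeholders, not values taken from the notebook.

# Sketch (not part of this commit): K-fold cross-validation on the
# undersampled training set, reporting recall per fold.
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score

kf = KFold(n_splits=5, shuffle=True, random_state=0)
fold_recalls = []
for train_idx, val_idx in kf.split(X_train_undersample):
    # Train on K-1 folds, validate on the held-out fold
    lr = LogisticRegression(C=0.01, max_iter=1000)  # illustrative hyperparameters
    lr.fit(X_train_undersample.iloc[train_idx],
           y_train_undersample.iloc[train_idx].values.ravel())
    preds = lr.predict(X_train_undersample.iloc[val_idx])
    # Recall is the metric that matters most for fraud detection
    fold_recalls.append(recall_score(y_train_undersample.iloc[val_idx].values.ravel(), preds))

print("Recall per fold:", fold_recalls)
print("Mean recall:", np.mean(fold_recalls))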
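
The notebook's earlier discussion contrasts two ways of handling class imbalance, and this diff implements the second (undersampling). For the first, oversampling, a common choice is SMOTE from the third-party imbalanced-learn package. A hedged sketch, assuming the X_train / y_train split above; imblearn is an extra dependency this notebook does not itself import.

# Sketch (assumption: imbalanced-learn is installed).
from imblearn.over_sampling import SMOTE

oversampler = SMOTE(random_state=0)
# fit_resample synthesizes new minority-class (fraud) points
# until both classes are balanced in the training set.
X_train_oversample, y_train_oversample = oversampler.fit_resample(X_train, y_train.values.ravel())

print("Normal samples after SMOTE:", (y_train_oversample == 0).sum())
print("Fraud samples after SMOTE:", (y_train_oversample == 1).sum())

Only the training split is resampled here; the test set keeps the true class distribution so that evaluation reflects real-world imbalance.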