Add. reduce_mem_usage

4 years ago · 89af7090c4
parent c14a41f8c1
commit 89af7090c4
2 changed files with 672 additions and 0 deletions
--- a/竞赛优胜技巧/.ipynb_checkpoints/Feature
+++ b/竞赛优胜技巧/.ipynb_checkpoints/Feature
@ -241,6 +241,342 @@
    "    if df[col].dtype=='int64': df[col] = df[col].astype('int32')"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "35181a29",
+   "metadata": {},
+   "source": [
+    "### 测试修改数据类型（降低内存占用）后，结果会不会发生变化"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "4cbbf955",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import time\n",
+    "\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "\n",
+    "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "from sklearn.preprocessing import OrdinalEncoder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "6aaae6e5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "七分类任务，处理前： [1 2 3 4 5 6 7]\n",
+      "[5 5 2 ... 3 3 3]\n",
+      "七分类任务，处理后： [0. 1. 2. 3. 4. 5. 6.]\n",
+      "[4. 4. 1. ... 2. 2. 2.]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.datasets import fetch_covtype\n",
+    "data = fetch_covtype()  # 森林植被类型\n",
+    "# 预处理\n",
+    "X, y = data['data'], data['target']\n",
+    "# 由于模型标签需要从0开始，所以数字需要全部减1\n",
+    "print('七分类任务，处理前：',np.unique(y))\n",
+    "print(y)\n",
+    "ord = OrdinalEncoder()\n",
+    "y = ord.fit_transform(y.reshape(-1, 1))\n",
+    "y = y.reshape(-1, )\n",
+    "print('七分类任务，处理后：',np.unique(y))\n",
+    "print(y)\n",
+    "\n",
+    "X = pd.DataFrame(X,columns=data.feature_names)\n",
+    "X = X.iloc[:,:20]  # 数据集过大，这里仅用前20列做演示\n",
+    "\n",
+    "y = pd.DataFrame(y, columns=data.target_names)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "816e36fd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 581012 entries, 0 to 581011\n",
+      "Data columns (total 20 columns):\n",
+      " #   Column                              Non-Null Count   Dtype  \n",
+      "---  ------                              --------------   -----  \n",
+      " 0   Elevation                           581012 non-null  float64\n",
+      " 1   Aspect                              581012 non-null  float64\n",
+      " 2   Slope                               581012 non-null  float64\n",
+      " 3   Horizontal_Distance_To_Hydrology    581012 non-null  float64\n",
+      " 4   Vertical_Distance_To_Hydrology      581012 non-null  float64\n",
+      " 5   Horizontal_Distance_To_Roadways     581012 non-null  float64\n",
+      " 6   Hillshade_9am                       581012 non-null  float64\n",
+      " 7   Hillshade_Noon                      581012 non-null  float64\n",
+      " 8   Hillshade_3pm                       581012 non-null  float64\n",
+      " 9   Horizontal_Distance_To_Fire_Points  581012 non-null  float64\n",
+      " 10  Wilderness_Area_0                   581012 non-null  float64\n",
+      " 11  Wilderness_Area_1                   581012 non-null  float64\n",
+      " 12  Wilderness_Area_2                   581012 non-null  float64\n",
+      " 13  Wilderness_Area_3                   581012 non-null  float64\n",
+      " 14  Soil_Type_0                         581012 non-null  float64\n",
+      " 15  Soil_Type_1                         581012 non-null  float64\n",
+      " 16  Soil_Type_2                         581012 non-null  float64\n",
+      " 17  Soil_Type_3                         581012 non-null  float64\n",
+      " 18  Soil_Type_4                         581012 non-null  float64\n",
+      " 19  Soil_Type_5                         581012 non-null  float64\n",
+      "dtypes: float64(20)\n",
+      "memory usage: 88.7 MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "X.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "022527cd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "f1de7808",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "第 1 次训练...\n",
+      "0.951731022434877\n",
+      "第 2 次训练...\n",
+      "0.952789514900648\n",
+      "第 3 次训练...\n",
+      "0.9518510869003975\n",
+      "第 4 次训练...\n",
+      "0.9518855097158396\n",
+      "第 5 次训练...\n",
+      "0.952023200977608\n",
+      "5折泛化，验证集AC：0.952\n",
+      "Wall time: 1h 48min 7s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "# 5-fold cross-validation of a random forest on X / y: prints the validation\n",
+    "# accuracy of every fold and, at the end, the mean accuracy over all folds.\n",
+    "val_acc_num = 0\n",
+    "for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):\n",
+    "    print(\"第 {} 次训练...\".format(fold_+1))\n",
+    "    # split() yields positional indices, so rows are selected by position (.iloc)\n",
+    "    train_x, trai_y = X.iloc[trn_idx], y.iloc[trn_idx]\n",
+    "    vali_x, vali_y = X.iloc[val_idx], y.iloc[val_idx]\n",
+    "\n",
+    "    # A fixed random_state makes runs comparable, so a change in accuracy can be\n",
+    "    # attributed to the data and not to the forest's own randomness.\n",
+    "    # n_jobs=-1 builds the trees on all available cores.\n",
+    "    random_forest = RandomForestClassifier(n_estimators=1000, oob_score=True,\n",
+    "                                           n_jobs=-1, random_state=42)\n",
+    "    # .values.ravel() turns the one-column DataFrame into a 1-D array, which avoids the warning fit() raised\n",
+    "    random_forest.fit(train_x, trai_y.values.ravel())\n",
+    "\n",
+    "    # =============== validation-set accuracy ===================\n",
+    "    pred_y = random_forest.predict(vali_x)\n",
+    "    fold_acc = accuracy_score(vali_y.values.ravel(), pred_y)  # (y_true, y_pred)\n",
+    "    print(fold_acc)\n",
+    "    val_acc_num += fold_acc\n",
+    "\n",
+    "# average over the number of folds actually configured instead of a hard-coded 5\n",
+    "print(\"5折泛化，验证集ACC：{0:.7f}\".format(val_acc_num / folds.get_n_splits()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5577a53b",
+   "metadata": {},
+   "source": [
+    "上面保存的输出只显示了3位小数；5折准确率的均值为：0.952056"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "9f4453ec",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 581012 entries, 0 to 581011\n",
+      "Data columns (total 20 columns):\n",
+      " #   Column                              Non-Null Count   Dtype  \n",
+      "---  ------                              --------------   -----  \n",
+      " 0   Elevation                           581012 non-null  float32\n",
+      " 1   Aspect                              581012 non-null  float32\n",
+      " 2   Slope                               581012 non-null  float32\n",
+      " 3   Horizontal_Distance_To_Hydrology    581012 non-null  float32\n",
+      " 4   Vertical_Distance_To_Hydrology      581012 non-null  float32\n",
+      " 5   Horizontal_Distance_To_Roadways     581012 non-null  float32\n",
+      " 6   Hillshade_9am                       581012 non-null  float32\n",
+      " 7   Hillshade_Noon                      581012 non-null  float32\n",
+      " 8   Hillshade_3pm                       581012 non-null  float32\n",
+      " 9   Horizontal_Distance_To_Fire_Points  581012 non-null  float32\n",
+      " 10  Wilderness_Area_0                   581012 non-null  float32\n",
+      " 11  Wilderness_Area_1                   581012 non-null  float32\n",
+      " 12  Wilderness_Area_2                   581012 non-null  float32\n",
+      " 13  Wilderness_Area_3                   581012 non-null  float32\n",
+      " 14  Soil_Type_0                         581012 non-null  float32\n",
+      " 15  Soil_Type_1                         581012 non-null  float32\n",
+      " 16  Soil_Type_2                         581012 non-null  float32\n",
+      " 17  Soil_Type_3                         581012 non-null  float32\n",
+      " 18  Soil_Type_4                         581012 non-null  float32\n",
+      " 19  Soil_Type_5                         581012 non-null  float32\n",
+      "dtypes: float32(20)\n",
+      "memory usage: 44.3 MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Reduce the memory usage of X by narrowing 64-bit columns to 32 bits\n",
+    "for col in X.columns:\n",
+    "    if X[col].dtype == 'float64':\n",
+    "        X[col] = X[col].astype('float32')\n",
+    "    elif X[col].dtype == 'int64':\n",
+    "        # fixed: .astype used to be called on the list [col] instead of on the column X[col]\n",
+    "        X[col] = X[col].astype('int32')\n",
+    "X.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "56e985d4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "第 1 次训练...\n",
+      "0.9512577127957109\n",
+      "第 2 次训练...\n",
+      "0.9529013880880872\n",
+      "第 3 次训练...\n",
+      "0.9521092580162132\n",
+      "第 4 次训练...\n",
+      "0.9516961842309083\n",
+      "第 5 次训练...\n",
+      "0.9520145952737474\n",
+      "5折泛化，验证集AC：0.952\n",
+      "Wall time: 1h 57min 11s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "# 5-fold cross-validation of a random forest on X / y: prints the validation\n",
+    "# accuracy of every fold and, at the end, the mean accuracy over all folds.\n",
+    "val_acc_num = 0\n",
+    "for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):\n",
+    "    print(\"第 {} 次训练...\".format(fold_+1))\n",
+    "    # split() yields positional indices, so rows are selected by position (.iloc)\n",
+    "    train_x, trai_y = X.iloc[trn_idx], y.iloc[trn_idx]\n",
+    "    vali_x, vali_y = X.iloc[val_idx], y.iloc[val_idx]\n",
+    "\n",
+    "    # A fixed random_state makes runs comparable, so a change in accuracy can be\n",
+    "    # attributed to the data and not to the forest's own randomness.\n",
+    "    # n_jobs=-1 builds the trees on all available cores.\n",
+    "    random_forest = RandomForestClassifier(n_estimators=1000, oob_score=True,\n",
+    "                                           n_jobs=-1, random_state=42)\n",
+    "    # .values.ravel() turns the one-column DataFrame into a 1-D array, which avoids the warning fit() raised\n",
+    "    random_forest.fit(train_x, trai_y.values.ravel())\n",
+    "\n",
+    "    # =============== validation-set accuracy ===================\n",
+    "    pred_y = random_forest.predict(vali_x)\n",
+    "    fold_acc = accuracy_score(vali_y.values.ravel(), pred_y)  # (y_true, y_pred)\n",
+    "    print(fold_acc)\n",
+    "    val_acc_num += fold_acc\n",
+    "\n",
+    "# average over the number of folds actually configured instead of a hard-coded 5\n",
+    "print(\"5折泛化，验证集ACC：{0:.7f}\".format(val_acc_num / folds.get_n_splits()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6dc51d51",
+   "metadata": {},
+   "source": [
+    "5折准确率的均值为：0.9519958，与上方使用float64时的0.952056仅相差0.0000602。我测试了三轮，每轮的差异都是这种量级的微小波动，由此说明把float64降为float32以减少内存基本不影响结果。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "17fd117b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 封装好的代码，原文链接：https://blog.csdn.net/wushaowu2014/article/details/86561141\n",
+    "def reduce_mem_usage(df):\n",
+    "    \"\"\" iterate through all the columns of a dataframe and modify the data type\n",
+    "        to reduce memory usage.        \n",
+    "    \"\"\"\n",
+    "    start_mem = df.memory_usage().sum() / 1024**2\n",
+    "    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))\n",
+    "    \n",
+    "    for col in df.columns:\n",
+    "        col_type = df[col].dtype\n",
+    "        \n",
+    "        if col_type != object:\n",
+    "            c_min = df[col].min()\n",
+    "            c_max = df[col].max()\n",
+    "            if str(col_type)[:3] == 'int':\n",
+    "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
+    "                    df[col] = df[col].astype(np.int8)\n",
+    "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
+    "                    df[col] = df[col].astype(np.int16)\n",
+    "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
+    "                    df[col] = df[col].astype(np.int32)\n",
+    "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
+    "                    df[col] = df[col].astype(np.int64)  \n",
+    "            else:\n",
+    "                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
+    "                    df[col] = df[col].astype(np.float16)\n",
+    "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
+    "                    df[col] = df[col].astype(np.float32)\n",
+    "                else:\n",
+    "                    df[col] = df[col].astype(np.float64)\n",
+    "        else:\n",
+    "            df[col] = df[col].astype('category')\n",
+    " \n",
+    "    end_mem = df.memory_usage().sum() / 1024**2\n",
+    "    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))\n",
+    "    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))\n",
+    "    \n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "690184eb",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Memory usage of dataframe is 88.66 MB\n",
+      "Memory usage after optimization is: 22.16 MB\n",
+      "Decreased by 75.0%\n"
+     ]
+    }
+   ],
+   "source": [
+    "X = reduce_mem_usage(X)"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "f72a6efa",
--- a/竞赛优胜技巧/Feature
+++ b/竞赛优胜技巧/Feature
@ -241,6 +241,342 @@
    "    if df[col].dtype=='int64': df[col] = df[col].astype('int32')"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "35181a29",
+   "metadata": {},
+   "source": [
+    "### 测试修改数据类型（降低内存占用）后，结果会不会发生变化"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "4cbbf955",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import time\n",
+    "\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "\n",
+    "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "from sklearn.preprocessing import OrdinalEncoder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "6aaae6e5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "七分类任务，处理前： [1 2 3 4 5 6 7]\n",
+      "[5 5 2 ... 3 3 3]\n",
+      "七分类任务，处理后： [0. 1. 2. 3. 4. 5. 6.]\n",
+      "[4. 4. 1. ... 2. 2. 2.]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.datasets import fetch_covtype\n",
+    "data = fetch_covtype()  # 森林植被类型\n",
+    "# 预处理\n",
+    "X, y = data['data'], data['target']\n",
+    "# 由于模型标签需要从0开始，所以数字需要全部减1\n",
+    "print('七分类任务，处理前：',np.unique(y))\n",
+    "print(y)\n",
+    "ord = OrdinalEncoder()\n",
+    "y = ord.fit_transform(y.reshape(-1, 1))\n",
+    "y = y.reshape(-1, )\n",
+    "print('七分类任务，处理后：',np.unique(y))\n",
+    "print(y)\n",
+    "\n",
+    "X = pd.DataFrame(X,columns=data.feature_names)\n",
+    "X = X.iloc[:,:20]  # 数据集过大，这里仅用前20列做演示\n",
+    "\n",
+    "y = pd.DataFrame(y, columns=data.target_names)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "816e36fd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 581012 entries, 0 to 581011\n",
+      "Data columns (total 20 columns):\n",
+      " #   Column                              Non-Null Count   Dtype  \n",
+      "---  ------                              --------------   -----  \n",
+      " 0   Elevation                           581012 non-null  float64\n",
+      " 1   Aspect                              581012 non-null  float64\n",
+      " 2   Slope                               581012 non-null  float64\n",
+      " 3   Horizontal_Distance_To_Hydrology    581012 non-null  float64\n",
+      " 4   Vertical_Distance_To_Hydrology      581012 non-null  float64\n",
+      " 5   Horizontal_Distance_To_Roadways     581012 non-null  float64\n",
+      " 6   Hillshade_9am                       581012 non-null  float64\n",
+      " 7   Hillshade_Noon                      581012 non-null  float64\n",
+      " 8   Hillshade_3pm                       581012 non-null  float64\n",
+      " 9   Horizontal_Distance_To_Fire_Points  581012 non-null  float64\n",
+      " 10  Wilderness_Area_0                   581012 non-null  float64\n",
+      " 11  Wilderness_Area_1                   581012 non-null  float64\n",
+      " 12  Wilderness_Area_2                   581012 non-null  float64\n",
+      " 13  Wilderness_Area_3                   581012 non-null  float64\n",
+      " 14  Soil_Type_0                         581012 non-null  float64\n",
+      " 15  Soil_Type_1                         581012 non-null  float64\n",
+      " 16  Soil_Type_2                         581012 non-null  float64\n",
+      " 17  Soil_Type_3                         581012 non-null  float64\n",
+      " 18  Soil_Type_4                         581012 non-null  float64\n",
+      " 19  Soil_Type_5                         581012 non-null  float64\n",
+      "dtypes: float64(20)\n",
+      "memory usage: 88.7 MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "X.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "022527cd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "f1de7808",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "第 1 次训练...\n",
+      "0.951731022434877\n",
+      "第 2 次训练...\n",
+      "0.952789514900648\n",
+      "第 3 次训练...\n",
+      "0.9518510869003975\n",
+      "第 4 次训练...\n",
+      "0.9518855097158396\n",
+      "第 5 次训练...\n",
+      "0.952023200977608\n",
+      "5折泛化，验证集AC：0.952\n",
+      "Wall time: 1h 48min 7s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "# 5-fold cross-validation of a random forest on X / y: prints the validation\n",
+    "# accuracy of every fold and, at the end, the mean accuracy over all folds.\n",
+    "val_acc_num = 0\n",
+    "for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):\n",
+    "    print(\"第 {} 次训练...\".format(fold_+1))\n",
+    "    # split() yields positional indices, so rows are selected by position (.iloc)\n",
+    "    train_x, trai_y = X.iloc[trn_idx], y.iloc[trn_idx]\n",
+    "    vali_x, vali_y = X.iloc[val_idx], y.iloc[val_idx]\n",
+    "\n",
+    "    # A fixed random_state makes runs comparable, so a change in accuracy can be\n",
+    "    # attributed to the data and not to the forest's own randomness.\n",
+    "    # n_jobs=-1 builds the trees on all available cores.\n",
+    "    random_forest = RandomForestClassifier(n_estimators=1000, oob_score=True,\n",
+    "                                           n_jobs=-1, random_state=42)\n",
+    "    # .values.ravel() turns the one-column DataFrame into a 1-D array, which avoids the warning fit() raised\n",
+    "    random_forest.fit(train_x, trai_y.values.ravel())\n",
+    "\n",
+    "    # =============== validation-set accuracy ===================\n",
+    "    pred_y = random_forest.predict(vali_x)\n",
+    "    fold_acc = accuracy_score(vali_y.values.ravel(), pred_y)  # (y_true, y_pred)\n",
+    "    print(fold_acc)\n",
+    "    val_acc_num += fold_acc\n",
+    "\n",
+    "# average over the number of folds actually configured instead of a hard-coded 5\n",
+    "print(\"5折泛化，验证集ACC：{0:.7f}\".format(val_acc_num / folds.get_n_splits()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5577a53b",
+   "metadata": {},
+   "source": [
+    "上面保存的输出只显示了3位小数；5折准确率的均值为：0.952056"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "9f4453ec",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 581012 entries, 0 to 581011\n",
+      "Data columns (total 20 columns):\n",
+      " #   Column                              Non-Null Count   Dtype  \n",
+      "---  ------                              --------------   -----  \n",
+      " 0   Elevation                           581012 non-null  float32\n",
+      " 1   Aspect                              581012 non-null  float32\n",
+      " 2   Slope                               581012 non-null  float32\n",
+      " 3   Horizontal_Distance_To_Hydrology    581012 non-null  float32\n",
+      " 4   Vertical_Distance_To_Hydrology      581012 non-null  float32\n",
+      " 5   Horizontal_Distance_To_Roadways     581012 non-null  float32\n",
+      " 6   Hillshade_9am                       581012 non-null  float32\n",
+      " 7   Hillshade_Noon                      581012 non-null  float32\n",
+      " 8   Hillshade_3pm                       581012 non-null  float32\n",
+      " 9   Horizontal_Distance_To_Fire_Points  581012 non-null  float32\n",
+      " 10  Wilderness_Area_0                   581012 non-null  float32\n",
+      " 11  Wilderness_Area_1                   581012 non-null  float32\n",
+      " 12  Wilderness_Area_2                   581012 non-null  float32\n",
+      " 13  Wilderness_Area_3                   581012 non-null  float32\n",
+      " 14  Soil_Type_0                         581012 non-null  float32\n",
+      " 15  Soil_Type_1                         581012 non-null  float32\n",
+      " 16  Soil_Type_2                         581012 non-null  float32\n",
+      " 17  Soil_Type_3                         581012 non-null  float32\n",
+      " 18  Soil_Type_4                         581012 non-null  float32\n",
+      " 19  Soil_Type_5                         581012 non-null  float32\n",
+      "dtypes: float32(20)\n",
+      "memory usage: 44.3 MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Reduce the memory usage of X by narrowing 64-bit columns to 32 bits\n",
+    "for col in X.columns:\n",
+    "    if X[col].dtype == 'float64':\n",
+    "        X[col] = X[col].astype('float32')\n",
+    "    elif X[col].dtype == 'int64':\n",
+    "        # fixed: .astype used to be called on the list [col] instead of on the column X[col]\n",
+    "        X[col] = X[col].astype('int32')\n",
+    "X.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "56e985d4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "第 1 次训练...\n",
+      "0.9512577127957109\n",
+      "第 2 次训练...\n",
+      "0.9529013880880872\n",
+      "第 3 次训练...\n",
+      "0.9521092580162132\n",
+      "第 4 次训练...\n",
+      "0.9516961842309083\n",
+      "第 5 次训练...\n",
+      "0.9520145952737474\n",
+      "5折泛化，验证集AC：0.952\n",
+      "Wall time: 1h 57min 11s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "# 5-fold cross-validation of a random forest on X / y: prints the validation\n",
+    "# accuracy of every fold and, at the end, the mean accuracy over all folds.\n",
+    "val_acc_num = 0\n",
+    "for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):\n",
+    "    print(\"第 {} 次训练...\".format(fold_+1))\n",
+    "    # split() yields positional indices, so rows are selected by position (.iloc)\n",
+    "    train_x, trai_y = X.iloc[trn_idx], y.iloc[trn_idx]\n",
+    "    vali_x, vali_y = X.iloc[val_idx], y.iloc[val_idx]\n",
+    "\n",
+    "    # A fixed random_state makes runs comparable, so a change in accuracy can be\n",
+    "    # attributed to the data and not to the forest's own randomness.\n",
+    "    # n_jobs=-1 builds the trees on all available cores.\n",
+    "    random_forest = RandomForestClassifier(n_estimators=1000, oob_score=True,\n",
+    "                                           n_jobs=-1, random_state=42)\n",
+    "    # .values.ravel() turns the one-column DataFrame into a 1-D array, which avoids the warning fit() raised\n",
+    "    random_forest.fit(train_x, trai_y.values.ravel())\n",
+    "\n",
+    "    # =============== validation-set accuracy ===================\n",
+    "    pred_y = random_forest.predict(vali_x)\n",
+    "    fold_acc = accuracy_score(vali_y.values.ravel(), pred_y)  # (y_true, y_pred)\n",
+    "    print(fold_acc)\n",
+    "    val_acc_num += fold_acc\n",
+    "\n",
+    "# average over the number of folds actually configured instead of a hard-coded 5\n",
+    "print(\"5折泛化，验证集ACC：{0:.7f}\".format(val_acc_num / folds.get_n_splits()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6dc51d51",
+   "metadata": {},
+   "source": [
+    "5折准确率的均值为：0.9519958，与上方使用float64时的0.952056仅相差0.0000602。我测试了三轮，每轮的差异都是这种量级的微小波动，由此说明把float64降为float32以减少内存基本不影响结果。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "17fd117b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 封装好的代码，原文链接：https://blog.csdn.net/wushaowu2014/article/details/86561141\n",
+    "def reduce_mem_usage(df):\n",
+    "    \"\"\" iterate through all the columns of a dataframe and modify the data type\n",
+    "        to reduce memory usage.        \n",
+    "    \"\"\"\n",
+    "    start_mem = df.memory_usage().sum() / 1024**2\n",
+    "    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))\n",
+    "    \n",
+    "    for col in df.columns:\n",
+    "        col_type = df[col].dtype\n",
+    "        \n",
+    "        if col_type != object:\n",
+    "            c_min = df[col].min()\n",
+    "            c_max = df[col].max()\n",
+    "            if str(col_type)[:3] == 'int':\n",
+    "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
+    "                    df[col] = df[col].astype(np.int8)\n",
+    "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
+    "                    df[col] = df[col].astype(np.int16)\n",
+    "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
+    "                    df[col] = df[col].astype(np.int32)\n",
+    "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
+    "                    df[col] = df[col].astype(np.int64)  \n",
+    "            else:\n",
+    "                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
+    "                    df[col] = df[col].astype(np.float16)\n",
+    "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
+    "                    df[col] = df[col].astype(np.float32)\n",
+    "                else:\n",
+    "                    df[col] = df[col].astype(np.float64)\n",
+    "        else:\n",
+    "            df[col] = df[col].astype('category')\n",
+    " \n",
+    "    end_mem = df.memory_usage().sum() / 1024**2\n",
+    "    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))\n",
+    "    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))\n",
+    "    \n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "690184eb",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Memory usage of dataframe is 88.66 MB\n",
+      "Memory usage after optimization is: 22.16 MB\n",
+      "Decreased by 75.0%\n"
+     ]
+    }
+   ],
+   "source": [
+    "X = reduce_mem_usage(X)"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "f72a6efa",