From 89af7090c4b2c804f41ee35cf0f4105145755936 Mon Sep 17 00:00:00 2001 From: benjas <909336740@qq.com> Date: Thu, 2 Sep 2021 21:04:13 +0800 Subject: [PATCH] Add. reduce_mem_usage --- ...re Engineering Techniques-checkpoint.ipynb | 336 ++++++++++++++++++ .../Feature Engineering Techniques.ipynb | 336 ++++++++++++++++++ 2 files changed, 672 insertions(+) diff --git a/竞赛优胜技巧/.ipynb_checkpoints/Feature Engineering Techniques-checkpoint.ipynb b/竞赛优胜技巧/.ipynb_checkpoints/Feature Engineering Techniques-checkpoint.ipynb index fbcb05b..0d17fcb 100644 --- a/竞赛优胜技巧/.ipynb_checkpoints/Feature Engineering Techniques-checkpoint.ipynb +++ b/竞赛优胜技巧/.ipynb_checkpoints/Feature Engineering Techniques-checkpoint.ipynb @@ -241,6 +241,342 @@ " if df[col].dtype=='int64': df[col] = df[col].astype('int32')" ] }, + { + "cell_type": "markdown", + "id": "35181a29", + "metadata": {}, + "source": [ + "### 测试修改数据大小后,结果会不会发生变化" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4cbbf955", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import time\n", + "\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "from sklearn.model_selection import StratifiedKFold, train_test_split\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.preprocessing import OrdinalEncoder" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "6aaae6e5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "七分类任务,处理前: [1 2 3 4 5 6 7]\n", + "[5 5 2 ... 3 3 3]\n", + "七分类任务,处理后: [0. 1. 2. 3. 4. 5. 6.]\n", + "[4. 4. 1. ... 2. 2. 
2.]\n" + ] + } + ], + "source": [ + "from sklearn.datasets import fetch_covtype\n", + "data = fetch_covtype() # 森林植被类型\n", + "# 预处理\n", + "X, y = data['data'], data['target']\n", + "# 由于模型标签需要从0开始,所以数字需要全部减1\n", + "print('七分类任务,处理前:',np.unique(y))\n", + "print(y)\n", + "ord = OrdinalEncoder()\n", + "y = ord.fit_transform(y.reshape(-1, 1))\n", + "y = y.reshape(-1, )\n", + "print('七分类任务,处理后:',np.unique(y))\n", + "print(y)\n", + "\n", + "X = pd.DataFrame(X,columns=data.feature_names)\n", + "X = X.iloc[:,:20] # 数据集过大,这里仅用前20列做演示\n", + "\n", + "y = pd.DataFrame(y, columns=data.target_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "816e36fd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 581012 entries, 0 to 581011\n", + "Data columns (total 20 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Elevation 581012 non-null float64\n", + " 1 Aspect 581012 non-null float64\n", + " 2 Slope 581012 non-null float64\n", + " 3 Horizontal_Distance_To_Hydrology 581012 non-null float64\n", + " 4 Vertical_Distance_To_Hydrology 581012 non-null float64\n", + " 5 Horizontal_Distance_To_Roadways 581012 non-null float64\n", + " 6 Hillshade_9am 581012 non-null float64\n", + " 7 Hillshade_Noon 581012 non-null float64\n", + " 8 Hillshade_3pm 581012 non-null float64\n", + " 9 Horizontal_Distance_To_Fire_Points 581012 non-null float64\n", + " 10 Wilderness_Area_0 581012 non-null float64\n", + " 11 Wilderness_Area_1 581012 non-null float64\n", + " 12 Wilderness_Area_2 581012 non-null float64\n", + " 13 Wilderness_Area_3 581012 non-null float64\n", + " 14 Soil_Type_0 581012 non-null float64\n", + " 15 Soil_Type_1 581012 non-null float64\n", + " 16 Soil_Type_2 581012 non-null float64\n", + " 17 Soil_Type_3 581012 non-null float64\n", + " 18 Soil_Type_4 581012 non-null float64\n", + " 19 Soil_Type_5 581012 non-null float64\n", + "dtypes: 
float64(20)\n", + "memory usage: 88.7 MB\n" + ] + } + ], + "source": [ + "X.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "022527cd", + "metadata": {}, + "outputs": [], + "source": [ + "folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "f1de7808", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "第 1 次训练...\n", + "0.951731022434877\n", + "第 2 次训练...\n", + "0.952789514900648\n", + "第 3 次训练...\n", + "0.9518510869003975\n", + "第 4 次训练...\n", + "0.9518855097158396\n", + "第 5 次训练...\n", + "0.952023200977608\n", + "5折泛化,验证集AC:0.952\n", + "Wall time: 1h 48min 7s\n" + ] + } + ], + "source": [ + "%%time\n", + "val_acc_num=0\n", + "for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):\n", + " print(\"第 {} 次训练...\".format(fold_+1))\n", + " train_x, trai_y = X.loc[trn_idx], y.loc[trn_idx]\n", + " vali_x, vali_y = X.loc[val_idx], y.loc[val_idx]\n", + "\n", + " random_forest = RandomForestClassifier(n_estimators=1000,oob_score=True)\n", + " random_forest.fit(train_x, trai_y.values.ravel()) # .values.ravel()是把DF格式变成1-D数组,上面出现的警告用这个解决\n", + "\n", + " # ===============验证集ACC操作===================\n", + " pred_y = random_forest.predict(vali_x)\n", + " print(accuracy_score(pred_y,vali_y.values.ravel()))\n", + " val_acc_num += accuracy_score(pred_y,vali_y.values.ravel())\n", + " \n", + "print(\"5折泛化,验证集ACC:{0:.7f}\".format(val_acc_num/5))" + ] + }, + { + "cell_type": "markdown", + "id": "5577a53b", + "metadata": {}, + "source": [ + "上面输出的是3位小数,5折加起来的均值为:0.952056" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "9f4453ec", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 581012 entries, 0 to 581011\n", + "Data columns (total 20 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 
Elevation 581012 non-null float32\n", + " 1 Aspect 581012 non-null float32\n", + " 2 Slope 581012 non-null float32\n", + " 3 Horizontal_Distance_To_Hydrology 581012 non-null float32\n", + " 4 Vertical_Distance_To_Hydrology 581012 non-null float32\n", + " 5 Horizontal_Distance_To_Roadways 581012 non-null float32\n", + " 6 Hillshade_9am 581012 non-null float32\n", + " 7 Hillshade_Noon 581012 non-null float32\n", + " 8 Hillshade_3pm 581012 non-null float32\n", + " 9 Horizontal_Distance_To_Fire_Points 581012 non-null float32\n", + " 10 Wilderness_Area_0 581012 non-null float32\n", + " 11 Wilderness_Area_1 581012 non-null float32\n", + " 12 Wilderness_Area_2 581012 non-null float32\n", + " 13 Wilderness_Area_3 581012 non-null float32\n", + " 14 Soil_Type_0 581012 non-null float32\n", + " 15 Soil_Type_1 581012 non-null float32\n", + " 16 Soil_Type_2 581012 non-null float32\n", + " 17 Soil_Type_3 581012 non-null float32\n", + " 18 Soil_Type_4 581012 non-null float32\n", + " 19 Soil_Type_5 581012 non-null float32\n", + "dtypes: float32(20)\n", + "memory usage: 44.3 MB\n" + ] + } + ], + "source": [ + "# 优化X内存使用率\n", + "for col in X.columns:\n", + " if X[col].dtype=='float64': X[col] = X[col].astype('float32')\n", + " if X[col].dtype=='int64': X[col] = X[col].astype('int32')\n", + "X.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "56e985d4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "第 1 次训练...\n", + "0.9512577127957109\n", + "第 2 次训练...\n", + "0.9529013880880872\n", + "第 3 次训练...\n", + "0.9521092580162132\n", + "第 4 次训练...\n", + "0.9516961842309083\n", + "第 5 次训练...\n", + "0.9520145952737474\n", + "5折泛化,验证集AC:0.952\n", + "Wall time: 1h 57min 11s\n" + ] + } + ], + "source": [ + "%%time\n", + "val_acc_num=0\n", + "for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):\n", + " print(\"第 {} 次训练...\".format(fold_+1))\n", + " train_x, trai_y = X.loc[trn_idx], y.loc[trn_idx]\n", + " 
vali_x, vali_y = X.loc[val_idx], y.loc[val_idx]\n", + "\n", + " random_forest = RandomForestClassifier(n_estimators=1000,oob_score=True)\n", + " random_forest.fit(train_x, trai_y.values.ravel()) # .values.ravel()是把DF格式变成1-D数组,上面出现的警告用这个解决\n", + "\n", + " # ===============验证集ACC操作===================\n", + " pred_y = random_forest.predict(vali_x)\n", + " print(accuracy_score(pred_y,vali_y.values.ravel()))\n", + " val_acc_num += accuracy_score(pred_y,vali_y.values.ravel())\n", + " \n", + "print(\"5折泛化,验证集ACC:{0:.7f}\".format(val_acc_num/5))" + ] + }, + { + "cell_type": "markdown", + "id": "6dc51d51", + "metadata": {}, + "source": [ + "5折加起来的均值为:0.9519958,和上方的0.952056,仅差0.0000602,我测试了三轮,结果是接近一致的波动,由此证明减少内存是不影响结果的。" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "17fd117b", + "metadata": {}, + "outputs": [], + "source": [ + "# 封装好的代码,原文链接:https://blog.csdn.net/wushaowu2014/article/details/86561141\n", + "def reduce_mem_usage(df):\n", + " \"\"\" iterate through all the columns of a dataframe and modify the data type\n", + " to reduce memory usage. 
\n", + " \"\"\"\n", + " start_mem = df.memory_usage().sum() / 1024**2\n", + " print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))\n", + " \n", + " for col in df.columns:\n", + " col_type = df[col].dtype\n", + " \n", + " if col_type != object:\n", + " c_min = df[col].min()\n", + " c_max = df[col].max()\n", + " if str(col_type)[:3] == 'int':\n", + " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n", + " df[col] = df[col].astype(np.int8)\n", + " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n", + " df[col] = df[col].astype(np.int16)\n", + " elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n", + " df[col] = df[col].astype(np.int32)\n", + " elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n", + " df[col] = df[col].astype(np.int64) \n", + " else:\n", + " if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n", + " df[col] = df[col].astype(np.float16)\n", + " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n", + " df[col] = df[col].astype(np.float32)\n", + " else:\n", + " df[col] = df[col].astype(np.float64)\n", + " else:\n", + " df[col] = df[col].astype('category')\n", + " \n", + " end_mem = df.memory_usage().sum() / 1024**2\n", + " print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))\n", + " print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))\n", + " \n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "690184eb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Memory usage of dataframe is 88.66 MB\n", + "Memory usage after optimization is: 22.16 MB\n", + "Decreased by 75.0%\n" + ] + } + ], + "source": [ + "X = reduce_mem_usage(X)" + ] + }, { "cell_type": "markdown", "id": "f72a6efa", diff --git a/竞赛优胜技巧/Feature Engineering Techniques.ipynb b/竞赛优胜技巧/Feature Engineering Techniques.ipynb 
index fbcb05b..0d17fcb 100644 --- a/竞赛优胜技巧/Feature Engineering Techniques.ipynb +++ b/竞赛优胜技巧/Feature Engineering Techniques.ipynb @@ -241,6 +241,342 @@ " if df[col].dtype=='int64': df[col] = df[col].astype('int32')" ] }, + { + "cell_type": "markdown", + "id": "35181a29", + "metadata": {}, + "source": [ + "### 测试修改数据大小后,结果会不会发生变化" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4cbbf955", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import time\n", + "\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "from sklearn.model_selection import StratifiedKFold, train_test_split\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.preprocessing import OrdinalEncoder" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "6aaae6e5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "七分类任务,处理前: [1 2 3 4 5 6 7]\n", + "[5 5 2 ... 3 3 3]\n", + "七分类任务,处理后: [0. 1. 2. 3. 4. 5. 6.]\n", + "[4. 4. 1. ... 2. 2. 
2.]\n" + ] + } + ], + "source": [ + "from sklearn.datasets import fetch_covtype\n", + "data = fetch_covtype() # 森林植被类型\n", + "# 预处理\n", + "X, y = data['data'], data['target']\n", + "# 由于模型标签需要从0开始,所以数字需要全部减1\n", + "print('七分类任务,处理前:',np.unique(y))\n", + "print(y)\n", + "ord = OrdinalEncoder()\n", + "y = ord.fit_transform(y.reshape(-1, 1))\n", + "y = y.reshape(-1, )\n", + "print('七分类任务,处理后:',np.unique(y))\n", + "print(y)\n", + "\n", + "X = pd.DataFrame(X,columns=data.feature_names)\n", + "X = X.iloc[:,:20] # 数据集过大,这里仅用前20列做演示\n", + "\n", + "y = pd.DataFrame(y, columns=data.target_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "816e36fd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 581012 entries, 0 to 581011\n", + "Data columns (total 20 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Elevation 581012 non-null float64\n", + " 1 Aspect 581012 non-null float64\n", + " 2 Slope 581012 non-null float64\n", + " 3 Horizontal_Distance_To_Hydrology 581012 non-null float64\n", + " 4 Vertical_Distance_To_Hydrology 581012 non-null float64\n", + " 5 Horizontal_Distance_To_Roadways 581012 non-null float64\n", + " 6 Hillshade_9am 581012 non-null float64\n", + " 7 Hillshade_Noon 581012 non-null float64\n", + " 8 Hillshade_3pm 581012 non-null float64\n", + " 9 Horizontal_Distance_To_Fire_Points 581012 non-null float64\n", + " 10 Wilderness_Area_0 581012 non-null float64\n", + " 11 Wilderness_Area_1 581012 non-null float64\n", + " 12 Wilderness_Area_2 581012 non-null float64\n", + " 13 Wilderness_Area_3 581012 non-null float64\n", + " 14 Soil_Type_0 581012 non-null float64\n", + " 15 Soil_Type_1 581012 non-null float64\n", + " 16 Soil_Type_2 581012 non-null float64\n", + " 17 Soil_Type_3 581012 non-null float64\n", + " 18 Soil_Type_4 581012 non-null float64\n", + " 19 Soil_Type_5 581012 non-null float64\n", + "dtypes: 
float64(20)\n", + "memory usage: 88.7 MB\n" + ] + } + ], + "source": [ + "X.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "022527cd", + "metadata": {}, + "outputs": [], + "source": [ + "folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "f1de7808", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "第 1 次训练...\n", + "0.951731022434877\n", + "第 2 次训练...\n", + "0.952789514900648\n", + "第 3 次训练...\n", + "0.9518510869003975\n", + "第 4 次训练...\n", + "0.9518855097158396\n", + "第 5 次训练...\n", + "0.952023200977608\n", + "5折泛化,验证集AC:0.952\n", + "Wall time: 1h 48min 7s\n" + ] + } + ], + "source": [ + "%%time\n", + "val_acc_num=0\n", + "for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):\n", + " print(\"第 {} 次训练...\".format(fold_+1))\n", + " train_x, trai_y = X.loc[trn_idx], y.loc[trn_idx]\n", + " vali_x, vali_y = X.loc[val_idx], y.loc[val_idx]\n", + "\n", + " random_forest = RandomForestClassifier(n_estimators=1000,oob_score=True)\n", + " random_forest.fit(train_x, trai_y.values.ravel()) # .values.ravel()是把DF格式变成1-D数组,上面出现的警告用这个解决\n", + "\n", + " # ===============验证集ACC操作===================\n", + " pred_y = random_forest.predict(vali_x)\n", + " print(accuracy_score(pred_y,vali_y.values.ravel()))\n", + " val_acc_num += accuracy_score(pred_y,vali_y.values.ravel())\n", + " \n", + "print(\"5折泛化,验证集ACC:{0:.7f}\".format(val_acc_num/5))" + ] + }, + { + "cell_type": "markdown", + "id": "5577a53b", + "metadata": {}, + "source": [ + "上面输出的是3位小数,5折加起来的均值为:0.952056" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "9f4453ec", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 581012 entries, 0 to 581011\n", + "Data columns (total 20 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 
Elevation 581012 non-null float32\n", + " 1 Aspect 581012 non-null float32\n", + " 2 Slope 581012 non-null float32\n", + " 3 Horizontal_Distance_To_Hydrology 581012 non-null float32\n", + " 4 Vertical_Distance_To_Hydrology 581012 non-null float32\n", + " 5 Horizontal_Distance_To_Roadways 581012 non-null float32\n", + " 6 Hillshade_9am 581012 non-null float32\n", + " 7 Hillshade_Noon 581012 non-null float32\n", + " 8 Hillshade_3pm 581012 non-null float32\n", + " 9 Horizontal_Distance_To_Fire_Points 581012 non-null float32\n", + " 10 Wilderness_Area_0 581012 non-null float32\n", + " 11 Wilderness_Area_1 581012 non-null float32\n", + " 12 Wilderness_Area_2 581012 non-null float32\n", + " 13 Wilderness_Area_3 581012 non-null float32\n", + " 14 Soil_Type_0 581012 non-null float32\n", + " 15 Soil_Type_1 581012 non-null float32\n", + " 16 Soil_Type_2 581012 non-null float32\n", + " 17 Soil_Type_3 581012 non-null float32\n", + " 18 Soil_Type_4 581012 non-null float32\n", + " 19 Soil_Type_5 581012 non-null float32\n", + "dtypes: float32(20)\n", + "memory usage: 44.3 MB\n" + ] + } + ], + "source": [ + "# 优化X内存使用率\n", + "for col in X.columns:\n", + " if X[col].dtype=='float64': X[col] = X[col].astype('float32')\n", + " if X[col].dtype=='int64': X[col] = X[col].astype('int32')\n", + "X.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "56e985d4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "第 1 次训练...\n", + "0.9512577127957109\n", + "第 2 次训练...\n", + "0.9529013880880872\n", + "第 3 次训练...\n", + "0.9521092580162132\n", + "第 4 次训练...\n", + "0.9516961842309083\n", + "第 5 次训练...\n", + "0.9520145952737474\n", + "5折泛化,验证集AC:0.952\n", + "Wall time: 1h 57min 11s\n" + ] + } + ], + "source": [ + "%%time\n", + "val_acc_num=0\n", + "for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):\n", + " print(\"第 {} 次训练...\".format(fold_+1))\n", + " train_x, trai_y = X.loc[trn_idx], y.loc[trn_idx]\n", + " 
vali_x, vali_y = X.loc[val_idx], y.loc[val_idx]\n", + "\n", + " random_forest = RandomForestClassifier(n_estimators=1000,oob_score=True)\n", + " random_forest.fit(train_x, trai_y.values.ravel()) # .values.ravel()是把DF格式变成1-D数组,上面出现的警告用这个解决\n", + "\n", + " # ===============验证集ACC操作===================\n", + " pred_y = random_forest.predict(vali_x)\n", + " print(accuracy_score(pred_y,vali_y.values.ravel()))\n", + " val_acc_num += accuracy_score(pred_y,vali_y.values.ravel())\n", + " \n", + "print(\"5折泛化,验证集ACC:{0:.7f}\".format(val_acc_num/5))" + ] + }, + { + "cell_type": "markdown", + "id": "6dc51d51", + "metadata": {}, + "source": [ + "5折加起来的均值为:0.9519958,和上方的0.952056,仅差0.0000602,我测试了三轮,结果是接近一致的波动,由此证明减少内存是不影响结果的。" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "17fd117b", + "metadata": {}, + "outputs": [], + "source": [ + "# 封装好的代码,原文链接:https://blog.csdn.net/wushaowu2014/article/details/86561141\n", + "def reduce_mem_usage(df):\n", + " \"\"\" iterate through all the columns of a dataframe and modify the data type\n", + " to reduce memory usage. 
\n", + " \"\"\"\n", + " start_mem = df.memory_usage().sum() / 1024**2\n", + " print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))\n", + " \n", + " for col in df.columns:\n", + " col_type = df[col].dtype\n", + " \n", + " if col_type != object:\n", + " c_min = df[col].min()\n", + " c_max = df[col].max()\n", + " if str(col_type)[:3] == 'int':\n", + " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n", + " df[col] = df[col].astype(np.int8)\n", + " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n", + " df[col] = df[col].astype(np.int16)\n", + " elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n", + " df[col] = df[col].astype(np.int32)\n", + " elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n", + " df[col] = df[col].astype(np.int64) \n", + " else:\n", + " if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n", + " df[col] = df[col].astype(np.float16)\n", + " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n", + " df[col] = df[col].astype(np.float32)\n", + " else:\n", + " df[col] = df[col].astype(np.float64)\n", + " else:\n", + " df[col] = df[col].astype('category')\n", + " \n", + " end_mem = df.memory_usage().sum() / 1024**2\n", + " print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))\n", + " print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))\n", + " \n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "690184eb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Memory usage of dataframe is 88.66 MB\n", + "Memory usage after optimization is: 22.16 MB\n", + "Decreased by 75.0%\n" + ] + } + ], + "source": [ + "X = reduce_mem_usage(X)" + ] + }, { "cell_type": "markdown", "id": "f72a6efa",