|
|
|
@ -241,6 +241,342 @@
|
|
|
|
|
" if df[col].dtype=='int64': df[col] = df[col].astype('int32')"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "35181a29",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"### 测试降低数据精度(减小内存占用)后,结果会不会发生变化"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 6,
|
|
|
|
|
"id": "4cbbf955",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"import numpy as np\n",
|
|
|
|
|
"import pandas as pd\n",
|
|
|
|
|
"import time\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"from sklearn.model_selection import StratifiedKFold, train_test_split\n",
|
|
|
|
|
"from sklearn.metrics import accuracy_score\n",
|
|
|
|
|
"from sklearn.preprocessing import OrdinalEncoder"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 24,
|
|
|
|
|
"id": "6aaae6e5",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"七分类任务,处理前: [1 2 3 4 5 6 7]\n",
|
|
|
|
|
"[5 5 2 ... 3 3 3]\n",
|
|
|
|
|
"七分类任务,处理后: [0. 1. 2. 3. 4. 5. 6.]\n",
|
|
|
|
|
"[4. 4. 1. ... 2. 2. 2.]\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"from sklearn.datasets import fetch_covtype\n",
|
|
|
|
|
"data = fetch_covtype()  # forest cover type dataset\n",
|
|
|
|
|
"# Preprocessing\n",
|
|
|
|
|
"X, y = data['data'], data['target']\n",
|
|
|
|
|
"# The model needs labels that start at 0, so the raw labels 1..7 are re-encoded as 0..6.\n",
|
|
|
|
|
"print('七分类任务,处理前:',np.unique(y))\n",
|
|
|
|
|
"print(y)\n",
|
|
|
|
|
"# Not named ord, so that the built-in ord() stays usable in later cells.\n",
|
|
|
|
|
"label_encoder = OrdinalEncoder()\n",
|
|
|
|
|
"y = label_encoder.fit_transform(y.reshape(-1, 1))\n",
|
|
|
|
|
"y = y.reshape(-1, )\n",
|
|
|
|
|
"print('七分类任务,处理后:',np.unique(y))\n",
|
|
|
|
|
"print(y)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"X = pd.DataFrame(X,columns=data.feature_names)\n",
|
|
|
|
|
"X = X.iloc[:,:20]  # the dataset is large, so only the first 20 columns are used for this demo\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"y = pd.DataFrame(y, columns=data.target_names)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 25,
|
|
|
|
|
"id": "816e36fd",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|
|
|
|
"RangeIndex: 581012 entries, 0 to 581011\n",
|
|
|
|
|
"Data columns (total 20 columns):\n",
|
|
|
|
|
" # Column Non-Null Count Dtype \n",
|
|
|
|
|
"--- ------ -------------- ----- \n",
|
|
|
|
|
" 0 Elevation 581012 non-null float64\n",
|
|
|
|
|
" 1 Aspect 581012 non-null float64\n",
|
|
|
|
|
" 2 Slope 581012 non-null float64\n",
|
|
|
|
|
" 3 Horizontal_Distance_To_Hydrology 581012 non-null float64\n",
|
|
|
|
|
" 4 Vertical_Distance_To_Hydrology 581012 non-null float64\n",
|
|
|
|
|
" 5 Horizontal_Distance_To_Roadways 581012 non-null float64\n",
|
|
|
|
|
" 6 Hillshade_9am 581012 non-null float64\n",
|
|
|
|
|
" 7 Hillshade_Noon 581012 non-null float64\n",
|
|
|
|
|
" 8 Hillshade_3pm 581012 non-null float64\n",
|
|
|
|
|
" 9 Horizontal_Distance_To_Fire_Points 581012 non-null float64\n",
|
|
|
|
|
" 10 Wilderness_Area_0 581012 non-null float64\n",
|
|
|
|
|
" 11 Wilderness_Area_1 581012 non-null float64\n",
|
|
|
|
|
" 12 Wilderness_Area_2 581012 non-null float64\n",
|
|
|
|
|
" 13 Wilderness_Area_3 581012 non-null float64\n",
|
|
|
|
|
" 14 Soil_Type_0 581012 non-null float64\n",
|
|
|
|
|
" 15 Soil_Type_1 581012 non-null float64\n",
|
|
|
|
|
" 16 Soil_Type_2 581012 non-null float64\n",
|
|
|
|
|
" 17 Soil_Type_3 581012 non-null float64\n",
|
|
|
|
|
" 18 Soil_Type_4 581012 non-null float64\n",
|
|
|
|
|
" 19 Soil_Type_5 581012 non-null float64\n",
|
|
|
|
|
"dtypes: float64(20)\n",
|
|
|
|
|
"memory usage: 88.7 MB\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"X.info()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 20,
|
|
|
|
|
"id": "022527cd",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 21,
|
|
|
|
|
"id": "f1de7808",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"第 1 次训练...\n",
|
|
|
|
|
"0.951731022434877\n",
|
|
|
|
|
"第 2 次训练...\n",
|
|
|
|
|
"0.952789514900648\n",
|
|
|
|
|
"第 3 次训练...\n",
|
|
|
|
|
"0.9518510869003975\n",
|
|
|
|
|
"第 4 次训练...\n",
|
|
|
|
|
"0.9518855097158396\n",
|
|
|
|
|
"第 5 次训练...\n",
|
|
|
|
|
"0.952023200977608\n",
|
|
|
|
|
"5折泛化,验证集AC:0.952\n",
|
|
|
|
|
"Wall time: 1h 48min 7s\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"%%time\n",
|
|
|
|
|
"# 5-fold cross-validation of a random forest: prints each fold's validation accuracy, then the mean.\n",
|
|
|
|
|
"val_acc_num = 0\n",
|
|
|
|
|
"for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):\n",
|
|
|
|
|
"    print(\"第 {} 次训练...\".format(fold_+1))\n",
|
|
|
|
|
"    # split() yields positional row numbers, so rows are selected with iloc.\n",
|
|
|
|
|
"    train_x, train_y = X.iloc[trn_idx], y.iloc[trn_idx]\n",
|
|
|
|
|
"    vali_x, vali_y = X.iloc[val_idx], y.iloc[val_idx]\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"    # A fixed seed makes separate runs comparable; n_jobs=-1 builds the trees on all cores.\n",
|
|
|
|
|
"    random_forest = RandomForestClassifier(n_estimators=1000, oob_score=True, random_state=42, n_jobs=-1)\n",
|
|
|
|
|
"    random_forest.fit(train_x, train_y.values.ravel())  # ravel() turns the one-column frame into a 1-D target\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"    # =============== validation-set accuracy ===============\n",
|
|
|
|
|
"    pred_y = random_forest.predict(vali_x)\n",
|
|
|
|
|
"    fold_acc = accuracy_score(vali_y.values.ravel(), pred_y)  # signature is (y_true, y_pred)\n",
|
|
|
|
|
"    print(fold_acc)\n",
|
|
|
|
|
"    val_acc_num += fold_acc\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"print(\"5折泛化,验证集ACC:{0:.7f}\".format(val_acc_num / folds.get_n_splits()))"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "5577a53b",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"上面的汇总输出只保留了3位小数;5折验证集准确率的均值为:0.952056"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 22,
|
|
|
|
|
"id": "9f4453ec",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|
|
|
|
"RangeIndex: 581012 entries, 0 to 581011\n",
|
|
|
|
|
"Data columns (total 20 columns):\n",
|
|
|
|
|
" # Column Non-Null Count Dtype \n",
|
|
|
|
|
"--- ------ -------------- ----- \n",
|
|
|
|
|
" 0 Elevation 581012 non-null float32\n",
|
|
|
|
|
" 1 Aspect 581012 non-null float32\n",
|
|
|
|
|
" 2 Slope 581012 non-null float32\n",
|
|
|
|
|
" 3 Horizontal_Distance_To_Hydrology 581012 non-null float32\n",
|
|
|
|
|
" 4 Vertical_Distance_To_Hydrology 581012 non-null float32\n",
|
|
|
|
|
" 5 Horizontal_Distance_To_Roadways 581012 non-null float32\n",
|
|
|
|
|
" 6 Hillshade_9am 581012 non-null float32\n",
|
|
|
|
|
" 7 Hillshade_Noon 581012 non-null float32\n",
|
|
|
|
|
" 8 Hillshade_3pm 581012 non-null float32\n",
|
|
|
|
|
" 9 Horizontal_Distance_To_Fire_Points 581012 non-null float32\n",
|
|
|
|
|
" 10 Wilderness_Area_0 581012 non-null float32\n",
|
|
|
|
|
" 11 Wilderness_Area_1 581012 non-null float32\n",
|
|
|
|
|
" 12 Wilderness_Area_2 581012 non-null float32\n",
|
|
|
|
|
" 13 Wilderness_Area_3 581012 non-null float32\n",
|
|
|
|
|
" 14 Soil_Type_0 581012 non-null float32\n",
|
|
|
|
|
" 15 Soil_Type_1 581012 non-null float32\n",
|
|
|
|
|
" 16 Soil_Type_2 581012 non-null float32\n",
|
|
|
|
|
" 17 Soil_Type_3 581012 non-null float32\n",
|
|
|
|
|
" 18 Soil_Type_4 581012 non-null float32\n",
|
|
|
|
|
" 19 Soil_Type_5 581012 non-null float32\n",
|
|
|
|
|
"dtypes: float32(20)\n",
|
|
|
|
|
"memory usage: 44.3 MB\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Shrink X's memory footprint: cast 64-bit float/int columns to their 32-bit counterparts.\n",
|
|
|
|
|
"for col in X.columns:\n",
|
|
|
|
|
"    if X[col].dtype=='float64': X[col] = X[col].astype('float32')\n",
|
|
|
|
|
"    if X[col].dtype=='int64': X[col] = X[col].astype('int32')\n",
|
|
|
|
|
"X.info()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 23,
|
|
|
|
|
"id": "56e985d4",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"第 1 次训练...\n",
|
|
|
|
|
"0.9512577127957109\n",
|
|
|
|
|
"第 2 次训练...\n",
|
|
|
|
|
"0.9529013880880872\n",
|
|
|
|
|
"第 3 次训练...\n",
|
|
|
|
|
"0.9521092580162132\n",
|
|
|
|
|
"第 4 次训练...\n",
|
|
|
|
|
"0.9516961842309083\n",
|
|
|
|
|
"第 5 次训练...\n",
|
|
|
|
|
"0.9520145952737474\n",
|
|
|
|
|
"5折泛化,验证集AC:0.952\n",
|
|
|
|
|
"Wall time: 1h 57min 11s\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"%%time\n",
|
|
|
|
|
"# 5-fold cross-validation of a random forest: prints each fold's validation accuracy, then the mean.\n",
|
|
|
|
|
"val_acc_num = 0\n",
|
|
|
|
|
"for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):\n",
|
|
|
|
|
"    print(\"第 {} 次训练...\".format(fold_+1))\n",
|
|
|
|
|
"    # split() yields positional row numbers, so rows are selected with iloc.\n",
|
|
|
|
|
"    train_x, train_y = X.iloc[trn_idx], y.iloc[trn_idx]\n",
|
|
|
|
|
"    vali_x, vali_y = X.iloc[val_idx], y.iloc[val_idx]\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"    # A fixed seed makes separate runs comparable; n_jobs=-1 builds the trees on all cores.\n",
|
|
|
|
|
"    random_forest = RandomForestClassifier(n_estimators=1000, oob_score=True, random_state=42, n_jobs=-1)\n",
|
|
|
|
|
"    random_forest.fit(train_x, train_y.values.ravel())  # ravel() turns the one-column frame into a 1-D target\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"    # =============== validation-set accuracy ===============\n",
|
|
|
|
|
"    pred_y = random_forest.predict(vali_x)\n",
|
|
|
|
|
"    fold_acc = accuracy_score(vali_y.values.ravel(), pred_y)  # signature is (y_true, y_pred)\n",
|
|
|
|
|
"    print(fold_acc)\n",
|
|
|
|
|
"    val_acc_num += fold_acc\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"print(\"5折泛化,验证集ACC:{0:.7f}\".format(val_acc_num / folds.get_n_splits()))"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "6dc51d51",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"5折验证集准确率的均值为:0.9519958,与上方的0.952056相比仅差0.0000602。我测试了三轮,每轮的差异都在这一量级的随机波动范围内,由此说明降低数据精度、减少内存占用基本不影响结果。"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 26,
|
|
|
|
|
"id": "17fd117b",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Packaged helper, adapted from: https://blog.csdn.net/wushaowu2014/article/details/86561141\n",
|
|
|
|
|
"def reduce_mem_usage(df):\n",
|
|
|
|
|
"    \"\"\"Downcast the columns of df in place to reduce its memory usage.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"    Integer columns are cast to the smallest signed integer type whose range\n",
|
|
|
|
|
"    (bounds included) holds the column's min and max. Float columns are cast\n",
|
|
|
|
|
"    to float16 only when every value survives the round trip unchanged, and\n",
|
|
|
|
|
"    otherwise to float32 when the values fit its range. Object columns become\n",
|
|
|
|
|
"    category. Every other dtype (bool, datetime, category, pandas extension\n",
|
|
|
|
|
"    types) is left untouched, and no column is ever cast to a wider type.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"    Parameters\n",
|
|
|
|
|
"    ----------\n",
|
|
|
|
|
"    df : pandas.DataFrame\n",
|
|
|
|
|
"        Frame to shrink; it is modified in place.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"    Returns\n",
|
|
|
|
|
"    -------\n",
|
|
|
|
|
"    pandas.DataFrame\n",
|
|
|
|
|
"        The same df object that was passed in.\n",
|
|
|
|
|
"    \"\"\"\n",
|
|
|
|
|
"    start_mem = df.memory_usage().sum() / 1024**2\n",
|
|
|
|
|
"    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"    for col in df.columns:\n",
|
|
|
|
|
"        col_type = df[col].dtype\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"        if col_type == object:\n",
|
|
|
|
|
"            df[col] = df[col].astype('category')\n",
|
|
|
|
|
"            continue\n",
|
|
|
|
|
"        # Only plain NumPy integer/float columns are downcast; for other dtypes the\n",
|
|
|
|
|
"        # min()/max() calls or the range checks below would fail or change meaning.\n",
|
|
|
|
|
"        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':\n",
|
|
|
|
|
"            continue\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"        c_min = df[col].min()\n",
|
|
|
|
|
"        c_max = df[col].max()\n",
|
|
|
|
|
"        target = None\n",
|
|
|
|
|
"        if col_type.kind in 'iu':\n",
|
|
|
|
|
"            for candidate in (np.int8, np.int16, np.int32):\n",
|
|
|
|
|
"                limits = np.iinfo(candidate)\n",
|
|
|
|
|
"                if limits.min <= c_min and c_max <= limits.max:\n",
|
|
|
|
|
"                    target = candidate\n",
|
|
|
|
|
"                    break\n",
|
|
|
|
|
"        else:\n",
|
|
|
|
|
"            half, single = np.finfo(np.float16), np.finfo(np.float32)\n",
|
|
|
|
|
"            if half.min <= c_min and c_max <= half.max:\n",
|
|
|
|
|
"                # float16 keeps only about 3 significant digits, so use it only when lossless.\n",
|
|
|
|
|
"                round_trip = df[col].astype(np.float16).astype(col_type)\n",
|
|
|
|
|
"                if round_trip.equals(df[col]):\n",
|
|
|
|
|
"                    target = np.float16\n",
|
|
|
|
|
"            if target is None and single.min <= c_min and c_max <= single.max:\n",
|
|
|
|
|
"                target = np.float32\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"        # Never widen a column (a uint8 column would otherwise become int16).\n",
|
|
|
|
|
"        if target is not None and np.dtype(target).itemsize < col_type.itemsize:\n",
|
|
|
|
|
"            df[col] = df[col].astype(target)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"    end_mem = df.memory_usage().sum() / 1024**2\n",
|
|
|
|
|
"    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))\n",
|
|
|
|
|
"    # Guard the percentage against a frame that reports zero bytes.\n",
|
|
|
|
|
"    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0\n",
|
|
|
|
|
"    print('Decreased by {:.1f}%'.format(decrease))\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"    return df"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 27,
|
|
|
|
|
"id": "690184eb",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Memory usage of dataframe is 88.66 MB\n",
|
|
|
|
|
"Memory usage after optimization is: 22.16 MB\n",
|
|
|
|
|
"Decreased by 75.0%\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"X = reduce_mem_usage(X)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "f72a6efa",
|
|
|
|
|