{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 任务目标:利用异烟酸生产过程中的各参数,预测最终异烟酸的收率\n",
"
\n",
" - 数据集包括工程中10个步骤的参数,样本id、A1-A28、B1-B14包括原料、辅料、时间、温度、压强以及收率。\n",
"
- 冠军ATCG解决方案\n",
"
\n",
"\n",
"**预测具体的值:回归任务**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 数据处理"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import warnings\n",
"import xgboost as xgb\n",
"from sklearn.model_selection import KFold\n",
"from sklearn.metrics import mean_squared_error as mse\n",
"\n",
"warnings.simplefilter('ignore')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the training set and both test sets; every file is read with the GB2312 codec\n",
"df_trn, df_tst_a, df_tst_b = (\n",
"    pd.read_csv(_path, encoding='GB2312')\n",
"    for _path in (\n",
"        'jinnan_round1_train_20181227.csv',\n",
"        'jinnan_round1_testA_20181227.csv',\n",
"        'jinnan_round1_testB_20190121.csv',\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 样本id | \n",
" A1 | \n",
" A2 | \n",
" A3 | \n",
" A4 | \n",
" A5 | \n",
" A6 | \n",
" A7 | \n",
" A8 | \n",
" A9 | \n",
" ... | \n",
" B6 | \n",
" B7 | \n",
" B8 | \n",
" B9 | \n",
" B10 | \n",
" B11 | \n",
" B12 | \n",
" B13 | \n",
" B14 | \n",
" 收率 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" sample_1528 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 13:30:00 | \n",
" 38.0 | \n",
" NaN | \n",
" NaN | \n",
" 15:30:00 | \n",
" ... | \n",
" 65 | \n",
" 11:30:00 | \n",
" 45.0 | \n",
" 11:30-13:00 | \n",
" 14:00-15:30 | \n",
" NaN | \n",
" 800.0 | \n",
" 0.15 | \n",
" 400 | \n",
" 0.879 | \n",
"
\n",
" \n",
" 1 | \n",
" sample_1698 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 14:00:00 | \n",
" 29.0 | \n",
" NaN | \n",
" NaN | \n",
" 16:00:00 | \n",
" ... | \n",
" 80 | \n",
" 6:00:00 | \n",
" 45.0 | \n",
" 6:00-7:30 | \n",
" 7:30-9:00 | \n",
" 9:00-10:00 | \n",
" 1200.0 | \n",
" 0.15 | \n",
" 400 | \n",
" 0.902 | \n",
"
\n",
" \n",
" 2 | \n",
" sample_639 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 14:00:00 | \n",
" 29.0 | \n",
" NaN | \n",
" NaN | \n",
" 16:00:00 | \n",
" ... | \n",
" 80 | \n",
" 1:00:00 | \n",
" 45.0 | \n",
" 1:00-2:30 | \n",
" 2:30-4:00 | \n",
" 4:00-5:00 | \n",
" 1200.0 | \n",
" 0.15 | \n",
" 400 | \n",
" 0.936 | \n",
"
\n",
" \n",
" 3 | \n",
" sample_483 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 1:30:00 | \n",
" 38.0 | \n",
" NaN | \n",
" NaN | \n",
" 3:00:00 | \n",
" ... | \n",
" 65 | \n",
" 18:00:00 | \n",
" 45.0 | \n",
" 19:00-20:30 | \n",
" 21:30-23:00 | \n",
" NaN | \n",
" 800.0 | \n",
" 0.15 | \n",
" 400 | \n",
" 0.902 | \n",
"
\n",
" \n",
" 4 | \n",
" sample_617 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 22:00:00 | \n",
" 29.0 | \n",
" NaN | \n",
" NaN | \n",
" 0:00:00 | \n",
" ... | \n",
" 80 | \n",
" 9:00:00 | \n",
" 45.0 | \n",
" 9:00-10:30 | \n",
" 10:30-12:00 | \n",
" 12:00-13:00 | \n",
" 1200.0 | \n",
" 0.15 | \n",
" 420 | \n",
" 0.983 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 44 columns
\n",
"
"
],
"text/plain": [
" 样本id A1 A2 A3 A4 A5 A6 A7 A8 A9 ... \\\n",
"0 sample_1528 300 NaN 405.0 700 13:30:00 38.0 NaN NaN 15:30:00 ... \n",
"1 sample_1698 300 NaN 405.0 700 14:00:00 29.0 NaN NaN 16:00:00 ... \n",
"2 sample_639 300 NaN 405.0 700 14:00:00 29.0 NaN NaN 16:00:00 ... \n",
"3 sample_483 300 NaN 405.0 700 1:30:00 38.0 NaN NaN 3:00:00 ... \n",
"4 sample_617 300 NaN 405.0 700 22:00:00 29.0 NaN NaN 0:00:00 ... \n",
"\n",
" B6 B7 B8 B9 B10 B11 B12 B13 \\\n",
"0 65 11:30:00 45.0 11:30-13:00 14:00-15:30 NaN 800.0 0.15 \n",
"1 80 6:00:00 45.0 6:00-7:30 7:30-9:00 9:00-10:00 1200.0 0.15 \n",
"2 80 1:00:00 45.0 1:00-2:30 2:30-4:00 4:00-5:00 1200.0 0.15 \n",
"3 65 18:00:00 45.0 19:00-20:30 21:30-23:00 NaN 800.0 0.15 \n",
"4 80 9:00:00 45.0 9:00-10:30 10:30-12:00 12:00-13:00 1200.0 0.15 \n",
"\n",
" B14 收率 \n",
"0 400 0.879 \n",
"1 400 0.902 \n",
"2 400 0.936 \n",
"3 400 0.902 \n",
"4 420 0.983 \n",
"\n",
"[5 rows x 44 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first rows of the training data\n",
"df_trn.head()  # columns such as A2 and A7 show NaN, i.e. missing values"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 1396 entries, 0 to 1395\n",
"Data columns (total 44 columns):\n",
"样本id 1396 non-null object\n",
"A1 1396 non-null int64\n",
"A2 42 non-null float64\n",
"A3 1354 non-null float64\n",
"A4 1396 non-null int64\n",
"A5 1396 non-null object\n",
"A6 1396 non-null float64\n",
"A7 149 non-null object\n",
"A8 149 non-null float64\n",
"A9 1396 non-null object\n",
"A10 1396 non-null int64\n",
"A11 1396 non-null object\n",
"A12 1396 non-null int64\n",
"A13 1396 non-null float64\n",
"A14 1396 non-null object\n",
"A15 1396 non-null float64\n",
"A16 1396 non-null object\n",
"A17 1396 non-null float64\n",
"A18 1396 non-null float64\n",
"A19 1396 non-null int64\n",
"A20 1396 non-null object\n",
"A21 1393 non-null float64\n",
"A22 1396 non-null float64\n",
"A23 1393 non-null float64\n",
"A24 1395 non-null object\n",
"A25 1396 non-null object\n",
"A26 1394 non-null object\n",
"A27 1396 non-null int64\n",
"A28 1396 non-null object\n",
"B1 1386 non-null float64\n",
"B2 1394 non-null float64\n",
"B3 1394 non-null float64\n",
"B4 1396 non-null object\n",
"B5 1395 non-null object\n",
"B6 1396 non-null int64\n",
"B7 1396 non-null object\n",
"B8 1395 non-null float64\n",
"B9 1396 non-null object\n",
"B10 1152 non-null object\n",
"B11 547 non-null object\n",
"B12 1395 non-null float64\n",
"B13 1395 non-null float64\n",
"B14 1396 non-null int64\n",
"收率 1396 non-null float64\n",
"dtypes: float64(18), int64(8), object(18)\n",
"memory usage: 480.0+ KB\n"
]
}
],
"source": [
"# Print each column's non-null count and dtype\n",
"df_trn.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 如何确定字段需要处理\n",
"我们需要解决一些异常值,如某值相对其它值过大的离群点"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A1 | \n",
" A2 | \n",
" A3 | \n",
" A4 | \n",
" A6 | \n",
" A8 | \n",
" A10 | \n",
" A12 | \n",
" A13 | \n",
" A15 | \n",
" A17 | \n",
" A18 | \n",
" A19 | \n",
" A21 | \n",
" A22 | \n",
" A23 | \n",
" A27 | \n",
" B1 | \n",
" B2 | \n",
" B3 | \n",
" B6 | \n",
" B8 | \n",
" B12 | \n",
" B13 | \n",
" B14 | \n",
" 收率 | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 1396.000000 | \n",
" 42.0 | \n",
" 1354.000000 | \n",
" 1396.000000 | \n",
" 1396.000000 | \n",
" 149.000000 | \n",
" 1396.000000 | \n",
" 1396.000000 | \n",
" 1396.000000 | \n",
" 1396.000000 | \n",
" 1396.000000 | \n",
" 1396.000000 | \n",
" 1396.000000 | \n",
" 1393.000000 | \n",
" 1396.000000 | \n",
" 1393.000000 | \n",
" 1396.000000 | \n",
" 1386.000000 | \n",
" 1394.000000 | \n",
" 1394.000000 | \n",
" 1396.000000 | \n",
" 1395.000000 | \n",
" 1395.000000 | \n",
" 1395.000000 | \n",
" 1396.000000 | \n",
" 1396.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 298.853868 | \n",
" 125.0 | \n",
" 403.515510 | \n",
" 705.974212 | \n",
" 28.287751 | \n",
" 78.818792 | \n",
" 100.861032 | \n",
" 102.641834 | \n",
" 0.199907 | \n",
" 103.829370 | \n",
" 104.766905 | \n",
" 0.199928 | \n",
" 231.067335 | \n",
" 48.707825 | \n",
" 9.117120 | \n",
" 5.002872 | \n",
" 74.396848 | \n",
" 334.452742 | \n",
" 3.454412 | \n",
" 3.500072 | \n",
" 72.065186 | \n",
" 43.709677 | \n",
" 1020.215054 | \n",
" 0.149419 | \n",
" 410.403295 | \n",
" 0.923244 | \n",
"
\n",
" \n",
" std | \n",
" 10.130552 | \n",
" 0.0 | \n",
" 13.348093 | \n",
" 53.214754 | \n",
" 6.742765 | \n",
" 2.683920 | \n",
" 0.905198 | \n",
" 0.915387 | \n",
" 0.002524 | \n",
" 0.963639 | \n",
" 1.401446 | \n",
" 0.002676 | \n",
" 50.478071 | \n",
" 4.976531 | \n",
" 0.369152 | \n",
" 0.136638 | \n",
" 3.044490 | \n",
" 105.120753 | \n",
" 0.388585 | \n",
" 0.002678 | \n",
" 9.161986 | \n",
" 4.338396 | \n",
" 205.920155 | \n",
" 0.008213 | \n",
" 26.018410 | \n",
" 0.030880 | \n",
"
\n",
" \n",
" min | \n",
" 200.000000 | \n",
" 125.0 | \n",
" 270.000000 | \n",
" 470.000000 | \n",
" 17.000000 | \n",
" 70.000000 | \n",
" 100.000000 | \n",
" 98.000000 | \n",
" 0.120000 | \n",
" 100.000000 | \n",
" 89.000000 | \n",
" 0.100000 | \n",
" 100.000000 | \n",
" 20.000000 | \n",
" 3.500000 | \n",
" 4.000000 | \n",
" 45.000000 | \n",
" 3.500000 | \n",
" 0.150000 | \n",
" 3.500000 | \n",
" 40.000000 | \n",
" 20.000000 | \n",
" 400.000000 | \n",
" 0.030000 | \n",
" 40.000000 | \n",
" 0.624000 | \n",
"
\n",
" \n",
" 25% | \n",
" 300.000000 | \n",
" 125.0 | \n",
" 405.000000 | \n",
" 700.000000 | \n",
" 24.000000 | \n",
" 80.000000 | \n",
" 100.000000 | \n",
" 102.000000 | \n",
" 0.200000 | \n",
" 103.000000 | \n",
" 104.000000 | \n",
" 0.200000 | \n",
" 200.000000 | \n",
" 50.000000 | \n",
" 9.000000 | \n",
" 5.000000 | \n",
" 73.000000 | \n",
" 320.000000 | \n",
" 3.500000 | \n",
" 3.500000 | \n",
" 65.000000 | \n",
" 45.000000 | \n",
" 800.000000 | \n",
" 0.150000 | \n",
" 400.000000 | \n",
" 0.902000 | \n",
"
\n",
" \n",
" 50% | \n",
" 300.000000 | \n",
" 125.0 | \n",
" 405.000000 | \n",
" 700.000000 | \n",
" 29.000000 | \n",
" 80.000000 | \n",
" 101.000000 | \n",
" 103.000000 | \n",
" 0.200000 | \n",
" 104.000000 | \n",
" 105.000000 | \n",
" 0.200000 | \n",
" 200.000000 | \n",
" 50.000000 | \n",
" 9.000000 | \n",
" 5.000000 | \n",
" 73.000000 | \n",
" 320.000000 | \n",
" 3.500000 | \n",
" 3.500000 | \n",
" 78.000000 | \n",
" 45.000000 | \n",
" 1200.000000 | \n",
" 0.150000 | \n",
" 400.000000 | \n",
" 0.925000 | \n",
"
\n",
" \n",
" 75% | \n",
" 300.000000 | \n",
" 125.0 | \n",
" 405.000000 | \n",
" 700.000000 | \n",
" 30.000000 | \n",
" 80.000000 | \n",
" 102.000000 | \n",
" 103.000000 | \n",
" 0.200000 | \n",
" 104.000000 | \n",
" 105.000000 | \n",
" 0.200000 | \n",
" 300.000000 | \n",
" 50.000000 | \n",
" 9.000000 | \n",
" 5.000000 | \n",
" 77.000000 | \n",
" 330.000000 | \n",
" 3.500000 | \n",
" 3.500000 | \n",
" 80.000000 | \n",
" 45.000000 | \n",
" 1200.000000 | \n",
" 0.150000 | \n",
" 420.000000 | \n",
" 0.943000 | \n",
"
\n",
" \n",
" max | \n",
" 300.000000 | \n",
" 125.0 | \n",
" 405.000000 | \n",
" 980.000000 | \n",
" 97.000000 | \n",
" 82.000000 | \n",
" 103.000000 | \n",
" 107.000000 | \n",
" 0.200000 | \n",
" 109.000000 | \n",
" 108.000000 | \n",
" 0.200000 | \n",
" 350.000000 | \n",
" 90.000000 | \n",
" 10.000000 | \n",
" 10.000000 | \n",
" 80.000000 | \n",
" 1200.000000 | \n",
" 3.600000 | \n",
" 3.600000 | \n",
" 80.000000 | \n",
" 73.000000 | \n",
" 1200.000000 | \n",
" 0.150000 | \n",
" 460.000000 | \n",
" 1.000800 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A1 A2 A3 A4 A6 A8 \\\n",
"count 1396.000000 42.0 1354.000000 1396.000000 1396.000000 149.000000 \n",
"mean 298.853868 125.0 403.515510 705.974212 28.287751 78.818792 \n",
"std 10.130552 0.0 13.348093 53.214754 6.742765 2.683920 \n",
"min 200.000000 125.0 270.000000 470.000000 17.000000 70.000000 \n",
"25% 300.000000 125.0 405.000000 700.000000 24.000000 80.000000 \n",
"50% 300.000000 125.0 405.000000 700.000000 29.000000 80.000000 \n",
"75% 300.000000 125.0 405.000000 700.000000 30.000000 80.000000 \n",
"max 300.000000 125.0 405.000000 980.000000 97.000000 82.000000 \n",
"\n",
" A10 A12 A13 A15 A17 \\\n",
"count 1396.000000 1396.000000 1396.000000 1396.000000 1396.000000 \n",
"mean 100.861032 102.641834 0.199907 103.829370 104.766905 \n",
"std 0.905198 0.915387 0.002524 0.963639 1.401446 \n",
"min 100.000000 98.000000 0.120000 100.000000 89.000000 \n",
"25% 100.000000 102.000000 0.200000 103.000000 104.000000 \n",
"50% 101.000000 103.000000 0.200000 104.000000 105.000000 \n",
"75% 102.000000 103.000000 0.200000 104.000000 105.000000 \n",
"max 103.000000 107.000000 0.200000 109.000000 108.000000 \n",
"\n",
" A18 A19 A21 A22 A23 \\\n",
"count 1396.000000 1396.000000 1393.000000 1396.000000 1393.000000 \n",
"mean 0.199928 231.067335 48.707825 9.117120 5.002872 \n",
"std 0.002676 50.478071 4.976531 0.369152 0.136638 \n",
"min 0.100000 100.000000 20.000000 3.500000 4.000000 \n",
"25% 0.200000 200.000000 50.000000 9.000000 5.000000 \n",
"50% 0.200000 200.000000 50.000000 9.000000 5.000000 \n",
"75% 0.200000 300.000000 50.000000 9.000000 5.000000 \n",
"max 0.200000 350.000000 90.000000 10.000000 10.000000 \n",
"\n",
" A27 B1 B2 B3 B6 \\\n",
"count 1396.000000 1386.000000 1394.000000 1394.000000 1396.000000 \n",
"mean 74.396848 334.452742 3.454412 3.500072 72.065186 \n",
"std 3.044490 105.120753 0.388585 0.002678 9.161986 \n",
"min 45.000000 3.500000 0.150000 3.500000 40.000000 \n",
"25% 73.000000 320.000000 3.500000 3.500000 65.000000 \n",
"50% 73.000000 320.000000 3.500000 3.500000 78.000000 \n",
"75% 77.000000 330.000000 3.500000 3.500000 80.000000 \n",
"max 80.000000 1200.000000 3.600000 3.600000 80.000000 \n",
"\n",
" B8 B12 B13 B14 收率 \n",
"count 1395.000000 1395.000000 1395.000000 1396.000000 1396.000000 \n",
"mean 43.709677 1020.215054 0.149419 410.403295 0.923244 \n",
"std 4.338396 205.920155 0.008213 26.018410 0.030880 \n",
"min 20.000000 400.000000 0.030000 40.000000 0.624000 \n",
"25% 45.000000 800.000000 0.150000 400.000000 0.902000 \n",
"50% 45.000000 1200.000000 0.150000 400.000000 0.925000 \n",
"75% 45.000000 1200.000000 0.150000 420.000000 0.943000 \n",
"max 73.000000 1200.000000 0.150000 460.000000 1.000800 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# pd.set_option('display.max_rows',100)  # would raise the visible row limit to 100\n",
"pd.set_option('display.max_columns',100)  # show up to 100 columns\n",
"df_trn.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**观测点:**\n",
"\n",
" - A5、A9等字段的describe没有了,而head()是有的,说明这些字段有问题\n",
"
- 理论上,std(标准差,用方差也可以)越大表明该特征的取值差异越大,这样模型能学到区分性,但是过大可能是数据有离群值,B1、B12是需要关注的,再看其它值,B1里面最小值是3.5,而25%/50%分位数都是320、75%分位数是330,3.5非常离群,而B12里最小值、中位数和最大值像是递进关系。"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def train_abnormal_revise(data):\n",
"    \"\"\"Return a copy of the training frame with known bad values corrected.\n",
"\n",
"    The input frame is copied first and is not modified. Every fix is an\n",
"    exact-match replacement of one specific value in one column; A25 is\n",
"    additionally cast to int after its replacement.\n",
"    \"\"\"\n",
"    fixed = data.copy()\n",
"    # Rows with A1 == 200 and A3 == 405 get A1 set to 300\n",
"    fixed.loc[(fixed['A1'] == 200) & (fixed['A3'] == 405), 'A1'] = 300\n",
"\n",
"    # (column, invalid value, replacement)\n",
"    value_fixes = [\n",
"        ('A5', '1900/1/21 0:00', '21:00:00'),\n",
"        ('A5', '1900/1/29 0:00', '14:00:00'),\n",
"        ('A9', '1900/1/9 7:00', '23:00:00'),\n",
"        ('A9', '700', '7:00:00'),\n",
"        ('A11', '1900/1/1 2:30', '2:30:00'),\n",
"        ('A11', ':30:00', '00:30:00'),\n",
"        ('A16', '1900/1/12 0:00', '12:00:00'),\n",
"        ('A20', '6:00-6:30分', '6:00-6:30'),\n",
"        ('A20', '18:30-15:00', '18:30-19:00'),\n",
"        ('A22', 3.5, np.nan),\n",
"        ('A25', '1900/3/10 0:00', 70),\n",
"        ('A26', '1900/3/13 0:00', '13:00:00'),\n",
"        ('B1', 3.5, np.nan),\n",
"        ('B4', '15:00-1600', '15:00-16:00'),\n",
"        ('B4', '18:00-17:00', '16:00-17:00'),\n",
"        ('B4', '19:-20:05', '19:05-20:05'),\n",
"        ('B9', '23:00-7:30', '23:00-00:30'),\n",
"        ('B14', 40, 400),\n",
"    ]\n",
"    for col, bad, good in value_fixes:\n",
"        fixed[col] = fixed[col].replace(bad, good)\n",
"    # A25 is converted to int once its invalid entry has been replaced\n",
"    fixed['A25'] = fixed['A25'].astype(int)\n",
"    return fixed\n",
"\n",
"\n",
"def test_a_abnormal_revise(data):\n",
"    \"\"\"Return a copy of the test-A frame with three known bad values replaced.\n",
"\n",
"    Replacements are exact matches in columns A5, A7 and B14; the input\n",
"    frame is not modified.\n",
"    \"\"\"\n",
"    return data.copy().assign(\n",
"        A5=lambda frame: frame['A5'].replace('1900/1/22 0:00', '22:00:00'),\n",
"        A7=lambda frame: frame['A7'].replace('0:50:00', '21:50:00'),\n",
"        B14=lambda frame: frame['B14'].replace(785, 385),\n",
"    )\n",
"\n",
"\n",
"def train_abnormal_adjust(data):\n",
"    \"\"\"Return a copy of the training frame with per-sample cell corrections.\n",
"\n",
"    Each correction overwrites one column for the rows whose '样本id' equals\n",
"    the given sample id. The input frame is not modified.\n",
"    \"\"\"\n",
"    patched = data.copy()\n",
"    # (sample id, column, corrected value)\n",
"    cell_patches = [\n",
"        ('sample_1894', 'A5', '14:00:00'),\n",
"        ('sample_1234', 'A9', '0:00:00'),\n",
"        ('sample_1020', 'A9', '18:30:00'),\n",
"        ('sample_1380', 'A11', '15:30:00'),\n",
"        ('sample_844', 'A11', '10:00:00'),\n",
"        ('sample_1348', 'A11', '17:00:00'),\n",
"        ('sample_25', 'A11', '00:30:00'),\n",
"        ('sample_1105', 'A11', '4:00:00'),\n",
"        ('sample_313', 'A11', '15:30:00'),\n",
"        ('sample_291', 'A14', '19:30:00'),\n",
"        ('sample_1398', 'A16', '11:00:00'),\n",
"        ('sample_1177', 'A20', '19:00-20:00'),\n",
"        ('sample_71', 'A20', '16:20-16:50'),\n",
"        ('sample_14', 'A20', '18:00-18:30'),\n",
"        ('sample_69', 'A20', '6:10-6:50'),\n",
"        ('sample_1500', 'A20', '23:00-23:30'),\n",
"        ('sample_1524', 'A24', '15:00:00'),\n",
"        ('sample_1524', 'A26', '15:30:00'),\n",
"        ('sample_1046', 'A28', '18:00-18:30'),\n",
"        ('sample_1230', 'B5', '17:00:00'),\n",
"        ('sample_97', 'B7', '1:00:00'),\n",
"        ('sample_752', 'B9', '11:00-14:00'),\n",
"        ('sample_609', 'B11', '11:00-12:00'),\n",
"        ('sample_643', 'B11', '12:00-13:00'),\n",
"        ('sample_1164', 'B11', '5:00-6:00'),\n",
"    ]\n",
"    sample_ids = patched['样本id']\n",
"    for sample_id, col, value in cell_patches:\n",
"        patched.loc[sample_ids == sample_id, col] = value\n",
"    return patched\n",
"\n",
"\n",
"def test_a_abnormal_adjust(data):\n",
"    \"\"\"Return a copy of the test-A frame with A9 of sample_919 set to '19:50:00'.\n",
"\n",
"    The input frame is not modified.\n",
"    \"\"\"\n",
"    patched = data.copy()\n",
"    is_target = patched['样本id'] == 'sample_919'\n",
"    patched.loc[is_target, 'A9'] = '19:50:00'\n",
"    return patched\n",
"\n",
"\n",
"def test_b_abnormal_adjust(data):\n",
"    \"\"\"Return a copy of the test-B frame with three per-sample cell corrections.\n",
"\n",
"    Rows are addressed by their '样本id' value; the input frame is not modified.\n",
"    \"\"\"\n",
"    patched = data.copy()\n",
"    # (sample id, column) -> corrected value\n",
"    cell_patches = {\n",
"        ('sample_566', 'A5'): '18:00:00',\n",
"        ('sample_40', 'A20'): '5:00-5:30',\n",
"        ('sample_531', 'B5'): '1:00',\n",
"    }\n",
"    for (sample_id, col), value in cell_patches.items():\n",
"        patched.loc[patched['样本id'] == sample_id, col] = value\n",
"    return patched"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Value-level replacements run first, then the per-sample patches\n",
"df_trn = train_abnormal_adjust(train_abnormal_revise(df_trn))\n",
"df_tst_a = test_a_abnormal_adjust(test_a_abnormal_revise(df_tst_a))\n",
"# Test B only goes through its per-sample patches\n",
"df_tst_b = test_b_abnormal_adjust(df_tst_b)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 标签与数据集整合"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Rebind to copies of the cleaned training frame and the cleaned test-A frame\n",
"df_trn, df_tst = df_trn.copy(), df_tst_a.copy()\n",
"# Detach the label column '收率' so df_trn holds features only\n",
"df_target = df_trn.pop('收率')\n",
"# Stack train and test rows so both receive the same feature processing.\n",
"# pd.concat is used because DataFrame.append was removed in pandas 2.0;\n",
"# ignore_index=True gives the same fresh 0..n-1 row index as reset_index(drop=True).\n",
"df_trn_tst = pd.concat([df_trn, df_tst], ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"for _df in [df_trn, df_tst, df_trn_tst]:\n",
"    _df['A3'] = _df['A3'].fillna(405)  # A3 has missing values; fill them with the mode (405)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 时间段特征处理 "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Columns that hold a single clock time\n",
"cols_timer = ['A5', 'A7', 'A9', 'A11', 'A14', 'A16', 'A24', 'A26', 'B5', 'B7']\n",
"# Columns that hold a time range such as 21:00-21:30\n",
"cols_range = ['A20', 'A28', 'B4', 'B9', 'B10', 'B11']\n",
"# The rename map is built once, outside the frame loop\n",
"_rename_t = {_col: _col + '_t' for _col in cols_timer}\n",
"# Apply the same restructuring to the combined, training and test frames\n",
"for _df in [df_trn_tst, df_trn, df_tst]:\n",
"    # Tag the single-time columns with a '_t' suffix\n",
"    _df.rename(columns=_rename_t, inplace=True)\n",
"    for _col in cols_range:\n",
"        # Position of the range column; the two new columns take its place\n",
"        _idx_col = _df.columns.get_loc(_col)\n",
"        # Remove the range column and split 'start-end' on the dash, once\n",
"        _parts = _df.pop(_col).str.split('-')\n",
"        _df.insert(_idx_col, _col + '_at', _parts.str[0])  # start time\n",
"        _df.insert(_idx_col + 1, _col + '_bt', _parts.str[1])  # end time\n",
"# Extend the list after the loop so each start/end name is added exactly once\n",
"# instead of once per processed frame\n",
"cols_timer = cols_timer + [_col + _sfx for _col in cols_range\n",
"                           for _sfx in ('_at', '_bt')]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 样本id | \n",
" A1 | \n",
" A2 | \n",
" A3 | \n",
" A4 | \n",
" A5_t | \n",
" A6 | \n",
" A7_t | \n",
" A8 | \n",
" A9_t | \n",
" ... | \n",
" B8 | \n",
" B9_at | \n",
" B9_bt | \n",
" B10_at | \n",
" B10_bt | \n",
" B11_at | \n",
" B11_bt | \n",
" B12 | \n",
" B13 | \n",
" B14 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" sample_1528 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 13:30:00 | \n",
" 38.0 | \n",
" NaN | \n",
" NaN | \n",
" 15:30:00 | \n",
" ... | \n",
" 45.0 | \n",
" 11:30 | \n",
" 13:00 | \n",
" 14:00 | \n",
" 15:30 | \n",
" NaN | \n",
" NaN | \n",
" 800.0 | \n",
" 0.15 | \n",
" 400 | \n",
"
\n",
" \n",
" 1 | \n",
" sample_1698 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 14:00:00 | \n",
" 29.0 | \n",
" NaN | \n",
" NaN | \n",
" 16:00:00 | \n",
" ... | \n",
" 45.0 | \n",
" 6:00 | \n",
" 7:30 | \n",
" 7:30 | \n",
" 9:00 | \n",
" 9:00 | \n",
" 10:00 | \n",
" 1200.0 | \n",
" 0.15 | \n",
" 400 | \n",
"
\n",
" \n",
" 2 | \n",
" sample_639 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 14:00:00 | \n",
" 29.0 | \n",
" NaN | \n",
" NaN | \n",
" 16:00:00 | \n",
" ... | \n",
" 45.0 | \n",
" 1:00 | \n",
" 2:30 | \n",
" 2:30 | \n",
" 4:00 | \n",
" 4:00 | \n",
" 5:00 | \n",
" 1200.0 | \n",
" 0.15 | \n",
" 400 | \n",
"
\n",
" \n",
" 3 | \n",
" sample_483 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 1:30:00 | \n",
" 38.0 | \n",
" NaN | \n",
" NaN | \n",
" 3:00:00 | \n",
" ... | \n",
" 45.0 | \n",
" 19:00 | \n",
" 20:30 | \n",
" 21:30 | \n",
" 23:00 | \n",
" NaN | \n",
" NaN | \n",
" 800.0 | \n",
" 0.15 | \n",
" 400 | \n",
"
\n",
" \n",
" 4 | \n",
" sample_617 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 22:00:00 | \n",
" 29.0 | \n",
" NaN | \n",
" NaN | \n",
" 0:00:00 | \n",
" ... | \n",
" 45.0 | \n",
" 9:00 | \n",
" 10:30 | \n",
" 10:30 | \n",
" 12:00 | \n",
" 12:00 | \n",
" 13:00 | \n",
" 1200.0 | \n",
" 0.15 | \n",
" 420 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 49 columns
\n",
"
"
],
"text/plain": [
" 样本id A1 A2 A3 A4 A5_t A6 A7_t A8 A9_t ... \\\n",
"0 sample_1528 300 NaN 405.0 700 13:30:00 38.0 NaN NaN 15:30:00 ... \n",
"1 sample_1698 300 NaN 405.0 700 14:00:00 29.0 NaN NaN 16:00:00 ... \n",
"2 sample_639 300 NaN 405.0 700 14:00:00 29.0 NaN NaN 16:00:00 ... \n",
"3 sample_483 300 NaN 405.0 700 1:30:00 38.0 NaN NaN 3:00:00 ... \n",
"4 sample_617 300 NaN 405.0 700 22:00:00 29.0 NaN NaN 0:00:00 ... \n",
"\n",
" B8 B9_at B9_bt B10_at B10_bt B11_at B11_bt B12 B13 B14 \n",
"0 45.0 11:30 13:00 14:00 15:30 NaN NaN 800.0 0.15 400 \n",
"1 45.0 6:00 7:30 7:30 9:00 9:00 10:00 1200.0 0.15 400 \n",
"2 45.0 1:00 2:30 2:30 4:00 4:00 5:00 1200.0 0.15 400 \n",
"3 45.0 19:00 20:30 21:30 23:00 NaN NaN 800.0 0.15 400 \n",
"4 45.0 9:00 10:30 10:30 12:00 12:00 13:00 1200.0 0.15 420 \n",
"\n",
"[5 rows x 49 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the combined frame after the time-column restructuring\n",
"df_trn_tst.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Collect every column whose name ends in 't' (the *_t, *_at and *_bt columns)\n",
"cols_timer = [_name for _name in df_trn_tst.columns if _name.endswith('t')]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['A5_t',\n",
" 'A7_t',\n",
" 'A9_t',\n",
" 'A11_t',\n",
" 'A14_t',\n",
" 'A16_t',\n",
" 'A20_at',\n",
" 'A20_bt',\n",
" 'A24_t',\n",
" 'A26_t',\n",
" 'A28_at',\n",
" 'A28_bt',\n",
" 'B4_at',\n",
" 'B4_bt',\n",
" 'B5_t',\n",
" 'B7_t',\n",
" 'B9_at',\n",
" 'B9_bt',\n",
" 'B10_at',\n",
" 'B10_bt',\n",
" 'B11_at',\n",
" 'B11_bt']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the selected time-column names\n",
"cols_timer"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def time_to_min(x):\n",
"    \"\"\"Convert a clock-time string to minutes since midnight.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    x : str or missing value\n",
"        Text such as '13:30:00' or '6:00'. Only the first two ':'-separated\n",
"        fields (hours, minutes) are used; an empty field counts as 0.\n",
"\n",
"    Returns\n",
"    -------\n",
"    int or float\n",
"        hours * 60 + minutes, or np.nan when x is missing.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If no ':' separator is left after clean-up, or a field is not an integer.\n",
"    \"\"\"\n",
"    # pd.isna also recognises None and NaN objects other than the np.nan\n",
"    # singleton, which an identity check (x is np.nan) would let through\n",
"    if pd.isna(x):\n",
"        return np.nan\n",
"    # Normalise separator typos: ASCII / full-width semicolons and full-width colons\n",
"    for _sep in (';', ';', ':'):\n",
"        x = x.replace(_sep, ':')\n",
"    # Then collapse a doubled colon and treat a stray double quote as a colon\n",
"    x = x.replace('::', ':').replace('\"', ':')\n",
"    parts = x.split(':')\n",
"    if len(parts) < 2:\n",
"        raise ValueError(f'cannot parse time value: {x!r}')\n",
"    h, m = parts[:2]\n",
"    # An empty field (e.g. the hour in ':30:00') is read as 0\n",
"    return int(h or 0) * 60 + int(m or 0)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Convert every column listed in cols_timer with time_to_min, in all three frames\n",
"for _df in [df_trn_tst, df_trn, df_tst]:\n",
"    for _col in cols_timer:\n",
"        _df[_col] = _df[_col].map(time_to_min)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 样本id | \n",
" A1 | \n",
" A2 | \n",
" A3 | \n",
" A4 | \n",
" A5_t | \n",
" A6 | \n",
" A7_t | \n",
" A8 | \n",
" A9_t | \n",
" ... | \n",
" B8 | \n",
" B9_at | \n",
" B9_bt | \n",
" B10_at | \n",
" B10_bt | \n",
" B11_at | \n",
" B11_bt | \n",
" B12 | \n",
" B13 | \n",
" B14 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" sample_1528 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 810 | \n",
" 38.0 | \n",
" NaN | \n",
" NaN | \n",
" 930 | \n",
" ... | \n",
" 45.0 | \n",
" 690 | \n",
" 780 | \n",
" 840.0 | \n",
" 930.0 | \n",
" NaN | \n",
" NaN | \n",
" 800.0 | \n",
" 0.15 | \n",
" 400 | \n",
"
\n",
" \n",
" 1 | \n",
" sample_1698 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 840 | \n",
" 29.0 | \n",
" NaN | \n",
" NaN | \n",
" 960 | \n",
" ... | \n",
" 45.0 | \n",
" 360 | \n",
" 450 | \n",
" 450.0 | \n",
" 540.0 | \n",
" 540.0 | \n",
" 600.0 | \n",
" 1200.0 | \n",
" 0.15 | \n",
" 400 | \n",
"
\n",
" \n",
" 2 | \n",
" sample_639 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 840 | \n",
" 29.0 | \n",
" NaN | \n",
" NaN | \n",
" 960 | \n",
" ... | \n",
" 45.0 | \n",
" 60 | \n",
" 150 | \n",
" 150.0 | \n",
" 240.0 | \n",
" 240.0 | \n",
" 300.0 | \n",
" 1200.0 | \n",
" 0.15 | \n",
" 400 | \n",
"
\n",
" \n",
" 3 | \n",
" sample_483 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 90 | \n",
" 38.0 | \n",
" NaN | \n",
" NaN | \n",
" 180 | \n",
" ... | \n",
" 45.0 | \n",
" 1140 | \n",
" 1230 | \n",
" 1290.0 | \n",
" 1380.0 | \n",
" NaN | \n",
" NaN | \n",
" 800.0 | \n",
" 0.15 | \n",
" 400 | \n",
"
\n",
" \n",
" 4 | \n",
" sample_617 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 1320 | \n",
" 29.0 | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" ... | \n",
" 45.0 | \n",
" 540 | \n",
" 630 | \n",
" 630.0 | \n",
" 720.0 | \n",
" 720.0 | \n",
" 780.0 | \n",
" 1200.0 | \n",
" 0.15 | \n",
" 420 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 49 columns
\n",
"
"
],
"text/plain": [
" 样本id A1 A2 A3 A4 A5_t A6 A7_t A8 A9_t ... B8 \\\n",
"0 sample_1528 300 NaN 405.0 700 810 38.0 NaN NaN 930 ... 45.0 \n",
"1 sample_1698 300 NaN 405.0 700 840 29.0 NaN NaN 960 ... 45.0 \n",
"2 sample_639 300 NaN 405.0 700 840 29.0 NaN NaN 960 ... 45.0 \n",
"3 sample_483 300 NaN 405.0 700 90 38.0 NaN NaN 180 ... 45.0 \n",
"4 sample_617 300 NaN 405.0 700 1320 29.0 NaN NaN 0 ... 45.0 \n",
"\n",
" B9_at B9_bt B10_at B10_bt B11_at B11_bt B12 B13 B14 \n",
"0 690 780 840.0 930.0 NaN NaN 800.0 0.15 400 \n",
"1 360 450 450.0 540.0 540.0 600.0 1200.0 0.15 400 \n",
"2 60 150 150.0 240.0 240.0 300.0 1200.0 0.15 400 \n",
"3 1140 1230 1290.0 1380.0 NaN NaN 800.0 0.15 400 \n",
"4 540 630 630.0 720.0 720.0 780.0 1200.0 0.15 420 \n",
"\n",
"[5 rows x 49 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the combined frame after the time columns were mapped through time_to_min\n",
"df_trn_tst.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 创建一个df来准备添加很多特征"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 样本id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" sample_1528 | \n",
"
\n",
" \n",
" 1 | \n",
" sample_1698 | \n",
"
\n",
" \n",
" 2 | \n",
" sample_639 | \n",
"
\n",
" \n",
" 3 | \n",
" sample_483 | \n",
"
\n",
" \n",
" 4 | \n",
" sample_617 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 样本id\n",
"0 sample_1528\n",
"1 sample_1698\n",
"2 sample_639\n",
"3 sample_483\n",
"4 sample_617"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# `raw` is a copy of the processed combined frame; `df` starts with only the\n",
"# sample-id column and receives the engineered features in the following cells\n",
"raw = df_trn_tst.copy()\n",
"df = pd.DataFrame(raw['样本id'])\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 温度相关特征"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# 加热过程\n",
"df['P1_S1_A6_0C'] = raw['A6'] # 容器初始温度\n",
"df['P1_S2_A8_1C'] = raw['A8'] # 首次测温温度\n",
"df['P1_S3_A10_2C'] = raw['A10'] # 准备水解温度\n",
"df['P1_C1_C0_D'] = raw['A8'] - raw['A6'] # 测温温差\n",
"df['P1_C2_C0_D'] = raw['A10'] - raw['A6'] # 初次沸腾温差\n",
"\n",
"# 水解过程\n",
"df['P2_S1_A12_3C'] = raw['A12'] # 水解开始温度\n",
"df['P2_S2_A15_4C'] = raw['A15'] # 水解过程测温温度\n",
"df['P2_S3_A17_5C'] = raw['A17'] # 水解结束温度\n",
"df['P2_C3_C0_D'] = raw['A12'] - raw['A6'] # 水解开始与初始温度温差\n",
"df['P2_C3_C2_D'] = raw['A12'] - raw['A10'] # 水解开始前恒温温差\n",
"df['P2_C4_C3_D'] = raw['A15'] - raw['A12'] # 水解过程中途温差\n",
"df['P2_C5_C4_D'] = raw['A17'] - raw['A15'] # 水解结束中途温差\n",
"df['P2_C5_C3_KD'] = raw['A17'] - raw['A12'] # 水解起止温差\n",
"\n",
"# 脱色过程\n",
"df['P3_S2_A25_7C'] = raw['A25'] # 脱色保温开始温度\n",
"df['P3_S3_A27_8C'] = raw['A27'] # 脱色保温结束温度\n",
"df['P3_C7_C5_D'] = raw['A25'] - raw['A17'] # 降温温差\n",
"df['P3_C8_C7_KD'] = raw['A27'] - raw['A25'] # 保温温差\n",
"\n",
"# 结晶过程\n",
"df['P4_S2_B6_11C'] = raw['B6'] # 结晶开始温度\n",
"df['P4_S3_B8_12C'] = raw['B8'] # 结晶结束温度\n",
"df['P4_C11_C8_D'] = raw['B6'] - raw['A27'] # 脱色结束到结晶温差\n",
"df['P4_C12_C11_KD'] = raw['B8'] - raw['B6'] # 结晶温差"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 温度相关统计特征"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"_funcs = ['mean', 'std', 'sum']\n",
"# Loop over each summary statistic\n",
"for _func in _funcs:\n",
"    # Every statistic is computed row-wise (axis=1), i.e. per sample\n",
"    df[f'P2_C2-C5_{_func}'] = raw[['A10', 'A12', 'A15', 'A17']].\\\n",
"        agg(_func, axis=1)  # temperatures of the boiling phase\n",
"    df[f'P2_D3-D5_{_func}'] = \\\n",
"        df[[f'P2_C{i}_C{i-1}_D' for i in range(3, 6)]].\\\n",
"        abs().agg(_func, axis=1)  # absolute temperature differences of the boiling phase\n",
"    # The endswith selections below read df.columns at this point. Columns\n",
"    # created in this loop end with the statistic name, so they are not selected.\n",
"    df[f'P2_C1-C12_KD_ABS_{_func}'] = \\\n",
"        df[[_f for _f in df.columns if _f.endswith('KD')]].\\\n",
"        abs().agg(_func, axis=1)  # absolute differences of the key stages\n",
"    # endswith('D') also matches the names ending in 'KD'\n",
"    df[f'P2_C1-C12_D_{_func}'] = \\\n",
"        df[[_f for _f in df.columns if _f.endswith('D')]].\\\n",
"        abs().agg(_func, axis=1)  # absolute differences of all stages\n",
"    df[f'P2_LARGE_KD_{_func}'] = \\\n",
"        df[['P2_C3_C0_D', 'P3_C7_C5_D', 'P4_C12_C11_KD']].\\\n",
"        abs().agg(_func, axis=1)  # absolute values of the large temperature differences"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 样本id | \n",
" P1_S1_A6_0C | \n",
" P1_S2_A8_1C | \n",
" P1_S3_A10_2C | \n",
" P1_C1_C0_D | \n",
" P1_C2_C0_D | \n",
" P2_S1_A12_3C | \n",
" P2_S2_A15_4C | \n",
" P2_S3_A17_5C | \n",
" P2_C3_C0_D | \n",
" ... | \n",
" P2_C2-C5_std | \n",
" P2_D3-D5_std | \n",
" P2_C1-C12_KD_ABS_std | \n",
" P2_C1-C12_D_std | \n",
" P2_LARGE_KD_std | \n",
" P2_C2-C5_sum | \n",
" P2_D3-D5_sum | \n",
" P2_C1-C12_KD_ABS_sum | \n",
" P2_C1-C12_D_sum | \n",
" P2_LARGE_KD_sum | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" sample_1528 | \n",
" 38.0 | \n",
" NaN | \n",
" 100 | \n",
" NaN | \n",
" 62.0 | \n",
" 102.0 | \n",
" 103.0 | \n",
" 104.0 | \n",
" 64.0 | \n",
" ... | \n",
" 1.707825 | \n",
" 0.57735 | \n",
" 9.643651 | \n",
" 24.928565 | \n",
" 23.245071 | \n",
" 409.0 | \n",
" 4.0 | \n",
" 27.0 | \n",
" 191.0 | \n",
" 113.0 | \n",
"
\n",
" \n",
" 1 | \n",
" sample_1698 | \n",
" 29.0 | \n",
" NaN | \n",
" 101 | \n",
" NaN | \n",
" 72.0 | \n",
" 103.0 | \n",
" 104.0 | \n",
" 105.0 | \n",
" 74.0 | \n",
" ... | \n",
" 1.707825 | \n",
" 0.57735 | \n",
" 17.785762 | \n",
" 28.887521 | \n",
" 25.890796 | \n",
" 413.0 | \n",
" 4.0 | \n",
" 44.0 | \n",
" 226.0 | \n",
" 134.0 | \n",
"
\n",
" \n",
" 2 | \n",
" sample_639 | \n",
" 29.0 | \n",
" NaN | \n",
" 102 | \n",
" NaN | \n",
" 73.0 | \n",
" 103.0 | \n",
" 104.0 | \n",
" 105.0 | \n",
" 74.0 | \n",
" ... | \n",
" 1.290994 | \n",
" 0.00000 | \n",
" 18.009257 | \n",
" 29.231642 | \n",
" 25.514702 | \n",
" 414.0 | \n",
" 3.0 | \n",
" 43.0 | \n",
" 226.0 | \n",
" 135.0 | \n",
"
\n",
" \n",
" 3 | \n",
" sample_483 | \n",
" 38.0 | \n",
" NaN | \n",
" 100 | \n",
" NaN | \n",
" 62.0 | \n",
" 102.0 | \n",
" 103.0 | \n",
" 104.0 | \n",
" 64.0 | \n",
" ... | \n",
" 1.707825 | \n",
" 0.57735 | \n",
" 9.165151 | \n",
" 24.617293 | \n",
" 22.479620 | \n",
" 409.0 | \n",
" 4.0 | \n",
" 30.0 | \n",
" 207.0 | \n",
" 118.0 | \n",
"
\n",
" \n",
" 4 | \n",
" sample_617 | \n",
" 29.0 | \n",
" NaN | \n",
" 101 | \n",
" NaN | \n",
" 72.0 | \n",
" 103.0 | \n",
" 104.0 | \n",
" 105.0 | \n",
" 74.0 | \n",
" ... | \n",
" 1.707825 | \n",
" 0.57735 | \n",
" 17.785762 | \n",
" 28.887521 | \n",
" 25.890796 | \n",
" 413.0 | \n",
" 4.0 | \n",
" 44.0 | \n",
" 226.0 | \n",
" 134.0 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 37 columns
\n",
"
"
],
"text/plain": [
" 样本id P1_S1_A6_0C P1_S2_A8_1C P1_S3_A10_2C P1_C1_C0_D \\\n",
"0 sample_1528 38.0 NaN 100 NaN \n",
"1 sample_1698 29.0 NaN 101 NaN \n",
"2 sample_639 29.0 NaN 102 NaN \n",
"3 sample_483 38.0 NaN 100 NaN \n",
"4 sample_617 29.0 NaN 101 NaN \n",
"\n",
" P1_C2_C0_D P2_S1_A12_3C P2_S2_A15_4C P2_S3_A17_5C P2_C3_C0_D ... \\\n",
"0 62.0 102.0 103.0 104.0 64.0 ... \n",
"1 72.0 103.0 104.0 105.0 74.0 ... \n",
"2 73.0 103.0 104.0 105.0 74.0 ... \n",
"3 62.0 102.0 103.0 104.0 64.0 ... \n",
"4 72.0 103.0 104.0 105.0 74.0 ... \n",
"\n",
" P2_C2-C5_std P2_D3-D5_std P2_C1-C12_KD_ABS_std P2_C1-C12_D_std \\\n",
"0 1.707825 0.57735 9.643651 24.928565 \n",
"1 1.707825 0.57735 17.785762 28.887521 \n",
"2 1.290994 0.00000 18.009257 29.231642 \n",
"3 1.707825 0.57735 9.165151 24.617293 \n",
"4 1.707825 0.57735 17.785762 28.887521 \n",
"\n",
" P2_LARGE_KD_std P2_C2-C5_sum P2_D3-D5_sum P2_C1-C12_KD_ABS_sum \\\n",
"0 23.245071 409.0 4.0 27.0 \n",
"1 25.890796 413.0 4.0 44.0 \n",
"2 25.514702 414.0 3.0 43.0 \n",
"3 22.479620 409.0 4.0 30.0 \n",
"4 25.890796 413.0 4.0 44.0 \n",
"\n",
" P2_C1-C12_D_sum P2_LARGE_KD_sum \n",
"0 191.0 113.0 \n",
"1 226.0 134.0 \n",
"2 226.0 135.0 \n",
"3 207.0 118.0 \n",
"4 226.0 134.0 \n",
"\n",
"[5 rows x 37 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"df_temperature = df.set_index('样本id')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" P1_S1_A6_0C | \n",
" P1_S2_A8_1C | \n",
" P1_S3_A10_2C | \n",
" P1_C1_C0_D | \n",
" P1_C2_C0_D | \n",
" P2_S1_A12_3C | \n",
" P2_S2_A15_4C | \n",
" P2_S3_A17_5C | \n",
" P2_C3_C0_D | \n",
" P2_C3_C2_D | \n",
" ... | \n",
" P2_C2-C5_std | \n",
" P2_D3-D5_std | \n",
" P2_C1-C12_KD_ABS_std | \n",
" P2_C1-C12_D_std | \n",
" P2_LARGE_KD_std | \n",
" P2_C2-C5_sum | \n",
" P2_D3-D5_sum | \n",
" P2_C1-C12_KD_ABS_sum | \n",
" P2_C1-C12_D_sum | \n",
" P2_LARGE_KD_sum | \n",
"
\n",
" \n",
" 样本id | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" sample_1528 | \n",
" 38.0 | \n",
" NaN | \n",
" 100 | \n",
" NaN | \n",
" 62.0 | \n",
" 102.0 | \n",
" 103.0 | \n",
" 104.0 | \n",
" 64.0 | \n",
" 2.0 | \n",
" ... | \n",
" 1.707825 | \n",
" 0.57735 | \n",
" 9.643651 | \n",
" 24.928565 | \n",
" 23.245071 | \n",
" 409.0 | \n",
" 4.0 | \n",
" 27.0 | \n",
" 191.0 | \n",
" 113.0 | \n",
"
\n",
" \n",
" sample_1698 | \n",
" 29.0 | \n",
" NaN | \n",
" 101 | \n",
" NaN | \n",
" 72.0 | \n",
" 103.0 | \n",
" 104.0 | \n",
" 105.0 | \n",
" 74.0 | \n",
" 2.0 | \n",
" ... | \n",
" 1.707825 | \n",
" 0.57735 | \n",
" 17.785762 | \n",
" 28.887521 | \n",
" 25.890796 | \n",
" 413.0 | \n",
" 4.0 | \n",
" 44.0 | \n",
" 226.0 | \n",
" 134.0 | \n",
"
\n",
" \n",
" sample_639 | \n",
" 29.0 | \n",
" NaN | \n",
" 102 | \n",
" NaN | \n",
" 73.0 | \n",
" 103.0 | \n",
" 104.0 | \n",
" 105.0 | \n",
" 74.0 | \n",
" 1.0 | \n",
" ... | \n",
" 1.290994 | \n",
" 0.00000 | \n",
" 18.009257 | \n",
" 29.231642 | \n",
" 25.514702 | \n",
" 414.0 | \n",
" 3.0 | \n",
" 43.0 | \n",
" 226.0 | \n",
" 135.0 | \n",
"
\n",
" \n",
" sample_483 | \n",
" 38.0 | \n",
" NaN | \n",
" 100 | \n",
" NaN | \n",
" 62.0 | \n",
" 102.0 | \n",
" 103.0 | \n",
" 104.0 | \n",
" 64.0 | \n",
" 2.0 | \n",
" ... | \n",
" 1.707825 | \n",
" 0.57735 | \n",
" 9.165151 | \n",
" 24.617293 | \n",
" 22.479620 | \n",
" 409.0 | \n",
" 4.0 | \n",
" 30.0 | \n",
" 207.0 | \n",
" 118.0 | \n",
"
\n",
" \n",
" sample_617 | \n",
" 29.0 | \n",
" NaN | \n",
" 101 | \n",
" NaN | \n",
" 72.0 | \n",
" 103.0 | \n",
" 104.0 | \n",
" 105.0 | \n",
" 74.0 | \n",
" 2.0 | \n",
" ... | \n",
" 1.707825 | \n",
" 0.57735 | \n",
" 17.785762 | \n",
" 28.887521 | \n",
" 25.890796 | \n",
" 413.0 | \n",
" 4.0 | \n",
" 44.0 | \n",
" 226.0 | \n",
" 134.0 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 36 columns
\n",
"
"
],
"text/plain": [
" P1_S1_A6_0C P1_S2_A8_1C P1_S3_A10_2C P1_C1_C0_D P1_C2_C0_D \\\n",
"样本id \n",
"sample_1528 38.0 NaN 100 NaN 62.0 \n",
"sample_1698 29.0 NaN 101 NaN 72.0 \n",
"sample_639 29.0 NaN 102 NaN 73.0 \n",
"sample_483 38.0 NaN 100 NaN 62.0 \n",
"sample_617 29.0 NaN 101 NaN 72.0 \n",
"\n",
" P2_S1_A12_3C P2_S2_A15_4C P2_S3_A17_5C P2_C3_C0_D P2_C3_C2_D \\\n",
"样本id \n",
"sample_1528 102.0 103.0 104.0 64.0 2.0 \n",
"sample_1698 103.0 104.0 105.0 74.0 2.0 \n",
"sample_639 103.0 104.0 105.0 74.0 1.0 \n",
"sample_483 102.0 103.0 104.0 64.0 2.0 \n",
"sample_617 103.0 104.0 105.0 74.0 2.0 \n",
"\n",
" ... P2_C2-C5_std P2_D3-D5_std P2_C1-C12_KD_ABS_std \\\n",
"样本id ... \n",
"sample_1528 ... 1.707825 0.57735 9.643651 \n",
"sample_1698 ... 1.707825 0.57735 17.785762 \n",
"sample_639 ... 1.290994 0.00000 18.009257 \n",
"sample_483 ... 1.707825 0.57735 9.165151 \n",
"sample_617 ... 1.707825 0.57735 17.785762 \n",
"\n",
" P2_C1-C12_D_std P2_LARGE_KD_std P2_C2-C5_sum P2_D3-D5_sum \\\n",
"样本id \n",
"sample_1528 24.928565 23.245071 409.0 4.0 \n",
"sample_1698 28.887521 25.890796 413.0 4.0 \n",
"sample_639 29.231642 25.514702 414.0 3.0 \n",
"sample_483 24.617293 22.479620 409.0 4.0 \n",
"sample_617 28.887521 25.890796 413.0 4.0 \n",
"\n",
" P2_C1-C12_KD_ABS_sum P2_C1-C12_D_sum P2_LARGE_KD_sum \n",
"样本id \n",
"sample_1528 27.0 191.0 113.0 \n",
"sample_1698 44.0 226.0 134.0 \n",
"sample_639 43.0 226.0 135.0 \n",
"sample_483 30.0 207.0 118.0 \n",
"sample_617 44.0 226.0 134.0 \n",
"\n",
"[5 rows x 36 columns]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_temperature.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 时间相关特征"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# 时间计算方式转换\n",
"def duration_outer(series1, series2):\n",
" # 处理隔了一天如21:30 - 01:30\n",
" duration = series1 - series2\n",
" duration = np.where(duration < 0, duration + 24*60, duration)\n",
" duration = np.where(duration > 12*60, 24*60 - duration, duration)\n",
" duration = np.where(duration > 6*60, 12*60 - duration, duration)\n",
" return duration"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# Time-related features. Timestamps are minute-of-day values and every\n",
"# interval goes through duration_outer(), so all durations are in minutes.\n",
"raw = df_trn_tst.copy()\n",
"df = pd.DataFrame(raw['样本id'])\n",
"\n",
"# --- Stage 1: heating ---\n",
"df['P1_S1_A5_0T'] = raw['A5_t']  # heating start time\n",
"df['P1_S2_A9_2T'] = raw['A9_t']  # time of the A9 reading\n",
"# start -> first temperature reading\n",
"df['P1_T1_T0_D'] = duration_outer(raw['A7_t'], raw['A5_t'])\n",
"# first -> second temperature reading\n",
"df['P1_T2_T1_D'] = duration_outer(raw['A9_t'], raw['A7_t'])\n",
"# heating start -> boiling\n",
"df['P1_T2_T0_K_D'] = duration_outer(raw['A9_t'], raw['A5_t'])\n",
"\n",
"# --- Stage 2: hydrolysis ---\n",
"df['P2_S1_A11_3T'] = raw['A11_t']  # hydrolysis start time\n",
"df['P2_S1_A16_5T'] = raw['A16_t']  # hydrolysis end time\n",
"# heating start -> feeding\n",
"df['P2_T3_T0_K_D'] = duration_outer(raw['A11_t'], raw['A5_t'])\n",
"# constant temperature -> feeding\n",
"df['P2_T3_T2_K_D'] = duration_outer(raw['A11_t'], raw['A9_t'])\n",
"# NOTE: the A14-based sub-intervals (A14_t - A11_t, A16_t - A14_t) are\n",
"# not used as features.\n",
"# hydrolysis duration\n",
"df['P2_T5_T3_K_D'] = duration_outer(raw['A16_t'], raw['A11_t'])\n",
"\n",
"# --- Stage 3: decolorization ---\n",
"df['P3_S1_A20_6T'] = raw['A20_at']  # neutralization start time\n",
"# NOTE(review): the column name says A25 but the value is A24_t; the name\n",
"# is kept unchanged so downstream feature names stay stable.\n",
"df['P3_S2_A25_7T'] = raw['A24_t']  # holding (heat-preservation) time\n",
"# hydrolysis end -> neutralization\n",
"df['P3_T6_T5_K_D'] = duration_outer(raw['A20_at'], raw['A16_t'])\n",
"# pH neutralization duration\n",
"df['P3_T6_T6_K_D'] = duration_outer(raw['A20_bt'], raw['A20_at'])\n",
"# neutralization end -> decolorization\n",
"df['P3_T7_T6_D'] = duration_outer(raw['A24_t'], raw['A20_bt'])\n",
"# decolorization holding duration\n",
"df['P3_T8_T7_K_D'] = duration_outer(raw['A26_t'], raw['A24_t'])\n",
"# decolorization -> suction filtration\n",
"df['P3_T9_T8_D'] = duration_outer(raw['A28_at'], raw['A26_t'])\n",
"# suction filtration duration\n",
"df['P3_T9_T9_K_D'] = duration_outer(raw['A28_bt'], raw['A28_at'])\n",
"# total decolorization time, from two different reference points\n",
"df['P3_T9_T5_1D'] = duration_outer(raw['A28_bt'], raw['A16_t'])\n",
"df['P3_T9_T6_2D'] = duration_outer(raw['A28_bt'], raw['A20_at'])\n",
"\n",
"# --- Stage 4: crystallization ---\n",
"df['P4_S1_B4_10T'] = raw['B4_at']  # acidification start time\n",
"df['P4_S2_B5_11T'] = raw['B5_t']  # crystallization start time\n",
"df['P4_S3_B7_12T'] = raw['B7_t']  # crystallization end time\n",
"# suction filtration end -> acidification\n",
"df['P4_T10_T9_D'] = duration_outer(raw['B4_at'], raw['A28_bt'])\n",
"# acidification duration\n",
"df['P4_T10_T10_K_D'] = duration_outer(raw['B4_bt'], raw['B4_at'])\n",
"# acidification -> crystallization\n",
"df['P4_T11_T10_K_D'] = duration_outer(raw['B5_t'], raw['B4_bt'])\n",
"# natural crystallization duration\n",
"df['P4_T12_T11_K_D'] = duration_outer(raw['B7_t'], raw['B5_t'])\n",
"# total crystallization time, from two different reference points\n",
"df['P4_T12_T9_1D'] = duration_outer(raw['B7_t'], raw['A28_bt'])\n",
"df['P4_T12_T10_2D'] = duration_outer(raw['B7_t'], raw['B4_at'])\n",
"\n",
"# --- Stage 5: spin filtration ---\n",
"df['P5_S1_B9_13T'] = raw['B9_at']  # spin-filtration start time\n",
"# Spin-filtration end time: B11_bt when present, otherwise B10_bt,\n",
"# otherwise B9_bt.\n",
"df['P5_S3_B12_15T'] = np.where(\n",
"    raw['B11_bt'].isnull(),\n",
"    np.where(raw['B10_bt'].isnull(), raw['B9_bt'], raw['B10_bt']),\n",
"    raw['B11_bt'])\n",
"# B7 (crystallization end) -> spin-filtration start\n",
"df['P5_T13_T12_D'] = duration_outer(raw['B9_at'], raw['B7_t'])\n",
"# basic spin-filtration duration\n",
"df['P5_T13_T13_K_D'] = duration_outer(raw['B9_bt'], raw['B9_at'])\n",
"# basic pass -> supplementary pass 1\n",
"df['P5_T14_T13_D'] = duration_outer(raw['B10_at'], raw['B9_bt'])\n",
"# supplementary pass 1 duration\n",
"df['P5_T14_T14_K_D'] = duration_outer(raw['B10_bt'], raw['B10_at'])\n",
"# supplementary pass 1 -> supplementary pass 2\n",
"df['P5_T15_T14_D'] = duration_outer(raw['B11_at'], raw['B10_bt'])\n",
"# supplementary pass 2 duration\n",
"df['P5_T15_T13_K_D'] = duration_outer(raw['B11_bt'], raw['B11_at'])\n",
"# Total spin-filtration time, three variants. The first one adds up the\n",
"# three pass durations (basic + supplementary 1 + supplementary 2); each\n",
"# pass must appear exactly once in the list.\n",
"df['P5_T15_T13_1D'] = df[\n",
"    ['P5_T13_T13_K_D', 'P5_T14_T14_K_D', 'P5_T15_T13_K_D']].sum(axis=1)\n",
"df['P5_T15_T12_2D'] = duration_outer(\n",
"    df['P5_S3_B12_15T'], df['P4_S3_B7_12T'])\n",
"df['P5_T15_T12_3D'] = duration_outer(\n",
"    df['P5_S3_B12_15T'], df['P5_S1_B9_13T'])\n",
"\n",
"# Total process duration: sum of the per-stage totals.\n",
"df['P5_T15_T1_4D'] = df[\n",
"    ['P5_T15_T12_2D', 'P4_T12_T9_1D', 'P3_T9_T5_1D',\n",
"     'P2_T3_T0_K_D', 'P2_T5_T3_K_D']].sum(axis=1)\n",
"\n",
"# Row-wise statistics of the absolute durations.\n",
"#   P5__D_*  : every column whose name ends in 'D'\n",
"#   P5_K_D_* : only the key intervals (names ending in '_K_D')\n",
"# The column lists are fixed before the loop; the statistic columns end in\n",
"# mean/std/sum, so they never feed back into a later statistic.\n",
"_all_d_cols = [_f for _f in df.columns if _f.endswith('D')]\n",
"_key_d_cols = [_f for _f in df.columns if _f.endswith('_K_D')]\n",
"_funcs = ['mean', 'std', 'sum']\n",
"for _func in _funcs:\n",
"    df[f'P5__D_{_func}'] = df[_all_d_cols].abs().agg(_func, axis=1)\n",
"    df[f'P5_K_D_{_func}'] = df[_key_d_cols].abs().agg(_func, axis=1)\n",
"df_duration = df.set_index('样本id')"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" P1_S1_A5_0T | \n",
" P1_S2_A9_2T | \n",
" P1_T1_T0_D | \n",
" P1_T2_T1_D | \n",
" P1_T2_T0_K_D | \n",
" P2_S1_A11_3T | \n",
" P2_S1_A16_5T | \n",
" P2_T3_T0_K_D | \n",
" P2_T3_T2_K_D | \n",
" P2_T5_T3_K_D | \n",
" ... | \n",
" P5_T15_T13_1D | \n",
" P5_T15_T12_2D | \n",
" P5_T15_T12_3D | \n",
" P5_T15_T1_4D | \n",
" P5__D_mean | \n",
" P5_K_D_mean | \n",
" P5__D_std | \n",
" P5_K_D_std | \n",
" P5__D_sum | \n",
" P5_K_D_sum | \n",
"
\n",
" \n",
" 样本id | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" sample_1528 | \n",
" 810 | \n",
" 930 | \n",
" NaN | \n",
" NaN | \n",
" 120 | \n",
" 990 | \n",
" 1110 | \n",
" 180 | \n",
" 60 | \n",
" 120 | \n",
" ... | \n",
" 270.0 | \n",
" 240.0 | \n",
" 240.0 | \n",
" 840.0 | \n",
" 145.384615 | \n",
" 90.000000 | \n",
" 169.852425 | \n",
" 63.639610 | \n",
" 3780.0 | \n",
" 1170.0 | \n",
"
\n",
" \n",
" sample_1698 | \n",
" 840 | \n",
" 960 | \n",
" NaN | \n",
" NaN | \n",
" 120 | \n",
" 1020 | \n",
" 1140 | \n",
" 180 | \n",
" 60 | \n",
" 120 | \n",
" ... | \n",
" 270.0 | \n",
" 240.0 | \n",
" 240.0 | \n",
" 960.0 | \n",
" 136.071429 | \n",
" 90.000000 | \n",
" 188.588113 | \n",
" 76.258669 | \n",
" 3810.0 | \n",
" 1260.0 | \n",
"
\n",
" \n",
" sample_639 | \n",
" 840 | \n",
" 960 | \n",
" NaN | \n",
" NaN | \n",
" 120 | \n",
" 1020 | \n",
" 1140 | \n",
" 180 | \n",
" 60 | \n",
" 120 | \n",
" ... | \n",
" 270.0 | \n",
" 240.0 | \n",
" 240.0 | \n",
" 900.0 | \n",
" 123.214286 | \n",
" 75.000000 | \n",
" 173.654693 | \n",
" 49.575118 | \n",
" 3450.0 | \n",
" 1050.0 | \n",
"
\n",
" \n",
" sample_483 | \n",
" 90 | \n",
" 180 | \n",
" NaN | \n",
" NaN | \n",
" 90 | \n",
" 240 | \n",
" 360 | \n",
" 150 | \n",
" 60 | \n",
" 120 | \n",
" ... | \n",
" 270.0 | \n",
" 300.0 | \n",
" 240.0 | \n",
" 990.0 | \n",
" 158.076923 | \n",
" 73.846154 | \n",
" 195.448596 | \n",
" 46.822086 | \n",
" 4110.0 | \n",
" 960.0 | \n",
"
\n",
" \n",
" sample_617 | \n",
" 1320 | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" 120 | \n",
" 60 | \n",
" 180 | \n",
" 180 | \n",
" 60 | \n",
" 120 | \n",
" ... | \n",
" 270.0 | \n",
" 240.0 | \n",
" 240.0 | \n",
" 900.0 | \n",
" 123.214286 | \n",
" 77.142857 | \n",
" 173.846539 | \n",
" 48.107024 | \n",
" 3450.0 | \n",
" 1080.0 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 47 columns
\n",
"
"
],
"text/plain": [
" P1_S1_A5_0T P1_S2_A9_2T P1_T1_T0_D P1_T2_T1_D P1_T2_T0_K_D \\\n",
"样本id \n",
"sample_1528 810 930 NaN NaN 120 \n",
"sample_1698 840 960 NaN NaN 120 \n",
"sample_639 840 960 NaN NaN 120 \n",
"sample_483 90 180 NaN NaN 90 \n",
"sample_617 1320 0 NaN NaN 120 \n",
"\n",
" P2_S1_A11_3T P2_S1_A16_5T P2_T3_T0_K_D P2_T3_T2_K_D \\\n",
"样本id \n",
"sample_1528 990 1110 180 60 \n",
"sample_1698 1020 1140 180 60 \n",
"sample_639 1020 1140 180 60 \n",
"sample_483 240 360 150 60 \n",
"sample_617 60 180 180 60 \n",
"\n",
" P2_T5_T3_K_D ... P5_T15_T13_1D P5_T15_T12_2D P5_T15_T12_3D \\\n",
"样本id ... \n",
"sample_1528 120 ... 270.0 240.0 240.0 \n",
"sample_1698 120 ... 270.0 240.0 240.0 \n",
"sample_639 120 ... 270.0 240.0 240.0 \n",
"sample_483 120 ... 270.0 300.0 240.0 \n",
"sample_617 120 ... 270.0 240.0 240.0 \n",
"\n",
" P5_T15_T1_4D P5__D_mean P5_K_D_mean P5__D_std P5_K_D_std \\\n",
"样本id \n",
"sample_1528 840.0 145.384615 90.000000 169.852425 63.639610 \n",
"sample_1698 960.0 136.071429 90.000000 188.588113 76.258669 \n",
"sample_639 900.0 123.214286 75.000000 173.654693 49.575118 \n",
"sample_483 990.0 158.076923 73.846154 195.448596 46.822086 \n",
"sample_617 900.0 123.214286 77.142857 173.846539 48.107024 \n",
"\n",
" P5__D_sum P5_K_D_sum \n",
"样本id \n",
"sample_1528 3780.0 1170.0 \n",
"sample_1698 3810.0 1260.0 \n",
"sample_639 3450.0 1050.0 \n",
"sample_483 4110.0 960.0 \n",
"sample_617 3450.0 1080.0 \n",
"\n",
"[5 rows x 47 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_duration.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 物料用量相关特征(水、盐酸、氢氧化钠、4-氰基吡啶)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Material-consumption features and their ratios.\n",
"na_value = 405  # fill value for a missing A3 (sodium hydroxide) amount\n",
"\n",
"# Take a fresh copy here so the cell does not depend on a `raw` left over\n",
"# from an earlier cell.\n",
"raw = df_trn_tst.copy()\n",
"df = pd.DataFrame(raw['样本id'])\n",
"# Water\n",
"df['P2_W_1M'] = raw['A4']\n",
"df['P2_W_2M'] = raw['A19']\n",
"# Hydrochloric acid\n",
"df['P3_H_1M'] = raw['A21'].fillna(50)\n",
"df['P4_H_2M'] = raw['B1'].fillna(320)\n",
"# Sodium hydroxide\n",
"df['P2_N_1M'] = raw['A3'].fillna(na_value)\n",
"# 4-cyanopyridine\n",
"df['P2_C_1M'] = raw['A1']\n",
"\n",
"# P5_W_3M first holds the B12 amount (1200 when missing) and is then\n",
"# replaced by A4 + A19 + B12; it is created before P5_W_1M on purpose so\n",
"# the column order (which the slices of _fs below rely on) is preserved.\n",
"df['P5_W_3M'] = raw['B12'].fillna(1200)\n",
"df['P5_W_1M'] = df['P2_W_1M'] + df['P2_W_2M']\n",
"df['P5_W_3M'] = df['P2_W_1M'] + df['P2_W_2M'] + df['P5_W_3M']\n",
"df['P5_H_1M'] = df['P3_H_1M'] + df['P4_H_2M']\n",
"df['P5_M_0M'] = raw['A1'] + df['P2_N_1M'] + df['P5_W_1M'] + df['P4_H_2M']\n",
"df['P5_M_1M'] = df['P5_M_0M'] + df['P5_W_3M']\n",
"df['P5_M_2M'] = df['P5_M_1M'] + df['P3_H_1M']\n",
"# Theoretical output: the raw B14 value, plus a copy with rare values\n",
"# snapped onto a nearby common level.\n",
"df['P5_O_1M'] = raw['B14']\n",
"df['P5_O_5M'] = raw['B14'].replace({\n",
"    418: 420, 405: 400, 395: 390, 392: 390, 387: 380, 385: 380,\n",
"    370: 360, 350: 360, 340: 360, 290: 280, 260: 280, 256: 280})\n",
"\n",
"_fs = [_f for _f in df.columns if _f.endswith('M')]\n",
"# Output per unit of every material column; the last two entries of _fs\n",
"# are the output columns themselves and are skipped.\n",
"for _f in _fs[:-2]:\n",
"    df[f'{_f}_P5_O_1M_R'] = df['P5_O_1M'] / df[_f]\n",
"    df[f'{_f}_P5_O_5M_R'] = df['P5_O_5M'] / df[_f]\n",
"# Pairwise ratios between the first six (raw material) columns.\n",
"_raw_fs = _fs[:6]\n",
"for i, _f in enumerate(_raw_fs):\n",
"    for _f_div in _raw_fs[i + 1:]:\n",
"        df[f'{_f}_{_f_div}_R'] = df[_f] / df[_f_div]\n",
"df_materials = df.set_index('样本id')"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# Miscellaneous features: completeness count, raw readings and flags.\n",
"raw = df_trn_tst.copy()\n",
"df = pd.DataFrame(raw['样本id'])\n",
"# Number of non-missing fields per row, ignoring the first and the last\n",
"# column of raw.\n",
"df['P5_NOT_NUM_N'] = raw.iloc[:, 1:-1].notnull().sum(axis=1)\n",
"# A22, A23 and B2 are presumably pH readings (per the PH column naming).\n",
"# Each one needs its own column: writing A23 and B2 to the same name would\n",
"# silently discard A23. P5_PH_2N keeps holding B2 so existing feature\n",
"# names keep their values; A23 is exposed as P5_PH_3N.\n",
"df['P5_PH_1N'] = raw['A22']\n",
"df['P5_PH_2N'] = raw['B2']\n",
"df['P5_PH_3N'] = raw['A23']\n",
"df['P5_A7_1N'] = raw['A7_t'].isnull().astype(int)  # 1 when A7_t is missing\n",
"df['P5_O_2M'] = (raw['B14'] <= 360).astype(int)  # 1 when B14 is at most 360\n",
"df['P5_1_3M'] = raw['B13']\n",
"df_interact = df.set_index('样本id')"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" P5_NOT_NUM_N | \n",
" P5_PH_1N | \n",
" P5_PH_2N | \n",
" P5_A7_1N | \n",
" P5_O_2M | \n",
" P5_1_3M | \n",
"
\n",
" \n",
" 样本id | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" sample_1528 | \n",
" 42 | \n",
" 9.0 | \n",
" 3.5 | \n",
" 1 | \n",
" 0 | \n",
" 0.15 | \n",
"
\n",
" \n",
" sample_1698 | \n",
" 44 | \n",
" 9.0 | \n",
" 3.5 | \n",
" 1 | \n",
" 0 | \n",
" 0.15 | \n",
"
\n",
" \n",
" sample_639 | \n",
" 44 | \n",
" 9.0 | \n",
" 3.5 | \n",
" 1 | \n",
" 0 | \n",
" 0.15 | \n",
"
\n",
" \n",
" sample_483 | \n",
" 42 | \n",
" 10.0 | \n",
" 3.5 | \n",
" 1 | \n",
" 0 | \n",
" 0.15 | \n",
"
\n",
" \n",
" sample_617 | \n",
" 44 | \n",
" 9.0 | \n",
" 3.5 | \n",
" 1 | \n",
" 0 | \n",
" 0.15 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" P5_NOT_NUM_N P5_PH_1N P5_PH_2N P5_A7_1N P5_O_2M P5_1_3M\n",
"样本id \n",
"sample_1528 42 9.0 3.5 1 0 0.15\n",
"sample_1698 44 9.0 3.5 1 0 0.15\n",
"sample_639 44 9.0 3.5 1 0 0.15\n",
"sample_483 42 10.0 3.5 1 0 0.15\n",
"sample_617 44 9.0 3.5 1 0 0.15"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_interact.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 合并所有特征"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Join the four feature tables column-wise on their 样本id index, then turn\n",
"# the index back into a regular column.\n",
"_feature_tables = [df_materials, df_duration, df_temperature, df_interact]\n",
"df_feature = pd.concat(_feature_tables, axis=1)\n",
"df_feature = df_feature.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# Cut df_feature back into train and test at the training-set length\n",
"# (assumes its rows are the training rows followed by the test rows).\n",
"_n_trn = len(df_trn)\n",
"df_trn = df_feature.iloc[:_n_trn].reset_index(drop=True)\n",
"df_tst = df_feature.iloc[_n_trn:].reset_index(drop=True)\n",
"df_trn['收率'] = df_target  # attach the yield labels\n",
"df_tst['收率'] = np.nan  # the test rows have no label"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 样本id | \n",
" P2_W_1M | \n",
" P2_W_2M | \n",
" P3_H_1M | \n",
" P4_H_2M | \n",
" P2_N_1M | \n",
" P2_C_1M | \n",
" P5_W_3M | \n",
" P5_W_1M | \n",
" P5_H_1M | \n",
" ... | \n",
" P2_C1-C12_KD_ABS_sum | \n",
" P2_C1-C12_D_sum | \n",
" P2_LARGE_KD_sum | \n",
" P5_NOT_NUM_N | \n",
" P5_PH_1N | \n",
" P5_PH_2N | \n",
" P5_A7_1N | \n",
" P5_O_2M | \n",
" P5_1_3M | \n",
" 收率 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" sample_1528 | \n",
" 700 | \n",
" 300 | \n",
" 50.0 | \n",
" 350.0 | \n",
" 405.0 | \n",
" 300 | \n",
" 1800.0 | \n",
" 1000 | \n",
" 400.0 | \n",
" ... | \n",
" 27.0 | \n",
" 191.0 | \n",
" 113.0 | \n",
" 42 | \n",
" 9.0 | \n",
" 3.5 | \n",
" 1 | \n",
" 0 | \n",
" 0.15 | \n",
" 0.879 | \n",
"
\n",
" \n",
" 1 | \n",
" sample_1698 | \n",
" 700 | \n",
" 200 | \n",
" 50.0 | \n",
" 320.0 | \n",
" 405.0 | \n",
" 300 | \n",
" 2100.0 | \n",
" 900 | \n",
" 370.0 | \n",
" ... | \n",
" 44.0 | \n",
" 226.0 | \n",
" 134.0 | \n",
" 44 | \n",
" 9.0 | \n",
" 3.5 | \n",
" 1 | \n",
" 0 | \n",
" 0.15 | \n",
" 0.902 | \n",
"
\n",
" \n",
" 2 | \n",
" sample_639 | \n",
" 700 | \n",
" 200 | \n",
" 50.0 | \n",
" 320.0 | \n",
" 405.0 | \n",
" 300 | \n",
" 2100.0 | \n",
" 900 | \n",
" 370.0 | \n",
" ... | \n",
" 43.0 | \n",
" 226.0 | \n",
" 135.0 | \n",
" 44 | \n",
" 9.0 | \n",
" 3.5 | \n",
" 1 | \n",
" 0 | \n",
" 0.15 | \n",
" 0.936 | \n",
"
\n",
" \n",
" 3 | \n",
" sample_483 | \n",
" 700 | \n",
" 200 | \n",
" 50.0 | \n",
" 290.0 | \n",
" 405.0 | \n",
" 300 | \n",
" 1700.0 | \n",
" 900 | \n",
" 340.0 | \n",
" ... | \n",
" 30.0 | \n",
" 207.0 | \n",
" 118.0 | \n",
" 42 | \n",
" 10.0 | \n",
" 3.5 | \n",
" 1 | \n",
" 0 | \n",
" 0.15 | \n",
" 0.902 | \n",
"
\n",
" \n",
" 4 | \n",
" sample_617 | \n",
" 700 | \n",
" 200 | \n",
" 50.0 | \n",
" 320.0 | \n",
" 405.0 | \n",
" 300 | \n",
" 2100.0 | \n",
" 900 | \n",
" 370.0 | \n",
" ... | \n",
" 44.0 | \n",
" 226.0 | \n",
" 134.0 | \n",
" 44 | \n",
" 9.0 | \n",
" 3.5 | \n",
" 1 | \n",
" 0 | \n",
" 0.15 | \n",
" 0.983 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 144 columns
\n",
"
"
],
"text/plain": [
" 样本id P2_W_1M P2_W_2M P3_H_1M P4_H_2M P2_N_1M P2_C_1M P5_W_3M \\\n",
"0 sample_1528 700 300 50.0 350.0 405.0 300 1800.0 \n",
"1 sample_1698 700 200 50.0 320.0 405.0 300 2100.0 \n",
"2 sample_639 700 200 50.0 320.0 405.0 300 2100.0 \n",
"3 sample_483 700 200 50.0 290.0 405.0 300 1700.0 \n",
"4 sample_617 700 200 50.0 320.0 405.0 300 2100.0 \n",
"\n",
" P5_W_1M P5_H_1M ... P2_C1-C12_KD_ABS_sum P2_C1-C12_D_sum \\\n",
"0 1000 400.0 ... 27.0 191.0 \n",
"1 900 370.0 ... 44.0 226.0 \n",
"2 900 370.0 ... 43.0 226.0 \n",
"3 900 340.0 ... 30.0 207.0 \n",
"4 900 370.0 ... 44.0 226.0 \n",
"\n",
" P2_LARGE_KD_sum P5_NOT_NUM_N P5_PH_1N P5_PH_2N P5_A7_1N P5_O_2M \\\n",
"0 113.0 42 9.0 3.5 1 0 \n",
"1 134.0 44 9.0 3.5 1 0 \n",
"2 135.0 44 9.0 3.5 1 0 \n",
"3 118.0 42 10.0 3.5 1 0 \n",
"4 134.0 44 9.0 3.5 1 0 \n",
"\n",
" P5_1_3M 收率 \n",
"0 0.15 0.879 \n",
"1 0.15 0.902 \n",
"2 0.15 0.936 \n",
"3 0.15 0.902 \n",
"4 0.15 0.983 \n",
"\n",
"[5 rows x 144 columns]"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_trn.head()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# Add the numeric part of the sample id ('sample_1528' -> 1528.0) as an\n",
"# 'id' column at position 1.\n",
"for _df in (df_trn, df_tst):\n",
"    if 'id' in _df.columns:\n",
"        # Already added by a previous run of this cell; DataFrame.insert\n",
"        # raises ValueError on a duplicate column name.\n",
"        continue\n",
"    _sample_no = _df['样本id'].str.split('_').str[1].astype(float)\n",
"    _df.insert(1, 'id', _sample_no)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 样本id | \n",
" id | \n",
" P2_W_1M | \n",
" P2_W_2M | \n",
" P3_H_1M | \n",
" P4_H_2M | \n",
" P2_N_1M | \n",
" P2_C_1M | \n",
" P5_W_3M | \n",
" P5_W_1M | \n",
" ... | \n",
" P2_C1-C12_KD_ABS_sum | \n",
" P2_C1-C12_D_sum | \n",
" P2_LARGE_KD_sum | \n",
" P5_NOT_NUM_N | \n",
" P5_PH_1N | \n",
" P5_PH_2N | \n",
" P5_A7_1N | \n",
" P5_O_2M | \n",
" P5_1_3M | \n",
" 收率 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" sample_1528 | \n",
" 1528.0 | \n",
" 700 | \n",
" 300 | \n",
" 50.0 | \n",
" 350.0 | \n",
" 405.0 | \n",
" 300 | \n",
" 1800.0 | \n",
" 1000 | \n",
" ... | \n",
" 27.0 | \n",
" 191.0 | \n",
" 113.0 | \n",
" 42 | \n",
" 9.0 | \n",
" 3.5 | \n",
" 1 | \n",
" 0 | \n",
" 0.15 | \n",
" 0.879 | \n",
"
\n",
" \n",
" 1 | \n",
" sample_1698 | \n",
" 1698.0 | \n",
" 700 | \n",
" 200 | \n",
" 50.0 | \n",
" 320.0 | \n",
" 405.0 | \n",
" 300 | \n",
" 2100.0 | \n",
" 900 | \n",
" ... | \n",
" 44.0 | \n",
" 226.0 | \n",
" 134.0 | \n",
" 44 | \n",
" 9.0 | \n",
" 3.5 | \n",
" 1 | \n",
" 0 | \n",
" 0.15 | \n",
" 0.902 | \n",
"
\n",
" \n",
" 2 | \n",
" sample_639 | \n",
" 639.0 | \n",
" 700 | \n",
" 200 | \n",
" 50.0 | \n",
" 320.0 | \n",
" 405.0 | \n",
" 300 | \n",
" 2100.0 | \n",
" 900 | \n",
" ... | \n",
" 43.0 | \n",
" 226.0 | \n",
" 135.0 | \n",
" 44 | \n",
" 9.0 | \n",
" 3.5 | \n",
" 1 | \n",
" 0 | \n",
" 0.15 | \n",
" 0.936 | \n",
"
\n",
" \n",
" 3 | \n",
" sample_483 | \n",
" 483.0 | \n",
" 700 | \n",
" 200 | \n",
" 50.0 | \n",
" 290.0 | \n",
" 405.0 | \n",
" 300 | \n",
" 1700.0 | \n",
" 900 | \n",
" ... | \n",
" 30.0 | \n",
" 207.0 | \n",
" 118.0 | \n",
" 42 | \n",
" 10.0 | \n",
" 3.5 | \n",
" 1 | \n",
" 0 | \n",
" 0.15 | \n",
" 0.902 | \n",
"
\n",
" \n",
" 4 | \n",
" sample_617 | \n",
" 617.0 | \n",
" 700 | \n",
" 200 | \n",
" 50.0 | \n",
" 320.0 | \n",
" 405.0 | \n",
" 300 | \n",
" 2100.0 | \n",
" 900 | \n",
" ... | \n",
" 44.0 | \n",
" 226.0 | \n",
" 134.0 | \n",
" 44 | \n",
" 9.0 | \n",
" 3.5 | \n",
" 1 | \n",
" 0 | \n",
" 0.15 | \n",
" 0.983 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 145 columns
\n",
"
"
],
"text/plain": [
" 样本id id P2_W_1M P2_W_2M P3_H_1M P4_H_2M P2_N_1M P2_C_1M \\\n",
"0 sample_1528 1528.0 700 300 50.0 350.0 405.0 300 \n",
"1 sample_1698 1698.0 700 200 50.0 320.0 405.0 300 \n",
"2 sample_639 639.0 700 200 50.0 320.0 405.0 300 \n",
"3 sample_483 483.0 700 200 50.0 290.0 405.0 300 \n",
"4 sample_617 617.0 700 200 50.0 320.0 405.0 300 \n",
"\n",
" P5_W_3M P5_W_1M ... P2_C1-C12_KD_ABS_sum P2_C1-C12_D_sum \\\n",
"0 1800.0 1000 ... 27.0 191.0 \n",
"1 2100.0 900 ... 44.0 226.0 \n",
"2 2100.0 900 ... 43.0 226.0 \n",
"3 1700.0 900 ... 30.0 207.0 \n",
"4 2100.0 900 ... 44.0 226.0 \n",
"\n",
" P2_LARGE_KD_sum P5_NOT_NUM_N P5_PH_1N P5_PH_2N P5_A7_1N P5_O_2M \\\n",
"0 113.0 42 9.0 3.5 1 0 \n",
"1 134.0 44 9.0 3.5 1 0 \n",
"2 135.0 44 9.0 3.5 1 0 \n",
"3 118.0 42 10.0 3.5 1 0 \n",
"4 134.0 44 9.0 3.5 1 0 \n",
"\n",
" P5_1_3M 收率 \n",
"0 0.15 0.879 \n",
"1 0.15 0.902 \n",
"2 0.15 0.936 \n",
"3 0.15 0.902 \n",
"4 0.15 0.983 \n",
"\n",
"[5 rows x 145 columns]"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_trn.head()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAD4CAYAAAAD6PrjAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAATvElEQVR4nO3df7RlZX3f8fcHRkStOCADZc2gF5PRQLqKTCeW1jYmkib8qA6mkmLTOqVTp0lomqy0q46xq7FZSRes1YqhzSIhYjLQoEGtYRpoUhxBV7MKeJHfIDIihXGocyM/jKISyLd/nOduDzN3ZvYwd99zmHm/1jrrPPvZz973e/c98Jln73P2SVUhSRLAYZMuQJI0PQwFSVLHUJAkdQwFSVLHUJAkdZZNuoADceyxx9bMzMyky5CkF5Xbbrvtz6pqxULrXtShMDMzw+zs7KTLkKQXlST/d0/rPH0kSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeq8qD/RLGl6zGy6bmI/++GLzpnYzz7YOFOQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSZ9BQSLI8ySeSfDHJ/Un+VpJjktyQ5MH2fHQbmySXJtmW5K4ka4asTZK0u6FnCr8B/HFV/QBwKnA/sAnYWlWrga1tGeAsYHV7bAQuG7g2SdIuBguFJEcBPwxcAVBVz1TVk8A6YHMbthk4t7XXAVfWyM3A8iQnDFWfJGl3Q84UXgfMAb+b5PYkH07yCuD4qnoMoD0f18avBB4d235763ueJBuTzCaZnZubG7B8STr0DBkKy4A1wGVVdRrwLb53qmghWaCvduuouryq1lbV2hUrVixOpZIkYNhQ2A5sr6pb2vInGIXE1+ZPC7XnnWPjTxzbfhWwY8D6JEm7GCwUqur/AY8meUPrOgO4D9gCrG9964FrW3sL8O72LqTTgafmTzNJkpbGsoH3//PA7yc5AngIuIBREF2TZAPwCHBeG3s9cDawDXi6jZUkLaFBQ6Gq7gDWLrDqjAXGFnDhkPVIkvbOTzRLkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpM2goJHk4yd1J7kgy2/qOSXJDkgfb89GtP0kuTbItyV1J1gxZmyRpd0sxU/jRqnpjVa1ty5uArVW1GtjalgHOAla3x0bgsiWoTZI0ZhKnj9YBm1t7M3DuWP+VNXIzsDzJCROoT5IOWUOHQgH/K8ltSTa2vuOr6jGA9nxc618JPDq27fbW9zxJNiaZTTI7Nzc3YOmSdOhZNvD+31xVO5IcB9yQ5It7GZsF+mq3jqrLgcsB1q5du9t6SdILN+hMoap2tOedwKeANwFfmz8t1J53tuHbgRPHNl8F7BiyPknS8w0WCklekeSV823gx4F7gC3A+jZsPXBta28B3t3ehXQ68NT8aSZJ0tIY8vTR8cCnksz/nKur6o+TfB64JskG4BHgvDb+euBsYBvwNHDBgLVJkhYwWChU1UPAqQv0fx04Y4H+Ai4cqh5J0r75iWZJUsdQkCR1DAVJUmfozylI0uBmNl03kZ/78EXnTOTnDsmZgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSp0ysUkvy1oQuRJE1e35nCbyW5NcnPJVk+aEWSpInpFQpV9XeAnwZOBGaTXJ3k7w1amSRpyfW+plBVDwL/Dngv8Bbg0iRfTPKTQxUnSVpafa8p/PUklwD3A28F3lZVJ7f2JQPWJ0laQn1nCv8V+AJwalVdWFVfAKiqHY
xmD3uU5PAktyf5o7Z8UpJbkjyY5A+SHNH6X9qWt7X1My/0l5IkvTB9Q+Fs4Oqq+jZAksOSvBygqq7ax7a/wGiGMe9i4JKqWg08AWxo/RuAJ6rq+xnNPi7uWZskaZH0DYVPAy8bW35569urJKuAc4APt+UwOuX0iTZkM3Bua69ry7T1Z7TxkqQl0jcUjqyqb84vtPbLe2z3IeDfAn/Zll8NPFlVz7bl7cDK1l4JPNr2/yzwVBv/PEk2JplNMjs3N9ezfElSH31D4VtJ1swvJPkbwLf3tkGSvw/srKrbxrsXGFo91n2vo+ryqlpbVWtXrFix78olSb0t6znuF4GPJ9nRlk8A/uE+tnkz8PYkZwNHAkcxmjksT7KszQZWAfP73M7ocxDbkywDXgU83vs3kSQdsL4fXvs88APAzwI/B5y8ywxgoW3eV1WrqmoGOB/4TFX9NHAj8M42bD1wbWtvacu09Z+pqt1mCpKk4fSdKQD8EDDTtjktCVV15Qv4me8FPpbk14DbgSta/xXAVUm2MZohnP8C9i1JOgC9QiHJVcD3AXcAz7XuAnqFQlXdBNzU2g8Bb1pgzHeA8/rsT5I0jL4zhbXAKZ7OkaSDW993H90D/NUhC5EkTV7fmcKxwH1JbgW+O99ZVW8fpCpJ0kT0DYUPDFmEJGk69AqFqvpsktcCq6vq0+2+R4cPW5okaan1vXX2exjdj+i3W9dK4A+HKkqSNBl9LzRfyOgTyt+A7gt3jhuqKEnSZPQNhe9W1TPzC+02FL49VZIOMn1D4bNJfhl4Wftu5o8D/2O4siRJk9A3FDYBc8DdwL8Armcf37gmSXrx6fvuo78Efqc9JEkHqb73PvoKC3+3wesWvSJJ0sTsz72P5h3J6MZ1xyx+OZKkSer7fQpfH3t8tao+xOi7liVJB5G+p4/WjC0exmjm8MpBKpIkTUzf00f/eaz9LPAw8FOLXo0kaaL6vvvoR4cuRJI0eX1PH/3S3tZX1QcXpxxJ0iTtz7uPfgjY0pbfBnwOeHSIoiRJk7E/X7Kzpqr+HCDJB4CPV9U/H6owSdLS63ubi9cAz4wtPwPMLHo1kqSJ6jtTuAq4NcmnGH2y+R3AlYNVJUmaiL4fXvt14ALgCeBJ4IKq+o972ybJkUluTXJnknuT/IfWf1KSW5I8mOQPkhzR+l/alre19TMH8otJkvZf39NHAC8HvlFVvwFsT3LSPsZ/F3hrVZ0KvBE4M8npwMXAJVW1mlHIbGjjNwBPVNX3A5e0cZKkJdT36zh/BXgv8L7W9RLgv+1tmxr55tj4lzA69fRWRl/tCbAZOLe117Vl2vozkqRPfZKkxdF3pvAO4O3AtwCqagc9bnOR5PAkdwA7gRuALwNPVtWzbch2Rt/3THt+tO3/WeAp4NU965MkLYK+ofBMVRXt9tlJXtFno6p6rqreCKwC3gScvNCw9rzQrGC323Un2ZhkNsns3Nxcr+IlSf30DYVrkvw2sDzJe4BPsx9fuFNVTwI3Aae3fcy/62kVsKO1twMnQvcd0K8CHl9gX5dX1dqqWrtixYq+JUiSeuj77qP/xOg8/yeBNwD/vqr+y962SbIiyfLWfhnwY8D9wI3AO9uw9cC1rb2lLdPWf6bNTiRJS2Sfn1NIcjjwJ1X1Y4yuC/R1ArC5bX8YcE1V/VGS+4CPJfk14Hbgijb+CuCqJNsYzRDO34+fJUlaBPsMhap6LsnTSV5VVU/13XFV3QWctkD/Q4yuL+za/x1G3+gmSZqQvp9o/g5wd5IbaO9AAqiqfzVIVZKkiegbCte1hyTpILbXUEjymqp6pKo2722cJOngsK93H/3hfCPJJweuRZI0YfsKhfEPlL1uyEIkSZO3r1CoPbQlSQehfV1oPjXJNxjNGF7W2rTlqqqjBq1OkrSk9hoKVXX4UhUiSZq8/fk+BUnSQc5QkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DA
VJUsdQkCR1BguFJCcmuTHJ/UnuTfILrf+YJDckebA9H936k+TSJNuS3JVkzVC1SZIWNuRM4VngX1fVycDpwIVJTgE2AVurajWwtS0DnAWsbo+NwGUD1iZJWsBgoVBVj1XVF1r7z4H7gZXAOmBzG7YZOLe11wFX1sjNwPIkJwxVnyRpd0tyTSHJDHAacAtwfFU9BqPgAI5rw1YCj45ttr31SZKWyOChkOSvAJ8EfrGqvrG3oQv01QL725hkNsns3NzcYpUpSWLgUEjyEkaB8PtV9d9b99fmTwu1552tfztw4tjmq4Adu+6zqi6vqrVVtXbFihXDFS9Jh6Ah330U4Arg/qr64NiqLcD61l4PXDvW/+72LqTTgafmTzNJkpbGsgH3/WbgnwB3J7mj9f0ycBFwTZINwCPAeW3d9cDZwDbgaeCCAWuTJC1gsFCoqv/NwtcJAM5YYHwBFw5VjyRp3/xEsySpYyhIkjqGgiSpYyhIkjpDvvtIkg5qM5uum9jPfviicwbZrzMFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQYLhSQfSbIzyT1jfcckuSHJg+356NafJJcm2ZbkriRrhqpLkrRnQ84Ufg84c5e+TcDWqloNbG3LAGcBq9tjI3DZgHVJkvZgsFCoqs8Bj+/SvQ7Y3NqbgXPH+q+skZuB5UlOGKo2SdLClvqawvFV9RhAez6u9a8EHh0bt7317SbJxiSzSWbn5uYGLVaSDjXTcqE5C/TVQgOr6vKqWltVa1esWDFwWZJ0aFnqUPja/Gmh9ryz9W8HThwbtwrYscS1SdIhb6lDYQuwvrXXA9eO9b+7vQvpdOCp+dNMkqSls2yoHSf5KPAjwLFJtgO/AlwEXJNkA/AIcF4bfj1wNrANeBq4YKi6JEl7NlgoVNW79rDqjAXGFnDhULVIh5KZTddNugS9iE3LhWZJ0hQwFCRJHUNBktQxFCRJHUNBktQxFCRJHUNBktQxFCRJHUNBktQxFCRJHUNBktQxFCRJHUNBktQxFCRJHUNBktQxFCRJHUNBktQxFCRJHUNBktQxFCRJHUNBktQxFCRJnakKhSRnJnkgybYkmyZdjyQdaqYmFJIcDvwmcBZwCvCuJKdMtipJOrQsm3QBY94EbKuqhwCSfAxYB9w3xA+b2XTdELudag9fdM6kS1hyh+LfWToQ0xQKK4FHx5a3A39z10FJNgIb2+I3kzywBLX1dSzwZ5MuYk9yMTDlNWJ9i2Haa5z2+mD6azw2Fx9Qfa/d04ppCoUs0Fe7dVRdDlw+fDn7L8lsVa2ddB17M+01Wt+Bm/Yap70+mP4ah6xvaq4pMJoZnDi2vArYMaFaJOmQNE2h8HlgdZKTkhwBnA9smXBNknRImZrTR1X1bJJ/CfwJcDjwkaq6d8Jl7a+pPK21i2mv0foO3LTXOO31wfTXOFh9qdrttL0k6RA1TaePJEkTZihIkjqGQk99bsGR5KeS3Jfk3iRXj/U/l+SO9hjk4vm+6ktyyVgNX0ry5Ni69UkebI/1Q9S3CDVOwzF8TZIbk9ye5K4kZ4+te1/b7oEkPzFN9SWZSfLtseP3W0PU17PG1ybZ2uq7KcmqsXWDvw4PsL6leA1+JMnOJPfsYX2SXNrqvyvJmrF1i3P8qsrHPh6MLnx/GXgdcARwJ3DKLmNWA7cDR7fl48bWfXPS9e0y/ucZXcgHOAZ4qD0f3dpHT1ON03IMGV3c+9nWPgV4eKx9J/BS4KS2n8OnqL4Z4J4hj99+1PhxYH1rvxW4aqlehwdS31K8BtvP+GFgzZ7+XsDZwP9k9Lmu04FbFvv4OVPop7sFR1U9A8zfgmPce4DfrKonAKpq55TVN+5dwEdb+yeAG6rq8Vb7DcCZU1bjUuhTXwFHtfar+N7naNYBH6uq71bVV4BtbX/TUt9S6VPjKc
DW1r5xbP1SvA4PpL4lUVWfAx7fy5B1wJU1cjOwPMkJLOLxMxT6WegWHCt3GfN64PVJ/jTJzUnG/yBHJplt/edOqD5gND1m9K/Zz+zvthOsEabjGH4A+MdJtgPXM5rN9N12kvUBnNROK302yd9d5Nr2p8Y7gX/Q2u8AXpnk1T23nWR9MPxrsI89/Q6LdvwMhX763IJjGaNTSD/C6F+5H06yvK17TY0+kv6PgA8l+b4J1DfvfOATVfXcC9j2QBxIjTAdx/BdwO9V1SpG0/irkhzWc9tJ1vcYo+N3GvBLwNVJjmLx9anx3wBvSXI78Bbgq8CzPbc9UAdSHwz/GuxjT7/Doh0/Q6GfPrfg2A5cW1V/0U4hPMAoJKiqHe35IeAm4LQJ1DfvfJ5/Wmapbi9yIDVOyzHcAFzT6vg/wJGMbpy2FMfwBdfXTmt9vfXfxui8+usXub5eNVbVjqr6yRZQ7299T/XZdsL1LcVrsI89/Q6Ld/yGvnByMDwYzQIeYnRKY/4C1Q/uMuZMYHNrH8toKvdqRhd9XjrW/yB7ucA6VH1t3BuAh2kfWqzvXaD6Sqvz6NY+ZhLHcC81TsUxZHSB75+29sntP7oAP8jzLzQ/xOJfaD6Q+lbM18PoIutXJ/U3bn+/w1r714FfXarX4QHWN/hrcKyGGfZ8ofkcnn+h+dbFPn6L/gsdrA9G0/EvMfpX1vtb368Cb2/tAB9k9P0PdwPnt/6/3ZbvbM8bJlFfW/4AcNEC2/4zRhdHtwEXTOoY7qnGaTmGjC5C/mmr4w7gx8e2fX/b7gHgrGmqj9E58ntb/xeAt03qbwy8s/0P9UvAh2n/o12q1+ELrW8JX4MfZXS67y8Y/et/A/AzwM+09WH0ZWRfbnWsXezj520uJEkdrylIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjr/H+ez4bfNKr74AAAAAElFTkSuQmCC\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"df_trn['收率'].plot(kind='hist')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"# Keep only typical samples: yield strictly between 0.8671 and 0.9861.\n",
"df_trn = df_trn.loc[\n",
"    lambda frame: (frame['收率'] > 0.8671) & (frame['收率'] < 0.9861)\n",
"].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAD4CAYAAAAD6PrjAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAATa0lEQVR4nO3df7BfdX3n8eeL8FtUoFxYTMCLNnaNnRpsZNl1XSnWVnEt2pYK3dbIsqaz4rbOup1G1lnZzjJDd1W2jjuucVEjW0vxR5UtWA0s1mkHxCDIr2iJmIWYDEkriogFwff+8f3c4zW5Sb7Jvef7vTf3+Zj5zvdzPuec73l/5t7kdc+P7zmpKiRJAjhk3AVIkuYPQ0GS1DEUJEkdQ0GS1DEUJEmdQ8ddwGyccMIJNTk5Oe4yJGlBue222/6uqiZmmregQ2FycpKNGzeOuwxJWlCS/L89zfPwkSSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSps6C/0az9M7n2urFte8vlrx7btiUNzz0FSVLHUJAkdQwFSVLHUJAkdQwFSVKnt1BIcmSSW5N8Nck9Sf5z6z8tyZeS3Jfkz5Ic3vqPaNOb2/zJvmqTJM2szz2Fx4Gzq+qFwErglUnOBP4IuKKqlgMPAxe15S8CHq6qnwauaMtJkkaot1CogUfb5GHtVcDZwCda/3rgta19bpumzX95kvRVnyRpd72eU0iyJMkdwA5gA/AN4DtV9WRbZCuwtLWXAg8CtPnfBX6qz/okST+p11CoqqeqaiWwDDgDeP5Mi7X3mfYKateOJGuSbEyycefOnXNXrCRpNFcfVdV3gC8AZwLHJpm6vcYyYFtrbwVOAWjznwl8e4bPWldVq6pq1cTERN+lS9Ki0ufVRxNJjm3to4BfBDYBNwG/3hZbDXymta9t07T5/7eqdttTkCT1p88b4p0MrE+yhEH4XFNVf5HkXuDqJP8FuB24si1/JXBVks0M9hDO77E2SdIMeguFqroTOH2G/vsZnF/Ytf8fgPP6qkeStG9+o1mS1DEUJEkdQ0GS1DEUJEkdQ0GS1DEUJEkdQ0GS1DEUJEkdQ0GS1DEUJEmdPu99JC1qk2uvG8t2t1z+6rFsVwcH9xQkSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSZ3eQiHJKUluSrIpyT1Jfq/1X5rkW0nuaK9zpq3z9iSbk3w9yS/3VZskaWZ93jr7SeBtVfWVJE8Hbkuyoc27oqreNX3hJCuA84EXAM8CbkjyvKp6qscaJUnT9LanUFXbq+orrf09YBOwdC+rnAtcXVWPV9U3gc3AGX3VJ0na3UjOKSSZBE4HvtS63pLkziQfSnJc61sKPDhtta3MECJJ1iTZmGTjzp07e6xakhaf3kMhyTHAJ4G3VtUjwPuB5wIrge3Au6cWnWH12q2jal1VraqqVRMTEz1VLUmLU6+hkOQwBoHwJ1X1KYCqeqiqnqqqHwEf5MeHiLYCp0xbfRmwrc/6JEk/qc+rjwJcCWyqqvdM6z952mKvA+5u7WuB85MckeQ0YDlwa1/1SZJ21+fVRy8Bfhu4K8kdre8S4IIkKxkcGtoC/A5AVd2T5BrgXgZXLl3slUeSNFq9hUJV/TUznye4fi/rXAZc1ldNkqS98xvNkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6vQWCklOSXJTkk1J7knye63/+CQbktzX3o9r/Uny3iSbk9yZ5EV91SZJmlmfewpPAm+rqucDZwIXJ1kBrAVurKrlwI1tGuBVwPL2WgO8v8faJEkzGCoUkvzs/n
5wVW2vqq+09veATcBS4FxgfVtsPfDa1j4X+GgN3AIcm+Tk/d2uJOnADbun8D+T3JrkzUmO3d+NJJkETge+BJxUVdthEBzAiW2xpcCD01bb2vp2/aw1STYm2bhz5879LUWStBdDhUJV/XPgXwGnABuTfCzJK4ZZN8kxwCeBt1bVI3tbdKZNz1DLuqpaVVWrJiYmhilBkjSkoc8pVNV9wDuAPwBeBrw3ydeS/Oqe1klyGINA+JOq+lTrfmjqsFB739H6tzIInSnLgG3D1idJmr1hzyn8XJIrGJwXOBt4TTuBfDZwxR7WCXAlsKmq3jNt1rXA6tZeDXxmWv8b2lVIZwLfnTrMJEkajUOHXO59wAeBS6rqB1OdVbUtyTv2sM5LgN8G7kpyR+u7BLgcuCbJRcADwHlt3vXAOcBm4DHgwv0ZiCRp9oYNhXOAH1TVUwBJDgGOrKrHquqqmVaoqr9m5vMEAC+fYfkCLh6yHklSD4Y9p3ADcNS06aNbnyTpIDJsKBxZVY9OTbT20f2UJEkal2FD4fvTbzuR5OeBH+xleUnSAjTsOYW3Ah9PMnWJ6MnA6/spSZI0LkOFQlV9Ock/Bn6Gwcnjr1XVD3utTJI0csPuKQC8GJhs65yehKr6aC9VSZLGYqhQSHIV8FzgDuCp1l2AoSBJB5Fh9xRWASvadwkkSQepYa8+uhv4R30WIkkav2H3FE4A7k1yK/D4VGdV/UovVUmSxmLYULi0zyIkSfPDsJek/lWSZwPLq+qGJEcDS/otTZI0asPeOvtNwCeAD7SupcCn+ypKkjQew55ovpjBrbAfge6BOyfudQ1J0oIzbCg8XlVPTE0kOZQZHpUpSVrYhg2Fv0pyCXBUezbzx4H/019ZkqRxGDYU1gI7gbuA32HwlLQ9PXFNkrRADXv10Y8YPI7zg/2WI0kap2HvffRNZjiHUFXPmfOKJEljsz/3PppyJHAecPzclyNJGqehzilU1d9Pe32rqv47cHbPtUmSRmzYw0cvmjZ5CIM9h6f3UpEkaWyGPXz07mntJ4EtwG/MeTWSpLEa9uqjX+i7EEnS+A17+Ojf721+Vb1nhnU+BPxLYEdV/WzruxR4E4PvPABcUlXXt3lvBy5i8GS3362qzw05BknSHNmfq49eDFzbpl8DfBF4cC/rfAR4H7s/svOKqnrX9I4kK4DzgRcAzwJuSPK8qnoKSftlcu11Y9nulstfPZbtam7tz0N2XlRV34PuL/6PV9W/2dMKVfXFJJNDfv65wNVV9TjwzSSbgTOAm4dcX5I0B4YNhVOBJ6ZNPwFMHuA235LkDcBG4G1V9TCDW3HfMm2Zra1vN0nWAGsATj311AMsYbzG9ZecJO3LsPc+ugq4NcmlSd4JfIndDwsN4/3Ac4GVwHZ+fFVTZlh2xruwVtW6qlpVVasmJiYOoARJ0p4Me/XRZUk+C7y0dV1YVbfv78aq6qGpdpIPAn/RJrcCp0xbdBmwbX8/X5I0O8PuKQAcDTxSVX8MbE1y2v5uLMnJ0yZfB9zd2tcC5yc5on3ucuDW/f18SdLsDHtJ6jsZXIH0M8CHgcOA/83gaWx7WudPgbOAE5JsBd4JnJVkJYNDQ1sY3IabqronyTXAvQy+HHexVx5J0ugNe6L5dcDpwFcAqmpbkr3e5qKqLpih+8q9LH8ZcNmQ9UiSejBsKDxRVZWkAJI8rceapDnjlV7S/hn2nMI1ST4AHJvkTcAN+MAdSTroDHv10bvas5kfYXBe4T9V1YZeK5Mkjdw+QyHJEuBzVfWLgEEgSQexfR4+alcBPZbkmSOoR5I0RsOeaP4H4K4kG4DvT3VW1e/2UpUkaSyGDYXr2kuSdBDbaygkObWqHqiq9aMqSJI0Pvs6p/DpqUaST/ZciyRpzPYVCtPvXvqcPguRJI3fvkKh9tCWJB2E9nWi+YVJHmGwx3BUa9Omq6qe0Wt1kqSR2msoVNWSURUiSRq//XmegiTpIGcoSJI6hoIkqWMoSJI6ho
IkqWMoSJI6hoIkqWMoSJI6hoIkqWMoSJI6hoIkqdNbKCT5UJIdSe6e1nd8kg1J7mvvx7X+JHlvks1J7kzyor7qkiTtWZ97Ch8BXrlL31rgxqpaDtzYpgFeBSxvrzXA+3usS5K0B72FQlV9Efj2Lt3nAlOP9lwPvHZa/0dr4Bbg2CQn91WbJGlmoz6ncFJVbQdo7ye2/qXAg9OW29r6dpNkTZKNSTbu3Lmz12IlabGZLyeaM0PfjE96q6p1VbWqqlZNTEz0XJYkLS6jDoWHpg4LtfcdrX8rcMq05ZYB20ZcmyQteqMOhWuB1a29GvjMtP43tKuQzgS+O3WYSZI0Ovt6RvMBS/KnwFnACUm2Au8ELgeuSXIR8ABwXlv8euAcYDPwGHBhX3VJkvast1Coqgv2MOvlMyxbwMV91SJJGs58OdEsSZoHDAVJUqe3w0eSNCqTa68by3a3XP7qsWy3T+4pSJI6hoIkqWMoSJI6hoIkqWMoSJI6hoIkqWMoSJI6i/Z7CuO6rlmS5jP3FCRJHUNBktQxFCRJnUV7TkGSZmuc5yb7uu+SewqSpI6hIEnqePhII+ElwNLC4J6CJKljKEiSOoaCJKljKEiSOoaCJKljKEiSOmO5JDXJFuB7wFPAk1W1KsnxwJ8Bk8AW4Deq6uFx1CdJi9U49xR+oapWVtWqNr0WuLGqlgM3tmlJ0gjNp8NH5wLrW3s98Nox1iJJi9K4QqGAzye5Lcma1ndSVW0HaO8nzrRikjVJNibZuHPnzhGVK0mLw7huc/GSqtqW5ERgQ5KvDbtiVa0D1gGsWrWq+ipQkhajsewpVNW29r4D+HPgDOChJCcDtPcd46hNkhazkYdCkqclefpUG/gl4G7gWmB1W2w18JlR1yZJi904Dh+dBPx5kqntf6yq/jLJl4FrklwEPACcN4baJGlRG3koVNX9wAtn6P974OWjrkeS9GPz6ZJUSdKY+ZAdSXPCBykdHNxTkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUmfehUKSVyb5epLNSdaOux5JWkzmVSgkWQL8D+BVwArggiQrxluVJC0e8yoUgDOAzVV1f1U9AVwNnDvmmiRp0Th03AXsYinw4LTprcA/mb5AkjXAmjb5aJKvj6i2uXYC8HfjLmIOOZ75zfHMb/s9nvzRrLb37D3NmG+hkBn66icmqtYB60ZTTn+SbKyqVeOuY644nvnN8cxv82k88+3w0VbglGnTy4BtY6pFkhad+RYKXwaWJzktyeHA+cC1Y65JkhaNeXX4qKqeTPIW4HPAEuBDVXXPmMvqy4I/BLYLxzO/OZ75bd6MJ1W176UkSYvCfDt8JEkaI0NBktQxFObYvm7TkeTUJDcluT3JnUnOaf2HJVmf5K4km5K8ffTV726I8Tw7yY1tLF9IsmzavNVJ7muv1aOtfGYHOp4kK5PcnOSeNu/1o69+d7P5+bT5z0jyrSTvG13VezbL37dTk3y+/fu5N8nkKGufySzH81/b79umJO9NMtMl+3OvqnzN0YvByfFvAM8BDge+CqzYZZl1wL9t7RXAltb+TeDq1j4a2AJMLoDxfBxY3dpnA1e19vHA/e39uNY+bgGP53nA8tZ+FrAdOHahjmfa/D8GPga8b5xjmYvxAF8AXtHaxwBHL9TxAP8M+Jv2GUuAm4GzRlG3ewpza5jbdBTwjNZ+Jj/+HkYBT0tyKHAU8ATwSP8l79Uw41kB3NjaN02b/8vAhqr6dlU9DGwAXjmCmvfmgMdTVX9bVfe19jZgBzAxkqr3bDY/H5L8PHAS8PkR1DqMAx5Pu0faoVW1AaCqHq2qx0ZT9h7N5udTwJEMwuQI4DDgod4rxsNHc22m23Qs3WWZS4HfSrIVuB74d63/E8D3GfwF+gDwrqr6dq/V7t
sw4/kq8Gut/Trg6Ul+ash1R2024+kkOYPBP9Zv9FTnsA54PEkOAd4N/H7vVQ5vNj+f5wHfSfKpdmj2v7UbbI7TAY+nqm5mEBLb2+tzVbWp53oBQ2Gu7fM2HcAFwEeqahlwDnBV+wd6BvAUg0MTpwFvS/KcPosdwjDj+Q/Ay5LcDrwM+Bbw5JDrjtpsxjP4gORk4Crgwqr6UV+FDmk243kzcH1VPcj8MZvxHAq8tM1/MYNDNm/srdLhHPB4kvw08HwGd3VYCpyd5F/0WeyUefXltYPAMLfpuIh2GKWqbk5yJIObYf0m8JdV9UNgR5K/AVYxOBY/LvscTzuU8qsASY4Bfq2qvtv2hM7aZd0v9FnsEA54PG36GcB1wDuq6paRVLx3s/n5/FPgpUnezOD4++FJHq2qcT7DZLa/b7dX1f1t3qeBM4ErR1H4HsxmPGuAW6rq0TbvswzG88Xeqx7niZiD7cUgZO9n8Jf+1ImlF+yyzGeBN7b289svSYA/AD7c2k8D7gV+bgGM5wTgkNa+DPjD1j4e+CaDk8zHtfbxC3g8hzM49vvWcf+ezcV4dlnmjcyPE82z+fksactPtOkPAxcv4PG8HrihfcZh7XfvNSOpe9y/CAfbi8Ehob9lcLz5P7a+PwR+pbVXMLiq4KvAHcAvtf5jGFyJcE8LhN8f91iGHM+vA/e1Zf4XcMS0df81sLm9Lhz3WGYzHuC3gB+2n9nUa+VCHc8unzEvQmEOft9eAdwJ3AV8BDh8oY6HQch9ANjU/j94z6hq9jYXkqSOJ5olSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSZ3/DzGuKrVk9SkXAAAAAElFTkSuQmCC\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"df_trn['收率'].plot(kind='hist')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 训练模型"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"def _predict_best(booster, dmat):\n",
"    \"\"\"Predict using only the trees up to the best early-stopping round.\n",
"\n",
"    Older xgboost releases expose ``best_ntree_limit`` / ``ntree_limit``;\n",
"    newer ones replace them with ``best_iteration`` / ``iteration_range``.\n",
"    Whichever the installed Booster provides is used.\n",
"    \"\"\"\n",
"    best_limit = getattr(booster, 'best_ntree_limit', None)\n",
"    if best_limit is not None:\n",
"        return booster.predict(dmat, ntree_limit=best_limit)\n",
"    best_iter = getattr(booster, 'best_iteration', None)\n",
"    if best_iter is not None:\n",
"        return booster.predict(dmat, iteration_range=(0, best_iter + 1))\n",
"    return booster.predict(dmat)\n",
"\n",
"\n",
"def xgb_cv(train, test, params, fit_params, feature_names, nfold, seed):\n",
"    \"\"\"Train XGBoost with K-fold CV and average the per-fold test predictions.\n",
"\n",
"    Args:\n",
"        train: training DataFrame holding the target column '收率' and every\n",
"            column listed in ``feature_names``.\n",
"        test: test DataFrame holding '样本id' and every column listed in\n",
"            ``feature_names``.\n",
"        params: booster parameters passed to ``xgb.train``.\n",
"        fit_params: extra keyword arguments unpacked into ``xgb.train``.\n",
"        feature_names: list of feature column names.\n",
"        nfold: number of cross-validation folds.\n",
"        seed: random state of the shuffled ``KFold`` split.\n",
"\n",
"    Returns:\n",
"        DataFrame with columns 'id' and 'pred'; 'pred' is the mean of the\n",
"        fold models' predictions on ``test``. The out-of-fold MSE over\n",
"        ``train`` is printed as 'CV LOSS'.\n",
"    \"\"\"\n",
"    target = train['收率']\n",
"    # KFold yields positional indices, so predictions are collected in plain\n",
"    # arrays addressed by position. Label-based writes would hit the wrong\n",
"    # rows whenever ``train`` does not carry a default RangeIndex.\n",
"    oof_pred = np.zeros(len(train))\n",
"    tst_pred = np.zeros(len(test))\n",
"    kfolder = KFold(n_splits=nfold, shuffle=True, random_state=seed)\n",
"    xgb_tst = xgb.DMatrix(data=test[feature_names])\n",
"    print('\\n')\n",
"    for trn_idx, val_idx in kfolder.split(target):\n",
"        trn_part, val_part = train.iloc[trn_idx], train.iloc[val_idx]\n",
"        xgb_trn = xgb.DMatrix(trn_part[feature_names], trn_part['收率'])\n",
"        xgb_val = xgb.DMatrix(val_part[feature_names], val_part['收率'])\n",
"        # Both sets are monitored; the logged output shows the 'valid'\n",
"        # metric being the one used for early stopping.\n",
"        xgb_reg = xgb.train(params=params, dtrain=xgb_trn, **fit_params,\n",
"                            evals=[(xgb_trn, 'train'), (xgb_val, 'valid')])\n",
"        # Reuse the validation DMatrix for the out-of-fold prediction.\n",
"        oof_pred[val_idx] = _predict_best(xgb_reg, xgb_val)\n",
"        tst_pred += _predict_best(xgb_reg, xgb_tst) / nfold\n",
"    print('\\nCV LOSS:', mse(target, oof_pred), '\\n')\n",
"    return pd.DataFrame({'id': test['样本id'], 'pred': tst_pred})\n"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"# Training-loop settings; xgb_cv unpacks these into xgb.train.\n",
"fit_params = dict(\n",
"    num_boost_round=10800,\n",
"    verbose_eval=300,\n",
"    early_stopping_rounds=360,\n",
")\n",
"# Booster hyper-parameters.\n",
"params_xgb = dict(\n",
"    eta=0.01,\n",
"    max_depth=7,\n",
"    subsample=0.8,\n",
"    booster='gbtree',\n",
"    colsample_bytree=0.8,\n",
"    objective='reg:linear',\n",
"    silent=True,\n",
"    nthread=4,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"[0]\ttrain-rmse:0.42052\tvalid-rmse:0.417952\n",
"Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
"\n",
"Will train until valid-rmse hasn't improved in 360 rounds.\n",
"[300]\ttrain-rmse:0.023717\tvalid-rmse:0.023667\n",
"[600]\ttrain-rmse:0.00645\tvalid-rmse:0.011488\n",
"[900]\ttrain-rmse:0.004691\tvalid-rmse:0.011727\n",
"Stopping. Best iteration:\n",
"[600]\ttrain-rmse:0.00645\tvalid-rmse:0.011488\n",
"\n",
"[0]\ttrain-rmse:0.419812\tvalid-rmse:0.420785\n",
"Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
"\n",
"Will train until valid-rmse hasn't improved in 360 rounds.\n",
"[300]\ttrain-rmse:0.02374\tvalid-rmse:0.025614\n",
"[600]\ttrain-rmse:0.006597\tvalid-rmse:0.01204\n",
"[900]\ttrain-rmse:0.004692\tvalid-rmse:0.01197\n",
"Stopping. Best iteration:\n",
"[810]\ttrain-rmse:0.005159\tvalid-rmse:0.011948\n",
"\n",
"[0]\ttrain-rmse:0.419963\tvalid-rmse:0.420191\n",
"Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
"\n",
"Will train until valid-rmse hasn't improved in 360 rounds.\n",
"[300]\ttrain-rmse:0.023604\tvalid-rmse:0.025064\n",
"[600]\ttrain-rmse:0.006202\tvalid-rmse:0.01245\n",
"[900]\ttrain-rmse:0.004472\tvalid-rmse:0.012215\n",
"[1200]\ttrain-rmse:0.003453\tvalid-rmse:0.012209\n",
"Stopping. Best iteration:\n",
"[1062]\ttrain-rmse:0.003866\tvalid-rmse:0.012199\n",
"\n",
"[0]\ttrain-rmse:0.420254\tvalid-rmse:0.419025\n",
"Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
"\n",
"Will train until valid-rmse hasn't improved in 360 rounds.\n",
"[300]\ttrain-rmse:0.02381\tvalid-rmse:0.024832\n",
"[600]\ttrain-rmse:0.006467\tvalid-rmse:0.010957\n",
"[900]\ttrain-rmse:0.004619\tvalid-rmse:0.010752\n",
"[1200]\ttrain-rmse:0.003542\tvalid-rmse:0.010815\n",
"Stopping. Best iteration:\n",
"[873]\ttrain-rmse:0.004751\tvalid-rmse:0.01075\n",
"\n",
"[0]\ttrain-rmse:0.419487\tvalid-rmse:0.422069\n",
"Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
"\n",
"Will train until valid-rmse hasn't improved in 360 rounds.\n",
"[300]\ttrain-rmse:0.023859\tvalid-rmse:0.024063\n",
"[600]\ttrain-rmse:0.006739\tvalid-rmse:0.0102\n",
"[900]\ttrain-rmse:0.004817\tvalid-rmse:0.010053\n",
"[1200]\ttrain-rmse:0.003709\tvalid-rmse:0.010089\n",
"Stopping. Best iteration:\n",
"[872]\ttrain-rmse:0.004956\tvalid-rmse:0.010047\n",
"\n",
"\n",
"CV LOSS: 0.0001280110217167903 \n",
"\n"
]
}
],
"source": [
"# 开始训练\n",
"pred_xgb_a = xgb_cv(df_trn, df_tst, \n",
" params_xgb, fit_params,\n",
" df_trn.columns.tolist()[1:-1], 5, 0)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"# Attach the averaged fold predictions to test set A as the yield column.\n",
"# A bare array is assigned so rows are matched by position, not by index.\n",
"df_tst_a['收率'] = np.asarray(pred_xgb_a['pred'])"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 样本id | \n",
" A1 | \n",
" A2 | \n",
" A3 | \n",
" A4 | \n",
" A5 | \n",
" A6 | \n",
" A7 | \n",
" A8 | \n",
" A9 | \n",
" ... | \n",
" B6 | \n",
" B7 | \n",
" B8 | \n",
" B9 | \n",
" B10 | \n",
" B11 | \n",
" B12 | \n",
" B13 | \n",
" B14 | \n",
" 收率 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" sample_1656 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 6:00:00 | \n",
" 29 | \n",
" NaN | \n",
" NaN | \n",
" 8:00:00 | \n",
" ... | \n",
" 79 | \n",
" 17:00:00 | \n",
" 45 | \n",
" 17:00-18:30 | \n",
" 18:30-20:00 | \n",
" 20:00-21:00 | \n",
" 1200 | \n",
" 0.15 | \n",
" 400 | \n",
" 0.905793 | \n",
"
\n",
" \n",
" 1 | \n",
" sample_1548 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 12:30:00 | \n",
" 39 | \n",
" 12:50:00 | \n",
" 80.0 | \n",
" 14:20:00 | \n",
" ... | \n",
" 65 | \n",
" 10:00:00 | \n",
" 45 | \n",
" 12:00-13:00 | \n",
" 14:00-15:30 | \n",
" NaN | \n",
" 800 | \n",
" 0.15 | \n",
" 385 | \n",
" 0.879575 | \n",
"
\n",
" \n",
" 2 | \n",
" sample_769 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 6:00:00 | \n",
" 80 | \n",
" NaN | \n",
" NaN | \n",
" 8:00:00 | \n",
" ... | \n",
" 80 | \n",
" 17:00:00 | \n",
" 45 | \n",
" 17:00-20:00 | \n",
" NaN | \n",
" NaN | \n",
" 1200 | \n",
" 0.15 | \n",
" 440 | \n",
" 0.934695 | \n",
"
\n",
" \n",
" 3 | \n",
" sample_1881 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 22:00:00 | \n",
" 29 | \n",
" NaN | \n",
" NaN | \n",
" 0:00:00 | \n",
" ... | \n",
" 80 | \n",
" 9:00:00 | \n",
" 45 | \n",
" 9:00-10:30 | \n",
" 10:30-12:00 | \n",
" 12:00-13:00 | \n",
" 1200 | \n",
" 0.15 | \n",
" 400 | \n",
" 0.903490 | \n",
"
\n",
" \n",
" 4 | \n",
" sample_1807 | \n",
" 300 | \n",
" NaN | \n",
" 405.0 | \n",
" 700 | \n",
" 22:00:00 | \n",
" 30 | \n",
" NaN | \n",
" NaN | \n",
" 0:00:00 | \n",
" ... | \n",
" 79 | \n",
" 9:00:00 | \n",
" 45 | \n",
" 9:00-10:30 | \n",
" 10:30-12:00 | \n",
" 12:00-13:00 | \n",
" 1200 | \n",
" 0.15 | \n",
" 400 | \n",
" 0.928534 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 44 columns
\n",
"
"
],
"text/plain": [
" 样本id A1 A2 A3 A4 A5 A6 A7 A8 A9 \\\n",
"0 sample_1656 300 NaN 405.0 700 6:00:00 29 NaN NaN 8:00:00 \n",
"1 sample_1548 300 NaN 405.0 700 12:30:00 39 12:50:00 80.0 14:20:00 \n",
"2 sample_769 300 NaN 405.0 700 6:00:00 80 NaN NaN 8:00:00 \n",
"3 sample_1881 300 NaN 405.0 700 22:00:00 29 NaN NaN 0:00:00 \n",
"4 sample_1807 300 NaN 405.0 700 22:00:00 30 NaN NaN 0:00:00 \n",
"\n",
" ... B6 B7 B8 B9 B10 B11 B12 B13 \\\n",
"0 ... 79 17:00:00 45 17:00-18:30 18:30-20:00 20:00-21:00 1200 0.15 \n",
"1 ... 65 10:00:00 45 12:00-13:00 14:00-15:30 NaN 800 0.15 \n",
"2 ... 80 17:00:00 45 17:00-20:00 NaN NaN 1200 0.15 \n",
"3 ... 80 9:00:00 45 9:00-10:30 10:30-12:00 12:00-13:00 1200 0.15 \n",
"4 ... 79 9:00:00 45 9:00-10:30 10:30-12:00 12:00-13:00 1200 0.15 \n",
"\n",
" B14 收率 \n",
"0 400 0.905793 \n",
"1 385 0.879575 \n",
"2 440 0.934695 \n",
"3 400 0.903490 \n",
"4 400 0.928534 \n",
"\n",
"[5 rows x 44 columns]"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_tst_a.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}