{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### 任务目标:利用异烟酸生产过程中的各参数,预测最终异烟酸的收率\n", "\n", "\n", "**预测具体的值:回归任务**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 数据处理" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import warnings\n", "import xgboost as xgb\n", "from sklearn.model_selection import KFold\n", "from sklearn.metrics import mean_squared_error as mse\n", "\n", "warnings.simplefilter('ignore')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Load the training set and the two test sets (A and B).\n", "# The CSV files are read with the GB2312 codec (they contain Chinese column names),\n", "# so the encoding is passed explicitly and kept in one place.\n", "DATA_ENCODING = 'GB2312'\n", "\n", "df_trn = pd.read_csv('jinnan_round1_train_20181227.csv', encoding=DATA_ENCODING)\n", "df_tst_a = pd.read_csv('jinnan_round1_testA_20181227.csv', encoding=DATA_ENCODING)\n", "df_tst_b = pd.read_csv('jinnan_round1_testB_20190121.csv', encoding=DATA_ENCODING)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
样本idA1A2A3A4A5A6A7A8A9...B6B7B8B9B10B11B12B13B14收率
0sample_1528300NaN405.070013:30:0038.0NaNNaN15:30:00...6511:30:0045.011:30-13:0014:00-15:30NaN800.00.154000.879
1sample_1698300NaN405.070014:00:0029.0NaNNaN16:00:00...806:00:0045.06:00-7:307:30-9:009:00-10:001200.00.154000.902
2sample_639300NaN405.070014:00:0029.0NaNNaN16:00:00...801:00:0045.01:00-2:302:30-4:004:00-5:001200.00.154000.936
3sample_483300NaN405.07001:30:0038.0NaNNaN3:00:00...6518:00:0045.019:00-20:3021:30-23:00NaN800.00.154000.902
4sample_617300NaN405.070022:00:0029.0NaNNaN0:00:00...809:00:0045.09:00-10:3010:30-12:0012:00-13:001200.00.154200.983
\n", "

5 rows × 44 columns

\n", "
" ], "text/plain": [ " 样本id A1 A2 A3 A4 A5 A6 A7 A8 A9 ... \\\n", "0 sample_1528 300 NaN 405.0 700 13:30:00 38.0 NaN NaN 15:30:00 ... \n", "1 sample_1698 300 NaN 405.0 700 14:00:00 29.0 NaN NaN 16:00:00 ... \n", "2 sample_639 300 NaN 405.0 700 14:00:00 29.0 NaN NaN 16:00:00 ... \n", "3 sample_483 300 NaN 405.0 700 1:30:00 38.0 NaN NaN 3:00:00 ... \n", "4 sample_617 300 NaN 405.0 700 22:00:00 29.0 NaN NaN 0:00:00 ... \n", "\n", " B6 B7 B8 B9 B10 B11 B12 B13 \\\n", "0 65 11:30:00 45.0 11:30-13:00 14:00-15:30 NaN 800.0 0.15 \n", "1 80 6:00:00 45.0 6:00-7:30 7:30-9:00 9:00-10:00 1200.0 0.15 \n", "2 80 1:00:00 45.0 1:00-2:30 2:30-4:00 4:00-5:00 1200.0 0.15 \n", "3 65 18:00:00 45.0 19:00-20:30 21:30-23:00 NaN 800.0 0.15 \n", "4 80 9:00:00 45.0 9:00-10:30 10:30-12:00 12:00-13:00 1200.0 0.15 \n", "\n", " B14 收率 \n", "0 400 0.879 \n", "1 400 0.902 \n", "2 400 0.936 \n", "3 400 0.902 \n", "4 420 0.983 \n", "\n", "[5 rows x 44 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 观察数据\n", "df_trn.head() # 可以发现A2、A7等有NaN缺失值" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 1396 entries, 0 to 1395\n", "Data columns (total 44 columns):\n", "样本id 1396 non-null object\n", "A1 1396 non-null int64\n", "A2 42 non-null float64\n", "A3 1354 non-null float64\n", "A4 1396 non-null int64\n", "A5 1396 non-null object\n", "A6 1396 non-null float64\n", "A7 149 non-null object\n", "A8 149 non-null float64\n", "A9 1396 non-null object\n", "A10 1396 non-null int64\n", "A11 1396 non-null object\n", "A12 1396 non-null int64\n", "A13 1396 non-null float64\n", "A14 1396 non-null object\n", "A15 1396 non-null float64\n", "A16 1396 non-null object\n", "A17 1396 non-null float64\n", "A18 1396 non-null float64\n", "A19 1396 non-null int64\n", "A20 1396 non-null object\n", "A21 1393 non-null float64\n", "A22 1396 non-null float64\n", "A23 1393 
non-null float64\n", "A24 1395 non-null object\n", "A25 1396 non-null object\n", "A26 1394 non-null object\n", "A27 1396 non-null int64\n", "A28 1396 non-null object\n", "B1 1386 non-null float64\n", "B2 1394 non-null float64\n", "B3 1394 non-null float64\n", "B4 1396 non-null object\n", "B5 1395 non-null object\n", "B6 1396 non-null int64\n", "B7 1396 non-null object\n", "B8 1395 non-null float64\n", "B9 1396 non-null object\n", "B10 1152 non-null object\n", "B11 547 non-null object\n", "B12 1395 non-null float64\n", "B13 1395 non-null float64\n", "B14 1396 non-null int64\n", "收率 1396 non-null float64\n", "dtypes: float64(18), int64(8), object(18)\n", "memory usage: 480.0+ KB\n" ] } ], "source": [ "df_trn.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 如何确定字段需要处理\n", "我们需要解决一些异常值,如某值相对其它值过大的离群点" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
A1A2A3A4A6A8A10A12A13A15A17A18A19A21A22A23A27B1B2B3B6B8B12B13B14收率
count1396.00000042.01354.0000001396.0000001396.000000149.0000001396.0000001396.0000001396.0000001396.0000001396.0000001396.0000001396.0000001393.0000001396.0000001393.0000001396.0000001386.0000001394.0000001394.0000001396.0000001395.0000001395.0000001395.0000001396.0000001396.000000
mean298.853868125.0403.515510705.97421228.28775178.818792100.861032102.6418340.199907103.829370104.7669050.199928231.06733548.7078259.1171205.00287274.396848334.4527423.4544123.50007272.06518643.7096771020.2150540.149419410.4032950.923244
std10.1305520.013.34809353.2147546.7427652.6839200.9051980.9153870.0025240.9636391.4014460.00267650.4780714.9765310.3691520.1366383.044490105.1207530.3885850.0026789.1619864.338396205.9201550.00821326.0184100.030880
min200.000000125.0270.000000470.00000017.00000070.000000100.00000098.0000000.120000100.00000089.0000000.100000100.00000020.0000003.5000004.00000045.0000003.5000000.1500003.50000040.00000020.000000400.0000000.03000040.0000000.624000
25%300.000000125.0405.000000700.00000024.00000080.000000100.000000102.0000000.200000103.000000104.0000000.200000200.00000050.0000009.0000005.00000073.000000320.0000003.5000003.50000065.00000045.000000800.0000000.150000400.0000000.902000
50%300.000000125.0405.000000700.00000029.00000080.000000101.000000103.0000000.200000104.000000105.0000000.200000200.00000050.0000009.0000005.00000073.000000320.0000003.5000003.50000078.00000045.0000001200.0000000.150000400.0000000.925000
75%300.000000125.0405.000000700.00000030.00000080.000000102.000000103.0000000.200000104.000000105.0000000.200000300.00000050.0000009.0000005.00000077.000000330.0000003.5000003.50000080.00000045.0000001200.0000000.150000420.0000000.943000
max300.000000125.0405.000000980.00000097.00000082.000000103.000000107.0000000.200000109.000000108.0000000.200000350.00000090.00000010.00000010.00000080.0000001200.0000003.6000003.60000080.00000073.0000001200.0000000.150000460.0000001.000800
\n", "
" ], "text/plain": [ " A1 A2 A3 A4 A6 A8 \\\n", "count 1396.000000 42.0 1354.000000 1396.000000 1396.000000 149.000000 \n", "mean 298.853868 125.0 403.515510 705.974212 28.287751 78.818792 \n", "std 10.130552 0.0 13.348093 53.214754 6.742765 2.683920 \n", "min 200.000000 125.0 270.000000 470.000000 17.000000 70.000000 \n", "25% 300.000000 125.0 405.000000 700.000000 24.000000 80.000000 \n", "50% 300.000000 125.0 405.000000 700.000000 29.000000 80.000000 \n", "75% 300.000000 125.0 405.000000 700.000000 30.000000 80.000000 \n", "max 300.000000 125.0 405.000000 980.000000 97.000000 82.000000 \n", "\n", " A10 A12 A13 A15 A17 \\\n", "count 1396.000000 1396.000000 1396.000000 1396.000000 1396.000000 \n", "mean 100.861032 102.641834 0.199907 103.829370 104.766905 \n", "std 0.905198 0.915387 0.002524 0.963639 1.401446 \n", "min 100.000000 98.000000 0.120000 100.000000 89.000000 \n", "25% 100.000000 102.000000 0.200000 103.000000 104.000000 \n", "50% 101.000000 103.000000 0.200000 104.000000 105.000000 \n", "75% 102.000000 103.000000 0.200000 104.000000 105.000000 \n", "max 103.000000 107.000000 0.200000 109.000000 108.000000 \n", "\n", " A18 A19 A21 A22 A23 \\\n", "count 1396.000000 1396.000000 1393.000000 1396.000000 1393.000000 \n", "mean 0.199928 231.067335 48.707825 9.117120 5.002872 \n", "std 0.002676 50.478071 4.976531 0.369152 0.136638 \n", "min 0.100000 100.000000 20.000000 3.500000 4.000000 \n", "25% 0.200000 200.000000 50.000000 9.000000 5.000000 \n", "50% 0.200000 200.000000 50.000000 9.000000 5.000000 \n", "75% 0.200000 300.000000 50.000000 9.000000 5.000000 \n", "max 0.200000 350.000000 90.000000 10.000000 10.000000 \n", "\n", " A27 B1 B2 B3 B6 \\\n", "count 1396.000000 1386.000000 1394.000000 1394.000000 1396.000000 \n", "mean 74.396848 334.452742 3.454412 3.500072 72.065186 \n", "std 3.044490 105.120753 0.388585 0.002678 9.161986 \n", "min 45.000000 3.500000 0.150000 3.500000 40.000000 \n", "25% 73.000000 320.000000 3.500000 3.500000 65.000000 \n", "50% 
73.000000 320.000000 3.500000 3.500000 78.000000 \n", "75% 77.000000 330.000000 3.500000 3.500000 80.000000 \n", "max 80.000000 1200.000000 3.600000 3.600000 80.000000 \n", "\n", " B8 B12 B13 B14 收率 \n", "count 1395.000000 1395.000000 1395.000000 1396.000000 1396.000000 \n", "mean 43.709677 1020.215054 0.149419 410.403295 0.923244 \n", "std 4.338396 205.920155 0.008213 26.018410 0.030880 \n", "min 20.000000 400.000000 0.030000 40.000000 0.624000 \n", "25% 45.000000 800.000000 0.150000 400.000000 0.902000 \n", "50% 45.000000 1200.000000 0.150000 400.000000 0.925000 \n", "75% 45.000000 1200.000000 0.150000 420.000000 0.943000 \n", "max 73.000000 1200.000000 0.150000 460.000000 1.000800 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Show up to 100 columns so wide tables (this dataset has 44 columns) are displayed without '...' truncation.\n", "# display.max_rows can be raised the same way if long tables get truncated.\n", "pd.options.display.max_columns = 100\n", "\n", "# Summary statistics of the numeric columns; used to spot outliers (compare min/max with the quartiles).\n", "df_trn.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**观测点:**\n", "