diff --git a/机器学习竞赛实战_优胜解决方案/工业化工生产预测/.ipynb_checkpoints/工业生产预测-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/工业化工生产预测/.ipynb_checkpoints/工业生产预测-checkpoint.ipynb
index de8a0a6..d2d70e4 100644
--- a/机器学习竞赛实战_优胜解决方案/工业化工生产预测/.ipynb_checkpoints/工业生产预测-checkpoint.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/工业化工生产预测/.ipynb_checkpoints/工业生产预测-checkpoint.ipynb
@@ -334,9 +334,374 @@
"cell_type": "markdown",
"metadata": {},
"source": [
+ "### 如何确定字段需要处理\n",
"我们需要解决一些异常值,如某值相对其它值过大的离群点"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " A1 | \n",
+ " A2 | \n",
+ " A3 | \n",
+ " A4 | \n",
+ " A6 | \n",
+ " A8 | \n",
+ " A10 | \n",
+ " A12 | \n",
+ " A13 | \n",
+ " A15 | \n",
+ " A17 | \n",
+ " A18 | \n",
+ " A19 | \n",
+ " A21 | \n",
+ " A22 | \n",
+ " A23 | \n",
+ " A27 | \n",
+ " B1 | \n",
+ " B2 | \n",
+ " B3 | \n",
+ " B6 | \n",
+ " B8 | \n",
+ " B12 | \n",
+ " B13 | \n",
+ " B14 | \n",
+ " 收率 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 1396.000000 | \n",
+ " 42.0 | \n",
+ " 1354.000000 | \n",
+ " 1396.000000 | \n",
+ " 1396.000000 | \n",
+ " 149.000000 | \n",
+ " 1396.000000 | \n",
+ " 1396.000000 | \n",
+ " 1396.000000 | \n",
+ " 1396.000000 | \n",
+ " 1396.000000 | \n",
+ " 1396.000000 | \n",
+ " 1396.000000 | \n",
+ " 1393.000000 | \n",
+ " 1396.000000 | \n",
+ " 1393.000000 | \n",
+ " 1396.000000 | \n",
+ " 1386.000000 | \n",
+ " 1394.000000 | \n",
+ " 1394.000000 | \n",
+ " 1396.000000 | \n",
+ " 1395.000000 | \n",
+ " 1395.000000 | \n",
+ " 1395.000000 | \n",
+ " 1396.000000 | \n",
+ " 1396.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 298.853868 | \n",
+ " 125.0 | \n",
+ " 403.515510 | \n",
+ " 705.974212 | \n",
+ " 28.287751 | \n",
+ " 78.818792 | \n",
+ " 100.861032 | \n",
+ " 102.641834 | \n",
+ " 0.199907 | \n",
+ " 103.829370 | \n",
+ " 104.766905 | \n",
+ " 0.199928 | \n",
+ " 231.067335 | \n",
+ " 48.707825 | \n",
+ " 9.117120 | \n",
+ " 5.002872 | \n",
+ " 74.396848 | \n",
+ " 334.452742 | \n",
+ " 3.454412 | \n",
+ " 3.500072 | \n",
+ " 72.065186 | \n",
+ " 43.709677 | \n",
+ " 1020.215054 | \n",
+ " 0.149419 | \n",
+ " 410.403295 | \n",
+ " 0.923244 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 10.130552 | \n",
+ " 0.0 | \n",
+ " 13.348093 | \n",
+ " 53.214754 | \n",
+ " 6.742765 | \n",
+ " 2.683920 | \n",
+ " 0.905198 | \n",
+ " 0.915387 | \n",
+ " 0.002524 | \n",
+ " 0.963639 | \n",
+ " 1.401446 | \n",
+ " 0.002676 | \n",
+ " 50.478071 | \n",
+ " 4.976531 | \n",
+ " 0.369152 | \n",
+ " 0.136638 | \n",
+ " 3.044490 | \n",
+ " 105.120753 | \n",
+ " 0.388585 | \n",
+ " 0.002678 | \n",
+ " 9.161986 | \n",
+ " 4.338396 | \n",
+ " 205.920155 | \n",
+ " 0.008213 | \n",
+ " 26.018410 | \n",
+ " 0.030880 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 200.000000 | \n",
+ " 125.0 | \n",
+ " 270.000000 | \n",
+ " 470.000000 | \n",
+ " 17.000000 | \n",
+ " 70.000000 | \n",
+ " 100.000000 | \n",
+ " 98.000000 | \n",
+ " 0.120000 | \n",
+ " 100.000000 | \n",
+ " 89.000000 | \n",
+ " 0.100000 | \n",
+ " 100.000000 | \n",
+ " 20.000000 | \n",
+ " 3.500000 | \n",
+ " 4.000000 | \n",
+ " 45.000000 | \n",
+ " 3.500000 | \n",
+ " 0.150000 | \n",
+ " 3.500000 | \n",
+ " 40.000000 | \n",
+ " 20.000000 | \n",
+ " 400.000000 | \n",
+ " 0.030000 | \n",
+ " 40.000000 | \n",
+ " 0.624000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 300.000000 | \n",
+ " 125.0 | \n",
+ " 405.000000 | \n",
+ " 700.000000 | \n",
+ " 24.000000 | \n",
+ " 80.000000 | \n",
+ " 100.000000 | \n",
+ " 102.000000 | \n",
+ " 0.200000 | \n",
+ " 103.000000 | \n",
+ " 104.000000 | \n",
+ " 0.200000 | \n",
+ " 200.000000 | \n",
+ " 50.000000 | \n",
+ " 9.000000 | \n",
+ " 5.000000 | \n",
+ " 73.000000 | \n",
+ " 320.000000 | \n",
+ " 3.500000 | \n",
+ " 3.500000 | \n",
+ " 65.000000 | \n",
+ " 45.000000 | \n",
+ " 800.000000 | \n",
+ " 0.150000 | \n",
+ " 400.000000 | \n",
+ " 0.902000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 300.000000 | \n",
+ " 125.0 | \n",
+ " 405.000000 | \n",
+ " 700.000000 | \n",
+ " 29.000000 | \n",
+ " 80.000000 | \n",
+ " 101.000000 | \n",
+ " 103.000000 | \n",
+ " 0.200000 | \n",
+ " 104.000000 | \n",
+ " 105.000000 | \n",
+ " 0.200000 | \n",
+ " 200.000000 | \n",
+ " 50.000000 | \n",
+ " 9.000000 | \n",
+ " 5.000000 | \n",
+ " 73.000000 | \n",
+ " 320.000000 | \n",
+ " 3.500000 | \n",
+ " 3.500000 | \n",
+ " 78.000000 | \n",
+ " 45.000000 | \n",
+ " 1200.000000 | \n",
+ " 0.150000 | \n",
+ " 400.000000 | \n",
+ " 0.925000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 300.000000 | \n",
+ " 125.0 | \n",
+ " 405.000000 | \n",
+ " 700.000000 | \n",
+ " 30.000000 | \n",
+ " 80.000000 | \n",
+ " 102.000000 | \n",
+ " 103.000000 | \n",
+ " 0.200000 | \n",
+ " 104.000000 | \n",
+ " 105.000000 | \n",
+ " 0.200000 | \n",
+ " 300.000000 | \n",
+ " 50.000000 | \n",
+ " 9.000000 | \n",
+ " 5.000000 | \n",
+ " 77.000000 | \n",
+ " 330.000000 | \n",
+ " 3.500000 | \n",
+ " 3.500000 | \n",
+ " 80.000000 | \n",
+ " 45.000000 | \n",
+ " 1200.000000 | \n",
+ " 0.150000 | \n",
+ " 420.000000 | \n",
+ " 0.943000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 300.000000 | \n",
+ " 125.0 | \n",
+ " 405.000000 | \n",
+ " 980.000000 | \n",
+ " 97.000000 | \n",
+ " 82.000000 | \n",
+ " 103.000000 | \n",
+ " 107.000000 | \n",
+ " 0.200000 | \n",
+ " 109.000000 | \n",
+ " 108.000000 | \n",
+ " 0.200000 | \n",
+ " 350.000000 | \n",
+ " 90.000000 | \n",
+ " 10.000000 | \n",
+ " 10.000000 | \n",
+ " 80.000000 | \n",
+ " 1200.000000 | \n",
+ " 3.600000 | \n",
+ " 3.600000 | \n",
+ " 80.000000 | \n",
+ " 73.000000 | \n",
+ " 1200.000000 | \n",
+ " 0.150000 | \n",
+ " 460.000000 | \n",
+ " 1.000800 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " A1 A2 A3 A4 A6 A8 \\\n",
+ "count 1396.000000 42.0 1354.000000 1396.000000 1396.000000 149.000000 \n",
+ "mean 298.853868 125.0 403.515510 705.974212 28.287751 78.818792 \n",
+ "std 10.130552 0.0 13.348093 53.214754 6.742765 2.683920 \n",
+ "min 200.000000 125.0 270.000000 470.000000 17.000000 70.000000 \n",
+ "25% 300.000000 125.0 405.000000 700.000000 24.000000 80.000000 \n",
+ "50% 300.000000 125.0 405.000000 700.000000 29.000000 80.000000 \n",
+ "75% 300.000000 125.0 405.000000 700.000000 30.000000 80.000000 \n",
+ "max 300.000000 125.0 405.000000 980.000000 97.000000 82.000000 \n",
+ "\n",
+ " A10 A12 A13 A15 A17 \\\n",
+ "count 1396.000000 1396.000000 1396.000000 1396.000000 1396.000000 \n",
+ "mean 100.861032 102.641834 0.199907 103.829370 104.766905 \n",
+ "std 0.905198 0.915387 0.002524 0.963639 1.401446 \n",
+ "min 100.000000 98.000000 0.120000 100.000000 89.000000 \n",
+ "25% 100.000000 102.000000 0.200000 103.000000 104.000000 \n",
+ "50% 101.000000 103.000000 0.200000 104.000000 105.000000 \n",
+ "75% 102.000000 103.000000 0.200000 104.000000 105.000000 \n",
+ "max 103.000000 107.000000 0.200000 109.000000 108.000000 \n",
+ "\n",
+ " A18 A19 A21 A22 A23 \\\n",
+ "count 1396.000000 1396.000000 1393.000000 1396.000000 1393.000000 \n",
+ "mean 0.199928 231.067335 48.707825 9.117120 5.002872 \n",
+ "std 0.002676 50.478071 4.976531 0.369152 0.136638 \n",
+ "min 0.100000 100.000000 20.000000 3.500000 4.000000 \n",
+ "25% 0.200000 200.000000 50.000000 9.000000 5.000000 \n",
+ "50% 0.200000 200.000000 50.000000 9.000000 5.000000 \n",
+ "75% 0.200000 300.000000 50.000000 9.000000 5.000000 \n",
+ "max 0.200000 350.000000 90.000000 10.000000 10.000000 \n",
+ "\n",
+ " A27 B1 B2 B3 B6 \\\n",
+ "count 1396.000000 1386.000000 1394.000000 1394.000000 1396.000000 \n",
+ "mean 74.396848 334.452742 3.454412 3.500072 72.065186 \n",
+ "std 3.044490 105.120753 0.388585 0.002678 9.161986 \n",
+ "min 45.000000 3.500000 0.150000 3.500000 40.000000 \n",
+ "25% 73.000000 320.000000 3.500000 3.500000 65.000000 \n",
+ "50% 73.000000 320.000000 3.500000 3.500000 78.000000 \n",
+ "75% 77.000000 330.000000 3.500000 3.500000 80.000000 \n",
+ "max 80.000000 1200.000000 3.600000 3.600000 80.000000 \n",
+ "\n",
+ " B8 B12 B13 B14 收率 \n",
+ "count 1395.000000 1395.000000 1395.000000 1396.000000 1396.000000 \n",
+ "mean 43.709677 1020.215054 0.149419 410.403295 0.923244 \n",
+ "std 4.338396 205.920155 0.008213 26.018410 0.030880 \n",
+ "min 20.000000 400.000000 0.030000 40.000000 0.624000 \n",
+ "25% 45.000000 800.000000 0.150000 400.000000 0.902000 \n",
+ "50% 45.000000 1200.000000 0.150000 400.000000 0.925000 \n",
+ "75% 45.000000 1200.000000 0.150000 420.000000 0.943000 \n",
+ "max 73.000000 1200.000000 0.150000 460.000000 1.000800 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# pd.set_option('display.max_rows',100)#设置最大可见100行\n",
+ "pd.set_option('display.max_columns',100) #给最大列设置为100列\n",
+ "df_trn.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**观测点:**\n",
+ "\n",
+ " - A5、A9等字段没有出现在describe()的结果中,但在head()中是有的;describe()默认只统计数值型列,说明这些字段很可能不是数值类型,需要单独检查和处理\n",
+ " - 理论上,std(标准差,用方差也可以)越大,表明该特征在样本间的取值差异越大,模型越容易从中学到区分性;但是std过大也可能是数据中有离群值,B1、B12是需要关注的。再看其它统计量:B1的最小值是3.5,而25%/50%/75%分位数分别是320/320/330,3.5非常离群;而B12的最小值(400)、25%分位数(800)和中位数(1200)像是阶梯式递进。"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 5,
diff --git a/机器学习竞赛实战_优胜解决方案/工业化工生产预测/工业生产预测.ipynb b/机器学习竞赛实战_优胜解决方案/工业化工生产预测/工业生产预测.ipynb
index de8a0a6..d2d70e4 100644
--- a/机器学习竞赛实战_优胜解决方案/工业化工生产预测/工业生产预测.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/工业化工生产预测/工业生产预测.ipynb
@@ -334,9 +334,374 @@
"cell_type": "markdown",
"metadata": {},
"source": [
+ "### 如何确定字段需要处理\n",
"我们需要解决一些异常值,如某值相对其它值过大的离群点"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " A1 | \n",
+ " A2 | \n",
+ " A3 | \n",
+ " A4 | \n",
+ " A6 | \n",
+ " A8 | \n",
+ " A10 | \n",
+ " A12 | \n",
+ " A13 | \n",
+ " A15 | \n",
+ " A17 | \n",
+ " A18 | \n",
+ " A19 | \n",
+ " A21 | \n",
+ " A22 | \n",
+ " A23 | \n",
+ " A27 | \n",
+ " B1 | \n",
+ " B2 | \n",
+ " B3 | \n",
+ " B6 | \n",
+ " B8 | \n",
+ " B12 | \n",
+ " B13 | \n",
+ " B14 | \n",
+ " 收率 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 1396.000000 | \n",
+ " 42.0 | \n",
+ " 1354.000000 | \n",
+ " 1396.000000 | \n",
+ " 1396.000000 | \n",
+ " 149.000000 | \n",
+ " 1396.000000 | \n",
+ " 1396.000000 | \n",
+ " 1396.000000 | \n",
+ " 1396.000000 | \n",
+ " 1396.000000 | \n",
+ " 1396.000000 | \n",
+ " 1396.000000 | \n",
+ " 1393.000000 | \n",
+ " 1396.000000 | \n",
+ " 1393.000000 | \n",
+ " 1396.000000 | \n",
+ " 1386.000000 | \n",
+ " 1394.000000 | \n",
+ " 1394.000000 | \n",
+ " 1396.000000 | \n",
+ " 1395.000000 | \n",
+ " 1395.000000 | \n",
+ " 1395.000000 | \n",
+ " 1396.000000 | \n",
+ " 1396.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 298.853868 | \n",
+ " 125.0 | \n",
+ " 403.515510 | \n",
+ " 705.974212 | \n",
+ " 28.287751 | \n",
+ " 78.818792 | \n",
+ " 100.861032 | \n",
+ " 102.641834 | \n",
+ " 0.199907 | \n",
+ " 103.829370 | \n",
+ " 104.766905 | \n",
+ " 0.199928 | \n",
+ " 231.067335 | \n",
+ " 48.707825 | \n",
+ " 9.117120 | \n",
+ " 5.002872 | \n",
+ " 74.396848 | \n",
+ " 334.452742 | \n",
+ " 3.454412 | \n",
+ " 3.500072 | \n",
+ " 72.065186 | \n",
+ " 43.709677 | \n",
+ " 1020.215054 | \n",
+ " 0.149419 | \n",
+ " 410.403295 | \n",
+ " 0.923244 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 10.130552 | \n",
+ " 0.0 | \n",
+ " 13.348093 | \n",
+ " 53.214754 | \n",
+ " 6.742765 | \n",
+ " 2.683920 | \n",
+ " 0.905198 | \n",
+ " 0.915387 | \n",
+ " 0.002524 | \n",
+ " 0.963639 | \n",
+ " 1.401446 | \n",
+ " 0.002676 | \n",
+ " 50.478071 | \n",
+ " 4.976531 | \n",
+ " 0.369152 | \n",
+ " 0.136638 | \n",
+ " 3.044490 | \n",
+ " 105.120753 | \n",
+ " 0.388585 | \n",
+ " 0.002678 | \n",
+ " 9.161986 | \n",
+ " 4.338396 | \n",
+ " 205.920155 | \n",
+ " 0.008213 | \n",
+ " 26.018410 | \n",
+ " 0.030880 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 200.000000 | \n",
+ " 125.0 | \n",
+ " 270.000000 | \n",
+ " 470.000000 | \n",
+ " 17.000000 | \n",
+ " 70.000000 | \n",
+ " 100.000000 | \n",
+ " 98.000000 | \n",
+ " 0.120000 | \n",
+ " 100.000000 | \n",
+ " 89.000000 | \n",
+ " 0.100000 | \n",
+ " 100.000000 | \n",
+ " 20.000000 | \n",
+ " 3.500000 | \n",
+ " 4.000000 | \n",
+ " 45.000000 | \n",
+ " 3.500000 | \n",
+ " 0.150000 | \n",
+ " 3.500000 | \n",
+ " 40.000000 | \n",
+ " 20.000000 | \n",
+ " 400.000000 | \n",
+ " 0.030000 | \n",
+ " 40.000000 | \n",
+ " 0.624000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 300.000000 | \n",
+ " 125.0 | \n",
+ " 405.000000 | \n",
+ " 700.000000 | \n",
+ " 24.000000 | \n",
+ " 80.000000 | \n",
+ " 100.000000 | \n",
+ " 102.000000 | \n",
+ " 0.200000 | \n",
+ " 103.000000 | \n",
+ " 104.000000 | \n",
+ " 0.200000 | \n",
+ " 200.000000 | \n",
+ " 50.000000 | \n",
+ " 9.000000 | \n",
+ " 5.000000 | \n",
+ " 73.000000 | \n",
+ " 320.000000 | \n",
+ " 3.500000 | \n",
+ " 3.500000 | \n",
+ " 65.000000 | \n",
+ " 45.000000 | \n",
+ " 800.000000 | \n",
+ " 0.150000 | \n",
+ " 400.000000 | \n",
+ " 0.902000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 300.000000 | \n",
+ " 125.0 | \n",
+ " 405.000000 | \n",
+ " 700.000000 | \n",
+ " 29.000000 | \n",
+ " 80.000000 | \n",
+ " 101.000000 | \n",
+ " 103.000000 | \n",
+ " 0.200000 | \n",
+ " 104.000000 | \n",
+ " 105.000000 | \n",
+ " 0.200000 | \n",
+ " 200.000000 | \n",
+ " 50.000000 | \n",
+ " 9.000000 | \n",
+ " 5.000000 | \n",
+ " 73.000000 | \n",
+ " 320.000000 | \n",
+ " 3.500000 | \n",
+ " 3.500000 | \n",
+ " 78.000000 | \n",
+ " 45.000000 | \n",
+ " 1200.000000 | \n",
+ " 0.150000 | \n",
+ " 400.000000 | \n",
+ " 0.925000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 300.000000 | \n",
+ " 125.0 | \n",
+ " 405.000000 | \n",
+ " 700.000000 | \n",
+ " 30.000000 | \n",
+ " 80.000000 | \n",
+ " 102.000000 | \n",
+ " 103.000000 | \n",
+ " 0.200000 | \n",
+ " 104.000000 | \n",
+ " 105.000000 | \n",
+ " 0.200000 | \n",
+ " 300.000000 | \n",
+ " 50.000000 | \n",
+ " 9.000000 | \n",
+ " 5.000000 | \n",
+ " 77.000000 | \n",
+ " 330.000000 | \n",
+ " 3.500000 | \n",
+ " 3.500000 | \n",
+ " 80.000000 | \n",
+ " 45.000000 | \n",
+ " 1200.000000 | \n",
+ " 0.150000 | \n",
+ " 420.000000 | \n",
+ " 0.943000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 300.000000 | \n",
+ " 125.0 | \n",
+ " 405.000000 | \n",
+ " 980.000000 | \n",
+ " 97.000000 | \n",
+ " 82.000000 | \n",
+ " 103.000000 | \n",
+ " 107.000000 | \n",
+ " 0.200000 | \n",
+ " 109.000000 | \n",
+ " 108.000000 | \n",
+ " 0.200000 | \n",
+ " 350.000000 | \n",
+ " 90.000000 | \n",
+ " 10.000000 | \n",
+ " 10.000000 | \n",
+ " 80.000000 | \n",
+ " 1200.000000 | \n",
+ " 3.600000 | \n",
+ " 3.600000 | \n",
+ " 80.000000 | \n",
+ " 73.000000 | \n",
+ " 1200.000000 | \n",
+ " 0.150000 | \n",
+ " 460.000000 | \n",
+ " 1.000800 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " A1 A2 A3 A4 A6 A8 \\\n",
+ "count 1396.000000 42.0 1354.000000 1396.000000 1396.000000 149.000000 \n",
+ "mean 298.853868 125.0 403.515510 705.974212 28.287751 78.818792 \n",
+ "std 10.130552 0.0 13.348093 53.214754 6.742765 2.683920 \n",
+ "min 200.000000 125.0 270.000000 470.000000 17.000000 70.000000 \n",
+ "25% 300.000000 125.0 405.000000 700.000000 24.000000 80.000000 \n",
+ "50% 300.000000 125.0 405.000000 700.000000 29.000000 80.000000 \n",
+ "75% 300.000000 125.0 405.000000 700.000000 30.000000 80.000000 \n",
+ "max 300.000000 125.0 405.000000 980.000000 97.000000 82.000000 \n",
+ "\n",
+ " A10 A12 A13 A15 A17 \\\n",
+ "count 1396.000000 1396.000000 1396.000000 1396.000000 1396.000000 \n",
+ "mean 100.861032 102.641834 0.199907 103.829370 104.766905 \n",
+ "std 0.905198 0.915387 0.002524 0.963639 1.401446 \n",
+ "min 100.000000 98.000000 0.120000 100.000000 89.000000 \n",
+ "25% 100.000000 102.000000 0.200000 103.000000 104.000000 \n",
+ "50% 101.000000 103.000000 0.200000 104.000000 105.000000 \n",
+ "75% 102.000000 103.000000 0.200000 104.000000 105.000000 \n",
+ "max 103.000000 107.000000 0.200000 109.000000 108.000000 \n",
+ "\n",
+ " A18 A19 A21 A22 A23 \\\n",
+ "count 1396.000000 1396.000000 1393.000000 1396.000000 1393.000000 \n",
+ "mean 0.199928 231.067335 48.707825 9.117120 5.002872 \n",
+ "std 0.002676 50.478071 4.976531 0.369152 0.136638 \n",
+ "min 0.100000 100.000000 20.000000 3.500000 4.000000 \n",
+ "25% 0.200000 200.000000 50.000000 9.000000 5.000000 \n",
+ "50% 0.200000 200.000000 50.000000 9.000000 5.000000 \n",
+ "75% 0.200000 300.000000 50.000000 9.000000 5.000000 \n",
+ "max 0.200000 350.000000 90.000000 10.000000 10.000000 \n",
+ "\n",
+ " A27 B1 B2 B3 B6 \\\n",
+ "count 1396.000000 1386.000000 1394.000000 1394.000000 1396.000000 \n",
+ "mean 74.396848 334.452742 3.454412 3.500072 72.065186 \n",
+ "std 3.044490 105.120753 0.388585 0.002678 9.161986 \n",
+ "min 45.000000 3.500000 0.150000 3.500000 40.000000 \n",
+ "25% 73.000000 320.000000 3.500000 3.500000 65.000000 \n",
+ "50% 73.000000 320.000000 3.500000 3.500000 78.000000 \n",
+ "75% 77.000000 330.000000 3.500000 3.500000 80.000000 \n",
+ "max 80.000000 1200.000000 3.600000 3.600000 80.000000 \n",
+ "\n",
+ " B8 B12 B13 B14 收率 \n",
+ "count 1395.000000 1395.000000 1395.000000 1396.000000 1396.000000 \n",
+ "mean 43.709677 1020.215054 0.149419 410.403295 0.923244 \n",
+ "std 4.338396 205.920155 0.008213 26.018410 0.030880 \n",
+ "min 20.000000 400.000000 0.030000 40.000000 0.624000 \n",
+ "25% 45.000000 800.000000 0.150000 400.000000 0.902000 \n",
+ "50% 45.000000 1200.000000 0.150000 400.000000 0.925000 \n",
+ "75% 45.000000 1200.000000 0.150000 420.000000 0.943000 \n",
+ "max 73.000000 1200.000000 0.150000 460.000000 1.000800 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# pd.set_option('display.max_rows',100)#设置最大可见100行\n",
+ "pd.set_option('display.max_columns',100) #给最大列设置为100列\n",
+ "df_trn.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**观测点:**\n",
+ "\n",
+ " - A5、A9等字段没有出现在describe()的结果中,但在head()中是有的;describe()默认只统计数值型列,说明这些字段很可能不是数值类型,需要单独检查和处理\n",
+ " - 理论上,std(标准差,用方差也可以)越大,表明该特征在样本间的取值差异越大,模型越容易从中学到区分性;但是std过大也可能是数据中有离群值,B1、B12是需要关注的。再看其它统计量:B1的最小值是3.5,而25%/50%/75%分位数分别是320/320/330,3.5非常离群;而B12的最小值(400)、25%分位数(800)和中位数(1200)像是阶梯式递进。"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 5,