diff --git a/机器学习竞赛实战_优胜解决方案/游戏销售数据_特征常用构建方法/游戏销售数据-常用特征构造方法.ipynb b/机器学习竞赛实战_优胜解决方案/游戏销售数据_特征常用构建方法/游戏销售数据-常用特征构造方法.ipynb
index 2d81053..1dc6681 100644
--- a/机器学习竞赛实战_优胜解决方案/游戏销售数据_特征常用构建方法/游戏销售数据-常用特征构造方法.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/游戏销售数据_特征常用构建方法/游戏销售数据-常用特征构造方法.ipynb
@@ -9,12 +9,14 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
- "import numpy as np"
+ "import numpy as np\n",
+ "import warnings # 忽略普通警告,不打印太多东西\n",
+ "warnings.filterwarnings('ignore')"
]
},
{
@@ -451,19 +453,9 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 52,
"metadata": {},
"outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "D:\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:415: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
- "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
- "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
- " warnings.warn(msg, FutureWarning)\n"
- ]
- },
{
"data": {
"text/plain": [
@@ -476,7 +468,7 @@
" [0., 0., 0., ..., 0., 0., 0.]])"
]
},
- "execution_count": 26,
+ "execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
@@ -1240,6 +1232,930 @@
"dummy_df_true.head()"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 二值特征化"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Name | \n",
+ " Year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Wii Sports | \n",
+ " 2006.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Super Mario Bros. | \n",
+ " 1985.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Mario Kart Wii | \n",
+ " 2008.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Wii Sports Resort | \n",
+ " 2009.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Pokemon Red/Pokemon Blue | \n",
+ " 1996.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name Year\n",
+ "0 Wii Sports 2006.0\n",
+ "1 Super Mario Bros. 1985.0\n",
+ "2 Mario Kart Wii 2008.0\n",
+ "3 Wii Sports Resort 2009.0\n",
+ "4 Pokemon Red/Pokemon Blue 1996.0"
+ ]
+ },
+ "execution_count": 54,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Take an independent copy of the two columns: later cells add columns to this frame,\n",
+ "# and writing into a plain column selection of vg_df is the chained-assignment pattern\n",
+ "# pandas warns about (SettingWithCopyWarning).\n",
+ "vg_year_df = vg_df[['Name', 'Year']].copy()\n",
+ "vg_year_df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "我们把2000年及以后(Year >= 2000)的归类为1,其它(包括年份缺失的)归类为0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Name | \n",
+ " Year | \n",
+ " Year_tow | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Wii Sports | \n",
+ " 2006.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Super Mario Bros. | \n",
+ " 1985.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Mario Kart Wii | \n",
+ " 2008.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Wii Sports Resort | \n",
+ " 2009.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Pokemon Red/Pokemon Blue | \n",
+ " 1996.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name Year Year_tow\n",
+ "0 Wii Sports 2006.0 1\n",
+ "1 Super Mario Bros. 1985.0 0\n",
+ "2 Mario Kart Wii 2008.0 1\n",
+ "3 Wii Sports Resort 2009.0 1\n",
+ "4 Pokemon Red/Pokemon Blue 1996.0 0"
+ ]
+ },
+ "execution_count": 55,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Flag releases from 2000 onwards with 1; earlier years, and missing years (the comparison\n",
+ "# is False for NaN), get 0\n",
+ "is_2000_or_later = vg_year_df['Year'] >= 2000\n",
+ "vg_year_df['Year_tow'] = is_2000_or_later.astype(int)\n",
+ "vg_year_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Name | \n",
+ " Year | \n",
+ " Year_tow | \n",
+ " bn_year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Wii Sports | \n",
+ " 2006.0 | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Super Mario Bros. | \n",
+ " 1985.0 | \n",
+ " 0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Mario Kart Wii | \n",
+ " 2008.0 | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Wii Sports Resort | \n",
+ " 2009.0 | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Pokemon Red/Pokemon Blue | \n",
+ " 1996.0 | \n",
+ " 0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name Year Year_tow bn_year\n",
+ "0 Wii Sports 2006.0 1 1.0\n",
+ "1 Super Mario Bros. 1985.0 0 0.0\n",
+ "2 Mario Kart Wii 2008.0 1 1.0\n",
+ "3 Wii Sports Resort 2009.0 1 1.0\n",
+ "4 Pokemon Red/Pokemon Blue 1996.0 0 0.0"
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.preprocessing import Binarizer\n",
+ "\n",
+ "# Same binarisation done with scikit-learn.\n",
+ "# Binarizer maps values strictly greater than the threshold to 1, so threshold=2000 would\n",
+ "# put games released in 2000 into class 0 while the manual rule (Year >= 2000) puts them\n",
+ "# into class 1. 1999.5 reproduces the manual rule (the Year values shown are whole numbers).\n",
+ "bn = Binarizer(threshold=1999.5)\n",
+ "# Year contains NaN; fill it with 0 first because the binarizer cannot handle missing values\n",
+ "vg_year_df['Year'] = vg_year_df['Year'].fillna(0)\n",
+ "# transform expects a 2-D array: one row per game, a single Year column\n",
+ "bn_year = bn.transform(vg_year_df[['Year']].values).ravel()\n",
+ "vg_year_df['bn_year'] = bn_year  # store the result as a new column\n",
+ "vg_year_df.head()  # agrees with the manual Year_tow column"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 多项式特征\n",
+ "获得特征的更高维度和互相间关系的项。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " NA_Sales | \n",
+ " EU_Sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 41.49 | \n",
+ " 29.02 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 29.08 | \n",
+ " 3.58 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 15.85 | \n",
+ " 12.88 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 15.75 | \n",
+ " 11.01 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 11.27 | \n",
+ " 8.89 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " NA_Sales EU_Sales\n",
+ "0 41.49 29.02\n",
+ "1 29.08 3.58\n",
+ "2 15.85 12.88\n",
+ "3 15.75 11.01\n",
+ "4 11.27 8.89"
+ ]
+ },
+ "execution_count": 61,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Select the two regional sales columns used for the polynomial-feature example\n",
+ "sales_columns = ['NA_Sales', 'EU_Sales']\n",
+ "polynomial_df = vg_df[sales_columns]\n",
+ "polynomial_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[4.1490000e+01, 2.9020000e+01, 1.7214201e+03, 1.2040398e+03,\n",
+ " 8.4216040e+02],\n",
+ " [2.9080000e+01, 3.5800000e+00, 8.4564640e+02, 1.0410640e+02,\n",
+ " 1.2816400e+01],\n",
+ " [1.5850000e+01, 1.2880000e+01, 2.5122250e+02, 2.0414800e+02,\n",
+ " 1.6589440e+02],\n",
+ " ...,\n",
+ " [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,\n",
+ " 0.0000000e+00],\n",
+ " [0.0000000e+00, 1.0000000e-02, 0.0000000e+00, 0.0000000e+00,\n",
+ " 1.0000000e-04],\n",
+ " [1.0000000e-02, 0.0000000e+00, 1.0000000e-04, 0.0000000e+00,\n",
+ " 0.0000000e+00]])"
+ ]
+ },
+ "execution_count": 62,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.preprocessing import PolynomialFeatures\n",
+ "\n",
+ "# degree=2 with interaction_only=False yields the original columns, their squares and\n",
+ "# their product; include_bias=False leaves out the constant column\n",
+ "pf = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)\n",
+ "pf.fit(polynomial_df)\n",
+ "res = pf.transform(polynomial_df)\n",
+ "res"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "以第一行为例:\n",
+ "<br>第一列和第二列分别表示原先的第一列(NA_Sales)和第二列(EU_Sales)\n",
+ "<br>第三列和第五列分别表示第一列和第二列的平方,第四列表示两者的乘积"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " NA_Sales | \n",
+ " EU_Sales | \n",
+ " NA_Sales^2 | \n",
+ " NA_Sales*EU_Sales | \n",
+ " EU_Sales^2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 41.49 | \n",
+ " 29.02 | \n",
+ " 1721.4201 | \n",
+ " 1204.0398 | \n",
+ " 842.1604 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 29.08 | \n",
+ " 3.58 | \n",
+ " 845.6464 | \n",
+ " 104.1064 | \n",
+ " 12.8164 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 15.85 | \n",
+ " 12.88 | \n",
+ " 251.2225 | \n",
+ " 204.1480 | \n",
+ " 165.8944 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 15.75 | \n",
+ " 11.01 | \n",
+ " 248.0625 | \n",
+ " 173.4075 | \n",
+ " 121.2201 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 11.27 | \n",
+ " 8.89 | \n",
+ " 127.0129 | \n",
+ " 100.1903 | \n",
+ " 79.0321 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " NA_Sales EU_Sales NA_Sales^2 NA_Sales*EU_Sales EU_Sales^2\n",
+ "0 41.49 29.02 1721.4201 1204.0398 842.1604\n",
+ "1 29.08 3.58 845.6464 104.1064 12.8164\n",
+ "2 15.85 12.88 251.2225 204.1480 165.8944\n",
+ "3 15.75 11.01 248.0625 173.4075 121.2201\n",
+ "4 11.27 8.89 127.0129 100.1903 79.0321"
+ ]
+ },
+ "execution_count": 63,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Name the five generated columns in the order they appear in res\n",
+ "poly_columns = ['NA_Sales', 'EU_Sales', 'NA_Sales^2', 'NA_Sales*EU_Sales', 'EU_Sales^2']\n",
+ "intr_features = pd.DataFrame(res, columns=poly_columns)\n",
+ "intr_features.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Rank | \n",
+ " Name | \n",
+ " Platform | \n",
+ " Year | \n",
+ " Genre | \n",
+ " Publisher | \n",
+ " NA_Sales | \n",
+ " EU_Sales | \n",
+ " JP_Sales | \n",
+ " Other_Sales | \n",
+ " Global_Sales | \n",
+ " GenreLabel | \n",
+ " GenreMap | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Wii Sports | \n",
+ " Wii | \n",
+ " 2006.0 | \n",
+ " Sports | \n",
+ " Nintendo | \n",
+ " 41.49 | \n",
+ " 29.02 | \n",
+ " 3.77 | \n",
+ " 8.46 | \n",
+ " 82.74 | \n",
+ " 10 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " Super Mario Bros. | \n",
+ " NES | \n",
+ " 1985.0 | \n",
+ " Platform | \n",
+ " Nintendo | \n",
+ " 29.08 | \n",
+ " 3.58 | \n",
+ " 6.81 | \n",
+ " 0.77 | \n",
+ " 40.24 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " Mario Kart Wii | \n",
+ " Wii | \n",
+ " 2008.0 | \n",
+ " Racing | \n",
+ " Nintendo | \n",
+ " 15.85 | \n",
+ " 12.88 | \n",
+ " 3.79 | \n",
+ " 3.31 | \n",
+ " 35.82 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " Wii Sports Resort | \n",
+ " Wii | \n",
+ " 2009.0 | \n",
+ " Sports | \n",
+ " Nintendo | \n",
+ " 15.75 | \n",
+ " 11.01 | \n",
+ " 3.28 | \n",
+ " 2.96 | \n",
+ " 33.00 | \n",
+ " 10 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " Pokemon Red/Pokemon Blue | \n",
+ " GB | \n",
+ " 1996.0 | \n",
+ " Role-Playing | \n",
+ " Nintendo | \n",
+ " 11.27 | \n",
+ " 8.89 | \n",
+ " 10.22 | \n",
+ " 1.00 | \n",
+ " 31.37 | \n",
+ " 7 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Rank Name Platform Year Genre Publisher \\\n",
+ "0 1 Wii Sports Wii 2006.0 Sports Nintendo \n",
+ "1 2 Super Mario Bros. NES 1985.0 Platform Nintendo \n",
+ "2 3 Mario Kart Wii Wii 2008.0 Racing Nintendo \n",
+ "3 4 Wii Sports Resort Wii 2009.0 Sports Nintendo \n",
+ "4 5 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo \n",
+ "\n",
+ " NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales GenreLabel \\\n",
+ "0 41.49 29.02 3.77 8.46 82.74 10 \n",
+ "1 29.08 3.58 6.81 0.77 40.24 4 \n",
+ "2 15.85 12.88 3.79 3.31 35.82 6 \n",
+ "3 15.75 11.01 3.28 2.96 33.00 10 \n",
+ "4 11.27 8.89 10.22 1.00 31.37 7 \n",
+ "\n",
+ " GenreMap \n",
+ "0 10 \n",
+ "1 4 \n",
+ "2 6 \n",
+ "3 10 \n",
+ "4 7 "
+ ]
+ },
+ "execution_count": 68,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Show the first five rows of the full frame again\n",
+ "vg_df.head(n=5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Binning 特征\n",
+ "一般用来处理年龄"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 116,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Name | \n",
+ " Year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Wii Sports | \n",
+ " 2006.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Super Mario Bros. | \n",
+ " 1985.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Mario Kart Wii | \n",
+ " 2008.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Wii Sports Resort | \n",
+ " 2009.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Pokemon Red/Pokemon Blue | \n",
+ " 1996.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name Year\n",
+ "0 Wii Sports 2006.0\n",
+ "1 Super Mario Bros. 1985.0\n",
+ "2 Mario Kart Wii 2008.0\n",
+ "3 Wii Sports Resort 2009.0\n",
+ "4 Pokemon Red/Pokemon Blue 1996.0"
+ ]
+ },
+ "execution_count": 116,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Year is the column that gets binned in this section.\n",
+ "# Copy the selection so the Year_bin column added later is written to an independent\n",
+ "# frame rather than to a slice of vg_df (avoids SettingWithCopyWarning).\n",
+ "bin_df = vg_df[['Name', 'Year']].copy()\n",
+ "bin_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 117,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0, 0.5, 'Frequency')"
+ ]
+ },
+ "execution_count": 117,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib as mpl\n",
+ "import scipy.stats as spstats\n",
+ "\n",
+ "# Histogram of release years. The title and x-axis label name the column that is\n",
+ "# actually plotted (Year), and the bars are drawn on the axes created here.\n",
+ "fig, ax = plt.subplots()\n",
+ "bin_df['Year'].hist(ax=ax, color='#A9C5D3')\n",
+ "ax.set_title('Year Histogram', fontsize=12)\n",
+ "ax.set_xlabel('Year', fontsize=12)\n",
+ "ax.set_ylabel('Frequency', fontsize=12)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "这样区间就出来了,我们可以分成多个区间,如1980-1985是一个区间,1986-1990是一个区间"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 129,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Name | \n",
+ " Year | \n",
+ " Year_bin | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Wii Sports | \n",
+ " 2006.0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Super Mario Bros. | \n",
+ " 1985.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Mario Kart Wii | \n",
+ " 2008.0 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Wii Sports Resort | \n",
+ " 2009.0 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Pokemon Red/Pokemon Blue | \n",
+ " 1996.0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Tetris | \n",
+ " 1989.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " New Super Mario Bros. | \n",
+ " 2006.0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Wii Play | \n",
+ " 2006.0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " New Super Mario Bros. Wii | \n",
+ " 2009.0 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Duck Hunt | \n",
+ " 1984.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name Year Year_bin\n",
+ "0 Wii Sports 2006.0 5\n",
+ "1 Super Mario Bros. 1985.0 1\n",
+ "2 Mario Kart Wii 2008.0 6\n",
+ "3 Wii Sports Resort 2009.0 6\n",
+ "4 Pokemon Red/Pokemon Blue 1996.0 3\n",
+ "5 Tetris 1989.0 2\n",
+ "6 New Super Mario Bros. 2006.0 5\n",
+ "7 Wii Play 2006.0 5\n",
+ "8 New Super Mario Bros. Wii 2009.0 6\n",
+ "9 Duck Hunt 1984.0 0"
+ ]
+ },
+ "execution_count": 129,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Cut Year into 9 equal-width intervals and store each row's interval position.\n",
+ "# The categorical codes run from 0 to 8 in interval order, and a missing Year gets -1.\n",
+ "# Converting the intervals to strings and label-encoding them would instead turn a\n",
+ "# missing Year into the text 'nan' and give it a class of its own next to the real bins.\n",
+ "year_intervals = pd.cut(bin_df['Year'], 9)\n",
+ "bin_df['Year_bin'] = year_intervals.cat.codes\n",
+ "bin_df.head(10)"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,