diff --git a/机器学习竞赛实战_优胜解决方案/游戏销售数据_特征常用构建方法/.ipynb_checkpoints/游戏销售数据-常用特征构造方法-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/游戏销售数据_特征常用构建方法/.ipynb_checkpoints/游戏销售数据-常用特征构造方法-checkpoint.ipynb
deleted file mode 100644
index 1dc6681..0000000
--- a/机器学习竞赛实战_优胜解决方案/游戏销售数据_特征常用构建方法/.ipynb_checkpoints/游戏销售数据-常用特征构造方法-checkpoint.ipynb
+++ /dev/null
@@ -1,2188 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 离散值处理"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "import warnings # 忽略普通警告,不打印太多东西\n",
- "warnings.filterwarnings('ignore')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Name | \n",
- " Platform | \n",
- " Year | \n",
- " Genre | \n",
- " Publisher | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1 | \n",
- " Super Mario Bros. | \n",
- " NES | \n",
- " 1985.0 | \n",
- " Platform | \n",
- " Nintendo | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Mario Kart Wii | \n",
- " Wii | \n",
- " 2008.0 | \n",
- " Racing | \n",
- " Nintendo | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Wii Sports Resort | \n",
- " Wii | \n",
- " 2009.0 | \n",
- " Sports | \n",
- " Nintendo | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Pokemon Red/Pokemon Blue | \n",
- " GB | \n",
- " 1996.0 | \n",
- " Role-Playing | \n",
- " Nintendo | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " Tetris | \n",
- " GB | \n",
- " 1989.0 | \n",
- " Puzzle | \n",
- " Nintendo | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " New Super Mario Bros. | \n",
- " DS | \n",
- " 2006.0 | \n",
- " Platform | \n",
- " Nintendo | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Name Platform Year Genre Publisher\n",
- "1 Super Mario Bros. NES 1985.0 Platform Nintendo\n",
- "2 Mario Kart Wii Wii 2008.0 Racing Nintendo\n",
- "3 Wii Sports Resort Wii 2009.0 Sports Nintendo\n",
- "4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo\n",
- "5 Tetris GB 1989.0 Puzzle Nintendo\n",
- "6 New Super Mario Bros. DS 2006.0 Platform Nintendo"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "vg_df = pd.read_csv('data/vgsales.csv', encoding='ISO-8859-1')\n",
- "vg_df[['Name', 'Platform', 'Year', 'Genre', 'Publisher']].iloc[1:7]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "大多数机器学习模型无法直接处理字符串类型的数据,需要先将其编码为数值"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',\n",
- " 'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',\n",
- " 'Strategy'], dtype=object)"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "genres = np.unique(vg_df['Genre'])\n",
- "genres # 不同的字符串并不多"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## LabelEncoder"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{0: 'Action',\n",
- " 1: 'Adventure',\n",
- " 2: 'Fighting',\n",
- " 3: 'Misc',\n",
- " 4: 'Platform',\n",
- " 5: 'Puzzle',\n",
- " 6: 'Racing',\n",
- " 7: 'Role-Playing',\n",
- " 8: 'Shooter',\n",
- " 9: 'Simulation',\n",
- " 10: 'Sports',\n",
- " 11: 'Strategy'}"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from sklearn.preprocessing import LabelEncoder\n",
- "\n",
- "gle = LabelEncoder() # 实例化\n",
- "genre_labels = gle.fit_transform(vg_df['Genre']) # 转换需要离散值的一列\n",
- "genre_mappings = {index: label for index, label in enumerate(gle.classes_)}\n",
- "genre_mappings # 映射成数值"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Name | \n",
- " Platform | \n",
- " Year | \n",
- " Genre | \n",
- " GenreLabel | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1 | \n",
- " Super Mario Bros. | \n",
- " NES | \n",
- " 1985.0 | \n",
- " Platform | \n",
- " 4 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Mario Kart Wii | \n",
- " Wii | \n",
- " 2008.0 | \n",
- " Racing | \n",
- " 6 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Wii Sports Resort | \n",
- " Wii | \n",
- " 2009.0 | \n",
- " Sports | \n",
- " 10 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Pokemon Red/Pokemon Blue | \n",
- " GB | \n",
- " 1996.0 | \n",
- " Role-Playing | \n",
- " 7 | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " Tetris | \n",
- " GB | \n",
- " 1989.0 | \n",
- " Puzzle | \n",
- " 5 | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " New Super Mario Bros. | \n",
- " DS | \n",
- " 2006.0 | \n",
- " Platform | \n",
- " 4 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Name Platform Year Genre GenreLabel\n",
- "1 Super Mario Bros. NES 1985.0 Platform 4\n",
- "2 Mario Kart Wii Wii 2008.0 Racing 6\n",
- "3 Wii Sports Resort Wii 2009.0 Sports 10\n",
- "4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing 7\n",
- "5 Tetris GB 1989.0 Puzzle 5\n",
- "6 New Super Mario Bros. DS 2006.0 Platform 4"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "vg_df['GenreLabel'] = genre_labels # 赋值到一列\n",
- "vg_df[['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']].iloc[1:7]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Map\n",
- "自己建一个字典"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'Action': 0,\n",
- " 'Adventure': 1,\n",
- " 'Fighting': 2,\n",
- " 'Misc': 3,\n",
- " 'Platform': 4,\n",
- " 'Puzzle': 5,\n",
- " 'Racing': 6,\n",
- " 'Role-Playing': 7,\n",
- " 'Shooter': 8,\n",
- " 'Simulation': 9,\n",
- " 'Sports': 10,\n",
- " 'Strategy': 11}"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "gen_ord_map = {label:index for index, label in enumerate(gle.classes_)}\n",
- "gen_ord_map"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Name | \n",
- " Genre | \n",
- " GenreLabel | \n",
- " GenreMap | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1 | \n",
- " Super Mario Bros. | \n",
- " Platform | \n",
- " 4 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Mario Kart Wii | \n",
- " Racing | \n",
- " 6 | \n",
- " 6 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Wii Sports Resort | \n",
- " Sports | \n",
- " 10 | \n",
- " 10 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Pokemon Red/Pokemon Blue | \n",
- " Role-Playing | \n",
- " 7 | \n",
- " 7 | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " Tetris | \n",
- " Puzzle | \n",
- " 5 | \n",
- " 5 | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " New Super Mario Bros. | \n",
- " Platform | \n",
- " 4 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Name Genre GenreLabel GenreMap\n",
- "1 Super Mario Bros. Platform 4 4\n",
- "2 Mario Kart Wii Racing 6 6\n",
- "3 Wii Sports Resort Sports 10 10\n",
- "4 Pokemon Red/Pokemon Blue Role-Playing 7 7\n",
- "5 Tetris Puzzle 5 5\n",
- "6 New Super Mario Bros. Platform 4 4"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "vg_df['GenreMap'] = vg_df['Genre'].map(gen_ord_map)\n",
- "vg_df[['Name', 'Genre', 'GenreLabel', 'GenreMap']].iloc[1:7] # 结果呈现我们设置的map"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## One-Hot Encoder\n",
- "对于离散型特征,基于树的模型(例如随机森林)通常不需要one-hot编码;而基于距离或对数值大小敏感的模型(例如KNN、线性模型、神经网络)一般需要one-hot编码。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[0., 0., 0., ..., 0., 1., 0.],\n",
- " [0., 0., 0., ..., 0., 0., 0.],\n",
- " [0., 0., 0., ..., 0., 0., 0.],\n",
- " ...,\n",
- " [0., 0., 0., ..., 0., 0., 0.],\n",
- " [0., 0., 0., ..., 0., 0., 0.],\n",
- " [0., 0., 0., ..., 0., 0., 0.]])"
- ]
- },
- "execution_count": 52,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from sklearn.preprocessing import OneHotEncoder\n",
- "# 获取onehot后的结果,将字符串变成多列的0/1值,有则为1,无则为0\n",
- "gen_ohe = OneHotEncoder()\n",
- "gen_feature_arr = gen_ohe.fit_transform(vg_df[['GenreLabel']]).toarray()\n",
- "gen_feature_arr"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Action | \n",
- " Adventure | \n",
- " Fighting | \n",
- " Misc | \n",
- " Platform | \n",
- " Puzzle | \n",
- " Racing | \n",
- " Role-Playing | \n",
- " Shooter | \n",
- " Simulation | \n",
- " Sports | \n",
- " Strategy | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Action Adventure Fighting Misc Platform Puzzle Racing Role-Playing \\\n",
- "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
- "1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 \n",
- "2 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 \n",
- "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
- "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n",
- "\n",
- " Shooter Simulation Sports Strategy \n",
- "0 0.0 0.0 1.0 0.0 \n",
- "1 0.0 0.0 0.0 0.0 \n",
- "2 0.0 0.0 0.0 0.0 \n",
- "3 0.0 0.0 1.0 0.0 \n",
- "4 0.0 0.0 0.0 0.0 "
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "genres = np.unique(vg_df['Genre']) # 获取全部不同的字符串\n",
- "gen_features = pd.DataFrame(gen_feature_arr, columns=genres) # 将字符串作为列,合并onehot数据\n",
- "gen_features.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Name | \n",
- " Genre | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Wii Sports | \n",
- " Sports | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Super Mario Bros. | \n",
- " Platform | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Mario Kart Wii | \n",
- " Racing | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Wii Sports Resort | \n",
- " Sports | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Pokemon Red/Pokemon Blue | \n",
- " Role-Playing | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Name Genre\n",
- "0 Wii Sports Sports\n",
- "1 Super Mario Bros. Platform\n",
- "2 Mario Kart Wii Racing\n",
- "3 Wii Sports Resort Sports\n",
- "4 Pokemon Red/Pokemon Blue Role-Playing"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# 拿出两列原本的数据,实际场景中是全部数据合并,这里是为了查看方便\n",
- "vg_df_2 = vg_df[['Name', 'Genre']]\n",
- "vg_df_2.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Name | \n",
- " Genre | \n",
- " Action | \n",
- " Adventure | \n",
- " Fighting | \n",
- " Misc | \n",
- " Platform | \n",
- " Puzzle | \n",
- " Racing | \n",
- " Role-Playing | \n",
- " Shooter | \n",
- " Simulation | \n",
- " Sports | \n",
- " Strategy | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Wii Sports | \n",
- " Sports | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Super Mario Bros. | \n",
- " Platform | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Mario Kart Wii | \n",
- " Racing | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Wii Sports Resort | \n",
- " Sports | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Pokemon Red/Pokemon Blue | \n",
- " Role-Playing | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Name Genre Action Adventure Fighting Misc \\\n",
- "0 Wii Sports Sports 0.0 0.0 0.0 0.0 \n",
- "1 Super Mario Bros. Platform 0.0 0.0 0.0 0.0 \n",
- "2 Mario Kart Wii Racing 0.0 0.0 0.0 0.0 \n",
- "3 Wii Sports Resort Sports 0.0 0.0 0.0 0.0 \n",
- "4 Pokemon Red/Pokemon Blue Role-Playing 0.0 0.0 0.0 0.0 \n",
- "\n",
- " Platform Puzzle Racing Role-Playing Shooter Simulation Sports \\\n",
- "0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n",
- "1 1.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
- "2 0.0 0.0 1.0 0.0 0.0 0.0 0.0 \n",
- "3 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n",
- "4 0.0 0.0 0.0 1.0 0.0 0.0 0.0 \n",
- "\n",
- " Strategy \n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 "
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "vg_df_ohe = pd.concat([vg_df_2,gen_features],axis=1) # 两个数据合并\n",
- "vg_df_ohe.head() # 可以看到Platform列第二行为1,对应着Genre列第二行是Platform字符串"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Get Dummy\n",
- "更加实用的onehot"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(16598, 13)\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Name | \n",
- " Genre | \n",
- " Adventure | \n",
- " Fighting | \n",
- " Misc | \n",
- " Platform | \n",
- " Puzzle | \n",
- " Racing | \n",
- " Role-Playing | \n",
- " Shooter | \n",
- " Simulation | \n",
- " Sports | \n",
- " Strategy | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Wii Sports | \n",
- " Sports | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Super Mario Bros. | \n",
- " Platform | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Mario Kart Wii | \n",
- " Racing | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Wii Sports Resort | \n",
- " Sports | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Pokemon Red/Pokemon Blue | \n",
- " Role-Playing | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Name Genre Adventure Fighting Misc \\\n",
- "0 Wii Sports Sports 0 0 0 \n",
- "1 Super Mario Bros. Platform 0 0 0 \n",
- "2 Mario Kart Wii Racing 0 0 0 \n",
- "3 Wii Sports Resort Sports 0 0 0 \n",
- "4 Pokemon Red/Pokemon Blue Role-Playing 0 0 0 \n",
- "\n",
- " Platform Puzzle Racing Role-Playing Shooter Simulation Sports \\\n",
- "0 0 0 0 0 0 0 1 \n",
- "1 1 0 0 0 0 0 0 \n",
- "2 0 0 1 0 0 0 0 \n",
- "3 0 0 0 0 0 0 1 \n",
- "4 0 0 0 1 0 0 0 \n",
- "\n",
- " Strategy \n",
- "0 0 \n",
- "1 0 \n",
- "2 0 \n",
- "3 0 \n",
- "4 0 "
- ]
- },
- "execution_count": 36,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "gen_dummy_features = pd.get_dummies(vg_df['Genre'],drop_first=True) # drop_first=True会去掉第一个类别(Action)对应的列,12个类别只保留11列\n",
- "dummy_df = pd.concat([vg_df[['Name', 'Genre']], gen_dummy_features], axis=1)\n",
- "print(dummy_df.shape)\n",
- "dummy_df.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "可以看到两句话就解决了我们上面那一长串"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(16598, 14)\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Name | \n",
- " Genre | \n",
- " Action | \n",
- " Adventure | \n",
- " Fighting | \n",
- " Misc | \n",
- " Platform | \n",
- " Puzzle | \n",
- " Racing | \n",
- " Role-Playing | \n",
- " Shooter | \n",
- " Simulation | \n",
- " Sports | \n",
- " Strategy | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Wii Sports | \n",
- " Sports | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Super Mario Bros. | \n",
- " Platform | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Mario Kart Wii | \n",
- " Racing | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Wii Sports Resort | \n",
- " Sports | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Pokemon Red/Pokemon Blue | \n",
- " Role-Playing | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Name Genre Action Adventure Fighting Misc \\\n",
- "0 Wii Sports Sports 0 0 0 0 \n",
- "1 Super Mario Bros. Platform 0 0 0 0 \n",
- "2 Mario Kart Wii Racing 0 0 0 0 \n",
- "3 Wii Sports Resort Sports 0 0 0 0 \n",
- "4 Pokemon Red/Pokemon Blue Role-Playing 0 0 0 0 \n",
- "\n",
- " Platform Puzzle Racing Role-Playing Shooter Simulation Sports \\\n",
- "0 0 0 0 0 0 0 1 \n",
- "1 1 0 0 0 0 0 0 \n",
- "2 0 0 1 0 0 0 0 \n",
- "3 0 0 0 0 0 0 1 \n",
- "4 0 0 0 1 0 0 0 \n",
- "\n",
- " Strategy \n",
- "0 0 \n",
- "1 0 \n",
- "2 0 \n",
- "3 0 \n",
- "4 0 "
- ]
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "gen_dummy_features = pd.get_dummies(vg_df['Genre']) # 和上面相比少了drop_first=True,一般用这种\n",
- "dummy_df_true = pd.concat([vg_df[['Name', 'Genre']], gen_dummy_features], axis=1)\n",
- "print(dummy_df_true.shape)\n",
- "dummy_df_true.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 二值特征化"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Name | \n",
- " Year | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Wii Sports | \n",
- " 2006.0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Super Mario Bros. | \n",
- " 1985.0 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Mario Kart Wii | \n",
- " 2008.0 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Wii Sports Resort | \n",
- " 2009.0 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Pokemon Red/Pokemon Blue | \n",
- " 1996.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Name Year\n",
- "0 Wii Sports 2006.0\n",
- "1 Super Mario Bros. 1985.0\n",
- "2 Mario Kart Wii 2008.0\n",
- "3 Wii Sports Resort 2009.0\n",
- "4 Pokemon Red/Pokemon Blue 1996.0"
- ]
- },
- "execution_count": 54,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "vg_year_df = vg_df[['Name', 'Year']]\n",
- "vg_year_df.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "我们把2000年及以后的归类为1,其它(包括缺失值)归类为0"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 55,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Name | \n",
- " Year | \n",
- " Year_tow | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Wii Sports | \n",
- " 2006.0 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Super Mario Bros. | \n",
- " 1985.0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Mario Kart Wii | \n",
- " 2008.0 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Wii Sports Resort | \n",
- " 2009.0 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Pokemon Red/Pokemon Blue | \n",
- " 1996.0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Name Year Year_tow\n",
- "0 Wii Sports 2006.0 1\n",
- "1 Super Mario Bros. 1985.0 0\n",
- "2 Mario Kart Wii 2008.0 1\n",
- "3 Wii Sports Resort 2009.0 1\n",
- "4 Pokemon Red/Pokemon Blue 1996.0 0"
- ]
- },
- "execution_count": 55,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "vg_year_df['Year_tow'] = np.where(vg_year_df['Year'] >= 2000, 1, 0)\n",
- "vg_year_df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 56,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Name | \n",
- " Year | \n",
- " Year_tow | \n",
- " bn_year | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Wii Sports | \n",
- " 2006.0 | \n",
- " 1 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Super Mario Bros. | \n",
- " 1985.0 | \n",
- " 0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Mario Kart Wii | \n",
- " 2008.0 | \n",
- " 1 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Wii Sports Resort | \n",
- " 2009.0 | \n",
- " 1 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Pokemon Red/Pokemon Blue | \n",
- " 1996.0 | \n",
- " 0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Name Year Year_tow bn_year\n",
- "0 Wii Sports 2006.0 1 1.0\n",
- "1 Super Mario Bros. 1985.0 0 0.0\n",
- "2 Mario Kart Wii 2008.0 1 1.0\n",
- "3 Wii Sports Resort 2009.0 1 1.0\n",
- "4 Pokemon Red/Pokemon Blue 1996.0 0 0.0"
- ]
- },
- "execution_count": 56,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from sklearn.preprocessing import Binarizer\n",
- "# sklearn中的方法\n",
- "bn = Binarizer(threshold=2000) # 大于2000为1,小于等于2000为0\n",
- "vg_year_df['Year']=vg_year_df['Year'].fillna(0) # 数据中有Nan值,需要补0,否则无法二分\n",
- "bn_year = bn.transform([vg_year_df['Year']])[0] # 输入被包成1行n列的二维数组,取第0行得到转换结果\n",
- "vg_year_df['bn_year'] = bn_year # 插入数据\n",
- "vg_year_df.head() # 前5行与手动结果一致;注意Year恰为2000时两者不同(上面用>=取1,Binarizer取0)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 多项式特征\n",
- "获得特征的更高维度和互相间关系的项。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " NA_Sales | \n",
- " EU_Sales | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 41.49 | \n",
- " 29.02 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 29.08 | \n",
- " 3.58 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 15.85 | \n",
- " 12.88 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 15.75 | \n",
- " 11.01 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 11.27 | \n",
- " 8.89 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " NA_Sales EU_Sales\n",
- "0 41.49 29.02\n",
- "1 29.08 3.58\n",
- "2 15.85 12.88\n",
- "3 15.75 11.01\n",
- "4 11.27 8.89"
- ]
- },
- "execution_count": 61,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "polynomial_df = vg_df[['NA_Sales', 'EU_Sales']]\n",
- "polynomial_df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 62,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[4.1490000e+01, 2.9020000e+01, 1.7214201e+03, 1.2040398e+03,\n",
- " 8.4216040e+02],\n",
- " [2.9080000e+01, 3.5800000e+00, 8.4564640e+02, 1.0410640e+02,\n",
- " 1.2816400e+01],\n",
- " [1.5850000e+01, 1.2880000e+01, 2.5122250e+02, 2.0414800e+02,\n",
- " 1.6589440e+02],\n",
- " ...,\n",
- " [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,\n",
- " 0.0000000e+00],\n",
- " [0.0000000e+00, 1.0000000e-02, 0.0000000e+00, 0.0000000e+00,\n",
- " 1.0000000e-04],\n",
- " [1.0000000e-02, 0.0000000e+00, 1.0000000e-04, 0.0000000e+00,\n",
- " 0.0000000e+00]])"
- ]
- },
- "execution_count": 62,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from sklearn.preprocessing import PolynomialFeatures\n",
- "\n",
- "# degree=2:生成最高二次的多项式特征;interaction_only=False:保留平方项;include_bias=False:不添加常数项列\n",
- "pf = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)\n",
- "res = pf.fit_transform(polynomial_df)\n",
- "res"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "以第一行为例:\n",
- "<br>第一列和第二列分别表示原先的第一列和第二列\n",
- "<br>第三列和第五列表示第一列和第二列分别的平方,第四列表示两者的乘积"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 63,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " NA_Sales | \n",
- " EU_Sales | \n",
- " NA_Sales^2 | \n",
- " NA_Sales*EU_Sales | \n",
- " EU_Sales^2 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 41.49 | \n",
- " 29.02 | \n",
- " 1721.4201 | \n",
- " 1204.0398 | \n",
- " 842.1604 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 29.08 | \n",
- " 3.58 | \n",
- " 845.6464 | \n",
- " 104.1064 | \n",
- " 12.8164 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 15.85 | \n",
- " 12.88 | \n",
- " 251.2225 | \n",
- " 204.1480 | \n",
- " 165.8944 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 15.75 | \n",
- " 11.01 | \n",
- " 248.0625 | \n",
- " 173.4075 | \n",
- " 121.2201 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 11.27 | \n",
- " 8.89 | \n",
- " 127.0129 | \n",
- " 100.1903 | \n",
- " 79.0321 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " NA_Sales EU_Sales NA_Sales^2 NA_Sales*EU_Sales EU_Sales^2\n",
- "0 41.49 29.02 1721.4201 1204.0398 842.1604\n",
- "1 29.08 3.58 845.6464 104.1064 12.8164\n",
- "2 15.85 12.88 251.2225 204.1480 165.8944\n",
- "3 15.75 11.01 248.0625 173.4075 121.2201\n",
- "4 11.27 8.89 127.0129 100.1903 79.0321"
- ]
- },
- "execution_count": 63,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "intr_features = pd.DataFrame(res, columns=['NA_Sales',\n",
- " 'EU_Sales',\n",
- " 'NA_Sales^2',\n",
- " 'NA_Sales*EU_Sales',\n",
- " 'EU_Sales^2'])\n",
- "intr_features.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 68,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Rank | \n",
- " Name | \n",
- " Platform | \n",
- " Year | \n",
- " Genre | \n",
- " Publisher | \n",
- " NA_Sales | \n",
- " EU_Sales | \n",
- " JP_Sales | \n",
- " Other_Sales | \n",
- " Global_Sales | \n",
- " GenreLabel | \n",
- " GenreMap | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 1 | \n",
- " Wii Sports | \n",
- " Wii | \n",
- " 2006.0 | \n",
- " Sports | \n",
- " Nintendo | \n",
- " 41.49 | \n",
- " 29.02 | \n",
- " 3.77 | \n",
- " 8.46 | \n",
- " 82.74 | \n",
- " 10 | \n",
- " 10 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 2 | \n",
- " Super Mario Bros. | \n",
- " NES | \n",
- " 1985.0 | \n",
- " Platform | \n",
- " Nintendo | \n",
- " 29.08 | \n",
- " 3.58 | \n",
- " 6.81 | \n",
- " 0.77 | \n",
- " 40.24 | \n",
- " 4 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 3 | \n",
- " Mario Kart Wii | \n",
- " Wii | \n",
- " 2008.0 | \n",
- " Racing | \n",
- " Nintendo | \n",
- " 15.85 | \n",
- " 12.88 | \n",
- " 3.79 | \n",
- " 3.31 | \n",
- " 35.82 | \n",
- " 6 | \n",
- " 6 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 4 | \n",
- " Wii Sports Resort | \n",
- " Wii | \n",
- " 2009.0 | \n",
- " Sports | \n",
- " Nintendo | \n",
- " 15.75 | \n",
- " 11.01 | \n",
- " 3.28 | \n",
- " 2.96 | \n",
- " 33.00 | \n",
- " 10 | \n",
- " 10 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 5 | \n",
- " Pokemon Red/Pokemon Blue | \n",
- " GB | \n",
- " 1996.0 | \n",
- " Role-Playing | \n",
- " Nintendo | \n",
- " 11.27 | \n",
- " 8.89 | \n",
- " 10.22 | \n",
- " 1.00 | \n",
- " 31.37 | \n",
- " 7 | \n",
- " 7 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Rank Name Platform Year Genre Publisher \\\n",
- "0 1 Wii Sports Wii 2006.0 Sports Nintendo \n",
- "1 2 Super Mario Bros. NES 1985.0 Platform Nintendo \n",
- "2 3 Mario Kart Wii Wii 2008.0 Racing Nintendo \n",
- "3 4 Wii Sports Resort Wii 2009.0 Sports Nintendo \n",
- "4 5 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo \n",
- "\n",
- " NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales GenreLabel \\\n",
- "0 41.49 29.02 3.77 8.46 82.74 10 \n",
- "1 29.08 3.58 6.81 0.77 40.24 4 \n",
- "2 15.85 12.88 3.79 3.31 35.82 6 \n",
- "3 15.75 11.01 3.28 2.96 33.00 10 \n",
- "4 11.27 8.89 10.22 1.00 31.37 7 \n",
- "\n",
- " GenreMap \n",
- "0 10 \n",
- "1 4 \n",
- "2 6 \n",
- "3 10 \n",
- "4 7 "
- ]
- },
- "execution_count": 68,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "vg_df.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Binning 特征\n",
- "一般用来处理年龄"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 116,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Name | \n",
- " Year | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Wii Sports | \n",
- " 2006.0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Super Mario Bros. | \n",
- " 1985.0 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Mario Kart Wii | \n",
- " 2008.0 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Wii Sports Resort | \n",
- " 2009.0 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Pokemon Red/Pokemon Blue | \n",
- " 1996.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Name Year\n",
- "0 Wii Sports 2006.0\n",
- "1 Super Mario Bros. 1985.0\n",
- "2 Mario Kart Wii 2008.0\n",
- "3 Wii Sports Resort 2009.0\n",
- "4 Pokemon Red/Pokemon Blue 1996.0"
- ]
- },
- "execution_count": 116,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "bin_df = vg_df[['Name','Year']] # 假设Year是年龄\n",
- "bin_df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 117,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Text(0, 0.5, 'Frequency')"
- ]
- },
- "execution_count": 117,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAEZCAYAAACAZ8KHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3de5xdVX338c+XJNwSIMFASi4SFKogKkK4qmUA5aYQ+lQUSyUgPmihrbaooIKKiqKlgrZVpBDlHlMUiQpiBKY+FoOA3ALhEiCQISkRMglJJEDw9/yx1ml2hpnJrJlzmWS+79frvM7ea99+Z519zu+stffZWxGBmZlZX23S6gDMzGzD4sRhZmZFnDjMzKyIE4eZmRVx4jAzsyJOHGZmVsSJw+pOUrukj7Q6jr6SFJJ27sN8J0r6TT+30Sapoz/LFmxjgaR3NXIbZuDEsdHKXyIvSFohaZmk2yR9TNKQes8lbSXpm7k+Vkl6StK1kvZpdWzdkfSO/F4tl7RU0n9L2ruF8bwqGQ0kgVbW0fBEao0zpL5EhqCjImIrYEfgPOAM4NLWhtQYkoZ3U7YZcAvwZuC9wNbArsAM4MimBtgHkrYGfgb8K7AtMAE4B3ixlXFtiCQNa3UMGzMnjiEgIpZHxCzgA8A0SbtD+mKVdH7+Ff6MpIskbZGnzZP03to6JA2X9KykPfP4fvmX8TJJ90pq627bkjaRdJakJyUtkXS5pG3ytMm5m+gUSYskLZZ0epdlz5T0mKTnJM2UtG2XZU+W9BQpQXT1IWAicExEzI2IVyJiVURcGxFf7CHebXKMf8gxn9WllSZJ/5pbBA9JOqQy4aRcbyskPS7po+t9c9b15wARcU2O9YWI+GVE3JfX/3pJt+S6eFbSVZJG9/A6equ7zSVdmcuXSbpD0rjCWKvb2jV3Ty6T9ICkoyvTjpT0YK6TpyV9UtJI4EZgvKSV+TE+748X5n1hUR7erLKuT+d9ZJGkj6jSxSjpB5K+K+kGSauAgyS9R9Ldkp6XtFDSFyvrqu0/J+VpnUot8r0l3Zdfy7/1t042ehHhx0b4ABYA7+qm/Cngb/PwhcAs0q/brYCfAl/L0z4PXFVZ7j3AQ3l4AvAc6Vf7JsC78/h2eXo78JE8/GFgPvA6YBTwY+CKPG0yEMA1wEhSy+APtbiBTwBzSF/+mwHfA67psuzledktunmtM4Af9KGuAtg5D18OXJ/rYzLwCHBynnYisAb4R2AEKREvB7at1NHrAQEHAn8E9szT2oCO9cSxda7Hy4AjgDFdpu+c63ozYDvg18CF3b3n66m7j+b3ektgGLAXsHVf96NcD7/JwyPy+/tZYFPgYGAF8IY8fTHwzjw8prf6AL6UY94+v77bgC/naYcD/wO8Kcd9RZf37Qf5vXg7aZ/cPG/jzXn8LcAzpB8R1f3nojzvocBq4Cd5+xOAJcCBrf4sD8ZHywPwo0FvbM+JYw7wufzltgp4fWXa/sATeXjn/AWwZR6/Cvh8Hj6D/OVfWfYmYFoebmdt4rgZOLUy3xuAl4HhlQ/vGyvTvwFcmofnAYdUpu3QzbKv66UOfgWcVxnfA1gGPA88XCmP/HqHkbqFdqtM+yjQnodPBBYBqkz/HfChHrb/E+DjebiN9SSOPN+u+Uuwg5SkZgHjepj3GODu7t7z9dTdh0lfym/p4360Mtdb7fFH1iaOd5K+0DepLHMN8MU8/FSuw627rPdV9QE8BhxZGT8MWJCHp5N/1FT2z66J4/L1vJYLgQvycG3/mVCZ/hzwgcr4j4BPtOozPJgf7qoaeiYAS0m/6LYE7srN8mXAL3I5ETGf9OVzlKQtgaOBq/M6dgSOrS2Xl30H6cupq/HAk5XxJ0lfXtWukYVdpo+vbOe6yjbmAa/0smxXz1Vjioh7ImI08H9Iv8K7Gkv61dw13gmV8acjf6t0jVfSEZLmKB3UXkZqkY3tJb5XiYh5
EXFiREwEds/rvjCvf3tJM3KXz/PAlb2sv7e6u4KU6Gfkbp9vSBrRS1jHRMTo2gM4tTJtPLAwIv7UpU5qdfZXpHp4UtJ/Sdq/l+10t6+Mr0yrvtfdve/rlEnaV9KtudtxOfAxXl1fz1SGX+hmfFQv8Q5ZThxDiNLZOROA3wDPkj4Yb6p8KWwTEdUPyjXAB4GpwIM5mUD6gF5R/TKJiJERcV43m11E+hKreS3pl3T1Azqpy/RFle0c0WU7m0fE05X5e7u8883AoblPvS+eJf0q7xpvdXsTJKlrvLkv/kfA+aQWwmjgBlLLrl8i4iHSL+ndc9HXSK/3LRGxNfA3vay/x7qLiJcj4pyI2A04gHTiwAn9DHMRMKnLcaD/rbOIuCMippK6f34CzKy9vB7W1bXua/vCYlK3W011n6npus6rSS22SRGxDalbqt/vh63lxDEESNpa6UD3DODKiLg//0L8D+ACSdvn+SZIOqyy6AxS3+/fsra1AemX7lGSDpM0LB9sbZNU/WDXXAP8o6SdJI0Cvgr8MCLWVOY5W9KWkt4EnAT8MJdfBJwraccc33aSpha89MtJXzjXSdq9FiswpbuZI+IV0hfbuUqn8e4I/FN+vTXbA/8gaYSkY0ldSzeQWiqbkY7RrJF0BKnu+kzSGyWdXqtHSZNIiXtOnmUrcreRpAnAp3pZXY91J+kgSW9WOvPoeVKyfKUk1orbSV2en8510gYcRWrNbCrpeEnbRMTLeVu17TwDvEb5RInsGuCsHOtY0nG2Wt3PBE5SOhC/ZZ62PlsBSyNitdLp13/dz9doXThxbNx+KmkF6dfn54Bvkr6Ya84gHdick7s+fkU6BgFARCwGfkv6VfrDSvlCUivks6QvyoWkL7Hu9qfppK6RXwNPkA5A/n2Xef4rx3EzcH5E/DKXf4v0i/GX+XXMAfbt64uPiNXAQcCDwM/JxzaAvYH397DY35O+CB8ntcyuzq+h5nZgF1Lr5FzgfRHxXESsAP6B9AXXSfqSmtXXWLMVpNd3ez4zaA4wF6idaXYOsCfpIPDPSSca9KS3uvsz4FpSfcwj1f+V3a1kfSLiJVI35hGkOvkOcEJuLUE6s21B3r8+Rmol1VpT1wCP5+608cBXgDuB+4D7gd/nMiLiRuDbwK2kfeW3ef29nap8KvCl/Po/z9rWjg2Q1u2uNWseSZNJyWRElxaIWa8k7UpKqpt532k+tzjMbIMg6S9z99cY4OvAT500WsOJw6yJJL1Wa//01vXx2lbHN8h9lNQ1+hjpWMnftjacoctdVWZmVsQtDjMzK/KqC8NtbMaOHRuTJ0/u9/KrVq1i5Mi+/g2geRxXGcdVxnGV2Rjjuuuuu56NiO26ndjqv643+rHXXnvFQNx6660DWr5RHFcZx1XGcZXZGOMC7gxfcsTMzOrBicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlZko7/kiJkNLjNvm9vvZUesWt3v5d9/wO7rn8n6xC0OMzMr4sRhZmZFnDjMzKyIE4eZmRVx4jAzsyJOHGZmVqRpiUPSAkn3S7pH0p25bFtJsyU9mp/H5HJJ+rak+ZLuk7RnZT3T8vyPSprWrPjNzCxpdovjoIjYIyKm5PEzgZsjYhfg5jwOcASwS36cAnwXUqIBvgDsC+wDfKGWbMzMrDla3VU1FbgsD18GHFMpvzzfwXAOMFrSDsBhwOyIWBoRncBs4PBmB21mNpQp3Vq2CRuSngA6gQC+FxEXS1oWEaMr83RGxBhJPwPOi4jf5PKbgTOANmDziPhKLj8beCEizu+yrVNILRXGjRu314wZM/od98qVKxk1alS/l28Ux1XGcZVpZFydq1b3e1mteYkYvmm/lh0zcvN+b3d9Nsb38aCDDrqr0ju0jmZecuTtEbFI0vbAbEkP9TKvuimLXsrXLYi4GLgYYMqUKdHW1taPcJP29nYGsnyjOK4yjqtMI+Ma0CVHOjt4eczE
fi3b1sBLjgy197FpXVURsSg/LwGuIx2jeCZ3QZGfl+TZO4BJlcUnAot6KTczsyZpSuKQNFLSVrVh4FBgLjALqJ0ZNQ24Pg/PAk7IZ1ftByyPiMXATcChksbkg+KH5jIzM2uSZnVVjQOuk1Tb5tUR8QtJdwAzJZ0MPAUcm+e/ATgSmA/8ETgJICKWSvoycEee70sRsbRJr8HMzGhS4oiIx4G3dlP+HHBIN+UBnNbDuqYD0+sdo5mZ9U2rT8c1M7MNjBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWZGmJg5JwyTdLelneXwnSbdLelTSDyVtmss3y+Pz8/TJlXV8Jpc/LOmwZsZvZmbNb3F8HJhXGf86cEFE7AJ0Aifn8pOBzojYGbggz4ek3YDjgDcBhwPfkTSsSbGbmRlNTBySJgLvAS7J4wIOBq7Ns1wGHJOHp+Zx8vRD8vxTgRkR8WJEPAHMB/ZpziswMzMARURzNiRdC3wN2Ar4JHAiMCe3KpA0CbgxInaXNBc4PCI68rTHgH2BL+Zlrszll+Zlru2yrVOAUwDGjRu314wZM/od98qVKxk1alS/l28Ux1XGcZVpZFydq1b3e1mteYkYvmm/lh0zcvN+b3d9Nsb38aCDDrorIqZ0N234gKLqI0nvBZZExF2S2mrF3cwa65nW2zJrCyIuBi4GmDJlSrS1tXWdpc/a29sZyPKN4rjKOK4yjYxr5m1z+73siM4OXh4zsV/Lth2we7+3uz5D7X1sSuIA3g4cLelIYHNga+BCYLSk4RGxBpgILMrzdwCTgA5Jw4FtgKWV8prqMmZm1gRNOcYREZ+JiIkRMZl0cPuWiDgeuBV4X55tGnB9Hp6Vx8nTb4nUpzYLOC6fdbUTsAvwu2a8BjMzS5rV4ujJGcAMSV8B7gYuzeWXAldImk9qaRwHEBEPSJoJPAisAU6LiFeaH7bZhm193UUjVq0eUJeSbdyanjgioh1oz8OP081ZURGxGji2h+XPBc5tXIRmZtYb/3PczMyKOHGYmVkRJw4zMyvixGFmZkWcOMzMrIgTh5mZFXHiMDOzIk4cZmZWxInDzMyKOHGYmVkRJw4zMyvixGFmZkWcOMzMrIgTh5mZFXHiMDOzIn1OHJL+QdLYRgZjZmaDX0mL413AAkk/k/QBSZs1KigzMxu8+pw4IuJoYEfgRuATwP9IukTSXzQqODMzG3yKjnFExHMR8e8RsT9wILA3cKukBZI+J2lUQ6I0M7NBo/jguKRDJH2fdN/wZ4ATgA8BbyO1RszMbCM2vK8zSjofOA5YDlwOnBURT1emzwE66x6hmZkNKn1OHMDmwF9GxB3dTYyIlyVNqU9YZmY2WJUkjq8Bf6wWSBoDbBERiwAi4qE6xmZmZoNQyTGOnwATu5RNBK6rXzhmZjbYlSSON0TE/dWCPP7G+oZkZmaDWUniWCJp52pBHn+uviGZmdlgVpI4pgM/kvReSbtJOgq4FrikMaGZmdlgVHJw/DzgZeB8YBKwkJQ0vtmAuMzMbJDqc+KIiD8B/5wfZmY2RJW0OJD0BuCtwDqXFomI6fUMyszMBq+Sy6p/FrgXOJ10iZHa42/6sOzmkn4n6V5JD0g6J5fvJOl2SY9K+qGkTXP5Znl8fp4+ubKuz+TyhyUdVvJizcxs4EpaHJ8A9omI+/qxnReBgyNipaQRwG8k3Qj8E3BBRMyQdBFwMvDd/NwZETtLOg74OvABSbuRLnvyJmA88CtJfx4Rr/QjJjMz64eSs6peAPr1z/BIVubREfkRwMGkM7MALgOOycNT8zh5+iGSlMtnRMSLEfEEMB/Y
pz8xmZlZ/5QkjrOBf5W0g6RNqo++LCxpmKR7gCXAbOAxYFlErMmzdAAT8vAE0llb5OnLgddUy7tZxszMmqCkq+oH+fkjlTKRWg7D1rdw7k7aQ9Jo0mVKdu1utsp6u5vWU/k6JJ0CnAIwbtw42tvb1xdej1auXDmg5RvFcZVxXOsasWp1r9O15iVGdHY0KZq+G0hc7e3P1jmatYba/lWSOHaqxwYjYpmkdmA/YLSk4blVMRFYlGfrIP1XpEPScGAbYGmlvKa6THUbFwMXA0yZMiXa2tr6HW97ezsDWb5RHFcZx7WumbfN7XX6iM4OXh7T9dJ0rTeQuNoO2L3O0aw11PavklvHPhkRT5K6il6qjeeyXknaLrc0kLQF6f7l84Bbgffl2aYB1+fhWXmcPP2WiIhcflw+62onYBfgd319DWZmNnAlN3IaDXyH9EX+MjBS0tGkM63OWs/iOwCXSRpGSlYzI+Jnkh4EZkj6CnA3cGme/1LgCknzSS2N4wAi4gFJM4EHgTXAaT6jysysuUq6qi4i3eFvR9IXN8BvgX8Bek0c+RTet3VT/jjdnBUVEauBY3tY17nAuQVxm5lZHZUkjkOA8flOfwEQEX+QtH1jQjMzs8Go5HTc5cDYaoGk1wKL6xqRmZkNaiWJ4xLSZdUPAjaRtD/pT3oXNSQyMzMblEq6qr4OrAb+nfTP7+nA94BvNSAuMzMbpEouqx7AhflhZmZDVMnpuAf3NC0ibqlPOGZmNtiVdFVd2mV8O2BT0r+5X1e3iMzMbFAr6apa55Ij+c98ZwEr6h2UmZkNXiVnVa0j/2P7XODT9QvHzMwGu34njuzdwJ/qEYiZmW0YSg6OL2TdS5hvCWwOnFrvoMzMbPAqOTje9d7iq4BHIuL5OsZjZmaDXMnB8f9qZCBmZrZhKOmquoJu7rbXVUScMKCIzMxsUCs5OL4MOIZ0m9iOvOzUXP5Y5WFmZhuxkmMcfw68JyL+X61A0juAsyPisLpHZmZmg1JJi2M/YE6XstuB/esXjpmZDXYlLY67ga9K+nxEvJDvHX4OcE9jQjPb+HWuWs3M2+a2OgyzIiWJ40TgamC5pE5gDHAncHwD4jIzq6tGJugRvfwAeP8Buzdsu61ScjruAuAASZOA8cDiiHiqUYGZmdngVHTJEUmvAdqAAyPiKUnjJU1sSGRmZjYo9TlxSDoQeJjUNXV2Lt4F+G4D4jIzs0GqpMVxIfCBiDgcWJPLbgf2qXtUZmY2aJUkjskRcXMerv2D/CXKDrCbmdkGriRxPCip6x/93gXcX8d4zMxskCtpLZwO/EzSz4EtJH0POIp02REzMxsi+tziiIg5wFuAB4DpwBPAPhFxR4NiMzOzQahPLY58f/GbgcMi4huNDcnMzAazPrU48v3Fd+rr/GZmtvEqSQTnAN+VtKOkYZI2qT0aFZyZmQ0+JQfHL8nPJ7D2dFzl4WH1DMrMzAav9bYWJP1ZHtyp8nhdftSG17eOSZJulTRP0gOSPp7Lt5U0W9Kj+XlMLpekb0uaL+k+SXtW1jUtz/+opGnFr9jMzAakL91MjwBExJMR8SRwQW24UrY+a4DTI2JX0n09TpO0G3AmcHNE7EI6+H5mnv8I0uVMdgFOIV/WRNK2wBeAfUn/WP9CLdmYmVlz9CVxqMt4W+lGImJxRPw+D68A5gETSP8BuSzPdhnp1rTk8ssjmQOMlrQDcBgwOyKWRkQnMBs4vDQeMzPrv74c44j1z9J3kiYDbyNd52pcRCyGlFwkbZ9nmwAsrCzWkct6Ku+6jVNILRXGjRtHe3t7v+NduXLlgJZvFMdVZrDGpTUvMaKzo9VhvIrjKtNbXO3tzzY5mrUatd/3JXEMl3QQa1seXceJiFv6sjFJo4AfAZ+IiOelro2ZtbN2Uxa9lK9bEHExcDHAlClToq2trS/hdau9vZ2BLN8ojqvMYI3rup//gpfHDL47E4zo7HBcBXqLq62FN3Jq1H7fl8SxhPRP8ZrnuowH
fTtAPoKUNK6KiB/n4mck7ZBbGzvkbUFqSUyqLD4RWJTL27qUt/fhNZiZWZ2s9xhHREyOiJ16efQlaQi4FJgXEd+sTJoF1M6MmgZcXyk/IZ9dtR+wPHdp3QQcKmlMPih+aC4zM7MmadYl0d8OfAi4X9I9ueyzwHnATEknA08Bx+ZpNwBHAvOBPwInAUTEUklfBmrXx/pSRCxtzkswMzNoUuKIiN/Q/fEJgEO6mT+A03pY13TW7SozM7Mm8uVCzMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzKzK81QGYtdrM2+a2bNsjWrZls/5zi8PMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVaUrikDRd0hJJcytl20qaLenR/Dwml0vStyXNl3SfpD0ry0zL8z8qaVozYjczs3U1q8XxA+DwLmVnAjdHxC7AzXkc4Ahgl/w4BfgupEQDfAHYF9gH+EIt2ZiZWfM0JXFExK+BpV2KpwKX5eHLgGMq5ZdHMgcYLWkH4DBgdkQsjYhOYDavTkZmZtZgrfzn+LiIWAwQEYslbZ/LJwALK/N15LKeyl9F0imk1grjxo2jvb2930GuXLlyQMs3iuMq01tcI1atbm4wFVrzEiM6O1q2/Z44rjK9xdXe/myTo1mrUZ/HwXjJEXVTFr2Uv7ow4mLgYoApU6ZEW1tbv4Npb29nIMs3iuMq01tcLb3kSGcHL4+Z2LLt98RxlektrrYDdm9yNGs16vPYyrOqnsldUOTnJbm8A5hUmW8isKiXcjMza6JWJo5ZQO3MqGnA9ZXyE/LZVfsBy3OX1k3AoZLG5IPih+YyMzNroqZ0VUm6BmgDxkrqIJ0ddR4wU9LJwFPAsXn2G4AjgfnAH4GTACJiqaQvA3fk+b4UEV0PuJuZWYM1JXFExAd7mHRIN/MGcFoP65kOTK9jaGZmVsj/HDczsyJOHGZmVsSJw8zMijhxmJlZEScOMzMr4sRhZmZFnDjMzKyIE4eZmRVx4jAzsyJOHGZmVsSJw8zMijhxmJlZEScOMzMr4sRhZmZFnDjMzKyIE4eZmRVpyo2czPpi5m1zG7buEatWN3T9ZkOJWxxmZlbEicPMzIo4cZiZWREf4zAza6BWHlvbvkHrdYvDzMyKOHGYmVkRJw4zMyvixGFmZkWcOMzMrIgTh5mZFXHiMDOzIk4cZmZWxInDzMyK+J/j69HZoquqvv+A3Zu+TRj4v1x9FVqzjd8GmTgkHQ58CxgGXBIR57U4pLpb35evv6DNrFU2uK4qScOAfweOAHYDPihpt9ZGZWY2dGxwiQPYB5gfEY9HxEvADGBqi2MyMxsyNsSuqgnAwsp4B7BvdQZJpwCn5NGVkh4ewPbGAs8OYPlGcVxlHFcZx1VmY4xrx54mbIiJQ92UxTojERcDF9dlY9KdETGlHuuqJ8dVxnGVcVxlhlpcG2JXVQcwqTI+EVjUoljMzIacDTFx3AHsImknSZsCxwGzWhyTmdmQscF1VUXEGkl/B9xEOh13ekQ80MBN1qXLqwEcVxnHVcZxlRlScSki1j+XmZlZtiF2VZmZWQs5cZiZWZEhlzgkTZe0RNLcStlbJf1W0v2Sfipp61w+QtJluXyepM9Uljlc0sOS5ks6cxDFtSCX3yPpzibHtamk7+fyeyW1VZbZK5fPl/RtSd2dVt2KuNrz+3hPfmw/wLgmSbo1vy8PSPp4Lt9W0mxJj+bnMblcuT7mS7pP0p6VdU3L8z8qadogiuuVSn0N6MSUfsT1xvwevyjpk13WVbfPZJ3jqttnsh9xHZ/fv/sk3SbprZV19b++ImJIPYC/APYE5lbK7gAOzMMfBr6ch/8amJGHtwQWAJNJB+UfA14H
bArcC+zW6rjy+AJgbIvq6zTg+3l4e+AuYJM8/jtgf9L/cG4EjhgkcbUDU+pYXzsAe+bhrYBHSJfG+QZwZi4/E/h6Hj4y14eA/YDbc/m2wOP5eUweHtPquPK0lS2sr+2BvYFzgU9W1lPXz2S94srTFlCnz2Q/4jqgtt+QLtNU278GVF9DrsUREb8GlnYpfgPw6zw8G/ir2uzASEnDgS2Al4DnacBlT+oUV90VxrUbcHNebgmwDJgiaQdg64j4baS99nLgmFbHNZDt9xLX4oj4fR5eAcwjXe1gKnBZnu0y1r7+qcDlkcwBRuf6OgyYHRFLI6Izv57DB0FcdVUaV0QsiYg7gJe7rKqun8k6xlVX/Yjrtrz/AMwh/e8NBlhfQy5x9GAucHQePpa1fzC8FlgFLAaeAs6PiKV0f9mTCYMgLkhJ5ZeS7lK69Eoj9BTXvcBUScMl7QTsladNINVRTbPrq6e4ar6fuxHOlgbWhVYlaTLwNuB2YFxELIb04Sf9QoWe96WG7WMDjAtgc0l3SpojaUA/APoRV09aXV+9achnsh9xnUxqRcIA68uJI/kwcJqku0jNv5dy+T7AK8B4YCfgdEmvow+XPWlRXABvj4g9Sc3S0yT9RRPjmk7aAe8ELgRuA9bQ+vrqKS6A4yPizcA78+ND9QhE0ijgR8AnIqK31mBPddOQOqtDXACvjXQZi78GLpT0+ibG1eMquilrZn31pu6fydK4JB1EShxn1Iq6ma3P9eXEAUTEQxFxaETsBVxD6vuD9MH4RUS8nLs4/pvUxdGUy570Iy4iYlF+XgJcR0oyTYkrItZExD9GxB4RMRUYDTxKqq+JlVU0tb56iYuIeDo/rwCupg71JWkE6UN9VUT8OBc/U+vqyc9LcnlP+1Ld97E6xVXdxx4nHSN6WxPj6kmr66tH9f5MlsYl6S3AJcDUiHguFw+ovpw4AOUzaSRtApwFXJQnPQUcnM8wGUk6SPgQTbrsSWlckkZK2iovMxI4lNR905S4JG2Zt4ukdwNrIuLB3HReIWm/3BV0AnB9q+PKXVdjc/kI4L0MsL7y67sUmBcR36xMmgXUzoyaxtrXPws4Ib+X+wHLc33dBBwqaUw+Q+bQXNbSuHI8m+V1jgXeDs2iZSQAAATsSURBVDzYxLh6UtfPZL3iqvdnsjQuSa8Ffgx8KCIeqcw/sPrq61H0jeVB+iW6mHQQq4PUfPs46eyER4DzWPuP+lHAfwIPkD4cn6qs58g8/2PA5wZDXKQzJO7NjwdaENdk4GHSAbtfATtW1jOF9IF5DPi32jKtjAsYSTrD6r5cX98Chg0wrneQmvz3Affkx5HAa0gH6B/Nz9vm+UW6MdljwP1UzvAidb3Nz4+TBkNcpLN07s/72P3AyU2O68/y+/086SSHDtKJF1DHz2S94qLOn8l+xHUJ0FmZ987KuvpdX77kiJmZFXFXlZmZFXHiMDOzIk4cZmZWxInDzMyKOHGYmVkRJw4zMyvixGGWSfqBpK/0cd6QtHM/t7NA0rv6s2wf1/9FSVc2av1mThw2pEg6TtLtklYp3c/jdkmn1vPihvUgabTSPUf+R9IKSY9IOmP9S5o1nhOHDRmSTif9O/yfSf/0HQd8jHTZjE1bGFp3LiBdIWBXYBvSVX8f63UJsyZx4rAhQdI2wJeAUyPi2ohYEcndEXF8RLzYzTL/V+nuaEslzZI0vsssR0p6XNKzkv45XyMLSa+XdIuk5/K0qySNLgx5b+DqiOiMiD9FuoDjtZXYviVpoaTnlS7X/c5eXvt+Snd/W6ZX3wHxxPwaVkh6QtLxhXHaEOTEYUPF/sBm9PHiipIOBr4GvJ9017UnSTe7qfpL0jW49iTdBOfDtcXzsuNJLYZJwBcL450DnCvpJEm7dDP9DmAP0h0Crwb+U9Lm3byOCcDPga/keT8J/EjSdvmie98m3Y1xK9J1qO4pjNOGICcOGyrGAs9GRO0+HFR+hb+gV98j4XhgekT8PrdGPgPs
r3TznJqvR7pD31Ok+3x8ECAi5kfE7Ih4MSL+AHwTOLAw3r8HrgL+Dngwt3yOqE2MiCsj4rlIl4z/F1JSfEM36/kb4IaIuCG3XGaT7ktyZJ7+J2B3SVtEurvcA4Vx2hDkxGFDxXPAWKXb7QIQEQdExOg8retnYTyplVGbd2Wer3qXtOod1J7MyyBpe0kzJD0t6XngSlLi6rOIeCEivhrp3iKvAWaSWhXb5m2cLmmepOWSlpGOg3S3jR2BY3OCXJbnfQewQ0SsAj5AOs6zWNLPJb2xJE4bmpw4bKj4LfAifb+v8iLSly7wv/dSeA3wdGWe6o1wXsvaG+F8jXTp67dExNakX/39Pmsr0h3evkq6DPxO+XjGGaRutDE5+S3vYRsLgSsiYnTlMTIizsvrviki3k3qjnsI+I/+xmlDhxOHDQkRsQw4B/iOpPdJGiVpE0l7kL6Qu7oaOEnSHvnGRV8Fbo+IBZV5PpVvbDSJdC+QH+byrYCVwLJ8jOFTpfEq3f98b0mb5mMXHyfd5+HhvP41wB+A4ZI+T7r3Q3euBI6SdJikYZI2l9QmaaKkcZKOzknxxRzzK6Wx2tDjxGFDRkR8A/gn4NOkW2s+A3yP9Ov9ti7z3gycTbpF52Lg9aS7pFVdT7oR1D2kA9CX5vJzSAfMl+fyH1MugO8Dz5JaMu8G3pO7zG4CbiTdhOdJYDXrdptVX8dCUivrs6REs5CUyDbJj9Pz+peSjsOc2o9YbYjxjZzMzKyIWxxmZlbEicOsRSTdKGllN4/Ptjo2s964q8rMzIq4xWFmZkWcOMzMrIgTh5mZFXHiMDOzIv8fEgSYdA1YC3gAAAAASUVORK5CYII=\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "import matplotlib.pyplot as plt\n",
- "import matplotlib as mpl\n",
- "import scipy.stats as spstats\n",
- "\n",
- "# Histogram of release years: shows how the games are spread over time and\n",
- "# gives a feel for sensible year intervals before binning.\n",
- "fig, ax = plt.subplots()\n",
- "bin_df['Year'].hist(color='#A9C5D3', ax=ax)  # draw explicitly on the axes created above\n",
- "# Title and x-label name the column that is actually plotted (Year).\n",
- "ax.set_title('Game Release Year Histogram', fontsize=12)\n",
- "ax.set_xlabel('Year', fontsize=12)\n",
- "ax.set_ylabel('Frequency', fontsize=12)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "从上面的直方图可以看出年份的取值范围和分布,接下来就可以把年份划分成多个区间(分箱),例如 1980-1985 是一个区间,1986-1990 是一个区间。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 129,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Name | \n",
- " Year | \n",
- " Year_bin | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Wii Sports | \n",
- " 2006.0 | \n",
- " 5 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Super Mario Bros. | \n",
- " 1985.0 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Mario Kart Wii | \n",
- " 2008.0 | \n",
- " 6 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Wii Sports Resort | \n",
- " 2009.0 | \n",
- " 6 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Pokemon Red/Pokemon Blue | \n",
- " 1996.0 | \n",
- " 3 | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " Tetris | \n",
- " 1989.0 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " New Super Mario Bros. | \n",
- " 2006.0 | \n",
- " 5 | \n",
- "
\n",
- " \n",
- " 7 | \n",
- " Wii Play | \n",
- " 2006.0 | \n",
- " 5 | \n",
- "
\n",
- " \n",
- " 8 | \n",
- " New Super Mario Bros. Wii | \n",
- " 2009.0 | \n",
- " 6 | \n",
- "
\n",
- " \n",
- " 9 | \n",
- " Duck Hunt | \n",
- " 1984.0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Name Year Year_bin\n",
- "0 Wii Sports 2006.0 5\n",
- "1 Super Mario Bros. 1985.0 1\n",
- "2 Mario Kart Wii 2008.0 6\n",
- "3 Wii Sports Resort 2009.0 6\n",
- "4 Pokemon Red/Pokemon Blue 1996.0 3\n",
- "5 Tetris 1989.0 2\n",
- "6 New Super Mario Bros. 2006.0 5\n",
- "7 Wii Play 2006.0 5\n",
- "8 New Super Mario Bros. Wii 2009.0 6\n",
- "9 Duck Hunt 1984.0 0"
- ]
- },
- "execution_count": 129,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Bin Year into 9 equal-width intervals and store each row's interval position (0-8).\n",
- "# The codes come straight from pd.cut's ordered categories, so they follow numeric\n",
- "# interval order and an empty bin still keeps its slot; rows with a missing Year get -1\n",
- "# (pandas' code for a missing category).\n",
- "year_intervals = pd.cut(bin_df['Year'], 9)\n",
- "# assign() returns a new frame instead of writing into what may be a view of vg_df.\n",
- "bin_df = bin_df.assign(Year_bin=year_intervals.cat.codes.astype(int))\n",
- "bin_df.head(10)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}