diff --git a/机器学习竞赛实战_优胜解决方案/游戏销售数据_特征常用构建方法/游戏销售数据-常用特征构造方法.ipynb b/机器学习竞赛实战_优胜解决方案/游戏销售数据_特征常用构建方法/游戏销售数据-常用特征构造方法.ipynb
new file mode 100644
index 0000000..2d81053
--- /dev/null
+++ b/机器学习竞赛实战_优胜解决方案/游戏销售数据_特征常用构建方法/游戏销售数据-常用特征构造方法.ipynb
@@ -0,0 +1,1272 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 离散值处理"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Name | \n",
+ " Platform | \n",
+ " Year | \n",
+ " Genre | \n",
+ " Publisher | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " Super Mario Bros. | \n",
+ " NES | \n",
+ " 1985.0 | \n",
+ " Platform | \n",
+ " Nintendo | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Mario Kart Wii | \n",
+ " Wii | \n",
+ " 2008.0 | \n",
+ " Racing | \n",
+ " Nintendo | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Wii Sports Resort | \n",
+ " Wii | \n",
+ " 2009.0 | \n",
+ " Sports | \n",
+ " Nintendo | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Pokemon Red/Pokemon Blue | \n",
+ " GB | \n",
+ " 1996.0 | \n",
+ " Role-Playing | \n",
+ " Nintendo | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Tetris | \n",
+ " GB | \n",
+ " 1989.0 | \n",
+ " Puzzle | \n",
+ " Nintendo | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " New Super Mario Bros. | \n",
+ " DS | \n",
+ " 2006.0 | \n",
+ " Platform | \n",
+ " Nintendo | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name Platform Year Genre Publisher\n",
+ "1 Super Mario Bros. NES 1985.0 Platform Nintendo\n",
+ "2 Mario Kart Wii Wii 2008.0 Racing Nintendo\n",
+ "3 Wii Sports Resort Wii 2009.0 Sports Nintendo\n",
+ "4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo\n",
+ "5 Tetris GB 1989.0 Puzzle Nintendo\n",
+ "6 New Super Mario Bros. DS 2006.0 Platform Nintendo"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load the video-game sales table; the file is decoded as ISO-8859-1.\n",
+ "vg_df = pd.read_csv(\n",
+ "    'data/vgsales.csv',\n",
+ "    encoding='ISO-8859-1',\n",
+ ")\n",
+ "# Preview the rows at positions 1-6, restricted to the descriptive columns.\n",
+ "vg_df.iloc[1:7][['Name', 'Platform', 'Year', 'Genre', 'Publisher']]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "机器学习模型无法直接处理字符串类型的数据,需要先将其转换为数值。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',\n",
+ " 'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',\n",
+ " 'Strategy'], dtype=object)"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Sorted array of the distinct values found in the Genre column.\n",
+ "genres = np.unique(vg_df['Genre'].values)\n",
+ "genres  # only a handful of distinct strings"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## LabelEncoder"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{0: 'Action',\n",
+ " 1: 'Adventure',\n",
+ " 2: 'Fighting',\n",
+ " 3: 'Misc',\n",
+ " 4: 'Platform',\n",
+ " 5: 'Puzzle',\n",
+ " 6: 'Racing',\n",
+ " 7: 'Role-Playing',\n",
+ " 8: 'Shooter',\n",
+ " 9: 'Simulation',\n",
+ " 10: 'Sports',\n",
+ " 11: 'Strategy'}"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.preprocessing import LabelEncoder\n",
+ "\n",
+ "# Learn the genre vocabulary first, then translate every row into its integer code.\n",
+ "gle = LabelEncoder()\n",
+ "gle.fit(vg_df['Genre'])\n",
+ "genre_labels = gle.transform(vg_df['Genre'])\n",
+ "\n",
+ "# Code -> genre lookup: a class's code is its position in gle.classes_.\n",
+ "genre_mappings = dict(enumerate(gle.classes_))\n",
+ "genre_mappings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Name | \n",
+ " Platform | \n",
+ " Year | \n",
+ " Genre | \n",
+ " GenreLabel | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " Super Mario Bros. | \n",
+ " NES | \n",
+ " 1985.0 | \n",
+ " Platform | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Mario Kart Wii | \n",
+ " Wii | \n",
+ " 2008.0 | \n",
+ " Racing | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Wii Sports Resort | \n",
+ " Wii | \n",
+ " 2009.0 | \n",
+ " Sports | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Pokemon Red/Pokemon Blue | \n",
+ " GB | \n",
+ " 1996.0 | \n",
+ " Role-Playing | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Tetris | \n",
+ " GB | \n",
+ " 1989.0 | \n",
+ " Puzzle | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " New Super Mario Bros. | \n",
+ " DS | \n",
+ " 2006.0 | \n",
+ " Platform | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name Platform Year Genre GenreLabel\n",
+ "1 Super Mario Bros. NES 1985.0 Platform 4\n",
+ "2 Mario Kart Wii Wii 2008.0 Racing 6\n",
+ "3 Wii Sports Resort Wii 2009.0 Sports 10\n",
+ "4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing 7\n",
+ "5 Tetris GB 1989.0 Puzzle 5\n",
+ "6 New Super Mario Bros. DS 2006.0 Platform 4"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Store the encoded genre as a new column and preview it beside the original text.\n",
+ "vg_df['GenreLabel'] = genre_labels\n",
+ "vg_df.iloc[1:7][['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Map\n",
+ "自己构建一个从类别到数值的字典,再用map完成转换"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'Action': 0,\n",
+ " 'Adventure': 1,\n",
+ " 'Fighting': 2,\n",
+ " 'Misc': 3,\n",
+ " 'Platform': 4,\n",
+ " 'Puzzle': 5,\n",
+ " 'Racing': 6,\n",
+ " 'Role-Playing': 7,\n",
+ " 'Shooter': 8,\n",
+ " 'Simulation': 9,\n",
+ " 'Sports': 10,\n",
+ " 'Strategy': 11}"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Genre -> code lookup, built by pairing each class name with its position in gle.classes_.\n",
+ "gen_ord_map = dict(zip(gle.classes_, range(len(gle.classes_))))\n",
+ "gen_ord_map"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Name | \n",
+ " Genre | \n",
+ " GenreLabel | \n",
+ " GenreMap | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " Super Mario Bros. | \n",
+ " Platform | \n",
+ " 4 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Mario Kart Wii | \n",
+ " Racing | \n",
+ " 6 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Wii Sports Resort | \n",
+ " Sports | \n",
+ " 10 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Pokemon Red/Pokemon Blue | \n",
+ " Role-Playing | \n",
+ " 7 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Tetris | \n",
+ " Puzzle | \n",
+ " 5 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " New Super Mario Bros. | \n",
+ " Platform | \n",
+ " 4 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name Genre GenreLabel GenreMap\n",
+ "1 Super Mario Bros. Platform 4 4\n",
+ "2 Mario Kart Wii Racing 6 6\n",
+ "3 Wii Sports Resort Sports 10 10\n",
+ "4 Pokemon Red/Pokemon Blue Role-Playing 7 7\n",
+ "5 Tetris Puzzle 5 5\n",
+ "6 New Super Mario Bros. Platform 4 4"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Translate each genre string through the hand-built dictionary.\n",
+ "vg_df['GenreMap'] = vg_df.Genre.map(gen_ord_map)\n",
+ "# The preview shows GenreMap next to GenreLabel so the two encodings can be compared.\n",
+ "vg_df.loc[:, ['Name', 'Genre', 'GenreLabel', 'GenreMap']].iloc[1:7]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## One-Hot Encoder\n",
+ "对于离散型特征,基于树的方法(例如随机森林)通常不需要使用one-hot编码;而基于距离或对特征做线性加权的模型(例如KNN、SVM、逻辑回归、神经网络)一般需要使用one-hot编码,否则整数编码会引入并不存在的大小和顺序关系。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "D:\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:415: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
+ "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
+ "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
+ " warnings.warn(msg, FutureWarning)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([[0., 0., 0., ..., 0., 1., 0.],\n",
+ " [0., 0., 0., ..., 0., 0., 0.],\n",
+ " [0., 0., 0., ..., 0., 0., 0.],\n",
+ " ...,\n",
+ " [0., 0., 0., ..., 0., 0., 0.],\n",
+ " [0., 0., 0., ..., 0., 0., 0.],\n",
+ " [0., 0., 0., ..., 0., 0., 0.]])"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.preprocessing import OneHotEncoder\n",
+ "\n",
+ "# One-hot encode the integer genre codes: one 0/1 column per category, 1 where the row has it.\n",
+ "# categories='auto' derives the categories from the unique values actually present. That is the\n",
+ "# default from scikit-learn 0.22 on; stating it explicitly silences the FutureWarning that\n",
+ "# 0.20/0.21 raise for integer input and keeps the result the same across versions.\n",
+ "gen_ohe = OneHotEncoder(categories='auto')\n",
+ "gen_feature_arr = gen_ohe.fit_transform(vg_df[['GenreLabel']]).toarray()  # densify the sparse result\n",
+ "gen_feature_arr"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Action | \n",
+ " Adventure | \n",
+ " Fighting | \n",
+ " Misc | \n",
+ " Platform | \n",
+ " Puzzle | \n",
+ " Racing | \n",
+ " Role-Playing | \n",
+ " Shooter | \n",
+ " Simulation | \n",
+ " Sports | \n",
+ " Strategy | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Action Adventure Fighting Misc Platform Puzzle Racing Role-Playing \\\n",
+ "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 \n",
+ "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n",
+ "\n",
+ " Shooter Simulation Sports Strategy \n",
+ "0 0.0 0.0 1.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 1.0 0.0 \n",
+ "4 0.0 0.0 0.0 0.0 "
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Take the column names from the fitted LabelEncoder: column j of gen_feature_arr encodes\n",
+ "# label j, and label j is gle.classes_[j], so the names cannot drift from the encoding order.\n",
+ "genres = gle.classes_.copy()\n",
+ "# Reuse vg_df's index so a later column-wise concat with vg_df columns aligns row for row.\n",
+ "gen_features = pd.DataFrame(gen_feature_arr, columns=genres, index=vg_df.index)\n",
+ "gen_features.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Name | \n",
+ " Genre | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Wii Sports | \n",
+ " Sports | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Super Mario Bros. | \n",
+ " Platform | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Mario Kart Wii | \n",
+ " Racing | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Wii Sports Resort | \n",
+ " Sports | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Pokemon Red/Pokemon Blue | \n",
+ " Role-Playing | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name Genre\n",
+ "0 Wii Sports Sports\n",
+ "1 Super Mario Bros. Platform\n",
+ "2 Mario Kart Wii Racing\n",
+ "3 Wii Sports Resort Sports\n",
+ "4 Pokemon Red/Pokemon Blue Role-Playing"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Keep only two of the original columns so the merged result is easy to read;\n",
+ "# in a real pipeline the whole table would be merged instead.\n",
+ "vg_df_2 = vg_df.loc[:, ['Name', 'Genre']]\n",
+ "vg_df_2.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Name | \n",
+ " Genre | \n",
+ " Action | \n",
+ " Adventure | \n",
+ " Fighting | \n",
+ " Misc | \n",
+ " Platform | \n",
+ " Puzzle | \n",
+ " Racing | \n",
+ " Role-Playing | \n",
+ " Shooter | \n",
+ " Simulation | \n",
+ " Sports | \n",
+ " Strategy | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Wii Sports | \n",
+ " Sports | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Super Mario Bros. | \n",
+ " Platform | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Mario Kart Wii | \n",
+ " Racing | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Wii Sports Resort | \n",
+ " Sports | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Pokemon Red/Pokemon Blue | \n",
+ " Role-Playing | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name Genre Action Adventure Fighting Misc \\\n",
+ "0 Wii Sports Sports 0.0 0.0 0.0 0.0 \n",
+ "1 Super Mario Bros. Platform 0.0 0.0 0.0 0.0 \n",
+ "2 Mario Kart Wii Racing 0.0 0.0 0.0 0.0 \n",
+ "3 Wii Sports Resort Sports 0.0 0.0 0.0 0.0 \n",
+ "4 Pokemon Red/Pokemon Blue Role-Playing 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Platform Puzzle Racing Role-Playing Shooter Simulation Sports \\\n",
+ "0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n",
+ "1 1.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 1.0 0.0 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n",
+ "4 0.0 0.0 0.0 1.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Strategy \n",
+ "0 0.0 \n",
+ "1 0.0 \n",
+ "2 0.0 \n",
+ "3 0.0 \n",
+ "4 0.0 "
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Place the one-hot columns to the right of the Name/Genre columns.\n",
+ "vg_df_ohe = pd.concat(objs=[vg_df_2, gen_features], axis='columns')\n",
+ "# Check by eye: row 1 has Genre == 'Platform' and a 1 in the Platform column.\n",
+ "vg_df_ohe.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Get Dummy\n",
+ "更加实用的one-hot编码方式:pandas的get_dummies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(16598, 13)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Name | \n",
+ " Genre | \n",
+ " Adventure | \n",
+ " Fighting | \n",
+ " Misc | \n",
+ " Platform | \n",
+ " Puzzle | \n",
+ " Racing | \n",
+ " Role-Playing | \n",
+ " Shooter | \n",
+ " Simulation | \n",
+ " Sports | \n",
+ " Strategy | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Wii Sports | \n",
+ " Sports | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Super Mario Bros. | \n",
+ " Platform | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Mario Kart Wii | \n",
+ " Racing | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Wii Sports Resort | \n",
+ " Sports | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Pokemon Red/Pokemon Blue | \n",
+ " Role-Playing | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name Genre Adventure Fighting Misc \\\n",
+ "0 Wii Sports Sports 0 0 0 \n",
+ "1 Super Mario Bros. Platform 0 0 0 \n",
+ "2 Mario Kart Wii Racing 0 0 0 \n",
+ "3 Wii Sports Resort Sports 0 0 0 \n",
+ "4 Pokemon Red/Pokemon Blue Role-Playing 0 0 0 \n",
+ "\n",
+ " Platform Puzzle Racing Role-Playing Shooter Simulation Sports \\\n",
+ "0 0 0 0 0 0 0 1 \n",
+ "1 1 0 0 0 0 0 0 \n",
+ "2 0 0 1 0 0 0 0 \n",
+ "3 0 0 0 0 0 0 1 \n",
+ "4 0 0 0 1 0 0 0 \n",
+ "\n",
+ " Strategy \n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 "
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# drop_first=True omits the indicator of the first category ('Action' here), so k genres\n",
+ "# yield k-1 columns; a row whose indicators are all 0 belongs to that dropped category.\n",
+ "gen_dummy_features = pd.get_dummies(vg_df['Genre'], drop_first=True)\n",
+ "dummy_df = pd.concat(objs=[vg_df[['Name', 'Genre']], gen_dummy_features], axis='columns')\n",
+ "print(dummy_df.shape)\n",
+ "dummy_df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "可以看到,两行代码就完成了上面One-Hot Encoder那一长串步骤"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(16598, 14)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Name | \n",
+ " Genre | \n",
+ " Action | \n",
+ " Adventure | \n",
+ " Fighting | \n",
+ " Misc | \n",
+ " Platform | \n",
+ " Puzzle | \n",
+ " Racing | \n",
+ " Role-Playing | \n",
+ " Shooter | \n",
+ " Simulation | \n",
+ " Sports | \n",
+ " Strategy | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Wii Sports | \n",
+ " Sports | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Super Mario Bros. | \n",
+ " Platform | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Mario Kart Wii | \n",
+ " Racing | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Wii Sports Resort | \n",
+ " Sports | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Pokemon Red/Pokemon Blue | \n",
+ " Role-Playing | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name Genre Action Adventure Fighting Misc \\\n",
+ "0 Wii Sports Sports 0 0 0 0 \n",
+ "1 Super Mario Bros. Platform 0 0 0 0 \n",
+ "2 Mario Kart Wii Racing 0 0 0 0 \n",
+ "3 Wii Sports Resort Sports 0 0 0 0 \n",
+ "4 Pokemon Red/Pokemon Blue Role-Playing 0 0 0 0 \n",
+ "\n",
+ " Platform Puzzle Racing Role-Playing Shooter Simulation Sports \\\n",
+ "0 0 0 0 0 0 0 1 \n",
+ "1 1 0 0 0 0 0 0 \n",
+ "2 0 0 1 0 0 0 0 \n",
+ "3 0 0 0 0 0 0 1 \n",
+ "4 0 0 0 1 0 0 0 \n",
+ "\n",
+ " Strategy \n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 "
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Without drop_first every genre keeps its own indicator column (the usual choice).\n",
+ "gen_dummy_features = pd.get_dummies(data=vg_df['Genre'])\n",
+ "dummy_df_true = pd.concat(objs=[vg_df[['Name', 'Genre']], gen_dummy_features], axis='columns')\n",
+ "print(dummy_df_true.shape)\n",
+ "dummy_df_true.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}