You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2766 lines
104 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 离散值处理"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import warnings # 忽略普通警告,不打印太多东西\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Platform</th>\n",
" <th>Year</th>\n",
" <th>Genre</th>\n",
" <th>Publisher</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Super Mario Bros.</td>\n",
" <td>NES</td>\n",
" <td>1985.0</td>\n",
" <td>Platform</td>\n",
" <td>Nintendo</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Mario Kart Wii</td>\n",
" <td>Wii</td>\n",
" <td>2008.0</td>\n",
" <td>Racing</td>\n",
" <td>Nintendo</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Wii Sports Resort</td>\n",
" <td>Wii</td>\n",
" <td>2009.0</td>\n",
" <td>Sports</td>\n",
" <td>Nintendo</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Pokemon Red/Pokemon Blue</td>\n",
" <td>GB</td>\n",
" <td>1996.0</td>\n",
" <td>Role-Playing</td>\n",
" <td>Nintendo</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Tetris</td>\n",
" <td>GB</td>\n",
" <td>1989.0</td>\n",
" <td>Puzzle</td>\n",
" <td>Nintendo</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>New Super Mario Bros.</td>\n",
" <td>DS</td>\n",
" <td>2006.0</td>\n",
" <td>Platform</td>\n",
" <td>Nintendo</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Platform Year Genre Publisher\n",
"1 Super Mario Bros. NES 1985.0 Platform Nintendo\n",
"2 Mario Kart Wii Wii 2008.0 Racing Nintendo\n",
"3 Wii Sports Resort Wii 2009.0 Sports Nintendo\n",
"4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo\n",
"5 Tetris GB 1989.0 Puzzle Nintendo\n",
"6 New Super Mario Bros. DS 2006.0 Platform Nintendo"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vg_df = pd.read_csv('data/vgsales.csv', encoding='ISO-8859-1')\n",
"vg_df[['Name', 'Platform', 'Year', 'Genre', 'Publisher']].iloc[1:7]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"机器学习模型无法直接处理字符串类型的数据,需要先把它们编码成数值。"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',\n",
" 'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',\n",
" 'Strategy'], dtype=object)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"genres = np.unique(vg_df['Genre'])\n",
"genres # 不同的字符串并不多"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## LabelEncoder"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{0: 'Action',\n",
" 1: 'Adventure',\n",
" 2: 'Fighting',\n",
" 3: 'Misc',\n",
" 4: 'Platform',\n",
" 5: 'Puzzle',\n",
" 6: 'Racing',\n",
" 7: 'Role-Playing',\n",
" 8: 'Shooter',\n",
" 9: 'Simulation',\n",
" 10: 'Sports',\n",
" 11: 'Strategy'}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"gle = LabelEncoder() # 实例化\n",
"genre_labels = gle.fit_transform(vg_df['Genre']) # 将 Genre 列的类别编码为整数标签\n",
"genre_mappings = {index: label for index, label in enumerate(gle.classes_)}\n",
"genre_mappings # 整数标签 -> 类别名称 的映射"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Platform</th>\n",
" <th>Year</th>\n",
" <th>Genre</th>\n",
" <th>GenreLabel</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Super Mario Bros.</td>\n",
" <td>NES</td>\n",
" <td>1985.0</td>\n",
" <td>Platform</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Mario Kart Wii</td>\n",
" <td>Wii</td>\n",
" <td>2008.0</td>\n",
" <td>Racing</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Wii Sports Resort</td>\n",
" <td>Wii</td>\n",
" <td>2009.0</td>\n",
" <td>Sports</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Pokemon Red/Pokemon Blue</td>\n",
" <td>GB</td>\n",
" <td>1996.0</td>\n",
" <td>Role-Playing</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Tetris</td>\n",
" <td>GB</td>\n",
" <td>1989.0</td>\n",
" <td>Puzzle</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>New Super Mario Bros.</td>\n",
" <td>DS</td>\n",
" <td>2006.0</td>\n",
" <td>Platform</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Platform Year Genre GenreLabel\n",
"1 Super Mario Bros. NES 1985.0 Platform 4\n",
"2 Mario Kart Wii Wii 2008.0 Racing 6\n",
"3 Wii Sports Resort Wii 2009.0 Sports 10\n",
"4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing 7\n",
"5 Tetris GB 1989.0 Puzzle 5\n",
"6 New Super Mario Bros. DS 2006.0 Platform 4"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vg_df['GenreLabel'] = genre_labels # 赋值到一列\n",
"vg_df[['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']].iloc[1:7]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Map\n",
"自己构建一个“类别名称 -> 整数”的字典,再用 `Series.map` 完成编码。"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'Action': 0,\n",
" 'Adventure': 1,\n",
" 'Fighting': 2,\n",
" 'Misc': 3,\n",
" 'Platform': 4,\n",
" 'Puzzle': 5,\n",
" 'Racing': 6,\n",
" 'Role-Playing': 7,\n",
" 'Shooter': 8,\n",
" 'Simulation': 9,\n",
" 'Sports': 10,\n",
" 'Strategy': 11}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gen_ord_map = {label:index for index, label in enumerate(gle.classes_)}\n",
"gen_ord_map"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Genre</th>\n",
" <th>GenreLabel</th>\n",
" <th>GenreMap</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Super Mario Bros.</td>\n",
" <td>Platform</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Mario Kart Wii</td>\n",
" <td>Racing</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Wii Sports Resort</td>\n",
" <td>Sports</td>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Pokemon Red/Pokemon Blue</td>\n",
" <td>Role-Playing</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Tetris</td>\n",
" <td>Puzzle</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>New Super Mario Bros.</td>\n",
" <td>Platform</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Genre GenreLabel GenreMap\n",
"1 Super Mario Bros. Platform 4 4\n",
"2 Mario Kart Wii Racing 6 6\n",
"3 Wii Sports Resort Sports 10 10\n",
"4 Pokemon Red/Pokemon Blue Role-Playing 7 7\n",
"5 Tetris Puzzle 5 5\n",
"6 New Super Mario Bros. Platform 4 4"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vg_df['GenreMap'] = vg_df['Genre'].map(gen_ord_map)\n",
"vg_df[['Name', 'Genre', 'GenreLabel', 'GenreMap']].iloc[1:7] # GenreMap 按我们构建的字典映射,结果与 GenreLabel 一致"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## One-Hot Encoder\n",
"对于离散型特征,基于树的方法(例如随机森林)通常不需要使用 one-hot 编码;基于距离的模型(例如 KNN、SVM)以及神经网络等一般需要使用 one-hot 编码,否则整数标签会被当作有大小和顺序的数值。"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0., 0., 0., ..., 0., 1., 0.],\n",
" [0., 0., 0., ..., 0., 0., 0.],\n",
" [0., 0., 0., ..., 0., 0., 0.],\n",
" ...,\n",
" [0., 0., 0., ..., 0., 0., 0.],\n",
" [0., 0., 0., ..., 0., 0., 0.],\n",
" [0., 0., 0., ..., 0., 0., 0.]])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"# 获取 one-hot 编码后的结果:每个类别对应一列 0/1 值,样本属于该类别则为 1,否则为 0\n",
"gen_ohe = OneHotEncoder()\n",
"gen_feature_arr = gen_ohe.fit_transform(vg_df[['GenreLabel']]).toarray()\n",
"gen_feature_arr"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Action</th>\n",
" <th>Adventure</th>\n",
" <th>Fighting</th>\n",
" <th>Misc</th>\n",
" <th>Platform</th>\n",
" <th>Puzzle</th>\n",
" <th>Racing</th>\n",
" <th>Role-Playing</th>\n",
" <th>Shooter</th>\n",
" <th>Simulation</th>\n",
" <th>Sports</th>\n",
" <th>Strategy</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Action Adventure Fighting Misc Platform Puzzle Racing Role-Playing \\\n",
"0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n",
"\n",
" Shooter Simulation Sports Strategy \n",
"0 0.0 0.0 1.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 1.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"genres = np.unique(vg_df['Genre']) # 获取全部不同的类别名称(已排序)\n",
"gen_features = pd.DataFrame(gen_feature_arr, columns=genres) # 以类别名称作为 one-hot 各列的列名;依赖 np.unique 的排序与 LabelEncoder 的编码顺序一致\n",
"gen_features.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Genre</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Wii Sports</td>\n",
" <td>Sports</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Super Mario Bros.</td>\n",
" <td>Platform</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Mario Kart Wii</td>\n",
" <td>Racing</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Wii Sports Resort</td>\n",
" <td>Sports</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Pokemon Red/Pokemon Blue</td>\n",
" <td>Role-Playing</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Genre\n",
"0 Wii Sports Sports\n",
"1 Super Mario Bros. Platform\n",
"2 Mario Kart Wii Racing\n",
"3 Wii Sports Resort Sports\n",
"4 Pokemon Red/Pokemon Blue Role-Playing"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 拿出两列原本的数据,实际场景中是全部数据合并,这里是为了查看方便\n",
"vg_df_2 = vg_df[['Name', 'Genre']]\n",
"vg_df_2.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Genre</th>\n",
" <th>Action</th>\n",
" <th>Adventure</th>\n",
" <th>Fighting</th>\n",
" <th>Misc</th>\n",
" <th>Platform</th>\n",
" <th>Puzzle</th>\n",
" <th>Racing</th>\n",
" <th>Role-Playing</th>\n",
" <th>Shooter</th>\n",
" <th>Simulation</th>\n",
" <th>Sports</th>\n",
" <th>Strategy</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Wii Sports</td>\n",
" <td>Sports</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Super Mario Bros.</td>\n",
" <td>Platform</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Mario Kart Wii</td>\n",
" <td>Racing</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Wii Sports Resort</td>\n",
" <td>Sports</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Pokemon Red/Pokemon Blue</td>\n",
" <td>Role-Playing</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Genre Action Adventure Fighting Misc \\\n",
"0 Wii Sports Sports 0.0 0.0 0.0 0.0 \n",
"1 Super Mario Bros. Platform 0.0 0.0 0.0 0.0 \n",
"2 Mario Kart Wii Racing 0.0 0.0 0.0 0.0 \n",
"3 Wii Sports Resort Sports 0.0 0.0 0.0 0.0 \n",
"4 Pokemon Red/Pokemon Blue Role-Playing 0.0 0.0 0.0 0.0 \n",
"\n",
" Platform Puzzle Racing Role-Playing Shooter Simulation Sports \\\n",
"0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n",
"1 1.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 1.0 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n",
"4 0.0 0.0 0.0 1.0 0.0 0.0 0.0 \n",
"\n",
" Strategy \n",
"0 0.0 \n",
"1 0.0 \n",
"2 0.0 \n",
"3 0.0 \n",
"4 0.0 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vg_df_ohe = pd.concat([vg_df_2,gen_features],axis=1) # 两个数据合并\n",
"vg_df_ohe.head() # 可以看到 Platform 列第二行为 1,对应 Genre 列第二行的字符串 Platform"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get Dummies\n",
"更加实用的 one-hot 方法:`pd.get_dummies`。"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(16598, 13)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Genre</th>\n",
" <th>Adventure</th>\n",
" <th>Fighting</th>\n",
" <th>Misc</th>\n",
" <th>Platform</th>\n",
" <th>Puzzle</th>\n",
" <th>Racing</th>\n",
" <th>Role-Playing</th>\n",
" <th>Shooter</th>\n",
" <th>Simulation</th>\n",
" <th>Sports</th>\n",
" <th>Strategy</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Wii Sports</td>\n",
" <td>Sports</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Super Mario Bros.</td>\n",
" <td>Platform</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Mario Kart Wii</td>\n",
" <td>Racing</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Wii Sports Resort</td>\n",
" <td>Sports</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Pokemon Red/Pokemon Blue</td>\n",
" <td>Role-Playing</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Genre Adventure Fighting Misc \\\n",
"0 Wii Sports Sports 0 0 0 \n",
"1 Super Mario Bros. Platform 0 0 0 \n",
"2 Mario Kart Wii Racing 0 0 0 \n",
"3 Wii Sports Resort Sports 0 0 0 \n",
"4 Pokemon Red/Pokemon Blue Role-Playing 0 0 0 \n",
"\n",
" Platform Puzzle Racing Role-Playing Shooter Simulation Sports \\\n",
"0 0 0 0 0 0 0 1 \n",
"1 1 0 0 0 0 0 0 \n",
"2 0 0 1 0 0 0 0 \n",
"3 0 0 0 0 0 0 1 \n",
"4 0 0 0 1 0 0 0 \n",
"\n",
" Strategy \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gen_dummy_features = pd.get_dummies(vg_df['Genre'],drop_first=True) # drop_first=True 会去掉第一个类别(这里是 Action)对应的列,只保留 k-1 个哑变量列\n",
"dummy_df = pd.concat([vg_df[['Name', 'Genre']], gen_dummy_features], axis=1)\n",
"print(dummy_df.shape)\n",
"dummy_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"可以看到,两行代码就完成了上面那一长串 one-hot 编码的步骤。"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(16598, 14)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Genre</th>\n",
" <th>Action</th>\n",
" <th>Adventure</th>\n",
" <th>Fighting</th>\n",
" <th>Misc</th>\n",
" <th>Platform</th>\n",
" <th>Puzzle</th>\n",
" <th>Racing</th>\n",
" <th>Role-Playing</th>\n",
" <th>Shooter</th>\n",
" <th>Simulation</th>\n",
" <th>Sports</th>\n",
" <th>Strategy</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Wii Sports</td>\n",
" <td>Sports</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Super Mario Bros.</td>\n",
" <td>Platform</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Mario Kart Wii</td>\n",
" <td>Racing</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Wii Sports Resort</td>\n",
" <td>Sports</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Pokemon Red/Pokemon Blue</td>\n",
" <td>Role-Playing</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Genre Action Adventure Fighting Misc \\\n",
"0 Wii Sports Sports 0 0 0 0 \n",
"1 Super Mario Bros. Platform 0 0 0 0 \n",
"2 Mario Kart Wii Racing 0 0 0 0 \n",
"3 Wii Sports Resort Sports 0 0 0 0 \n",
"4 Pokemon Red/Pokemon Blue Role-Playing 0 0 0 0 \n",
"\n",
" Platform Puzzle Racing Role-Playing Shooter Simulation Sports \\\n",
"0 0 0 0 0 0 0 1 \n",
"1 1 0 0 0 0 0 0 \n",
"2 0 0 1 0 0 0 0 \n",
"3 0 0 0 0 0 0 1 \n",
"4 0 0 0 1 0 0 0 \n",
"\n",
" Strategy \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gen_dummy_features = pd.get_dummies(vg_df['Genre']) # 和上面相比少了 drop_first=True,保留全部类别列;一般用这种\n",
"dummy_df_true = pd.concat([vg_df[['Name', 'Genre']], gen_dummy_features], axis=1)\n",
"print(dummy_df_true.shape)\n",
"dummy_df_true.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 二值特征化"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Year</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Wii Sports</td>\n",
" <td>2006.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Super Mario Bros.</td>\n",
" <td>1985.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Mario Kart Wii</td>\n",
" <td>2008.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Wii Sports Resort</td>\n",
" <td>2009.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Pokemon Red/Pokemon Blue</td>\n",
" <td>1996.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Year\n",
"0 Wii Sports 2006.0\n",
"1 Super Mario Bros. 1985.0\n",
"2 Mario Kart Wii 2008.0\n",
"3 Wii Sports Resort 2009.0\n",
"4 Pokemon Red/Pokemon Blue 1996.0"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vg_year_df = vg_df[['Name', 'Year']]\n",
"vg_year_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"我们把 2000 年及以后(Year >= 2000)的归类为 1,其它归类为 0。"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Year</th>\n",
" <th>Year_tow</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Wii Sports</td>\n",
" <td>2006.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Super Mario Bros.</td>\n",
" <td>1985.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Mario Kart Wii</td>\n",
" <td>2008.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Wii Sports Resort</td>\n",
" <td>2009.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Pokemon Red/Pokemon Blue</td>\n",
" <td>1996.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Year Year_tow\n",
"0 Wii Sports 2006.0 1\n",
"1 Super Mario Bros. 1985.0 0\n",
"2 Mario Kart Wii 2008.0 1\n",
"3 Wii Sports Resort 2009.0 1\n",
"4 Pokemon Red/Pokemon Blue 1996.0 0"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vg_year_df['Year_tow'] = np.where(vg_year_df['Year'] >= 2000, 1, 0)\n",
"vg_year_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Year</th>\n",
" <th>Year_tow</th>\n",
" <th>bn_year</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Wii Sports</td>\n",
" <td>2006.0</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Super Mario Bros.</td>\n",
" <td>1985.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Mario Kart Wii</td>\n",
" <td>2008.0</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Wii Sports Resort</td>\n",
" <td>2009.0</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Pokemon Red/Pokemon Blue</td>\n",
" <td>1996.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Year Year_tow bn_year\n",
"0 Wii Sports 2006.0 1 1.0\n",
"1 Super Mario Bros. 1985.0 0 0.0\n",
"2 Mario Kart Wii 2008.0 1 1.0\n",
"3 Wii Sports Resort 2009.0 1 1.0\n",
"4 Pokemon Red/Pokemon Blue 1996.0 0 0.0"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import Binarizer\n",
"# sklearn中的方法\n",
"bn = Binarizer(threshold=2000) # 严格大于 2000 的为 1,小于等于 2000 的为 0\n",
"vg_year_df['Year']=vg_year_df['Year'].fillna(0) # 数据中有 NaN 值,需要先补 0,否则无法二值化\n",
"bn_year = bn.transform([vg_year_df['Year']])[0] # 传入形状为 (1, n) 的二维输入,取结果的第 0 行\n",
"vg_year_df['bn_year'] = bn_year # 插入数据\n",
"vg_year_df.head() # 前几行与手动结果一致;注意 Year 恰为 2000 时这里得 0,而上面的 >= 2000 得 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 多项式特征\n",
"生成特征的高次项以及特征之间的交互(乘积)项。"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>NA_Sales</th>\n",
" <th>EU_Sales</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>41.49</td>\n",
" <td>29.02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>29.08</td>\n",
" <td>3.58</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>15.85</td>\n",
" <td>12.88</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>15.75</td>\n",
" <td>11.01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>11.27</td>\n",
" <td>8.89</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" NA_Sales EU_Sales\n",
"0 41.49 29.02\n",
"1 29.08 3.58\n",
"2 15.85 12.88\n",
"3 15.75 11.01\n",
"4 11.27 8.89"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"polynomial_df = vg_df[['NA_Sales', 'EU_Sales']]\n",
"polynomial_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[4.1490000e+01, 2.9020000e+01, 1.7214201e+03, 1.2040398e+03,\n",
" 8.4216040e+02],\n",
" [2.9080000e+01, 3.5800000e+00, 8.4564640e+02, 1.0410640e+02,\n",
" 1.2816400e+01],\n",
" [1.5850000e+01, 1.2880000e+01, 2.5122250e+02, 2.0414800e+02,\n",
" 1.6589440e+02],\n",
" ...,\n",
" [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,\n",
" 0.0000000e+00],\n",
" [0.0000000e+00, 1.0000000e-02, 0.0000000e+00, 0.0000000e+00,\n",
" 1.0000000e-04],\n",
" [1.0000000e-02, 0.0000000e+00, 1.0000000e-04, 0.0000000e+00,\n",
" 0.0000000e+00]])"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import PolynomialFeatures\n",
"\n",
"# degree=2:生成最高二次的多项式特征(平方项与两两乘积项);include_bias=False 表示不添加常数项列\n",
"pf = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)\n",
"res = pf.fit_transform(polynomial_df)\n",
"res"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"以第一行为例:\n",
"<br>第一列和第二列分别表示原先的第一列和第二列\n",
"<br>第三列和第五列表示第一列和第二列分别的平方,第四列表示两者的乘积"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>NA_Sales</th>\n",
" <th>EU_Sales</th>\n",
" <th>NA_Sales^2</th>\n",
" <th>NA_Sales*EU_Sales</th>\n",
" <th>EU_Sales^2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>41.49</td>\n",
" <td>29.02</td>\n",
" <td>1721.4201</td>\n",
" <td>1204.0398</td>\n",
" <td>842.1604</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>29.08</td>\n",
" <td>3.58</td>\n",
" <td>845.6464</td>\n",
" <td>104.1064</td>\n",
" <td>12.8164</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>15.85</td>\n",
" <td>12.88</td>\n",
" <td>251.2225</td>\n",
" <td>204.1480</td>\n",
" <td>165.8944</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>15.75</td>\n",
" <td>11.01</td>\n",
" <td>248.0625</td>\n",
" <td>173.4075</td>\n",
" <td>121.2201</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>11.27</td>\n",
" <td>8.89</td>\n",
" <td>127.0129</td>\n",
" <td>100.1903</td>\n",
" <td>79.0321</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" NA_Sales EU_Sales NA_Sales^2 NA_Sales*EU_Sales EU_Sales^2\n",
"0 41.49 29.02 1721.4201 1204.0398 842.1604\n",
"1 29.08 3.58 845.6464 104.1064 12.8164\n",
"2 15.85 12.88 251.2225 204.1480 165.8944\n",
"3 15.75 11.01 248.0625 173.4075 121.2201\n",
"4 11.27 8.89 127.0129 100.1903 79.0321"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Wrap the expanded array in a DataFrame; the labels follow the column order of the\n",
"# array printed above (originals, then NA^2, NA*EU, EU^2).\n",
"intr_features = pd.DataFrame(\n",
"    data=res,\n",
"    columns=['NA_Sales', 'EU_Sales', 'NA_Sales^2', 'NA_Sales*EU_Sales', 'EU_Sales^2'],\n",
")\n",
"intr_features.head()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Rank</th>\n",
" <th>Name</th>\n",
" <th>Platform</th>\n",
" <th>Year</th>\n",
" <th>Genre</th>\n",
" <th>Publisher</th>\n",
" <th>NA_Sales</th>\n",
" <th>EU_Sales</th>\n",
" <th>JP_Sales</th>\n",
" <th>Other_Sales</th>\n",
" <th>Global_Sales</th>\n",
" <th>GenreLabel</th>\n",
" <th>GenreMap</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Wii Sports</td>\n",
" <td>Wii</td>\n",
" <td>2006.0</td>\n",
" <td>Sports</td>\n",
" <td>Nintendo</td>\n",
" <td>41.49</td>\n",
" <td>29.02</td>\n",
" <td>3.77</td>\n",
" <td>8.46</td>\n",
" <td>82.74</td>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Super Mario Bros.</td>\n",
" <td>NES</td>\n",
" <td>1985.0</td>\n",
" <td>Platform</td>\n",
" <td>Nintendo</td>\n",
" <td>29.08</td>\n",
" <td>3.58</td>\n",
" <td>6.81</td>\n",
" <td>0.77</td>\n",
" <td>40.24</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Mario Kart Wii</td>\n",
" <td>Wii</td>\n",
" <td>2008.0</td>\n",
" <td>Racing</td>\n",
" <td>Nintendo</td>\n",
" <td>15.85</td>\n",
" <td>12.88</td>\n",
" <td>3.79</td>\n",
" <td>3.31</td>\n",
" <td>35.82</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Wii Sports Resort</td>\n",
" <td>Wii</td>\n",
" <td>2009.0</td>\n",
" <td>Sports</td>\n",
" <td>Nintendo</td>\n",
" <td>15.75</td>\n",
" <td>11.01</td>\n",
" <td>3.28</td>\n",
" <td>2.96</td>\n",
" <td>33.00</td>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Pokemon Red/Pokemon Blue</td>\n",
" <td>GB</td>\n",
" <td>1996.0</td>\n",
" <td>Role-Playing</td>\n",
" <td>Nintendo</td>\n",
" <td>11.27</td>\n",
" <td>8.89</td>\n",
" <td>10.22</td>\n",
" <td>1.00</td>\n",
" <td>31.37</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Rank Name Platform Year Genre Publisher \\\n",
"0 1 Wii Sports Wii 2006.0 Sports Nintendo \n",
"1 2 Super Mario Bros. NES 1985.0 Platform Nintendo \n",
"2 3 Mario Kart Wii Wii 2008.0 Racing Nintendo \n",
"3 4 Wii Sports Resort Wii 2009.0 Sports Nintendo \n",
"4 5 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo \n",
"\n",
" NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales GenreLabel \\\n",
"0 41.49 29.02 3.77 8.46 82.74 10 \n",
"1 29.08 3.58 6.81 0.77 40.24 4 \n",
"2 15.85 12.88 3.79 3.31 35.82 6 \n",
"3 15.75 11.01 3.28 2.96 33.00 10 \n",
"4 11.27 8.89 10.22 1.00 31.37 7 \n",
"\n",
" GenreMap \n",
"0 10 \n",
"1 4 \n",
"2 6 \n",
"3 10 \n",
"4 7 "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Re-display the source frame; the polynomial terms were built in a separate frame.\n",
"vg_df.head(n=5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Binning 特征\n",
"一般用来处理年龄"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Year</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Wii Sports</td>\n",
" <td>2006.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Super Mario Bros.</td>\n",
" <td>1985.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Mario Kart Wii</td>\n",
" <td>2008.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Wii Sports Resort</td>\n",
" <td>2009.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Pokemon Red/Pokemon Blue</td>\n",
" <td>1996.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Year\n",
"0 Wii Sports 2006.0\n",
"1 Super Mario Bros. 1985.0\n",
"2 Mario Kart Wii 2008.0\n",
"3 Wii Sports Resort 2009.0\n",
"4 Pokemon Red/Pokemon Blue 1996.0"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Frame used for the binning example: the Year column is the value that gets binned.\n",
"# Take an explicit copy so that adding a column to bin_df later writes to an independent\n",
"# frame instead of a slice of vg_df (which triggers SettingWithCopyWarning).\n",
"bin_df = vg_df[['Name', 'Year']].copy()\n",
"bin_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0, 0.5, 'Frequency')"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAEZCAYAAACAZ8KHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3de5xdVX338c+XJNwSIMFASi4SFKogKkK4qmUA5aYQ+lQUSyUgPmihrbaooIKKiqKlgrZVpBDlHlMUiQpiBKY+FoOA3ALhEiCQISkRMglJJEDw9/yx1ml2hpnJrJlzmWS+79frvM7ea99+Z519zu+stffZWxGBmZlZX23S6gDMzGzD4sRhZmZFnDjMzKyIE4eZmRVx4jAzsyJOHGZmVsSJw+pOUrukj7Q6jr6SFJJ27sN8J0r6TT+30Sapoz/LFmxjgaR3NXIbZuDEsdHKXyIvSFohaZmk2yR9TNKQes8lbSXpm7k+Vkl6StK1kvZpdWzdkfSO/F4tl7RU0n9L2ruF8bwqGQ0kgVbW0fBEao0zpL5EhqCjImIrYEfgPOAM4NLWhtQYkoZ3U7YZcAvwZuC9wNbArsAM4MimBtgHkrYGfgb8K7AtMAE4B3ixlXFtiCQNa3UMGzMnjiEgIpZHxCzgA8A0SbtD+mKVdH7+Ff6MpIskbZGnzZP03to6JA2X9KykPfP4fvmX8TJJ90pq627bkjaRdJakJyUtkXS5pG3ytMm5m+gUSYskLZZ0epdlz5T0mKTnJM2UtG2XZU+W9BQpQXT1IWAicExEzI2IVyJiVURcGxFf7CHebXKMf8gxn9WllSZJ/5pbBA9JOqQy4aRcbyskPS7po+t9c9b15wARcU2O9YWI+GVE3JfX/3pJt+S6eFbSVZJG9/A6equ7zSVdmcuXSbpD0rjCWKvb2jV3Ty6T9ICkoyvTjpT0YK6TpyV9UtJI4EZgvKSV+TE+748X5n1hUR7erLKuT+d9ZJGkj6jSxSjpB5K+K+kGSauAgyS9R9Ldkp6XtFDSFyvrqu0/J+VpnUot8r0l3Zdfy7/1t042ehHhx0b4ABYA7+qm/Cngb/PwhcAs0q/brYCfAl/L0z4PXFVZ7j3AQ3l4AvAc6Vf7JsC78/h2eXo78JE8/GFgPvA6YBTwY+CKPG0yEMA1wEhSy+APtbiBTwBzSF/+mwHfA67psuzledktunmtM4Af9KGuAtg5D18OXJ/rYzLwCHBynnYisAb4R2AEKREvB7at1NHrAQEHAn8E9szT2oCO9cSxda7Hy4AjgDFdpu+c63ozYDvg18CF3b3n66m7j+b3ektgGLAXsHVf96NcD7/JwyPy+/tZYFPgYGAF8IY8fTHwzjw8prf6AL6UY94+v77bgC/naYcD/wO8Kcd9RZf37Qf5vXg7aZ/cPG/jzXn8LcAzpB8R1f3nojzvocBq4Cd5+xOAJcCBrf4sD8ZHywPwo0FvbM+JYw7wufzltgp4fWXa/sATeXjn/AWwZR6/Cvh8Hj6D/OVfWfYmYFoebmdt4rgZOLUy3xuAl4HhlQ/vGyvTvwFcmofnAYdUpu3QzbKv66UOfgWcVxnfA1gGPA88XCmP/HqHkbqFdqtM+yjQnodPBBYBqkz/HfChHrb/E+DjebiN9SSOPN+u+Uuwg5SkZgHjepj3GODu7t7z9dTdh0lfym/p4360Mtdb7fFH1iaOd5K+0DepLHMN8MU8/FSuw627rPdV9QE8BhxZGT8MWJCHp5N/1FT2z66J4/L1vJYLgQvycG3/mVCZ/hzwgcr4j4BPtOozPJgf7qoaeiYAS0m/6LYE7srN8mXAL3I5ETGf9OVzlKQtgaOBq/M6dgSOrS2Xl30H6cupq/HAk5XxJ0lfXtWukYVdpo+vbOe6yjbmAa/0smxXz1Vjioh7ImI08H9Iv8K7Gkv61dw13gmV8acjf6t0jVfSEZLmKB3UXkZqkY3tJb5XiYh5EX
FiREwEds/rvjCvf3tJM3KXz/PAlb2sv7e6u4KU6Gfkbp9vSBrRS1jHRMTo2gM4tTJtPLAwIv7UpU5qdfZXpHp4UtJ/Sdq/l+10t6+Mr0yrvtfdve/rlEnaV9KtudtxOfAxXl1fz1SGX+hmfFQv8Q5ZThxDiNLZOROA3wDPkj4Yb6p8KWwTEdUPyjXAB4GpwIM5mUD6gF5R/TKJiJERcV43m11E+hKreS3pl3T1Azqpy/RFle0c0WU7m0fE05X5e7u8883AoblPvS+eJf0q7xpvdXsTJKlrvLkv/kfA+aQWwmjgBlLLrl8i4iHSL+ndc9HXSK/3LRGxNfA3vay/x7qLiJcj4pyI2A04gHTiwAn9DHMRMKnLcaD/rbOIuCMippK6f34CzKy9vB7W1bXua/vCYlK3W011n6npus6rSS22SRGxDalbqt/vh63lxDEESNpa6UD3DODKiLg//0L8D+ACSdvn+SZIOqyy6AxS3+/fsra1AemX7lGSDpM0LB9sbZNU/WDXXAP8o6SdJI0Cvgr8MCLWVOY5W9KWkt4EnAT8MJdfBJwraccc33aSpha89MtJXzjXSdq9FiswpbuZI+IV0hfbuUqn8e4I/FN+vTXbA/8gaYSkY0ldSzeQWiqbkY7RrJF0BKnu+kzSGyWdXqtHSZNIiXtOnmUrcreRpAnAp3pZXY91J+kgSW9WOvPoeVKyfKUk1orbSV2en8510gYcRWrNbCrpeEnbRMTLeVu17TwDvEb5RInsGuCsHOtY0nG2Wt3PBE5SOhC/ZZ62PlsBSyNitdLp13/dz9doXThxbNx+KmkF6dfn54Bvkr6Ya84gHdick7s+fkU6BgFARCwGfkv6VfrDSvlCUivks6QvyoWkL7Hu9qfppK6RXwNPkA5A/n2Xef4rx3EzcH5E/DKXf4v0i/GX+XXMAfbt64uPiNXAQcCDwM/JxzaAvYH397DY35O+CB8ntcyuzq+h5nZgF1Lr5FzgfRHxXESsAP6B9AXXSfqSmtXXWLMVpNd3ez4zaA4wF6idaXYOsCfpIPDPSSca9KS3uvsz4FpSfcwj1f+V3a1kfSLiJVI35hGkOvkOcEJuLUE6s21B3r8+Rmol1VpT1wCP5+608cBXgDuB+4D7gd/nMiLiRuDbwK2kfeW3ef29nap8KvCl/Po/z9rWjg2Q1u2uNWseSZNJyWRElxaIWa8k7UpKqpt532k+tzjMbIMg6S9z99cY4OvAT500WsOJw6yJJL1Wa//01vXx2lbHN8h9lNQ1+hjpWMnftjacoctdVWZmVsQtDjMzK/KqC8NtbMaOHRuTJ0/u9/KrVq1i5Mi+/g2geRxXGcdVxnGV2Rjjuuuuu56NiO26ndjqv643+rHXXnvFQNx6660DWr5RHFcZx1XGcZXZGOMC7gxfcsTMzOrBicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlZko7/kiJkNLjNvm9vvZUesWt3v5d9/wO7rn8n6xC0OMzMr4sRhZmZFnDjMzKyIE4eZmRVx4jAzsyJOHGZmVqRpiUPSAkn3S7pH0p25bFtJsyU9mp/H5HJJ+rak+ZLuk7RnZT3T8vyPSprWrPjNzCxpdovjoIjYIyKm5PEzgZsjYhfg5jwOcASwS36cAnwXUqIBvgDsC+wDfKGWbMzMrDla3VU1FbgsD18GHFMpvzzfwXAOMFrSDsBhwOyIWBoRncBs4PBmB21mNpQp3Vq2CRuSngA6gQC+FxEXS1oWEaMr83RGxBhJPwPOi4jf5PKbgTOANmDziPhKLj8beCEizu+yrVNILRXGjRu314wZM/od98qVKxk1alS/l28Ux1XGcZVpZFydq1b3e1mteYkYvmm/lh0zcvN+b3d9Nsb38aCDDrqr0ju0jmZecuTtEbFI0vbAbEkP9TKvuimLXsrXLYi4GLgYYMqUKdHW1taPcJP29nYGsnyjOK4yjqtMI+Ma0CVHOjt4eczEfi
3b1sBLjgy197FpXVURsSg/LwGuIx2jeCZ3QZGfl+TZO4BJlcUnAot6KTczsyZpSuKQNFLSVrVh4FBgLjALqJ0ZNQ24Pg/PAk7IZ1ftByyPiMXATcChksbkg+KH5jIzM2uSZnVVjQOuk1Tb5tUR8QtJdwAzJZ0MPAUcm+e/ATgSmA/8ETgJICKWSvoycEee70sRsbRJr8HMzGhS4oiIx4G3dlP+HHBIN+UBnNbDuqYD0+sdo5mZ9U2rT8c1M7MNjBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWZGmJg5JwyTdLelneXwnSbdLelTSDyVtmss3y+Pz8/TJlXV8Jpc/LOmwZsZvZmbNb3F8HJhXGf86cEFE7AJ0Aifn8pOBzojYGbggz4ek3YDjgDcBhwPfkTSsSbGbmRlNTBySJgLvAS7J4wIOBq7Ns1wGHJOHp+Zx8vRD8vxTgRkR8WJEPAHMB/ZpziswMzMARURzNiRdC3wN2Ar4JHAiMCe3KpA0CbgxInaXNBc4PCI68rTHgH2BL+Zlrszll+Zlru2yrVOAUwDGjRu314wZM/od98qVKxk1alS/l28Ux1XGcZVpZFydq1b3e1mteYkYvmm/lh0zcvN+b3d9Nsb38aCDDrorIqZ0N234gKLqI0nvBZZExF2S2mrF3cwa65nW2zJrCyIuBi4GmDJlSrS1tXWdpc/a29sZyPKN4rjKOK4yjYxr5m1z+73siM4OXh4zsV/Lth2we7+3uz5D7X1sSuIA3g4cLelIYHNga+BCYLSk4RGxBpgILMrzdwCTgA5Jw4FtgKWV8prqMmZm1gRNOcYREZ+JiIkRMZl0cPuWiDgeuBV4X55tGnB9Hp6Vx8nTb4nUpzYLOC6fdbUTsAvwu2a8BjMzS5rV4ujJGcAMSV8B7gYuzeWXAldImk9qaRwHEBEPSJoJPAisAU6LiFeaH7bZhm193UUjVq0eUJeSbdyanjgioh1oz8OP081ZURGxGji2h+XPBc5tXIRmZtYb/3PczMyKOHGYmVkRJw4zMyvixGFmZkWcOMzMrIgTh5mZFXHiMDOzIk4cZmZWxInDzMyKOHGYmVkRJw4zMyvixGFmZkWcOMzMrIgTh5mZFXHiMDOzIn1OHJL+QdLYRgZjZmaDX0mL413AAkk/k/QBSZs1KigzMxu8+pw4IuJoYEfgRuATwP9IukTSXzQqODMzG3yKjnFExHMR8e8RsT9wILA3cKukBZI+J2lUQ6I0M7NBo/jguKRDJH2fdN/wZ4ATgA8BbyO1RszMbCM2vK8zSjofOA5YDlwOnBURT1emzwE66x6hmZkNKn1OHMDmwF9GxB3dTYyIlyVNqU9YZmY2WJUkjq8Bf6wWSBoDbBERiwAi4qE6xmZmZoNQyTGOnwATu5RNBK6rXzhmZjbYlSSON0TE/dWCPP7G+oZkZmaDWUniWCJp52pBHn+uviGZmdlgVpI4pgM/kvReSbtJOgq4FrikMaGZmdlgVHJw/DzgZeB8YBKwkJQ0vtmAuMzMbJDqc+KIiD8B/5wfZmY2RJW0OJD0BuCtwDqXFomI6fUMyszMBq+Sy6p/FrgXOJ10iZHa42/6sOzmkn4n6V5JD0g6J5fvJOl2SY9K+qGkTXP5Znl8fp4+ubKuz+TyhyUdVvJizcxs4EpaHJ8A9omI+/qxnReBgyNipaQRwG8k3Qj8E3BBRMyQdBFwMvDd/NwZETtLOg74OvABSbuRLnvyJmA88CtJfx4Rr/QjJjMz64eSs6peAPr1z/BIVubREfkRwMGkM7MALgOOycNT8zh5+iGSlMtnRMSLEfEEMB/Ypz
8xmZlZ/5QkjrOBf5W0g6RNqo++LCxpmKR7gCXAbOAxYFlErMmzdAAT8vAE0llb5OnLgddUy7tZxszMmqCkq+oH+fkjlTKRWg7D1rdw7k7aQ9Jo0mVKdu1utsp6u5vWU/k6JJ0CnAIwbtw42tvb1xdej1auXDmg5RvFcZVxXOsasWp1r9O15iVGdHY0KZq+G0hc7e3P1jmatYba/lWSOHaqxwYjYpmkdmA/YLSk4blVMRFYlGfrIP1XpEPScGAbYGmlvKa6THUbFwMXA0yZMiXa2tr6HW97ezsDWb5RHFcZx7WumbfN7XX6iM4OXh7T9dJ0rTeQuNoO2L3O0aw11PavklvHPhkRT5K6il6qjeeyXknaLrc0kLQF6f7l84Bbgffl2aYB1+fhWXmcPP2WiIhcflw+62onYBfgd319DWZmNnAlN3IaDXyH9EX+MjBS0tGkM63OWs/iOwCXSRpGSlYzI+Jnkh4EZkj6CnA3cGme/1LgCknzSS2N4wAi4gFJM4EHgTXAaT6jysysuUq6qi4i3eFvR9IXN8BvgX8Bek0c+RTet3VT/jjdnBUVEauBY3tY17nAuQVxm5lZHZUkjkOA8flOfwEQEX+QtH1jQjMzs8Go5HTc5cDYaoGk1wKL6xqRmZkNaiWJ4xLSZdUPAjaRtD/pT3oXNSQyMzMblEq6qr4OrAb+nfTP7+nA94BvNSAuMzMbpEouqx7AhflhZmZDVMnpuAf3NC0ibqlPOGZmNtiVdFVd2mV8O2BT0r+5X1e3iMzMbFAr6apa55Ij+c98ZwEr6h2UmZkNXiVnVa0j/2P7XODT9QvHzMwGu34njuzdwJ/qEYiZmW0YSg6OL2TdS5hvCWwOnFrvoMzMbPAqOTje9d7iq4BHIuL5OsZjZmaDXMnB8f9qZCBmZrZhKOmquoJu7rbXVUScMKCIzMxsUCs5OL4MOIZ0m9iOvOzUXP5Y5WFmZhuxkmMcfw68JyL+X61A0juAsyPisLpHZmZmg1JJi2M/YE6XstuB/esXjpmZDXYlLY67ga9K+nxEvJDvHX4OcE9jQjPb+HWuWs3M2+a2OgyzIiWJ40TgamC5pE5gDHAncHwD4jIzq6tGJugRvfwAeP8Buzdsu61ScjruAuAASZOA8cDiiHiqUYGZmdngVHTJEUmvAdqAAyPiKUnjJU1sSGRmZjYo9TlxSDoQeJjUNXV2Lt4F+G4D4jIzs0GqpMVxIfCBiDgcWJPLbgf2qXtUZmY2aJUkjskRcXMerv2D/CXKDrCbmdkGriRxPCip6x/93gXcX8d4zMxskCtpLZwO/EzSz4EtJH0POIp02REzMxsi+tziiIg5wFuAB4DpwBPAPhFxR4NiMzOzQahPLY58f/GbgcMi4huNDcnMzAazPrU48v3Fd+rr/GZmtvEqSQTnAN+VtKOkYZI2qT0aFZyZmQ0+JQfHL8nPJ7D2dFzl4WH1DMrMzAav9bYWJP1ZHtyp8nhdftSG17eOSZJulTRP0gOSPp7Lt5U0W9Kj+XlMLpekb0uaL+k+SXtW1jUtz/+opGnFr9jMzAakL91MjwBExJMR8SRwQW24UrY+a4DTI2JX0n09TpO0G3AmcHNE7EI6+H5mnv8I0uVMdgFOIV/WRNK2wBeAfUn/WP9CLdmYmVlz9CVxqMt4W+lGImJxRPw+D68A5gETSP8BuSzPdhnp1rTk8ssjmQOMlrQDcBgwOyKWRkQnMBs4vDQeMzPrv74c44j1z9J3kiYDbyNd52pcRCyGlFwkbZ9nmwAsrCzWkct6Ku+6jVNILRXGjRtHe3t7v+NduXLlgJZvFMdVZrDGpTUvMaKzo9VhvIrjKtNbXO3tzzY5mrUatd/3JXEMl3QQa1seXceJiFv6sjFJo4AfAZ+IiOelro2ZtbN2Uxa9lK9bEHExcDHAlClToq2trS/hdau9vZ2BLN8ojqvMYI3rup//gpfHDL47E4zo7HBcBXqLq62FN3Jq1H7fl8SxhPRP8ZrnuowHfT
tAPoKUNK6KiB/n4mck7ZBbGzvkbUFqSUyqLD4RWJTL27qUt/fhNZiZWZ2s9xhHREyOiJ16efQlaQi4FJgXEd+sTJoF1M6MmgZcXyk/IZ9dtR+wPHdp3QQcKmlMPih+aC4zM7MmadYl0d8OfAi4X9I9ueyzwHnATEknA08Bx+ZpNwBHAvOBPwInAUTEUklfBmrXx/pSRCxtzkswMzNoUuKIiN/Q/fEJgEO6mT+A03pY13TW7SozM7Mm8uVCzMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzKzK81QGYtdrM2+a2bNsjWrZls/5zi8PMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVaUrikDRd0hJJcytl20qaLenR/Dwml0vStyXNl3SfpD0ry0zL8z8qaVozYjczs3U1q8XxA+DwLmVnAjdHxC7AzXkc4Ahgl/w4BfgupEQDfAHYF9gH+EIt2ZiZWfM0JXFExK+BpV2KpwKX5eHLgGMq5ZdHMgcYLWkH4DBgdkQsjYhOYDavTkZmZtZgrfzn+LiIWAwQEYslbZ/LJwALK/N15LKeyl9F0imk1grjxo2jvb2930GuXLlyQMs3iuMq01tcI1atbm4wFVrzEiM6O1q2/Z44rjK9xdXe/myTo1mrUZ/HwXjJEXVTFr2Uv7ow4mLgYoApU6ZEW1tbv4Npb29nIMs3iuMq01tcLb3kSGcHL4+Z2LLt98RxlektrrYDdm9yNGs16vPYyrOqnsldUOTnJbm8A5hUmW8isKiXcjMza6JWJo5ZQO3MqGnA9ZXyE/LZVfsBy3OX1k3AoZLG5IPih+YyMzNroqZ0VUm6BmgDxkrqIJ0ddR4wU9LJwFPAsXn2G4AjgfnAH4GTACJiqaQvA3fk+b4UEV0PuJuZWYM1JXFExAd7mHRIN/MGcFoP65kOTK9jaGZmVsj/HDczsyJOHGZmVsSJw8zMijhxmJlZEScOMzMr4sRhZmZFnDjMzKyIE4eZmRVx4jAzsyJOHGZmVsSJw8zMijhxmJlZEScOMzMr4sRhZmZFnDjMzKyIE4eZmRVpyo2czPpi5m1zG7buEatWN3T9ZkOJWxxmZlbEicPMzIo4cZiZWREf4zAza6BWHlvbvkHrdYvDzMyKOHGYmVkRJw4zMyvixGFmZkWcOMzMrIgTh5mZFXHiMDOzIk4cZmZWxInDzMyK+J/j69HZoquqvv+A3Zu+TRj4v1x9FVqzjd8GmTgkHQ58CxgGXBIR57U4pLpb35evv6DNrFU2uK4qScOAfweOAHYDPihpt9ZGZWY2dGxwiQPYB5gfEY9HxEvADGBqi2MyMxsyNsSuqgnAwsp4B7BvdQZJpwCn5NGVkh4ewPbGAs8OYPlGcVxlHFcZx1VmY4xrx54mbIiJQ92UxTojERcDF9dlY9KdETGlHuuqJ8dVxnGVcVxlhlpcG2JXVQcwqTI+EVjUoljMzIacDTFx3AHsImknSZsCxwGzWhyTmdmQscF1VUXEGkl/B9xEOh13ekQ80MBN1qXLqwEcVxnHVcZxlRlScSki1j+XmZlZtiF2VZmZWQs5cZiZWZEhlzgkTZe0RNLcStlbJf1W0v2Sfipp61w+QtJluXyepM9Uljlc0sOS5ks6cxDFtSCX3yPpzibHtamk7+fyeyW1VZbZK5fPl/RtSd2dVt2KuNrz+3hPfmw/wLgmSbo1vy8PSPp4Lt9W0mxJj+bnMblcuT7mS7pP0p6VdU3L8z8qadogiuuVSn0N6MSUfsT1xvwevyjpk13WVbfPZJ3jqttnsh9xHZ/fv/sk3SbprZV19b++ImJIPYC/APYE5lbK7gAOzMMfBr6ch/8amJGHtwQWAJNJB+UfA14HbA
rcC+zW6rjy+AJgbIvq6zTg+3l4e+AuYJM8/jtgf9L/cG4EjhgkcbUDU+pYXzsAe+bhrYBHSJfG+QZwZi4/E/h6Hj4y14eA/YDbc/m2wOP5eUweHtPquPK0lS2sr+2BvYFzgU9W1lPXz2S94srTFlCnz2Q/4jqgtt+QLtNU278GVF9DrsUREb8GlnYpfgPw6zw8G/ir2uzASEnDgS2Al4DnacBlT+oUV90VxrUbcHNebgmwDJgiaQdg64j4baS99nLgmFbHNZDt9xLX4oj4fR5eAcwjXe1gKnBZnu0y1r7+qcDlkcwBRuf6OgyYHRFLI6Izv57DB0FcdVUaV0QsiYg7gJe7rKqun8k6xlVX/Yjrtrz/AMwh/e8NBlhfQy5x9GAucHQePpa1fzC8FlgFLAaeAs6PiKV0f9mTCYMgLkhJ5ZeS7lK69Eoj9BTXvcBUScMl7QTsladNINVRTbPrq6e4ar6fuxHOlgbWhVYlaTLwNuB2YFxELIb04Sf9QoWe96WG7WMDjAtgc0l3SpojaUA/APoRV09aXV+9achnsh9xnUxqRcIA68uJI/kwcJqku0jNv5dy+T7AK8B4YCfgdEmvow+XPWlRXABvj4g9Sc3S0yT9RRPjmk7aAe8ELgRuA9bQ+vrqKS6A4yPizcA78+ND9QhE0ijgR8AnIqK31mBPddOQOqtDXACvjXQZi78GLpT0+ibG1eMquilrZn31pu6fydK4JB1EShxn1Iq6ma3P9eXEAUTEQxFxaETsBVxD6vuD9MH4RUS8nLs4/pvUxdGUy570Iy4iYlF+XgJcR0oyTYkrItZExD9GxB4RMRUYDTxKqq+JlVU0tb56iYuIeDo/rwCupg71JWkE6UN9VUT8OBc/U+vqyc9LcnlP+1Ld97E6xVXdxx4nHSN6WxPj6kmr66tH9f5MlsYl6S3AJcDUiHguFw+ovpw4AOUzaSRtApwFXJQnPQUcnM8wGUk6SPgQTbrsSWlckkZK2iovMxI4lNR905S4JG2Zt4ukdwNrIuLB3HReIWm/3BV0AnB9q+PKXVdjc/kI4L0MsL7y67sUmBcR36xMmgXUzoyaxtrXPws4Ib+X+wHLc33dBBwqaUw+Q+bQXNbSuHI8m+V1jgXeDs2iZSQAAATsSURBVDzYxLh6UtfPZL3iqvdnsjQuSa8Ffgx8KCIeqcw/sPrq61H0jeVB+iW6mHQQq4PUfPs46eyER4DzWPuP+lHAfwIPkD4cn6qs58g8/2PA5wZDXKQzJO7NjwdaENdk4GHSAbtfATtW1jOF9IF5DPi32jKtjAsYSTrD6r5cX98Chg0wrneQmvz3Affkx5HAa0gH6B/Nz9vm+UW6MdljwP1UzvAidb3Nz4+TBkNcpLN07s/72P3AyU2O68/y+/086SSHDtKJF1DHz2S94qLOn8l+xHUJ0FmZ987KuvpdX77kiJmZFXFXlZmZFXHiMDOzIk4cZmZWxInDzMyKOHGYmVkRJw4zMyvixGGWSfqBpK/0cd6QtHM/t7NA0rv6s2wf1/9FSVc2av1mThw2pEg6TtLtklYp3c/jdkmn1vPihvUgabTSPUf+R9IKSY9IOmP9S5o1nhOHDRmSTif9O/yfSf/0HQd8jHTZjE1bGFp3LiBdIWBXYBvSVX8f63UJsyZx4rAhQdI2wJeAUyPi2ohYEcndEXF8RLzYzTL/V+nuaEslzZI0vsssR0p6XNKzkv45XyMLSa+XdIuk5/K0qySNLgx5b+DqiOiMiD9FuoDjtZXYviVpoaTnlS7X/c5eXvt+Snd/W6ZX3wHxxPwaVkh6QtLxhXHaEOTEYUPF/sBm9PHiipIOBr4GvJ9017UnSTe7qfpL0jW49iTdBOfDtcXzsuNJLYZJwBcL450DnCvpJEm7dDP9DmAP0h0Crwb+U9Lm3byOCcDPga/keT8J/EjSdvmie98m3Y1xK9J1qO4pjNOGICcOGyrGAs9GRO0+HFR+hb+gV98j4XhgekT8PrdGPgPsr3
TznJqvR7pD31Ok+3x8ECAi5kfE7Ih4MSL+AHwTOLAw3r8HrgL+Dngwt3yOqE2MiCsj4rlIl4z/F1JSfEM36/kb4IaIuCG3XGaT7ktyZJ7+J2B3SVtEurvcA4Vx2hDkxGFDxXPAWKXb7QIQEQdExOg8retnYTyplVGbd2Wer3qXtOod1J7MyyBpe0kzJD0t6XngSlLi6rOIeCEivhrp3iKvAWaSWhXb5m2cLmmepOWSlpGOg3S3jR2BY3OCXJbnfQewQ0SsAj5AOs6zWNLPJb2xJE4bmpw4bKj4LfAifb+v8iLSly7wv/dSeA3wdGWe6o1wXsvaG+F8jXTp67dExNakX/39Pmsr0h3evkq6DPxO+XjGGaRutDE5+S3vYRsLgSsiYnTlMTIizsvrviki3k3qjnsI+I/+xmlDhxOHDQkRsQw4B/iOpPdJGiVpE0l7kL6Qu7oaOEnSHvnGRV8Fbo+IBZV5PpVvbDSJdC+QH+byrYCVwLJ8jOFTpfEq3f98b0mb5mMXHyfd5+HhvP41wB+A4ZI+T7r3Q3euBI6SdJikYZI2l9QmaaKkcZKOzknxxRzzK6Wx2tDjxGFDRkR8A/gn4NOkW2s+A3yP9Ov9ti7z3gycTbpF52Lg9aS7pFVdT7oR1D2kA9CX5vJzSAfMl+fyH1MugO8Dz5JaMu8G3pO7zG4CbiTdhOdJYDXrdptVX8dCUivrs6REs5CUyDbJj9Pz+peSjsOc2o9YbYjxjZzMzKyIWxxmZlbEicOsRSTdKGllN4/Ptjo2s964q8rMzIq4xWFmZkWcOMzMrIgTh5mZFXHiMDOzIv8fEgSYdA1YC3gAAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import matplotlib as mpl\n",
"import scipy.stats as spstats\n",
"\n",
"# Histogram of the Year column, used to eyeball sensible bin boundaries.\n",
"fig, ax = plt.subplots()\n",
"# Draw on the axes created above explicitly rather than relying on the current-axes state.\n",
"bin_df['Year'].hist(ax=ax, color='#A9C5D3')\n",
"# Title and x-label name the column that is actually plotted (Year).\n",
"ax.set_title('Year Histogram', fontsize=12)\n",
"ax.set_xlabel('Year', fontsize=12)\n",
"ax.set_ylabel('Frequency', fontsize=12);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"这样区间就出来了,我们可以分成多个区间,如 1980-1985 是一个区间,1986-1990 是一个区间。"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Year</th>\n",
" <th>Year_bin</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Wii Sports</td>\n",
" <td>2006.0</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Super Mario Bros.</td>\n",
" <td>1985.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Mario Kart Wii</td>\n",
" <td>2008.0</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Wii Sports Resort</td>\n",
" <td>2009.0</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Pokemon Red/Pokemon Blue</td>\n",
" <td>1996.0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Tetris</td>\n",
" <td>1989.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>New Super Mario Bros.</td>\n",
" <td>2006.0</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Wii Play</td>\n",
" <td>2006.0</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>New Super Mario Bros. Wii</td>\n",
" <td>2009.0</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Duck Hunt</td>\n",
" <td>1984.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Year Year_bin\n",
"0 Wii Sports 2006.0 5\n",
"1 Super Mario Bros. 1985.0 1\n",
"2 Mario Kart Wii 2008.0 6\n",
"3 Wii Sports Resort 2009.0 6\n",
"4 Pokemon Red/Pokemon Blue 1996.0 3\n",
"5 Tetris 1989.0 2\n",
"6 New Super Mario Bros. 2006.0 5\n",
"7 Wii Play 2006.0 5\n",
"8 New Super Mario Bros. Wii 2009.0 6\n",
"9 Duck Hunt 1984.0 0"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gle = LabelEncoder()  # maps each distinct interval label to an integer code\n",
"# Cut Year into 9 equal-width intervals (explicit bin edges may be passed instead of a\n",
"# count), stringify the intervals, and let LabelEncoder turn them into integer codes.\n",
"# The codes are 0-based (0-8 for the 9 intervals), not 1-9.\n",
"# NOTE(review): astype(str) presumably turns a missing Year into the string 'nan', which\n",
"# would then receive its own extra code - confirm whether that is intended.\n",
"bin_year = gle.fit_transform(pd.cut(bin_df['Year'], 9).astype(str))\n",
"# assign() builds a new frame instead of writing into a slice of vg_df, so no\n",
"# SettingWithCopyWarning is raised (the notebook-wide warnings filter would hide it).\n",
"bin_df = bin_df.assign(Year_bin=bin_year)\n",
"bin_df.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 对数变换\n",
"\n",
"经常有这样的假设:数据的分布是正态分布。如线性回归的时候误差项要满足正态分布,而当数据不满足的时候,则需要把数据变换成正态分布"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>NA_Sales</th>\n",
" <th>NA_Sales_log</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Wii Sports</td>\n",
" <td>41.49</td>\n",
" <td>3.749269</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Super Mario Bros.</td>\n",
" <td>29.08</td>\n",
" <td>3.403860</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Mario Kart Wii</td>\n",
" <td>15.85</td>\n",
" <td>2.824351</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Wii Sports Resort</td>\n",
" <td>15.75</td>\n",
" <td>2.818398</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Pokemon Red/Pokemon Blue</td>\n",
" <td>11.27</td>\n",
" <td>2.507157</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name NA_Sales NA_Sales_log\n",
"0 Wii Sports 41.49 3.749269\n",
"1 Super Mario Bros. 29.08 3.403860\n",
"2 Mario Kart Wii 15.85 2.824351\n",
"3 Wii Sports Resort 15.75 2.818398\n",
"4 Pokemon Red/Pokemon Blue 11.27 2.507157"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Copy the two columns so the new log column is added to an independent frame rather\n",
"# than to a slice of vg_df.\n",
"df_log = vg_df[['Name', 'NA_Sales']].copy()\n",
"# log1p(x) computes log(1 + x) accurately for small x; the +1 keeps zero sales finite.\n",
"df_log['NA_Sales_log'] = np.log1p(df_log['NA_Sales'])\n",
"df_log.head()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x18ec49c7b38>"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 2 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Side-by-side comparison: log-transformed NA_Sales on the left, raw NA_Sales on the right.\n",
"# The log transform compresses the large values, so the left panel is the less skewed one.\n",
"# Both axes are created in one call so no extra full-size axes is left under the panels.\n",
"fig, ax = plt.subplots(1, 2)\n",
"df_log['NA_Sales_log'].hist(ax=ax[0], color='#A9C5D3')\n",
"ax[0].set_title('log(1 + NA_Sales)')\n",
"\n",
"df_log['NA_Sales'].hist(ax=ax[1], color='#A9C5D3')\n",
"ax[1].set_title('NA_Sales');"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"上面是手动做的对数变换;另外还有模块化的 Box-Cox 变换(如 scipy.stats.boxcox),这里暂不做示例。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 日期相关特征\n",
"将时间特征转换成可以应用的数据"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"import datetime\n",
"from dateutil.parser import parse\n",
"import pytz"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2020-12-16 10:30:00.360000+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2019-04-16 12:15:00.250000+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2018-10-16 08:30:00.750000+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2019-01-16 23:30:00.255500+00:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Time\n",
"0 2020-12-16 10:30:00.360000+00:00\n",
"1 2019-04-16 12:15:00.250000+00:00\n",
"2 2018-10-16 08:30:00.750000+00:00\n",
"3 2019-01-16 23:30:00.255500+00:00"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sample timestamp strings, each carrying an explicit +00:00 offset.\n",
"time_stamps = [\n",
"    '2020-12-16 10:30:00.360000+00:00',\n",
"    '2019-04-16 12:15:00.250000+00:00',\n",
"    '2018-10-16 08:30:00.750000+00:00',\n",
"    '2019-01-16 23:30:00.255500+00:00',\n",
"]\n",
"\n",
"# One-column frame holding the raw strings.\n",
"df = pd.DataFrame({'Time': time_stamps})\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([Timestamp('2020-12-16 10:30:00.360000+0000', tz='UTC'),\n",
" Timestamp('2019-04-16 12:15:00.250000+0000', tz='UTC'),\n",
" Timestamp('2018-10-16 08:30:00.750000+0000', tz='UTC'),\n",
" Timestamp('2019-01-16 23:30:00.255500+0000', tz='UTC')],\n",
" dtype=object)"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Parse every raw string into a pandas Timestamp, keep the parsed objects as a new\n",
"# column, and display the resulting array.\n",
"ts_objs = np.array(list(map(pd.Timestamp, df['Time'])))\n",
"df['TS_obj'] = ts_objs\n",
"ts_objs"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Time</th>\n",
" <th>Year</th>\n",
" <th>Month</th>\n",
" <th>day</th>\n",
" <th>DayOfWeek</th>\n",
" <th>WeekDayName</th>\n",
" <th>DayOfYear</th>\n",
" <th>WeekOfYear</th>\n",
" <th>Quarter</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2020-12-16 10:30:00.360000+00:00</td>\n",
" <td>2020</td>\n",
" <td>12</td>\n",
" <td>16</td>\n",
" <td>2</td>\n",
" <td>Wednesday</td>\n",
" <td>351</td>\n",
" <td>51</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2019-04-16 12:15:00.250000+00:00</td>\n",
" <td>2019</td>\n",
" <td>4</td>\n",
" <td>16</td>\n",
" <td>1</td>\n",
" <td>Tuesday</td>\n",
" <td>106</td>\n",
" <td>16</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2018-10-16 08:30:00.750000+00:00</td>\n",
" <td>2018</td>\n",
" <td>10</td>\n",
" <td>16</td>\n",
" <td>1</td>\n",
" <td>Tuesday</td>\n",
" <td>289</td>\n",
" <td>42</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2019-01-16 23:30:00.255500+00:00</td>\n",
" <td>2019</td>\n",
" <td>1</td>\n",
" <td>16</td>\n",
" <td>2</td>\n",
" <td>Wednesday</td>\n",
" <td>16</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Time Year Month day DayOfWeek WeekDayName \\\n",
"0 2020-12-16 10:30:00.360000+00:00 2020 12 16 2 Wednesday \n",
"1 2019-04-16 12:15:00.250000+00:00 2019 4 16 1 Tuesday \n",
"2 2018-10-16 08:30:00.750000+00:00 2018 10 16 1 Tuesday \n",
"3 2019-01-16 23:30:00.255500+00:00 2019 1 16 2 Wednesday \n",
"\n",
" DayOfYear WeekOfYear Quarter \n",
"0 351 51 4 \n",
"1 106 16 2 \n",
"2 289 42 4 \n",
"3 16 3 1 "
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Derive calendar features from each parsed timestamp.\n",
"df['Year'] = df['TS_obj'].apply(lambda d: d.year)\n",
"df['Month'] = df['TS_obj'].apply(lambda d: d.month)\n",
"df['Day'] = df['TS_obj'].apply(lambda d: d.day)\n",
"df['DayOfWeek'] = df['TS_obj'].apply(lambda d: d.dayofweek)  # Monday=0 ... Sunday=6\n",
"# day_name() replaces Timestamp.weekday_name, which was removed in pandas 1.0.\n",
"df['WeekDayName'] = df['TS_obj'].apply(lambda d: d.day_name())\n",
"df['DayOfYear'] = df['TS_obj'].apply(lambda d: d.dayofyear)\n",
"df['WeekOfYear'] = df['TS_obj'].apply(lambda d: d.weekofyear)\n",
"df['Quarter'] = df['TS_obj'].apply(lambda d: d.quarter)\n",
"\n",
"# Select 'Day' exactly as it is created in this cell; a lowercase 'day' key does not exist\n",
"# and raises a KeyError on a fresh run.\n",
"df[['Time','Year','Month','Day','DayOfWeek','WeekDayName','DayOfYear','WeekOfYear','Quarter']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"这样就能从时间数据中获取很多数据,不同场景对不同数据有需求,如外卖则会关注周末和季节等。"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Time</th>\n",
" <th>Hour</th>\n",
" <th>Minute</th>\n",
" <th>Second</th>\n",
" <th>Microsecond</th>\n",
" <th>Utcoffset</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2020-12-16 10:30:00.360000+00:00</td>\n",
" <td>10</td>\n",
" <td>30</td>\n",
" <td>0</td>\n",
" <td>360000</td>\n",
" <td>0 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2019-04-16 12:15:00.250000+00:00</td>\n",
" <td>12</td>\n",
" <td>15</td>\n",
" <td>0</td>\n",
" <td>250000</td>\n",
" <td>0 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2018-10-16 08:30:00.750000+00:00</td>\n",
" <td>8</td>\n",
" <td>30</td>\n",
" <td>0</td>\n",
" <td>750000</td>\n",
" <td>0 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2019-01-16 23:30:00.255500+00:00</td>\n",
" <td>23</td>\n",
" <td>30</td>\n",
" <td>0</td>\n",
" <td>255500</td>\n",
" <td>0 days</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Time Hour Minute Second Microsecond \\\n",
"0 2020-12-16 10:30:00.360000+00:00 10 30 0 360000 \n",
"1 2019-04-16 12:15:00.250000+00:00 12 15 0 250000 \n",
"2 2018-10-16 08:30:00.750000+00:00 8 30 0 750000 \n",
"3 2019-01-16 23:30:00.255500+00:00 23 30 0 255500 \n",
"\n",
" Utcoffset \n",
"0 0 days \n",
"1 0 days \n",
"2 0 days \n",
"3 0 days "
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Clock-level fields: each output column is named after the timestamp attribute\n",
"# it copies, so the attribute name is simply the lower-cased column name.\n",
"for column in ('Hour', 'Minute', 'Second', 'Microsecond'):\n",
"    df[column] = df['TS_obj'].apply(lambda ts, field=column.lower(): getattr(ts, field))\n",
"\n",
"# utcoffset() is a method call rather than a plain attribute, so it is handled on its own.\n",
"df['Utcoffset'] = df['TS_obj'].apply(lambda ts: ts.utcoffset())  # offset from UTC\n",
"\n",
"df[['Time', 'Hour', 'Minute', 'Second', 'Microsecond', 'Utcoffset']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"又比如,可以按一天中的时段(凌晨、上午、下午、傍晚、夜间)对小时进行分箱:"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Time</th>\n",
" <th>Hour</th>\n",
" <th>TimeOfDayBin</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2020-12-16 10:30:00.360000+00:00</td>\n",
" <td>10</td>\n",
" <td>Morning</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2019-04-16 12:15:00.250000+00:00</td>\n",
" <td>12</td>\n",
" <td>Afternoon</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2018-10-16 08:30:00.750000+00:00</td>\n",
" <td>8</td>\n",
" <td>Morning</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2019-01-16 23:30:00.255500+00:00</td>\n",
" <td>23</td>\n",
" <td>Night</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Time Hour TimeOfDayBin\n",
"0 2020-12-16 10:30:00.360000+00:00 10 Morning\n",
"1 2019-04-16 12:15:00.250000+00:00 12 Afternoon\n",
"2 2018-10-16 08:30:00.750000+00:00 8 Morning\n",
"3 2019-01-16 23:30:00.255500+00:00 23 Night"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Bin edges over the hour of day. With right-closed intervals the buckets are\n",
"# (-1, 5], (5, 11], (11, 16], (16, 21], (21, 23]; the -1 lower edge lets hour 0 land in the first bucket.\n",
"hour_bins = [-1, 5, 11, 16, 21, 23]\n",
"bin_names = [\n",
"    'Late Night',  # hours 0-5\n",
"    'Morning',     # hours 6-11\n",
"    'Afternoon',   # hours 12-16\n",
"    'Evening',     # hours 17-21\n",
"    'Night',       # hours 22-23\n",
"]\n",
"df['TimeOfDayBin'] = pd.cut(x=df['Hour'], bins=hour_bins, labels=bin_names, right=True)\n",
"\n",
"df.loc[:, ['Time', 'Hour', 'TimeOfDayBin']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}