diff --git a/机器学习竞赛实战_优胜解决方案/游戏销售数据_特征常用构建方法/.ipynb_checkpoints/游戏销售数据-常用特征构造方法-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/游戏销售数据_特征常用构建方法/.ipynb_checkpoints/游戏销售数据-常用特征构造方法-checkpoint.ipynb new file mode 100644 index 0000000..6997bf7 --- /dev/null +++ b/机器学习竞赛实战_优胜解决方案/游戏销售数据_特征常用构建方法/.ipynb_checkpoints/游戏销售数据-常用特征构造方法-checkpoint.ipynb @@ -0,0 +1,915 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 离散值处理" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NamePlatformYearGenrePublisher
1Super Mario Bros.NES1985.0PlatformNintendo
2Mario Kart WiiWii2008.0RacingNintendo
3Wii Sports ResortWii2009.0SportsNintendo
4Pokemon Red/Pokemon BlueGB1996.0Role-PlayingNintendo
5TetrisGB1989.0PuzzleNintendo
6New Super Mario Bros.DS2006.0PlatformNintendo
\n", + "
" + ], + "text/plain": [ + " Name Platform Year Genre Publisher\n", + "1 Super Mario Bros. NES 1985.0 Platform Nintendo\n", + "2 Mario Kart Wii Wii 2008.0 Racing Nintendo\n", + "3 Wii Sports Resort Wii 2009.0 Sports Nintendo\n", + "4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo\n", + "5 Tetris GB 1989.0 Puzzle Nintendo\n", + "6 New Super Mario Bros. DS 2006.0 Platform Nintendo" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vg_df = pd.read_csv('data/vgsales.csv', encoding='ISO-8859-1')\n", + "vg_df[['Name', 'Platform', 'Year', 'Genre', 'Publisher']].iloc[1:7]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "机器无法识别字符串类型数据,需要做处理" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',\n", + " 'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',\n", + " 'Strategy'], dtype=object)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "genres = np.unique(vg_df['Genre'])\n", + "genres # 不同的字符串并不多" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LabelEncoder" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 'Action',\n", + " 1: 'Adventure',\n", + " 2: 'Fighting',\n", + " 3: 'Misc',\n", + " 4: 'Platform',\n", + " 5: 'Puzzle',\n", + " 6: 'Racing',\n", + " 7: 'Role-Playing',\n", + " 8: 'Shooter',\n", + " 9: 'Simulation',\n", + " 10: 'Sports',\n", + " 11: 'Strategy'}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "gle = LabelEncoder() # 实例化\n", + "genre_labels = gle.fit_transform(vg_df['Genre']) # 转换需要离散值的一列\n", + "genre_mappings = 
{index: label for index, label in enumerate(gle.classes_)}\n", + "genre_mappings # 映射成数值" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NamePlatformYearGenreGenreLabel
1Super Mario Bros.NES1985.0Platform4
2Mario Kart WiiWii2008.0Racing6
3Wii Sports ResortWii2009.0Sports10
4Pokemon Red/Pokemon BlueGB1996.0Role-Playing7
5TetrisGB1989.0Puzzle5
6New Super Mario Bros.DS2006.0Platform4
\n", + "
" + ], + "text/plain": [ + " Name Platform Year Genre GenreLabel\n", + "1 Super Mario Bros. NES 1985.0 Platform 4\n", + "2 Mario Kart Wii Wii 2008.0 Racing 6\n", + "3 Wii Sports Resort Wii 2009.0 Sports 10\n", + "4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing 7\n", + "5 Tetris GB 1989.0 Puzzle 5\n", + "6 New Super Mario Bros. DS 2006.0 Platform 4" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vg_df['GenreLabel'] = genre_labels # 赋值到一列\n", + "vg_df[['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']].iloc[1:7]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Map\n", + "手动构建一个从类别字符串到整数的映射字典,再用Series.map完成编码,结果与LabelEncoder一致" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Action': 0,\n", + " 'Adventure': 1,\n", + " 'Fighting': 2,\n", + " 'Misc': 3,\n", + " 'Platform': 4,\n", + " 'Puzzle': 5,\n", + " 'Racing': 6,\n", + " 'Role-Playing': 7,\n", + " 'Shooter': 8,\n", + " 'Simulation': 9,\n", + " 'Sports': 10,\n", + " 'Strategy': 11}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gen_ord_map = {label:index for index, label in enumerate(gle.classes_)}\n", + "gen_ord_map" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameGenreGenreLabelGenreMap
1Super Mario Bros.Platform44
2Mario Kart WiiRacing66
3Wii Sports ResortSports1010
4Pokemon Red/Pokemon BlueRole-Playing77
5TetrisPuzzle55
6New Super Mario Bros.Platform44
\n", + "
" + ], + "text/plain": [ + " Name Genre GenreLabel GenreMap\n", + "1 Super Mario Bros. Platform 4 4\n", + "2 Mario Kart Wii Racing 6 6\n", + "3 Wii Sports Resort Sports 10 10\n", + "4 Pokemon Red/Pokemon Blue Role-Playing 7 7\n", + "5 Tetris Puzzle 5 5\n", + "6 New Super Mario Bros. Platform 4 4" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vg_df['GenreMap'] = vg_df['Genre'].map(gen_ord_map)\n", + "vg_df[['Name', 'Genre', 'GenreLabel', 'GenreMap']].iloc[1:7] # 结果呈现我们设置的map" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## One-Hot Encoder\n", + "对于离散型特征,基于树的模型(例如随机森林)通常不需要one-hot编码;基于距离的模型(例如KNN、K-Means)以及线性模型、神经网络等对数值大小敏感的模型,通常需要对无序类别特征做one-hot编码,否则整数编码会引入并不存在的大小顺序关系。" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "D:\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:415: FutureWarning: The handling of integer data will change in version 0.22. 
Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n", + "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n", + "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n", + " warnings.warn(msg, FutureWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[0., 0., 0., ..., 0., 1., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " ...,\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.]])" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.preprocessing import OneHotEncoder\n", + "# 获取onehot后的结果,将字符串变成多列的0/1值,有则为1,无则为0\n", + "gen_ohe = OneHotEncoder()\n", + "gen_feature_arr = gen_ohe.fit_transform(vg_df[['GenreLabel']]).toarray()\n", + "gen_feature_arr" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ActionAdventureFightingMiscPlatformPuzzleRacingRole-PlayingShooterSimulationSportsStrategy
00.00.00.00.00.00.00.00.00.00.01.00.0
10.00.00.00.01.00.00.00.00.00.00.00.0
20.00.00.00.00.00.01.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.01.00.0
40.00.00.00.00.00.00.01.00.00.00.00.0
\n", + "
" + ], + "text/plain": [ + " Action Adventure Fighting Misc Platform Puzzle Racing Role-Playing \\\n", + "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n", + "\n", + " Shooter Simulation Sports Strategy \n", + "0 0.0 0.0 1.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 1.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "genres = np.unique(vg_df['Genre']) # 获取全部不同的字符串\n", + "gen_features = pd.DataFrame(gen_feature_arr, columns=genres) # 将字符串作为列,合并onehot数据\n", + "gen_features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameGenre
0Wii SportsSports
1Super Mario Bros.Platform
2Mario Kart WiiRacing
3Wii Sports ResortSports
4Pokemon Red/Pokemon BlueRole-Playing
\n", + "
" + ], + "text/plain": [ + " Name Genre\n", + "0 Wii Sports Sports\n", + "1 Super Mario Bros. Platform\n", + "2 Mario Kart Wii Racing\n", + "3 Wii Sports Resort Sports\n", + "4 Pokemon Red/Pokemon Blue Role-Playing" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 拿出两列原本的数据,实际场景中是全部数据合并,这里是为了查看方便\n", + "vg_df_2 = vg_df[['Name', 'Genre']]\n", + "vg_df_2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameGenreActionAdventureFightingMiscPlatformPuzzleRacingRole-PlayingShooterSimulationSportsStrategy
0Wii SportsSports0.00.00.00.00.00.00.00.00.00.01.00.0
1Super Mario Bros.Platform0.00.00.00.01.00.00.00.00.00.00.00.0
2Mario Kart WiiRacing0.00.00.00.00.00.01.00.00.00.00.00.0
3Wii Sports ResortSports0.00.00.00.00.00.00.00.00.00.01.00.0
4Pokemon Red/Pokemon BlueRole-Playing0.00.00.00.00.00.00.01.00.00.00.00.0
\n", + "
" + ], + "text/plain": [ + " Name Genre Action Adventure Fighting Misc \\\n", + "0 Wii Sports Sports 0.0 0.0 0.0 0.0 \n", + "1 Super Mario Bros. Platform 0.0 0.0 0.0 0.0 \n", + "2 Mario Kart Wii Racing 0.0 0.0 0.0 0.0 \n", + "3 Wii Sports Resort Sports 0.0 0.0 0.0 0.0 \n", + "4 Pokemon Red/Pokemon Blue Role-Playing 0.0 0.0 0.0 0.0 \n", + "\n", + " Platform Puzzle Racing Role-Playing Shooter Simulation Sports \\\n", + "0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n", + "1 1.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 1.0 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n", + "4 0.0 0.0 0.0 1.0 0.0 0.0 0.0 \n", + "\n", + " Strategy \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 " + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vg_df_ohe = pd.concat([vg_df_2,gen_features],axis=1) # 两个数据合并\n", + "vg_df_ohe.head() # 可以看到Platform列第二行为1,对应着Genre列第二行是Platform字符串" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}