From 22af266e65bb6511c496bb0fb4450ba0c439255a Mon Sep 17 00:00:00 2001 From: benjas <909336740@qq.com> Date: Fri, 18 Dec 2020 10:33:21 +0800 Subject: [PATCH] =?UTF-8?q?Create=20=E5=B8=B8=E7=94=A8=E7=89=B9=E5=BE=81?= =?UTF-8?q?=E6=9E=84=E9=80=A0=E6=96=B9=E6=B3=95-checkpoint.ipynb?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../常用特征构造方法-checkpoint.ipynb | 2765 +++++++++++++++++ 1 file changed, 2765 insertions(+) create mode 100644 机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/常用特征构造方法-checkpoint.ipynb diff --git a/机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/常用特征构造方法-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/常用特征构造方法-checkpoint.ipynb new file mode 100644 index 0000000..0b142fa --- /dev/null +++ b/机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/常用特征构造方法-checkpoint.ipynb @@ -0,0 +1,2765 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 离散值处理" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import warnings # 忽略普通警告,不打印太多东西\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NamePlatformYearGenrePublisher
1Super Mario Bros.NES1985.0PlatformNintendo
2Mario Kart WiiWii2008.0RacingNintendo
3Wii Sports ResortWii2009.0SportsNintendo
4Pokemon Red/Pokemon BlueGB1996.0Role-PlayingNintendo
5TetrisGB1989.0PuzzleNintendo
6New Super Mario Bros.DS2006.0PlatformNintendo
\n", + "
" + ], + "text/plain": [ + " Name Platform Year Genre Publisher\n", + "1 Super Mario Bros. NES 1985.0 Platform Nintendo\n", + "2 Mario Kart Wii Wii 2008.0 Racing Nintendo\n", + "3 Wii Sports Resort Wii 2009.0 Sports Nintendo\n", + "4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo\n", + "5 Tetris GB 1989.0 Puzzle Nintendo\n", + "6 New Super Mario Bros. DS 2006.0 Platform Nintendo" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vg_df = pd.read_csv('data/vgsales.csv', encoding='ISO-8859-1')\n", + "vg_df[['Name', 'Platform', 'Year', 'Genre', 'Publisher']].iloc[1:7]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "机器无法识别字符串类型数据,需要做处理" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',\n", + " 'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',\n", + " 'Strategy'], dtype=object)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "genres = np.unique(vg_df['Genre'])\n", + "genres # 不同的字符串并不多" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LabelEncoder" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 'Action',\n", + " 1: 'Adventure',\n", + " 2: 'Fighting',\n", + " 3: 'Misc',\n", + " 4: 'Platform',\n", + " 5: 'Puzzle',\n", + " 6: 'Racing',\n", + " 7: 'Role-Playing',\n", + " 8: 'Shooter',\n", + " 9: 'Simulation',\n", + " 10: 'Sports',\n", + " 11: 'Strategy'}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "gle = LabelEncoder() # 实例化\n", + "genre_labels = gle.fit_transform(vg_df['Genre']) # 转换需要离散值的一列\n", + "genre_mappings = 
{index: label for index, label in enumerate(gle.classes_)}\n", + "genre_mappings # 映射成数值" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NamePlatformYearGenreGenreLabel
1Super Mario Bros.NES1985.0Platform4
2Mario Kart WiiWii2008.0Racing6
3Wii Sports ResortWii2009.0Sports10
4Pokemon Red/Pokemon BlueGB1996.0Role-Playing7
5TetrisGB1989.0Puzzle5
6New Super Mario Bros.DS2006.0Platform4
\n", + "
" + ], + "text/plain": [ + " Name Platform Year Genre GenreLabel\n", + "1 Super Mario Bros. NES 1985.0 Platform 4\n", + "2 Mario Kart Wii Wii 2008.0 Racing 6\n", + "3 Wii Sports Resort Wii 2009.0 Sports 10\n", + "4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing 7\n", + "5 Tetris GB 1989.0 Puzzle 5\n", + "6 New Super Mario Bros. DS 2006.0 Platform 4" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vg_df['GenreLabel'] = genre_labels # 赋值到一列\n", + "vg_df[['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']].iloc[1:7]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Map\n", + "自己建一个字典" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Action': 0,\n", + " 'Adventure': 1,\n", + " 'Fighting': 2,\n", + " 'Misc': 3,\n", + " 'Platform': 4,\n", + " 'Puzzle': 5,\n", + " 'Racing': 6,\n", + " 'Role-Playing': 7,\n", + " 'Shooter': 8,\n", + " 'Simulation': 9,\n", + " 'Sports': 10,\n", + " 'Strategy': 11}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gen_ord_map = {label:index for index, label in enumerate(gle.classes_)}\n", + "gen_ord_map" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameGenreGenreLabelGenreMap
1Super Mario Bros.Platform44
2Mario Kart WiiRacing66
3Wii Sports ResortSports1010
4Pokemon Red/Pokemon BlueRole-Playing77
5TetrisPuzzle55
6New Super Mario Bros.Platform44
\n", + "
" + ], + "text/plain": [ + " Name Genre GenreLabel GenreMap\n", + "1 Super Mario Bros. Platform 4 4\n", + "2 Mario Kart Wii Racing 6 6\n", + "3 Wii Sports Resort Sports 10 10\n", + "4 Pokemon Red/Pokemon Blue Role-Playing 7 7\n", + "5 Tetris Puzzle 5 5\n", + "6 New Super Mario Bros. Platform 4 4" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vg_df['GenreMap'] = vg_df['Genre'].map(gen_ord_map)\n", + "vg_df[['Name', 'Genre', 'GenreLabel', 'GenreMap']].iloc[1:7] # 结果呈现我们设置的map" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## One-Hot Encoder\n", + "对于离散型特征,基于树的方法是不需要使用one-hot编码的,例如随机森林等。基于距离的模型,都是要使用one-hot编码,例如神经网络等。" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0., 0., 0., ..., 0., 1., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " ...,\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.]])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.preprocessing import OneHotEncoder\n", + "# 获取onehot后的结果,将字符串变成多列的0/1值,有则为1,无则为0\n", + "gen_ohe = OneHotEncoder()\n", + "gen_feature_arr = gen_ohe.fit_transform(vg_df[['GenreLabel']]).toarray()\n", + "gen_feature_arr" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ActionAdventureFightingMiscPlatformPuzzleRacingRole-PlayingShooterSimulationSportsStrategy
00.00.00.00.00.00.00.00.00.00.01.00.0
10.00.00.00.01.00.00.00.00.00.00.00.0
20.00.00.00.00.00.01.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.01.00.0
40.00.00.00.00.00.00.01.00.00.00.00.0
\n", + "
" + ], + "text/plain": [ + " Action Adventure Fighting Misc Platform Puzzle Racing Role-Playing \\\n", + "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n", + "\n", + " Shooter Simulation Sports Strategy \n", + "0 0.0 0.0 1.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 1.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "genres = np.unique(vg_df['Genre']) # 获取全部不同的字符串\n", + "gen_features = pd.DataFrame(gen_feature_arr, columns=genres) # 将字符串作为列,合并onehot数据\n", + "gen_features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameGenre
0Wii SportsSports
1Super Mario Bros.Platform
2Mario Kart WiiRacing
3Wii Sports ResortSports
4Pokemon Red/Pokemon BlueRole-Playing
\n", + "
" + ], + "text/plain": [ + " Name Genre\n", + "0 Wii Sports Sports\n", + "1 Super Mario Bros. Platform\n", + "2 Mario Kart Wii Racing\n", + "3 Wii Sports Resort Sports\n", + "4 Pokemon Red/Pokemon Blue Role-Playing" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 拿出两列原本的数据,实际场景中是全部数据合并,这里是为了查看方便\n", + "vg_df_2 = vg_df[['Name', 'Genre']]\n", + "vg_df_2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameGenreActionAdventureFightingMiscPlatformPuzzleRacingRole-PlayingShooterSimulationSportsStrategy
0Wii SportsSports0.00.00.00.00.00.00.00.00.00.01.00.0
1Super Mario Bros.Platform0.00.00.00.01.00.00.00.00.00.00.00.0
2Mario Kart WiiRacing0.00.00.00.00.00.01.00.00.00.00.00.0
3Wii Sports ResortSports0.00.00.00.00.00.00.00.00.00.01.00.0
4Pokemon Red/Pokemon BlueRole-Playing0.00.00.00.00.00.00.01.00.00.00.00.0
\n", + "
" + ], + "text/plain": [ + " Name Genre Action Adventure Fighting Misc \\\n", + "0 Wii Sports Sports 0.0 0.0 0.0 0.0 \n", + "1 Super Mario Bros. Platform 0.0 0.0 0.0 0.0 \n", + "2 Mario Kart Wii Racing 0.0 0.0 0.0 0.0 \n", + "3 Wii Sports Resort Sports 0.0 0.0 0.0 0.0 \n", + "4 Pokemon Red/Pokemon Blue Role-Playing 0.0 0.0 0.0 0.0 \n", + "\n", + " Platform Puzzle Racing Role-Playing Shooter Simulation Sports \\\n", + "0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n", + "1 1.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 1.0 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n", + "4 0.0 0.0 0.0 1.0 0.0 0.0 0.0 \n", + "\n", + " Strategy \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vg_df_ohe = pd.concat([vg_df_2,gen_features],axis=1) # 两个数据合并\n", + "vg_df_ohe.head() # 可以看到Platform列第二行为1,对应着Genre列第二行是Platform字符串" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get Dummy\n", + "更加实用的onehot" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(16598, 13)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameGenreAdventureFightingMiscPlatformPuzzleRacingRole-PlayingShooterSimulationSportsStrategy
0Wii SportsSports00000000010
1Super Mario Bros.Platform00010000000
2Mario Kart WiiRacing00000100000
3Wii Sports ResortSports00000000010
4Pokemon Red/Pokemon BlueRole-Playing00000010000
\n", + "
" + ], + "text/plain": [ + " Name Genre Adventure Fighting Misc \\\n", + "0 Wii Sports Sports 0 0 0 \n", + "1 Super Mario Bros. Platform 0 0 0 \n", + "2 Mario Kart Wii Racing 0 0 0 \n", + "3 Wii Sports Resort Sports 0 0 0 \n", + "4 Pokemon Red/Pokemon Blue Role-Playing 0 0 0 \n", + "\n", + " Platform Puzzle Racing Role-Playing Shooter Simulation Sports \\\n", + "0 0 0 0 0 0 0 1 \n", + "1 1 0 0 0 0 0 0 \n", + "2 0 0 1 0 0 0 0 \n", + "3 0 0 0 0 0 0 1 \n", + "4 0 0 0 1 0 0 0 \n", + "\n", + " Strategy \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gen_dummy_features = pd.get_dummies(vg_df['Genre'],drop_first=True) # drop_first=True会去掉第一个类别(Action)对应的列,只保留k-1列\n", + "dummy_df = pd.concat([vg_df[['Name', 'Genre']], gen_dummy_features], axis=1)\n", + "print(dummy_df.shape)\n", + "dummy_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "可以看到两句话就解决了我们上面那一长串" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(16598, 14)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameGenreActionAdventureFightingMiscPlatformPuzzleRacingRole-PlayingShooterSimulationSportsStrategy
0Wii SportsSports000000000010
1Super Mario Bros.Platform000010000000
2Mario Kart WiiRacing000000100000
3Wii Sports ResortSports000000000010
4Pokemon Red/Pokemon BlueRole-Playing000000010000
\n", + "
" + ], + "text/plain": [ + " Name Genre Action Adventure Fighting Misc \\\n", + "0 Wii Sports Sports 0 0 0 0 \n", + "1 Super Mario Bros. Platform 0 0 0 0 \n", + "2 Mario Kart Wii Racing 0 0 0 0 \n", + "3 Wii Sports Resort Sports 0 0 0 0 \n", + "4 Pokemon Red/Pokemon Blue Role-Playing 0 0 0 0 \n", + "\n", + " Platform Puzzle Racing Role-Playing Shooter Simulation Sports \\\n", + "0 0 0 0 0 0 0 1 \n", + "1 1 0 0 0 0 0 0 \n", + "2 0 0 1 0 0 0 0 \n", + "3 0 0 0 0 0 0 1 \n", + "4 0 0 0 1 0 0 0 \n", + "\n", + " Strategy \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gen_dummy_features = pd.get_dummies(vg_df['Genre']) # 和上面相比少了drop_first=True,一般用这种\n", + "dummy_df_true = pd.concat([vg_df[['Name', 'Genre']], gen_dummy_features], axis=1)\n", + "print(dummy_df_true.shape)\n", + "dummy_df_true.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 二值特征化" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameYear
0Wii Sports2006.0
1Super Mario Bros.1985.0
2Mario Kart Wii2008.0
3Wii Sports Resort2009.0
4Pokemon Red/Pokemon Blue1996.0
\n", + "
" + ], + "text/plain": [ + " Name Year\n", + "0 Wii Sports 2006.0\n", + "1 Super Mario Bros. 1985.0\n", + "2 Mario Kart Wii 2008.0\n", + "3 Wii Sports Resort 2009.0\n", + "4 Pokemon Red/Pokemon Blue 1996.0" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vg_year_df = vg_df[['Name', 'Year']]\n", + "vg_year_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "我们把2000年以上的归类为1,其它归类为0" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameYearYear_tow
0Wii Sports2006.01
1Super Mario Bros.1985.00
2Mario Kart Wii2008.01
3Wii Sports Resort2009.01
4Pokemon Red/Pokemon Blue1996.00
\n", + "
" + ], + "text/plain": [ + " Name Year Year_tow\n", + "0 Wii Sports 2006.0 1\n", + "1 Super Mario Bros. 1985.0 0\n", + "2 Mario Kart Wii 2008.0 1\n", + "3 Wii Sports Resort 2009.0 1\n", + "4 Pokemon Red/Pokemon Blue 1996.0 0" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vg_year_df['Year_tow'] = np.where(vg_year_df['Year'] >= 2000, 1, 0)\n", + "vg_year_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameYearYear_towbn_year
0Wii Sports2006.011.0
1Super Mario Bros.1985.000.0
2Mario Kart Wii2008.011.0
3Wii Sports Resort2009.011.0
4Pokemon Red/Pokemon Blue1996.000.0
\n", + "
" + ], + "text/plain": [ + " Name Year Year_tow bn_year\n", + "0 Wii Sports 2006.0 1 1.0\n", + "1 Super Mario Bros. 1985.0 0 0.0\n", + "2 Mario Kart Wii 2008.0 1 1.0\n", + "3 Wii Sports Resort 2009.0 1 1.0\n", + "4 Pokemon Red/Pokemon Blue 1996.0 0 0.0" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.preprocessing import Binarizer\n", + "# sklearn中的方法\n", + "bn = Binarizer(threshold=2000) # 大于2000为1,小于等于2000为0\n", + "vg_year_df['Year']=vg_year_df['Year'].fillna(0) # 数据中有Nan值,需要补0,否则无法二分\n", + "bn_year = bn.transform([vg_year_df['Year']])[0] # 获取转换的值,取第0行\n", + "vg_year_df['bn_year'] = bn_year # 插入数据\n", + "vg_year_df.head() # 结果与手动一致" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 多项式特征\n", + "获得特征的更高维度和互相间关系的项。" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NA_SalesEU_Sales
041.4929.02
129.083.58
215.8512.88
315.7511.01
411.278.89
\n", + "
" + ], + "text/plain": [ + " NA_Sales EU_Sales\n", + "0 41.49 29.02\n", + "1 29.08 3.58\n", + "2 15.85 12.88\n", + "3 15.75 11.01\n", + "4 11.27 8.89" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "polynomial_df = vg_df[['NA_Sales', 'EU_Sales']]\n", + "polynomial_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[4.1490000e+01, 2.9020000e+01, 1.7214201e+03, 1.2040398e+03,\n", + " 8.4216040e+02],\n", + " [2.9080000e+01, 3.5800000e+00, 8.4564640e+02, 1.0410640e+02,\n", + " 1.2816400e+01],\n", + " [1.5850000e+01, 1.2880000e+01, 2.5122250e+02, 2.0414800e+02,\n", + " 1.6589440e+02],\n", + " ...,\n", + " [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,\n", + " 0.0000000e+00],\n", + " [0.0000000e+00, 1.0000000e-02, 0.0000000e+00, 0.0000000e+00,\n", + " 1.0000000e-04],\n", + " [1.0000000e-02, 0.0000000e+00, 1.0000000e-04, 0.0000000e+00,\n", + " 0.0000000e+00]])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.preprocessing import PolynomialFeatures\n", + "\n", + "# degree二次幂的复杂度\n", + "pf = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)\n", + "res = pf.fit_transform(polynomial_df)\n", + "res" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "以第一行为例:\n", + "
第一列和第二列分别表示原先的第一列和第二列\n", + "
第三列和第五列表示第一列和第二列分别的平方,第四列表示两者的乘积" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NA_SalesEU_SalesNA_Sales^2NA_Sales*EU_SalesEU_Sales^2
041.4929.021721.42011204.0398842.1604
129.083.58845.6464104.106412.8164
215.8512.88251.2225204.1480165.8944
315.7511.01248.0625173.4075121.2201
411.278.89127.0129100.190379.0321
\n", + "
" + ], + "text/plain": [ + " NA_Sales EU_Sales NA_Sales^2 NA_Sales*EU_Sales EU_Sales^2\n", + "0 41.49 29.02 1721.4201 1204.0398 842.1604\n", + "1 29.08 3.58 845.6464 104.1064 12.8164\n", + "2 15.85 12.88 251.2225 204.1480 165.8944\n", + "3 15.75 11.01 248.0625 173.4075 121.2201\n", + "4 11.27 8.89 127.0129 100.1903 79.0321" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "intr_features = pd.DataFrame(res, columns=['NA_Sales',\n", + " 'EU_Sales',\n", + " 'NA_Sales^2',\n", + " 'NA_Sales*EU_Sales',\n", + " 'EU_Sales^2'])\n", + "intr_features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RankNamePlatformYearGenrePublisherNA_SalesEU_SalesJP_SalesOther_SalesGlobal_SalesGenreLabelGenreMap
01Wii SportsWii2006.0SportsNintendo41.4929.023.778.4682.741010
12Super Mario Bros.NES1985.0PlatformNintendo29.083.586.810.7740.2444
23Mario Kart WiiWii2008.0RacingNintendo15.8512.883.793.3135.8266
34Wii Sports ResortWii2009.0SportsNintendo15.7511.013.282.9633.001010
45Pokemon Red/Pokemon BlueGB1996.0Role-PlayingNintendo11.278.8910.221.0031.3777
\n", + "
" + ], + "text/plain": [ + " Rank Name Platform Year Genre Publisher \\\n", + "0 1 Wii Sports Wii 2006.0 Sports Nintendo \n", + "1 2 Super Mario Bros. NES 1985.0 Platform Nintendo \n", + "2 3 Mario Kart Wii Wii 2008.0 Racing Nintendo \n", + "3 4 Wii Sports Resort Wii 2009.0 Sports Nintendo \n", + "4 5 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo \n", + "\n", + " NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales GenreLabel \\\n", + "0 41.49 29.02 3.77 8.46 82.74 10 \n", + "1 29.08 3.58 6.81 0.77 40.24 4 \n", + "2 15.85 12.88 3.79 3.31 35.82 6 \n", + "3 15.75 11.01 3.28 2.96 33.00 10 \n", + "4 11.27 8.89 10.22 1.00 31.37 7 \n", + "\n", + " GenreMap \n", + "0 10 \n", + "1 4 \n", + "2 6 \n", + "3 10 \n", + "4 7 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vg_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Binning 特征\n", + "一般用来处理年龄" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameYear
0Wii Sports2006.0
1Super Mario Bros.1985.0
2Mario Kart Wii2008.0
3Wii Sports Resort2009.0
4Pokemon Red/Pokemon Blue1996.0
\n", + "
" + ], + "text/plain": [ + " Name Year\n", + "0 Wii Sports 2006.0\n", + "1 Super Mario Bros. 1985.0\n", + "2 Mario Kart Wii 2008.0\n", + "3 Wii Sports Resort 2009.0\n", + "4 Pokemon Red/Pokemon Blue 1996.0" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bin_df = vg_df[['Name','Year']] # 假设Year是年龄\n", + "bin_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'Frequency')" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import matplotlib as mpl\n", + "import scipy.stats as spstats\n", + "\n", + "fig, ax = plt.subplots()\n", + "bin_df['Year'].hist(color='#A9C5D3')\n", + "ax.set_title('Developer Global_Sales Hostogram', fontsize=12)\n", + "ax.set_xlabel('Global_Sales', fontsize=12)\n", + "ax.set_ylabel('Frequency', fontsize=12)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "这样区间就出来了,我们可以分成多个区间,如1980-1985是一个区间,1986-1990是一个区间" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameYearYear_bin
0Wii Sports2006.05
1Super Mario Bros.1985.01
2Mario Kart Wii2008.06
3Wii Sports Resort2009.06
4Pokemon Red/Pokemon Blue1996.03
5Tetris1989.02
6New Super Mario Bros.2006.05
7Wii Play2006.05
8New Super Mario Bros. Wii2009.06
9Duck Hunt1984.00
\n", + "
" + ], + "text/plain": [ + " Name Year Year_bin\n", + "0 Wii Sports 2006.0 5\n", + "1 Super Mario Bros. 1985.0 1\n", + "2 Mario Kart Wii 2008.0 6\n", + "3 Wii Sports Resort 2009.0 6\n", + "4 Pokemon Red/Pokemon Blue 1996.0 3\n", + "5 Tetris 1989.0 2\n", + "6 New Super Mario Bros. 2006.0 5\n", + "7 Wii Play 2006.0 5\n", + "8 New Super Mario Bros. Wii 2009.0 6\n", + "9 Duck Hunt 1984.0 0" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gle = LabelEncoder() # 实例化\n", + "bin_df['Year_bin'] = pd.cut(bin_df['Year'], 9) # 切分成9组,也可以自己指定切分区间\n", + "bin_df['Year_bin'] = bin_df['Year_bin'].astype(str) # 转换类型为字符串\n", + "bin_year = gle.fit_transform(bin_df['Year_bin']) # 利用LabelEncoder方法变成1-9的数值\n", + "bin_df['Year_bin'] = bin_year # 赋值到新的列\n", + "bin_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 对数变换\n", + "\n", + "经常有这样的假设:数据的分布是正态分布。如线性回归的时候误差项要满足正态分布,而当数据不满足的时候,则需要把数据变换成正态分布" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameNA_SalesNA_Sales_log
0Wii Sports41.493.749269
1Super Mario Bros.29.083.403860
2Mario Kart Wii15.852.824351
3Wii Sports Resort15.752.818398
4Pokemon Red/Pokemon Blue11.272.507157
\n", + "
" + ], + "text/plain": [ + " Name NA_Sales NA_Sales_log\n", + "0 Wii Sports 41.49 3.749269\n", + "1 Super Mario Bros. 29.08 3.403860\n", + "2 Mario Kart Wii 15.85 2.824351\n", + "3 Wii Sports Resort 15.75 2.818398\n", + "4 Pokemon Red/Pokemon Blue 11.27 2.507157" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_log = vg_df[['Name','NA_Sales']] \n", + "df_log['NA_Sales_log'] = np.log((1+df_log['NA_Sales']))\n", + "df_log.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# 画两张对比图,左边log过的更偏态\n", + "fig, ax = plt.subplots()\n", + "plt.subplot(121) \n", + "df_log['NA_Sales_log'].hist(color='#A9C5D3')\n", + "\n", + "plt.subplot(122) \n", + "df_log['NA_Sales'].hist(color='#A9C5D3')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "上面是手动的,还有模块化的BoxCox,这里暂不做示例" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 日期相关特征\n", + "将时间特征转换成可以应用的数据" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "from dateutil.parser import parse\n", + "import pytz" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Time
02020-12-16 10:30:00.360000+00:00
12019-04-16 12:15:00.250000+00:00
22018-10-16 08:30:00.750000+00:00
32019-01-16 23:30:00.255500+00:00
\n", + "
" + ], + "text/plain": [ + " Time\n", + "0 2020-12-16 10:30:00.360000+00:00\n", + "1 2019-04-16 12:15:00.250000+00:00\n", + "2 2018-10-16 08:30:00.750000+00:00\n", + "3 2019-01-16 23:30:00.255500+00:00" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "time_stamps = ['2020-12-16 10:30:00.360000+00:00','2019-04-16 12:15:00.250000+00:00',\n", + " '2018-10-16 08:30:00.750000+00:00','2019-01-16 23:30:00.255500+00:00']\n", + "\n", + "df = pd.DataFrame(time_stamps, columns=['Time'])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([Timestamp('2020-12-16 10:30:00.360000+0000', tz='UTC'),\n", + " Timestamp('2019-04-16 12:15:00.250000+0000', tz='UTC'),\n", + " Timestamp('2018-10-16 08:30:00.750000+0000', tz='UTC'),\n", + " Timestamp('2019-01-16 23:30:00.255500+0000', tz='UTC')],\n", + " dtype=object)" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ts_objs = np.array([pd.Timestamp(item) for item in np.array(df.Time)])\n", + "df['TS_obj'] = ts_objs\n", + "ts_objs" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeYearMonthdayDayOfWeekWeekDayNameDayOfYearWeekOfYearQuarter
02020-12-16 10:30:00.360000+00:00202012162Wednesday351514
12019-04-16 12:15:00.250000+00:0020194161Tuesday106162
22018-10-16 08:30:00.750000+00:00201810161Tuesday289424
32019-01-16 23:30:00.255500+00:0020191162Wednesday1631
\n", + "
" + ], + "text/plain": [ + " Time Year Month day DayOfWeek WeekDayName \\\n", + "0 2020-12-16 10:30:00.360000+00:00 2020 12 16 2 Wednesday \n", + "1 2019-04-16 12:15:00.250000+00:00 2019 4 16 1 Tuesday \n", + "2 2018-10-16 08:30:00.750000+00:00 2018 10 16 1 Tuesday \n", + "3 2019-01-16 23:30:00.255500+00:00 2019 1 16 2 Wednesday \n", + "\n", + " DayOfYear WeekOfYear Quarter \n", + "0 351 51 4 \n", + "1 106 16 2 \n", + "2 289 42 4 \n", + "3 16 3 1 " + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Year'] = df['TS_obj'].apply(lambda d: d.year)\n", + "df['Month'] = df['TS_obj'].apply(lambda d: d.month)\n", + "df['Day'] = df['TS_obj'].apply(lambda d: d.day)\n", + "df['DayOfWeek'] = df['TS_obj'].apply(lambda d: d.dayofweek)\n", + "df['WeekDayName'] = df['TS_obj'].apply(lambda d: d.weekday_name)\n", + "df['DayOfYear'] = df['TS_obj'].apply(lambda d: d.dayofyear)\n", + "df['WeekOfYear'] = df['TS_obj'].apply(lambda d: d.weekofyear)\n", + "df['Quarter'] = df['TS_obj'].apply(lambda d: d.quarter)\n", + "\n", + "df[['Time','Year','Month','day','DayOfWeek','WeekDayName','DayOfYear','WeekOfYear','Quarter']]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "这样就能从时间数据中获取很多数据,不同场景对不同数据有需求,如外卖则会关注周末和季节等。" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeHourMinuteSecondMicrosecondUtcoffset
02020-12-16 10:30:00.360000+00:00103003600000 days
12019-04-16 12:15:00.250000+00:00121502500000 days
22018-10-16 08:30:00.750000+00:0083007500000 days
32019-01-16 23:30:00.255500+00:00233002555000 days
\n", + "
" + ], + "text/plain": [ + " Time Hour Minute Second Microsecond \\\n", + "0 2020-12-16 10:30:00.360000+00:00 10 30 0 360000 \n", + "1 2019-04-16 12:15:00.250000+00:00 12 15 0 250000 \n", + "2 2018-10-16 08:30:00.750000+00:00 8 30 0 750000 \n", + "3 2019-01-16 23:30:00.255500+00:00 23 30 0 255500 \n", + "\n", + " Utcoffset \n", + "0 0 days \n", + "1 0 days \n", + "2 0 days \n", + "3 0 days " + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Hour'] = df['TS_obj'].apply(lambda d: d.hour)\n", + "df['Minute'] = df['TS_obj'].apply(lambda d: d.minute)\n", + "df['Second'] = df['TS_obj'].apply(lambda d: d.second)\n", + "df['Microsecond'] = df['TS_obj'].apply(lambda d: d.microsecond)\n", + "df['Utcoffset'] = df['TS_obj'].apply(lambda d: d.utcoffset()) # UTC时间位移\n", + "\n", + "df[['Time','Hour','Minute','Second','Microsecond','Utcoffset']]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "又比如按早晚切分时间" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeHourTimeOfDayBin
02020-12-16 10:30:00.360000+00:0010Morning
12019-04-16 12:15:00.250000+00:0012Afternoon
22018-10-16 08:30:00.750000+00:008Morning
32019-01-16 23:30:00.255500+00:0023Night
\n", + "
" + ], + "text/plain": [ + " Time Hour TimeOfDayBin\n", + "0 2020-12-16 10:30:00.360000+00:00 10 Morning\n", + "1 2019-04-16 12:15:00.250000+00:00 12 Afternoon\n", + "2 2018-10-16 08:30:00.750000+00:00 8 Morning\n", + "3 2019-01-16 23:30:00.255500+00:00 23 Night" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hour_bins = [-1, 5, 11, 16, 21, 23]\n", + "bin_names = ['Late Night', 'Morning', 'Afternoon', 'Evening', 'Night']\n", + "df['TimeOfDayBin'] = pd.cut(df['Hour'],bins=hour_bins,labels=bin_names)\n", + "\n", + "df[['Time','Hour','TimeOfDayBin']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}