{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## 离散值处理" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import warnings # 忽略普通警告,不打印太多东西\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NamePlatformYearGenrePublisher
1Super Mario Bros.NES1985.0PlatformNintendo
2Mario Kart WiiWii2008.0RacingNintendo
3Wii Sports ResortWii2009.0SportsNintendo
4Pokemon Red/Pokemon BlueGB1996.0Role-PlayingNintendo
5TetrisGB1989.0PuzzleNintendo
6New Super Mario Bros.DS2006.0PlatformNintendo
\n", "
" ], "text/plain": [ " Name Platform Year Genre Publisher\n", "1 Super Mario Bros. NES 1985.0 Platform Nintendo\n", "2 Mario Kart Wii Wii 2008.0 Racing Nintendo\n", "3 Wii Sports Resort Wii 2009.0 Sports Nintendo\n", "4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo\n", "5 Tetris GB 1989.0 Puzzle Nintendo\n", "6 New Super Mario Bros. DS 2006.0 Platform Nintendo" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vg_df = pd.read_csv('data/vgsales.csv', encoding='ISO-8859-1')\n", "vg_df[['Name', 'Platform', 'Year', 'Genre', 'Publisher']].iloc[1:7]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "机器无法识别字符串类型数据,需要做处理" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',\n", " 'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',\n", " 'Strategy'], dtype=object)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "genres = np.unique(vg_df['Genre'])\n", "genres # 不同的字符串并不多" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## LabelEncoder" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{0: 'Action',\n", " 1: 'Adventure',\n", " 2: 'Fighting',\n", " 3: 'Misc',\n", " 4: 'Platform',\n", " 5: 'Puzzle',\n", " 6: 'Racing',\n", " 7: 'Role-Playing',\n", " 8: 'Shooter',\n", " 9: 'Simulation',\n", " 10: 'Sports',\n", " 11: 'Strategy'}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import LabelEncoder\n", "\n", "gle = LabelEncoder() # 实例化\n", "genre_labels = gle.fit_transform(vg_df['Genre']) # 转换需要离散值的一列\n", "genre_mappings = {index: label for index, label in enumerate(gle.classes_)}\n", "genre_mappings # 映射成数值" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { 
"text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NamePlatformYearGenreGenreLabel
1Super Mario Bros.NES1985.0Platform4
2Mario Kart WiiWii2008.0Racing6
3Wii Sports ResortWii2009.0Sports10
4Pokemon Red/Pokemon BlueGB1996.0Role-Playing7
5TetrisGB1989.0Puzzle5
6New Super Mario Bros.DS2006.0Platform4
\n", "
" ], "text/plain": [ " Name Platform Year Genre GenreLabel\n", "1 Super Mario Bros. NES 1985.0 Platform 4\n", "2 Mario Kart Wii Wii 2008.0 Racing 6\n", "3 Wii Sports Resort Wii 2009.0 Sports 10\n", "4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing 7\n", "5 Tetris GB 1989.0 Puzzle 5\n", "6 New Super Mario Bros. DS 2006.0 Platform 4" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vg_df['GenreLabel'] = genre_labels # 赋值到一列\n", "vg_df[['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']].iloc[1:7]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Map\n", "自己建一个字典" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'Action': 0,\n", " 'Adventure': 1,\n", " 'Fighting': 2,\n", " 'Misc': 3,\n", " 'Platform': 4,\n", " 'Puzzle': 5,\n", " 'Racing': 6,\n", " 'Role-Playing': 7,\n", " 'Shooter': 8,\n", " 'Simulation': 9,\n", " 'Sports': 10,\n", " 'Strategy': 11}" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gen_ord_map = {label:index for index, label in enumerate(gle.classes_)}\n", "gen_ord_map" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameGenreGenreLabelGenreMap
1Super Mario Bros.Platform44
2Mario Kart WiiRacing66
3Wii Sports ResortSports1010
4Pokemon Red/Pokemon BlueRole-Playing77
5TetrisPuzzle55
6New Super Mario Bros.Platform44
\n", "
" ], "text/plain": [ " Name Genre GenreLabel GenreMap\n", "1 Super Mario Bros. Platform 4 4\n", "2 Mario Kart Wii Racing 6 6\n", "3 Wii Sports Resort Sports 10 10\n", "4 Pokemon Red/Pokemon Blue Role-Playing 7 7\n", "5 Tetris Puzzle 5 5\n", "6 New Super Mario Bros. Platform 4 4" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vg_df['GenreMap'] = vg_df['Genre'].map(gen_ord_map)\n", "vg_df[['Name', 'Genre', 'GenreLabel', 'GenreMap']].iloc[1:7] # 结果呈现我们设置的map" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## One-Hot Encoder\n", "对于离散型特征,基于树的模型(例如随机森林)通常不需要使用one-hot编码;基于距离的模型(例如KNN、K-Means)以及线性模型、神经网络等,一般需要使用one-hot编码,避免把类别编号误当作大小关系。" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0., 0., 0., ..., 0., 1., 0.],\n", " [0., 0., 0., ..., 0., 0., 0.],\n", " [0., 0., 0., ..., 0., 0., 0.],\n", " ...,\n", " [0., 0., 0., ..., 0., 0., 0.],\n", " [0., 0., 0., ..., 0., 0., 0.],\n", " [0., 0., 0., ..., 0., 0., 0.]])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import OneHotEncoder\n", "# 获取onehot后的结果,将字符串变成多列的0/1值,有则为1,无则为0\n", "gen_ohe = OneHotEncoder()\n", "gen_feature_arr = gen_ohe.fit_transform(vg_df[['GenreLabel']]).toarray()\n", "gen_feature_arr" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ActionAdventureFightingMiscPlatformPuzzleRacingRole-PlayingShooterSimulationSportsStrategy
00.00.00.00.00.00.00.00.00.00.01.00.0
10.00.00.00.01.00.00.00.00.00.00.00.0
20.00.00.00.00.00.01.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.01.00.0
40.00.00.00.00.00.00.01.00.00.00.00.0
\n", "
" ], "text/plain": [ " Action Adventure Fighting Misc Platform Puzzle Racing Role-Playing \\\n", "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 \n", "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n", "\n", " Shooter Simulation Sports Strategy \n", "0 0.0 0.0 1.0 0.0 \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 1.0 0.0 \n", "4 0.0 0.0 0.0 0.0 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "genres = np.unique(vg_df['Genre']) # 获取全部不同的字符串\n", "gen_features = pd.DataFrame(gen_feature_arr, columns=genres) # 将字符串作为列,合并onehot数据\n", "gen_features.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameGenre
0Wii SportsSports
1Super Mario Bros.Platform
2Mario Kart WiiRacing
3Wii Sports ResortSports
4Pokemon Red/Pokemon BlueRole-Playing
\n", "
" ], "text/plain": [ " Name Genre\n", "0 Wii Sports Sports\n", "1 Super Mario Bros. Platform\n", "2 Mario Kart Wii Racing\n", "3 Wii Sports Resort Sports\n", "4 Pokemon Red/Pokemon Blue Role-Playing" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 拿出两列原本的数据,实际场景中是全部数据合并,这里是为了查看方便\n", "vg_df_2 = vg_df[['Name', 'Genre']]\n", "vg_df_2.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameGenreActionAdventureFightingMiscPlatformPuzzleRacingRole-PlayingShooterSimulationSportsStrategy
0Wii SportsSports0.00.00.00.00.00.00.00.00.00.01.00.0
1Super Mario Bros.Platform0.00.00.00.01.00.00.00.00.00.00.00.0
2Mario Kart WiiRacing0.00.00.00.00.00.01.00.00.00.00.00.0
3Wii Sports ResortSports0.00.00.00.00.00.00.00.00.00.01.00.0
4Pokemon Red/Pokemon BlueRole-Playing0.00.00.00.00.00.00.01.00.00.00.00.0
\n", "
" ], "text/plain": [ " Name Genre Action Adventure Fighting Misc \\\n", "0 Wii Sports Sports 0.0 0.0 0.0 0.0 \n", "1 Super Mario Bros. Platform 0.0 0.0 0.0 0.0 \n", "2 Mario Kart Wii Racing 0.0 0.0 0.0 0.0 \n", "3 Wii Sports Resort Sports 0.0 0.0 0.0 0.0 \n", "4 Pokemon Red/Pokemon Blue Role-Playing 0.0 0.0 0.0 0.0 \n", "\n", " Platform Puzzle Racing Role-Playing Shooter Simulation Sports \\\n", "0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n", "1 1.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 1.0 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n", "4 0.0 0.0 0.0 1.0 0.0 0.0 0.0 \n", "\n", " Strategy \n", "0 0.0 \n", "1 0.0 \n", "2 0.0 \n", "3 0.0 \n", "4 0.0 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vg_df_ohe = pd.concat([vg_df_2,gen_features],axis=1) # 两个数据合并\n", "vg_df_ohe.head() # 可以看到Platform列第二行为1,对应着Genre列第二行是Platform字符串" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get Dummy\n", "更加实用的onehot" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(16598, 13)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameGenreAdventureFightingMiscPlatformPuzzleRacingRole-PlayingShooterSimulationSportsStrategy
0Wii SportsSports00000000010
1Super Mario Bros.Platform00010000000
2Mario Kart WiiRacing00000100000
3Wii Sports ResortSports00000000010
4Pokemon Red/Pokemon BlueRole-Playing00000010000
\n", "
" ], "text/plain": [ " Name Genre Adventure Fighting Misc \\\n", "0 Wii Sports Sports 0 0 0 \n", "1 Super Mario Bros. Platform 0 0 0 \n", "2 Mario Kart Wii Racing 0 0 0 \n", "3 Wii Sports Resort Sports 0 0 0 \n", "4 Pokemon Red/Pokemon Blue Role-Playing 0 0 0 \n", "\n", " Platform Puzzle Racing Role-Playing Shooter Simulation Sports \\\n", "0 0 0 0 0 0 0 1 \n", "1 1 0 0 0 0 0 0 \n", "2 0 0 1 0 0 0 0 \n", "3 0 0 0 0 0 0 1 \n", "4 0 0 0 1 0 0 0 \n", "\n", " Strategy \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gen_dummy_features = pd.get_dummies(vg_df['Genre'],drop_first=True) # drop_first=True会去掉第一个类别(这里是Action)对应的列,避免哑变量完全共线\n", "dummy_df = pd.concat([vg_df[['Name', 'Genre']], gen_dummy_features], axis=1)\n", "print(dummy_df.shape)\n", "dummy_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "可以看到两句话就解决了我们上面那一长串" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(16598, 14)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameGenreActionAdventureFightingMiscPlatformPuzzleRacingRole-PlayingShooterSimulationSportsStrategy
0Wii SportsSports000000000010
1Super Mario Bros.Platform000010000000
2Mario Kart WiiRacing000000100000
3Wii Sports ResortSports000000000010
4Pokemon Red/Pokemon BlueRole-Playing000000010000
\n", "
" ], "text/plain": [ " Name Genre Action Adventure Fighting Misc \\\n", "0 Wii Sports Sports 0 0 0 0 \n", "1 Super Mario Bros. Platform 0 0 0 0 \n", "2 Mario Kart Wii Racing 0 0 0 0 \n", "3 Wii Sports Resort Sports 0 0 0 0 \n", "4 Pokemon Red/Pokemon Blue Role-Playing 0 0 0 0 \n", "\n", " Platform Puzzle Racing Role-Playing Shooter Simulation Sports \\\n", "0 0 0 0 0 0 0 1 \n", "1 1 0 0 0 0 0 0 \n", "2 0 0 1 0 0 0 0 \n", "3 0 0 0 0 0 0 1 \n", "4 0 0 0 1 0 0 0 \n", "\n", " Strategy \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gen_dummy_features = pd.get_dummies(vg_df['Genre']) # 和上面相比少了drop_first=True,一般用这种\n", "dummy_df_true = pd.concat([vg_df[['Name', 'Genre']], gen_dummy_features], axis=1)\n", "print(dummy_df_true.shape)\n", "dummy_df_true.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 二值特征化" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameYear
0Wii Sports2006.0
1Super Mario Bros.1985.0
2Mario Kart Wii2008.0
3Wii Sports Resort2009.0
4Pokemon Red/Pokemon Blue1996.0
\n", "
" ], "text/plain": [ " Name Year\n", "0 Wii Sports 2006.0\n", "1 Super Mario Bros. 1985.0\n", "2 Mario Kart Wii 2008.0\n", "3 Wii Sports Resort 2009.0\n", "4 Pokemon Red/Pokemon Blue 1996.0" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vg_year_df = vg_df[['Name', 'Year']]\n", "vg_year_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "我们把2000年以上的归类为1,其它归类为0" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameYearYear_tow
0Wii Sports2006.01
1Super Mario Bros.1985.00
2Mario Kart Wii2008.01
3Wii Sports Resort2009.01
4Pokemon Red/Pokemon Blue1996.00
\n", "
" ], "text/plain": [ " Name Year Year_tow\n", "0 Wii Sports 2006.0 1\n", "1 Super Mario Bros. 1985.0 0\n", "2 Mario Kart Wii 2008.0 1\n", "3 Wii Sports Resort 2009.0 1\n", "4 Pokemon Red/Pokemon Blue 1996.0 0" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vg_year_df['Year_tow'] = np.where(vg_year_df['Year'] >= 2000, 1, 0)\n", "vg_year_df.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameYearYear_towbn_year
0Wii Sports2006.011.0
1Super Mario Bros.1985.000.0
2Mario Kart Wii2008.011.0
3Wii Sports Resort2009.011.0
4Pokemon Red/Pokemon Blue1996.000.0
\n", "
" ], "text/plain": [ " Name Year Year_tow bn_year\n", "0 Wii Sports 2006.0 1 1.0\n", "1 Super Mario Bros. 1985.0 0 0.0\n", "2 Mario Kart Wii 2008.0 1 1.0\n", "3 Wii Sports Resort 2009.0 1 1.0\n", "4 Pokemon Red/Pokemon Blue 1996.0 0 0.0" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import Binarizer\n", "# The scikit-learn way of doing the same binarization\n", "# Binarizer maps values strictly greater than the threshold to 1, so 1999 reproduces the manual Year >= 2000 rule\n", "bn = Binarizer(threshold=1999)\n", "vg_year_df['Year']=vg_year_df['Year'].fillna(0) # Year contains NaN; fill with 0 first so it can be binarized\n", "bn_year = bn.transform([vg_year_df['Year']])[0] # transform takes 2-D input: pass the column as one row and take that row back\n", "vg_year_df['bn_year'] = bn_year # store the result as a new column\n", "vg_year_df.head() # matches the manual Year_tow column" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 多项式特征\n", "获得特征的更高维度和互相间关系的项。" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NA_SalesEU_Sales
041.4929.02
129.083.58
215.8512.88
315.7511.01
411.278.89
\n", "
" ], "text/plain": [ " NA_Sales EU_Sales\n", "0 41.49 29.02\n", "1 29.08 3.58\n", "2 15.85 12.88\n", "3 15.75 11.01\n", "4 11.27 8.89" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "polynomial_df = vg_df[['NA_Sales', 'EU_Sales']]\n", "polynomial_df.head()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[4.1490000e+01, 2.9020000e+01, 1.7214201e+03, 1.2040398e+03,\n", " 8.4216040e+02],\n", " [2.9080000e+01, 3.5800000e+00, 8.4564640e+02, 1.0410640e+02,\n", " 1.2816400e+01],\n", " [1.5850000e+01, 1.2880000e+01, 2.5122250e+02, 2.0414800e+02,\n", " 1.6589440e+02],\n", " ...,\n", " [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,\n", " 0.0000000e+00],\n", " [0.0000000e+00, 1.0000000e-02, 0.0000000e+00, 0.0000000e+00,\n", " 1.0000000e-04],\n", " [1.0000000e-02, 0.0000000e+00, 1.0000000e-04, 0.0000000e+00,\n", " 0.0000000e+00]])" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import PolynomialFeatures\n", "\n", "# degree二次幂的复杂度\n", "pf = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)\n", "res = pf.fit_transform(polynomial_df)\n", "res" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "以第一行为例:\n", "
第一列和第二列分别表示原先的第一列和第二列\n", "
第三列和第五列表示第一列和第二列分别的平方,第四列表示两者的乘积" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NA_SalesEU_SalesNA_Sales^2NA_Sales*EU_SalesEU_Sales^2
041.4929.021721.42011204.0398842.1604
129.083.58845.6464104.106412.8164
215.8512.88251.2225204.1480165.8944
315.7511.01248.0625173.4075121.2201
411.278.89127.0129100.190379.0321
\n", "
" ], "text/plain": [ " NA_Sales EU_Sales NA_Sales^2 NA_Sales*EU_Sales EU_Sales^2\n", "0 41.49 29.02 1721.4201 1204.0398 842.1604\n", "1 29.08 3.58 845.6464 104.1064 12.8164\n", "2 15.85 12.88 251.2225 204.1480 165.8944\n", "3 15.75 11.01 248.0625 173.4075 121.2201\n", "4 11.27 8.89 127.0129 100.1903 79.0321" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "intr_features = pd.DataFrame(res, columns=['NA_Sales',\n", " 'EU_Sales',\n", " 'NA_Sales^2',\n", " 'NA_Sales*EU_Sales',\n", " 'EU_Sales^2'])\n", "intr_features.head()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RankNamePlatformYearGenrePublisherNA_SalesEU_SalesJP_SalesOther_SalesGlobal_SalesGenreLabelGenreMap
01Wii SportsWii2006.0SportsNintendo41.4929.023.778.4682.741010
12Super Mario Bros.NES1985.0PlatformNintendo29.083.586.810.7740.2444
23Mario Kart WiiWii2008.0RacingNintendo15.8512.883.793.3135.8266
34Wii Sports ResortWii2009.0SportsNintendo15.7511.013.282.9633.001010
45Pokemon Red/Pokemon BlueGB1996.0Role-PlayingNintendo11.278.8910.221.0031.3777
\n", "
" ], "text/plain": [ " Rank Name Platform Year Genre Publisher \\\n", "0 1 Wii Sports Wii 2006.0 Sports Nintendo \n", "1 2 Super Mario Bros. NES 1985.0 Platform Nintendo \n", "2 3 Mario Kart Wii Wii 2008.0 Racing Nintendo \n", "3 4 Wii Sports Resort Wii 2009.0 Sports Nintendo \n", "4 5 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo \n", "\n", " NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales GenreLabel \\\n", "0 41.49 29.02 3.77 8.46 82.74 10 \n", "1 29.08 3.58 6.81 0.77 40.24 4 \n", "2 15.85 12.88 3.79 3.31 35.82 6 \n", "3 15.75 11.01 3.28 2.96 33.00 10 \n", "4 11.27 8.89 10.22 1.00 31.37 7 \n", "\n", " GenreMap \n", "0 10 \n", "1 4 \n", "2 6 \n", "3 10 \n", "4 7 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vg_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Binning 特征\n", "一般用来处理年龄" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameYear
0Wii Sports2006.0
1Super Mario Bros.1985.0
2Mario Kart Wii2008.0
3Wii Sports Resort2009.0
4Pokemon Red/Pokemon Blue1996.0
\n", "
" ], "text/plain": [ " Name Year\n", "0 Wii Sports 2006.0\n", "1 Super Mario Bros. 1985.0\n", "2 Mario Kart Wii 2008.0\n", "3 Wii Sports Resort 2009.0\n", "4 Pokemon Red/Pokemon Blue 1996.0" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bin_df = vg_df[['Name','Year']] # Year is the column binned below (an age column would be handled the same way)\n", "bin_df.head()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0, 0.5, 'Frequency')" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "import scipy.stats as spstats\n", "\n", "# Histogram of the Year column; title and axis labels name the column that is actually plotted\n", "fig, ax = plt.subplots()\n", "bin_df['Year'].hist(ax=ax, color='#A9C5D3') # draw on the axes created above\n", "ax.set_title('Release Year Histogram', fontsize=12)\n", "ax.set_xlabel('Year', fontsize=12)\n", "ax.set_ylabel('Frequency', fontsize=12)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "这样区间就出来了,我们可以分成多个区间,如1980-1985是一个区间,1986-1990是一个区间" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameYearYear_bin
0Wii Sports2006.05
1Super Mario Bros.1985.01
2Mario Kart Wii2008.06
3Wii Sports Resort2009.06
4Pokemon Red/Pokemon Blue1996.03
5Tetris1989.02
6New Super Mario Bros.2006.05
7Wii Play2006.05
8New Super Mario Bros. Wii2009.06
9Duck Hunt1984.00
\n", "
" ], "text/plain": [ " Name Year Year_bin\n", "0 Wii Sports 2006.0 5\n", "1 Super Mario Bros. 1985.0 1\n", "2 Mario Kart Wii 2008.0 6\n", "3 Wii Sports Resort 2009.0 6\n", "4 Pokemon Red/Pokemon Blue 1996.0 3\n", "5 Tetris 1989.0 2\n", "6 New Super Mario Bros. 2006.0 5\n", "7 Wii Play 2006.0 5\n", "8 New Super Mario Bros. Wii 2009.0 6\n", "9 Duck Hunt 1984.0 0" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gle = LabelEncoder() # 实例化\n", "bin_df['Year_bin'] = pd.cut(bin_df['Year'], 9) # 切分成9组,也可以自己指定切分区间\n", "bin_df['Year_bin'] = bin_df['Year_bin'].astype(str) # 转换类型为字符串\n", "bin_year = gle.fit_transform(bin_df['Year_bin']) # 利用LabelEncoder方法变成从0开始的整数编号\n", "bin_df['Year_bin'] = bin_year # 赋值到新的列\n", "bin_df.head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 对数变换\n", "\n", "经常有这样的假设:数据的分布是正态分布。如线性回归的时候误差项要满足正态分布,而当数据不满足的时候,则需要把数据变换成正态分布" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameNA_SalesNA_Sales_log
0Wii Sports41.493.749269
1Super Mario Bros.29.083.403860
2Mario Kart Wii15.852.824351
3Wii Sports Resort15.752.818398
4Pokemon Red/Pokemon Blue11.272.507157
\n", "
" ], "text/plain": [ " Name NA_Sales NA_Sales_log\n", "0 Wii Sports 41.49 3.749269\n", "1 Super Mario Bros. 29.08 3.403860\n", "2 Mario Kart Wii 15.85 2.824351\n", "3 Wii Sports Resort 15.75 2.818398\n", "4 Pokemon Red/Pokemon Blue 11.27 2.507157" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_log = vg_df[['Name','NA_Sales']] \n", "df_log['NA_Sales_log'] = np.log((1+df_log['NA_Sales']))\n", "df_log.head()" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# 画两张对比图:左边是log变换后的分布,右边是原始分布;log变换压缩了大值,用来减轻右偏\n", "fig, ax = plt.subplots()\n", "plt.subplot(121) \n", "df_log['NA_Sales_log'].hist(color='#A9C5D3')\n", "\n", "plt.subplot(122) \n", "df_log['NA_Sales'].hist(color='#A9C5D3')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "上面是手动的,还有模块化的BoxCox,这里暂不做示例" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 日期相关特征\n", "将时间特征转换成可以应用的数据" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "import datetime\n", "from dateutil.parser import parse\n", "import pytz" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Time
02020-12-16 10:30:00.360000+00:00
12019-04-16 12:15:00.250000+00:00
22018-10-16 08:30:00.750000+00:00
32019-01-16 23:30:00.255500+00:00
\n", "
" ], "text/plain": [ " Time\n", "0 2020-12-16 10:30:00.360000+00:00\n", "1 2019-04-16 12:15:00.250000+00:00\n", "2 2018-10-16 08:30:00.750000+00:00\n", "3 2019-01-16 23:30:00.255500+00:00" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "time_stamps = ['2020-12-16 10:30:00.360000+00:00','2019-04-16 12:15:00.250000+00:00',\n", " '2018-10-16 08:30:00.750000+00:00','2019-01-16 23:30:00.255500+00:00']\n", "\n", "df = pd.DataFrame(time_stamps, columns=['Time'])\n", "df" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([Timestamp('2020-12-16 10:30:00.360000+0000', tz='UTC'),\n", " Timestamp('2019-04-16 12:15:00.250000+0000', tz='UTC'),\n", " Timestamp('2018-10-16 08:30:00.750000+0000', tz='UTC'),\n", " Timestamp('2019-01-16 23:30:00.255500+0000', tz='UTC')],\n", " dtype=object)" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ts_objs = np.array([pd.Timestamp(item) for item in np.array(df.Time)])\n", "df['TS_obj'] = ts_objs\n", "ts_objs" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TimeYearMonthDayDayOfWeekWeekDayNameDayOfYearWeekOfYearQuarter
02020-12-16 10:30:00.360000+00:00202012162Wednesday351514
12019-04-16 12:15:00.250000+00:0020194161Tuesday106162
22018-10-16 08:30:00.750000+00:00201810161Tuesday289424
32019-01-16 23:30:00.255500+00:0020191162Wednesday1631
\n", "
" ], "text/plain": [ " Time Year Month Day DayOfWeek WeekDayName \\\n", "0 2020-12-16 10:30:00.360000+00:00 2020 12 16 2 Wednesday \n", "1 2019-04-16 12:15:00.250000+00:00 2019 4 16 1 Tuesday \n", "2 2018-10-16 08:30:00.750000+00:00 2018 10 16 1 Tuesday \n", "3 2019-01-16 23:30:00.255500+00:00 2019 1 16 2 Wednesday \n", "\n", " DayOfYear WeekOfYear Quarter \n", "0 351 51 4 \n", "1 106 16 2 \n", "2 289 42 4 \n", "3 16 3 1 " ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Calendar-level features: one new column per attribute of the parsed Timestamp.\n", "df['Year'] = df['TS_obj'].apply(lambda d: d.year)\n", "df['Month'] = df['TS_obj'].apply(lambda d: d.month)\n", "df['Day'] = df['TS_obj'].apply(lambda d: d.day)\n", "df['DayOfWeek'] = df['TS_obj'].apply(lambda d: d.dayofweek)  # Monday=0 ... Sunday=6\n", "# Timestamp.weekday_name was removed in pandas 1.0; day_name() is its replacement.\n", "df['WeekDayName'] = df['TS_obj'].apply(lambda d: d.day_name())\n", "df['DayOfYear'] = df['TS_obj'].apply(lambda d: d.dayofyear)\n", "df['WeekOfYear'] = df['TS_obj'].apply(lambda d: d.weekofyear)\n", "df['Quarter'] = df['TS_obj'].apply(lambda d: d.quarter)\n", "\n", "# Select 'Day', the column created above; the former 'day' label raises KeyError on a fresh kernel.\n", "df[['Time','Year','Month','Day','DayOfWeek','WeekDayName','DayOfYear','WeekOfYear','Quarter']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "这样就能从时间数据中获取很多数据,不同场景对不同数据有需求,如外卖则会关注周末和季节等。" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TimeHourMinuteSecondMicrosecondUtcoffset
02020-12-16 10:30:00.360000+00:00103003600000 days
12019-04-16 12:15:00.250000+00:00121502500000 days
22018-10-16 08:30:00.750000+00:0083007500000 days
32019-01-16 23:30:00.255500+00:00233002555000 days
\n", "
" ], "text/plain": [ " Time Hour Minute Second Microsecond \\\n", "0 2020-12-16 10:30:00.360000+00:00 10 30 0 360000 \n", "1 2019-04-16 12:15:00.250000+00:00 12 15 0 250000 \n", "2 2018-10-16 08:30:00.750000+00:00 8 30 0 750000 \n", "3 2019-01-16 23:30:00.255500+00:00 23 30 0 255500 \n", "\n", " Utcoffset \n", "0 0 days \n", "1 0 days \n", "2 0 days \n", "3 0 days " ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Clock-level features: copy each time-of-day attribute of the Timestamp into its own column.\n", "for column, attr in [('Hour', 'hour'), ('Minute', 'minute'),\n", "                     ('Second', 'second'), ('Microsecond', 'microsecond')]:\n", "    # Bind attr as a default argument so each lambda reads its own attribute name.\n", "    df[column] = df['TS_obj'].apply(lambda ts, attr=attr: getattr(ts, attr))\n", "df['Utcoffset'] = df['TS_obj'].apply(lambda ts: ts.utcoffset())  # offset from UTC\n", "\n", "df[['Time', 'Hour', 'Minute', 'Second', 'Microsecond', 'Utcoffset']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "又比如按早晚切分时间" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TimeHourTimeOfDayBin
02020-12-16 10:30:00.360000+00:0010Morning
12019-04-16 12:15:00.250000+00:0012Afternoon
22018-10-16 08:30:00.750000+00:008Morning
32019-01-16 23:30:00.255500+00:0023Night
\n", "
" ], "text/plain": [ " Time Hour TimeOfDayBin\n", "0 2020-12-16 10:30:00.360000+00:00 10 Morning\n", "1 2019-04-16 12:15:00.250000+00:00 12 Afternoon\n", "2 2018-10-16 08:30:00.750000+00:00 8 Morning\n", "3 2019-01-16 23:30:00.255500+00:00 23 Night" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Bucket the hour of day into named periods. With pd.cut's default right-closed bins the\n", "# edges below give (-1, 5], (5, 11], (11, 16], (16, 21], (21, 23]; -1 keeps hour 0 in range.\n", "hour_bins = [-1, 5, 11, 16, 21, 23]\n", "bin_names = ['Late Night', 'Morning', 'Afternoon', 'Evening', 'Night']\n", "df['TimeOfDayBin'] = pd.cut(df['Hour'], hour_bins, labels=bin_names)\n", "\n", "df.loc[:, ['Time', 'Hour', 'TimeOfDayBin']]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }