diff --git a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/.ipynb_checkpoints/3-特征工程-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/.ipynb_checkpoints/3-特征工程-checkpoint.ipynb index 13c5d06..01cfe35 100644 --- a/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/.ipynb_checkpoints/3-特征工程-checkpoint.ipynb +++ b/机器学习竞赛实战_优胜解决方案/京东用户购买意向预测/.ipynb_checkpoints/3-特征工程-checkpoint.ipynb @@ -44,6 +44,7 @@ "* 用户对各个类别操作行为统计占对所有类别操作行为统计的比重\n", "\n", "累积商品特征:\n", + "\n", "* 分时间段\n", "* 针对商品的不同行为的\n", "* 购买转化率\n", @@ -55,12 +56,729 @@ "* 均值" ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from datetime import datetime\n", + "from datetime import timedelta\n", + "import pandas as pd\n", + "import pickle\n", + "import os\n", + "import math\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 11485424 entries, 0 to 11485423\n", + "Data columns (total 7 columns):\n", + "user_id float32\n", + "sku_id float32\n", + "time object\n", + "model_id float32\n", + "type float32\n", + "cate float32\n", + "brand float32\n", + "dtypes: float32(6), object(1)\n", + "memory usage: 350.5+ MB\n" + ] + } + ], + "source": [ + "#float32 降低内存消耗\n", + "test = pd.read_csv('data/JData_Action_201602.csv')\n", + "test[['user_id','sku_id','model_id','type','cate','brand']] = test[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')\n", + "test.dtypes\n", + "test.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 11485424 entries, 0 to 11485423\n", + "Data columns (total 7 columns):\n", + "user_id int64\n", + "sku_id int64\n", + "time object\n", + "model_id float64\n", + "type int64\n", + "cate int64\n", + "brand int64\n", + "dtypes: float64(1), int64(5), object(1)\n", + 
# Exploration: reload the February action log WITHOUT the float32 cast to
# compare memory footprints.  The recorded run reported 613.4+ MB for the
# default int64/float64 dtypes, close to twice the float32 footprint.
if __name__ == "__main__":
    test = pd.read_csv('data/JData_Action_201602.csv')
    test.info()

# Kept for any later cell that references `preprocessing`; the functions in
# this block no longer need scikit-learn.
try:
    from sklearn import preprocessing
except ImportError:
    pass

# Data paths
action_1_path = r'data/JData_Action_201602.csv'
action_2_path = r'data/JData_Action_201603.csv'
action_3_path = r'data/JData_Action_201604.csv'

comment_path = r'data/JData_Comment.csv'
product_path = r'data/JData_Product.csv'
user_path = r'data/JData_User.csv'

# Snapshot dates present in the comment table (ISO strings, ascending).
comment_date = [
    "2016-02-01", "2016-02-08", "2016-02-15", "2016-02-22", "2016-02-29",
    "2016-03-07", "2016-03-14", "2016-03-21", "2016-03-28", "2016-04-04",
    "2016-04-11", "2016-04-15"
]

# Columns stored as float32 to reduce memory consumption.
# NOTE: float32 represents integers exactly only up to 2**24 (16,777,216);
# ids above that would be rounded by this cast.
_ACTION_NUMERIC_COLUMNS = ['user_id', 'sku_id', 'model_id', 'type', 'cate', 'brand']
_CHUNK_ROWS = 50000


def _downcast(frame):
    """Return *frame* with the numeric action columns cast to float32."""
    return frame.astype({col: 'float32' for col in _ACTION_NUMERIC_COLUMNS})


def _load_actions(path):
    """Read one action CSV in a single pass and downcast it."""
    return _downcast(pd.read_csv(path))


def _load_actions_chunked(path):
    """Read one action CSV in chunks of ``_CHUNK_ROWS`` rows and downcast it.

    The original code subscripted the ``TextFileReader`` returned by
    ``read_csv(iterator=True)``, which fails before any chunk is read.  The
    cast is applied to every chunk instead, so the wide default dtypes never
    exist for the whole file at once.
    """
    chunks = [_downcast(chunk) for chunk in pd.read_csv(path, chunksize=_CHUNK_ROWS)]
    print("Iteration is stopped")
    if not chunks:
        # Nothing was yielded: fall back to a plain read so the caller still
        # gets a (row-less) frame with the expected columns.
        return _load_actions(path)
    return pd.concat(chunks, ignore_index=True)


# Basic loaders
def get_actions_0():
    """Read the action file at ``action_0_path`` without any dtype cast."""
    # NOTE(review): `action_0_path` is not defined in the visible cells, so
    # this raises NameError until it is defined -- confirm the intended file.
    action = pd.read_csv(action_0_path)
    return action


def get_actions_1():
    """Load the 2016-02 action log with numeric columns as float32."""
    return _load_actions(action_1_path)


def get_actions_2():
    """Load the 2016-03 action log with numeric columns as float32."""
    return _load_actions(action_2_path)


def get_actions_3():
    """Load the 2016-04 action log with numeric columns as float32."""
    return _load_actions(action_3_path)


# Chunked variants -- not needed on a machine with enough memory.
def get_actions_10():
    """Chunked equivalent of :func:`get_actions_1`."""
    return _load_actions_chunked(action_1_path)


def get_actions_20():
    """Chunked equivalent of :func:`get_actions_2`."""
    return _load_actions_chunked(action_2_path)


def get_actions_30():
    """Chunked equivalent of :func:`get_actions_3`."""
    return _load_actions_chunked(action_3_path)


def get_all_action():
    """Read and concatenate all three monthly action logs.

    The per-file row labels are kept, so the result's index is not unique.
    """
    action_1 = get_actions_1()
    action_2 = get_actions_2()
    action_3 = get_actions_3()
    actions = pd.concat([action_1, action_2, action_3])  # type: pd.DataFrame
    return actions


def get_actions(start_date, end_date, all_actions):
    """Return a copy of the actions with ``start_date <= time < end_date``.

    :param start_date: inclusive lower bound, compared against the ``time``
        column with ``>=``
    :param end_date: exclusive upper bound, compared with ``<``
    :param all_actions: DataFrame with a ``time`` column
    :return: actions: pd.DataFrame
    """
    in_window = (all_actions.time >= start_date) & (all_actions.time < end_date)
    return all_actions[in_window].copy()


# User features / basic user features:
# user attributes are mostly categorical, so age, sex and user_lv_cd are
# one-hot encoded; the registration time is not used for now.
def get_basic_user_feat(path=None):
    """Build one-hot user features from the user table.

    Rows with any missing value are dropped.  ``age`` is mapped to ordinal
    codes 0, 1, 2, ... (sorted by value) before encoding, so its columns are
    ``age_0``, ``age_1``, ...; ``sex`` and ``user_lv_cd`` are encoded by value.

    :param path: CSV to read; defaults to the module-level ``user_path``
    :return: DataFrame with ``user_id`` followed by the ``age_*``, ``sex_*``
        and ``user_lv_cd_*`` indicator columns, one row per retained user
    """
    # The file is read as GBK because of Chinese characters in the data.
    user = pd.read_csv(user_path if path is None else path, encoding='gbk')
    # Re-number the rows after dropping: every frame concatenated below must
    # share one index, otherwise concat(axis=1) pairs ages with the wrong
    # users and pads the result with NaN rows.
    user = user.dropna(axis=0, how='any').reset_index(drop=True)
    user['sex'] = user['sex'].astype(int)
    user['age'] = user['age'].astype(int)
    # Sorted factorisation gives the 0..k-1 codes a label encoder would.
    age_codes = pd.Series(pd.factorize(user['age'], sort=True)[0], index=user.index)

    age_df = pd.get_dummies(age_codes, prefix='age')
    sex_df = pd.get_dummies(user['sex'], prefix='sex')
    user_lv_df = pd.get_dummies(user['user_lv_cd'], prefix='user_lv_cd')
    user = pd.concat([user['user_id'], age_df, sex_df, user_lv_df], axis=1)
    return user
"age True\n", + "sex True\n", + "user_lv_cd False\n", + "user_reg_tm True\n", + "dtype: bool" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user = pd.read_csv(user_path, encoding='gbk')\n", + "user.isnull().any() # 判断是否有空值,True为有" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idagesexuser_lv_cduser_reg_tm
34072234073NaNNaN1NaN
34072234073NaNNaN1NaN
34072234073NaNNaN1NaN
38905238906NaNNaN1NaN
38905238906NaNNaN1NaN
38905238906NaNNaN1NaN
67704267705NaNNaN1NaN
67704267705NaNNaN1NaN
67704267705NaNNaN1NaN
\n", + "
# Exploration: inspect and drop the incomplete user rows.
# The recorded output listed user_ids 234073, 238906 and 267705, all lacking
# age, sex and registration time.  Each row was printed three times,
# presumably because the original element-wise mask matched once per missing
# cell, so only three users are affected and they are simply dropped.
if __name__ == "__main__":
    # `user` is the raw user table loaded by an earlier cell.
    print(user[user.isnull().any(axis=1)])  # one line per incomplete row
    user.dropna(axis=0, how='any', inplace=True)
    print(user.isnull().any())  # every column should now report False


# Product features / basic product features
def get_basic_product_feat(path=None):
    """Build the basic product features from the product table.

    The attributes ``a1``, ``a2`` and ``a3`` are one-hot encoded; ``cate``
    and ``brand`` are used directly as features.

    :param path: CSV to read; defaults to the module-level ``product_path``
    :return: DataFrame with ``sku_id``, ``cate``, ``brand`` followed by the
        ``a1_*``, ``a2_*`` and ``a3_*`` indicator columns
    """
    product = pd.read_csv(product_path if path is None else path)
    encoded = [pd.get_dummies(product[attr], prefix=attr) for attr in ("a1", "a2", "a3")]
    product = pd.concat([product[['sku_id', 'cate', 'brand']]] + encoded, axis=1)
    return product


# Comment features (next cell): taken per time window, with the comment-count
# level one-hot encoded.
def get_comments_product_feat(end_date, path=None, snapshot_dates=None):
    """Return per-product comment features from the snapshot before *end_date*.

    The snapshot used is the latest date in the snapshot list that is strictly
    earlier than ``end_date``; when none qualifies, the first listed date is
    used.  The ``comment_num`` level is one-hot encoded into
    ``comment_num_0`` .. ``comment_num_4``.

    :param end_date: exclusive upper bound as a ``YYYY-MM-DD`` string
    :param path: CSV to read; defaults to the module-level ``comment_path``
    :param snapshot_dates: ascending ``YYYY-MM-DD`` strings; defaults to the
        module-level ``comment_date``
    :return: DataFrame with ``sku_id``, ``has_bad_comment``,
        ``bad_comment_rate`` and the five ``comment_num_*`` columns
    """
    dates = comment_date if snapshot_dates is None else snapshot_dates
    comments = pd.read_csv(comment_path if path is None else path)
    # ISO date strings order chronologically, so plain string comparison works.
    comment_date_begin = next((d for d in reversed(dates) if d < end_date), dates[0])
    comments = comments[comments['dt'] == comment_date_begin]

    level_columns = ['comment_num_%d' % level for level in range(5)]
    df = pd.get_dummies(comments['comment_num'], prefix='comment_num')
    # A snapshot may lack some level entirely (this happened on the test set),
    # so always emit all five columns; the cast keeps encoded and filled
    # columns on one integer dtype.
    df = df.reindex(columns=level_columns, fill_value=0).astype('int64')

    comments = pd.concat([comments, df], axis=1)  # type: pd.DataFrame
    comments = comments[['sku_id', 'has_bad_comment', 'bad_comment_rate'] + level_columns]
    return comments


# Training window used by the exploration cells: `day` days from the start.
train_start_date = '2016-02-01'
day = 3
train_end_date = datetime.strptime(train_start_date, '%Y-%m-%d') + timedelta(days=day)
train_end_date = train_end_date.strftime('%Y-%m-%d')

start_date = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=day)
start_date = start_date.strftime('%Y-%m-%d')
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sku_idhas_bad_commentbad_comment_ratecomment_num_0comment_num_1comment_num_2comment_num_3comment_num_4
0100010.041700010
11000000.000000100
210001110.037600001
310001800.000000010
410002000.000000010
\n", + "
# Exploration: comment features for the training window.
# comment_num_0/1/2/3/4 stand for: no comments / 1 comment / 2-10 comments /
# 11-50 comments / more than 50 comments.  bad_comment_rate is the share of
# bad comments and has_bad_comment tells whether any bad comment exists.
if __name__ == "__main__":
    comments = get_comments_product_feat(train_end_date)
    print(comments.head())


# Behaviour features
# * computed per time window
# * the behaviour type is one-hot encoded
# * counts are grouped by user-category and by user-category-product, giving
#   - the user's action counts on the OTHER products of the same category
#   - the product's count minus its per-day mean over the window
_ACTION_TYPES = range(1, 7)


def get_action_feat(start_date, end_date, all_actions, i):
    """Count each user's actions per product and per category in a window.

    :param start_date: inclusive window start, passed to ``get_actions``
    :param end_date: exclusive window end, passed to ``get_actions``
    :param all_actions: DataFrame of action records with at least the
        ``time``, ``user_id``, ``sku_id``, ``cate`` and ``type`` columns
    :param i: window length label (the notebook uses 3, 5, 7, 10, 15, 21,
        30); used in the column names and as the divisor of the mean
    :return: one row per (user_id, sku_id, cate) with, for each type t in 1-6:
        ``action_before_<i>_<t>.0_x`` -- actions of type t on this product,
        ``action_before_<i>_<t>.0_y`` -- actions of type t on the other
        products of the same category,
        ``action_before_<i>minus_mean_<t>`` -- the ``_x`` count minus that
        count divided by ``i``
    """
    actions = get_actions(start_date, end_date, all_actions)
    actions = actions[['user_id', 'sku_id', 'cate', 'type']]
    before_date = 'action_before_%s' % i
    type_columns = ['%s_%s.0' % (before_date, t) for t in _ACTION_TYPES]
    # Build the six indicator columns explicitly.  Encoding only the observed
    # values made the fixed column names below raise KeyError whenever a type
    # was missing from the window or `type` was not stored as a float.
    type_values = actions['type'].values
    df = pd.DataFrame(
        {col: (type_values == t).astype('int64') for t, col in zip(_ACTION_TYPES, type_columns)},
        index=actions.index,
        columns=type_columns,
    )
    actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame
    # Per user-product-category counts.
    actions = actions.groupby(['user_id', 'sku_id', 'cate'], as_index=False).sum()
    # Per user-category counts.
    user_cate = actions.groupby(['user_id', 'cate'], as_index=False).sum()
    del user_cate['sku_id']
    del user_cate['type']
    # Both frames carry the same count columns, so the merge suffixes them:
    # `_x` is the per-product count and `_y` the per-category count.
    actions = pd.merge(actions, user_cate, how='left', on=['user_id', 'cate'])
    # Category total minus this product = counts on the other products.
    for col in type_columns:
        actions[col + '_y'] = actions[col + '_y'] - actions[col + '_x']
    # Product count minus its mean over the i-day window.
    for t, col in zip(_ACTION_TYPES, type_columns):
        actions['%sminus_mean_%s' % (before_date, t)] = actions[col + '_x'] - (actions[col + '_x'] / i)
    del actions['type']
    # `cate` is deliberately kept as a feature.
    return actions


# Exploration: first steps of get_action_feat for a 3-day window.
if __name__ == "__main__":
    # NOTE(review): `all_actions` is not defined in the visible cells --
    # presumably the result of get_all_action(); confirm before running.
    actions = get_actions(start_date, train_end_date, all_actions)
    actions = actions[['user_id', 'sku_id', 'cate', 'type']]
    df = pd.get_dummies(actions['type'], prefix='action_before_%s' % 3)
    before_date = 'action_before_%s' % 3
    actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame
    actions = actions.groupby(['user_id', 'sku_id', 'cate'], as_index=False).sum()
    print(actions.head(20))