# ### Build Item_table
#
# item_table columns produced by this section:
# * product attributes from JData_Product (sku_id, attr1, attr2, attr3, cate, brand),
# * per-item behaviour counters (browse_num, addcart_num, delcart_num,
#   buy_num, favor_num, click_num),
# * purchase conversion ratios (buy_addcart_ratio, buy_browse_ratio,
#   buy_click_ratio, buy_favor_ratio),
# * latest comment statistics (comment_num, has_bad_comment, bad_comment_rate).

import pandas as pd
import numpy as np
from collections import Counter

# File names of the raw input data.
ACTION_201602_FILE = "data/JData_Action_201602.csv"
ACTION_201603_FILE = "data/JData_Action_201603.csv"
ACTION_201604_FILE = "data/JData_Action_201604.csv"
COMMENT_FILE = "data/JData_Comment.csv"
PRODUCT_FILE = "data/JData_Product.csv"
USER_FILE = "data/JData_User.csv"

# File names of the derived tables written by this notebook.
USER_TABLE_FILE = "data/user_table.csv"
ITEM_TABLE_FILE = "data/item_table.csv"

# Behaviour `type` code -> name of the counter column it feeds.
_TYPE_COUNT_COLUMNS = {
    1: 'browse_num',
    2: 'addcart_num',
    3: 'delcart_num',
    4: 'buy_num',
    5: 'favor_num',
    6: 'click_num',
}

# (ratio column, denominator column); the numerator is always `buy_num`.
_RATIO_COLUMNS = (
    ('buy_addcart_ratio', 'addcart_num'),
    ('buy_browse_ratio', 'browse_num'),
    ('buy_click_ratio', 'click_num'),
    ('buy_favor_ratio', 'favor_num'),
)


def get_from_jdata_product():
    """Load the product table from ``PRODUCT_FILE`` (GBK-encoded CSV).

    Returns:
        pandas.DataFrame: the file contents, unmodified.
    """
    return pd.read_csv(PRODUCT_FILE, header=0, encoding='gbk')


def add_type_count(group):
    """Attach behaviour counters to every row of one item's action group.

    Args:
        group: DataFrame holding the actions of a single item; must contain
            the columns ``sku_id`` and ``type``.

    Returns:
        pandas.DataFrame: one row per input row with ``sku_id`` followed by the
        six counter columns; every row carries the same group-wide counts.
    """
    type_cnt = Counter(group['type'].astype(int))
    counters = {col: type_cnt[code] for code, col in _TYPE_COUNT_COLUMNS.items()}
    # `assign` returns a new frame, so the caller's group is not mutated.
    return group.assign(**counters)[['sku_id'] + list(_TYPE_COUNT_COLUMNS.values())]


def get_from_action_data(fname, chunk_size=50000):
    """Count, per item, how often each behaviour type occurs in one action file.

    The file is read in chunks of ``chunk_size`` rows and only the ``sku_id``
    and ``type`` columns are loaded.

    Args:
        fname: path of a GBK-encoded action CSV with ``sku_id`` and ``type``.
        chunk_size: number of rows per chunk while reading.

    Returns:
        pandas.DataFrame: one row per ``sku_id`` with the columns ``sku_id``,
        ``browse_num``, ``addcart_num``, ``delcart_num``, ``buy_num``,
        ``favor_num`` and ``click_num``. Type codes outside 1-6 are ignored.
    """
    result_columns = ['sku_id'] + list(_TYPE_COUNT_COLUMNS.values())

    reader = pd.read_csv(fname, header=0, usecols=['sku_id', 'type'],
                         chunksize=chunk_size, encoding='gbk')
    try:
        chunks = [chunk for chunk in reader]
    finally:
        reader.close()
    print("Iteration is stopped")

    if not chunks:
        return pd.DataFrame(columns=result_columns)
    df_ac = pd.concat(chunks, ignore_index=True)
    if df_ac.empty:
        return pd.DataFrame(columns=result_columns)

    # The item table is keyed by sku_id (it is later joined on sku_id), so the
    # counters are aggregated per item, not per user.
    df_ac['type'] = df_ac['type'].astype(int)
    counts = (
        df_ac.groupby(['sku_id', 'type']).size()
        .unstack(fill_value=0)
        # Guarantee all six counters exist even if a type never occurs.
        .reindex(columns=list(_TYPE_COUNT_COLUMNS), fill_value=0)
        .rename(columns=_TYPE_COUNT_COLUMNS)
    )
    counts.columns.name = None
    return counts.reset_index()[result_columns]


def get_from_jdata_comment():
    """Load comment statistics, keeping only each item's most recent snapshot.

    Returns:
        pandas.DataFrame: columns ``sku_id``, ``comment_num``,
        ``has_bad_comment`` and ``bad_comment_rate`` for the rows whose ``dt``
        equals the latest ``dt`` of their ``sku_id``.
    """
    df_cmt = pd.read_csv(COMMENT_FILE, header=0)
    df_cmt['dt'] = pd.to_datetime(df_cmt['dt'])
    # Boolean mask: True where the row's date is the latest one for its item.
    # NOTE(review): if an item has several rows on its latest date, all of them
    # are kept - confirm the source has one row per (sku_id, dt).
    is_latest = df_cmt.groupby(['sku_id'])['dt'].transform('max') == df_cmt['dt']
    return df_cmt.loc[is_latest, ['sku_id', 'comment_num',
                                  'has_bad_comment', 'bad_comment_rate']]


def merge_action_data(fnames=None):
    """Combine the per-file item counters and derive purchase conversion ratios.

    Args:
        fnames: iterable of action CSV paths; defaults to the three monthly
            files ``ACTION_201602_FILE``, ``ACTION_201603_FILE`` and
            ``ACTION_201604_FILE``.

    Returns:
        pandas.DataFrame: one row per ``sku_id`` with the summed counters plus
        ``buy_addcart_ratio``, ``buy_browse_ratio``, ``buy_click_ratio`` and
        ``buy_favor_ratio``. Ratios are capped at 1 (this also maps a division
        by zero with buy_num > 0 to 1); 0/0 stays NaN.
    """
    if fnames is None:
        fnames = [ACTION_201602_FILE, ACTION_201603_FILE, ACTION_201604_FILE]

    per_file = [get_from_action_data(fname=fname) for fname in fnames]
    df_ac = pd.concat(per_file, ignore_index=True)
    df_ac = df_ac.groupby(['sku_id'], as_index=False).sum()

    for ratio_col, base_col in _RATIO_COLUMNS:
        df_ac[ratio_col] = (df_ac['buy_num'] / df_ac[base_col]).clip(upper=1.)

    return df_ac
# Assemble the item table: product attributes LEFT JOIN per-item behaviour
# statistics LEFT JOIN latest comment statistics, all keyed on sku_id.
item_base = get_from_jdata_product()
item_behavior = merge_action_data()
item_comment = get_from_jdata_comment()

# SQL: left join - every product is kept even without actions or comments.
item_behavior = pd.merge(item_base, item_behavior, on=['sku_id'], how='left')
item_behavior = pd.merge(item_behavior, item_comment, on=['sku_id'], how='left')

item_behavior.to_csv(ITEM_TABLE_FILE, index=False)

# Reload the saved table and preview its first rows.
# (The original `item_table = haed()` raised NameError; `.head()` is the
# intended call and must not overwrite `item_table`.)
item_table = pd.read_csv(ITEM_TABLE_FILE)
item_table.head()