|
|
|
@ -128,6 +128,271 @@
|
|
|
|
|
"3. 数据集切分"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"### 数据集完整性验证\n",
|
|
|
|
|
"首先检查JData_User中的用户和JData_Action中的用户是否一致,保证行为数据中所产生的行为均由用户数据中的用户产生。\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"思路:利用pd.merge连接sku和Action中的sku,观测Action中的数据是否减少。Example:"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 1,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
" sku data\n",
|
|
|
|
|
"0 a 1\n",
|
|
|
|
|
"1 a 1\n",
|
|
|
|
|
"2 c 3\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# 测试方法\n",
|
|
|
|
|
"import pandas as pd\n",
|
|
|
|
|
"df1 = pd.DataFrame({'sku':['a','a','e','c'], 'data':[1,1,2,3]})\n",
|
|
|
|
|
"df2 = pd.DataFrame({'sku':['a','b','c']})\n",
|
|
|
|
|
"print(pd.merge(df1,df2))"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"结果只会打印两者共有的部分"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 2,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Is action of Feb. from User file? True\n",
|
|
|
|
|
"Is action of Mar. from User file? True\n",
|
|
|
|
|
"Is action of Apr. from User file? True\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"#数据集验证\n",
|
|
|
|
|
"def user_action_check():\n",
|
|
|
|
|
" df_user = pd.read_csv('data/JData_User.csv',encoding='gbk')\n",
|
|
|
|
|
" df_sku = df_user.loc[:,'user_id'].to_frame()\n",
|
|
|
|
|
" df_month2 = pd.read_csv('data/JData_Action_201602.csv',encoding='gbk')\n",
|
|
|
|
|
" # pd.merge(df_sku,df_month2) 会以user_id字段为基准取两个df的交集 不是取并集,这样才能证明 action中的userid 都在df_user里面\n",
|
|
|
|
|
" print ('Is action of Feb. from User file? ', len(df_month2) == len(pd.merge(df_sku,df_month2))) \n",
|
|
|
|
|
" df_month3 = pd.read_csv('data/JData_Action_201603.csv',encoding='gbk')\n",
|
|
|
|
|
" print ('Is action of Mar. from User file? ', len(df_month3) == len(pd.merge(df_sku,df_month3)))\n",
|
|
|
|
|
" df_month4 = pd.read_csv('data/JData_Action_201604.csv',encoding='gbk')\n",
|
|
|
|
|
" print ('Is action of Apr. from User file? ', len(df_month4) == len(pd.merge(df_sku,df_month4)))\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"user_action_check() "
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"结论:交互行为数据集中的用户全部存在于User数据集中\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"根据merge前后的数据量对比,能保障Action中的用户ID是User中的ID的子集"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"### 检查是否有重复记录\n",
|
|
|
|
|
"除去各个数据文件中完全重复的记录。需要注意的是,重复数据也可能是有意义的,比如用户同时购买多件商品,同时添加多个数量的商品到购物车等…"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 3,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"#重复数据\n",
|
|
|
|
|
"def deduplicate(filepath, filename, newpath):\n",
|
|
|
|
|
" df_file = pd.read_csv(filepath,encoding='gbk') \n",
|
|
|
|
|
" before = df_file.shape[0]\n",
|
|
|
|
|
" df_file.drop_duplicates(inplace=True) # 列相同认为是重复 inplace=True表示在原来的DataFrame上删除重复项4\n",
|
|
|
|
|
" after = df_file.shape[0]\n",
|
|
|
|
|
" n_dup = before-after # 查看前后差值\n",
|
|
|
|
|
" print ('Number of duplicate records for ' + filename + ' is: ' + str(n_dup))\n",
|
|
|
|
|
" if n_dup != 0:\n",
|
|
|
|
|
" df_file.to_csv(newpath, index=None)\n",
|
|
|
|
|
" else:\n",
|
|
|
|
|
" print ('Number duplicate records in ' + filename)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 4,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Number of duplicate records for Feb. action is: 2756093\n",
|
|
|
|
|
"Number of duplicate records for Mar. action is: 7085038\n",
|
|
|
|
|
"Number of duplicate records for Feb. action is: 3672710\n",
|
|
|
|
|
"Number of duplicate records for Comment is: 0\n",
|
|
|
|
|
"Number duplicate records in Comment\n",
|
|
|
|
|
"Number of duplicate records for Product is: 0\n",
|
|
|
|
|
"Number duplicate records in Product\n",
|
|
|
|
|
"Number of duplicate records for User is: 0\n",
|
|
|
|
|
"Number duplicate records in User\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"deduplicate('data/JData_Action_201602.csv', 'Feb. action', 'data/JData_Action_201602_dedup.csv')\n",
|
|
|
|
|
"deduplicate('data/JData_Action_201603.csv', 'Mar. action', 'data/JData_Action_201603_dedup.csv')\n",
|
|
|
|
|
"deduplicate('data/JData_Action_201604.csv', 'Feb. action', 'data/JData_Action_201604_dedup.csv')\n",
|
|
|
|
|
"deduplicate('data/JData_Comment.csv', 'Comment', 'data/JData_Comment_dedup.csv')\n",
|
|
|
|
|
"deduplicate('data/JData_Product.csv', 'Product', 'data/JData_Product_dedup.csv')\n",
|
|
|
|
|
"deduplicate('data/JData_User.csv', 'User', 'data/JData_User_dedup.csv')"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 6,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>user_id</th>\n",
|
|
|
|
|
" <th>sku_id</th>\n",
|
|
|
|
|
" <th>time</th>\n",
|
|
|
|
|
" <th>model_id</th>\n",
|
|
|
|
|
" <th>cate</th>\n",
|
|
|
|
|
" <th>brand</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>type</th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <td>2176378</td>\n",
|
|
|
|
|
" <td>2176378</td>\n",
|
|
|
|
|
" <td>2176378</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>2176378</td>\n",
|
|
|
|
|
" <td>2176378</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>636</td>\n",
|
|
|
|
|
" <td>636</td>\n",
|
|
|
|
|
" <td>636</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>636</td>\n",
|
|
|
|
|
" <td>636</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>1464</td>\n",
|
|
|
|
|
" <td>1464</td>\n",
|
|
|
|
|
" <td>1464</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1464</td>\n",
|
|
|
|
|
" <td>1464</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
" <td>37</td>\n",
|
|
|
|
|
" <td>37</td>\n",
|
|
|
|
|
" <td>37</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>37</td>\n",
|
|
|
|
|
" <td>37</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>5</th>\n",
|
|
|
|
|
" <td>1981</td>\n",
|
|
|
|
|
" <td>1981</td>\n",
|
|
|
|
|
" <td>1981</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1981</td>\n",
|
|
|
|
|
" <td>1981</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>6</th>\n",
|
|
|
|
|
" <td>575597</td>\n",
|
|
|
|
|
" <td>575597</td>\n",
|
|
|
|
|
" <td>575597</td>\n",
|
|
|
|
|
" <td>545054</td>\n",
|
|
|
|
|
" <td>575597</td>\n",
|
|
|
|
|
" <td>575597</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" user_id sku_id time model_id cate brand\n",
|
|
|
|
|
"type \n",
|
|
|
|
|
"1 2176378 2176378 2176378 0 2176378 2176378\n",
|
|
|
|
|
"2 636 636 636 0 636 636\n",
|
|
|
|
|
"3 1464 1464 1464 0 1464 1464\n",
|
|
|
|
|
"4 37 37 37 0 37 37\n",
|
|
|
|
|
"5 1981 1981 1981 0 1981 1981\n",
|
|
|
|
|
"6 575597 575597 575597 545054 575597 575597"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 6,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# 查看重复数据\n",
|
|
|
|
|
"df_month2 = pd.read_csv('data/JData_Action_201602.csv',encoding='gbk')\n",
|
|
|
|
|
"IsDuplicated = df_month2.duplicated()\n",
|
|
|
|
|
"df_d = df_month2[IsDuplicated]\n",
|
|
|
|
|
"df_d.groupby('type').count() # 发现重复数据大多数都是由于浏览(1),或者点击(6)产生"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|