|
|
|
@ -30645,15 +30645,15 @@
|
|
|
|
|
" pop,value = i.split(';')\n",
|
|
|
|
|
" self.page_rank[int(pos)-self.register_day] = value\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" def get_array(self):\n",
|
|
|
|
|
" def get_array(self): # 返回特征数组self.array\n",
|
|
|
|
|
" return self.array\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" # 得到标签 如果一个用户在未来七天活跃了,那么标记为1\n",
|
|
|
|
|
" def get_label(self):\n",
|
|
|
|
|
" self.label = np.array([None] * self.seq_length) # 一个seq_length长度的数组\n",
|
|
|
|
|
" active = self.array[:, :10].sum(axis=1) # 这里选了一部分特征做了个sum,意思是不管是转发,登录,啥的,只要做了就算一次活动\n",
|
|
|
|
|
" for i in range(self.seq_length-7): # 这地方得控制一下,如果一个用户15-30的数据,那么我们标签最多只能到23天,因为30天之后的数据我们没有\n",
|
|
|
|
|
" self.label[i] = 1 * (np.sum(active[i+1:i+8]) > 0) # 这里对于当前的i,如果未来七天内活跃过,那么标签就是1\n",
|
|
|
|
|
" active = self.array[:, :10].sum(axis=1) # 这里选了一部分特征做了个sum,即是发生登录、点赞、转发等,计为活动1次\n",
|
|
|
|
|
" for i in range(self.seq_length-7): # 如果一个用户15-30的数据,那么我们标签最多只能到23天,因为30天之后的数据我们没有\n",
|
|
|
|
|
" self.label[i] = 1 * (np.sum(active[i+1:i+8]) > 0) # 未来七天内活跃过,那么标签就是1\n",
|
|
|
|
|
" return self.label"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
@ -31769,10 +31769,10 @@
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"launch['launch'] = 1\n",
|
|
|
|
|
"launch['launch'] = 1 # 每个用户登录次数置为1,如果登录了多次则求和\n",
|
|
|
|
|
"launch_table = launch.groupby(['user_id', 'launch_day'], \n",
|
|
|
|
|
" as_index=False).agg({'launch':'sum'})\n",
|
|
|
|
|
"launch_table.head()"
|
|
|
|
|
"launch_table.head() # launch_day是时间、launch登录次数"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
@ -31781,9 +31781,10 @@
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"def record_to_sequence(table):\n",
|
|
|
|
|
"def record_to_sequence(table): # 得到用户特征序列表\n",
|
|
|
|
|
" table.columns=['user_id','day','value']\n",
|
|
|
|
|
" table.sort_values(by=['user_id','day'],inplace=True)\n",
|
|
|
|
|
" table.sort_values(by=['user_id','day'],inplace=True) # 排序每个客户的时间\n",
|
|
|
|
|
" # 拼接字符串,如上的launch_day是13,同行的launch是1,则变成13:1\n",
|
|
|
|
|
" table['string']=table.day.map(str)+':'+table.value.map(str)\n",
|
|
|
|
|
" table=table.groupby(['user_id'],as_index=False).agg({'string':lambda x:','.join(x)})\n",
|
|
|
|
|
" return table"
|
|
|
|
@ -31865,7 +31866,14 @@
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"launch_table=record_to_sequence(launch_table)\n",
|
|
|
|
|
"launch_table.head()"
|
|
|
|
|
"launch_table.head() # 序列特征结果"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"将用户数据填充到对应的位置,如ID=16的用户会在其特征表的13,14,15...行进行填充"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
@ -31874,8 +31882,15 @@
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"for index,row in launch_table.iterrows():\n",
|
|
|
|
|
" data[row[0]].put_feature(1,row[1])"
|
|
|
|
|
"for index,row in launch_table.iterrows(): # 根据登录信息对用户特征指标进行填充\n",
|
|
|
|
|
" data[row[0]].put_feature(1,row[1]) # 在指定特征位置填充"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"**创作视频信息**"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
@ -31888,7 +31903,16 @@
|
|
|
|
|
"create_table = create.groupby(['user_id','create_day'],as_index=False).agg({'create':'sum'})\n",
|
|
|
|
|
"create_table = record_to_sequence(create_table)\n",
|
|
|
|
|
"for index,row in create_table.iterrows():\n",
|
|
|
|
|
" data[row[0]].put_feature(2,row[1])"
|
|
|
|
|
" data[row[0]].put_feature(2,row[1]) # 位置1是登录信息,所以创作信息填充到位置2"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"**用户使用时行为特征,例如点赞、转发等**\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"分别对不同行为进行统计,构建6种不同行为特征"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
@ -31898,13 +31922,20 @@
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"for i in range(6):\n",
|
|
|
|
|
" act=activity[activity.act_type==i].copy()\n",
|
|
|
|
|
" act=activity[activity.act_type==i].copy() # act_type不同数字对应着不同的行为\n",
|
|
|
|
|
" act=act.groupby(['user_id','act_day'],as_index=False).agg({'video_id':'count'})\n",
|
|
|
|
|
" act = record_to_sequence(act)\n",
|
|
|
|
|
" for index,row in act.iterrows():\n",
|
|
|
|
|
" data[row[0]].put_feature(i+3,row[1])"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"**产生行为的界面信息**"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 21,
|
|
|
|
@ -31919,6 +31950,13 @@
|
|
|
|
|
" data[row[0]].put_feature(i+9,row[1])"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"**用户观看其他用户作品的信息**"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 22,
|
|
|
|
@ -31927,6 +31965,7 @@
|
|
|
|
|
"source": [
|
|
|
|
|
"watched=register.loc[:,['user_id']].copy()\n",
|
|
|
|
|
"watched.columns=['author_id']\n",
|
|
|
|
|
"# 如果作者id和用户id不相等,则是观看其它用户作品的信息\n",
|
|
|
|
|
"watched=pd.merge(watched,activity[activity.author_id!=activity.user_id],how='inner')\n",
|
|
|
|
|
"watched=watched.groupby(['user_id','act_day'],as_index=False).agg({'video_id':'count'})\n",
|
|
|
|
|
"watched=record_to_sequence(watched)\n",
|
|
|
|
@ -31934,12 +31973,20 @@
|
|
|
|
|
" data[row[0]].put_feature(10,row[1])"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"**用户观看自己作品的信息**"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 23,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# 如果作者id和用户id相等,则是观看自己作品的信息\n",
|
|
|
|
|
"watched=pd.merge(watched,activity[activity.author_id==activity.user_id],how='inner')\n",
|
|
|
|
|
"watched=watched.groupby(['user_id','act_day'],as_index=False).agg({'video_id':'count'})\n",
|
|
|
|
|
"watched=record_to_sequence(watched)\n",
|
|
|
|
@ -31947,6 +31994,26 @@
|
|
|
|
|
" data[row[0]].put_feature(11,row[1])"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"特征已构建完成,这里是用的RNN网络,所以构建矩阵,提取序列\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"如果是用lgb或者xgb,则是DataFrame类型,做一些统计类型的特征"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"### 标签制作\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"活跃用户定义为(自己定义的):在未来7天内使用过APP\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"从用户注册开始进行统计,对于每1天的数据展开,如果其未来7天内仍有行为产生,则标签为1"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 24,
|
|
|
|
@ -33595,7 +33662,14 @@
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"label"
|
|
|
|
|
"label # 1表示未来7天内有活跃行为;0表示未来7天内不活跃"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"用户特征数据即为之前提取的data中各项特征,转换为ndarray即可"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
@ -49084,17 +49158,31 @@
|
|
|
|
|
"data"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"### 网络训练模块"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 2,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# 导入封装好的包\n",
|
|
|
|
|
"# 导入封装好的包,上面方法的集合\n",
|
|
|
|
|
"from deep_tools import f\n",
|
|
|
|
|
"from deep_tools import DataGenerator"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"构建RNN网络模型"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 3,
|
|
|
|
@ -49144,11 +49232,11 @@
|
|
|
|
|
" # 变量与输入\n",
|
|
|
|
|
" lr = tf.placeholder(tf.float32, [], name='learning_rate') # 定义学习率\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" # 隐藏层到输出层的参数w, b w_shape(n_hu,1) b_shape(1) n_huWie隐藏单元的个数\n",
|
|
|
|
|
" # 隐藏层到输出层的参数W, b,n_hu隐藏单元的个数\n",
|
|
|
|
|
" W_out = tf.get_variable('W_out', [n_hu, 1]) \n",
|
|
|
|
|
" b_out = tf.get_variable('b_out', [1])\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" # x和y x_shape(batch_size, seq_length, n_features)\n",
|
|
|
|
|
" # x(batch_size, seq_length, n_features)\n",
|
|
|
|
|
" x = tf.placeholder(tf.float32, [None, None, n_features])\n",
|
|
|
|
|
" y = tf.placeholder(tf.float32, [None, None])\n",
|
|
|
|
|
" \n",
|
|
|
|
@ -49159,14 +49247,11 @@
|
|
|
|
|
" # RNN 层\n",
|
|
|
|
|
" cell = tf.nn.rnn_cell.GRUCell(n_hu) # n_hu表示每个GRUcell里面的单元个数\n",
|
|
|
|
|
" initial_state = cell.zero_state(batch_size, dtype=tf.float32) # 指定初始状态,因为之前没有训练过\n",
|
|
|
|
|
" outputs, state = tf.nn.dynamic_rnn(cell, x, initial_state=initial_state) # 使用的动态Rnn\n",
|
|
|
|
|
" # outputs(batch_size, max_seq_length, n_hu) 这是所有时间步的输出\n",
|
|
|
|
|
" # state (batch_size, n_hu) 这是最后一个时间步的输出\n",
|
|
|
|
|
" # 具体:https://blog.csdn.net/u010960155/article/details/81707498\n",
|
|
|
|
|
" outputs, state = tf.nn.dynamic_rnn(cell, x, initial_state=initial_state) # 使用的动态RNN\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" # 输出层\n",
|
|
|
|
|
" outputs = tf.reshape(outputs, [-1, n_hu]) # (batch_size*max_seq_length, n_hu)\n",
|
|
|
|
|
" logits = tf.matmul(outputs, W_out) + b_out # (batch_size*max_seq_length)\n",
|
|
|
|
|
" outputs = tf.reshape(outputs, [-1, n_hu])\n",
|
|
|
|
|
" logits = tf.matmul(outputs, W_out) + b_out\n",
|
|
|
|
|
" logits = tf.reshape(logits, tf.stack([batch_size, seq_length]))"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
@ -49177,7 +49262,7 @@
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# 选择部分预测结果与标签当做训练损失计算\n",
|
|
|
|
|
"logits_local_train = logits[:, :-14] # 这里-14或者是更小,因为本地训练,我们用前16天训练,16-23天测试。\n",
|
|
|
|
|
"logits_local_train = logits[:, :-14] # 这里-14或者是更小,可以看到上面的label后面几天都是None。\n",
|
|
|
|
|
"label_local_train = y[:, :-14]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
@ -49206,18 +49291,17 @@
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# 设置损失函数\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# 正则化项\n",
|
|
|
|
|
"regularizer = tf.contrib.layers.l2_regularizer(0.00001)\n",
|
|
|
|
|
"regularizer = tf.contrib.layers.l2_regularizer(0.00001) # 正则化项\n",
|
|
|
|
|
"penalty = tf.contrib.layers.apply_regularization(regularizer, tf.trainable_variables())\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# sigmoid_cross_entropy二分类任务,判断0或者1\n",
|
|
|
|
|
"obj_local = tf.losses.sigmoid_cross_entropy(label_local_train, logits_local_train) + penalty\n",
|
|
|
|
|
"optimizer = tf.train.AdamOptimizer(lr)\n",
|
|
|
|
|
"set_local = optimizer.minimize(obj_local)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# 选择部分预测结果与标签当做测试损失计算\n",
|
|
|
|
|
"logits_local_test = logits[:, -8] # 预测倒数第8天作为自己的测试标准\n",
|
|
|
|
|
"label_local_test = y[:, -8] # 这里也可以选择其他的天"
|
|
|
|
|
"logits_local_test = logits[:, -8] # 预测倒数第8天作为自己的测试标准,某一天,也可以选择其他的天\n",
|
|
|
|
|
"label_local_test = y[:, -8]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
@ -49227,10 +49311,17 @@
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"def train(n_obs=1000, step=1000, lr_feed=0.01):\n",
|
|
|
|
|
" date_seq = [31] + list(range(2, 16)) + [16] * 15\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
" n_obs迭代时选择的样本个数\n",
|
|
|
|
|
" step迭代次数\n",
|
|
|
|
|
" lr_feed学习率\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
" # 输入序列,2则是取序列长度为2的,选择2-16,因为16比较多,我们*15;人为的选择\n",
|
|
|
|
|
" date_seq = [31] + list(range(2, 16)) + [16] * 15 \n",
|
|
|
|
|
" variables = [set_local, obj_local, label_local_train, logits_local_train]\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" for i in range(step):\n",
|
|
|
|
|
" # next_batch 根据序列长度,先从user_queue字典里找到客户,再取指定部分\n",
|
|
|
|
|
" length, id_list, data_x, data_y = data_generator.next_batch(n_obs)\n",
|
|
|
|
|
" _, los, lab, log = sess.run(variables, \n",
|
|
|
|
|
" feed_dict={x:data_x, y:data_y, lr:lr_feed})"
|
|
|
|
@ -49243,7 +49334,7 @@
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"sess = tf.Session()\n",
|
|
|
|
|
"sess.run(tf.global_variables_initializer())"
|
|
|
|
|
"sess.run(tf.global_variables_initializer()) # 全局变量初始化"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
@ -49252,7 +49343,7 @@
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"train(n_obs=1000, step=2000, lr_feed=0.01)"
|
|
|
|
|
"train(n_obs=1000, step=2000, lr_feed=0.01) # step训练两千次"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
@ -49390,8 +49481,9 @@
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"def test():\n",
|
|
|
|
|
" n_NA = 14 # 本地训练, 我们是1-16天训练,16-23天预测,所以这个地方最大是30-14\n",
|
|
|
|
|
" n_NA = 14\n",
|
|
|
|
|
" # 优化目标和数据\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" variables_1 = [obj_local, logits_local_train, label_local_train]\n",
|
|
|
|
|
" variables_2 = [logits_local_test, label_local_test]\n",
|
|
|
|
|
" \n",
|
|
|
|
@ -49401,7 +49493,9 @@
|
|
|
|
|
" # 训练损失\n",
|
|
|
|
|
" for length, id_list, data_x, data_y in zip(*data_generator.get_set('train')):\n",
|
|
|
|
|
" _obj, _logits_train, _label_train = sess.run(variables_1,\n",
|
|
|
|
|
" feed_dict={x:data_x, y:data_y, lr:0.001})\n",
|
|
|
|
|
" feed_dict={x:data_x,\n",
|
|
|
|
|
" y:data_y,\n",
|
|
|
|
|
" lr:0.001})\n",
|
|
|
|
|
" obs_count += (length - n_NA) * len(id_list)\n",
|
|
|
|
|
" cum_loss += _obj * (length - n_NA) * len(id_list)\n",
|
|
|
|
|
" correct += np.sum((1 * (_logits_train>0) == _label_train))\n",
|
|
|
|
@ -49427,6 +49521,24 @@
|
|
|
|
|
"test()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"#### 评估标准:\n",
|
|
|
|
|
"<img src=\"assets/20201129210029573.png\" width=\"100%\">\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"可以看到上面的test_score(F1 Score)很高,也是比赛中非常接近冠军的值"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"### 总结:\n",
|
|
|
|
|
"构建数据之前,先考虑清楚模型的选取:如果选择RNN网络,构建的数据应该是一个 t × f 的序列,t是时间步数,f是每个时间步对应的特征;如数据中用户自注册日起的天数就是t,只需要把f构建好就可以训练了。"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|