## 任务目标与数据分析
### 预测用户未来一段时间的活跃度
二分类问题

In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
import datetime

In [7]:
"""数据集链接：https://pan.baidu.com/s/1HRR5qKCYlcdgxLja-_Kv0w 提取码：ep7q """
def _read_log(filename, columns):
    """Read one tab-separated log file, naming its columns explicitly."""
    return pd.read_csv(filename, sep='\t', names=columns)


# Registration log.
register = _read_log('user_register_log.txt',
                     ['user_id', 'register_day', 'register_type', 'device_type'])
# App launch log. The data covers one shared 30-day window, not 30 days per user.
launch = _read_log('app_launch_log.txt',
                   ['user_id', 'launch_day'])
# Video creation log.
create = _read_log('video_create_log.txt',
                   ['user_id', 'create_day'])
# User activity log.
activity = _read_log('user_activity_log.txt',
                     ['user_id', 'act_day', 'page', 'video_id', 'author_id', 'act_type'])

拿到的是同一个30天时间窗口内的数据。这里有个问题：用户可能从第10天才注册进来，因此需要以每个用户的注册日为起点对齐其序列（序列长度 = 31 - 注册日）。

In [3]:
register.head()  # user_id: user id; register_day: registration day; register_type: acquisition channel; device_type: device info

Unnamed: 0,user_id,register_day,register_type,device_type
0,744025,1,1,283
1,1270299,1,1,259
2,571220,1,1,2
3,1308501,1,0,23
4,745554,1,2,0


1.注册日志
<img src="assets/20201129184929573.png" width="70%">

In [4]:
launch.head()  # launch_day: day of an app launch; e.g. user 330986 below launched on days 4, 9, 11 and 12

Unnamed: 0,user_id,launch_day
0,383135,1
1,330986,4
2,330986,9
3,330986,11
4,330986,12


2.APP启动日志
<img src="assets/20201129185429573.png" width="70%">

In [5]:
create.head()  # create_day: day on which a video was created

Unnamed: 0,user_id,create_day
0,720497,1
1,720497,1
2,720497,1
3,1075211,6
4,1075211,12


3.视频创建日志表
<img src="assets/20201129185629573.png" width="70%">

In [6]:
activity.head()  # act_day: day of the action; page: page where it happened; video_id: video id; author_id: author id; act_type: kind of action

Unnamed: 0,user_id,act_day,page,video_id,author_id,act_type
0,1062323,22.0,3.0,2877472.0,880271.0,0.0
1,639898,17.0,3.0,740662.0,210200.0,0.0
2,1260200,5.0,3.0,3332414.0,162866.0,0.0
3,817201,22.0,3.0,1129617.0,530246.0,0.0
4,817201,23.0,3.0,1129617.0,530246.0,0.0


4.用户行为日志表
<img src="assets/20201129185529573.png" width="70%">

### 整体模型框架
这里使用神经网络RNN来完成

用户进来的时间，可能是第一天，也可能是第二天等。当用户进来后（比如第7天），用户就有了数据，从第7天开始，第8天、第9天……用户的数据维度都是一致的。也就是把用户进来的那天当做该用户自己的t1，第二天是t2，以此类推到tn，用来预测该用户未来某一天活跃的可能性，比如用第一天(t1)的数据预测7天后(t1+7)的情况。
<img src="assets/20201129190029573.png" width="70%">
目前是没有标签的，需要自己打标签，比如预测第7天，就给每个客户的ti+7天打上标签，用来预测，0表示没登录，1表示登录

### 构建用户特征

In [8]:
"""计算序列长度： 持续时间 = 数据总时间 - 注册时间"""
# Sequence length of a user = 31 - register_day, i.e. a user registered on
# day 1 gets 30 rows and a user registered on day 30 gets 1 row.
register['seq_length'] = -register['register_day'] + 31
register.head()

Unnamed: 0,user_id,register_day,register_type,device_type,seq_length
0,744025,1,1,283,30
1,1270299,1,1,259,30
2,571220,1,1,2,30
3,1308501,1,0,23,30
4,745554,1,2,0,30


In [9]:
"""
根据前面的记录天数，创建一个以时间为key的字典
如果用户只有20天数据，则后面10天为None，如果取10，而用户超过了10，那么只取前10
"""
# Bucket user ids by sequence length: user_queue[k] lists the users whose
# sequence is k days long. Keys 1..30 are created up front so every possible
# length is present even when no user has it.
user_queue = {length: [] for length in range(1, 31)}

# Read the two columns by label instead of `row[0]` / `row[-1]`: integer keys on
# a label-indexed Series rely on a positional fallback that newer pandas
# deprecates, and iterating the columns directly is much faster than iterrows().
for uid, n_days in zip(register['user_id'], register['seq_length']):
    user_queue[n_days].append(uid)

In [10]:
# Display the buckets: sequence length -> list of user ids.
user_queue

{1: [355948,
  1141243,
  885314,
  473037,
  1161976,
  1105249,
  1034711,
  444663,
  629789,
  684292,
  1220188,
  38479,
  1166260,
  739798,
  1023596,
  731282,
  759110,
  327708,
  1102438,
  466836,
  496107,
  1222898,
  166312,
  236360,
  473209,
  91356,
  393059,
  439146,
  956525,
  49020,
  1306827,
  1198470,
  857247,
  316571,
  136844,
  691785,
  1272376,
  265670,
  560000,
  757319,
  1133544,
  822853,
  979976,
  664211,
  480676,
  277894,
  919793,
  107804,
  1191738,
  1095109,
  431613,
  153984,
  398413,
  612132,
  356852,
  146009,
  78955,
  335070,
  719818,
  560879,
  541350,
  1138509,
  38011,
  304986,
  1289748,
  864223,
  642020,
  771941,
  307046,
  774769,
  1254790,
  1308445,
  1296278,
  817006,
  761905,
  663294,
  1170617,
  272899,
  238407,
  937861,
  1021300,
  1150167,
  118095,
  1218437,
  1035477,
  343686,
  720640,
  546495,
  174056,
  478759,
  1285841,
  707129,
  833944,
  499087,
  320094,
  621343,
  309432,
  5366

In [11]:
"""定义一个user_seq类"""
class user_seq:
    """Per-user daily feature matrix, indexed by days since registration.

    Row 0 corresponds to the registration day, row 1 to the day after, and so
    on. Every row holds `n_feagures` feature values for that day.
    """

    def __init__(self, register_day, seq_length, n_feagures):
        """
        register_day: day on which the user registered; used as the offset
            that maps an absolute day to a row index
        seq_length: number of days tracked for this user (number of rows)
        n_feagures: number of features per day (number of columns); the
            misspelled name is kept because callers pass it by keyword
        """
        self.register_day = register_day
        self.seq_length = seq_length
        # days x features matrix, zero-initialised and filled in by put_feature
        self.array = np.zeros([self.seq_length, n_feagures])
        # column 0 flags the registration day itself
        self.array[0, 0] = 1
        self.page_rank = np.zeros([self.seq_length])
        self.pointer = 1

    def put_feature(self, feature_number, string):
        """Set column `feature_number` to 1 for every day listed in `string`.

        string: comma-separated "day:value" pairs, e.g. "13:1,14:2". Only the
            day is used; the feature is stored as a 0/1 indicator.

        NOTE(review): a day earlier than register_day produces a negative row
        index, which numpy wraps to the end of the array. This assumes the logs
        never contain days before the registration day; confirm.
        """
        for item in string.split(','):
            day, _ = item.split(':')
            # rows are counted from the registration day
            self.array[int(day) - self.register_day, feature_number] = 1

    def put_PR(self, string):
        """Store a per-day value from "day:value" pairs into page_rank.

        The original unpacked into `pop` but indexed with an undefined `pos`
        and split on ';', so every call raised. The pairs are now parsed with
        the same "day:value" format that put_feature uses.
        """
        for item in string.split(','):
            day, value = item.split(':')
            self.page_rank[int(day) - self.register_day] = float(value)

    def get_array(self):
        """Return the days x features matrix (the live array, not a copy)."""
        return self.array

    def get_label(self):
        """Return the per-day labels: 1 if the user is active in the next 7 days.

        The result is an object array of length seq_length. The last 7 entries
        (all entries when seq_length <= 7) stay None because no full 7-day
        look-ahead window exists for them.
        """
        self.label = np.array([None] * self.seq_length)
        # A day counts as active when any of the first 10 feature columns is set.
        active = self.array[:, :10].sum(axis=1)
        for i in range(self.seq_length - 7):
            # look at days i+1 .. i+7, excluding day i itself
            self.label[i] = 1 * (np.sum(active[i + 1:i + 8]) > 0)
        return self.label

In [12]:
"""创建用户的记录矩阵"""
n_features = 12
# One user_seq per registered user, keyed by user_id. Columns are read by label
# rather than `row[0]` / `row[1]` / `row[-1]`: integer keys on a label-indexed
# Series rely on a positional fallback that newer pandas deprecates, and
# iterating the columns avoids the per-row overhead of iterrows().
data = {
    uid: user_seq(register_day=reg_day, seq_length=n_days, n_feagures=n_features)
    for uid, reg_day, n_days in zip(register['user_id'],
                                    register['register_day'],
                                    register['seq_length'])
}

In [13]:
data  # one user_seq object per user; their matrices are still all zeros except the registration flag at [0, 0]

{744025: <__main__.user_seq at 0x1ee11bfc748>,
 1270299: <__main__.user_seq at 0x1ee11bfc630>,
 571220: <__main__.user_seq at 0x1ee11bfc940>,
 1308501: <__main__.user_seq at 0x1ee11bfc6d8>,
 745554: <__main__.user_seq at 0x1ee11bfc550>,
 1031012: <__main__.user_seq at 0x1ee11bfca58>,
 913297: <__main__.user_seq at 0x1ee11bfcfd0>,
 266500: <__main__.user_seq at 0x1ee11bfcc18>,
 475120: <__main__.user_seq at 0x1ee11bfcef0>,
 547944: <__main__.user_seq at 0x1ee11bfcf60>,
 916655: <__main__.user_seq at 0x1ee11760668>,
 719262: <__main__.user_seq at 0x1ee11760518>,
 1026175: <__main__.user_seq at 0x1ee117604a8>,
 1140342: <__main__.user_seq at 0x1ee11760160>,
 688100: <__main__.user_seq at 0x1ee117600f0>,
 1342459: <__main__.user_seq at 0x1ee11760080>,
 926263: <__main__.user_seq at 0x1ee11760b38>,
 40710: <__main__.user_seq at 0x1ee117603c8>,
 246954: <__main__.user_seq at 0x1ee11760438>,
 153579: <__main__.user_seq at 0x1ee11760550>,
 161418: <__main__.user_seq at 0x1ee11760a20>,
 649526:

In [15]:
# Give every launch record a count of 1 so that several launches by the same
# user on the same day add up when summed.
launch['launch'] = 1
launch_table = launch.groupby(['user_id', 'launch_day'], as_index=False)['launch'].sum()
launch_table.head()  # launch_day: day; launch: number of launches on that day

Unnamed: 0,user_id,launch_day,launch
0,16,13,1
1,16,14,1
2,16,15,1
3,16,18,1
4,16,19,1


In [16]:
def record_to_sequence(table):
    """Collapse per-day records into one "day:value,day:value,..." string per user.

    table: DataFrame with exactly three columns, interpreted positionally as
        (user id, day, value); the column names themselves do not matter.

    Returns a new DataFrame with columns ['user_id', 'string'], one row per
    user, with the pairs ordered by day. The input frame is left untouched
    (the original renamed, sorted and extended the caller's frame in place).
    """
    records = table.copy()
    records.columns = ['user_id', 'day', 'value']
    # order each user's records chronologically before joining them
    records = records.sort_values(by=['user_id', 'day'])
    # e.g. day 13 with value 1 becomes "13:1"
    records['string'] = records['day'].map(str) + ':' + records['value'].map(str)
    return records.groupby(['user_id'], as_index=False).agg(
        {'string': lambda pairs: ','.join(pairs)})

In [17]:
launch_table=record_to_sequence(launch_table)
launch_table.head()  # resulting per-user sequence strings

Unnamed: 0,user_id,string
0,16,"13:1,14:1,15:1,18:1,19:1,20:1,21:1,22:1,23:1"
1,30,24:1
2,98,16:1
3,105,"12:1,14:1,15:1,16:1,17:1,18:1,19:1,20:1,21:1,2..."
4,176,"27:1,28:1,29:1,30:1"


将用户数据填充到对应的位置：行号 = 行为发生的日期 - 注册日期。如ID=16的用户在第13、14、15...天有登录记录，就会在其特征表中相对注册日偏移的对应行进行填充

In [18]:
# Fill feature column 1 (app launches) of every user from the launch sequences.
# Columns are read by label: `row[0]` / `row[1]` on a label-indexed Series rely
# on a positional fallback that newer pandas deprecates, and iterrows() is slow.
for uid, days in zip(launch_table['user_id'], launch_table['string']):
    data[uid].put_feature(1, days)

**创作视频信息**

In [19]:
# Count the videos created per user and day, then mark the days in column 2.
create['create'] = 1
create_table = create.groupby(['user_id', 'create_day'], as_index=False).agg({'create': 'sum'})
create_table = record_to_sequence(create_table)
# Column 0 holds the registration flag and column 1 the launches, so video
# creation goes into column 2. Columns are read by label instead of the
# deprecated positional `row[0]` / `row[1]` access on an iterrows() Series.
for uid, days in zip(create_table['user_id'], create_table['string']):
    data[uid].put_feature(2, days)

**用户使用时行为特征，例如点赞、转发等**

分别对不同行为进行统计，构建6种不同行为特征

In [20]:
# One feature column per action type: act_type 0..5 goes to columns 3..8.
for i in range(6):
    # the groupby below builds a new frame, so the filtered slice needs no copy
    act = activity[activity.act_type == i]
    act = act.groupby(['user_id', 'act_day'], as_index=False).agg({'video_id': 'count'})
    act = record_to_sequence(act)
    # Read the columns by label instead of the deprecated positional
    # `row[0]` / `row[1]` access on an iterrows() Series.
    for uid, days in zip(act['user_id'], act['string']):
        data[uid].put_feature(i + 3, days)

**产生行为的界面信息**

In [21]:
# Page feature: range(1) means only page 0 is used, stored in column 9.
for i in range(1):
    # the groupby below builds a new frame, so the filtered slice needs no copy
    act = activity[activity.page == i]
    act = act.groupby(['user_id', 'act_day'], as_index=False).agg({'video_id': 'count'})
    act = record_to_sequence(act)
    # Read the columns by label instead of the deprecated positional
    # `row[0]` / `row[1]` access on an iterrows() Series.
    for uid, days in zip(act['user_id'], act['string']):
        data[uid].put_feature(i + 9, days)

**用户观看其他用户作品的信息**

In [22]:
# Registered user ids, renamed so the merge below joins on author_id (the only
# column shared with the activity log).
watched = register[['user_id']].rename(columns={'user_id': 'author_id'})
# Keep actions on works of other users (author differs from the acting user)
# whose author is a registered user.
watched = pd.merge(watched, activity[activity.author_id != activity.user_id], how='inner')
# NOTE(review): the count is grouped by the acting user_id, not by author_id.
watched = watched.groupby(['user_id', 'act_day'], as_index=False).agg({'video_id': 'count'})
watched = record_to_sequence(watched)
# Read the columns by label instead of the deprecated positional
# `row[0]` / `row[1]` access on an iterrows() Series.
for uid, days in zip(watched['user_id'], watched['string']):
    data[uid].put_feature(10, days)

**用户观看自己作品的信息**

In [23]:
# Actions where author_id equals user_id are users acting on their own works.
# The base frame is rebuilt from `register`: the original merged the `watched`
# frame left over from the previous cell, which had already been collapsed to
# (user_id, string), so users without activity on other authors' works were
# silently dropped from this feature.
watched = register[['user_id']].rename(columns={'user_id': 'author_id'})
watched = pd.merge(watched, activity[activity.author_id == activity.user_id], how='inner')
watched = watched.groupby(['user_id', 'act_day'], as_index=False).agg({'video_id': 'count'})
watched = record_to_sequence(watched)
# Read the columns by label instead of the deprecated positional
# `row[0]` / `row[1]` access on an iterrows() Series.
for uid, days in zip(watched['user_id'], watched['string']):
    data[uid].put_feature(11, days)

特征已构建完成，这里是用的RNN网络，所以构建矩阵，提取序列

如果是用lgb或者xgb，则是DataFrame类型，做一些统计类型的特征

###  标签制作

活跃用户定义为（自己定义的）：在未来7天内使用过APP

从用户注册开始进行统计，对于每1天的数据展开，如果其后7天内（第i+1天到第i+7天）有行为产生，则标签为1

In [24]:
# Label sequence of every user: user_id -> array returned by get_label().
label = dict((uid, seq.get_label()) for uid, seq in data.items())

In [25]:
label  # 1 = active at least once during the following 7 days, 0 = not active; None = no full 7-day window left

{744025: array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, None, None, None, None, None, None, None], dtype=object),
 1270299: array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, None, None, None, None, None, None, None], dtype=object),
 571220: array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, None, None, None, None, None, None, None], dtype=object),
 1308501: array([1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, None, None, None, None, None, None, None], dtype=object),
 745554: array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, None, None, None, None, None, None, None], dtype=object),
 1031012: array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, None, None, None, None, None, None, None], dtype=object),
 913297: array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1

用户特征数据即为之前提取的data中各项特征，转换为ndarray即可

In [26]:
# Replace every user_seq object by its plain feature matrix: user_id -> ndarray.
data = dict((uid, seq.get_array()) for uid, seq in data.items())

In [27]:
# Display the per-user feature matrices (one row per day, one column per feature).
data

{744025: array([[1., 1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      

### 网络训练模块

In [2]:
# 导入封装好的包，上面方法的集合
from deep_tools import f
from deep_tools import DataGenerator

构建RNN网络模型

In [3]:
# Reload the four tab-separated logs; the column names are supplied explicitly.
register = pd.read_csv(
    'user_register_log.txt',
    sep='\t',
    names=['user_id', 'register_day', 'register_type', 'device_type'],
)
launch = pd.read_csv(
    'app_launch_log.txt',
    sep='\t',
    names=['user_id', 'launch_day'],
)
create = pd.read_csv(
    'video_create_log.txt',
    sep='\t',
    names=['user_id', 'create_day'],
)
activity = pd.read_csv(
    'user_activity_log.txt',
    sep='\t',
    names=['user_id', 'act_day', 'page', 'video_id', 'author_id', 'act_type'],
)

# DataGenerator (imported from deep_tools) receives the four raw tables.
data_generator = DataGenerator(register, launch, create, activity)

In [4]:
"""
tf.placeholder：
    在神经网络构建graph的时候在模型中的占位，此时没有把数据传入模型，只会分配必要的内存。运行模型的时候通过feed_dict()函数向占位符喂入数据。
    dtype：数据类型。常用的是tf.float32,tf.float64等数值类型
    shape：数据形状。默认是None（一维），也可以是多维
    name：名称
tf.get_variable：
    创建新的tensorflow变量
    第一列：名称
    shape：变量的形状
"""
# Build the TF1 graph: a GRU over the per-day feature sequence followed by a
# linear layer that emits one logit per (sample, time step).
n_features = 12
n_hu = 8
with tf.variable_scope('train'):     # tf.variable_scope sets the scope the variables are created in
    
    # Variables and inputs
    lr = tf.placeholder(tf.float32, [], name='learning_rate')    # scalar learning rate, fed at run time
    
    # Hidden-to-output parameters W and b; n_hu is the number of hidden units
    W_out = tf.get_variable('W_out', [n_hu, 1])
    b_out = tf.get_variable('b_out', [1])
    
    # x: (batch_size, seq_length, n_features); y has one value per (sample, time step)
    x = tf.placeholder(tf.float32, [None, None, n_features])
    y = tf.placeholder(tf.float32, [None, None])
    
    # Dynamic batch size and sequence length, read from the shape of x at run time
    batch_size = tf.shape(x)[0]
    seq_length = tf.shape(x)[1]
    
    # RNN layer
    cell = tf.nn.rnn_cell.GRUCell(n_hu)     # n_hu is the number of units inside the GRU cell
    initial_state = cell.zero_state(batch_size, dtype=tf.float32)   # start every sequence from an all-zero state
    outputs, state = tf.nn.dynamic_rnn(cell, x, initial_state=initial_state)  # dynamic RNN over the time axis
    
    # Output layer: flatten to (batch*time, n_hu), apply the linear map,
    # then restore the (batch_size, seq_length) layout.
    outputs = tf.reshape(outputs, [-1, n_hu])
    logits = tf.matmul(outputs, W_out) + b_out
    logits = tf.reshape(logits, tf.stack([batch_size, seq_length]))

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [5]:
# Only part of the predictions and labels enters the training loss: the last
# 14 time steps are cut off (-14, or an even smaller index, works because the
# labels of the final days are None, as seen in the label output above).
train_steps = slice(None, -14)
logits_local_train = logits[:, train_steps]
label_local_train = y[:, train_steps]

In [6]:
# Loss function
regularizer = tf.contrib.layers.l2_regularizer(0.00001)  # L2 regularisation term
penalty = tf.contrib.layers.apply_regularization(regularizer, tf.trainable_variables())

# sigmoid_cross_entropy: binary classification (0 or 1) on the training slice, plus the L2 penalty
obj_local = tf.losses.sigmoid_cross_entropy(label_local_train, logits_local_train) + penalty
optimizer = tf.train.AdamOptimizer(lr)
set_local = optimizer.minimize(obj_local)

# Slice of predictions and labels used for the test evaluation
logits_local_test = logits[:, -8]  # evaluate on the 8th-from-last time step; another single day could be chosen instead
label_local_test = y[:, -8]

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [7]:
def train(n_obs=1000, step=1000, lr_feed=0.01):
    """Run `step` optimisation steps on batches drawn from `data_generator`.

    n_obs: number of samples requested per batch (passed to next_batch)
    step: number of training iterations
    lr_feed: learning rate fed to the `lr` placeholder

    Uses the module-level `sess`, `data_generator` and graph tensors, and
    returns nothing.
    """
    # Only the train op and the loss are fetched. The original also fetched the
    # label and logit tensors on every step without using them, and built a
    # `date_seq` list that was never read.
    fetches = [set_local, obj_local]

    for _ in range(step):
        # next_batch yields (length, id_list, data_x, data_y); only the
        # arrays are needed for a training step.
        _, _, data_x, data_y = data_generator.next_batch(n_obs)
        sess.run(fetches, feed_dict={x: data_x, y: data_y, lr: lr_feed})

In [8]:
# Open a session and initialise all global variables.
sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)

In [9]:
train(n_obs=1000, step=2000, lr_feed=0.01)  # run 2000 training steps

In [10]:
def test():
    """Report the training loss and the test score of the current model.

    Prints the weighted average training loss and the value of `f(result)`,
    then returns `result`, a DataFrame with columns user_id, prob and label
    for the test set. `f` comes from deep_tools; presumably it computes the
    competition score, confirm there.
    """
    # Number of trailing time steps excluded from the training loss; this has
    # to stay in sync with the [:, :-14] slices of the training tensors.
    n_NA = 14

    obs_count, cum_loss = 0, 0
    user, prob, real = [], [], []

    # Training loss: average of the batch losses, weighted by the number of
    # (sample, time step) pairs that enter each batch loss. The unused
    # `correct` accumulator of the original has been dropped.
    for length, id_list, data_x, data_y in zip(*data_generator.get_set('train')):
        _obj = sess.run(obj_local, feed_dict={x: data_x, y: data_y, lr: 0.001})
        weight = (length - n_NA) * len(id_list)
        obs_count += weight
        cum_loss += _obj * weight

    # Test predictions: turn the logits of the evaluated time step into
    # probabilities with the sigmoid function.
    for length, id_list, data_x, data_y in zip(*data_generator.get_set('test')):
        _logits_test, _label_test = sess.run(
            [logits_local_test, label_local_test],
            feed_dict={x: data_x, y: data_y, lr: 0.001})
        real += list(_label_test)
        user += list(id_list)
        prob += list(1 / (1 + np.exp(-_logits_test.reshape([-1]))))

    # An empty training set yields nan instead of a ZeroDivisionError.
    train_loss = cum_loss / obs_count if obs_count else float('nan')
    print('train_loss', train_loss)

    result = pd.DataFrame({'user_id': user, 'prob': prob, 'label': real})
    print('test_score:', f(result))

    return result

# Evaluate the trained model; the returned DataFrame is shown as the cell output.
test()

train_loss 0.4713183719032424
test_score: [0.801764907923591, 0.8010107727091368, 0.8010460027751093, 0.8007391933151212, 0.7995341152252228, 0.7991629524948364]


Unnamed: 0,user_id,prob,label
0,1274576,0.796225,1.0
1,109973,0.703245,1.0
2,134299,0.752404,0.0
3,1005835,0.673118,1.0
4,864582,0.700182,1.0
...,...,...,...
37441,74058,0.036112,0.0
37442,1096813,0.984987,1.0
37443,1091221,0.989766,1.0
37444,230821,0.981827,1.0


#### 评估标准：
<img src="assets/20201129210029573.png" width="100%">

可以看到上面的test_score（F1 Score）很高，也是比赛中非常接近冠军的值

### 总结：
构建数据之前，要先考虑清楚模型的选取。如果选择RNN网络，构建的数据应该是一个序列，形状为 t × f：t是时间步长（本例中即从用户注册日起算的天数），f是每个时间步对应的特征。确定了t之后，只需要把每天的特征f构建好就可以训练了。