In [1]:
import pandas as pd
import numpy as np
# import tensorflow as tf  # 深度学习框架 pip install tensorflow
import datetime
# from deep_tools import f  # conda install -c bioconda deeptools，pip方法不奏效
# from deep_tools import DataCenerator

In [3]:
"""读取数据集"""
register = pd.read_csv('user_register_log.txt', sep='\t', names=['user_id', 'register_day', 'register_type', 'device_type'])
launch = pd.read_csv('app_launch_log.txt', sep='\t', names=['user_id', 'launch_day'])
create = pd.read_csv('video_create_log.txt', sep='\t', names=['user_id', 'create_day'])
activity = pd.read_csv('user_activity_log.txt', sep='\t', names=['user_id', 'act_day', 'page', 'video_id', 'author_id', 'act_type'])

In [4]:
"""计算序列长度： 持续时间 = 数据总时间 - 注册时间"""
register['seq_length'] = 31 - register['register_day']
register.head()

Unnamed: 0,user_id,register_day,register_type,device_type,seq_length
0,744025,1,1,283,30
1,1270299,1,1,259,30
2,571220,1,1,2,30
3,1308501,1,0,23,30
4,745554,1,2,0,30


In [5]:
"""根据前面的记录天数，创建一个字典，来存储不同记录天数的用户到底有哪些 """
user_queue = {i: []  for i in range(1,31)}

for index, row in register.iterrows():  # 这个iterrows是对DataFrame进行行遍历，是在数据框中的行进行迭代的一个生成器，它返回每行的索引及一个包含行本身的对象。
    user_queue[row[-1]].append(row[0]) # row[-1]是seq_length, row[0]是user_id

In [8]:
"""定义一个user_seq类"""
class user_seq:
    
    def __init__(self, register_day, seq_length, n_feagures):
        """
            register_day: 用户第几天进行的登录
            seq_length: 用户序列的长度，就是记录了几天登录信息， 行数
            n_features: 每天提出的特征个数， 列数
        """
        self.register_day = register_day
        self.seq_length = seq_length
        self.array = np.zeros([self.seq_length, n_feagures])   # 这就是上面那个用户对应的矩阵形式，初始化位0
        self.array[0,0] = 1
        self.page_rank = np.zeros([self.seq_length])
        self.pointer = 1
    
    # 提取特征填入特征矩阵
    def put_feature(self, feature_number, string):
        for i in string.split(','):
            pos, value = i.split(':')     # 注册后的第几天进行了登录，1为指示符
            self.array[int(pos)-self.register_day, feature_number] = 1   # 从注册后开始记录
    
    def put_PR(self,string):
        for i in string.split(','):
            pop,value = i.split(';')
            self.page_rank[int(pos)-self.register_day] = value
    
    def get_array(self):
        return self.array
    
    # 得到标签  如果一个用户在未来七天活跃了，那么标记为1
    def get_label(self):
        self.label = np.array([None] * self.seq_length)    # 一个seq_length长度的数组
        active = self.array[:, :10].sum(axis=1)          # 这里选了一部分特征做了个sum，意思是不管是转发，登录，啥的，只要做了就算一次活动
        for i in range(self.seq_length-7):      # 这地方得控制一下，如果一个用户15-30的数据，那么我们标签最多只能到23天，因为30天之后的数据我们没有
            self.label[i] = 1 * (np.sum(active[i+1:i+8]) > 0)    # 这里对于当前的i，如果未来七天内活跃过，那么标签就是1
        return self.label

In [9]:
"""创建用户的记录矩阵"""
n_features = 12
data = {row[0]:user_seq(register_day=row[1], seq_length=row[-1],n_feagures=n_features) for index, row in register.iterrows()}

In [10]:
data

{744025: <__main__.user_seq at 0x1b780282ac8>,
 1270299: <__main__.user_seq at 0x1b780282668>,
 571220: <__main__.user_seq at 0x1b780282940>,
 1308501: <__main__.user_seq at 0x1b780282748>,
 745554: <__main__.user_seq at 0x1b780282160>,
 1031012: <__main__.user_seq at 0x1b7802823c8>,
 913297: <__main__.user_seq at 0x1b780282ba8>,
 266500: <__main__.user_seq at 0x1b780282978>,
 475120: <__main__.user_seq at 0x1b780282240>,
 547944: <__main__.user_seq at 0x1b780282a90>,
 916655: <__main__.user_seq at 0x1b780282a58>,
 719262: <__main__.user_seq at 0x1b7802826a0>,
 1026175: <__main__.user_seq at 0x1b780282358>,
 1140342: <__main__.user_seq at 0x1b7802825f8>,
 688100: <__main__.user_seq at 0x1b780282048>,
 1342459: <__main__.user_seq at 0x1b780282e80>,
 926263: <__main__.user_seq at 0x1b780282438>,
 40710: <__main__.user_seq at 0x1b780282400>,
 246954: <__main__.user_seq at 0x1b780282f28>,
 153579: <__main__.user_seq at 0x1b7802826d8>,
 161418: <__main__.user_seq at 0x1b7803c1278>,
 649526:

In [12]:
launch['launch'] = 1
launch_table = launch.groupby(['user_id', 'launch_day'], 
                              as_index=False).agg({'launch':'sum'})
launch_table.head()

Unnamed: 0,user_id,launch_day,launch
0,16,13,1
1,16,14,1
2,16,15,1
3,16,18,1
4,16,19,1


In [16]:
def record_to_sequence(table):
    table.columns=['user_id','day','value']
    table.sort_values(by=['user_id','day'],inplace=True)
    table['string']=table.day.map(str)+':'+table.value.map(str)
    table=table.groupby(['user_id'],as_index=False).agg({'string':lambda x:','.join(x)})
    return table

In [17]:
launch_table=record_to_sequence(launch_table)
launch_table.head()

Unnamed: 0,user_id,string
0,16,"13:1,14:1,15:1,18:1,19:1,20:1,21:1,22:1,23:1"
1,30,24:1
2,98,16:1
3,105,"12:1,14:1,15:1,16:1,17:1,18:1,19:1,20:1,21:1,2..."
4,176,"27:1,28:1,29:1,30:1"


In [19]:
for index,row in launch_table.iterrows():
    data[row[0]].put_feature(1,row[1])

创作视频信息

In [20]:
create['create']=1
create_table = create.groupby(['user_id','create_day'],as_index=False).agg({'create':'sum'})
create_table = record_to_sequence(create_table)
for index,row in create_table.iterrows():
    data[row[0]].put_feature(2,row[1])

用户行为数据