parent
2d9f18ea8a
commit
d087770fcf
@ -0,0 +1,6 @@
|
||||
# 机器学习竞赛实战-优胜解决方案
|
||||
|
||||
### 快手短视频用户活跃度分析
|
||||
|
||||
请跳转至notebook
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,180 @@
|
||||
# coding: utf-8
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from random import shuffle
|
||||
|
||||
def f(table, name='prob', thresholds=(0.40, 0.41, 0.42, 0.43, 0.44, 0.45)):
    """Return the F1 score of thresholded predictions at several cut-offs.

    A row counts as predicted-positive when ``table[name]`` is strictly
    greater than the threshold, and as a true positive when it is also
    labelled ``1`` in ``table['label']``.

    Args:
        table: DataFrame holding a ``label`` column and the score column
            ``name``. It is only read, never modified.
        name: Name of the column holding the predicted scores.
        thresholds: Iterable of cut-offs to evaluate. Defaults to the six
            values 0.40 .. 0.45.

    Returns:
        A list with one F1 value per threshold, in the order given. A
        threshold with no true positives scores ``0.0``.
    """
    scores = table[name]
    is_positive = table['label'] == 1
    # Recall denominator: the sum of the label column, i.e. the number of
    # positives when labels are 0/1.
    n_actual = table['label'].sum()

    result = []
    for threshold in thresholds:
        predicted = scores > threshold
        n_predicted = int(predicted.sum())
        true_pos = int((predicted & is_positive).sum())
        # With no true positives precision and recall are both 0 (or 0/0),
        # so the harmonic mean would come out as NaN; report 0.0 instead.
        if true_pos == 0 or n_predicted == 0 or n_actual == 0:
            result.append(0.0)
            continue
        precision = true_pos / n_predicted
        recall = true_pos / n_actual
        result.append(float(2 * precision * recall / (precision + recall)))
    return result
|
||||
|
||||
def record_to_sequence(table):
    """Collapse (user, day, value) records into one string per user.

    The three columns of ``table`` are interpreted positionally as
    ``user_id``, ``day`` and ``value``. Each record becomes a
    ``"day:value"`` token, and a user's tokens are joined with commas in
    ascending day order.

    Args:
        table: DataFrame with exactly three columns (user, day, value).
            It is left untouched; all work happens on a copy.

    Returns:
        DataFrame with columns ``user_id`` and ``string``, one row per user.
    """
    # Copy first: renaming, sorting and adding the helper column must not
    # leak back into the caller's frame.
    records = table.copy()
    records.columns = ['user_id', 'day', 'value']
    records = records.sort_values(by=['user_id', 'day'])
    records['string'] = records['day'].map(str) + ':' + records['value'].map(str)
    return records.groupby(['user_id'], as_index=False).agg(
        {'string': lambda tokens: ','.join(tokens)}
    )
|
||||
|
||||
class user_seq:
    """Day-by-day feature matrix for a single user.

    Row ``k`` of ``array`` corresponds to day ``register_day + k``; columns
    are feature slots. Slot 0 of the first row is set to 1 on construction.
    """

    def __init__(self, register_day, seq_length, n_features):
        """Allocate zeroed storage for ``seq_length`` days.

        Args:
            register_day: Day number that maps to row 0.
            seq_length: Number of rows (days) to allocate.
            n_features: Number of feature columns.
        """
        self.register_day = register_day
        self.seq_length = seq_length
        self.array = np.zeros([self.seq_length, n_features])
        self.array[0, 0] = 1
        self.page_rank = np.zeros([self.seq_length])
        self.pointer = 1

    def _tokens(self, string):
        """Yield ``(row_index, raw_value)`` for each ``day:value`` token."""
        for token in string.split(','):
            day, raw_value = token.split(':')
            yield int(day) - self.register_day, raw_value

    def put_feature(self, feature_number, string):
        """Flag column ``feature_number`` with 1 on every day listed.

        Args:
            feature_number: Column index to write.
            string: Comma-separated ``day:value`` tokens. Only the day part
                is used; the value part is parsed but not stored.
        """
        for row, _ in self._tokens(string):
            self.array[row, feature_number] = 1

    def put_PR(self, string):
        """Store the value of each ``day:value`` token in ``page_rank``."""
        for row, raw_value in self._tokens(string):
            self.page_rank[row] = raw_value

    def get_array(self):
        """Return the feature matrix."""
        return self.array

    def get_label(self):
        """Build, store and return the per-day label vector.

        Entry ``i`` is 1 when any of the first ten feature columns is
        non-zero in total over the seven rows after ``i``, otherwise 0.
        The last seven entries, which have no full window after them,
        stay ``None``.
        """
        labels = np.array([None] * self.seq_length)
        daily_total = self.array[:, :10].sum(axis=1)
        for day in range(self.seq_length - 7):
            window = daily_total[day + 1:day + 8]
            labels[day] = 1 * (np.sum(window) > 0)
        self.label = labels
        return self.label
|
||||
|
||||
|
||||
class DataGenerator:
    """Build per-user daily feature sequences and serve them by length.

    Users are bucketed by ``seq_length = 31 - register_day`` so that every
    batch contains sequences of one common length.

    Feature columns written by the constructor (12 in total):
        0      set by ``user_seq`` itself on construction
        1      days with a launch record
        2      days with a create record
        3-8    days with an activity of ``act_type`` 0-5
        9      days with an activity on ``page`` 0
        10     days on which another user acted on this user's videos
        11     days on which the user acted on their own videos
    """

    def __init__(self, register, launch, create, activity):
        """Assemble the feature and label dictionaries.

        Args:
            register: DataFrame with at least ``user_id`` and
                ``register_day`` columns.
            launch: DataFrame with ``user_id`` and ``launch_day`` columns.
            create: DataFrame with ``user_id`` and ``create_day`` columns.
            activity: DataFrame with ``user_id``, ``act_day``, ``act_type``,
                ``page``, ``video_id`` and ``author_id`` columns.

        Raises:
            KeyError: If a derived ``seq_length`` is outside 1..30, or a
                user appearing in launch/create/activity is not registered.
        """
        # Work on copies so the helper columns added below never reach the
        # caller's frames.
        register = register.copy()
        launch = launch.copy()
        create = create.copy()
        activity = activity.copy()

        # user_queue: seq_length -> list of user_ids with that many days.
        # NOTE(review): 31 presumably reflects a 30-day window - confirm.
        register['seq_length'] = 31 - register['register_day']
        self.user_queue = {length: [] for length in range(1, 31)}

        # Initialise self.data: user_id -> user_seq.
        # Columns are read by name instead of by position on an iterrows()
        # Series; positional Series lookup is deprecated in pandas, and
        # iterrows() upcasts each row to a common dtype.
        n_features = 12
        self.data = {}
        registered = zip(
            register['user_id'].values,
            register['register_day'].values,
            register['seq_length'].values,
        )
        for user_id, register_day, seq_length in registered:
            self.user_queue[seq_length].append(user_id)
            self.data[user_id] = user_seq(
                register_day=register_day,
                seq_length=seq_length,
                n_features=n_features,
            )

        # Extract launch_seq.
        launch['launch'] = 1
        launch_table = launch.groupby(
            ['user_id', 'launch_day'], as_index=False).agg({'launch': 'sum'})
        self._put_sequences(launch_table, 1)

        # Extract create_seq.
        create['create'] = 1
        create_table = create.groupby(
            ['user_id', 'create_day'], as_index=False).agg({'create': 'sum'})
        self._put_sequences(create_table, 2)

        # Extract act_seq: one feature column per act_type 0..5.
        for act_type in range(6):
            act = activity[activity.act_type == act_type].copy()
            act = act.groupby(
                ['user_id', 'act_day'], as_index=False).agg({'video_id': 'count'})
            self._put_sequences(act, act_type + 3)

        # Extract page_seq: only page 0 is used.
        for page in range(1):
            act = activity[activity.page == page].copy()
            act = act.groupby(
                ['user_id', 'act_day'], as_index=False).agg({'video_id': 'count'})
            self._put_sequences(act, page + 9)

        # Extract watched: registered users whose videos received activity
        # from somebody else. The merge joins on the shared author_id column.
        watched = register.loc[:, ['user_id']].copy()
        watched.columns = ['author_id']
        watched = pd.merge(
            watched, activity[activity.author_id != activity.user_id], how='inner')
        watched = watched.groupby(
            ['author_id', 'act_day'], as_index=False).agg({'video_id': 'count'})
        self._put_sequences(watched, 10)

        # Extract watched by self: activity on the user's own videos.
        self_watched = activity[activity.author_id == activity.user_id].copy()
        self_watched = self_watched.groupby(
            ['user_id', 'act_day'], as_index=False).agg({'video_id': 'count'})
        self._put_sequences(self_watched, 11)

        # Extract labels first (they are derived from the filled matrices),
        # then replace each user_seq with its plain feature array.
        self.label = {user_id: user.get_label() for user_id, user in self.data.items()}
        self.data = {user_id: user.get_array() for user_id, user in self.data.items()}

        # Sampling strategy: a length L occurs (L - 14) times in the local
        # list and (L - 7) times in the online list, so longer sequences
        # are drawn proportionally more often.
        self.local_random_list = []
        for length in range(15, 31):
            self.local_random_list += [length] * (length - 14)

        self.online_random_list = []
        for length in range(8, 31):
            self.online_random_list += [length] * (length - 7)

        self.local_train_list = list(range(15, 31))
        self.local_test_list = list(range(8, 31))
        self.online_train_list = list(range(8, 31))
        self.online_test_list = list(range(1, 31))

        self.pointer = {length: 0 for length in range(1, 31)}

    def _put_sequences(self, table, feature_number):
        """Write one grouped (user, day, count) table into a feature column."""
        sequences = record_to_sequence(table)
        # Plain tuples: first field is the user id, second the
        # "day:value,..." string produced by record_to_sequence.
        for user_id, string in sequences.itertuples(index=False, name=None):
            self.data[user_id].put_feature(feature_number, string)

    def reset_pointer(self):
        """Rewind the batch cursor of every sequence length to 0."""
        self.pointer = {length: 0 for length in range(1, 31)}

    def next_batch(self, batch_size=1000):
        """Return the next training batch for a randomly drawn length.

        The length is drawn uniformly from ``local_random_list``. The number
        of users taken is ``batch_size // (seq_length - 14) + 1``. When the
        queue for that length cannot supply a full batch from the current
        cursor, the cursor is rewound and the queue reshuffled first.

        Args:
            batch_size: Nominal batch size before scaling by length.

        Returns:
            Tuple ``(seq_length, user_matrix, data_matrix, label_matrix)``
            of the drawn length and arrays of user ids, feature matrices
            and label vectors for the selected users.
        """
        seq_length = self.local_random_list[np.random.randint(len(self.local_random_list))]
        batch_size = batch_size // (seq_length - 14) + 1

        queue = self.user_queue[seq_length]
        if self.pointer[seq_length] + batch_size > len(queue):
            self.pointer[seq_length] = 0
            shuffle(queue)

        start = self.pointer[seq_length]
        user_list = queue[start:start + batch_size]
        self.pointer[seq_length] += batch_size

        user_matrix = np.array(user_list)
        data_matrix = np.array([self.data[user_id] for user_id in user_list])
        label_matrix = np.array([self.label[user_id] for user_id in user_list])

        return seq_length, user_matrix, data_matrix, label_matrix

    def get_set(self, usage='train'):
        """Return every user of the train or test length range.

        Args:
            usage: ``'train'`` selects ``local_train_list``; any other value
                selects ``local_test_list``.

        Returns:
            Tuple ``(lengths, user_list, data_list, label_list)``. The three
            lists are parallel to ``lengths`` and hold one array per
            sequence length.
        """
        if usage == 'train':
            test_list = self.local_train_list
        else:
            test_list = self.local_test_list

        user_list = [np.array(self.user_queue[seq_length]) for seq_length in test_list]
        data_list = [
            np.array([self.data[user_id] for user_id in self.user_queue[seq_length]])
            for seq_length in test_list
        ]
        label_list = [
            np.array([self.label[user_id] for user_id in self.user_queue[seq_length]])
            for seq_length in test_list
        ]
        return test_list, user_list, data_list, label_list
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue