Style. Modify the name

master
benjas 4 years ago
parent 8549fbd1df
commit fdecfb456c

@@ -1,153 +1,153 @@
#coding=utf-8
"""
Author: Aigege
Code: https://github.com/AiIsBetter
"""
# date 2021.08.01
import numpy as np
import networkx as nx
import pandas as pd
from gem.embedding.node2vec import node2vec
import os
from utils import parallel_apply
from functools import partial
import gc
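
# link_id_find: for one order's link sequence, emit consecutive (link_id, next_link_id)
# pairs; these pairs later become extra directed edges of the road-network graph.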
def link_id_find(gr):
    gr_ = gr.copy()
    tmp = list(gr_['link_id'])
    link_id_tuple = []
    for i in range(len(tmp) - 1):
        link_id_tuple.append([tmp[i], tmp[i + 1]])
    return link_id_tuple
if __name__ == '__main__':
    root_path = '../data/giscup_2021/'
    nrows = None
    ###################################### nextlinks #######################################
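    # nextlinks.txt rows look like "from_id to_id1,to_id2,...": split the comma list
    # and repeat from_id so that each (from_id, to_id) edge becomes one row.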
    nextlinks = pd.read_csv(root_path + 'nextlinks.txt', sep=' ', header=None)
    nextlinks.columns = ['from_id', 'to_id']
    nextlinks['to_id'] = nextlinks['to_id'].astype('str')
    nextlinks['to_id'] = nextlinks['to_id'].apply(lambda x: x.split(","))
    nextlinks = pd.DataFrame({'from_id': nextlinks.from_id.repeat(nextlinks.to_id.str.len()),
                              'to_id': np.concatenate(nextlinks.to_id.values)})
    nextlinks['from_id'] = nextlinks['from_id'].astype(int)
    nextlinks['to_id'] = nextlinks['to_id'].astype(int)
    from_id = nextlinks['from_id'].unique()
    # nextlinks.to_csv('../data/giscup_2021/nextlink_all.csv',index=False)
    # nextlinks = pd.read_csv('../data/giscup_2021/nextlink_all.csv')
    ###################################### nextlinks #######################################
    if 'nextlinks_allday.csv' in os.listdir(root_path):
        nextlinks = pd.read_csv(root_path + 'nextlinks_allday.csv')
    else:
        nextlinks_new = []
        for name in os.listdir(root_path + 'train/'):
            data_time = name.split('.')[0]
            if data_time == '20200803':
                continue
            train = pd.read_csv(root_path + 'train/{}'.format(name), sep=';;', header=None, nrows=nrows)
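            # Each train record appears to hold three ';;'-separated fields: the order
            # head (space-separated scalars parsed below), the link sequence, and the
            # cross sequence (the cross field is consumed by the companion script).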
            train_head = pd.DataFrame(train[0].str.split(' ').tolist(),
                                      columns=['order_id', 'ata', 'distance', 'simple_eta', 'driver_id', 'slice_id'])
            train_head['order_id'] = train_head['order_id'].astype(str)
            train_head['ata'] = train_head['ata'].astype(float)
            train_head['distance'] = train_head['distance'].astype(float)
            train_head['simple_eta'] = train_head['simple_eta'].astype(float)
            train_head['driver_id'] = train_head['driver_id'].astype(int)
            train_head['slice_id'] = train_head['slice_id'].astype(int)
            data_link = train[[1]]
            print("flag:", 1)
            data_link['index'] = train_head.index
            data_link['order_id'] = train_head['order_id']
            print("flag:", 2)
            data_link['ata'] = train_head['ata']
            data_link['distance'] = train_head['distance']
            data_link['simple_eta'] = train_head['simple_eta']
            print("flag:", 3)
            data_link['slice_id'] = train_head['slice_id']
            print("flag:", 4)
            data_link_split = data_link[1].str.split(' ', expand=True).stack().to_frame()
            print("flag:", 5)
            data_link_split = data_link_split.reset_index(level=1, drop=True).rename(columns={0: 'link_info'})
            print("flag:", 6)
            data_link_split = data_link[['order_id', 'index', 'ata', 'distance', 'simple_eta', 'slice_id']].join(
                data_link_split)
            print("flag:", 7)
            data_link_split = data_link_split.reset_index(drop=True)
            data_link_split[['link_id',
                             'link_time',
                             'link_ratio',
                             'link_current_status',
                             'link_arrival_status']] = data_link_split['link_info'].str.split(':|,', 5, expand=True)
            print("flag:", 8)
            data_link_split = data_link_split[['order_id', 'link_id']]
            data_link_split['link_id'] = data_link_split['link_id'].astype(int)
            features = pd.DataFrame({'order_id': data_link_split['order_id'].unique()})
            groupby = data_link_split.groupby(['order_id'])
            func = partial(link_id_find)
            g = parallel_apply(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000)
            g = pd.DataFrame(g, columns=['from_id', 'to_id'])
            g = g.drop_duplicates()
            nextlinks_new.append(g)
        nextlinks_new = pd.concat(nextlinks_new, axis=0)
        nextlinks_new = nextlinks_new.drop_duplicates()
        nextlinks_new = nextlinks_new.sort_values(by='from_id').reset_index(drop=True)
        nextlinks = pd.concat([nextlinks, nextlinks_new], axis=0)
        nextlinks = nextlinks.drop_duplicates()
        nextlinks = nextlinks.sort_values(by='from_id').reset_index(drop=True)
        print('save all csv')
        nextlinks.to_csv(root_path + 'nextlinks_allday.csv', index=False)
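    # Edge weight: how often each from_id occurs in the edge list, i.e. its out-degree.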
    print('calculate weight')
    nextlinks = nextlinks.sort_values(by='from_id').reset_index(drop=True)
    nextlinks = nextlinks.drop_duplicates()
    from_id_weight = nextlinks['from_id'].value_counts()
    from_id_weight = from_id_weight.to_frame()
    from_id_weight['index'] = from_id_weight.index
    from_id_weight.columns = ['weight', 'from_id']
    nextlinks = pd.merge(nextlinks, from_id_weight, 'left', on=['from_id'])
    print('calculate weight finish!')
    nextlinks['to_id'] = nextlinks['to_id'].astype(int)
    nextlinks['from_id'] = nextlinks['from_id'].astype(int)
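    # Remap raw link ids to a dense 0..N-1 range for the graph library and save the
    # mapping, which the downstream feature scripts reload to look up embeddings.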
    id_key = list(set(nextlinks['from_id'].unique().tolist() + nextlinks['to_id'].unique().tolist()))
    id_key_to_connected = dict(zip(id_key, range(len(id_key))))
    nextlinks['from_id'] = nextlinks['from_id'].map(id_key_to_connected)
    nextlinks['to_id'] = nextlinks['to_id'].map(id_key_to_connected)
    np.save(root_path + 'id_key_to_connected_allday.npy', id_key_to_connected)
    print('id key save finish!')
    print('start creating graph')
    G = nx.DiGraph()
    from_id = nextlinks['from_id'].to_list()
    to_id = nextlinks['to_id'].to_list()
    weight = nextlinks['weight'].to_list()
    edge_tuple = list(zip(from_id, to_id, weight))
    # edge_tuple = tuple(from_id,to_id,weight)
    print('adding')
    G.add_weighted_edges_from(edge_tuple)
    G = G.to_directed()
    print('finish create graph!')
    print('start train n2v')
    look_back = list(G.nodes())
    embeddings = {}
    models = []
    models.append(node2vec(d=128, max_iter=10, walk_len=80, num_walks=10, con_size=10, ret_p=1, inout_p=1))
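    # GEM's node2vec wrapper: d is the embedding size, walk_len/num_walks control the
    # random-walk corpus, con_size is the word2vec context window, and ret_p/inout_p
    # map to node2vec's return (p) and in-out (q) parameters; p = q = 1 is an unbiased walk.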
    for embedding in models:
        Y, t = embedding.learn_embedding(graph=G, edge_f=None,
                                         is_weighted=True, no_python=True)
        for i, embedding in enumerate(embedding.get_embedding()):
            embeddings[look_back[i]] = embedding
    np.save(root_path + 'graph_embeddings_retp1.npy', embeddings)
    print('nextlink graph embedding retp 1 finish!')
    del models
    gc.collect()
    look_back = list(G.nodes())
    embeddings = {}
    models = []
    models.append(node2vec(d=128, max_iter=10, walk_len=80, num_walks=10, con_size=10, ret_p=0.5, inout_p=1))
    for embedding in models:
        Y, t = embedding.learn_embedding(graph=G, edge_f=None,
                                         is_weighted=True, no_python=True)
        for i, embedding in enumerate(embedding.get_embedding()):
            embeddings[look_back[i]] = embedding
    np.save(root_path + 'graph_embeddings_retp05.npy', embeddings)
    print('nextlink graph embedding retp 0.5 finish!')

@@ -1,400 +1,400 @@
#coding=utf-8
"""
Author: Aigege
Code: https://github.com/AiIsBetter
"""
# date 2021.08.01
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
import networkx as nx
import os
import gc
import warnings
from utils import parallel_apply_fea, add_features_in_group
from functools import partial
warnings.filterwarnings("ignore")
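
# Per-order aggregations over the reversed cross sequence: statistics of the first
# differences of cross_time over the last k crossings (k in `periods`; a huge k such
# as 1e8 means "all rows").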
def last_k_cross_time_interval(gr, periods):
    gr_ = gr.copy()
    gr_ = gr_.iloc[::-1]
    gr_['t_i_v'] = gr_['cross_time'].diff()
    gr_['t_i_v'] = gr_['t_i_v'].fillna(0)
    gr_ = gr_.drop_duplicates().reset_index(drop=True)
    # cross time changes
    features = {}
    for period in periods:
        if period > 10e5:
            period_name = 'zsl_cross_time_interval_all'
            gr_period = gr_.copy()
        else:
            period_name = 'zsl_cross_time_interval_last_{}_'.format(period)
            gr_period = gr_.iloc[:period]
        features = add_features_in_group(features, gr_period, 't_i_v',
                                         ['mean', 'max', 'min', 'std', 'sum'],
                                         period_name)
    return features
# last k cross time statistics
def last_cross_time_features(gr, periods):
    gr_ = gr.copy()
    gr_ = gr_.iloc[::-1]
    features = {}
    for period in periods:
        if period > 10e5:
            period_name = 'zsl_all_'
            gr_period = gr_.copy()
        else:
            period_name = 'zsl_last_{}_'.format(period)
            gr_period = gr_.iloc[:period]
        features = add_features_in_group(features, gr_period, 'cross_time',
                                         ['max', 'sum', 'mean', 'min', 'std'],
                                         period_name)
    return features
# last k cross id time trend
def trend_in_last_k_cross_id_time(gr, periods):
    gr_ = gr.copy()
    gr_ = gr_.iloc[::-1]
    features = {}
    for period in periods:
        gr_period = gr_.iloc[:period]
        features = add_trend_feature(features, gr_period,
                                     'cross_time', 'zsl_{}_period_trend_'.format(period))
    return features
# trend feature
def add_trend_feature(features, gr, feature_name, prefix):
    y = gr[feature_name].values
    try:
        x = np.arange(0, len(y)).reshape(-1, 1)
        lr = LinearRegression()
        lr.fit(x, y)
        trend = lr.coef_[0]
    except:
        trend = np.nan
    features['{}{}'.format(prefix, feature_name)] = trend
    return features
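
# slice_id appears to be a 5-minute time-slice index whose first slice corresponds
# to 08:00; convert it to an hour of day, wrapping past midnight.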
def slice_id_change(x):
    hour = x * 5 / 60
    hour = np.floor(hour)
    hour += 8
    if hour >= 24:
        hour = hour - 24
    return hour
if __name__ == '__main__':
    nrows = None
    root_path = '../data/giscup_2021/'
    read_idkey = np.load(root_path + 'id_key_to_connected_allday.npy', allow_pickle=True).item()
    read_grapheb = np.load(root_path + 'graph_embeddings_retp1_directed.npy', allow_pickle=True).item()
    read_grapheb_retp = np.load(root_path + 'graph_embeddings_retp05_directed.npy', allow_pickle=True).item()
    for i in read_grapheb:
        read_grapheb[i] = list(read_grapheb[i]) + list(read_grapheb_retp[i])
    del read_grapheb_retp
    head_columns = ['order_id', 'ata', 'distance', 'simple_eta', 'driver_id', 'slice_id']
    embedding_k = 256
    fill_list = [0] * embedding_k
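    # embedding_k = 256 because the two 128-d node2vec embeddings (ret_p = 1 and
    # ret_p = 0.5) are concatenated above; fill_list is the zero vector used for
    # ids that have no embedding.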
    df = []
    ####################################### nextlinks #######################################
    nextlinks = pd.read_csv(root_path + 'nextlinks.txt', sep=' ', header=None)
    nextlinks.columns = ['from_id', 'to_id']
    nextlinks['to_id'] = nextlinks['to_id'].astype('str')
    nextlinks['to_id'] = nextlinks['to_id'].apply(lambda x: x.split(","))
    nextlinks = pd.DataFrame({'from_id': nextlinks.from_id.repeat(nextlinks.to_id.str.len()),
                              'to_id': np.concatenate(nextlinks.to_id.values)})
    from_id_weight = nextlinks['from_id'].value_counts()
    from_id_weight = from_id_weight.to_frame()
    from_id_weight['index'] = from_id_weight.index
    from_id_weight.columns = ['weight', 'from_id']
    nextlinks = pd.merge(nextlinks, from_id_weight, 'left', on=['from_id'])
    nextlinks = nextlinks.sort_values(by='weight', ascending=False)
    G = nx.DiGraph()
    from_id = nextlinks['from_id'].astype(str).to_list()
    to_id = nextlinks['to_id'].to_list()
    weight = nextlinks['weight'].to_list()
    edge_tuple = list(zip(from_id, to_id, weight))
    print('adding')
    G.add_weighted_edges_from(edge_tuple)
    dc = nx.algorithms.centrality.degree_centrality(G)
    dc = sorted(dc.items(), key=lambda d: d[1], reverse=True)
    dc = dc[:50000]
    dc = [str(i[0]) for i in dc]
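    # dc now holds the 50,000 most central link ids (by degree centrality);
    # membership in this set is used below as a per-order count feature.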
    ####################################### cross #######################################
    for name in os.listdir(root_path + 'train/'):
        data_time = name.split('.')[0]
        if data_time == '20200803':
            continue
        train = pd.read_csv(root_path + 'train/{}'.format(name), sep=';;', header=None, nrows=nrows)
        print("start processing", data_time)
        train_head = pd.DataFrame(train[0].str.split(' ').tolist(),
                                  columns=['order_id', 'ata', 'distance', 'simple_eta', 'driver_id', 'slice_id'])
        train_head['order_id'] = train_head['order_id'].astype(str)
        train_head['ata'] = train_head['ata'].astype(float)
        train_head['distance'] = train_head['distance'].astype(float)
        train_head['simple_eta'] = train_head['simple_eta'].astype(float)
        train_head['driver_id'] = train_head['driver_id'].astype(int)
        train_head['slice_id'] = train_head['slice_id'].astype(int)
        # process cross data
        data_cross = train[[2]]
        data_cross['index'] = train_head.index
        data_cross['order_id'] = train_head['order_id']
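        # Each cross record is "cross_id:cross_time", where cross_id itself looks like
        # "in-link_out-link"; split it into cross_id_in, cross_id_out and cross_time.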
        data_cross_split = data_cross[2].str.split(' ', expand=True).stack().to_frame()
        data_cross_split = data_cross_split.reset_index(level=1, drop=True).rename(columns={0: 'cross_info'})
        data_cross_split = data_cross[['index', 'order_id']].join(data_cross_split)
        data_cross_split[['cross_id', 'cross_time']] = data_cross_split['cross_info'].str.split(':', 2, expand=True)
        data_cross_split['cross_time'] = data_cross_split['cross_time'].astype(float)
        tmp_cross_id = data_cross_split['cross_id'].str.split('_', expand=True)
        tmp_cross_id.columns = ['cross_id_in', 'cross_id_out']
        data_cross_split = pd.concat([data_cross_split, tmp_cross_id], axis=1).drop(['cross_id', 'cross_info'], axis=1)
        data_cross_split['date_time'] = data_time
        data_cross_split = data_cross_split.drop('index', axis=1).reset_index(drop=True)
        print('preprocess finish!')
        print('start feature engineering')
        feature = train_head[['order_id', 'distance']]
        ################### static fea #############################################
        data_cross_split['zsl_cross_id_isnull'] = 0
        data_cross_split.loc[data_cross_split['cross_id_in'].isnull(), 'zsl_cross_id_isnull'] = 1
        data_cross_split.loc[data_cross_split['cross_id_in'].isnull(), 'cross_id_in'] = '-1'
        data_cross_split.loc[data_cross_split['cross_id_out'].isnull(), 'cross_id_out'] = '-1'
        ####################### order cross_id count ###############################
        df = data_cross_split.groupby('order_id', as_index=False)
        tmp_crossid_agg = df['cross_id_in'].agg({'zsl_order_cross_id_in_count': 'count'})
        tmp_crossid_agg['zsl_order_cross_id_in_count_bins'] = 0
        tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count'] >= 5) & (tmp_crossid_agg['zsl_order_cross_id_in_count'] < 10), 'zsl_order_cross_id_in_count_bins'] = 1
        tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count'] >= 10) & (tmp_crossid_agg['zsl_order_cross_id_in_count'] < 20), 'zsl_order_cross_id_in_count_bins'] = 2
        tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count'] >= 20), 'zsl_order_cross_id_in_count_bins'] = 3
        feature = feature.merge(tmp_crossid_agg, on='order_id', how='left')
        print('order cross_id count finish!')
        ####################### order cross id & distance ###############################
        feature['zsl_order_cross_is_highspeed'] = 0
        feature.loc[(feature['distance'] > 90000) & (feature['zsl_order_cross_id_in_count'] < 30), 'zsl_order_cross_is_highspeed'] = 1
        print('order cross id & distance finish!')
        ####################### order cross id & nextlinks centrality ###############################
        tmp = data_cross_split[data_cross_split['cross_id_in'].isin(dc)]
        tmp = tmp.groupby('order_id', as_index=False)
        tmp_linkid_centry_count = tmp['cross_id_in'].agg({'zsl_order_cross_id_in_centry_count': 'count'})
        feature = feature.merge(tmp_linkid_centry_count, on='order_id', how='left')
        feature['zsl_order_cross_id_in_centry_count'] = feature['zsl_order_cross_id_in_centry_count'].fillna(0)
        tmp = data_cross_split[data_cross_split['cross_id_out'].isin(dc)]
        tmp = tmp.groupby('order_id', as_index=False)
        tmp_linkid_centry_count = tmp['cross_id_out'].agg({'zsl_order_cross_id_out_centry_count': 'count'})
        feature = feature.merge(tmp_linkid_centry_count, on='order_id', how='left')
        feature['zsl_order_cross_id_out_centry_count'] = feature['zsl_order_cross_id_out_centry_count'].fillna(0)
        print('order cross_id & nextlinks centrality finish!')
        ####################### order cross_time sum mean max min var ###############################
        tmp_linktime_agg = df['cross_time'].agg({'zsl_order_cross_time_sum': 'sum', 'zsl_order_cross_time_mean': 'mean',
                                                 'zsl_order_cross_time_max': 'max', 'zsl_order_cross_time_min': 'min',
                                                 'zsl_order_cross_time_var': 'var'})
        feature = feature.merge(tmp_linktime_agg, on='order_id', how='left')
        print('order cross_time sum mean max min var finish!')
        ####################### order distance/cross_id_count ###############################
        feature['zsl_distance_div_cross_id_count'] = feature['distance'] * 10 / feature['zsl_order_cross_id_in_count']
        feature = feature.drop('distance', axis=1)
        print('order distance div cross_id_count finish!')
        ################### trend fea #############################################
        ################### trend cross time #####################################
        groupby = data_cross_split.groupby(['order_id'])
        func = partial(trend_in_last_k_cross_id_time, periods=[2, 5, 10, 20, 100000000])
        g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000)
        feature = feature.merge(g, on='order_id', how='left')
        func = partial(last_cross_time_features, periods=[2, 5, 10, 20, 100000000])
        g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000)
        feature = feature.merge(g, on='order_id', how='left')
        func = partial(last_k_cross_time_interval, periods=[2, 5, 10, 20, 100000000])
        g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000)
        feature = feature.merge(g, on='order_id', how='left')
        print('trend cross time finish!')
        #################### nextlinks graph embedding #######################
        data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].astype(int)
        data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].map(read_idkey)
        data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].fillna(0)
        data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].astype(int)
        data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].map(read_grapheb)
        data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].fillna('0')
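        # Ids without an embedding were filled with the string '0' above;
        # replace_list swaps them for the 256-d zero vector.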
        def replace_list(x):
            if isinstance(x, str):
                x = fill_list
            return x
        data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].apply(replace_list)
        cross_id_in_col = ['zsl_cross_id_in_eb{}'.format(i) for i in range(embedding_k)]
        agg_col = dict(zip(cross_id_in_col, ['mean'] * len(cross_id_in_col)))
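        # Expand the per-crossing embedding vectors into 256 columns, then
        # mean-pool them per order_id.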
        cross_id_in_array = np.array(data_cross_split.pop('cross_id_in').to_list())
        cross_id_in_array = pd.DataFrame(cross_id_in_array, columns=agg_col, dtype=np.float16)
        data_cross_split = pd.concat([data_cross_split, cross_id_in_array], axis=1)
        tmp = data_cross_split.groupby('order_id', as_index=False)
        tmp_crossidin_agg = tmp.agg(agg_col)
        feature = feature.merge(tmp_crossidin_agg, on='order_id', how='left')
        print('trend cross_id_in eb finish!')
        data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].astype(int)
        data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].map(read_idkey)
        data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].fillna(0)
        data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].astype(int)
        data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].map(read_grapheb)
        data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].fillna('0')
        def replace_list(x):
            if isinstance(x, str):
                x = fill_list
            return x
        data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].apply(replace_list)
        cross_id_out_col = ['zsl_cross_id_out_eb{}'.format(i) for i in range(embedding_k)]
        agg_col = dict(zip(cross_id_out_col, ['mean'] * len(cross_id_out_col)))
        cross_id_out_array = np.array(data_cross_split.pop('cross_id_out').to_list())
        cross_id_out_array = pd.DataFrame(cross_id_out_array, columns=agg_col, dtype=np.float16)
        data_cross_split = pd.concat([data_cross_split, cross_id_out_array], axis=1)
        tmp = data_cross_split.groupby('order_id', as_index=False)
        tmp_crossidout_agg = tmp.agg(agg_col)
        feature = feature.merge(tmp_crossidout_agg, on='order_id', how='left')
        print('trend cross_id_out eb finish!')
        multiply_df = []
        multiply_col = []
        for col1, col2 in zip(cross_id_in_col, cross_id_out_col):
            tmp = feature[col1] * feature[col2]
            multiply_df.append(tmp)
            multiply_col.append(col1 + '_mul_' + col2)
        multiply_df = pd.concat(multiply_df, axis=1)
        multiply_df.columns = multiply_col
        feature = pd.concat([feature, multiply_df], axis=1)
        print('trend cross_id_out eb multiply finish!')
        feature.to_csv(root_path + 'feature/train/cross_fea_order_id_level_{}.csv'.format(data_time), index=False)
        del train
        gc.collect()
    test = pd.read_csv(root_path + '20200901_test.txt', sep=';;', header=None, nrows=nrows)
    test_head = pd.DataFrame(test[0].str.split(' ').tolist(),
                             columns=['order_id', 'ata', 'distance', 'simple_eta', 'driver_id', 'slice_id'])
    test_head['order_id'] = test_head['order_id'].astype(str)
    test_head['ata'] = test_head['ata'].astype(float)
    test_head['distance'] = test_head['distance'].astype(float)
    test_head['simple_eta'] = test_head['simple_eta'].astype(float)
    test_head['driver_id'] = test_head['driver_id'].astype(int)
    test_head['slice_id'] = test_head['slice_id'].astype(int)
    # process cross data
    data_cross = test[[2]]
    data_cross['index'] = test_head.index
    data_cross['order_id'] = test_head['order_id']
    data_cross_split = data_cross[2].str.split(' ', expand=True).stack().to_frame()
    data_cross_split = data_cross_split.reset_index(level=1, drop=True).rename(columns={0: 'cross_info'})
    data_cross_split = data_cross[['index', 'order_id']].join(data_cross_split)
    data_cross_split[['cross_id', 'cross_time']] = data_cross_split['cross_info'].str.split(':', 2, expand=True)
    data_cross_split['cross_time'] = data_cross_split['cross_time'].astype(float)
    tmp_cross_id = data_cross_split['cross_id'].str.split('_', expand=True)
    tmp_cross_id.columns = ['cross_id_in', 'cross_id_out']
    data_cross_split = pd.concat([data_cross_split, tmp_cross_id], axis=1).drop(['cross_id', 'cross_info'], axis=1)
    data_cross_split['date_time'] = '20200901'
    data_cross_split = data_cross_split.drop('index', axis=1).reset_index(drop=True)
    print('preprocess finish!')
    print('start feature engineering')
    feature = test_head[['order_id', 'distance']]
    ################### static fea #############################################
    data_cross_split['zsl_cross_id_isnull'] = 0
    data_cross_split.loc[data_cross_split['cross_id_in'].isnull(), 'zsl_cross_id_isnull'] = 1
    data_cross_split.loc[data_cross_split['cross_id_in'].isnull(), 'cross_id_in'] = '-1'
    data_cross_split.loc[data_cross_split['cross_id_out'].isnull(), 'cross_id_out'] = '-1'
    ####################### order cross_id count ###############################
    df = data_cross_split.groupby('order_id', as_index=False)
    tmp_crossid_agg = df['cross_id_in'].agg({'zsl_order_cross_id_in_count': 'count'})
    tmp_crossid_agg['zsl_order_cross_id_in_count_bins'] = 0
    tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count'] >= 5) & (
            tmp_crossid_agg['zsl_order_cross_id_in_count'] < 10), 'zsl_order_cross_id_in_count_bins'] = 1
    tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count'] >= 10) & (
            tmp_crossid_agg['zsl_order_cross_id_in_count'] < 20), 'zsl_order_cross_id_in_count_bins'] = 2
    tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count'] >= 20), 'zsl_order_cross_id_in_count_bins'] = 3
    feature = feature.merge(tmp_crossid_agg, on='order_id', how='left')
    print('order cross_id count finish!')
    ####################### order cross id & distance ###############################
    feature['zsl_order_cross_is_highspeed'] = 0
    feature.loc[(feature['distance'] > 90000) & (
            feature['zsl_order_cross_id_in_count'] < 30), 'zsl_order_cross_is_highspeed'] = 1
    print('order cross id & distance finish!')
    ####################### order cross id & nextlinks centrality ###############################
    tmp = data_cross_split[data_cross_split['cross_id_in'].isin(dc)]
    tmp = tmp.groupby('order_id', as_index=False)
    tmp_linkid_centry_count = tmp['cross_id_in'].agg({'zsl_order_cross_id_in_centry_count': 'count'})
    feature = feature.merge(tmp_linkid_centry_count, on='order_id', how='left')
    feature['zsl_order_cross_id_in_centry_count'] = feature['zsl_order_cross_id_in_centry_count'].fillna(0)
    tmp = data_cross_split[data_cross_split['cross_id_out'].isin(dc)]
    tmp = tmp.groupby('order_id', as_index=False)
    tmp_linkid_centry_count = tmp['cross_id_out'].agg({'zsl_order_cross_id_out_centry_count': 'count'})
    feature = feature.merge(tmp_linkid_centry_count, on='order_id', how='left')
    feature['zsl_order_cross_id_out_centry_count'] = feature['zsl_order_cross_id_out_centry_count'].fillna(0)
    print('order cross_id & nextlinks centrality finish!')
    ####################### order cross_time sum mean max min var ###############################
    tmp_linktime_agg = df['cross_time'].agg({'zsl_order_cross_time_sum': 'sum', 'zsl_order_cross_time_mean': 'mean',
                                             'zsl_order_cross_time_max': 'max', 'zsl_order_cross_time_min': 'min',
                                             'zsl_order_cross_time_var': 'var'})
    feature = feature.merge(tmp_linktime_agg, on='order_id', how='left')
    print('order cross_time sum mean max min var finish!')
    ####################### order distance/cross_id_count ###############################
    feature['zsl_distance_div_cross_id_count'] = feature['distance'] * 10 / feature['zsl_order_cross_id_in_count']
    feature = feature.drop('distance', axis=1)
    print('order distance div cross_id_count finish!')
    ################### trend fea #############################################
    ################### trend cross time #####################################
    groupby = data_cross_split.groupby(['order_id'])
    func = partial(trend_in_last_k_cross_id_time, periods=[2, 5, 10, 20, 100000000])
    g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000)
    feature = feature.merge(g, on='order_id', how='left')
    func = partial(last_cross_time_features, periods=[2, 5, 10, 20, 100000000])
    g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000)
    feature = feature.merge(g, on='order_id', how='left')
    func = partial(last_k_cross_time_interval, periods=[2, 5, 10, 20, 100000000])
    g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000)
    feature = feature.merge(g, on='order_id', how='left')
    print('trend cross time finish!')
    #################### nextlinks graph embedding #######################
    data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].astype(int)
    data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].map(read_idkey)
    data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].fillna(0)
    data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].astype(int)
    data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].map(read_grapheb)
    data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].fillna('0')
    def replace_list(x):
        if isinstance(x, str):
            x = fill_list
        return x
    data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].apply(replace_list)
    cross_id_in_col = ['zsl_cross_id_in_eb{}'.format(i) for i in range(embedding_k)]
    agg_col = dict(zip(cross_id_in_col, ['mean'] * len(cross_id_in_col)))
    cross_id_in_array = np.array(data_cross_split.pop('cross_id_in').to_list())
    cross_id_in_array = pd.DataFrame(cross_id_in_array, columns=agg_col, dtype=np.float16)
    data_cross_split = pd.concat([data_cross_split, cross_id_in_array], axis=1)
    tmp = data_cross_split.groupby('order_id', as_index=False)
    tmp_crossidin_agg = tmp.agg(agg_col)
    feature = feature.merge(tmp_crossidin_agg, on='order_id', how='left')
    print('trend cross_id_in eb finish!')
    data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].astype(int)
    data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].map(read_idkey)
    data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].fillna(0)
    data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].astype(int)
    data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].map(read_grapheb)
    data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].fillna('0')
    def replace_list(x):
        if isinstance(x, str):
            x = fill_list
        return x
    data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].apply(replace_list)
    cross_id_out_col = ['zsl_cross_id_out_eb{}'.format(i) for i in range(embedding_k)]
    agg_col = dict(zip(cross_id_out_col, ['mean'] * len(cross_id_out_col)))
    cross_id_out_array = np.array(data_cross_split.pop('cross_id_out').to_list())
    cross_id_out_array = pd.DataFrame(cross_id_out_array, columns=agg_col, dtype=np.float16)
    data_cross_split = pd.concat([data_cross_split, cross_id_out_array], axis=1)
    tmp = data_cross_split.groupby('order_id', as_index=False)
    tmp_crossidout_agg = tmp.agg(agg_col)
    feature = feature.merge(tmp_crossidout_agg, on='order_id', how='left')
    print('trend cross_id_out eb finish!')
    multiply_df = []
    multiply_col = []
    for col1, col2 in zip(cross_id_in_col, cross_id_out_col):
        tmp = feature[col1] * feature[col2]
        multiply_df.append(tmp)
        multiply_col.append(col1 + '_mul_' + col2)
    multiply_df = pd.concat(multiply_df, axis=1)
    multiply_df.columns = multiply_col
    feature = pd.concat([feature, multiply_df], axis=1)
    print('trend cross_id_out eb multiply finish!')
    feature.to_csv(root_path + 'feature/test/cross_fea_order_id_level_20200901.csv', index=False)

@@ -1,438 +1,438 @@
#coding=utf-8
"""
Author: Aigege
Code: https://github.com/AiIsBetter
"""
# date 2021.08.01
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
import networkx as nx
import os
import gc
import warnings
from utils import parallel_apply_fea, add_features_in_group
from functools import partial
warnings.filterwarnings("ignore")
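
# Link-level analogue of the cross features: statistics of the first differences of
# link_time and link_current_status over the last k links of each order.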
def last_k_link_time_interval(gr, periods): def last_k_link_time_interval(gr, periods):
gr_ = gr.copy() gr_ = gr.copy()
gr_ = gr_.iloc[::-1] gr_ = gr_.iloc[::-1]
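# reverse the per-order link sequence so the last links travelled come first; the diffs below are step-to-step changes of link_time / link_current_status along that reversed sequence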
gr_['t_i_v'] = gr_['link_time'].diff() gr_['t_i_v'] = gr_['link_time'].diff()
gr_['t_i_v'] = gr_['t_i_v'] gr_['t_i_v'] = gr_['t_i_v']
gr_['t_i_v'] = gr_['t_i_v'].fillna(0) gr_['t_i_v'] = gr_['t_i_v'].fillna(0)
gr_['c_s_v'] = gr_['link_current_status'].diff() gr_['c_s_v'] = gr_['link_current_status'].diff()
gr_['c_s_v'] = gr_['c_s_v'] gr_['c_s_v'] = gr_['c_s_v']
gr_['c_s_v'] = gr_['c_s_v'].fillna(0) gr_['c_s_v'] = gr_['c_s_v'].fillna(0)
gr_ = gr_.drop_duplicates().reset_index(drop = True) gr_ = gr_.drop_duplicates().reset_index(drop = True)
# link time changes # link time changes
features = {} features = {}
for period in periods: for period in periods:
if period > 10e5: if period > 10e5:
period_name = 'zsl_link_time_interval_all' period_name = 'zsl_link_time_interval_all'
gr_period = gr_.copy() gr_period = gr_.copy()
else: else:
period_name = 'zsl_link_time_interval_last_{}_'.format(period) period_name = 'zsl_link_time_interval_last_{}_'.format(period)
gr_period = gr_.iloc[:period] gr_period = gr_.iloc[:period]
features = add_features_in_group(features, gr_period, 't_i_v', features = add_features_in_group(features, gr_period, 't_i_v',
['mean','max', 'min', 'std','skew','sum'], ['mean','max', 'min', 'std','skew','sum'],
# ['diff'], # ['diff'],
period_name) period_name)
# current status changes # current status changes
for period in periods: for period in periods:
if period > 10e5: if period > 10e5:
period_name = 'zsl_link_current_status_interval_all' period_name = 'zsl_link_current_status_interval_all'
gr_period = gr_.copy() gr_period = gr_.copy()
else: else:
period_name = 'zsl_link_current_status_interval_last_{}_'.format(period) period_name = 'zsl_link_current_status_interval_last_{}_'.format(period)
gr_period = gr_.iloc[:period] gr_period = gr_.iloc[:period]
features = add_features_in_group(features, gr_period, 'c_s_v', features = add_features_in_group(features, gr_period, 'c_s_v',
['mean', 'std', 'skew'], ['mean', 'std', 'skew'],
# ['diff'], # ['diff'],
period_name) period_name)
return features return features
# last k link time statistics # last k link time statistics
def last_link_time_features(gr,periods): def last_link_time_features(gr,periods):
gr_ = gr.copy() gr_ = gr.copy()
gr_ = gr_.iloc[::-1] gr_ = gr_.iloc[::-1]
features = {} features = {}
for period in periods: for period in periods:
if period > 10e5: if period > 10e5:
period_name = 'zsl_all_' period_name = 'zsl_all_'
gr_period = gr_.copy() gr_period = gr_.copy()
else: else:
period_name = 'zsl_last_{}_'.format(period) period_name = 'zsl_last_{}_'.format(period)
gr_period = gr_.iloc[:period] gr_period = gr_.iloc[:period]
features = add_features_in_group(features, gr_period, 'link_time', features = add_features_in_group(features, gr_period, 'link_time',
['max', 'sum', 'mean','min','skew','std'], ['max', 'sum', 'mean','min','skew','std'],
period_name) period_name)
features = add_features_in_group(features, gr_period, 'link_current_status', features = add_features_in_group(features, gr_period, 'link_current_status',
['mean', 'nunique'], ['mean', 'nunique'],
period_name) period_name)
return features return features
# last k link id time trend # last k link id time trend
def trend_in_last_k_link_id_time(gr, periods): def trend_in_last_k_link_id_time(gr, periods):
gr_ = gr.copy() gr_ = gr.copy()
gr_ = gr_.iloc[::-1] gr_ = gr_.iloc[::-1]
features = {} features = {}
for period in periods: for period in periods:
gr_period = gr_.iloc[:period] gr_period = gr_.iloc[:period]
features = add_trend_feature(features, gr_period, features = add_trend_feature(features, gr_period,
'link_time', 'zsl_{}_period_trend_'.format(period) 'link_time', 'zsl_{}_period_trend_'.format(period)
) )
return features return features
# trend feature # trend feature
def add_trend_feature(features, gr, feature_name, prefix): def add_trend_feature(features, gr, feature_name, prefix):
y = gr[feature_name].values y = gr[feature_name].values
try: try:
x = np.arange(0, len(y)).reshape(-1, 1) x = np.arange(0, len(y)).reshape(-1, 1)
lr = LinearRegression() lr = LinearRegression()
lr.fit(x, y) lr.fit(x, y)
trend = lr.coef_[0] trend = lr.coef_[0]
except: except:
trend = np.nan trend = np.nan
features['{}{}'.format(prefix, feature_name)] = trend features['{}{}'.format(prefix, feature_name)] = trend
return features return features
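# slice_id is a 5-minute time slice; the helper below converts it to an hour of day, assuming slice 0 starts at 08:00 local time,
# e.g. slice_id_change(0) -> 8.0 and slice_id_change(200) -> 0.0 (wraps past midnight)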
def slice_id_change(x): def slice_id_change(x):
hour = x * 5 / 60 hour = x * 5 / 60
hour = np.floor(hour) hour = np.floor(hour)
hour += 8 hour += 8
if hour >= 24: if hour >= 24:
hour = hour - 24 hour = hour - 24
return hour return hour
if __name__ == '__main__': if __name__ == '__main__':
nrows = None nrows = None
root_path = '../data/giscup_2021/' root_path = '../data/giscup_2021/'
read_idkey = np.load(root_path + 'id_key_to_connected_allday.npy', allow_pickle=True).item() read_idkey = np.load(root_path + 'id_key_to_connected_allday.npy', allow_pickle=True).item()
read_grapheb = np.load(root_path + 'graph_embeddings_retp1_directed.npy', allow_pickle=True).item() read_grapheb = np.load(root_path + 'graph_embeddings_retp1_directed.npy', allow_pickle=True).item()
read_grapheb_retp = np.load(root_path + 'graph_embeddings_retp05_directed.npy', allow_pickle=True).item() read_grapheb_retp = np.load(root_path + 'graph_embeddings_retp05_directed.npy', allow_pickle=True).item()
for i in read_grapheb: for i in read_grapheb:
read_grapheb[i] = list(read_grapheb[i]) + list(read_grapheb_retp[i]) read_grapheb[i] = list(read_grapheb[i]) + list(read_grapheb_retp[i])
del read_grapheb_retp del read_grapheb_retp
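# read_idkey maps raw link ids to the node ids used for the graph embeddings; the two embedding tables (retp1 / retp05,
# presumably node2vec runs with different return parameters p=1 and p=0.5) are concatenated into one vector per node to match embedding_k = 256 below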
head_columns = ['order_id', 'ata', 'distance', 'simple_eta', 'driver_id','slice_id'] head_columns = ['order_id', 'ata', 'distance', 'simple_eta', 'driver_id','slice_id']
embedding_k = 256 embedding_k = 256
fill_list = [0] * embedding_k fill_list = [0] * embedding_k
#######################################nextlinks ####################################### #######################################nextlinks #######################################
nextlinks = pd.read_csv(root_path+'nextlinks.txt', sep=' ', header=None) nextlinks = pd.read_csv(root_path+'nextlinks.txt', sep=' ', header=None)
nextlinks.columns=['from_id', 'to_id'] nextlinks.columns=['from_id', 'to_id']
nextlinks['to_id'] = nextlinks['to_id'].astype('str') nextlinks['to_id'] = nextlinks['to_id'].astype('str')
nextlinks['to_id'] = nextlinks['to_id'].apply(lambda x: x.split(",")) nextlinks['to_id'] = nextlinks['to_id'].apply(lambda x: x.split(","))
nextlinks = pd.DataFrame({'from_id':nextlinks.from_id.repeat(nextlinks.to_id.str.len()), nextlinks = pd.DataFrame({'from_id':nextlinks.from_id.repeat(nextlinks.to_id.str.len()),
'to_id':np.concatenate(nextlinks.to_id.values)}) 'to_id':np.concatenate(nextlinks.to_id.values)})
from_id_weight = nextlinks['from_id'].value_counts() from_id_weight = nextlinks['from_id'].value_counts()
from_id_weight = from_id_weight.to_frame() from_id_weight = from_id_weight.to_frame()
from_id_weight['index'] = from_id_weight.index from_id_weight['index'] = from_id_weight.index
from_id_weight.columns=['weight', 'from_id'] from_id_weight.columns=['weight', 'from_id']
nextlinks = pd.merge(nextlinks,from_id_weight, 'left', on=['from_id']) nextlinks = pd.merge(nextlinks,from_id_weight, 'left', on=['from_id'])
nextlinks = nextlinks.sort_values(by='weight',ascending=False) nextlinks = nextlinks.sort_values(by='weight',ascending=False)
G = nx.DiGraph() G = nx.DiGraph()
from_id = nextlinks['from_id'].astype(str).to_list() from_id = nextlinks['from_id'].astype(str).to_list()
to_id = nextlinks['to_id'].to_list() to_id = nextlinks['to_id'].to_list()
weight = nextlinks['weight'].to_list() weight = nextlinks['weight'].to_list()
edge_tuple = list(zip(from_id, to_id,weight)) edge_tuple = list(zip(from_id, to_id,weight))
print('adding') print('adding')
G.add_weighted_edges_from(edge_tuple) G.add_weighted_edges_from(edge_tuple)
dc = nx.algorithms.centrality.degree_centrality(G) dc = nx.algorithms.centrality.degree_centrality(G)
dc = sorted(dc.items(), key=lambda d: d[1],reverse=True) dc = sorted(dc.items(), key=lambda d: d[1],reverse=True)
dc = dc[:50000] dc = dc[:50000]
dc = [str(i[0]) for i in dc ] dc = [str(i[0]) for i in dc ]
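# keep the 50000 nodes with the highest degree centrality; zsl_order_link_id_centry_count below counts how many of an order's links fall into this hub set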
#######################################link ####################################### #######################################link #######################################
for name in os.listdir(root_path+'train/'): for name in os.listdir(root_path+'train/'):
data_time = name.split('.')[0] data_time = name.split('.')[0]
if data_time=='20200803': if data_time=='20200803':
continue continue
train = pd.read_csv(root_path+'train/{}'.format(name),sep= ';;',header=None,nrows=nrows) train = pd.read_csv(root_path+'train/{}'.format(name),sep= ';;',header=None,nrows=nrows)
print("开始处理", data_time) print("开始处理", data_time)
train_head = pd.DataFrame(train[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) train_head = pd.DataFrame(train[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id'])
train_head['order_id'] = train_head['order_id'].astype(str) train_head['order_id'] = train_head['order_id'].astype(str)
train_head['ata'] = train_head['ata'].astype(float) train_head['ata'] = train_head['ata'].astype(float)
train_head['distance'] = train_head['distance'].astype(float) train_head['distance'] = train_head['distance'].astype(float)
train_head['simple_eta'] = train_head['simple_eta'].astype(float) train_head['simple_eta'] = train_head['simple_eta'].astype(float)
train_head['driver_id'] = train_head['driver_id'].astype(int) train_head['driver_id'] = train_head['driver_id'].astype(int)
train_head['slice_id'] = train_head['slice_id'].astype(int) train_head['slice_id'] = train_head['slice_id'].astype(int)
#link preprocess #link preprocess
data_link = train[[1]] data_link = train[[1]]
data_link['index'] = train_head.index data_link['index'] = train_head.index
data_link['order_id'] = train_head['order_id'] data_link['order_id'] = train_head['order_id']
data_link['ata'] = train_head['ata'] data_link['ata'] = train_head['ata']
data_link['distance'] = train_head['distance'] data_link['distance'] = train_head['distance']
data_link['simple_eta'] = train_head['simple_eta'] data_link['simple_eta'] = train_head['simple_eta']
data_link['slice_id'] = train_head['slice_id'] data_link['slice_id'] = train_head['slice_id']
# data_link['slice_id'] = data_link['slice_id'].apply(slice_id_change) # data_link['slice_id'] = data_link['slice_id'].apply(slice_id_change)
gc.collect() gc.collect()
data_link_split = data_link[1].str.split(' ', expand=True).stack().to_frame() data_link_split = data_link[1].str.split(' ', expand=True).stack().to_frame()
data_link_split = data_link_split.reset_index(level=1, drop=True).rename(columns={0: 'link_info'}) data_link_split = data_link_split.reset_index(level=1, drop=True).rename(columns={0: 'link_info'})
# data_link_split = data_link_split.reset_index(drop=True) # data_link_split = data_link_split.reset_index(drop=True)
data_link_split = data_link[['order_id', 'index', 'ata', 'distance', 'simple_eta', 'slice_id']].join( data_link_split = data_link[['order_id', 'index', 'ata', 'distance', 'simple_eta', 'slice_id']].join(
data_link_split) data_link_split)
data_link_split = data_link_split.reset_index(drop=True) data_link_split = data_link_split.reset_index(drop=True)
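# each whitespace-separated link record is assumed to encode link_id:link_time,link_ratio,link_current_status,link_arrival_status, so it is split on ':' or ','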
data_link_split[['link_id', data_link_split[['link_id',
'link_time', 'link_time',
'link_ratio', 'link_ratio',
'link_current_status', 'link_current_status',
'link_arrival_status']] = data_link_split['link_info'].str.split(':|,', 5, expand=True) 'link_arrival_status']] = data_link_split['link_info'].str.split(':|,', 5, expand=True)
data_link_split = data_link_split.drop(['link_info'], axis=1) data_link_split = data_link_split.drop(['link_info'], axis=1)
data_link_split['link_ratio'] = data_link_split['link_ratio'].astype(float) data_link_split['link_ratio'] = data_link_split['link_ratio'].astype(float)
data_link_split['link_time'] = data_link_split['link_time'].astype(float) data_link_split['link_time'] = data_link_split['link_time'].astype(float)
data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(int) data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(int)
print('preprocess finish!') print('preprocess finish!')
print('start feature engineering') print('start feature engineering')
feature = train_head[['order_id', 'distance']] feature = train_head[['order_id', 'distance']]
###################static fea############################################# ###################static fea#############################################
#######################order link id count############################### #######################order link id count###############################
df = data_link_split.groupby('order_id', as_index=False) df = data_link_split.groupby('order_id', as_index=False)
tmp_linkid_agg = df['link_id'].agg({'zsl_order_link_id_count': 'count'}) tmp_linkid_agg = df['link_id'].agg({'zsl_order_link_id_count': 'count'})
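# bucket the per-order link count into coarse bins: <75 -> 0, [75,100) -> 1, [100,120) -> 2, >=120 -> 3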
tmp_linkid_agg['zsl_order_link_id_count_bins'] = 0 tmp_linkid_agg['zsl_order_link_id_count_bins'] = 0
tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count']>=75)&(tmp_linkid_agg['zsl_order_link_id_count']<100),'zsl_order_link_id_count_bins']=1 tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count']>=75)&(tmp_linkid_agg['zsl_order_link_id_count']<100),'zsl_order_link_id_count_bins']=1
tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count']>=100)&(tmp_linkid_agg['zsl_order_link_id_count']<120),'zsl_order_link_id_count_bins']=2 tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count']>=100)&(tmp_linkid_agg['zsl_order_link_id_count']<120),'zsl_order_link_id_count_bins']=2
tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count']>=120),'zsl_order_link_id_count_bins']=3 tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count']>=120),'zsl_order_link_id_count_bins']=3
feature = feature.merge(tmp_linkid_agg,on='order_id',how='left') feature = feature.merge(tmp_linkid_agg,on='order_id',how='left')
print('order link id count finish!') print('order link id count finish!')
#######################order link id & distance############################### #######################order link id & distance###############################
feature['zsl_order_is_highspeed'] = 0 feature['zsl_order_is_highspeed'] = 0
feature.loc[(feature['distance']>90000)&(feature['zsl_order_link_id_count']<300),'zsl_order_is_highspeed'] = 1 feature.loc[(feature['distance']>90000)&(feature['zsl_order_link_id_count']<300),'zsl_order_is_highspeed'] = 1
print('order link id & distance finish!') print('order link id & distance finish!')
#######################order link id & nextlinks centry############################### #######################order link id & nextlinks centry###############################
tmp = data_link_split[data_link_split['link_id'].isin(dc)] tmp = data_link_split[data_link_split['link_id'].isin(dc)]
tmp = tmp.groupby('order_id', as_index=False) tmp = tmp.groupby('order_id', as_index=False)
tmp_linkid_centry_count = tmp['link_id'].agg({'zsl_order_link_id_centry_count': 'count'}) tmp_linkid_centry_count = tmp['link_id'].agg({'zsl_order_link_id_centry_count': 'count'})
feature = feature.merge(tmp_linkid_centry_count,on='order_id',how='left') feature = feature.merge(tmp_linkid_centry_count,on='order_id',how='left')
feature['zsl_order_link_id_centry_count'] = feature['zsl_order_link_id_centry_count'].fillna(0) feature['zsl_order_link_id_centry_count'] = feature['zsl_order_link_id_centry_count'].fillna(0)
print('order link id & nextlinks centry finish!') print('order link id & nextlinks centry finish!')
#######################order link time sum mean max min var std############################### #######################order link time sum mean max min var std###############################
tmp_linktime_agg = df['link_time'].agg({'zsl_order_link_time_sum': 'sum','zsl_order_link_time_mean': 'mean', tmp_linktime_agg = df['link_time'].agg({'zsl_order_link_time_sum': 'sum','zsl_order_link_time_mean': 'mean',
'zsl_order_link_time_max': 'max','zsl_order_link_time_min': 'min', 'zsl_order_link_time_max': 'max','zsl_order_link_time_min': 'min',
'zsl_order_link_time_var': 'var','zsl_order_link_time_skew': 'skew'}) 'zsl_order_link_time_var': 'var','zsl_order_link_time_skew': 'skew'})
feature = feature.merge(tmp_linktime_agg,on='order_id',how='left') feature = feature.merge(tmp_linktime_agg,on='order_id',how='left')
print('order link time sum mean max min var std finish!') print('order link time sum mean max min var std finish!')
#######################order link current status mean nunique############################### #######################order link current status mean nunique###############################
tmp_linktime_agg = df['link_current_status'].agg({'zsl_link_current_status_mean': 'mean', 'zsl_link_current_status_nunique': 'nunique'}) tmp_linktime_agg = df['link_current_status'].agg({'zsl_link_current_status_mean': 'mean', 'zsl_link_current_status_nunique': 'nunique'})
feature = feature.merge(tmp_linktime_agg, on='order_id', how='left') feature = feature.merge(tmp_linktime_agg, on='order_id', how='left')
print('order link current status mean nunique finish!') print('order link current status mean nunique finish!')
#######################order link current status count vector############################### #######################order link current status count vector###############################
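# clip status codes into the range 0-3, join them per order into a comma-separated string and let CountVectorizer count each status per order
# (a bag-of-status-codes; the four hard-coded column names assume all four statuses occur in the day's data)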
data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(str) data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(str)
data_link_split.loc[data_link_split['link_current_status'].astype(int)<0,'link_current_status'] = '0' data_link_split.loc[data_link_split['link_current_status'].astype(int)<0,'link_current_status'] = '0'
data_link_split.loc[data_link_split['link_current_status'].astype(int)>3,'link_current_status'] = '3' data_link_split.loc[data_link_split['link_current_status'].astype(int)>3,'link_current_status'] = '3'
data = data_link_split.groupby('order_id')['link_current_status'].apply(lambda x: x.str.cat(sep=',')).reset_index() data = data_link_split.groupby('order_id')['link_current_status'].apply(lambda x: x.str.cat(sep=',')).reset_index()
cv_encode = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b') cv_encode = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b')
train_x = cv_encode.fit_transform(data['link_current_status']) train_x = cv_encode.fit_transform(data['link_current_status'])
train_x = train_x.toarray() train_x = train_x.toarray()
link_current_status = pd.DataFrame(train_x, columns=['zsl_link_current_status0', 'zsl_link_current_status1', 'zsl_link_current_status2', link_current_status = pd.DataFrame(train_x, columns=['zsl_link_current_status0', 'zsl_link_current_status1', 'zsl_link_current_status2',
'zsl_link_current_status3']) 'zsl_link_current_status3'])
data = pd.concat([data[['order_id']],link_current_status],axis=1) data = pd.concat([data[['order_id']],link_current_status],axis=1)
feature = feature.merge(data, on='order_id', how='left') feature = feature.merge(data, on='order_id', how='left')
print('order link current status count vector finish!') print('order link current status count vector finish!')
#######################order distance/link_id_count############################### #######################order distance/link_id_count###############################
feature['zsl_distance_div_link_id_count'] = feature['distance']*10/feature['zsl_order_link_id_count'] feature['zsl_distance_div_link_id_count'] = feature['distance']*10/feature['zsl_order_link_id_count']
feature = feature.drop('distance', axis=1) feature = feature.drop('distance', axis=1)
print('order distance div link_id_count finish!') print('order distance div link_id_count finish!')
#######################order link ratio sum mean max min var std############################### #######################order link ratio sum mean max min var std###############################
tmp_linkratio_agg = df['link_ratio'].agg({'zsl_order_link_ratio_sum': 'sum', 'zsl_order_link_ratio_mean': 'mean', tmp_linkratio_agg = df['link_ratio'].agg({'zsl_order_link_ratio_sum': 'sum', 'zsl_order_link_ratio_mean': 'mean',
'zsl_order_link_ratio_min': 'min', 'zsl_order_link_ratio_min': 'min',
'zsl_order_link_ratio_var': 'var', 'zsl_order_link_ratio_skew': 'skew'}) 'zsl_order_link_ratio_var': 'var', 'zsl_order_link_ratio_skew': 'skew'})
feature = feature.merge(tmp_linkratio_agg, on='order_id', how='left') feature = feature.merge(tmp_linkratio_agg, on='order_id', how='left')
print('order link ratio sum mean max min var std finish!') print('order link ratio sum mean max min var std finish!')
#######################weather################################################################### #######################weather###################################################################
weather = pd.read_csv(root_path+'weather.csv') weather = pd.read_csv(root_path+'weather.csv')
weather_dict={'rainstorm':0,'heavy rain':1,'moderate rain':2,'cloudy':3, weather_dict={'rainstorm':0,'heavy rain':1,'moderate rain':2,'cloudy':3,
'showers':4} 'showers':4}
weather['weather'] = weather['weather'].map(weather_dict) weather['weather'] = weather['weather'].map(weather_dict)
weather['date'] = weather['date'].astype(str) weather['date'] = weather['date'].astype(str)
weather=weather[weather['date']==data_time] weather=weather[weather['date']==data_time]
feature['weather'] = weather['weather'].values[0] feature['weather'] = weather['weather'].values[0]
feature['hightemp'] = weather['hightemp'].values[0] feature['hightemp'] = weather['hightemp'].values[0]
feature['lowtemp'] = weather['lowtemp'].values[0] feature['lowtemp'] = weather['lowtemp'].values[0]
print('weather finish!') print('weather finish!')
###################trend fea############################################# ###################trend fea#############################################
###################trend link time##################################### ###################trend link time#####################################
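# per-order trend and window statistics over the last k links (k in periods); the sentinel 100000000 effectively means 'all links',
# and parallel_apply_fea runs the partial function over the order groups with 20 worker processes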
data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(int) data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(int)
groupby = data_link_split.groupby(['order_id']) groupby = data_link_split.groupby(['order_id'])
func = partial(trend_in_last_k_link_id_time, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) func = partial(trend_in_last_k_link_id_time, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000])
g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000)
feature = feature.merge(g, on='order_id', how='left') feature = feature.merge(g, on='order_id', how='left')
func = partial(last_link_time_features, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) func = partial(last_link_time_features, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000])
g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000)
feature = feature.merge(g, on='order_id', how='left') feature = feature.merge(g, on='order_id', how='left')
func = partial(last_k_link_time_interval, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) func = partial(last_k_link_time_interval, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000])
g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000)
feature = feature.merge(g, on='order_id', how='left') feature = feature.merge(g, on='order_id', how='left')
print('trend link time finish!') print('trend link time finish!')
####################nextlinks graph embedding####################### ####################nextlinks graph embedding#######################
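# map each link_id to its 256-dim graph embedding (ids missing from the mapping fall back to the all-zero fill_list), then mean-pool each embedding dimension per order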
data_link_split['link_id'] = data_link_split['link_id'].astype(int) data_link_split['link_id'] = data_link_split['link_id'].astype(int)
data_link_split['link_id'] = data_link_split['link_id'].map(read_idkey) data_link_split['link_id'] = data_link_split['link_id'].map(read_idkey)
data_link_split['link_id'] = data_link_split['link_id'].fillna(0) data_link_split['link_id'] = data_link_split['link_id'].fillna(0)
data_link_split['link_id'] = data_link_split['link_id'].astype(int) data_link_split['link_id'] = data_link_split['link_id'].astype(int)
data_link_split['link_id'] = data_link_split['link_id'].map(read_grapheb) data_link_split['link_id'] = data_link_split['link_id'].map(read_grapheb)
data_link_split['link_id'] = data_link_split['link_id'].fillna('0') data_link_split['link_id'] = data_link_split['link_id'].fillna('0')
def replace_list(x): def replace_list(x):
if isinstance(x, str): if isinstance(x, str):
x = fill_list x = fill_list
return x return x
data_link_split['link_id'] = data_link_split['link_id'].apply(replace_list) data_link_split['link_id'] = data_link_split['link_id'].apply(replace_list)
link_id_col = ['zsl_link_id_eb{}'.format(i) for i in range(embedding_k)] link_id_col = ['zsl_link_id_eb{}'.format(i) for i in range(embedding_k)]
agg_col = dict(zip(link_id_col, ['mean'] * len(link_id_col))) agg_col = dict(zip(link_id_col, ['mean'] * len(link_id_col)))
link_id_array = np.array(data_link_split.pop('link_id').to_list()) link_id_array = np.array(data_link_split.pop('link_id').to_list())
link_id_array = pd.DataFrame(link_id_array, columns=agg_col, dtype=np.float16) link_id_array = pd.DataFrame(link_id_array, columns=agg_col, dtype=np.float16)
data_link_split = pd.concat([data_link_split, link_id_array], axis=1) data_link_split = pd.concat([data_link_split, link_id_array], axis=1)
tmp = data_link_split.groupby('order_id', as_index=False) tmp = data_link_split.groupby('order_id', as_index=False)
tmp_linkid_agg = tmp.agg(agg_col) tmp_linkid_agg = tmp.agg(agg_col)
feature = feature.merge(tmp_linkid_agg, on='order_id', how='left') feature = feature.merge(tmp_linkid_agg, on='order_id', how='left')
feature.to_csv(root_path + 'feature/train/link_fea_order_id_level_{}.csv'.format(data_time), index=False) feature.to_csv(root_path + 'feature/train/link_fea_order_id_level_{}.csv'.format(data_time), index=False)
del train del train
gc.collect() gc.collect()
test = pd.read_csv(root_path+'20200901_test.txt',sep= ';;',header=None,nrows=nrows) test = pd.read_csv(root_path+'20200901_test.txt',sep= ';;',header=None,nrows=nrows)
test_head = pd.DataFrame(test[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) test_head = pd.DataFrame(test[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id'])
test_head['order_id'] = test_head['order_id'].astype(str) test_head['order_id'] = test_head['order_id'].astype(str)
test_head['ata'] = test_head['ata'].astype(float) test_head['ata'] = test_head['ata'].astype(float)
test_head['distance'] = test_head['distance'].astype(float) test_head['distance'] = test_head['distance'].astype(float)
test_head['simple_eta'] = test_head['simple_eta'].astype(float) test_head['simple_eta'] = test_head['simple_eta'].astype(float)
test_head['driver_id'] = test_head['driver_id'].astype(int) test_head['driver_id'] = test_head['driver_id'].astype(int)
test_head['slice_id'] = test_head['slice_id'].astype(int) test_head['slice_id'] = test_head['slice_id'].astype(int)
# link preprocess # link preprocess
data_link = test[[1]] data_link = test[[1]]
data_link['index'] = test_head.index data_link['index'] = test_head.index
data_link['order_id'] = test_head['order_id'] data_link['order_id'] = test_head['order_id']
data_link['ata'] = test_head['ata'] data_link['ata'] = test_head['ata']
data_link['distance'] = test_head['distance'] data_link['distance'] = test_head['distance']
data_link['simple_eta'] = test_head['simple_eta'] data_link['simple_eta'] = test_head['simple_eta']
data_link['slice_id'] = test_head['slice_id'] data_link['slice_id'] = test_head['slice_id']
# data_link['slice_id'] = data_link['slice_id'].apply(slice_id_change) # data_link['slice_id'] = data_link['slice_id'].apply(slice_id_change)
gc.collect() gc.collect()
data_link_split = data_link[1].str.split(' ', expand=True).stack().to_frame() data_link_split = data_link[1].str.split(' ', expand=True).stack().to_frame()
data_link_split = data_link_split.reset_index(level=1, drop=True).rename(columns={0: 'link_info'}) data_link_split = data_link_split.reset_index(level=1, drop=True).rename(columns={0: 'link_info'})
# data_link_split = data_link_split.reset_index(drop=True) # data_link_split = data_link_split.reset_index(drop=True)
data_link_split = data_link[['order_id', 'index', 'ata', 'distance', 'simple_eta', 'slice_id']].join( data_link_split = data_link[['order_id', 'index', 'ata', 'distance', 'simple_eta', 'slice_id']].join(
data_link_split) data_link_split)
data_link_split = data_link_split.reset_index(drop=True) data_link_split = data_link_split.reset_index(drop=True)
data_link_split[['link_id', data_link_split[['link_id',
'link_time', 'link_time',
'link_ratio', 'link_ratio',
'link_current_status', 'link_current_status',
'link_arrival_status']] = data_link_split['link_info'].str.split(':|,', 5, expand=True) 'link_arrival_status']] = data_link_split['link_info'].str.split(':|,', 5, expand=True)
data_link_split = data_link_split.drop(['link_info'], axis=1) data_link_split = data_link_split.drop(['link_info'], axis=1)
data_link_split['link_ratio'] = data_link_split['link_ratio'].astype(float) data_link_split['link_ratio'] = data_link_split['link_ratio'].astype(float)
data_link_split['link_time'] = data_link_split['link_time'].astype(float) data_link_split['link_time'] = data_link_split['link_time'].astype(float)
data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(int) data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(int)
print('preprocess finish!') print('preprocess finish!')
print('start feature engineering') print('start feature engineering')
feature = test_head[['order_id', 'distance']] feature = test_head[['order_id', 'distance']]
###################static fea############################################# ###################static fea#############################################
#######################order link id count############################### #######################order link id count###############################
df = data_link_split.groupby('order_id', as_index=False) df = data_link_split.groupby('order_id', as_index=False)
tmp_linkid_agg = df['link_id'].agg({'zsl_order_link_id_count': 'count'}) tmp_linkid_agg = df['link_id'].agg({'zsl_order_link_id_count': 'count'})
tmp_linkid_agg['zsl_order_link_id_count_bins'] = 0 tmp_linkid_agg['zsl_order_link_id_count_bins'] = 0
tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count'] >= 75) & ( tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count'] >= 75) & (
tmp_linkid_agg['zsl_order_link_id_count'] < 100), 'zsl_order_link_id_count_bins'] = 1 tmp_linkid_agg['zsl_order_link_id_count'] < 100), 'zsl_order_link_id_count_bins'] = 1
tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count'] >= 100) & ( tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count'] >= 100) & (
tmp_linkid_agg['zsl_order_link_id_count'] < 120), 'zsl_order_link_id_count_bins'] = 2 tmp_linkid_agg['zsl_order_link_id_count'] < 120), 'zsl_order_link_id_count_bins'] = 2
tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count'] >= 120), 'zsl_order_link_id_count_bins'] = 3 tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count'] >= 120), 'zsl_order_link_id_count_bins'] = 3
feature = feature.merge(tmp_linkid_agg, on='order_id', how='left') feature = feature.merge(tmp_linkid_agg, on='order_id', how='left')
print('order link id count finish!') print('order link id count finish!')
#######################order link id & distance############################### #######################order link id & distance###############################
feature['zsl_order_is_highspeed'] = 0 feature['zsl_order_is_highspeed'] = 0
feature.loc[ feature.loc[
(feature['distance'] > 90000) & (feature['zsl_order_link_id_count'] < 300), 'zsl_order_is_highspeed'] = 1 (feature['distance'] > 90000) & (feature['zsl_order_link_id_count'] < 300), 'zsl_order_is_highspeed'] = 1
print('order link id & distance finish!') print('order link id & distance finish!')
#######################order link id & nextlinks centry############################### #######################order link id & nextlinks centry###############################
tmp = data_link_split[data_link_split['link_id'].isin(dc)] tmp = data_link_split[data_link_split['link_id'].isin(dc)]
tmp = tmp.groupby('order_id', as_index=False) tmp = tmp.groupby('order_id', as_index=False)
tmp_linkid_centry_count = tmp['link_id'].agg({'zsl_order_link_id_centry_count': 'count'}) tmp_linkid_centry_count = tmp['link_id'].agg({'zsl_order_link_id_centry_count': 'count'})
feature = feature.merge(tmp_linkid_centry_count, on='order_id', how='left') feature = feature.merge(tmp_linkid_centry_count, on='order_id', how='left')
feature['zsl_order_link_id_centry_count'] = feature['zsl_order_link_id_centry_count'].fillna(0) feature['zsl_order_link_id_centry_count'] = feature['zsl_order_link_id_centry_count'].fillna(0)
print('order link id & nextlinks centry finish!') print('order link id & nextlinks centry finish!')
#######################order link time sum mean max min var std############################### #######################order link time sum mean max min var std###############################
tmp_linktime_agg = df['link_time'].agg({'zsl_order_link_time_sum': 'sum', 'zsl_order_link_time_mean': 'mean', tmp_linktime_agg = df['link_time'].agg({'zsl_order_link_time_sum': 'sum', 'zsl_order_link_time_mean': 'mean',
'zsl_order_link_time_max': 'max', 'zsl_order_link_time_min': 'min', 'zsl_order_link_time_max': 'max', 'zsl_order_link_time_min': 'min',
'zsl_order_link_time_var': 'var', 'zsl_order_link_time_skew': 'skew'}) 'zsl_order_link_time_var': 'var', 'zsl_order_link_time_skew': 'skew'})
feature = feature.merge(tmp_linktime_agg, on='order_id', how='left') feature = feature.merge(tmp_linktime_agg, on='order_id', how='left')
print('order link time sum mean max min var std finish!') print('order link time sum mean max min var std finish!')
#######################order link current status mean nunique############################### #######################order link current status mean nunique###############################
tmp_linktime_agg = df['link_current_status'].agg( tmp_linktime_agg = df['link_current_status'].agg(
{'zsl_link_current_status_mean': 'mean', 'zsl_link_current_status_nunique': 'nunique'}) {'zsl_link_current_status_mean': 'mean', 'zsl_link_current_status_nunique': 'nunique'})
feature = feature.merge(tmp_linktime_agg, on='order_id', how='left') feature = feature.merge(tmp_linktime_agg, on='order_id', how='left')
print('order link current status mean nunique finish!') print('order link current status mean nunique finish!')
#######################order link current status count vector############################### #######################order link current status count vector###############################
data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(str) data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(str)
data_link_split.loc[data_link_split['link_current_status'].astype(int) < 0, 'link_current_status'] = '0' data_link_split.loc[data_link_split['link_current_status'].astype(int) < 0, 'link_current_status'] = '0'
data_link_split.loc[data_link_split['link_current_status'].astype(int) > 3, 'link_current_status'] = '3' data_link_split.loc[data_link_split['link_current_status'].astype(int) > 3, 'link_current_status'] = '3'
data = data_link_split.groupby('order_id')['link_current_status'].apply(lambda x: x.str.cat(sep=',')).reset_index() data = data_link_split.groupby('order_id')['link_current_status'].apply(lambda x: x.str.cat(sep=',')).reset_index()
cv_encode = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b') cv_encode = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b')
test_x = cv_encode.fit_transform(data['link_current_status']) test_x = cv_encode.fit_transform(data['link_current_status'])
test_x = test_x.toarray() test_x = test_x.toarray()
link_current_status = pd.DataFrame(test_x, columns=['zsl_link_current_status0', 'zsl_link_current_status1', link_current_status = pd.DataFrame(test_x, columns=['zsl_link_current_status0', 'zsl_link_current_status1',
'zsl_link_current_status2', 'zsl_link_current_status2',
'zsl_link_current_status3']) 'zsl_link_current_status3'])
data = pd.concat([data[['order_id']], link_current_status], axis=1) data = pd.concat([data[['order_id']], link_current_status], axis=1)
feature = feature.merge(data, on='order_id', how='left') feature = feature.merge(data, on='order_id', how='left')
print('order link current status count vector finish!') print('order link current status count vector finish!')
#######################order distance/link_id_count############################### #######################order distance/link_id_count###############################
feature['zsl_distance_div_link_id_count'] = feature['distance'] * 10 / feature['zsl_order_link_id_count'] feature['zsl_distance_div_link_id_count'] = feature['distance'] * 10 / feature['zsl_order_link_id_count']
feature = feature.drop('distance', axis=1) feature = feature.drop('distance', axis=1)
print('order distance div link_id_count finish!') print('order distance div link_id_count finish!')
#######################order link ratio sum mean max min var std############################### #######################order link ratio sum mean max min var std###############################
tmp_linkratio_agg = df['link_ratio'].agg({'zsl_order_link_ratio_sum': 'sum', 'zsl_order_link_ratio_mean': 'mean', tmp_linkratio_agg = df['link_ratio'].agg({'zsl_order_link_ratio_sum': 'sum', 'zsl_order_link_ratio_mean': 'mean',
'zsl_order_link_ratio_min': 'min', 'zsl_order_link_ratio_min': 'min',
'zsl_order_link_ratio_var': 'var', 'zsl_order_link_ratio_skew': 'skew'}) 'zsl_order_link_ratio_var': 'var', 'zsl_order_link_ratio_skew': 'skew'})
feature = feature.merge(tmp_linkratio_agg, on='order_id', how='left') feature = feature.merge(tmp_linkratio_agg, on='order_id', how='left')
print('order link ratio sum mean max min var std finish!') print('order link ratio sum mean max min var std finish!')
#######################weather################################################################### #######################weather###################################################################
weather = pd.read_csv(root_path + 'weather.csv') weather = pd.read_csv(root_path + 'weather.csv')
weather_dict = {'rainstorm': 0, 'heavy rain': 1, 'moderate rain': 2, 'cloudy': 3, weather_dict = {'rainstorm': 0, 'heavy rain': 1, 'moderate rain': 2, 'cloudy': 3,
'showers': 4} 'showers': 4}
weather['weather'] = weather['weather'].map(weather_dict) weather['weather'] = weather['weather'].map(weather_dict)
weather['date'] = weather['date'].astype(str) weather['date'] = weather['date'].astype(str)
weather = weather[weather['date'] == data_time] weather = weather[weather['date'] == data_time]
feature['weather'] = weather['weather'].values[0] feature['weather'] = weather['weather'].values[0]
feature['hightemp'] = weather['hightemp'].values[0] feature['hightemp'] = weather['hightemp'].values[0]
feature['lowtemp'] = weather['lowtemp'].values[0] feature['lowtemp'] = weather['lowtemp'].values[0]
print('weather finish!') print('weather finish!')
###################trend fea############################################# ###################trend fea#############################################
###################trend link time##################################### ###################trend link time#####################################
data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(int) data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(int)
groupby = data_link_split.groupby(['order_id']) groupby = data_link_split.groupby(['order_id'])
func = partial(trend_in_last_k_link_id_time, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) func = partial(trend_in_last_k_link_id_time, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000])
g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000)
feature = feature.merge(g, on='order_id', how='left') feature = feature.merge(g, on='order_id', how='left')
func = partial(last_link_time_features, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) func = partial(last_link_time_features, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000])
g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000)
feature = feature.merge(g, on='order_id', how='left') feature = feature.merge(g, on='order_id', how='left')
func = partial(last_k_link_time_interval, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) func = partial(last_k_link_time_interval, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000])
g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000)
feature = feature.merge(g, on='order_id', how='left') feature = feature.merge(g, on='order_id', how='left')
print('trend link time finish!') print('trend link time finish!')
####################nextlinks graph embedding####################### ####################nextlinks graph embedding#######################
data_link_split['link_id'] = data_link_split['link_id'].astype(int) data_link_split['link_id'] = data_link_split['link_id'].astype(int)
data_link_split['link_id'] = data_link_split['link_id'].map(read_idkey) data_link_split['link_id'] = data_link_split['link_id'].map(read_idkey)
data_link_split['link_id'] = data_link_split['link_id'].fillna(0) data_link_split['link_id'] = data_link_split['link_id'].fillna(0)
data_link_split['link_id'] = data_link_split['link_id'].astype(int) data_link_split['link_id'] = data_link_split['link_id'].astype(int)
data_link_split['link_id'] = data_link_split['link_id'].map(read_grapheb) data_link_split['link_id'] = data_link_split['link_id'].map(read_grapheb)
data_link_split['link_id'] = data_link_split['link_id'].fillna('0') data_link_split['link_id'] = data_link_split['link_id'].fillna('0')
def replace_list(x): def replace_list(x):
if isinstance(x, str): if isinstance(x, str):
x = fill_list x = fill_list
return x return x
data_link_split['link_id'] = data_link_split['link_id'].apply(replace_list) data_link_split['link_id'] = data_link_split['link_id'].apply(replace_list)
link_id_col = ['zsl_link_id_eb{}'.format(i) for i in range(embedding_k)] link_id_col = ['zsl_link_id_eb{}'.format(i) for i in range(embedding_k)]
agg_col = dict(zip(link_id_col, ['mean'] * len(link_id_col))) agg_col = dict(zip(link_id_col, ['mean'] * len(link_id_col)))
link_id_array = np.array(data_link_split.pop('link_id').to_list()) link_id_array = np.array(data_link_split.pop('link_id').to_list())
link_id_array = pd.DataFrame(link_id_array, columns=agg_col, dtype=np.float16) link_id_array = pd.DataFrame(link_id_array, columns=agg_col, dtype=np.float16)
data_link_split = pd.concat([data_link_split, link_id_array], axis=1) data_link_split = pd.concat([data_link_split, link_id_array], axis=1)
tmp = data_link_split.groupby('order_id', as_index=False) tmp = data_link_split.groupby('order_id', as_index=False)
tmp_linkid_agg = tmp.agg(agg_col) tmp_linkid_agg = tmp.agg(agg_col)
feature = feature.merge(tmp_linkid_agg, on='order_id', how='left') feature = feature.merge(tmp_linkid_agg, on='order_id', how='left')
feature.to_csv(root_path+'feature/test/link_fea_order_id_level_20200901.csv',index=False) feature.to_csv(root_path+'feature/test/link_fea_order_id_level_20200901.csv',index=False)

@ -1,207 +1,207 @@
#coding=utf-8 #coding=utf-8
""" """
Author: Aigege Author: Aigege
Code: https://github.com/AiIsBetter Code: https://github.com/AiIsBetter
""" """
# date 2021.08.01 # date 2021.08.01
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import seaborn as sns import seaborn as sns
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from sklearn.model_selection import KFold from sklearn.model_selection import KFold
import lightgbm as lgb import lightgbm as lgb
from utils import reduce_mem_usage,reduce_mem_usage_parallel from utils import reduce_mem_usage,reduce_mem_usage_parallel
import os import os
import gc import gc
import warnings import warnings
import time import time
warnings.filterwarnings("ignore") warnings.filterwarnings("ignore")
def slice_id_change(x): def slice_id_change(x):
hour = x * 5 / 60 hour = x * 5 / 60
hour = np.floor(hour) hour = np.floor(hour)
hour += 8 hour += 8
if hour >= 24: if hour >= 24:
hour = hour - 24 hour = hour - 24
return hour return hour
# evaluation metric: MAPE # evaluation metric: MAPE
def MAPE(true, pred): def MAPE(true, pred):
diff = np.abs(np.array(pred) - np.array(true)) diff = np.abs(np.array(pred) - np.array(true))
return np.mean(diff / true) return np.mean(diff / true)
# custom LightGBM evaluation metric # custom LightGBM evaluation metric
def lgb_score_mape(train_data,preds): def lgb_score_mape(train_data,preds):
labels = train_data labels = train_data
diff = np.abs(np.array(preds) - np.array(labels)) diff = np.abs(np.array(preds) - np.array(labels))
result = np.mean(diff / labels) result = np.mean(diff / labels)
return 'mape',result, False return 'mape',result, False
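# LightGBM custom-metric convention: return (name, value, is_higher_better); MAPE is minimised, hence False,
# e.g. lgb_score_mape(np.array([100.0, 200.0]), np.array([110.0, 180.0])) -> ('mape', 0.1, False)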
head_columns = ['order_id', 'ata', 'distance', 'simple_eta', 'driver_id','slice_id'] head_columns = ['order_id', 'ata', 'distance', 'simple_eta', 'driver_id','slice_id']
result = [] result = []
result_time_weight = [] result_time_weight = []
result_dis_weight = [] result_dis_weight = []
count = 0 count = 0
df = [] df = []
nrows=None nrows=None
root_path = '../data/giscup_2021/' root_path = '../data/giscup_2021/'
data_list = ['20200818', '20200819', '20200820', '20200821', '20200822', '20200823', '20200824', data_list = ['20200818', '20200819', '20200820', '20200821', '20200822', '20200823', '20200824',
'20200825', '20200826', '20200827', '20200828', '20200829', '20200830', '20200831'] '20200825', '20200826', '20200827', '20200828', '20200829', '20200830', '20200831']
#######################################local validation####################################### #######################################local validation#######################################
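# local validation: for each training day in data_list, merge the pre-computed per-order feature files (cross / link / head / sqe) onto the raw header fields, then stack all days into one frame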
for name in os.listdir(root_path+'train/'): for name in os.listdir(root_path+'train/'):
data_time = name.split('.')[0] data_time = name.split('.')[0]
if data_time not in data_list: if data_time not in data_list:
continue continue
train = pd.read_csv(root_path+'train/{}'.format(name),sep= ';;',header=None,nrows=nrows) train = pd.read_csv(root_path+'train/{}'.format(name),sep= ';;',header=None,nrows=nrows)
feature_cross = pd.read_csv(root_path+'feature/train/cross_fea_order_id_level_{}.csv'.format(data_time),nrows=nrows) feature_cross = pd.read_csv(root_path+'feature/train/cross_fea_order_id_level_{}.csv'.format(data_time),nrows=nrows)
feature_link = pd.read_csv(root_path+'feature/train/link_fea_order_id_level_{}.csv'.format(data_time),nrows=nrows) feature_link = pd.read_csv(root_path+'feature/train/link_fea_order_id_level_{}.csv'.format(data_time),nrows=nrows)
feature_head = pd.read_csv(root_path+'feature/train/head_link_{}.csv'.format(data_time),nrows=nrows) feature_head = pd.read_csv(root_path+'feature/train/head_link_{}.csv'.format(data_time),nrows=nrows)
feature_sqe = pd.read_csv(root_path + 'feature/train/{}.csv'.format(data_time),nrows=nrows) feature_sqe = pd.read_csv(root_path + 'feature/train/{}.csv'.format(data_time),nrows=nrows)
feature_cross['order_id'] = feature_cross['order_id'].astype(str) feature_cross['order_id'] = feature_cross['order_id'].astype(str)
feature_link['order_id'] = feature_link['order_id'].astype(str) feature_link['order_id'] = feature_link['order_id'].astype(str)
feature_head['order_id'] = feature_head['order_id'].astype(str) feature_head['order_id'] = feature_head['order_id'].astype(str)
feature_sqe['order_id'] = feature_sqe['order_id'].astype(str) feature_sqe['order_id'] = feature_sqe['order_id'].astype(str)
print("开始处理", data_time) print("开始处理", data_time)
# train.columns = ['head','link','cross'] # train.columns = ['head','link','cross']
# train['head'] = train['head'].apply(lambda x:x.split(' ')) # train['head'] = train['head'].apply(lambda x:x.split(' '))
train_head = pd.DataFrame(train[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) train_head = pd.DataFrame(train[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id'])
train_head['order_id'] = train_head['order_id'].astype(str) train_head['order_id'] = train_head['order_id'].astype(str)
train_head['ata'] = train_head['ata'].astype(float) train_head['ata'] = train_head['ata'].astype(float)
train_head['distance'] = train_head['distance'].astype(float) train_head['distance'] = train_head['distance'].astype(float)
train_head['simple_eta'] = train_head['simple_eta'].astype(float) train_head['simple_eta'] = train_head['simple_eta'].astype(float)
train_head['driver_id'] = train_head['driver_id'].astype(int) train_head['driver_id'] = train_head['driver_id'].astype(int)
train_head['slice_id'] = train_head['slice_id'].astype(int) train_head['slice_id'] = train_head['slice_id'].astype(int)
train_head['date_time'] = int(data_time) train_head['date_time'] = int(data_time)
train_head = train_head.merge(feature_cross,on='order_id',how='left') train_head = train_head.merge(feature_cross,on='order_id',how='left')
train_head = train_head.merge(feature_link,on='order_id',how='left') train_head = train_head.merge(feature_link,on='order_id',how='left')
feature_head = feature_head.drop(['ata', 'distance', 'simple_eta', 'driver_id', 'slice_id', 'index', feature_head = feature_head.drop(['ata', 'distance', 'simple_eta', 'driver_id', 'slice_id', 'index',
'date_time', 'link_count', 'link_time_sum', 'link_ratio_sum', 'date_time', 'link_count', 'link_time_sum', 'link_ratio_sum',
'date_time_dt', 'weekday', 'hour', 'weather', 'hightemp', 'lowtemp', 'date_time_dt', 'weekday', 'hour', 'weather', 'hightemp', 'lowtemp',
'len_tmp', 'len_tmp',
'link_time_mean', 'link_time_std'], 'link_time_mean', 'link_time_std'],
axis=1) axis=1)
feature_sqe = feature_sqe.drop(['pre_arrival_status', 'arrive_slice_id', 'slice_id'], axis=1) feature_sqe = feature_sqe.drop(['pre_arrival_status', 'arrive_slice_id', 'slice_id'], axis=1)
train_head = train_head.merge(feature_sqe, on='order_id', how='left') train_head = train_head.merge(feature_sqe, on='order_id', how='left')
train_head = train_head.merge(feature_head, on='order_id', how='left') train_head = train_head.merge(feature_head, on='order_id', how='left')
print('merge finish!') print('merge finish!')
train_head = reduce_mem_usage_parallel(train_head,28) train_head = reduce_mem_usage_parallel(train_head,28)
df.append(train_head.drop('order_id',axis=1)) df.append(train_head.drop('order_id',axis=1))
del train del train
gc.collect() gc.collect()
count +=1 count +=1
df = pd.concat(df,axis=0) df = pd.concat(df,axis=0)
test = pd.read_csv(root_path+'20200901_test.txt',sep= ';;',header=None,nrows=nrows) test = pd.read_csv(root_path+'20200901_test.txt',sep= ';;',header=None,nrows=nrows)
test_head = pd.DataFrame(test[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) test_head = pd.DataFrame(test[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id'])
test_head['order_id'] = test_head['order_id'].astype(str) test_head['order_id'] = test_head['order_id'].astype(str)
test_head['ata'] = test_head['ata'].astype(float) test_head['ata'] = test_head['ata'].astype(float)
test_head['distance'] = test_head['distance'].astype(float) test_head['distance'] = test_head['distance'].astype(float)
test_head['simple_eta'] = test_head['simple_eta'].astype(float) test_head['simple_eta'] = test_head['simple_eta'].astype(float)
test_head['driver_id'] = test_head['driver_id'].astype(int) test_head['driver_id'] = test_head['driver_id'].astype(int)
test_head['slice_id'] = test_head['slice_id'].astype(int) test_head['slice_id'] = test_head['slice_id'].astype(int)
feature_cross = pd.read_csv(root_path + 'feature/test/cross_fea_order_id_level_{}.csv'.format('20200901'),nrows=nrows) feature_cross = pd.read_csv(root_path + 'feature/test/cross_fea_order_id_level_{}.csv'.format('20200901'),nrows=nrows)
feature_link = pd.read_csv(root_path + 'feature/test/link_fea_order_id_level_{}.csv'.format('20200901'), nrows=nrows) feature_link = pd.read_csv(root_path + 'feature/test/link_fea_order_id_level_{}.csv'.format('20200901'), nrows=nrows)
feature_head = pd.read_csv(root_path + 'feature/test/head_link_{}.csv'.format('20200901'),nrows=nrows) feature_head = pd.read_csv(root_path + 'feature/test/head_link_{}.csv'.format('20200901'),nrows=nrows)
feature_sqe = pd.read_csv(root_path + 'feature/test/{}.csv'.format('20200901'),nrows=nrows) feature_sqe = pd.read_csv(root_path + 'feature/test/{}.csv'.format('20200901'),nrows=nrows)
test_head['date_time'] = 20200901 test_head['date_time'] = 20200901
feature_cross['order_id'] = feature_cross['order_id'].astype(str) feature_cross['order_id'] = feature_cross['order_id'].astype(str)
feature_link['order_id'] = feature_link['order_id'].astype(str) feature_link['order_id'] = feature_link['order_id'].astype(str)
feature_head['order_id'] = feature_head['order_id'].astype(str) feature_head['order_id'] = feature_head['order_id'].astype(str)
feature_sqe['order_id'] = feature_sqe['order_id'].astype(str) feature_sqe['order_id'] = feature_sqe['order_id'].astype(str)
test_head = test_head.merge(feature_cross, on='order_id', how='left') test_head = test_head.merge(feature_cross, on='order_id', how='left')
test_head = test_head.merge(feature_link,on='order_id',how='left') test_head = test_head.merge(feature_link,on='order_id',how='left')
feature_head = feature_head.drop(['ata', 'distance', 'simple_eta', 'driver_id', 'slice_id', 'index', feature_head = feature_head.drop(['ata', 'distance', 'simple_eta', 'driver_id', 'slice_id', 'index',
'date_time', 'link_count', 'link_time_sum', 'link_ratio_sum', 'date_time', 'link_count', 'link_time_sum', 'link_ratio_sum',
'date_time_dt', 'weekday', 'hour', 'weather', 'hightemp', 'lowtemp', 'date_time_dt', 'weekday', 'hour', 'weather', 'hightemp', 'lowtemp',
'len_tmp', 'len_tmp',
'link_time_mean', 'link_time_std'], 'link_time_mean', 'link_time_std'],
axis=1) axis=1)
feature_sqe = feature_sqe.drop(['pre_arrival_status', 'arrive_slice_id', 'slice_id'], axis=1) feature_sqe = feature_sqe.drop(['pre_arrival_status', 'arrive_slice_id', 'slice_id'], axis=1)
test_head = test_head.merge(feature_sqe, on='order_id', how='left') test_head = test_head.merge(feature_sqe, on='order_id', how='left')
test_head = test_head.merge(feature_head, on='order_id', how='left') test_head = test_head.merge(feature_head, on='order_id', how='left')
test_head = reduce_mem_usage_parallel(test_head,28) test_head = reduce_mem_usage_parallel(test_head,28)
del feature_cross,feature_link del feature_cross,feature_link
gc.collect() gc.collect()
X_train = df.drop('ata',axis=1) X_train = df.drop('ata',axis=1)
y_train = df['ata'] y_train = df['ata']
X_test = test_head.drop(['order_id','ata'],axis=1) X_test = test_head.drop(['order_id','ata'],axis=1)
folds = 5 folds = 5
skf = KFold(n_splits=folds, shuffle=True, random_state=2021) skf = KFold(n_splits=folds, shuffle=True, random_state=2021)
train_mean = np.zeros(shape=[1,folds]) train_mean = np.zeros(shape=[1,folds])
test_predict = np.zeros(shape=[X_test.shape[0], folds],dtype=float) test_predict = np.zeros(shape=[X_test.shape[0], folds],dtype=float)
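# test_predict stores one column of test-set predictions per fold; the fold columns are presumably averaged into the final prediction after this excerpt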
k_fold_mape = [] k_fold_mape = []
feature_importance_df = pd.DataFrame() feature_importance_df = pd.DataFrame()
# Display/plot feature importance # Display/plot feature importance
def display_importances(feature_importance_df_): def display_importances(feature_importance_df_):
feature_importance_df_.to_csv('feature_importances.csv',index=False) feature_importance_df_.to_csv('feature_importances.csv',index=False)
cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:100].index cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:100].index
best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)] best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
best_features = best_features.groupby('feature',as_index = False)['importance'].mean() best_features = best_features.groupby('feature',as_index = False)['importance'].mean()
best_features = best_features.sort_values(by = 'importance',ascending=False) best_features = best_features.sort_values(by = 'importance',ascending=False)
plt.figure(figsize=(8, 10)) plt.figure(figsize=(8, 10))
sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False)) sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
plt.title('LightGBM Features (avg over folds)') plt.title('LightGBM Features (avg over folds)')
plt.tight_layout() plt.tight_layout()
plt.savefig('feature_importances.jpg') plt.savefig('feature_importances.jpg')
# plt.show() # plt.show()
scores = 0 scores = 0
threshold = 0 threshold = 0
print('start training......') print('start training......')
print('训练集维度:',X_train.shape) print('训练集维度:',X_train.shape)
print('测试集维度:',X_test.shape) print('测试集维度:',X_test.shape)
for i, (trn_idx, val_idx) in enumerate(skf.split(X_train, y_train)): for i, (trn_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
clf = lgb.LGBMRegressor( clf = lgb.LGBMRegressor(
boosting_type='gbdt', boosting_type='gbdt',
objective='regression', objective='regression',
n_estimators=10000, n_estimators=10000,
learning_rate=0.1, learning_rate=0.1,
num_leaves=170, num_leaves=170,
max_bin=63, max_bin=63,
max_depth=-1, max_depth=-1,
random_state = 2021, random_state = 2021,
subsample_for_bin=200000, subsample_for_bin=200000,
feature_fraction=0.84, feature_fraction=0.84,
bagging_fraction=0.86, bagging_fraction=0.86,
bagging_freq=7, bagging_freq=7,
min_child_samples=89, min_child_samples=89,
lambda_l1=0.006237830242067111, lambda_l1=0.006237830242067111,
lambda_l2=2.016472023736186e-05, lambda_l2=2.016472023736186e-05,
metric=None, metric=None,
n_jobs = 30, n_jobs = 30,
# device='gpu' # device='gpu'
) )
clf.fit(X_train.iloc[trn_idx], y_train.iloc[trn_idx], eval_set=[(X_train.iloc[trn_idx], y_train.iloc[trn_idx]) clf.fit(X_train.iloc[trn_idx], y_train.iloc[trn_idx], eval_set=[(X_train.iloc[trn_idx], y_train.iloc[trn_idx])
, (X_train.iloc[val_idx], y_train.iloc[val_idx])], , (X_train.iloc[val_idx], y_train.iloc[val_idx])],
eval_metric=lambda y_true, y_pred:[lgb_score_mape(y_true, y_pred)], eval_metric=lambda y_true, y_pred:[lgb_score_mape(y_true, y_pred)],
verbose=100, early_stopping_rounds=100) verbose=100, early_stopping_rounds=100)
fold_importance_df = pd.DataFrame() fold_importance_df = pd.DataFrame()
fold_importance_df["feature"] = X_train.columns fold_importance_df["feature"] = X_train.columns
fold_importance_df["importance"] = clf.feature_importances_ fold_importance_df["importance"] = clf.feature_importances_
fold_importance_df["fold"] = i + 1 fold_importance_df["fold"] = i + 1
feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
print('predicting') print('predicting')
val_predict = clf.predict(X_train.iloc[val_idx], num_iteration=clf.best_iteration_) val_predict = clf.predict(X_train.iloc[val_idx], num_iteration=clf.best_iteration_)
test_predict[:,i] = clf.predict(X_test, num_iteration=clf.best_iteration_) test_predict[:,i] = clf.predict(X_test, num_iteration=clf.best_iteration_)
k_fold_mape.append(MAPE(y_train.iloc[val_idx],val_predict)) k_fold_mape.append(MAPE(y_train.iloc[val_idx],val_predict))
print("kfold_{}_mape_score:{} ".format(i, k_fold_mape[i])) print("kfold_{}_mape_score:{} ".format(i, k_fold_mape[i]))
print('Train set kfold {} mean mape:'.format(i), np.mean(k_fold_mape)) print('Train set kfold {} mean mape:'.format(i), np.mean(k_fold_mape))
display_importances(feature_importance_df) display_importances(feature_importance_df)
test_head['result'] = np.mean(test_predict,axis=1) test_head['result'] = np.mean(test_predict,axis=1)
test_head['id'] = test_head['order_id'] test_head['id'] = test_head['order_id']
test_head[['id','result']].to_csv('submission.csv',index=False) test_head[['id','result']].to_csv('submission.csv',index=False)
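Note: `MAPE` and `lgb_score_mape` are imported from `utils`, which is not part of this diff. A minimal sketch of what they are assumed to compute (the repository's actual implementation may differ):

```python
import numpy as np

def MAPE(y_true, y_pred):
    # mean absolute percentage error, the competition metric
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true))

def lgb_score_mape(y_true, y_pred):
    # custom eval for the LightGBM sklearn API: returns (name, value, is_higher_better)
    return 'mape', MAPE(y_true, y_pred), False
```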

@ -1,301 +1,301 @@
#coding=utf-8
"""
Author: Aigege
Code: https://github.com/AiIsBetter
"""
# date 2021.08.01
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import lightgbm as lgb
from utils import reduce_mem_usage, reduce_mem_usage_parallel, lgb_score_mape, MAPE
import gc
import warnings
import os, random, pickle
import optuna
warnings.filterwarnings("ignore")

def slice_id_change(x):
    hour = x * 5 / 60
    hour = np.floor(hour)
    hour += 8
    if hour >= 24:
        hour = hour - 24
    return hour
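# Example values (assuming slice_id counts 5-minute slices starting from 08:00 local time):
# slice_id_change(0) -> 8.0, slice_id_change(144) -> 20.0 (12 h later),
# slice_id_change(200) -> 0.0 (wraps past midnight).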
def optuna_print(tr_x, tr_y, te_x, te_y):
    def objective(trial, tr_x, tr_y, te_x, te_y):
        dtrain = lgb.Dataset(tr_x, label=tr_y)
        dvalid = lgb.Dataset(te_x, label=te_y)
        param = {
            "objective": "regression",
            "metric": "mape",
            "verbosity": -1,
            "boosting_type": "gbdt",
            'min_split_gain': 0,
            'random_state': 2021,
            'max_bin': trial.suggest_int('max_bin', 63, 250),
            'subsample_for_bin': trial.suggest_int('subsample_for_bin', 40000, 300000),
            "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
            "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
            "num_leaves": trial.suggest_int("num_leaves", 2, 256),
            "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
            "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        }
        # Add a callback for pruning.
        pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "mape")
        gbm = lgb.train(
            param, dtrain, valid_sets=[dvalid], verbose_eval=False, callbacks=[pruning_callback]
        )
        preds = gbm.predict(te_x)
        pred_labels = np.rint(preds)
        mape = MAPE(te_y, pred_labels)
        return mape
    study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), direction="minimize"
    )
    study.optimize(lambda trial: objective(trial, tr_x, tr_y, te_x, te_y),
                   n_trials=100)
    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")
    trial = study.best_trial
    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

head_columns = ['order_id', 'ata', 'distance', 'simple_eta', 'driver_id', 'slice_id']
result = []
result_time_weight = []
result_dis_weight = []
count = 0
df = []
nrows = None
root_path = '../data/giscup_2021/'
data_list = ['20200818', '20200819', '20200820', '20200821', '20200822', '20200823', '20200824',
             '20200825', '20200826', '20200827', '20200828', '20200829', '20200830', '20200831']
for name in os.listdir(root_path + 'train/'):
    data_time = name.split('.')[0]
    if data_time not in data_list:
        continue
    train = pd.read_csv(root_path + 'train/{}'.format(name), sep=';;', header=None, nrows=nrows)
    feature_cross = pd.read_csv(root_path + 'feature/train/cross_fea_order_id_level_{}.csv'.format(data_time), nrows=nrows)
    feature_link = pd.read_csv(root_path + 'feature/train/link_fea_order_id_level_{}.csv'.format(data_time), nrows=nrows)
    feature_head = pd.read_csv(root_path + 'feature/train/head_link_{}.csv'.format(data_time), nrows=nrows)
    feature_sqe = pd.read_csv(root_path + 'feature/train/{}.csv'.format(data_time), nrows=nrows)
    feature_cross['order_id'] = feature_cross['order_id'].astype(str)
    feature_link['order_id'] = feature_link['order_id'].astype(str)
    feature_head['order_id'] = feature_head['order_id'].astype(str)
    feature_sqe['order_id'] = feature_sqe['order_id'].astype(str)
    print("processing", data_time)
    # train.columns = ['head','link','cross']
    # train['head'] = train['head'].apply(lambda x:x.split(' '))
    train_head = pd.DataFrame(train[0].str.split(' ').tolist(), columns=['order_id', 'ata', 'distance', 'simple_eta', 'driver_id', 'slice_id'])
    train_head['order_id'] = train_head['order_id'].astype(str)
    train_head['ata'] = train_head['ata'].astype(float)
    train_head['distance'] = train_head['distance'].astype(float)
    train_head['simple_eta'] = train_head['simple_eta'].astype(float)
    train_head['driver_id'] = train_head['driver_id'].astype(int)
    train_head['slice_id'] = train_head['slice_id'].astype(int)
    train_head['date_time'] = int(data_time)
    train_head = train_head.merge(feature_cross, on='order_id', how='left')
    train_head = train_head.merge(feature_link, on='order_id', how='left')
    feature_head = feature_head.drop(['ata', 'distance', 'simple_eta', 'driver_id', 'slice_id', 'index',
                                      'date_time', 'link_count', 'link_time_sum', 'link_ratio_sum',
                                      'date_time_dt', 'weekday', 'hour', 'weather', 'hightemp', 'lowtemp',
                                      'len_tmp',
                                      'link_time_mean', 'link_time_std'],
                                     axis=1)
    feature_sqe = feature_sqe.drop(['pre_arrival_status', 'arrive_slice_id', 'slice_id'], axis=1)
    train_head = train_head.merge(feature_sqe, on='order_id', how='left')
    train_head = train_head.merge(feature_head, on='order_id', how='left')
    print('merge finish!')
    train_head = reduce_mem_usage_parallel(train_head, 28)
    df.append(train_head.drop('order_id', axis=1))
    del train
    gc.collect()
    count += 1
df = pd.concat(df, axis=0)
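# df now holds the 14 training days (2020-08-18 .. 2020-08-31) stacked row-wise,
# with head/link/cross/sequence features already merged on order_id.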
test = pd.read_csv(root_path + '20200901_test.txt', sep=';;', header=None, nrows=nrows)
test_head = pd.DataFrame(test[0].str.split(' ').tolist(), columns=['order_id', 'ata', 'distance', 'simple_eta', 'driver_id', 'slice_id'])
test_head['order_id'] = test_head['order_id'].astype(str)
test_head['ata'] = test_head['ata'].astype(float)
test_head['distance'] = test_head['distance'].astype(float)
test_head['simple_eta'] = test_head['simple_eta'].astype(float)
test_head['driver_id'] = test_head['driver_id'].astype(int)
test_head['slice_id'] = test_head['slice_id'].astype(int)
feature_cross = pd.read_csv(root_path + 'feature/test/cross_fea_order_id_level_{}.csv'.format('20200901'), nrows=nrows)
feature_link = pd.read_csv(root_path + 'feature/test/link_fea_order_id_level_{}.csv'.format('20200901'), nrows=nrows)
feature_head = pd.read_csv(root_path + 'feature/test/head_link_{}.csv'.format('20200901'), nrows=nrows)
feature_sqe = pd.read_csv(root_path + 'feature/test/{}.csv'.format('20200901'), nrows=nrows)
test_head['date_time'] = 20200901
feature_cross['order_id'] = feature_cross['order_id'].astype(str)
feature_link['order_id'] = feature_link['order_id'].astype(str)
feature_head['order_id'] = feature_head['order_id'].astype(str)
feature_sqe['order_id'] = feature_sqe['order_id'].astype(str)
test_head = test_head.merge(feature_cross, on='order_id', how='left')
test_head = test_head.merge(feature_link, on='order_id', how='left')
feature_head = feature_head.drop(['ata', 'distance', 'simple_eta', 'driver_id', 'slice_id', 'index',
                                  'date_time', 'link_count', 'link_time_sum', 'link_ratio_sum',
                                  'date_time_dt', 'weekday', 'hour', 'weather', 'hightemp', 'lowtemp',
                                  'len_tmp',
                                  'link_time_mean', 'link_time_std'],
                                 axis=1)
feature_sqe = feature_sqe.drop(['pre_arrival_status', 'arrive_slice_id', 'slice_id'], axis=1)
test_head = test_head.merge(feature_sqe, on='order_id', how='left')
test_head = test_head.merge(feature_head, on='order_id', how='left')
test_head = reduce_mem_usage_parallel(test_head, 28)
del feature_cross, feature_link
gc.collect()
X_train = df.drop('ata', axis=1)
y_train = df['ata']
X_test = test_head.drop(['order_id', 'ata'], axis=1)
# hyper-parameter tuning
# tr_x, te_x, tr_y, te_y = train_test_split(X_train, y_train, test_size=0.2, random_state=2021)
# optuna_print(tr_x, tr_y, te_x, te_y)
# del tr_x, te_x, tr_y, te_y
# gc.collect()
folds = 5
skf = KFold(n_splits=folds, shuffle=True, random_state=2021)
train_mean = np.zeros(shape=[1, folds])
test_predict = np.zeros(shape=[X_test.shape[0], folds], dtype=float)
k_fold_mape = []
feature_importance_df = pd.DataFrame()

# Display/plot feature importance
def display_importances(feature_importance_df_):
    feature_importance_df_.to_csv('feature_importances.csv', index=False)
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:100].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    best_features = best_features.groupby('feature', as_index=False)['importance'].mean()
    best_features = best_features.sort_values(by='importance', ascending=False)
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('feature_importances.jpg')
    # plt.show()

# use single model feature importance as best_feature_importances
feature_importance_df_ = pd.read_csv('best_feature_importances.csv')
cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).index
best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
best_features = best_features.groupby('feature', as_index=False)['importance'].mean()
best_features = best_features.sort_values(by='importance', ascending=False)
data = best_features.sort_values(by="importance", ascending=False)
feature_select = list(data['feature'].values)
feature_cols = feature_select
random_seed = list(range(2021))
max_depth = [4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7]
lambd1 = np.arange(0, 1, 0.0001)
lambd2 = np.arange(0, 1, 0.0001)
bagging_fraction = [i / 1000.0 for i in range(700, 800)]
feature_fraction = [i / 1000.0 for i in range(700, 800)]
min_child_weight = [i / 100.0 for i in range(150, 250)]
n_feature = [i / 100.0 for i in range(1, 32, 2)]
max_bin = list(range(130, 240))
subsample_for_bin = list(range(50000, 220000, 10000))
bagging_freq = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5]
num_leaves = list(range(130, 250))
random.shuffle(random_seed)
random.shuffle(max_depth)
random.shuffle(lambd1)
random.shuffle(lambd2)
random.shuffle(bagging_fraction)
random.shuffle(feature_fraction)
random.shuffle(min_child_weight)
random.shuffle(max_bin)
random.shuffle(subsample_for_bin)
random.shuffle(bagging_freq)
random.shuffle(num_leaves)
random.shuffle(n_feature)
with open('params.pkl', 'wb') as f:
    pickle.dump((random_seed, max_depth, lambd1, lambd2, bagging_fraction, feature_fraction, min_child_weight, max_bin, subsample_for_bin, bagging_freq, num_leaves, n_feature), f)
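# Persist the shuffled hyper-parameter lists so the 15 randomized runs below can be reproduced later.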
for iter in range(15):
    print('max_depth:', max_depth[iter], 'random_seed:', random_seed[iter], 'feature_fraction:', feature_fraction[iter],
          'bagging_fraction:', bagging_fraction[iter], 'min_child_weight:', min_child_weight[iter],
          'lambd1:', lambd1[iter], 'lambd2:', lambd2[iter], 'max_bin:', max_bin[iter], 'num_leaves:', num_leaves[iter],
          'subsample_for_bin:', subsample_for_bin[iter], 'bagging_freq:', bagging_freq[iter], 'n_feature:', n_feature[iter])
nround = 5000
for iter in range(15):
    if max_depth[iter] == 4:
        nround = 10000
    elif max_depth[iter] == 5:
        nround = 8000
    elif max_depth[iter] == 6:
        nround = 6000
    elif max_depth[iter] == 7:
        nround = 5000
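    # deeper trees get fewer boosting rounds; early_stopping_rounds=200 in fit() below
    # determines the effective number of trees in any case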
    X_train_r = X_train[feature_cols[:int(len(feature_cols) * 0.7)] +
                        feature_cols[int(len(feature_cols) * 0.7):int(len(feature_cols) * 0.7) + int(len(feature_cols) * n_feature[iter])]]
    X_test_r = X_test[feature_cols[:int(len(feature_cols) * 0.7)] +
                      feature_cols[int(len(feature_cols) * 0.7):int(len(feature_cols) * 0.7) + int(
                          len(feature_cols) * n_feature[iter])]]
    scores = 0
    threshold = 0
    k_fold_mape = []  # reset per run so the per-fold MAPE printout refers to this run only
    print('start training......')
    print('train set shape:', X_train_r.shape)
    print('test set shape:', X_test_r.shape)
    for i, (trn_idx, val_idx) in enumerate(skf.split(X_train_r, y_train)):
        clf = lgb.LGBMRegressor(
            boosting_type='gbdt',
            objective='regression',
            n_estimators=nround,
            learning_rate=0.08,
            num_leaves=num_leaves[iter],
            max_bin=max_bin[iter],
            max_depth=max_depth[iter],
            random_state=random_seed[iter],
            subsample_for_bin=subsample_for_bin[iter],
            feature_fraction=feature_fraction[iter],
            bagging_fraction=bagging_fraction[iter],
            bagging_freq=bagging_freq[iter],
            min_child_weight=min_child_weight[iter],
            lambda_l1=lambd1[iter],
            lambda_l2=lambd2[iter],
            metric=None,
            n_jobs=30,
            device='gpu'
        )
        clf.fit(X_train_r.iloc[trn_idx], y_train.iloc[trn_idx],
                eval_set=[(X_train_r.iloc[trn_idx], y_train.iloc[trn_idx]),
                          (X_train_r.iloc[val_idx], y_train.iloc[val_idx])],
                eval_metric='mape', verbose=100, early_stopping_rounds=200)
        print('predicting')
        val_predict = clf.predict(X_train_r.iloc[val_idx], num_iteration=clf.best_iteration_)
        test_predict[:, i] = clf.predict(X_test_r, num_iteration=clf.best_iteration_)
        k_fold_mape.append(MAPE(y_train.iloc[val_idx], val_predict))
        print("kfold_{}_mape_score:{} ".format(i, k_fold_mape[i]))
        print('Train set kfold {} mean mape:'.format(i), np.mean(k_fold_mape))
    # display_importances(feature_importance_df)
    test_head['result'] = np.mean(test_predict, axis=1)
    test_head['id'] = test_head['order_id']
    test_head[['id', 'result']].to_csv('random_result/submission_{}.csv'.format(iter), index=False)
    del X_train_r, X_test_r
    gc.collect()

# merge the per-run submissions into a single averaged submission
count = 0
result = 1
for name in os.listdir('random_result/'):
    tmp = pd.read_csv('random_result/' + name)
    if count == 0:
        result = tmp[['id']]
    tmp = tmp.rename(columns={'result': 'result{}'.format(count)})
    result = result.merge(tmp, on='id', how='left')
    count += 1
result['result'] = result.drop('id', axis=1).sum(axis=1)
result['result'] = result['result'] / count
result[['id', 'result']].to_csv('submission_merge.csv', index=False)
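As a side note, the per-seed submissions could equally be blended with a single pandas concat; a sketch assuming the same `random_result/` directory and column layout as above:

```python
import os
import pandas as pd

# read every per-seed submission, align on order id, and take the row-wise mean
parts = [pd.read_csv(os.path.join('random_result', f)).set_index('id')['result']
         for f in sorted(os.listdir('random_result/'))]
blend = pd.concat(parts, axis=1).mean(axis=1).rename('result').reset_index()
blend.to_csv('submission_merge.csv', index=False)
```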

@ -51,7 +51,7 @@
### 4. Model description
- - [DCN model](https://github.com/ben1234560/AiLearning-Theory-Applying/tree/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AB%9E%E8%B5%9B%E5%AE%9E%E6%88%98_%E4%BC%98%E8%83%9C%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88/%E6%BB%B4%E6%BB%B4%E2%80%94%E2%80%94%E9%A2%84%E4%BC%B0%E5%88%B0%E8%BE%BE%E6%97%B6%E9%97%B4/DCN_12953)
+ - [DCN distillation model](https://github.com/ben1234560/AiLearning-Theory-Applying/tree/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AB%9E%E8%B5%9B%E5%AE%9E%E6%88%98_%E4%BC%98%E8%83%9C%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88/%E6%BB%B4%E6%BB%B4%E2%80%94%E2%80%94%E9%A2%84%E4%BC%B0%E5%88%B0%E8%BE%BE%E6%97%B6%E9%97%B4/DCN_12953)
- ![1628669063602](assets/1628669063602.png)
- [WDR model](https://github.com/ben1234560/AiLearning-Theory-Applying/tree/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AB%9E%E8%B5%9B%E5%AE%9E%E6%88%98_%E4%BC%98%E8%83%9C%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88/%E6%BB%B4%E6%BB%B4%E2%80%94%E2%80%94%E9%A2%84%E4%BC%B0%E5%88%B0%E8%BE%BE%E6%97%B6%E9%97%B4/WD_128544)
- ![1628669073291](assets/1628669073291.png)
@ -81,8 +81,8 @@
### 7. File description
- - [DCN_12953](https://github.com/ben1234560/AiLearning-Theory-Applying/tree/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AB%9E%E8%B5%9B%E5%AE%9E%E6%88%98_%E4%BC%98%E8%83%9C%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88/%E6%BB%B4%E6%BB%B4%E2%80%94%E2%80%94%E9%A2%84%E4%BC%B0%E5%88%B0%E8%BE%BE%E6%97%B6%E9%97%B4/DCN_12953)
+ - [DCN distillation_12953](https://github.com/ben1234560/AiLearning-Theory-Applying/tree/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AB%9E%E8%B5%9B%E5%AE%9E%E6%88%98_%E4%BC%98%E8%83%9C%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88/%E6%BB%B4%E6%BB%B4%E2%80%94%E2%80%94%E9%A2%84%E4%BC%B0%E5%88%B0%E8%BE%BE%E6%97%B6%E9%97%B4/DCN_12953)
- - DCN model, online score 0.12953
+ - DCN distillation model (using "future" data), online score 0.12953
- dcn_model/[dcn_model.py](https://github.com/ben1234560/AiLearning-Theory-Applying/blob/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AB%9E%E8%B5%9B%E5%AE%9E%E6%88%98_%E4%BC%98%E8%83%9C%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88/%E6%BB%B4%E6%BB%B4%E2%80%94%E2%80%94%E9%A2%84%E4%BC%B0%E5%88%B0%E8%BE%BE%E6%97%B6%E9%97%B4/DCN_12953/dcn_model/dcn_model.py): model code
- dcn_model/[main.py](https://github.com/ben1234560/AiLearning-Theory-Applying/blob/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AB%9E%E8%B5%9B%E5%AE%9E%E6%88%98_%E4%BC%98%E8%83%9C%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88/%E6%BB%B4%E6%BB%B4%E2%80%94%E2%80%94%E9%A2%84%E4%BC%B0%E5%88%B0%E8%BE%BE%E6%97%B6%E9%97%B4/DCN_12953/dcn_model/main.py): main entry point, training and prediction
- dcn_model/[process.py](https://github.com/ben1234560/AiLearning-Theory-Applying/blob/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AB%9E%E8%B5%9B%E5%AE%9E%E6%88%98_%E4%BC%98%E8%83%9C%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88/%E6%BB%B4%E6%BB%B4%E2%80%94%E2%80%94%E9%A2%84%E4%BC%B0%E5%88%B0%E8%BE%BE%E6%97%B6%E9%97%B4/DCN_12953/dcn_model/process.py): feature preprocessing

@ -3349,7 +3349,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.7.3" "version": "3.8.8"
} }
}, },
"nbformat": 4, "nbformat": 4,
