diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/dcn_model/__pycache__/dcn_model.cpython-36.pyc b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/dcn_model/__pycache__/dcn_model.cpython-36.pyc
similarity index 100%
rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/dcn_model/__pycache__/dcn_model.cpython-36.pyc
rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/dcn_model/__pycache__/dcn_model.cpython-36.pyc
diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/dcn_model/__pycache__/process.cpython-36.pyc b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/dcn_model/__pycache__/process.cpython-36.pyc
similarity index 100%
rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/dcn_model/__pycache__/process.cpython-36.pyc
rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/dcn_model/__pycache__/process.cpython-36.pyc
diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/dcn_model/dcn_model.py b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/dcn_model/dcn_model.py
similarity index 100%
rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/dcn_model/dcn_model.py
rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/dcn_model/dcn_model.py
diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/dcn_model/main.py b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/dcn_model/main.py
similarity index 100%
rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/dcn_model/main.py
rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/dcn_model/main.py
diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/dcn_model/process.py b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/dcn_model/process.py
similarity index 100%
rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/dcn_model/process.py
rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/dcn_model/process.py
diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/log/main_0720_1.log b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/log/main_0720_1.log
similarity index 100%
rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/log/main_0720_1.log
rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/log/main_0720_1.log
diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/log/main_0720_2.log b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/log/main_0720_2.log
similarity index 100%
rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/log/main_0720_2.log
rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/log/main_0720_2.log
diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/model_h5/CATEGORICAL_COLS_0720_2.npy b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/model_h5/CATEGORICAL_COLS_0720_2.npy
similarity index 100%
rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/model_h5/CATEGORICAL_COLS_0720_2.npy
rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/model_h5/CATEGORICAL_COLS_0720_2.npy
diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/model_h5/cross_cols_list_0720_2.npy b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/model_h5/cross_cols_list_0720_2.npy
similarity index 100%
rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/model_h5/cross_cols_list_0720_2.npy
rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/model_h5/cross_cols_list_0720_2.npy
diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/model_h5/history_0720_2.npy b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/model_h5/history_0720_2.npy
similarity index 100%
rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/model_h5/history_0720_2.npy
rename to 机器学习竞赛实战_优胜解决方案/ACM
SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/model_h5/history_0720_2.npy diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/model_h5/link_cols_list_0720_2.npy b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/model_h5/link_cols_list_0720_2.npy similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/model_h5/link_cols_list_0720_2.npy rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/model_h5/link_cols_list_0720_2.npy diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/model_h5/mk_cols_list_0720_2.npy b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/model_h5/mk_cols_list_0720_2.npy similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/model_h5/mk_cols_list_0720_2.npy rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/model_h5/mk_cols_list_0720_2.npy diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/model_h5/t_model_0720_2.npy b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/model_h5/t_model_0720_2.npy similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/DCN_12953/model_h5/t_model_0720_2.npy rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/DCN蒸馏_12953/model_h5/t_model_0720_2.npy diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/1_sdne_embedding_allnext.py b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/1_sdne_embedding_allnext.py similarity index 98% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/1_sdne_embedding_allnext.py rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/1_sdne_embedding_allnext.py index 2528f1e..85fba36 100644 --- a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/1_sdne_embedding_allnext.py +++ b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/1_sdne_embedding_allnext.py @@ -1,153 +1,153 @@ -#coding=utf-8 -""" -Author: Aigege -Code: https://github.com/AiIsBetter -""" -# date 2021.08.01 -import numpy as np -import networkx as nx -import pandas as pd -from gem.embedding.node2vec import node2vec -import os -from utils import parallel_apply -from functools import partial -import gc -def link_id_find(gr): - gr_ = gr.copy() - tmp = list(gr_['link_id']) - link_id_tuple = [] - for i in range(len(tmp)-1): - link_id_tuple.append([tmp[i],tmp[i+1]]) - return link_id_tuple - -if __name__ == '__main__': - root_path = '../data/giscup_2021/' - nrows = None - ######################################nextlinks ####################################### - nextlinks = pd.read_csv(root_path + 'nextlinks.txt', sep=' ', header=None) - nextlinks.columns = ['from_id', 'to_id'] - nextlinks['to_id'] = nextlinks['to_id'].astype('str') - nextlinks['to_id'] = nextlinks['to_id'].apply(lambda x: x.split(",")) - nextlinks = pd.DataFrame({'from_id': nextlinks.from_id.repeat(nextlinks.to_id.str.len()), - 'to_id': np.concatenate(nextlinks.to_id.values)}) - nextlinks['from_id'] = nextlinks['from_id'].astype(int) - nextlinks['to_id'] = nextlinks['to_id'].astype(int) - from_id = nextlinks['from_id'].unique() - # nextlinks.to_csv('../data/giscup_2021/nextlink_all.csv',index=False) - # nextlinks = pd.read_csv('../data/giscup_2021/nextlink_all.csv') - - ######################################nextlinks ####################################### - if 'nextlinks_allday.csv' in os.listdir(root_path): - nextlinks = pd.read_csv(root_path + 'nextlinks_allday.csv') - else: - nextlinks_new = [] - for name in os.listdir(root_path + 'train/'): - data_time = name.split('.')[0] - if data_time == '20200803': - continue - train = pd.read_csv(root_path + 'train/{}'.format(name),sep= ';;',header=None,nrows=nrows) - train_head = 
pd.DataFrame(train[0].str.split(' ').tolist(), - columns=['order_id', 'ata', 'distance', 'simple_eta', 'driver_id', 'slice_id']) - train_head['order_id'] = train_head['order_id'].astype(str) - train_head['ata'] = train_head['ata'].astype(float) - train_head['distance'] = train_head['distance'].astype(float) - train_head['simple_eta'] = train_head['simple_eta'].astype(float) - train_head['driver_id'] = train_head['driver_id'].astype(int) - train_head['slice_id'] = train_head['slice_id'].astype(int) - data_link = train[[1]] - print("flag:", 1) - data_link['index'] = train_head.index - data_link['order_id'] = train_head['order_id'] - print("flag:", 2) - data_link['ata'] = train_head['ata'] - data_link['distance'] = train_head['distance'] - data_link['simple_eta'] = train_head['simple_eta'] - print("flag:", 3) - data_link['slice_id'] = train_head['slice_id'] - print("flag:", 4) - data_link_split = data_link[1].str.split(' ', expand=True).stack().to_frame() - print("flag:", 5) - data_link_split = data_link_split.reset_index(level=1, drop=True).rename(columns={0: 'link_info'}) - print("flag:", 6) - data_link_split = data_link[['order_id', 'index', 'ata', 'distance', 'simple_eta', 'slice_id']].join( - data_link_split) - print("flag:", 7) - data_link_split = data_link_split.reset_index(drop=True) - data_link_split[['link_id', - 'link_time', - 'link_ratio', - 'link_current_status', - 'link_arrival_status']] = data_link_split['link_info'].str.split(':|,', 5, expand=True) - print("flag:", 8) - data_link_split = data_link_split[['order_id','link_id']] - data_link_split['link_id'] = data_link_split['link_id'].astype(int) - features = pd.DataFrame({'order_id': data_link_split['order_id'].unique()}) - groupby = data_link_split.groupby(['order_id']) - func = partial(link_id_find) - g = parallel_apply(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000) - g = pd.DataFrame(g,columns=['from_id','to_id']) - g = g.drop_duplicates() - nextlinks_new.append(g) - nextlinks_new = pd.concat(nextlinks_new, axis=0) - nextlinks_new = nextlinks_new.drop_duplicates() - nextlinks_new = nextlinks_new.sort_values(by='from_id').reset_index(drop=True) - nextlinks = pd.concat([nextlinks,nextlinks_new],axis=0) - nextlinks = nextlinks.drop_duplicates() - nextlinks = nextlinks.sort_values(by='from_id').reset_index(drop=True) - print('save all csv') - nextlinks.to_csv(root_path+'nextlinks_allday.csv',index=False) - print('calcute weight') - nextlinks = nextlinks.sort_values(by='from_id').reset_index(drop=True) - nextlinks = nextlinks.drop_duplicates() - from_id_weight = nextlinks['from_id'].value_counts() - from_id_weight = from_id_weight.to_frame() - from_id_weight['index'] = from_id_weight.index - from_id_weight.columns = ['weight', 'from_id'] - nextlinks = pd.merge(nextlinks, from_id_weight, 'left', on=['from_id']) - print('calcute weight finish!') - nextlinks['to_id'] = nextlinks['to_id'].astype(int) - nextlinks['from_id'] = nextlinks['from_id'].astype(int) - id_key = list(set(nextlinks['from_id'].unique().tolist() + nextlinks['to_id'].unique().tolist())) - id_key_to_connected = dict(zip(id_key, range(len(id_key)))) - nextlinks['from_id'] = nextlinks['from_id'].map(id_key_to_connected) - nextlinks['to_id'] = nextlinks['to_id'].map(id_key_to_connected) - np.save(root_path + 'id_key_to_connected_allday.npy', id_key_to_connected) - print('id key save finish!') - print('start creating graph') - G = nx.DiGraph() - from_id = nextlinks['from_id'].to_list() - to_id = nextlinks['to_id'].to_list() - weight = 
nextlinks['weight'].to_list() - edge_tuple = list(zip(from_id, to_id,weight)) - # edge_tuple = tuple(from_id,to_id,weight) - print('adding') - G.add_weighted_edges_from(edge_tuple) - G = G.to_directed() - print('finish create graph!') - print('start train n2v') - look_back = list(G.nodes()) - embeddings = {} - models = [] - models.append(node2vec(d=128, max_iter=10, walk_len=80, num_walks=10, con_size=10, ret_p=1, inout_p=1)) - for embedding in models: - Y, t = embedding.learn_embedding(graph=G, edge_f=None, - is_weighted=True, no_python=True) - for i, embedding in enumerate(embedding.get_embedding()): - embeddings[look_back[i]] = embedding - np.save(root_path+'graph_embeddings_retp1.npy', embeddings) - print('nextlink graph embedding retp 1 finish!') # displays "world" - del models - gc.collect() - - look_back = list(G.nodes()) - embeddings = {} - models = [] - models.append(node2vec(d=128, max_iter=10, walk_len=80, num_walks=10, con_size=10, ret_p=0.5, inout_p=1)) - for embedding in models: - Y, t = embedding.learn_embedding(graph=G, edge_f=None, - is_weighted=True, no_python=True) - for i, embedding in enumerate(embedding.get_embedding()): - embeddings[look_back[i]] = embedding - np.save(root_path + 'graph_embeddings_retp05.npy', embeddings) - print('nextlink graph embedding retp 0.5 finish!') - +#coding=utf-8 +""" +Author: Aigege +Code: https://github.com/AiIsBetter +""" +# date 2021.08.01 +import numpy as np +import networkx as nx +import pandas as pd +from gem.embedding.node2vec import node2vec +import os +from utils import parallel_apply +from functools import partial +import gc +def link_id_find(gr): + gr_ = gr.copy() + tmp = list(gr_['link_id']) + link_id_tuple = [] + for i in range(len(tmp)-1): + link_id_tuple.append([tmp[i],tmp[i+1]]) + return link_id_tuple + +if __name__ == '__main__': + root_path = '../data/giscup_2021/' + nrows = None + ######################################nextlinks ####################################### + nextlinks = pd.read_csv(root_path + 'nextlinks.txt', sep=' ', header=None) + nextlinks.columns = ['from_id', 'to_id'] + nextlinks['to_id'] = nextlinks['to_id'].astype('str') + nextlinks['to_id'] = nextlinks['to_id'].apply(lambda x: x.split(",")) + nextlinks = pd.DataFrame({'from_id': nextlinks.from_id.repeat(nextlinks.to_id.str.len()), + 'to_id': np.concatenate(nextlinks.to_id.values)}) + nextlinks['from_id'] = nextlinks['from_id'].astype(int) + nextlinks['to_id'] = nextlinks['to_id'].astype(int) + from_id = nextlinks['from_id'].unique() + # nextlinks.to_csv('../data/giscup_2021/nextlink_all.csv',index=False) + # nextlinks = pd.read_csv('../data/giscup_2021/nextlink_all.csv') + + ######################################nextlinks ####################################### + if 'nextlinks_allday.csv' in os.listdir(root_path): + nextlinks = pd.read_csv(root_path + 'nextlinks_allday.csv') + else: + nextlinks_new = [] + for name in os.listdir(root_path + 'train/'): + data_time = name.split('.')[0] + if data_time == '20200803': + continue + train = pd.read_csv(root_path + 'train/{}'.format(name),sep= ';;',header=None,nrows=nrows) + train_head = pd.DataFrame(train[0].str.split(' ').tolist(), + columns=['order_id', 'ata', 'distance', 'simple_eta', 'driver_id', 'slice_id']) + train_head['order_id'] = train_head['order_id'].astype(str) + train_head['ata'] = train_head['ata'].astype(float) + train_head['distance'] = train_head['distance'].astype(float) + train_head['simple_eta'] = train_head['simple_eta'].astype(float) + train_head['driver_id'] = 
train_head['driver_id'].astype(int) + train_head['slice_id'] = train_head['slice_id'].astype(int) + data_link = train[[1]] + print("flag:", 1) + data_link['index'] = train_head.index + data_link['order_id'] = train_head['order_id'] + print("flag:", 2) + data_link['ata'] = train_head['ata'] + data_link['distance'] = train_head['distance'] + data_link['simple_eta'] = train_head['simple_eta'] + print("flag:", 3) + data_link['slice_id'] = train_head['slice_id'] + print("flag:", 4) + data_link_split = data_link[1].str.split(' ', expand=True).stack().to_frame() + print("flag:", 5) + data_link_split = data_link_split.reset_index(level=1, drop=True).rename(columns={0: 'link_info'}) + print("flag:", 6) + data_link_split = data_link[['order_id', 'index', 'ata', 'distance', 'simple_eta', 'slice_id']].join( + data_link_split) + print("flag:", 7) + data_link_split = data_link_split.reset_index(drop=True) + data_link_split[['link_id', + 'link_time', + 'link_ratio', + 'link_current_status', + 'link_arrival_status']] = data_link_split['link_info'].str.split(':|,', 5, expand=True) + print("flag:", 8) + data_link_split = data_link_split[['order_id','link_id']] + data_link_split['link_id'] = data_link_split['link_id'].astype(int) + features = pd.DataFrame({'order_id': data_link_split['order_id'].unique()}) + groupby = data_link_split.groupby(['order_id']) + func = partial(link_id_find) + g = parallel_apply(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000) + g = pd.DataFrame(g,columns=['from_id','to_id']) + g = g.drop_duplicates() + nextlinks_new.append(g) + nextlinks_new = pd.concat(nextlinks_new, axis=0) + nextlinks_new = nextlinks_new.drop_duplicates() + nextlinks_new = nextlinks_new.sort_values(by='from_id').reset_index(drop=True) + nextlinks = pd.concat([nextlinks,nextlinks_new],axis=0) + nextlinks = nextlinks.drop_duplicates() + nextlinks = nextlinks.sort_values(by='from_id').reset_index(drop=True) + print('save all csv') + nextlinks.to_csv(root_path+'nextlinks_allday.csv',index=False) + print('calcute weight') + nextlinks = nextlinks.sort_values(by='from_id').reset_index(drop=True) + nextlinks = nextlinks.drop_duplicates() + from_id_weight = nextlinks['from_id'].value_counts() + from_id_weight = from_id_weight.to_frame() + from_id_weight['index'] = from_id_weight.index + from_id_weight.columns = ['weight', 'from_id'] + nextlinks = pd.merge(nextlinks, from_id_weight, 'left', on=['from_id']) + print('calcute weight finish!') + nextlinks['to_id'] = nextlinks['to_id'].astype(int) + nextlinks['from_id'] = nextlinks['from_id'].astype(int) + id_key = list(set(nextlinks['from_id'].unique().tolist() + nextlinks['to_id'].unique().tolist())) + id_key_to_connected = dict(zip(id_key, range(len(id_key)))) + nextlinks['from_id'] = nextlinks['from_id'].map(id_key_to_connected) + nextlinks['to_id'] = nextlinks['to_id'].map(id_key_to_connected) + np.save(root_path + 'id_key_to_connected_allday.npy', id_key_to_connected) + print('id key save finish!') + print('start creating graph') + G = nx.DiGraph() + from_id = nextlinks['from_id'].to_list() + to_id = nextlinks['to_id'].to_list() + weight = nextlinks['weight'].to_list() + edge_tuple = list(zip(from_id, to_id,weight)) + # edge_tuple = tuple(from_id,to_id,weight) + print('adding') + G.add_weighted_edges_from(edge_tuple) + G = G.to_directed() + print('finish create graph!') + print('start train n2v') + look_back = list(G.nodes()) + embeddings = {} + models = [] + models.append(node2vec(d=128, max_iter=10, walk_len=80, num_walks=10, con_size=10, 
ret_p=1, inout_p=1)) + for embedding in models: + Y, t = embedding.learn_embedding(graph=G, edge_f=None, + is_weighted=True, no_python=True) + for i, embedding in enumerate(embedding.get_embedding()): + embeddings[look_back[i]] = embedding + np.save(root_path+'graph_embeddings_retp1.npy', embeddings) + print('nextlink graph embedding retp 1 finish!') # displays "world" + del models + gc.collect() + + look_back = list(G.nodes()) + embeddings = {} + models = [] + models.append(node2vec(d=128, max_iter=10, walk_len=80, num_walks=10, con_size=10, ret_p=0.5, inout_p=1)) + for embedding in models: + Y, t = embedding.learn_embedding(graph=G, edge_f=None, + is_weighted=True, no_python=True) + for i, embedding in enumerate(embedding.get_embedding()): + embeddings[look_back[i]] = embedding + np.save(root_path + 'graph_embeddings_retp05.npy', embeddings) + print('nextlink graph embedding retp 0.5 finish!') + diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/2_cross_fea_order_id_level.py b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/2_cross_fea_order_id_level.py similarity index 98% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/2_cross_fea_order_id_level.py rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/2_cross_fea_order_id_level.py index d36dc0a..c947d23 100644 --- a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/2_cross_fea_order_id_level.py +++ b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/2_cross_fea_order_id_level.py @@ -1,400 +1,400 @@ -#coding=utf-8 -""" -Author: Aigege -Code: https://github.com/AiIsBetter -""" -# date 2021.08.01 -import pandas as pd -import numpy as np -from sklearn.linear_model import LinearRegression -from sklearn.feature_extraction.text import CountVectorizer -import networkx as nx -import os -import gc -import warnings -from utils import parallel_apply_fea,add_features_in_group -from functools import partial -warnings.filterwarnings("ignore") - -def last_k_cross_time_interval(gr, periods): - gr_ = gr.copy() - gr_ = gr_.iloc[::-1] - gr_['t_i_v'] = gr_['cross_time'].diff() - gr_['t_i_v'] = gr_['t_i_v'] - gr_['t_i_v'] = gr_['t_i_v'].fillna(0) - gr_ = gr_.drop_duplicates().reset_index(drop = True) - - # cross time变化 - features = {} - for period in periods: - if period > 10e5: - period_name = 'zsl_cross_time_interval_all' - gr_period = gr_.copy() - else: - period_name = 'zsl_cross_time_interval_last_{}_'.format(period) - gr_period = gr_.iloc[:period] - features = add_features_in_group(features, gr_period, 't_i_v', - ['mean','max', 'min', 'std','sum'], - period_name) - return features - -# last k cross id time trend -def last_cross_time_features(gr,periods): - gr_ = gr.copy() - gr_ = gr_.iloc[::-1] - features = {} - for period in periods: - if period > 10e5: - period_name = 'zsl_all_' - gr_period = gr_.copy() - else: - period_name = 'zsl_last_{}_'.format(period) - gr_period = gr_.iloc[:period] - features = add_features_in_group(features, gr_period, 'cross_time', - ['max', 'sum', 'mean','min','std'], - period_name) - return features - - -# last k cross id time trend -def trend_in_last_k_cross_id_time(gr, periods): - gr_ = gr.copy() - gr_ = gr_.iloc[::-1] - features = {} - for period in periods: - gr_period = gr_.iloc[:period] - features = add_trend_feature(features, gr_period, - 'cross_time', 'zsl_{}_period_trend_'.format(period) - ) - return features -# trend feature -def add_trend_feature(features, gr, feature_name, prefix): - y = gr[feature_name].values - try: - x = np.arange(0, len(y)).reshape(-1, 1) - lr = LinearRegression() - lr.fit(x, y) - trend 
= lr.coef_[0] - except: - trend = np.nan - features['{}{}'.format(prefix, feature_name)] = trend - return features - -def slice_id_change(x): - hour = x * 5 / 60 - hour = np.floor(hour) - hour += 8 - if hour >= 24: - hour = hour - 24 - return hour -if __name__ == '__main__': - nrows = None - root_path = '../data/giscup_2021/' - read_idkey = np.load(root_path + 'id_key_to_connected_allday.npy', allow_pickle=True).item() - read_grapheb = np.load(root_path + 'graph_embeddings_retp1_directed.npy', allow_pickle=True).item() - read_grapheb_retp = np.load(root_path + 'graph_embeddings_retp05_directed.npy', allow_pickle=True).item() - for i in read_grapheb: - read_grapheb[i] = list(read_grapheb[i]) + list(read_grapheb_retp[i]) - del read_grapheb_retp - head_columns = ['order_id', 'ata', 'distance', 'simple_eta', 'driver_id','slice_id'] - embedding_k = 256 - fill_list = [0] * embedding_k - df = [] - #######################################nextlinks ####################################### - nextlinks = pd.read_csv(root_path+'nextlinks.txt', sep=' ', header=None) - nextlinks.columns=['from_id', 'to_id'] - nextlinks['to_id'] = nextlinks['to_id'].astype('str') - nextlinks['to_id'] = nextlinks['to_id'].apply(lambda x: x.split(",")) - nextlinks = pd.DataFrame({'from_id':nextlinks.from_id.repeat(nextlinks.to_id.str.len()), - 'to_id':np.concatenate(nextlinks.to_id.values)}) - from_id_weight = nextlinks['from_id'].value_counts() - from_id_weight = from_id_weight.to_frame() - from_id_weight['index'] = from_id_weight.index - - from_id_weight.columns=['weight', 'from_id'] - nextlinks = pd.merge(nextlinks,from_id_weight, 'left', on=['from_id']) - nextlinks = nextlinks.sort_values(by='weight',ascending=False) - G = nx.DiGraph() - from_id = nextlinks['from_id'].astype(str).to_list() - to_id = nextlinks['to_id'].to_list() - weight = nextlinks['weight'].to_list() - edge_tuple = list(zip(from_id, to_id,weight)) - print('adding') - G.add_weighted_edges_from(edge_tuple) - - dc = nx.algorithms.centrality.degree_centrality(G) - dc = sorted(dc.items(), key=lambda d: d[1],reverse=True) - dc = dc[:50000] - dc = [str(i[0]) for i in dc ] - #######################################cross ####################################### - for name in os.listdir(root_path+'train/'): - data_time = name.split('.')[0] - if data_time=='20200803': - continue - train = pd.read_csv(root_path+'train/{}'.format(name),sep= ';;',header=None,nrows=nrows) - print("开始处理", data_time) - train_head = pd.DataFrame(train[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) - train_head['order_id'] = train_head['order_id'].astype(str) - train_head['ata'] = train_head['ata'].astype(float) - train_head['distance'] = train_head['distance'].astype(float) - train_head['simple_eta'] = train_head['simple_eta'].astype(float) - train_head['driver_id'] = train_head['driver_id'].astype(int) - train_head['slice_id'] = train_head['slice_id'].astype(int) - # 处理corss数据 - data_cross = train[[2]] - data_cross['index'] = train_head.index - data_cross['order_id'] = train_head['order_id'] - data_cross_split = data_cross[2].str.split(' ', expand=True).stack().to_frame() - data_cross_split = data_cross_split.reset_index(level=1, drop=True).rename(columns={0: 'cross_info'}) - data_cross_split = data_cross[['index', 'order_id']].join(data_cross_split) - data_cross_split[['cross_id', 'cross_time']] = data_cross_split['cross_info'].str.split(':', 2, expand=True) - data_cross_split['cross_time'] = 
data_cross_split['cross_time'].astype(float) - tmp_cross_id = data_cross_split['cross_id'].str.split('_', expand=True) - tmp_cross_id.columns=['cross_id_in','cross_id_out'] - data_cross_split = pd.concat([data_cross_split,tmp_cross_id],axis=1).drop(['cross_id','cross_info'],axis=1) - data_cross_split['date_time'] = data_time - data_cross_split = data_cross_split.drop('index',axis=1).reset_index(drop=True) - print('preprocess finish!') - print('start feature engineering') - feature = train_head[['order_id', 'distance']] - ###################static fea############################################# - data_cross_split['zsl_cross_id_isnull'] =0 - data_cross_split.loc[data_cross_split['cross_id_in'].isnull(),'zsl_cross_id_isnull'] = 1 - data_cross_split.loc[data_cross_split['cross_id_in'].isnull(),'cross_id_in'] = '-1' - data_cross_split.loc[data_cross_split['cross_id_out'].isnull(),'cross_id_out'] = '-1' - #######################order cross_id count############################### - df = data_cross_split.groupby('order_id', as_index=False) - tmp_crossid_agg = df['cross_id_in'].agg({'zsl_order_cross_id_in_count': 'count'}) - tmp_crossid_agg['zsl_order_cross_id_in_count_bins'] = 0 - tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count']>=5)&(tmp_crossid_agg['zsl_order_cross_id_in_count']<10),'zsl_order_cross_id_in_count_bins']=1 - tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count']>=10)&(tmp_crossid_agg['zsl_order_cross_id_in_count']<20),'zsl_order_cross_id_in_count_bins']=2 - tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count']>=20),'zsl_order_cross_id_in_count_bins']=3 - feature = feature.merge(tmp_crossid_agg,on='order_id',how='left') - print('order cross_id count finish!') - #######################order cross id & distance############################### - feature['zsl_order_cross_is_highspeed'] = 0 - feature.loc[(feature['distance']>90000)&(feature['zsl_order_cross_id_in_count']<30),'zsl_order_cross_is_highspeed'] = 1 - print('order cross id & distance finish!') - #######################order cross id & nextlinks centry############################### - tmp = data_cross_split[data_cross_split['cross_id_in'].isin(dc)] - tmp = tmp.groupby('order_id', as_index=False) - tmp_linkid_centry_count = tmp['cross_id_in'].agg({'zsl_order_cross_id_in_centry_count': 'count'}) - feature = feature.merge(tmp_linkid_centry_count,on='order_id',how='left') - feature['zsl_order_cross_id_in_centry_count'] = feature['zsl_order_cross_id_in_centry_count'].fillna(0) - tmp = data_cross_split[data_cross_split['cross_id_out'].isin(dc)] - tmp = tmp.groupby('order_id', as_index=False) - tmp_linkid_centry_count = tmp['cross_id_out'].agg({'zsl_order_cross_id_out_centry_count': 'count'}) - feature = feature.merge(tmp_linkid_centry_count, on='order_id', how='left') - feature['zsl_order_cross_id_out_centry_count'] = feature['zsl_order_cross_id_out_centry_count'].fillna(0) - print('order cross_id & nextlinks centry finish!') - #######################order cross_time sum mean max min var std############################### - tmp_linktime_agg = df['cross_time'].agg({'zsl_order_cross_time_sum': 'sum','zsl_order_cross_time_mean': 'mean', - 'zsl_order_cross_time_max': 'max','zsl_order_cross_time_min': 'min', - 'zsl_order_cross_time_var': 'var'}) - feature = feature.merge(tmp_linktime_agg,on='order_id',how='left') - print('order cross_time sum mean max min var std finish!') - #######################order distance/link_id_count############################### - 
feature['zsl_distance_div_cross_id_count'] = feature['distance']*10/feature['zsl_order_cross_id_in_count'] - feature = feature.drop('distance', axis=1) - print('order distance div link_id_count finish!') - ###################trend fea############################################# - ###################trend cross time##################################### - groupby = data_cross_split.groupby(['order_id']) - func = partial(trend_in_last_k_cross_id_time, periods=[2, 5, 10, 20,100000000]) - g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000) - feature = feature.merge(g, on='order_id', how='left') - func = partial(last_cross_time_features, periods=[2, 5, 10, 20,100000000]) - g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000) - feature = feature.merge(g, on='order_id', how='left') - func = partial(last_k_cross_time_interval, periods=[2, 5, 10, 20, 100000000]) - g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000) - feature = feature.merge(g, on='order_id', how='left') - print('trend cross time finish!') - ####################nextlinks graph embedding####################### - data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].astype(int) - data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].map(read_idkey) - data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].fillna(0) - data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].astype(int) - data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].map(read_grapheb) - data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].fillna('0') - def replace_list(x): - if isinstance(x, str): - x = fill_list - return x - data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].apply(replace_list) - cross_id_in_col = ['zsl_cross_id_in_eb{}'.format(i) for i in range(embedding_k)] - agg_col = dict(zip(cross_id_in_col, ['mean'] * len(cross_id_in_col))) - cross_id_in_array = np.array(data_cross_split.pop('cross_id_in').to_list()) - cross_id_in_array = pd.DataFrame(cross_id_in_array, columns=agg_col, dtype=np.float16) - data_cross_split = pd.concat([data_cross_split, cross_id_in_array], axis=1) - tmp = data_cross_split.groupby('order_id', as_index=False) - tmp_crossidin_agg = tmp.agg(agg_col) - feature = feature.merge(tmp_crossidin_agg, on='order_id', how='left') - print('trend cross_id_in eb finish!') - data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].astype(int) - data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].map(read_idkey) - data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].fillna(0) - data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].astype(int) - data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].map(read_grapheb) - data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].fillna('0') - def replace_list(x): - if isinstance(x, str): - x = fill_list - return x - data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].apply(replace_list) - cross_id_out_col = ['zsl_cross_id_out_eb{}'.format(i) for i in range(embedding_k)] - agg_col = dict(zip(cross_id_out_col, ['mean'] * len(cross_id_out_col))) - cross_id_out_array = np.array(data_cross_split.pop('cross_id_out').to_list()) - cross_id_out_array = pd.DataFrame(cross_id_out_array, columns=agg_col, dtype=np.float16) - data_cross_split = pd.concat([data_cross_split, cross_id_out_array], axis=1) - tmp = 
data_cross_split.groupby('order_id', as_index=False) - tmp_crossidout_agg = tmp.agg(agg_col) - feature = feature.merge(tmp_crossidout_agg, on='order_id', how='left') - print('trend cross_id_out eb finish!') - multipy_df = [] - multipy_col = [] - for col1, col2 in zip(cross_id_in_col, cross_id_out_col): - tmp = feature[col1] * feature[col2] - multipy_df.append(tmp) - multipy_col.append(col1 + '_mul_' + col2) - multipy_df = pd.concat(multipy_df, axis=1) - multipy_df.columns = multipy_col - feature = pd.concat([feature, multipy_df], axis=1) - print('trend cross_id_out eb multipy finish!') - feature.to_csv(root_path + 'feature/train/cross_fea_order_id_level_{}.csv'.format(data_time), index=False) - del train - gc.collect() - - test = pd.read_csv(root_path+'20200901_test.txt',sep= ';;',header=None,nrows=nrows) - test_head = pd.DataFrame(test[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) - test_head['order_id'] = test_head['order_id'].astype(str) - test_head['ata'] = test_head['ata'].astype(float) - test_head['distance'] = test_head['distance'].astype(float) - test_head['simple_eta'] = test_head['simple_eta'].astype(float) - test_head['driver_id'] = test_head['driver_id'].astype(int) - test_head['slice_id'] = test_head['slice_id'].astype(int) - # 处理corss数据 - data_cross = test[[2]] - data_cross['index'] = test_head.index - data_cross['order_id'] = test_head['order_id'] - data_cross_split = data_cross[2].str.split(' ', expand=True).stack().to_frame() - data_cross_split = data_cross_split.reset_index(level=1, drop=True).rename(columns={0: 'cross_info'}) - data_cross_split = data_cross[['index', 'order_id']].join(data_cross_split) - data_cross_split[['cross_id', 'cross_time']] = data_cross_split['cross_info'].str.split(':', 2, expand=True) - data_cross_split['cross_time'] = data_cross_split['cross_time'].astype(float) - tmp_cross_id = data_cross_split['cross_id'].str.split('_', expand=True) - tmp_cross_id.columns = ['cross_id_in', 'cross_id_out'] - data_cross_split = pd.concat([data_cross_split, tmp_cross_id], axis=1).drop(['cross_id', 'cross_info'], axis=1) - data_cross_split['date_time'] = '20200901' - data_cross_split = data_cross_split.drop('index', axis=1).reset_index(drop=True) - print('preprocess finish!') - print('start feature engineering') - feature = test_head[['order_id', 'distance']] - ###################static fea############################################# - data_cross_split['zsl_cross_id_isnull'] = 0 - data_cross_split.loc[data_cross_split['cross_id_in'].isnull(), 'zsl_cross_id_isnull'] = 1 - data_cross_split.loc[data_cross_split['cross_id_in'].isnull(), 'cross_id_in'] = '-1' - data_cross_split.loc[data_cross_split['cross_id_out'].isnull(), 'cross_id_out'] = '-1' - #######################order cross_id count############################### - df = data_cross_split.groupby('order_id', as_index=False) - tmp_crossid_agg = df['cross_id_in'].agg({'zsl_order_cross_id_in_count': 'count'}) - tmp_crossid_agg['zsl_order_cross_id_in_count_bins'] = 0 - tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count'] >= 5) & ( - tmp_crossid_agg['zsl_order_cross_id_in_count'] < 10), 'zsl_order_cross_id_in_count_bins'] = 1 - tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count'] >= 10) & ( - tmp_crossid_agg['zsl_order_cross_id_in_count'] < 20), 'zsl_order_cross_id_in_count_bins'] = 2 - tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count'] >= 20), 'zsl_order_cross_id_in_count_bins'] = 3 - feature = 
feature.merge(tmp_crossid_agg, on='order_id', how='left') - print('order cross_id count finish!') - #######################order cross id & distance############################### - feature['zsl_order_cross_is_highspeed'] = 0 - feature.loc[(feature['distance'] > 90000) & ( - feature['zsl_order_cross_id_in_count'] < 30), 'zsl_order_cross_is_highspeed'] = 1 - print('order cross id & distance finish!') - #######################order cross id & nextlinks centry############################### - tmp = data_cross_split[data_cross_split['cross_id_in'].isin(dc)] - tmp = tmp.groupby('order_id', as_index=False) - tmp_linkid_centry_count = tmp['cross_id_in'].agg({'zsl_order_cross_id_in_centry_count': 'count'}) - feature = feature.merge(tmp_linkid_centry_count, on='order_id', how='left') - feature['zsl_order_cross_id_in_centry_count'] = feature['zsl_order_cross_id_in_centry_count'].fillna(0) - tmp = data_cross_split[data_cross_split['cross_id_out'].isin(dc)] - tmp = tmp.groupby('order_id', as_index=False) - tmp_linkid_centry_count = tmp['cross_id_out'].agg({'zsl_order_cross_id_out_centry_count': 'count'}) - feature = feature.merge(tmp_linkid_centry_count, on='order_id', how='left') - feature['zsl_order_cross_id_out_centry_count'] = feature['zsl_order_cross_id_out_centry_count'].fillna(0) - print('order cross_id & nextlinks centry finish!') - #######################order cross_time sum mean max min var std############################### - tmp_linktime_agg = df['cross_time'].agg({'zsl_order_cross_time_sum': 'sum', 'zsl_order_cross_time_mean': 'mean', - 'zsl_order_cross_time_max': 'max', 'zsl_order_cross_time_min': 'min', - 'zsl_order_cross_time_var': 'var'}) - feature = feature.merge(tmp_linktime_agg, on='order_id', how='left') - print('order cross_time sum mean max min var std finish!') - #######################order distance/link_id_count############################### - feature['zsl_distance_div_cross_id_count'] = feature['distance'] * 10 / feature['zsl_order_cross_id_in_count'] - feature = feature.drop('distance', axis=1) - print('order distance div link_id_count finish!') - ###################trend fea############################################# - ###################trend cross time##################################### - groupby = data_cross_split.groupby(['order_id']) - func = partial(trend_in_last_k_cross_id_time, periods=[2, 5, 10, 20, 100000000]) - g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000) - feature = feature.merge(g, on='order_id', how='left') - func = partial(last_cross_time_features, periods=[2, 5, 10, 20, 100000000]) - g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000) - feature = feature.merge(g, on='order_id', how='left') - func = partial(last_k_cross_time_interval, periods=[2, 5, 10, 20, 100000000]) - g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000) - feature = feature.merge(g, on='order_id', how='left') - print('trend cross time finish!') - ####################nextlinks graph embedding####################### - data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].astype(int) - data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].map(read_idkey) - data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].fillna(0) - data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].astype(int) - data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].map(read_grapheb) - data_cross_split['cross_id_in'] = 
data_cross_split['cross_id_in'].fillna('0') - def replace_list(x): - if isinstance(x, str): - x = fill_list - return x - data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].apply(replace_list) - cross_id_in_col = ['zsl_cross_id_in_eb{}'.format(i) for i in range(embedding_k)] - agg_col = dict(zip(cross_id_in_col, ['mean'] * len(cross_id_in_col))) - cross_id_in_array = np.array(data_cross_split.pop('cross_id_in').to_list()) - cross_id_in_array = pd.DataFrame(cross_id_in_array, columns=agg_col, dtype=np.float16) - data_cross_split = pd.concat([data_cross_split, cross_id_in_array], axis=1) - tmp = data_cross_split.groupby('order_id', as_index=False) - tmp_crossidin_agg = tmp.agg(agg_col) - feature = feature.merge(tmp_crossidin_agg, on='order_id', how='left') - print('trend cross_id_in eb finish!') - data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].astype(int) - data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].map(read_idkey) - data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].fillna(0) - data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].astype(int) - data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].map(read_grapheb) - data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].fillna('0') - def replace_list(x): - if isinstance(x, str): - x = fill_list - return x - data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].apply(replace_list) - cross_id_out_col = ['zsl_cross_id_out_eb{}'.format(i) for i in range(embedding_k)] - agg_col = dict(zip(cross_id_out_col, ['mean'] * len(cross_id_out_col))) - cross_id_out_array = np.array(data_cross_split.pop('cross_id_out').to_list()) - cross_id_out_array = pd.DataFrame(cross_id_out_array, columns=agg_col, dtype=np.float16) - data_cross_split = pd.concat([data_cross_split, cross_id_out_array], axis=1) - tmp = data_cross_split.groupby('order_id', as_index=False) - tmp_crossidout_agg = tmp.agg(agg_col) - feature = feature.merge(tmp_crossidout_agg, on='order_id', how='left') - print('trend cross_id_out eb finish!') - multipy_df = [] - multipy_col = [] - for col1, col2 in zip(cross_id_in_col, cross_id_out_col): - tmp = feature[col1] * feature[col2] - multipy_df.append(tmp) - multipy_col.append(col1 + '_mul_' + col2) - multipy_df = pd.concat(multipy_df, axis=1) - multipy_df.columns = multipy_col - feature = pd.concat([feature, multipy_df], axis=1) - print('trend cross_id_out eb multipy finish!') - feature.to_csv(root_path + 'feature/test/cross_fea_order_id_level_20200901.csv', index=False) +#coding=utf-8 +""" +Author: Aigege +Code: https://github.com/AiIsBetter +""" +# date 2021.08.01 +import pandas as pd +import numpy as np +from sklearn.linear_model import LinearRegression +from sklearn.feature_extraction.text import CountVectorizer +import networkx as nx +import os +import gc +import warnings +from utils import parallel_apply_fea,add_features_in_group +from functools import partial +warnings.filterwarnings("ignore") + +def last_k_cross_time_interval(gr, periods): + gr_ = gr.copy() + gr_ = gr_.iloc[::-1] + gr_['t_i_v'] = gr_['cross_time'].diff() + gr_['t_i_v'] = gr_['t_i_v'] + gr_['t_i_v'] = gr_['t_i_v'].fillna(0) + gr_ = gr_.drop_duplicates().reset_index(drop = True) + + # cross time变化 + features = {} + for period in periods: + if period > 10e5: + period_name = 'zsl_cross_time_interval_all' + gr_period = gr_.copy() + else: + period_name = 'zsl_cross_time_interval_last_{}_'.format(period) + gr_period = gr_.iloc[:period] + features = 
add_features_in_group(features, gr_period, 't_i_v', + ['mean','max', 'min', 'std','sum'], + period_name) + return features + +# last k cross id time trend +def last_cross_time_features(gr,periods): + gr_ = gr.copy() + gr_ = gr_.iloc[::-1] + features = {} + for period in periods: + if period > 10e5: + period_name = 'zsl_all_' + gr_period = gr_.copy() + else: + period_name = 'zsl_last_{}_'.format(period) + gr_period = gr_.iloc[:period] + features = add_features_in_group(features, gr_period, 'cross_time', + ['max', 'sum', 'mean','min','std'], + period_name) + return features + + +# last k cross id time trend +def trend_in_last_k_cross_id_time(gr, periods): + gr_ = gr.copy() + gr_ = gr_.iloc[::-1] + features = {} + for period in periods: + gr_period = gr_.iloc[:period] + features = add_trend_feature(features, gr_period, + 'cross_time', 'zsl_{}_period_trend_'.format(period) + ) + return features +# trend feature +def add_trend_feature(features, gr, feature_name, prefix): + y = gr[feature_name].values + try: + x = np.arange(0, len(y)).reshape(-1, 1) + lr = LinearRegression() + lr.fit(x, y) + trend = lr.coef_[0] + except: + trend = np.nan + features['{}{}'.format(prefix, feature_name)] = trend + return features + +def slice_id_change(x): + hour = x * 5 / 60 + hour = np.floor(hour) + hour += 8 + if hour >= 24: + hour = hour - 24 + return hour +if __name__ == '__main__': + nrows = None + root_path = '../data/giscup_2021/' + read_idkey = np.load(root_path + 'id_key_to_connected_allday.npy', allow_pickle=True).item() + read_grapheb = np.load(root_path + 'graph_embeddings_retp1_directed.npy', allow_pickle=True).item() + read_grapheb_retp = np.load(root_path + 'graph_embeddings_retp05_directed.npy', allow_pickle=True).item() + for i in read_grapheb: + read_grapheb[i] = list(read_grapheb[i]) + list(read_grapheb_retp[i]) + del read_grapheb_retp + head_columns = ['order_id', 'ata', 'distance', 'simple_eta', 'driver_id','slice_id'] + embedding_k = 256 + fill_list = [0] * embedding_k + df = [] + #######################################nextlinks ####################################### + nextlinks = pd.read_csv(root_path+'nextlinks.txt', sep=' ', header=None) + nextlinks.columns=['from_id', 'to_id'] + nextlinks['to_id'] = nextlinks['to_id'].astype('str') + nextlinks['to_id'] = nextlinks['to_id'].apply(lambda x: x.split(",")) + nextlinks = pd.DataFrame({'from_id':nextlinks.from_id.repeat(nextlinks.to_id.str.len()), + 'to_id':np.concatenate(nextlinks.to_id.values)}) + from_id_weight = nextlinks['from_id'].value_counts() + from_id_weight = from_id_weight.to_frame() + from_id_weight['index'] = from_id_weight.index + + from_id_weight.columns=['weight', 'from_id'] + nextlinks = pd.merge(nextlinks,from_id_weight, 'left', on=['from_id']) + nextlinks = nextlinks.sort_values(by='weight',ascending=False) + G = nx.DiGraph() + from_id = nextlinks['from_id'].astype(str).to_list() + to_id = nextlinks['to_id'].to_list() + weight = nextlinks['weight'].to_list() + edge_tuple = list(zip(from_id, to_id,weight)) + print('adding') + G.add_weighted_edges_from(edge_tuple) + + dc = nx.algorithms.centrality.degree_centrality(G) + dc = sorted(dc.items(), key=lambda d: d[1],reverse=True) + dc = dc[:50000] + dc = [str(i[0]) for i in dc ] + #######################################cross ####################################### + for name in os.listdir(root_path+'train/'): + data_time = name.split('.')[0] + if data_time=='20200803': + continue + train = pd.read_csv(root_path+'train/{}'.format(name),sep= ';;',header=None,nrows=nrows) + 
print("开始处理", data_time) + train_head = pd.DataFrame(train[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) + train_head['order_id'] = train_head['order_id'].astype(str) + train_head['ata'] = train_head['ata'].astype(float) + train_head['distance'] = train_head['distance'].astype(float) + train_head['simple_eta'] = train_head['simple_eta'].astype(float) + train_head['driver_id'] = train_head['driver_id'].astype(int) + train_head['slice_id'] = train_head['slice_id'].astype(int) + # 处理corss数据 + data_cross = train[[2]] + data_cross['index'] = train_head.index + data_cross['order_id'] = train_head['order_id'] + data_cross_split = data_cross[2].str.split(' ', expand=True).stack().to_frame() + data_cross_split = data_cross_split.reset_index(level=1, drop=True).rename(columns={0: 'cross_info'}) + data_cross_split = data_cross[['index', 'order_id']].join(data_cross_split) + data_cross_split[['cross_id', 'cross_time']] = data_cross_split['cross_info'].str.split(':', 2, expand=True) + data_cross_split['cross_time'] = data_cross_split['cross_time'].astype(float) + tmp_cross_id = data_cross_split['cross_id'].str.split('_', expand=True) + tmp_cross_id.columns=['cross_id_in','cross_id_out'] + data_cross_split = pd.concat([data_cross_split,tmp_cross_id],axis=1).drop(['cross_id','cross_info'],axis=1) + data_cross_split['date_time'] = data_time + data_cross_split = data_cross_split.drop('index',axis=1).reset_index(drop=True) + print('preprocess finish!') + print('start feature engineering') + feature = train_head[['order_id', 'distance']] + ###################static fea############################################# + data_cross_split['zsl_cross_id_isnull'] =0 + data_cross_split.loc[data_cross_split['cross_id_in'].isnull(),'zsl_cross_id_isnull'] = 1 + data_cross_split.loc[data_cross_split['cross_id_in'].isnull(),'cross_id_in'] = '-1' + data_cross_split.loc[data_cross_split['cross_id_out'].isnull(),'cross_id_out'] = '-1' + #######################order cross_id count############################### + df = data_cross_split.groupby('order_id', as_index=False) + tmp_crossid_agg = df['cross_id_in'].agg({'zsl_order_cross_id_in_count': 'count'}) + tmp_crossid_agg['zsl_order_cross_id_in_count_bins'] = 0 + tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count']>=5)&(tmp_crossid_agg['zsl_order_cross_id_in_count']<10),'zsl_order_cross_id_in_count_bins']=1 + tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count']>=10)&(tmp_crossid_agg['zsl_order_cross_id_in_count']<20),'zsl_order_cross_id_in_count_bins']=2 + tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count']>=20),'zsl_order_cross_id_in_count_bins']=3 + feature = feature.merge(tmp_crossid_agg,on='order_id',how='left') + print('order cross_id count finish!') + #######################order cross id & distance############################### + feature['zsl_order_cross_is_highspeed'] = 0 + feature.loc[(feature['distance']>90000)&(feature['zsl_order_cross_id_in_count']<30),'zsl_order_cross_is_highspeed'] = 1 + print('order cross id & distance finish!') + #######################order cross id & nextlinks centry############################### + tmp = data_cross_split[data_cross_split['cross_id_in'].isin(dc)] + tmp = tmp.groupby('order_id', as_index=False) + tmp_linkid_centry_count = tmp['cross_id_in'].agg({'zsl_order_cross_id_in_centry_count': 'count'}) + feature = feature.merge(tmp_linkid_centry_count,on='order_id',how='left') + feature['zsl_order_cross_id_in_centry_count'] = 
feature['zsl_order_cross_id_in_centry_count'].fillna(0) + tmp = data_cross_split[data_cross_split['cross_id_out'].isin(dc)] + tmp = tmp.groupby('order_id', as_index=False) + tmp_linkid_centry_count = tmp['cross_id_out'].agg({'zsl_order_cross_id_out_centry_count': 'count'}) + feature = feature.merge(tmp_linkid_centry_count, on='order_id', how='left') + feature['zsl_order_cross_id_out_centry_count'] = feature['zsl_order_cross_id_out_centry_count'].fillna(0) + print('order cross_id & nextlinks centry finish!') + #######################order cross_time sum mean max min var std############################### + tmp_linktime_agg = df['cross_time'].agg({'zsl_order_cross_time_sum': 'sum','zsl_order_cross_time_mean': 'mean', + 'zsl_order_cross_time_max': 'max','zsl_order_cross_time_min': 'min', + 'zsl_order_cross_time_var': 'var'}) + feature = feature.merge(tmp_linktime_agg,on='order_id',how='left') + print('order cross_time sum mean max min var std finish!') + #######################order distance/link_id_count############################### + feature['zsl_distance_div_cross_id_count'] = feature['distance']*10/feature['zsl_order_cross_id_in_count'] + feature = feature.drop('distance', axis=1) + print('order distance div link_id_count finish!') + ###################trend fea############################################# + ###################trend cross time##################################### + groupby = data_cross_split.groupby(['order_id']) + func = partial(trend_in_last_k_cross_id_time, periods=[2, 5, 10, 20,100000000]) + g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000) + feature = feature.merge(g, on='order_id', how='left') + func = partial(last_cross_time_features, periods=[2, 5, 10, 20,100000000]) + g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000) + feature = feature.merge(g, on='order_id', how='left') + func = partial(last_k_cross_time_interval, periods=[2, 5, 10, 20, 100000000]) + g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000) + feature = feature.merge(g, on='order_id', how='left') + print('trend cross time finish!') + ####################nextlinks graph embedding####################### + data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].astype(int) + data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].map(read_idkey) + data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].fillna(0) + data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].astype(int) + data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].map(read_grapheb) + data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].fillna('0') + def replace_list(x): + if isinstance(x, str): + x = fill_list + return x + data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].apply(replace_list) + cross_id_in_col = ['zsl_cross_id_in_eb{}'.format(i) for i in range(embedding_k)] + agg_col = dict(zip(cross_id_in_col, ['mean'] * len(cross_id_in_col))) + cross_id_in_array = np.array(data_cross_split.pop('cross_id_in').to_list()) + cross_id_in_array = pd.DataFrame(cross_id_in_array, columns=agg_col, dtype=np.float16) + data_cross_split = pd.concat([data_cross_split, cross_id_in_array], axis=1) + tmp = data_cross_split.groupby('order_id', as_index=False) + tmp_crossidin_agg = tmp.agg(agg_col) + feature = feature.merge(tmp_crossidin_agg, on='order_id', how='left') + print('trend cross_id_in eb finish!') + 
data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].astype(int) + data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].map(read_idkey) + data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].fillna(0) + data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].astype(int) + data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].map(read_grapheb) + data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].fillna('0') + def replace_list(x): + if isinstance(x, str): + x = fill_list + return x + data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].apply(replace_list) + cross_id_out_col = ['zsl_cross_id_out_eb{}'.format(i) for i in range(embedding_k)] + agg_col = dict(zip(cross_id_out_col, ['mean'] * len(cross_id_out_col))) + cross_id_out_array = np.array(data_cross_split.pop('cross_id_out').to_list()) + cross_id_out_array = pd.DataFrame(cross_id_out_array, columns=agg_col, dtype=np.float16) + data_cross_split = pd.concat([data_cross_split, cross_id_out_array], axis=1) + tmp = data_cross_split.groupby('order_id', as_index=False) + tmp_crossidout_agg = tmp.agg(agg_col) + feature = feature.merge(tmp_crossidout_agg, on='order_id', how='left') + print('trend cross_id_out eb finish!') + multipy_df = [] + multipy_col = [] + for col1, col2 in zip(cross_id_in_col, cross_id_out_col): + tmp = feature[col1] * feature[col2] + multipy_df.append(tmp) + multipy_col.append(col1 + '_mul_' + col2) + multipy_df = pd.concat(multipy_df, axis=1) + multipy_df.columns = multipy_col + feature = pd.concat([feature, multipy_df], axis=1) + print('trend cross_id_out eb multipy finish!') + feature.to_csv(root_path + 'feature/train/cross_fea_order_id_level_{}.csv'.format(data_time), index=False) + del train + gc.collect() + + test = pd.read_csv(root_path+'20200901_test.txt',sep= ';;',header=None,nrows=nrows) + test_head = pd.DataFrame(test[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) + test_head['order_id'] = test_head['order_id'].astype(str) + test_head['ata'] = test_head['ata'].astype(float) + test_head['distance'] = test_head['distance'].astype(float) + test_head['simple_eta'] = test_head['simple_eta'].astype(float) + test_head['driver_id'] = test_head['driver_id'].astype(int) + test_head['slice_id'] = test_head['slice_id'].astype(int) + # 处理corss数据 + data_cross = test[[2]] + data_cross['index'] = test_head.index + data_cross['order_id'] = test_head['order_id'] + data_cross_split = data_cross[2].str.split(' ', expand=True).stack().to_frame() + data_cross_split = data_cross_split.reset_index(level=1, drop=True).rename(columns={0: 'cross_info'}) + data_cross_split = data_cross[['index', 'order_id']].join(data_cross_split) + data_cross_split[['cross_id', 'cross_time']] = data_cross_split['cross_info'].str.split(':', 2, expand=True) + data_cross_split['cross_time'] = data_cross_split['cross_time'].astype(float) + tmp_cross_id = data_cross_split['cross_id'].str.split('_', expand=True) + tmp_cross_id.columns = ['cross_id_in', 'cross_id_out'] + data_cross_split = pd.concat([data_cross_split, tmp_cross_id], axis=1).drop(['cross_id', 'cross_info'], axis=1) + data_cross_split['date_time'] = '20200901' + data_cross_split = data_cross_split.drop('index', axis=1).reset_index(drop=True) + print('preprocess finish!') + print('start feature engineering') + feature = test_head[['order_id', 'distance']] + ###################static fea############################################# + 
data_cross_split['zsl_cross_id_isnull'] = 0 + data_cross_split.loc[data_cross_split['cross_id_in'].isnull(), 'zsl_cross_id_isnull'] = 1 + data_cross_split.loc[data_cross_split['cross_id_in'].isnull(), 'cross_id_in'] = '-1' + data_cross_split.loc[data_cross_split['cross_id_out'].isnull(), 'cross_id_out'] = '-1' + #######################order cross_id count############################### + df = data_cross_split.groupby('order_id', as_index=False) + tmp_crossid_agg = df['cross_id_in'].agg({'zsl_order_cross_id_in_count': 'count'}) + tmp_crossid_agg['zsl_order_cross_id_in_count_bins'] = 0 + tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count'] >= 5) & ( + tmp_crossid_agg['zsl_order_cross_id_in_count'] < 10), 'zsl_order_cross_id_in_count_bins'] = 1 + tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count'] >= 10) & ( + tmp_crossid_agg['zsl_order_cross_id_in_count'] < 20), 'zsl_order_cross_id_in_count_bins'] = 2 + tmp_crossid_agg.loc[(tmp_crossid_agg['zsl_order_cross_id_in_count'] >= 20), 'zsl_order_cross_id_in_count_bins'] = 3 + feature = feature.merge(tmp_crossid_agg, on='order_id', how='left') + print('order cross_id count finish!') + #######################order cross id & distance############################### + feature['zsl_order_cross_is_highspeed'] = 0 + feature.loc[(feature['distance'] > 90000) & ( + feature['zsl_order_cross_id_in_count'] < 30), 'zsl_order_cross_is_highspeed'] = 1 + print('order cross id & distance finish!') + #######################order cross id & nextlinks centry############################### + tmp = data_cross_split[data_cross_split['cross_id_in'].isin(dc)] + tmp = tmp.groupby('order_id', as_index=False) + tmp_linkid_centry_count = tmp['cross_id_in'].agg({'zsl_order_cross_id_in_centry_count': 'count'}) + feature = feature.merge(tmp_linkid_centry_count, on='order_id', how='left') + feature['zsl_order_cross_id_in_centry_count'] = feature['zsl_order_cross_id_in_centry_count'].fillna(0) + tmp = data_cross_split[data_cross_split['cross_id_out'].isin(dc)] + tmp = tmp.groupby('order_id', as_index=False) + tmp_linkid_centry_count = tmp['cross_id_out'].agg({'zsl_order_cross_id_out_centry_count': 'count'}) + feature = feature.merge(tmp_linkid_centry_count, on='order_id', how='left') + feature['zsl_order_cross_id_out_centry_count'] = feature['zsl_order_cross_id_out_centry_count'].fillna(0) + print('order cross_id & nextlinks centry finish!') + #######################order cross_time sum mean max min var std############################### + tmp_linktime_agg = df['cross_time'].agg({'zsl_order_cross_time_sum': 'sum', 'zsl_order_cross_time_mean': 'mean', + 'zsl_order_cross_time_max': 'max', 'zsl_order_cross_time_min': 'min', + 'zsl_order_cross_time_var': 'var'}) + feature = feature.merge(tmp_linktime_agg, on='order_id', how='left') + print('order cross_time sum mean max min var std finish!') + #######################order distance/link_id_count############################### + feature['zsl_distance_div_cross_id_count'] = feature['distance'] * 10 / feature['zsl_order_cross_id_in_count'] + feature = feature.drop('distance', axis=1) + print('order distance div link_id_count finish!') + ###################trend fea############################################# + ###################trend cross time##################################### + groupby = data_cross_split.groupby(['order_id']) + func = partial(trend_in_last_k_cross_id_time, periods=[2, 5, 10, 20, 100000000]) + g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000) 
+ feature = feature.merge(g, on='order_id', how='left') + func = partial(last_cross_time_features, periods=[2, 5, 10, 20, 100000000]) + g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000) + feature = feature.merge(g, on='order_id', how='left') + func = partial(last_k_cross_time_interval, periods=[2, 5, 10, 20, 100000000]) + g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000) + feature = feature.merge(g, on='order_id', how='left') + print('trend cross time finish!') + ####################nextlinks graph embedding####################### + data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].astype(int) + data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].map(read_idkey) + data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].fillna(0) + data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].astype(int) + data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].map(read_grapheb) + data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].fillna('0') + def replace_list(x): + if isinstance(x, str): + x = fill_list + return x + data_cross_split['cross_id_in'] = data_cross_split['cross_id_in'].apply(replace_list) + cross_id_in_col = ['zsl_cross_id_in_eb{}'.format(i) for i in range(embedding_k)] + agg_col = dict(zip(cross_id_in_col, ['mean'] * len(cross_id_in_col))) + cross_id_in_array = np.array(data_cross_split.pop('cross_id_in').to_list()) + cross_id_in_array = pd.DataFrame(cross_id_in_array, columns=agg_col, dtype=np.float16) + data_cross_split = pd.concat([data_cross_split, cross_id_in_array], axis=1) + tmp = data_cross_split.groupby('order_id', as_index=False) + tmp_crossidin_agg = tmp.agg(agg_col) + feature = feature.merge(tmp_crossidin_agg, on='order_id', how='left') + print('trend cross_id_in eb finish!') + data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].astype(int) + data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].map(read_idkey) + data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].fillna(0) + data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].astype(int) + data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].map(read_grapheb) + data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].fillna('0') + def replace_list(x): + if isinstance(x, str): + x = fill_list + return x + data_cross_split['cross_id_out'] = data_cross_split['cross_id_out'].apply(replace_list) + cross_id_out_col = ['zsl_cross_id_out_eb{}'.format(i) for i in range(embedding_k)] + agg_col = dict(zip(cross_id_out_col, ['mean'] * len(cross_id_out_col))) + cross_id_out_array = np.array(data_cross_split.pop('cross_id_out').to_list()) + cross_id_out_array = pd.DataFrame(cross_id_out_array, columns=agg_col, dtype=np.float16) + data_cross_split = pd.concat([data_cross_split, cross_id_out_array], axis=1) + tmp = data_cross_split.groupby('order_id', as_index=False) + tmp_crossidout_agg = tmp.agg(agg_col) + feature = feature.merge(tmp_crossidout_agg, on='order_id', how='left') + print('trend cross_id_out eb finish!') + multipy_df = [] + multipy_col = [] + for col1, col2 in zip(cross_id_in_col, cross_id_out_col): + tmp = feature[col1] * feature[col2] + multipy_df.append(tmp) + multipy_col.append(col1 + '_mul_' + col2) + multipy_df = pd.concat(multipy_df, axis=1) + multipy_df.columns = multipy_col + feature = pd.concat([feature, multipy_df], axis=1) + print('trend cross_id_out eb multipy 
finish!') + feature.to_csv(root_path + 'feature/test/cross_fea_order_id_level_20200901.csv', index=False) diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/3_link_fea_order_id_level.py b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/3_link_fea_order_id_level.py similarity index 98% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/3_link_fea_order_id_level.py rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/3_link_fea_order_id_level.py index 5dc42ed..1d540dd 100644 --- a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/3_link_fea_order_id_level.py +++ b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/3_link_fea_order_id_level.py @@ -1,438 +1,438 @@ -#coding=utf-8 -""" -Author: Aigege -Code: https://github.com/AiIsBetter -""" -# date 2021.08.01 -import pandas as pd -import numpy as np - -from sklearn.linear_model import LinearRegression -from sklearn.feature_extraction.text import CountVectorizer -import networkx as nx -import os -import gc -import warnings -from utils import parallel_apply_fea,add_features_in_group -from functools import partial -warnings.filterwarnings("ignore") - -def last_k_link_time_interval(gr, periods): - gr_ = gr.copy() - gr_ = gr_.iloc[::-1] - gr_['t_i_v'] = gr_['link_time'].diff() - gr_['t_i_v'] = gr_['t_i_v'] - gr_['t_i_v'] = gr_['t_i_v'].fillna(0) - - gr_['c_s_v'] = gr_['link_current_status'].diff() - gr_['c_s_v'] = gr_['c_s_v'] - gr_['c_s_v'] = gr_['c_s_v'].fillna(0) - - gr_ = gr_.drop_duplicates().reset_index(drop = True) - - # link time变化 - features = {} - for period in periods: - if period > 10e5: - period_name = 'zsl_link_time_interval_all' - gr_period = gr_.copy() - else: - period_name = 'zsl_link_time_interval_last_{}_'.format(period) - gr_period = gr_.iloc[:period] - features = add_features_in_group(features, gr_period, 't_i_v', - ['mean','max', 'min', 'std','skew','sum'], - # ['diff'], - period_name) - # current status变化 - for period in periods: - if period > 10e5: - period_name = 'zsl_link_current_status_interval_all' - gr_period = gr_.copy() - else: - period_name = 'zsl_link_current_status_interval_last_{}_'.format(period) - gr_period = gr_.iloc[:period] - features = add_features_in_group(features, gr_period, 'c_s_v', - ['mean', 'std', 'skew'], - # ['diff'], - period_name) - return features - -# last k link id time trend -def last_link_time_features(gr,periods): - gr_ = gr.copy() - gr_ = gr_.iloc[::-1] - features = {} - for period in periods: - if period > 10e5: - period_name = 'zsl_all_' - gr_period = gr_.copy() - else: - period_name = 'zsl_last_{}_'.format(period) - gr_period = gr_.iloc[:period] - features = add_features_in_group(features, gr_period, 'link_time', - ['max', 'sum', 'mean','min','skew','std'], - period_name) - features = add_features_in_group(features, gr_period, 'link_current_status', - ['mean', 'nunique'], - period_name) - return features -# last k link id time trend -def trend_in_last_k_link_id_time(gr, periods): - gr_ = gr.copy() - gr_ = gr_.iloc[::-1] - features = {} - for period in periods: - gr_period = gr_.iloc[:period] - features = add_trend_feature(features, gr_period, - 'link_time', 'zsl_{}_period_trend_'.format(period) - ) - - return features -# trend feature -def add_trend_feature(features, gr, feature_name, prefix): - y = gr[feature_name].values - try: - x = np.arange(0, len(y)).reshape(-1, 1) - lr = LinearRegression() - lr.fit(x, y) - trend = lr.coef_[0] - except: - trend = np.nan - features['{}{}'.format(prefix, feature_name)] = trend - return features - -def slice_id_change(x): - hour = x * 5 / 60 - hour = 
np.floor(hour) - hour += 8 - if hour >= 24: - hour = hour - 24 - return hour -if __name__ == '__main__': - nrows = None - root_path = '../data/giscup_2021/' - read_idkey = np.load(root_path + 'id_key_to_connected_allday.npy', allow_pickle=True).item() - read_grapheb = np.load(root_path + 'graph_embeddings_retp1_directed.npy', allow_pickle=True).item() - read_grapheb_retp = np.load(root_path + 'graph_embeddings_retp05_directed.npy', allow_pickle=True).item() - for i in read_grapheb: - read_grapheb[i] = list(read_grapheb[i]) + list(read_grapheb_retp[i]) - del read_grapheb_retp - head_columns = ['order_id', 'ata', 'distance', 'simple_eta', 'driver_id','slice_id'] - embedding_k = 256 - fill_list = [0] * embedding_k - #######################################nextlinks ####################################### - nextlinks = pd.read_csv(root_path+'nextlinks.txt', sep=' ', header=None) - nextlinks.columns=['from_id', 'to_id'] - nextlinks['to_id'] = nextlinks['to_id'].astype('str') - nextlinks['to_id'] = nextlinks['to_id'].apply(lambda x: x.split(",")) - nextlinks = pd.DataFrame({'from_id':nextlinks.from_id.repeat(nextlinks.to_id.str.len()), - 'to_id':np.concatenate(nextlinks.to_id.values)}) - from_id_weight = nextlinks['from_id'].value_counts() - from_id_weight = from_id_weight.to_frame() - from_id_weight['index'] = from_id_weight.index - - from_id_weight.columns=['weight', 'from_id'] - nextlinks = pd.merge(nextlinks,from_id_weight, 'left', on=['from_id']) - nextlinks = nextlinks.sort_values(by='weight',ascending=False) - G = nx.DiGraph() - from_id = nextlinks['from_id'].astype(str).to_list() - to_id = nextlinks['to_id'].to_list() - weight = nextlinks['weight'].to_list() - edge_tuple = list(zip(from_id, to_id,weight)) - print('adding') - G.add_weighted_edges_from(edge_tuple) - - dc = nx.algorithms.centrality.degree_centrality(G) - dc = sorted(dc.items(), key=lambda d: d[1],reverse=True) - dc = dc[:50000] - dc = [str(i[0]) for i in dc ] - #######################################link ####################################### - for name in os.listdir(root_path+'train/'): - data_time = name.split('.')[0] - if data_time=='20200803': - continue - train = pd.read_csv(root_path+'train/{}'.format(name),sep= ';;',header=None,nrows=nrows) - print("开始处理", data_time) - train_head = pd.DataFrame(train[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) - train_head['order_id'] = train_head['order_id'].astype(str) - train_head['ata'] = train_head['ata'].astype(float) - train_head['distance'] = train_head['distance'].astype(float) - train_head['simple_eta'] = train_head['simple_eta'].astype(float) - train_head['driver_id'] = train_head['driver_id'].astype(int) - train_head['slice_id'] = train_head['slice_id'].astype(int) - #link preprocess - data_link = train[[1]] - data_link['index'] = train_head.index - data_link['order_id'] = train_head['order_id'] - data_link['ata'] = train_head['ata'] - data_link['distance'] = train_head['distance'] - data_link['simple_eta'] = train_head['simple_eta'] - data_link['slice_id'] = train_head['slice_id'] - - # data_link['slice_id'] = data_link['slice_id'].apply(slice_id_change) - gc.collect() - data_link_split = data_link[1].str.split(' ', expand=True).stack().to_frame() - data_link_split = data_link_split.reset_index(level=1, drop=True).rename(columns={0: 'link_info'}) - # data_link_split = data_link_split.reset_index(drop=True) - data_link_split = data_link[['order_id', 'index', 'ata', 'distance', 'simple_eta', 'slice_id']].join( 
- data_link_split) - data_link_split = data_link_split.reset_index(drop=True) - data_link_split[['link_id', - 'link_time', - 'link_ratio', - 'link_current_status', - 'link_arrival_status']] = data_link_split['link_info'].str.split(':|,', 5, expand=True) - data_link_split = data_link_split.drop(['link_info'], axis=1) - data_link_split['link_ratio'] = data_link_split['link_ratio'].astype(float) - data_link_split['link_time'] = data_link_split['link_time'].astype(float) - data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(int) - print('preprocess finish!') - print('start feature engineering') - feature = train_head[['order_id', 'distance']] - ###################static fea############################################# - #######################order link id count############################### - df = data_link_split.groupby('order_id', as_index=False) - tmp_linkid_agg = df['link_id'].agg({'zsl_order_link_id_count': 'count'}) - tmp_linkid_agg['zsl_order_link_id_count_bins'] = 0 - tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count']>=75)&(tmp_linkid_agg['zsl_order_link_id_count']<100),'zsl_order_link_id_count_bins']=1 - tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count']>=100)&(tmp_linkid_agg['zsl_order_link_id_count']<120),'zsl_order_link_id_count_bins']=2 - tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count']>=120),'zsl_order_link_id_count_bins']=3 - feature = feature.merge(tmp_linkid_agg,on='order_id',how='left') - print('order link id count finish!') - #######################order link id & distance############################### - feature['zsl_order_is_highspeed'] = 0 - feature.loc[(feature['distance']>90000)&(feature['zsl_order_link_id_count']<300),'zsl_order_is_highspeed'] = 1 - print('order link id & distance finish!') - #######################order link id & nextlinks centry############################### - tmp = data_link_split[data_link_split['link_id'].isin(dc)] - tmp = tmp.groupby('order_id', as_index=False) - tmp_linkid_centry_count = tmp['link_id'].agg({'zsl_order_link_id_centry_count': 'count'}) - feature = feature.merge(tmp_linkid_centry_count,on='order_id',how='left') - feature['zsl_order_link_id_centry_count'] = feature['zsl_order_link_id_centry_count'].fillna(0) - print('order link id & nextlinks centry finish!') - #######################order link time sum mean max min var std############################### - tmp_linktime_agg = df['link_time'].agg({'zsl_order_link_time_sum': 'sum','zsl_order_link_time_mean': 'mean', - 'zsl_order_link_time_max': 'max','zsl_order_link_time_min': 'min', - 'zsl_order_link_time_var': 'var','zsl_order_link_time_skew': 'skew'}) - feature = feature.merge(tmp_linktime_agg,on='order_id',how='left') - print('order link time sum mean max min var std finish!') - #######################order link current status mean nunique############################### - tmp_linktime_agg = df['link_current_status'].agg({'zsl_link_current_status_mean': 'mean', 'zsl_link_current_status_nunique': 'nunique'}) - feature = feature.merge(tmp_linktime_agg, on='order_id', how='left') - print('order link current status mean nunique finish!') - #######################order link current status count vector############################### - data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(str) - data_link_split.loc[data_link_split['link_current_status'].astype(int)<0,'link_current_status'] = '0' - 
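# The two clamps around this point squeeze link_current_status into {0,1,2,3} so that the
# per-order CountVectorizer below can be assigned exactly four count columns. A small sketch of
# that counting step on toy status strings (the token_pattern keeps single digits as tokens):
from sklearn.feature_extraction.text import CountVectorizer

toy = ["0,0,1,2", "3,3,0"]                      # per-order statuses joined with ','
cv = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
mat = cv.fit_transform(toy).toarray()           # columns follow the sorted tokens '0','1','2','3'
# mat == [[2, 1, 1, 0],
#         [1, 0, 0, 2]]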
data_link_split.loc[data_link_split['link_current_status'].astype(int)>3,'link_current_status'] = '3' - data = data_link_split.groupby('order_id')['link_current_status'].apply(lambda x: x.str.cat(sep=',')).reset_index() - cv_encode = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b') - train_x = cv_encode.fit_transform(data['link_current_status']) - train_x = train_x.toarray() - link_current_status = pd.DataFrame(train_x, columns=['zsl_link_current_status0', 'zsl_link_current_status1', 'zsl_link_current_status2', - 'zsl_link_current_status3']) - data = pd.concat([data[['order_id']],link_current_status],axis=1) - feature = feature.merge(data, on='order_id', how='left') - print('order link current status count vector finish!') - #######################order distance/link_id_count############################### - feature['zsl_distance_div_link_id_count'] = feature['distance']*10/feature['zsl_order_link_id_count'] - feature = feature.drop('distance', axis=1) - print('order distance div link_id_count finish!') - #######################order link ratio sum mean max min var std############################### - tmp_linkratio_agg = df['link_ratio'].agg({'zsl_order_link_ratio_sum': 'sum', 'zsl_order_link_ratio_mean': 'mean', - 'zsl_order_link_ratio_min': 'min', - 'zsl_order_link_ratio_var': 'var', 'zsl_order_link_ratio_skew': 'skew'}) - feature = feature.merge(tmp_linkratio_agg, on='order_id', how='left') - print('order link ratio sum mean max min var std finish!') - #######################weather################################################################### - weather = pd.read_csv(root_path+'weather.csv') - weather_dict={'rainstorm':0,'heavy rain':1,'moderate rain':2,'cloudy':3, - 'showers':4} - weather['weather'] = weather['weather'].map(weather_dict) - weather['date'] = weather['date'].astype(str) - weather=weather[weather['date']==data_time] - feature['weather'] = weather['weather'].values[0] - feature['hightemp'] = weather['hightemp'].values[0] - feature['lowtemp'] = weather['lowtemp'].values[0] - print('weather finish!') - ###################trend fea############################################# - ###################trend link time##################################### - data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(int) - groupby = data_link_split.groupby(['order_id']) - func = partial(trend_in_last_k_link_id_time, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) - g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) - feature = feature.merge(g, on='order_id', how='left') - func = partial(last_link_time_features, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) - g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) - feature = feature.merge(g, on='order_id', how='left') - func = partial(last_k_link_time_interval, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) - g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) - feature = feature.merge(g, on='order_id', how='left') - print('trend link time finish!') - ####################nextlinks graph embedding####################### - data_link_split['link_id'] = data_link_split['link_id'].astype(int) - data_link_split['link_id'] = data_link_split['link_id'].map(read_idkey) - data_link_split['link_id'] = data_link_split['link_id'].fillna(0) - data_link_split['link_id'] = data_link_split['link_id'].astype(int) - data_link_split['link_id'] = 
data_link_split['link_id'].map(read_grapheb) - data_link_split['link_id'] = data_link_split['link_id'].fillna('0') - def replace_list(x): - if isinstance(x, str): - x = fill_list - return x - data_link_split['link_id'] = data_link_split['link_id'].apply(replace_list) - link_id_col = ['zsl_link_id_eb{}'.format(i) for i in range(embedding_k)] - agg_col = dict(zip(link_id_col, ['mean'] * len(link_id_col))) - link_id_array = np.array(data_link_split.pop('link_id').to_list()) - link_id_array = pd.DataFrame(link_id_array, columns=agg_col, dtype=np.float16) - data_link_split = pd.concat([data_link_split, link_id_array], axis=1) - tmp = data_link_split.groupby('order_id', as_index=False) - tmp_linkid_agg = tmp.agg(agg_col) - feature = feature.merge(tmp_linkid_agg, on='order_id', how='left') - - feature.to_csv(root_path + 'feature/train/link_fea_order_id_level_{}.csv'.format(data_time), index=False) - del train - gc.collect() - - test = pd.read_csv(root_path+'20200901_test.txt',sep= ';;',header=None,nrows=nrows) - test_head = pd.DataFrame(test[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) - test_head['order_id'] = test_head['order_id'].astype(str) - test_head['ata'] = test_head['ata'].astype(float) - test_head['distance'] = test_head['distance'].astype(float) - test_head['simple_eta'] = test_head['simple_eta'].astype(float) - test_head['driver_id'] = test_head['driver_id'].astype(int) - test_head['slice_id'] = test_head['slice_id'].astype(int) - - # link preprocess - data_link = test[[1]] - data_link['index'] = test_head.index - data_link['order_id'] = test_head['order_id'] - data_link['ata'] = test_head['ata'] - data_link['distance'] = test_head['distance'] - data_link['simple_eta'] = test_head['simple_eta'] - data_link['slice_id'] = test_head['slice_id'] - - # data_link['slice_id'] = data_link['slice_id'].apply(slice_id_change) - gc.collect() - data_link_split = data_link[1].str.split(' ', expand=True).stack().to_frame() - data_link_split = data_link_split.reset_index(level=1, drop=True).rename(columns={0: 'link_info'}) - # data_link_split = data_link_split.reset_index(drop=True) - data_link_split = data_link[['order_id', 'index', 'ata', 'distance', 'simple_eta', 'slice_id']].join( - data_link_split) - data_link_split = data_link_split.reset_index(drop=True) - data_link_split[['link_id', - 'link_time', - 'link_ratio', - 'link_current_status', - 'link_arrival_status']] = data_link_split['link_info'].str.split(':|,', 5, expand=True) - data_link_split = data_link_split.drop(['link_info'], axis=1) - data_link_split['link_ratio'] = data_link_split['link_ratio'].astype(float) - data_link_split['link_time'] = data_link_split['link_time'].astype(float) - data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(int) - print('preprocess finish!') - print('start feature engineering') - feature = test_head[['order_id', 'distance']] - ###################static fea############################################# - #######################order link id count############################### - df = data_link_split.groupby('order_id', as_index=False) - tmp_linkid_agg = df['link_id'].agg({'zsl_order_link_id_count': 'count'}) - tmp_linkid_agg['zsl_order_link_id_count_bins'] = 0 - tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count'] >= 75) & ( - tmp_linkid_agg['zsl_order_link_id_count'] < 100), 'zsl_order_link_id_count_bins'] = 1 - tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count'] >= 100) & ( - 
tmp_linkid_agg['zsl_order_link_id_count'] < 120), 'zsl_order_link_id_count_bins'] = 2 - tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count'] >= 120), 'zsl_order_link_id_count_bins'] = 3 - feature = feature.merge(tmp_linkid_agg, on='order_id', how='left') - print('order link id count finish!') - #######################order link id & distance############################### - feature['zsl_order_is_highspeed'] = 0 - feature.loc[ - (feature['distance'] > 90000) & (feature['zsl_order_link_id_count'] < 300), 'zsl_order_is_highspeed'] = 1 - print('order link id & distance finish!') - #######################order link id & nextlinks centry############################### - tmp = data_link_split[data_link_split['link_id'].isin(dc)] - tmp = tmp.groupby('order_id', as_index=False) - tmp_linkid_centry_count = tmp['link_id'].agg({'zsl_order_link_id_centry_count': 'count'}) - feature = feature.merge(tmp_linkid_centry_count, on='order_id', how='left') - feature['zsl_order_link_id_centry_count'] = feature['zsl_order_link_id_centry_count'].fillna(0) - print('order link id & nextlinks centry finish!') - #######################order link time sum mean max min var std############################### - tmp_linktime_agg = df['link_time'].agg({'zsl_order_link_time_sum': 'sum', 'zsl_order_link_time_mean': 'mean', - 'zsl_order_link_time_max': 'max', 'zsl_order_link_time_min': 'min', - 'zsl_order_link_time_var': 'var', 'zsl_order_link_time_skew': 'skew'}) - feature = feature.merge(tmp_linktime_agg, on='order_id', how='left') - print('order link time sum mean max min var std finish!') - #######################order link current status mean nunique############################### - tmp_linktime_agg = df['link_current_status'].agg( - {'zsl_link_current_status_mean': 'mean', 'zsl_link_current_status_nunique': 'nunique'}) - feature = feature.merge(tmp_linktime_agg, on='order_id', how='left') - print('order link current status mean nunique finish!') - #######################order link current status count vector############################### - data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(str) - data_link_split.loc[data_link_split['link_current_status'].astype(int) < 0, 'link_current_status'] = '0' - data_link_split.loc[data_link_split['link_current_status'].astype(int) > 3, 'link_current_status'] = '3' - data = data_link_split.groupby('order_id')['link_current_status'].apply(lambda x: x.str.cat(sep=',')).reset_index() - cv_encode = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b') - test_x = cv_encode.fit_transform(data['link_current_status']) - test_x = test_x.toarray() - link_current_status = pd.DataFrame(test_x, columns=['zsl_link_current_status0', 'zsl_link_current_status1', - 'zsl_link_current_status2', - 'zsl_link_current_status3']) - data = pd.concat([data[['order_id']], link_current_status], axis=1) - feature = feature.merge(data, on='order_id', how='left') - print('order link current status count vector finish!') - #######################order distance/link_id_count############################### - feature['zsl_distance_div_link_id_count'] = feature['distance'] * 10 / feature['zsl_order_link_id_count'] - feature = feature.drop('distance', axis=1) - print('order distance div link_id_count finish!') - #######################order link ratio sum mean max min var std############################### - tmp_linkratio_agg = df['link_ratio'].agg({'zsl_order_link_ratio_sum': 'sum', 'zsl_order_link_ratio_mean': 'mean', - 'zsl_order_link_ratio_min': 'min', - 
'zsl_order_link_ratio_var': 'var', 'zsl_order_link_ratio_skew': 'skew'}) - feature = feature.merge(tmp_linkratio_agg, on='order_id', how='left') - print('order link ratio sum mean max min var std finish!') - #######################weather################################################################### - weather = pd.read_csv(root_path + 'weather.csv') - weather_dict = {'rainstorm': 0, 'heavy rain': 1, 'moderate rain': 2, 'cloudy': 3, - 'showers': 4} - weather['weather'] = weather['weather'].map(weather_dict) - weather['date'] = weather['date'].astype(str) - weather = weather[weather['date'] == data_time] - feature['weather'] = weather['weather'].values[0] - feature['hightemp'] = weather['hightemp'].values[0] - feature['lowtemp'] = weather['lowtemp'].values[0] - print('weather finish!') - ###################trend fea############################################# - ###################trend link time##################################### - data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(int) - groupby = data_link_split.groupby(['order_id']) - func = partial(trend_in_last_k_link_id_time, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) - g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) - feature = feature.merge(g, on='order_id', how='left') - func = partial(last_link_time_features, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) - g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) - feature = feature.merge(g, on='order_id', how='left') - func = partial(last_k_link_time_interval, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) - g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) - feature = feature.merge(g, on='order_id', how='left') - print('trend link time finish!') - ####################nextlinks graph embedding####################### - data_link_split['link_id'] = data_link_split['link_id'].astype(int) - data_link_split['link_id'] = data_link_split['link_id'].map(read_idkey) - data_link_split['link_id'] = data_link_split['link_id'].fillna(0) - data_link_split['link_id'] = data_link_split['link_id'].astype(int) - data_link_split['link_id'] = data_link_split['link_id'].map(read_grapheb) - data_link_split['link_id'] = data_link_split['link_id'].fillna('0') - def replace_list(x): - if isinstance(x, str): - x = fill_list - return x - data_link_split['link_id'] = data_link_split['link_id'].apply(replace_list) - link_id_col = ['zsl_link_id_eb{}'.format(i) for i in range(embedding_k)] - agg_col = dict(zip(link_id_col, ['mean'] * len(link_id_col))) - link_id_array = np.array(data_link_split.pop('link_id').to_list()) - link_id_array = pd.DataFrame(link_id_array, columns=agg_col, dtype=np.float16) - data_link_split = pd.concat([data_link_split, link_id_array], axis=1) - tmp = data_link_split.groupby('order_id', as_index=False) - tmp_linkid_agg = tmp.agg(agg_col) - feature = feature.merge(tmp_linkid_agg, on='order_id', how='left') - feature.to_csv(root_path+'feature/test/link_fea_order_id_level_20200901.csv',index=False) +#coding=utf-8 +""" +Author: Aigege +Code: https://github.com/AiIsBetter +""" +# date 2021.08.01 +import pandas as pd +import numpy as np + +from sklearn.linear_model import LinearRegression +from sklearn.feature_extraction.text import CountVectorizer +import networkx as nx +import os +import gc +import warnings +from utils import parallel_apply_fea,add_features_in_group +from 
functools import partial +warnings.filterwarnings("ignore") + +def last_k_link_time_interval(gr, periods): + gr_ = gr.copy() + gr_ = gr_.iloc[::-1] + gr_['t_i_v'] = gr_['link_time'].diff() + gr_['t_i_v'] = gr_['t_i_v'] + gr_['t_i_v'] = gr_['t_i_v'].fillna(0) + + gr_['c_s_v'] = gr_['link_current_status'].diff() + gr_['c_s_v'] = gr_['c_s_v'] + gr_['c_s_v'] = gr_['c_s_v'].fillna(0) + + gr_ = gr_.drop_duplicates().reset_index(drop = True) + + # link time变化 + features = {} + for period in periods: + if period > 10e5: + period_name = 'zsl_link_time_interval_all' + gr_period = gr_.copy() + else: + period_name = 'zsl_link_time_interval_last_{}_'.format(period) + gr_period = gr_.iloc[:period] + features = add_features_in_group(features, gr_period, 't_i_v', + ['mean','max', 'min', 'std','skew','sum'], + # ['diff'], + period_name) + # current status变化 + for period in periods: + if period > 10e5: + period_name = 'zsl_link_current_status_interval_all' + gr_period = gr_.copy() + else: + period_name = 'zsl_link_current_status_interval_last_{}_'.format(period) + gr_period = gr_.iloc[:period] + features = add_features_in_group(features, gr_period, 'c_s_v', + ['mean', 'std', 'skew'], + # ['diff'], + period_name) + return features + +# last k link id time trend +def last_link_time_features(gr,periods): + gr_ = gr.copy() + gr_ = gr_.iloc[::-1] + features = {} + for period in periods: + if period > 10e5: + period_name = 'zsl_all_' + gr_period = gr_.copy() + else: + period_name = 'zsl_last_{}_'.format(period) + gr_period = gr_.iloc[:period] + features = add_features_in_group(features, gr_period, 'link_time', + ['max', 'sum', 'mean','min','skew','std'], + period_name) + features = add_features_in_group(features, gr_period, 'link_current_status', + ['mean', 'nunique'], + period_name) + return features +# last k link id time trend +def trend_in_last_k_link_id_time(gr, periods): + gr_ = gr.copy() + gr_ = gr_.iloc[::-1] + features = {} + for period in periods: + gr_period = gr_.iloc[:period] + features = add_trend_feature(features, gr_period, + 'link_time', 'zsl_{}_period_trend_'.format(period) + ) + + return features +# trend feature +def add_trend_feature(features, gr, feature_name, prefix): + y = gr[feature_name].values + try: + x = np.arange(0, len(y)).reshape(-1, 1) + lr = LinearRegression() + lr.fit(x, y) + trend = lr.coef_[0] + except: + trend = np.nan + features['{}{}'.format(prefix, feature_name)] = trend + return features + +def slice_id_change(x): + hour = x * 5 / 60 + hour = np.floor(hour) + hour += 8 + if hour >= 24: + hour = hour - 24 + return hour +if __name__ == '__main__': + nrows = None + root_path = '../data/giscup_2021/' + read_idkey = np.load(root_path + 'id_key_to_connected_allday.npy', allow_pickle=True).item() + read_grapheb = np.load(root_path + 'graph_embeddings_retp1_directed.npy', allow_pickle=True).item() + read_grapheb_retp = np.load(root_path + 'graph_embeddings_retp05_directed.npy', allow_pickle=True).item() + for i in read_grapheb: + read_grapheb[i] = list(read_grapheb[i]) + list(read_grapheb_retp[i]) + del read_grapheb_retp + head_columns = ['order_id', 'ata', 'distance', 'simple_eta', 'driver_id','slice_id'] + embedding_k = 256 + fill_list = [0] * embedding_k + #######################################nextlinks ####################################### + nextlinks = pd.read_csv(root_path+'nextlinks.txt', sep=' ', header=None) + nextlinks.columns=['from_id', 'to_id'] + nextlinks['to_id'] = nextlinks['to_id'].astype('str') + nextlinks['to_id'] = nextlinks['to_id'].apply(lambda x: 
x.split(",")) + nextlinks = pd.DataFrame({'from_id':nextlinks.from_id.repeat(nextlinks.to_id.str.len()), + 'to_id':np.concatenate(nextlinks.to_id.values)}) + from_id_weight = nextlinks['from_id'].value_counts() + from_id_weight = from_id_weight.to_frame() + from_id_weight['index'] = from_id_weight.index + + from_id_weight.columns=['weight', 'from_id'] + nextlinks = pd.merge(nextlinks,from_id_weight, 'left', on=['from_id']) + nextlinks = nextlinks.sort_values(by='weight',ascending=False) + G = nx.DiGraph() + from_id = nextlinks['from_id'].astype(str).to_list() + to_id = nextlinks['to_id'].to_list() + weight = nextlinks['weight'].to_list() + edge_tuple = list(zip(from_id, to_id,weight)) + print('adding') + G.add_weighted_edges_from(edge_tuple) + + dc = nx.algorithms.centrality.degree_centrality(G) + dc = sorted(dc.items(), key=lambda d: d[1],reverse=True) + dc = dc[:50000] + dc = [str(i[0]) for i in dc ] + #######################################link ####################################### + for name in os.listdir(root_path+'train/'): + data_time = name.split('.')[0] + if data_time=='20200803': + continue + train = pd.read_csv(root_path+'train/{}'.format(name),sep= ';;',header=None,nrows=nrows) + print("开始处理", data_time) + train_head = pd.DataFrame(train[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) + train_head['order_id'] = train_head['order_id'].astype(str) + train_head['ata'] = train_head['ata'].astype(float) + train_head['distance'] = train_head['distance'].astype(float) + train_head['simple_eta'] = train_head['simple_eta'].astype(float) + train_head['driver_id'] = train_head['driver_id'].astype(int) + train_head['slice_id'] = train_head['slice_id'].astype(int) + #link preprocess + data_link = train[[1]] + data_link['index'] = train_head.index + data_link['order_id'] = train_head['order_id'] + data_link['ata'] = train_head['ata'] + data_link['distance'] = train_head['distance'] + data_link['simple_eta'] = train_head['simple_eta'] + data_link['slice_id'] = train_head['slice_id'] + + # data_link['slice_id'] = data_link['slice_id'].apply(slice_id_change) + gc.collect() + data_link_split = data_link[1].str.split(' ', expand=True).stack().to_frame() + data_link_split = data_link_split.reset_index(level=1, drop=True).rename(columns={0: 'link_info'}) + # data_link_split = data_link_split.reset_index(drop=True) + data_link_split = data_link[['order_id', 'index', 'ata', 'distance', 'simple_eta', 'slice_id']].join( + data_link_split) + data_link_split = data_link_split.reset_index(drop=True) + data_link_split[['link_id', + 'link_time', + 'link_ratio', + 'link_current_status', + 'link_arrival_status']] = data_link_split['link_info'].str.split(':|,', 5, expand=True) + data_link_split = data_link_split.drop(['link_info'], axis=1) + data_link_split['link_ratio'] = data_link_split['link_ratio'].astype(float) + data_link_split['link_time'] = data_link_split['link_time'].astype(float) + data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(int) + print('preprocess finish!') + print('start feature engineering') + feature = train_head[['order_id', 'distance']] + ###################static fea############################################# + #######################order link id count############################### + df = data_link_split.groupby('order_id', as_index=False) + tmp_linkid_agg = df['link_id'].agg({'zsl_order_link_id_count': 'count'}) + tmp_linkid_agg['zsl_order_link_id_count_bins'] = 0 + 
tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count']>=75)&(tmp_linkid_agg['zsl_order_link_id_count']<100),'zsl_order_link_id_count_bins']=1 + tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count']>=100)&(tmp_linkid_agg['zsl_order_link_id_count']<120),'zsl_order_link_id_count_bins']=2 + tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count']>=120),'zsl_order_link_id_count_bins']=3 + feature = feature.merge(tmp_linkid_agg,on='order_id',how='left') + print('order link id count finish!') + #######################order link id & distance############################### + feature['zsl_order_is_highspeed'] = 0 + feature.loc[(feature['distance']>90000)&(feature['zsl_order_link_id_count']<300),'zsl_order_is_highspeed'] = 1 + print('order link id & distance finish!') + #######################order link id & nextlinks centry############################### + tmp = data_link_split[data_link_split['link_id'].isin(dc)] + tmp = tmp.groupby('order_id', as_index=False) + tmp_linkid_centry_count = tmp['link_id'].agg({'zsl_order_link_id_centry_count': 'count'}) + feature = feature.merge(tmp_linkid_centry_count,on='order_id',how='left') + feature['zsl_order_link_id_centry_count'] = feature['zsl_order_link_id_centry_count'].fillna(0) + print('order link id & nextlinks centry finish!') + #######################order link time sum mean max min var std############################### + tmp_linktime_agg = df['link_time'].agg({'zsl_order_link_time_sum': 'sum','zsl_order_link_time_mean': 'mean', + 'zsl_order_link_time_max': 'max','zsl_order_link_time_min': 'min', + 'zsl_order_link_time_var': 'var','zsl_order_link_time_skew': 'skew'}) + feature = feature.merge(tmp_linktime_agg,on='order_id',how='left') + print('order link time sum mean max min var std finish!') + #######################order link current status mean nunique############################### + tmp_linktime_agg = df['link_current_status'].agg({'zsl_link_current_status_mean': 'mean', 'zsl_link_current_status_nunique': 'nunique'}) + feature = feature.merge(tmp_linktime_agg, on='order_id', how='left') + print('order link current status mean nunique finish!') + #######################order link current status count vector############################### + data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(str) + data_link_split.loc[data_link_split['link_current_status'].astype(int)<0,'link_current_status'] = '0' + data_link_split.loc[data_link_split['link_current_status'].astype(int)>3,'link_current_status'] = '3' + data = data_link_split.groupby('order_id')['link_current_status'].apply(lambda x: x.str.cat(sep=',')).reset_index() + cv_encode = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b') + train_x = cv_encode.fit_transform(data['link_current_status']) + train_x = train_x.toarray() + link_current_status = pd.DataFrame(train_x, columns=['zsl_link_current_status0', 'zsl_link_current_status1', 'zsl_link_current_status2', + 'zsl_link_current_status3']) + data = pd.concat([data[['order_id']],link_current_status],axis=1) + feature = feature.merge(data, on='order_id', how='left') + print('order link current status count vector finish!') + #######################order distance/link_id_count############################### + feature['zsl_distance_div_link_id_count'] = feature['distance']*10/feature['zsl_order_link_id_count'] + feature = feature.drop('distance', axis=1) + print('order distance div link_id_count finish!') + #######################order link ratio sum mean max min var 
std############################### + tmp_linkratio_agg = df['link_ratio'].agg({'zsl_order_link_ratio_sum': 'sum', 'zsl_order_link_ratio_mean': 'mean', + 'zsl_order_link_ratio_min': 'min', + 'zsl_order_link_ratio_var': 'var', 'zsl_order_link_ratio_skew': 'skew'}) + feature = feature.merge(tmp_linkratio_agg, on='order_id', how='left') + print('order link ratio sum mean max min var std finish!') + #######################weather################################################################### + weather = pd.read_csv(root_path+'weather.csv') + weather_dict={'rainstorm':0,'heavy rain':1,'moderate rain':2,'cloudy':3, + 'showers':4} + weather['weather'] = weather['weather'].map(weather_dict) + weather['date'] = weather['date'].astype(str) + weather=weather[weather['date']==data_time] + feature['weather'] = weather['weather'].values[0] + feature['hightemp'] = weather['hightemp'].values[0] + feature['lowtemp'] = weather['lowtemp'].values[0] + print('weather finish!') + ###################trend fea############################################# + ###################trend link time##################################### + data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(int) + groupby = data_link_split.groupby(['order_id']) + func = partial(trend_in_last_k_link_id_time, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) + g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) + feature = feature.merge(g, on='order_id', how='left') + func = partial(last_link_time_features, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) + g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) + feature = feature.merge(g, on='order_id', how='left') + func = partial(last_k_link_time_interval, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) + g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) + feature = feature.merge(g, on='order_id', how='left') + print('trend link time finish!') + ####################nextlinks graph embedding####################### + data_link_split['link_id'] = data_link_split['link_id'].astype(int) + data_link_split['link_id'] = data_link_split['link_id'].map(read_idkey) + data_link_split['link_id'] = data_link_split['link_id'].fillna(0) + data_link_split['link_id'] = data_link_split['link_id'].astype(int) + data_link_split['link_id'] = data_link_split['link_id'].map(read_grapheb) + data_link_split['link_id'] = data_link_split['link_id'].fillna('0') + def replace_list(x): + if isinstance(x, str): + x = fill_list + return x + data_link_split['link_id'] = data_link_split['link_id'].apply(replace_list) + link_id_col = ['zsl_link_id_eb{}'.format(i) for i in range(embedding_k)] + agg_col = dict(zip(link_id_col, ['mean'] * len(link_id_col))) + link_id_array = np.array(data_link_split.pop('link_id').to_list()) + link_id_array = pd.DataFrame(link_id_array, columns=agg_col, dtype=np.float16) + data_link_split = pd.concat([data_link_split, link_id_array], axis=1) + tmp = data_link_split.groupby('order_id', as_index=False) + tmp_linkid_agg = tmp.agg(agg_col) + feature = feature.merge(tmp_linkid_agg, on='order_id', how='left') + + feature.to_csv(root_path + 'feature/train/link_fea_order_id_level_{}.csv'.format(data_time), index=False) + del train + gc.collect() + + test = pd.read_csv(root_path+'20200901_test.txt',sep= ';;',header=None,nrows=nrows) + test_head = pd.DataFrame(test[0].str.split(' ').tolist(),columns = 
['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) + test_head['order_id'] = test_head['order_id'].astype(str) + test_head['ata'] = test_head['ata'].astype(float) + test_head['distance'] = test_head['distance'].astype(float) + test_head['simple_eta'] = test_head['simple_eta'].astype(float) + test_head['driver_id'] = test_head['driver_id'].astype(int) + test_head['slice_id'] = test_head['slice_id'].astype(int) + + # link preprocess + data_link = test[[1]] + data_link['index'] = test_head.index + data_link['order_id'] = test_head['order_id'] + data_link['ata'] = test_head['ata'] + data_link['distance'] = test_head['distance'] + data_link['simple_eta'] = test_head['simple_eta'] + data_link['slice_id'] = test_head['slice_id'] + + # data_link['slice_id'] = data_link['slice_id'].apply(slice_id_change) + gc.collect() + data_link_split = data_link[1].str.split(' ', expand=True).stack().to_frame() + data_link_split = data_link_split.reset_index(level=1, drop=True).rename(columns={0: 'link_info'}) + # data_link_split = data_link_split.reset_index(drop=True) + data_link_split = data_link[['order_id', 'index', 'ata', 'distance', 'simple_eta', 'slice_id']].join( + data_link_split) + data_link_split = data_link_split.reset_index(drop=True) + data_link_split[['link_id', + 'link_time', + 'link_ratio', + 'link_current_status', + 'link_arrival_status']] = data_link_split['link_info'].str.split(':|,', 5, expand=True) + data_link_split = data_link_split.drop(['link_info'], axis=1) + data_link_split['link_ratio'] = data_link_split['link_ratio'].astype(float) + data_link_split['link_time'] = data_link_split['link_time'].astype(float) + data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(int) + print('preprocess finish!') + print('start feature engineering') + feature = test_head[['order_id', 'distance']] + ###################static fea############################################# + #######################order link id count############################### + df = data_link_split.groupby('order_id', as_index=False) + tmp_linkid_agg = df['link_id'].agg({'zsl_order_link_id_count': 'count'}) + tmp_linkid_agg['zsl_order_link_id_count_bins'] = 0 + tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count'] >= 75) & ( + tmp_linkid_agg['zsl_order_link_id_count'] < 100), 'zsl_order_link_id_count_bins'] = 1 + tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count'] >= 100) & ( + tmp_linkid_agg['zsl_order_link_id_count'] < 120), 'zsl_order_link_id_count_bins'] = 2 + tmp_linkid_agg.loc[(tmp_linkid_agg['zsl_order_link_id_count'] >= 120), 'zsl_order_link_id_count_bins'] = 3 + feature = feature.merge(tmp_linkid_agg, on='order_id', how='left') + print('order link id count finish!') + #######################order link id & distance############################### + feature['zsl_order_is_highspeed'] = 0 + feature.loc[ + (feature['distance'] > 90000) & (feature['zsl_order_link_id_count'] < 300), 'zsl_order_is_highspeed'] = 1 + print('order link id & distance finish!') + #######################order link id & nextlinks centry############################### + tmp = data_link_split[data_link_split['link_id'].isin(dc)] + tmp = tmp.groupby('order_id', as_index=False) + tmp_linkid_centry_count = tmp['link_id'].agg({'zsl_order_link_id_centry_count': 'count'}) + feature = feature.merge(tmp_linkid_centry_count, on='order_id', how='left') + feature['zsl_order_link_id_centry_count'] = feature['zsl_order_link_id_centry_count'].fillna(0) + print('order link id & nextlinks centry 
finish!') + #######################order link time sum mean max min var std############################### + tmp_linktime_agg = df['link_time'].agg({'zsl_order_link_time_sum': 'sum', 'zsl_order_link_time_mean': 'mean', + 'zsl_order_link_time_max': 'max', 'zsl_order_link_time_min': 'min', + 'zsl_order_link_time_var': 'var', 'zsl_order_link_time_skew': 'skew'}) + feature = feature.merge(tmp_linktime_agg, on='order_id', how='left') + print('order link time sum mean max min var std finish!') + #######################order link current status mean nunique############################### + tmp_linktime_agg = df['link_current_status'].agg( + {'zsl_link_current_status_mean': 'mean', 'zsl_link_current_status_nunique': 'nunique'}) + feature = feature.merge(tmp_linktime_agg, on='order_id', how='left') + print('order link current status mean nunique finish!') + #######################order link current status count vector############################### + data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(str) + data_link_split.loc[data_link_split['link_current_status'].astype(int) < 0, 'link_current_status'] = '0' + data_link_split.loc[data_link_split['link_current_status'].astype(int) > 3, 'link_current_status'] = '3' + data = data_link_split.groupby('order_id')['link_current_status'].apply(lambda x: x.str.cat(sep=',')).reset_index() + cv_encode = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b') + test_x = cv_encode.fit_transform(data['link_current_status']) + test_x = test_x.toarray() + link_current_status = pd.DataFrame(test_x, columns=['zsl_link_current_status0', 'zsl_link_current_status1', + 'zsl_link_current_status2', + 'zsl_link_current_status3']) + data = pd.concat([data[['order_id']], link_current_status], axis=1) + feature = feature.merge(data, on='order_id', how='left') + print('order link current status count vector finish!') + #######################order distance/link_id_count############################### + feature['zsl_distance_div_link_id_count'] = feature['distance'] * 10 / feature['zsl_order_link_id_count'] + feature = feature.drop('distance', axis=1) + print('order distance div link_id_count finish!') + #######################order link ratio sum mean max min var std############################### + tmp_linkratio_agg = df['link_ratio'].agg({'zsl_order_link_ratio_sum': 'sum', 'zsl_order_link_ratio_mean': 'mean', + 'zsl_order_link_ratio_min': 'min', + 'zsl_order_link_ratio_var': 'var', 'zsl_order_link_ratio_skew': 'skew'}) + feature = feature.merge(tmp_linkratio_agg, on='order_id', how='left') + print('order link ratio sum mean max min var std finish!') + #######################weather################################################################### + weather = pd.read_csv(root_path + 'weather.csv') + weather_dict = {'rainstorm': 0, 'heavy rain': 1, 'moderate rain': 2, 'cloudy': 3, + 'showers': 4} + weather['weather'] = weather['weather'].map(weather_dict) + weather['date'] = weather['date'].astype(str) + weather = weather[weather['date'] == data_time] + feature['weather'] = weather['weather'].values[0] + feature['hightemp'] = weather['hightemp'].values[0] + feature['lowtemp'] = weather['lowtemp'].values[0] + print('weather finish!') + ###################trend fea############################################# + ###################trend link time##################################### + data_link_split['link_current_status'] = data_link_split['link_current_status'].astype(int) + groupby = data_link_split.groupby(['order_id']) + func = 
partial(trend_in_last_k_link_id_time, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) + g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) + feature = feature.merge(g, on='order_id', how='left') + func = partial(last_link_time_features, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) + g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) + feature = feature.merge(g, on='order_id', how='left') + func = partial(last_k_link_time_interval, periods=[2, 5, 7, 10, 15, 20, 30, 50, 80, 100, 100000000]) + g = parallel_apply_fea(groupby, func, index_name='order_id', num_workers=20, chunk_size=10000) + feature = feature.merge(g, on='order_id', how='left') + print('trend link time finish!') + ####################nextlinks graph embedding####################### + data_link_split['link_id'] = data_link_split['link_id'].astype(int) + data_link_split['link_id'] = data_link_split['link_id'].map(read_idkey) + data_link_split['link_id'] = data_link_split['link_id'].fillna(0) + data_link_split['link_id'] = data_link_split['link_id'].astype(int) + data_link_split['link_id'] = data_link_split['link_id'].map(read_grapheb) + data_link_split['link_id'] = data_link_split['link_id'].fillna('0') + def replace_list(x): + if isinstance(x, str): + x = fill_list + return x + data_link_split['link_id'] = data_link_split['link_id'].apply(replace_list) + link_id_col = ['zsl_link_id_eb{}'.format(i) for i in range(embedding_k)] + agg_col = dict(zip(link_id_col, ['mean'] * len(link_id_col))) + link_id_array = np.array(data_link_split.pop('link_id').to_list()) + link_id_array = pd.DataFrame(link_id_array, columns=agg_col, dtype=np.float16) + data_link_split = pd.concat([data_link_split, link_id_array], axis=1) + tmp = data_link_split.groupby('order_id', as_index=False) + tmp_linkid_agg = tmp.agg(agg_col) + feature = feature.merge(tmp_linkid_agg, on='order_id', how='left') + feature.to_csv(root_path+'feature/test/link_fea_order_id_level_20200901.csv',index=False) diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/4_single_model.py b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/4_single_model.py similarity index 97% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/4_single_model.py rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/4_single_model.py index ea9cd74..ec37c37 100644 --- a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/4_single_model.py +++ b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/4_single_model.py @@ -1,207 +1,207 @@ -#coding=utf-8 -""" -Author: Aigege -Code: https://github.com/AiIsBetter -""" -# date 2021.08.01 -import pandas as pd -import numpy as np -import seaborn as sns -import matplotlib.pyplot as plt -from sklearn.model_selection import KFold -import lightgbm as lgb -from utils import reduce_mem_usage,reduce_mem_usage_parallel -import os -import gc -import warnings -import time -warnings.filterwarnings("ignore") -def slice_id_change(x): - hour = x * 5 / 60 - hour = np.floor(hour) - hour += 8 - if hour >= 24: - hour = hour - 24 - return hour -# 评估指标 -def MAPE(true, pred): - diff = np.abs(np.array(pred) - np.array(true)) - return np.mean(diff / true) -# 自定义lgb评估指标 -def lgb_score_mape(train_data,preds): - labels = train_data - diff = np.abs(np.array(preds) - np.array(labels)) - result = np.mean(diff / labels) - return 'mape',result, False -head_columns = ['order_id', 'ata', 'distance', 'simple_eta', 'driver_id','slice_id'] -result = [] -result_time_weight = [] -result_dis_weight = [] 
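# MAPE above is the competition metric, and lgb_score_mape wraps it into the
# (name, value, is_higher_better) tuple that LightGBM's sklearn eval_metric expects;
# when invoked via eval_metric=lambda y_true, y_pred: [...], its first argument receives
# the true labels despite being named train_data. A tiny worked example (labels must be non-zero):
import numpy as np

y_true = np.array([100.0, 200.0])
y_pred = np.array([110.0, 180.0])
# |110-100|/100 = 0.10, |180-200|/200 = 0.10  ->  MAPE = 0.10
assert abs(np.mean(np.abs(y_pred - y_true) / y_true) - 0.10) < 1e-9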
-count = 0 -df = [] -nrows=None -root_path = '../data/giscup_2021/' -data_list = ['20200818', '20200819', '20200820', '20200821', '20200822', '20200823', '20200824', - '20200825', '20200826', '20200827', '20200828', '20200829', '20200830', '20200831'] -#######################################本地验证####################################### -for name in os.listdir(root_path+'train/'): - data_time = name.split('.')[0] - if data_time not in data_list: - continue - train = pd.read_csv(root_path+'train/{}'.format(name),sep= ';;',header=None,nrows=nrows) - feature_cross = pd.read_csv(root_path+'feature/train/cross_fea_order_id_level_{}.csv'.format(data_time),nrows=nrows) - feature_link = pd.read_csv(root_path+'feature/train/link_fea_order_id_level_{}.csv'.format(data_time),nrows=nrows) - feature_head = pd.read_csv(root_path+'feature/train/head_link_{}.csv'.format(data_time),nrows=nrows) - feature_sqe = pd.read_csv(root_path + 'feature/train/{}.csv'.format(data_time),nrows=nrows) - - - feature_cross['order_id'] = feature_cross['order_id'].astype(str) - feature_link['order_id'] = feature_link['order_id'].astype(str) - feature_head['order_id'] = feature_head['order_id'].astype(str) - feature_sqe['order_id'] = feature_sqe['order_id'].astype(str) - - print("开始处理", data_time) - # train.columns = ['head','link','cross'] - # train['head'] = train['head'].apply(lambda x:x.split(' ')) - train_head = pd.DataFrame(train[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) - train_head['order_id'] = train_head['order_id'].astype(str) - train_head['ata'] = train_head['ata'].astype(float) - train_head['distance'] = train_head['distance'].astype(float) - train_head['simple_eta'] = train_head['simple_eta'].astype(float) - train_head['driver_id'] = train_head['driver_id'].astype(int) - train_head['slice_id'] = train_head['slice_id'].astype(int) - train_head['date_time'] = int(data_time) - - train_head = train_head.merge(feature_cross,on='order_id',how='left') - train_head = train_head.merge(feature_link,on='order_id',how='left') - - feature_head = feature_head.drop(['ata', 'distance', 'simple_eta', 'driver_id', 'slice_id', 'index', - 'date_time', 'link_count', 'link_time_sum', 'link_ratio_sum', - 'date_time_dt', 'weekday', 'hour', 'weather', 'hightemp', 'lowtemp', - 'len_tmp', - 'link_time_mean', 'link_time_std'], - axis=1) - feature_sqe = feature_sqe.drop(['pre_arrival_status', 'arrive_slice_id', 'slice_id'], axis=1) - train_head = train_head.merge(feature_sqe, on='order_id', how='left') - train_head = train_head.merge(feature_head, on='order_id', how='left') - - print('merge finish!') - train_head = reduce_mem_usage_parallel(train_head,28) - df.append(train_head.drop('order_id',axis=1)) - del train - gc.collect() - count +=1 -df = pd.concat(df,axis=0) - -test = pd.read_csv(root_path+'20200901_test.txt',sep= ';;',header=None,nrows=nrows) -test_head = pd.DataFrame(test[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) -test_head['order_id'] = test_head['order_id'].astype(str) -test_head['ata'] = test_head['ata'].astype(float) -test_head['distance'] = test_head['distance'].astype(float) -test_head['simple_eta'] = test_head['simple_eta'].astype(float) -test_head['driver_id'] = test_head['driver_id'].astype(int) -test_head['slice_id'] = test_head['slice_id'].astype(int) - - -feature_cross = pd.read_csv(root_path + 'feature/test/cross_fea_order_id_level_{}.csv'.format('20200901'),nrows=nrows) -feature_link = 
pd.read_csv(root_path + 'feature/test/link_fea_order_id_level_{}.csv'.format('20200901'), nrows=nrows) -feature_head = pd.read_csv(root_path + 'feature/test/head_link_{}.csv'.format('20200901'),nrows=nrows) -feature_sqe = pd.read_csv(root_path + 'feature/test/{}.csv'.format('20200901'),nrows=nrows) - -test_head['date_time'] = 20200901 - -feature_cross['order_id'] = feature_cross['order_id'].astype(str) -feature_link['order_id'] = feature_link['order_id'].astype(str) -feature_head['order_id'] = feature_head['order_id'].astype(str) -feature_sqe['order_id'] = feature_sqe['order_id'].astype(str) - -test_head = test_head.merge(feature_cross, on='order_id', how='left') - -test_head = test_head.merge(feature_link,on='order_id',how='left') - -feature_head = feature_head.drop(['ata', 'distance', 'simple_eta', 'driver_id', 'slice_id', 'index', - 'date_time', 'link_count', 'link_time_sum', 'link_ratio_sum', - 'date_time_dt', 'weekday', 'hour', 'weather', 'hightemp', 'lowtemp', - 'len_tmp', - 'link_time_mean', 'link_time_std'], - axis=1) -feature_sqe = feature_sqe.drop(['pre_arrival_status', 'arrive_slice_id', 'slice_id'], axis=1) -test_head = test_head.merge(feature_sqe, on='order_id', how='left') -test_head = test_head.merge(feature_head, on='order_id', how='left') - -test_head = reduce_mem_usage_parallel(test_head,28) -del feature_cross,feature_link -gc.collect() - -X_train = df.drop('ata',axis=1) -y_train = df['ata'] -X_test = test_head.drop(['order_id','ata'],axis=1) - -folds = 5 -skf = KFold(n_splits=folds, shuffle=True, random_state=2021) -train_mean = np.zeros(shape=[1,folds]) -test_predict = np.zeros(shape=[X_test.shape[0], folds],dtype=float) -k_fold_mape = [] -feature_importance_df = pd.DataFrame() -# Display/plot feature importance -def display_importances(feature_importance_df_): - feature_importance_df_.to_csv('feature_importances.csv',index=False) - cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:100].index - best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)] - best_features = best_features.groupby('feature',as_index = False)['importance'].mean() - best_features = best_features.sort_values(by = 'importance',ascending=False) - plt.figure(figsize=(8, 10)) - sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False)) - plt.title('LightGBM Features (avg over folds)') - plt.tight_layout() - plt.savefig('feature_importances.jpg') - # plt.show() - -scores = 0 -threshold = 0 -print('start training......') -print('训练集维度:',X_train.shape) -print('测试集维度:',X_test.shape) -for i, (trn_idx, val_idx) in enumerate(skf.split(X_train, y_train)): - clf = lgb.LGBMRegressor( - boosting_type='gbdt', - objective='regression', - n_estimators=10000, - learning_rate=0.1, - num_leaves=170, - max_bin=63, - max_depth=-1, - random_state = 2021, - subsample_for_bin=200000, - feature_fraction=0.84, - bagging_fraction=0.86, - bagging_freq=7, - min_child_samples=89, - lambda_l1=0.006237830242067111, - lambda_l2=2.016472023736186e-05, - metric=None, - n_jobs = 30, - # device='gpu' - ) - clf.fit(X_train.iloc[trn_idx], y_train.iloc[trn_idx], eval_set=[(X_train.iloc[trn_idx], y_train.iloc[trn_idx]) - , (X_train.iloc[val_idx], y_train.iloc[val_idx])], - eval_metric=lambda y_true, y_pred:[lgb_score_mape(y_true, y_pred)], - verbose=100, early_stopping_rounds=100) - - fold_importance_df = pd.DataFrame() - fold_importance_df["feature"] = X_train.columns - 
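# Below, each fold writes its test predictions into one column of test_predict and,
# once the loop ends, the fold columns are averaged row-wise into the submitted result.
# A toy sketch of that averaging step, with made-up per-fold predictions:
import numpy as np

toy_folds = 5
toy_pred = np.zeros((3, toy_folds))               # 3 toy test rows, one column per fold
for i in range(toy_folds):
    toy_pred[:, i] = 600.0 + 10 * i               # stands in for clf.predict(X_test, ...)
# np.mean(toy_pred, axis=1) -> array([620., 620., 620.])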
fold_importance_df["importance"] = clf.feature_importances_ - fold_importance_df["fold"] = i + 1 - feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) - print('predicting') - val_predict = clf.predict(X_train.iloc[val_idx], num_iteration=clf.best_iteration_) - test_predict[:,i] = clf.predict(X_test, num_iteration=clf.best_iteration_) - - k_fold_mape.append(MAPE(y_train.iloc[val_idx],val_predict)) - print("kfold_{}_mape_score:{} ".format(i, k_fold_mape[i])) - -print('Train set kfold {} mean mape:'.format(i), np.mean(k_fold_mape)) -display_importances(feature_importance_df) -test_head['result'] = np.mean(test_predict,axis=1) -test_head['id'] = test_head['order_id'] -test_head[['id','result']].to_csv('submission.csv',index=False) +#coding=utf-8 +""" +Author: Aigege +Code: https://github.com/AiIsBetter +""" +# date 2021.08.01 +import pandas as pd +import numpy as np +import seaborn as sns +import matplotlib.pyplot as plt +from sklearn.model_selection import KFold +import lightgbm as lgb +from utils import reduce_mem_usage,reduce_mem_usage_parallel +import os +import gc +import warnings +import time +warnings.filterwarnings("ignore") +def slice_id_change(x): + hour = x * 5 / 60 + hour = np.floor(hour) + hour += 8 + if hour >= 24: + hour = hour - 24 + return hour +# 评估指标 +def MAPE(true, pred): + diff = np.abs(np.array(pred) - np.array(true)) + return np.mean(diff / true) +# 自定义lgb评估指标 +def lgb_score_mape(train_data,preds): + labels = train_data + diff = np.abs(np.array(preds) - np.array(labels)) + result = np.mean(diff / labels) + return 'mape',result, False +head_columns = ['order_id', 'ata', 'distance', 'simple_eta', 'driver_id','slice_id'] +result = [] +result_time_weight = [] +result_dis_weight = [] +count = 0 +df = [] +nrows=None +root_path = '../data/giscup_2021/' +data_list = ['20200818', '20200819', '20200820', '20200821', '20200822', '20200823', '20200824', + '20200825', '20200826', '20200827', '20200828', '20200829', '20200830', '20200831'] +#######################################本地验证####################################### +for name in os.listdir(root_path+'train/'): + data_time = name.split('.')[0] + if data_time not in data_list: + continue + train = pd.read_csv(root_path+'train/{}'.format(name),sep= ';;',header=None,nrows=nrows) + feature_cross = pd.read_csv(root_path+'feature/train/cross_fea_order_id_level_{}.csv'.format(data_time),nrows=nrows) + feature_link = pd.read_csv(root_path+'feature/train/link_fea_order_id_level_{}.csv'.format(data_time),nrows=nrows) + feature_head = pd.read_csv(root_path+'feature/train/head_link_{}.csv'.format(data_time),nrows=nrows) + feature_sqe = pd.read_csv(root_path + 'feature/train/{}.csv'.format(data_time),nrows=nrows) + + + feature_cross['order_id'] = feature_cross['order_id'].astype(str) + feature_link['order_id'] = feature_link['order_id'].astype(str) + feature_head['order_id'] = feature_head['order_id'].astype(str) + feature_sqe['order_id'] = feature_sqe['order_id'].astype(str) + + print("开始处理", data_time) + # train.columns = ['head','link','cross'] + # train['head'] = train['head'].apply(lambda x:x.split(' ')) + train_head = pd.DataFrame(train[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) + train_head['order_id'] = train_head['order_id'].astype(str) + train_head['ata'] = train_head['ata'].astype(float) + train_head['distance'] = train_head['distance'].astype(float) + train_head['simple_eta'] = train_head['simple_eta'].astype(float) + 
train_head['driver_id'] = train_head['driver_id'].astype(int) + train_head['slice_id'] = train_head['slice_id'].astype(int) + train_head['date_time'] = int(data_time) + + train_head = train_head.merge(feature_cross,on='order_id',how='left') + train_head = train_head.merge(feature_link,on='order_id',how='left') + + feature_head = feature_head.drop(['ata', 'distance', 'simple_eta', 'driver_id', 'slice_id', 'index', + 'date_time', 'link_count', 'link_time_sum', 'link_ratio_sum', + 'date_time_dt', 'weekday', 'hour', 'weather', 'hightemp', 'lowtemp', + 'len_tmp', + 'link_time_mean', 'link_time_std'], + axis=1) + feature_sqe = feature_sqe.drop(['pre_arrival_status', 'arrive_slice_id', 'slice_id'], axis=1) + train_head = train_head.merge(feature_sqe, on='order_id', how='left') + train_head = train_head.merge(feature_head, on='order_id', how='left') + + print('merge finish!') + train_head = reduce_mem_usage_parallel(train_head,28) + df.append(train_head.drop('order_id',axis=1)) + del train + gc.collect() + count +=1 +df = pd.concat(df,axis=0) + +test = pd.read_csv(root_path+'20200901_test.txt',sep= ';;',header=None,nrows=nrows) +test_head = pd.DataFrame(test[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) +test_head['order_id'] = test_head['order_id'].astype(str) +test_head['ata'] = test_head['ata'].astype(float) +test_head['distance'] = test_head['distance'].astype(float) +test_head['simple_eta'] = test_head['simple_eta'].astype(float) +test_head['driver_id'] = test_head['driver_id'].astype(int) +test_head['slice_id'] = test_head['slice_id'].astype(int) + + +feature_cross = pd.read_csv(root_path + 'feature/test/cross_fea_order_id_level_{}.csv'.format('20200901'),nrows=nrows) +feature_link = pd.read_csv(root_path + 'feature/test/link_fea_order_id_level_{}.csv'.format('20200901'), nrows=nrows) +feature_head = pd.read_csv(root_path + 'feature/test/head_link_{}.csv'.format('20200901'),nrows=nrows) +feature_sqe = pd.read_csv(root_path + 'feature/test/{}.csv'.format('20200901'),nrows=nrows) + +test_head['date_time'] = 20200901 + +feature_cross['order_id'] = feature_cross['order_id'].astype(str) +feature_link['order_id'] = feature_link['order_id'].astype(str) +feature_head['order_id'] = feature_head['order_id'].astype(str) +feature_sqe['order_id'] = feature_sqe['order_id'].astype(str) + +test_head = test_head.merge(feature_cross, on='order_id', how='left') + +test_head = test_head.merge(feature_link,on='order_id',how='left') + +feature_head = feature_head.drop(['ata', 'distance', 'simple_eta', 'driver_id', 'slice_id', 'index', + 'date_time', 'link_count', 'link_time_sum', 'link_ratio_sum', + 'date_time_dt', 'weekday', 'hour', 'weather', 'hightemp', 'lowtemp', + 'len_tmp', + 'link_time_mean', 'link_time_std'], + axis=1) +feature_sqe = feature_sqe.drop(['pre_arrival_status', 'arrive_slice_id', 'slice_id'], axis=1) +test_head = test_head.merge(feature_sqe, on='order_id', how='left') +test_head = test_head.merge(feature_head, on='order_id', how='left') + +test_head = reduce_mem_usage_parallel(test_head,28) +del feature_cross,feature_link +gc.collect() + +X_train = df.drop('ata',axis=1) +y_train = df['ata'] +X_test = test_head.drop(['order_id','ata'],axis=1) + +folds = 5 +skf = KFold(n_splits=folds, shuffle=True, random_state=2021) +train_mean = np.zeros(shape=[1,folds]) +test_predict = np.zeros(shape=[X_test.shape[0], folds],dtype=float) +k_fold_mape = [] +feature_importance_df = pd.DataFrame() +# Display/plot feature importance +def 
display_importances(feature_importance_df_): + feature_importance_df_.to_csv('feature_importances.csv',index=False) + cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:100].index + best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)] + best_features = best_features.groupby('feature',as_index = False)['importance'].mean() + best_features = best_features.sort_values(by = 'importance',ascending=False) + plt.figure(figsize=(8, 10)) + sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False)) + plt.title('LightGBM Features (avg over folds)') + plt.tight_layout() + plt.savefig('feature_importances.jpg') + # plt.show() + +scores = 0 +threshold = 0 +print('start training......') +print('训练集维度:',X_train.shape) +print('测试集维度:',X_test.shape) +for i, (trn_idx, val_idx) in enumerate(skf.split(X_train, y_train)): + clf = lgb.LGBMRegressor( + boosting_type='gbdt', + objective='regression', + n_estimators=10000, + learning_rate=0.1, + num_leaves=170, + max_bin=63, + max_depth=-1, + random_state = 2021, + subsample_for_bin=200000, + feature_fraction=0.84, + bagging_fraction=0.86, + bagging_freq=7, + min_child_samples=89, + lambda_l1=0.006237830242067111, + lambda_l2=2.016472023736186e-05, + metric=None, + n_jobs = 30, + # device='gpu' + ) + clf.fit(X_train.iloc[trn_idx], y_train.iloc[trn_idx], eval_set=[(X_train.iloc[trn_idx], y_train.iloc[trn_idx]) + , (X_train.iloc[val_idx], y_train.iloc[val_idx])], + eval_metric=lambda y_true, y_pred:[lgb_score_mape(y_true, y_pred)], + verbose=100, early_stopping_rounds=100) + + fold_importance_df = pd.DataFrame() + fold_importance_df["feature"] = X_train.columns + fold_importance_df["importance"] = clf.feature_importances_ + fold_importance_df["fold"] = i + 1 + feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) + print('predicting') + val_predict = clf.predict(X_train.iloc[val_idx], num_iteration=clf.best_iteration_) + test_predict[:,i] = clf.predict(X_test, num_iteration=clf.best_iteration_) + + k_fold_mape.append(MAPE(y_train.iloc[val_idx],val_predict)) + print("kfold_{}_mape_score:{} ".format(i, k_fold_mape[i])) + +print('Train set kfold {} mean mape:'.format(i), np.mean(k_fold_mape)) +display_importances(feature_importance_df) +test_head['result'] = np.mean(test_predict,axis=1) +test_head['id'] = test_head['order_id'] +test_head[['id','result']].to_csv('submission.csv',index=False) diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/5_model_final.py b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/5_model_final.py similarity index 97% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/5_model_final.py rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/5_model_final.py index 6788352..2b8dd89 100644 --- a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/5_model_final.py +++ b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/5_model_final.py @@ -1,301 +1,301 @@ -#coding=utf-8 -""" -Author: Aigege -Code: https://github.com/AiIsBetter -""" -# date 2021.08.01 -import pandas as pd -import numpy as np -import seaborn as sns -import matplotlib.pyplot as plt -from sklearn.model_selection import KFold -import lightgbm as lgb -from utils import reduce_mem_usage,reduce_mem_usage_parallel,lgb_score_mape,MAPE -import gc -import warnings -import os,random,pickle -import optuna -warnings.filterwarnings("ignore") -def slice_id_change(x): - hour = x * 5 / 60 - hour = np.floor(hour) - 
hour += 8 - if hour >= 24: - hour = hour - 24 - return hour -def optuna_print(tr_x, tr_y, te_x,te_y): - def objective(trial,tr_x, tr_y, te_x,te_y): - dtrain = lgb.Dataset(tr_x, label=tr_y) - dvalid = lgb.Dataset(te_x, label=te_y) - param = { - "objective": "regression", - "metric": "mape", - "verbosity": -1, - "boosting_type": "gbdt", - 'min_split_gain': 0, - 'random_state':2021, - 'max_bin':trial.suggest_int('max_bin',63,250), - 'subsample_for_bin': trial.suggest_int('subsample_for_bin', 40000, 300000), - "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0), - "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0), - "num_leaves": trial.suggest_int("num_leaves", 2, 256), - "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0), - "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0), - "bagging_freq": trial.suggest_int("bagging_freq", 1, 7), - "min_child_samples": trial.suggest_int("min_child_samples", 5, 100), - } - # Add a callback for pruning. - pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "mape") - gbm = lgb.train( - param, dtrain, valid_sets=[dvalid], verbose_eval=False, callbacks=[pruning_callback] - ) - - preds = gbm.predict(te_x) - pred_labels = np.rint(preds) - mape = MAPE(te_y, pred_labels) - return mape - study = optuna.create_study( - pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), direction="minimize" - ) - study.optimize(lambda trial: objective(trial, tr_x, tr_y, te_x, te_y), - n_trials=100) - - print("Number of finished trials: {}".format(len(study.trials))) - print("Best trial:") - trial = study.best_trial - print(" Value: {}".format(trial.value)) - print(" Params: ") - for key, value in trial.params.items(): - print(" {}: {}".format(key, value)) -head_columns = ['order_id', 'ata', 'distance', 'simple_eta', 'driver_id','slice_id'] -result = [] -result_time_weight = [] -result_dis_weight = [] -count = 0 -df = [] -nrows=None -root_path = '../data/giscup_2021/' -data_list = ['20200818', '20200819', '20200820', '20200821', '20200822', '20200823', '20200824', - '20200825', '20200826', '20200827', '20200828', '20200829', '20200830', '20200831'] -for name in os.listdir(root_path+'train/'): - data_time = name.split('.')[0] - if data_time not in data_list: - continue - train = pd.read_csv(root_path+'train/{}'.format(name),sep= ';;',header=None,nrows=nrows) - feature_cross = pd.read_csv(root_path+'feature/train/cross_fea_order_id_level_{}.csv'.format(data_time),nrows=nrows) - feature_link = pd.read_csv(root_path+'feature/train/link_fea_order_id_level_{}.csv'.format(data_time),nrows=nrows) - feature_head = pd.read_csv(root_path+'feature/train/head_link_{}.csv'.format(data_time),nrows=nrows) - feature_sqe = pd.read_csv(root_path + 'feature/train/{}.csv'.format(data_time),nrows=nrows) - feature_cross['order_id'] = feature_cross['order_id'].astype(str) - feature_link['order_id'] = feature_link['order_id'].astype(str) - feature_head['order_id'] = feature_head['order_id'].astype(str) - feature_sqe['order_id'] = feature_sqe['order_id'].astype(str) - - print("开始处理", data_time) - # train.columns = ['head','link','cross'] - # train['head'] = train['head'].apply(lambda x:x.split(' ')) - train_head = pd.DataFrame(train[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) - train_head['order_id'] = train_head['order_id'].astype(str) - train_head['ata'] = train_head['ata'].astype(float) - train_head['distance'] = train_head['distance'].astype(float) - 
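# Format note for the raw order files parsed in this loop, inferred from the commented-out
# column names above: each ';;'-separated line carries three fields (head, link, cross),
# and the head field is a space-separated record of order_id, ata, distance, simple_eta,
# driver_id and slice_id, which is what the train[0].str.split(' ') call expands into
# train_head before the dtype casts here.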
train_head['simple_eta'] = train_head['simple_eta'].astype(float) - train_head['driver_id'] = train_head['driver_id'].astype(int) - train_head['slice_id'] = train_head['slice_id'].astype(int) - train_head['date_time'] = int(data_time) - train_head = train_head.merge(feature_cross,on='order_id',how='left') - train_head = train_head.merge(feature_link,on='order_id',how='left') - - feature_head = feature_head.drop(['ata', 'distance', 'simple_eta', 'driver_id', 'slice_id', 'index', - 'date_time', 'link_count', 'link_time_sum', 'link_ratio_sum', - 'date_time_dt', 'weekday', 'hour', 'weather', 'hightemp', 'lowtemp', - 'len_tmp', - 'link_time_mean', 'link_time_std'], - axis=1) - feature_sqe = feature_sqe.drop(['pre_arrival_status', 'arrive_slice_id', 'slice_id'], axis=1) - train_head = train_head.merge(feature_sqe, on='order_id', how='left') - train_head = train_head.merge(feature_head, on='order_id', how='left') - print('merge finish!') - train_head = reduce_mem_usage_parallel(train_head,28) - df.append(train_head.drop('order_id',axis=1)) - del train - gc.collect() - count +=1 -df = pd.concat(df,axis=0) -test = pd.read_csv(root_path+'20200901_test.txt',sep= ';;',header=None,nrows=nrows) -test_head = pd.DataFrame(test[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) -test_head['order_id'] = test_head['order_id'].astype(str) -test_head['ata'] = test_head['ata'].astype(float) -test_head['distance'] = test_head['distance'].astype(float) -test_head['simple_eta'] = test_head['simple_eta'].astype(float) -test_head['driver_id'] = test_head['driver_id'].astype(int) -test_head['slice_id'] = test_head['slice_id'].astype(int) - -feature_cross = pd.read_csv(root_path + 'feature/test/cross_fea_order_id_level_{}.csv'.format('20200901'),nrows=nrows) -feature_link = pd.read_csv(root_path + 'feature/test/link_fea_order_id_level_{}.csv'.format('20200901'), nrows=nrows) -feature_head = pd.read_csv(root_path + 'feature/test/head_link_{}.csv'.format('20200901'),nrows=nrows) -feature_sqe = pd.read_csv(root_path + 'feature/test/{}.csv'.format('20200901'),nrows=nrows) -test_head['date_time'] = 20200901 - -feature_cross['order_id'] = feature_cross['order_id'].astype(str) -feature_link['order_id'] = feature_link['order_id'].astype(str) -feature_head['order_id'] = feature_head['order_id'].astype(str) -feature_sqe['order_id'] = feature_sqe['order_id'].astype(str) - -test_head = test_head.merge(feature_cross, on='order_id', how='left') -test_head = test_head.merge(feature_link,on='order_id',how='left') -feature_head = feature_head.drop(['ata', 'distance', 'simple_eta', 'driver_id', 'slice_id', 'index', - 'date_time', 'link_count', 'link_time_sum', 'link_ratio_sum', - 'date_time_dt', 'weekday', 'hour', 'weather', 'hightemp', 'lowtemp', - 'len_tmp', - 'link_time_mean', 'link_time_std'], - axis=1) -feature_sqe = feature_sqe.drop(['pre_arrival_status', 'arrive_slice_id', 'slice_id'], axis=1) -test_head = test_head.merge(feature_sqe, on='order_id', how='left') -test_head = test_head.merge(feature_head, on='order_id', how='left') - -test_head = reduce_mem_usage_parallel(test_head,28) -del feature_cross,feature_link -gc.collect() - -X_train = df.drop('ata',axis=1) -y_train = df['ata'] -X_test = test_head.drop(['order_id','ata'],axis=1) -#调参 -#tr_x, te_x,tr_y,te_y = train_test_split(X_train,y_train,test_size=0.2,random_state=2021) -#optuna_print(tr_x, tr_y, te_x,te_y) -#del tr_x, te_x,tr_y,te_y -#gc.collect() - -folds = 5 -skf = KFold(n_splits=folds, shuffle=True, 
random_state=2021) -train_mean = np.zeros(shape=[1,folds]) -test_predict = np.zeros(shape=[X_test.shape[0], folds],dtype=float) -k_fold_mape = [] -feature_importance_df = pd.DataFrame() -# Display/plot feature importance -def display_importances(feature_importance_df_): - feature_importance_df_.to_csv('feature_importances.csv',index=False) - cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:100].index - best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)] - best_features = best_features.groupby('feature',as_index = False)['importance'].mean() - best_features = best_features.sort_values(by = 'importance',ascending=False) - plt.figure(figsize=(8, 10)) - sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False)) - plt.title('LightGBM Features (avg over folds)') - plt.tight_layout() - plt.savefig('feature_importances.jpg') - # plt.show() -#use single model feature importance as best_feature_importances -feature_importance_df_ = pd.read_csv('best_feature_importances.csv') -cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).index -best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)] -best_features = best_features.groupby('feature',as_index = False)['importance'].mean() -best_features = best_features.sort_values(by = 'importance',ascending=False) -data=best_features.sort_values(by="importance", ascending=False) -feature_select = list(data['feature'].values) -feature_cols = feature_select - -random_seed = list(range(2021)) -max_depth = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7] -lambd1 = np.arange(0, 1, 0.0001) -lambd2 = np.arange(0, 1, 0.0001) -bagging_fraction = [i / 1000.0 for i in range(700, 800)] -feature_fraction = [i / 1000.0 for i in range(700, 800)] -min_child_weight = [i / 100.0 for i in range(150, 250)] -n_feature = [i / 100.0 for i in range(1, 32,2)] -max_bin = list(range(130, 240)) -subsample_for_bin = list(range(50000, 220000,10000)) -bagging_freq = [1,2,3,4,5,6,7,8,9,10,1,2,3,4,5] -num_leaves = list(range(130, 250)) - - -random.shuffle(random_seed) -random.shuffle(max_depth) -random.shuffle(lambd1) -random.shuffle(lambd2) -random.shuffle(bagging_fraction) -random.shuffle(feature_fraction) -random.shuffle(min_child_weight) -random.shuffle(max_bin) -random.shuffle(subsample_for_bin) -random.shuffle(bagging_freq) -random.shuffle(num_leaves) -random.shuffle(n_feature) - - -with open('params.pkl', 'wb') as f: - pickle.dump((random_seed, max_depth, lambd1,lambd2, bagging_fraction, feature_fraction, min_child_weight, max_bin,subsample_for_bin,bagging_freq,num_leaves,n_feature), f) -for iter in range(15): - print('max_depth:',max_depth[iter],'random_seed:',random_seed[iter],'feature_fraction:',feature_fraction[iter], - 'bagging_fraction:',bagging_fraction[iter],'min_child_weight:',min_child_weight[iter], - 'lambd1:',lambd1[iter],'lambd2:',lambd2[iter],'max_bin:',max_bin[iter],'num_leaves:',num_leaves[iter] - ,'subsample_for_bin:',subsample_for_bin[iter],'bagging_freq:',bagging_freq[iter],'n_feature:',n_feature[iter]) -nround = 5000 -for iter in range(15): - if max_depth[iter]==4: - nround = 10000 - elif max_depth[iter]==5: - nround = 8000 - elif max_depth[iter]==6: - nround = 6000 - elif max_depth[iter] == 7: - nround = 5000 - X_train_r = X_train[feature_cols[:int(len(feature_cols)*0.7)]+ - 
feature_cols[int(len(feature_cols)*0.7):int(len(feature_cols)*0.7)+int(len(feature_cols)*n_feature[iter])]] - X_test_r = X_test[feature_cols[:int(len(feature_cols) * 0.7)] + - feature_cols[int(len(feature_cols) * 0.7):int(len(feature_cols) * 0.7) + int( - len(feature_cols) * n_feature[iter])]] - scores = 0 - threshold = 0 - print('start training......') - print('训练集维度:',X_train_r.shape) - print('测试集维度:',X_test_r.shape) - for i, (trn_idx, val_idx) in enumerate(skf.split(X_train_r, y_train)): - clf = lgb.LGBMRegressor( - boosting_type='gbdt', - objective='regression', - n_estimators=nround, - learning_rate=0.08, - num_leaves=num_leaves[iter], - max_bin=max_bin[iter], - max_depth=max_depth[iter], - random_state=random_seed[iter], - subsample_for_bin=subsample_for_bin[iter], - feature_fraction=feature_fraction[iter], - bagging_fraction=bagging_fraction[iter], - bagging_freq=bagging_freq[iter], - min_child_weight=min_child_weight[iter], - lambda_l1=lambd1[iter], - lambda_l2=lambd2[iter], - metric=None, - n_jobs=30, - device='gpu' - ) - clf.fit(X_train_r.iloc[trn_idx], y_train.iloc[trn_idx], eval_set=[(X_train_r.iloc[trn_idx], y_train.iloc[trn_idx]), (X_train_r.iloc[val_idx], y_train.iloc[val_idx])],eval_metric='mape',verbose=100, early_stopping_rounds=200) - - print('predicting') - val_predict = clf.predict(X_train_r.iloc[val_idx], num_iteration=clf.best_iteration_) - test_predict[:,i] = clf.predict(X_test_r, num_iteration=clf.best_iteration_) - k_fold_mape.append(MAPE(y_train.iloc[val_idx],val_predict)) - print("kfold_{}_mape_score:{} ".format(i, k_fold_mape[i])) - - print('Train set kfold {} mean mape:'.format(i), np.mean(k_fold_mape)) - #display_importances(feature_importance_df) - test_head['result'] = np.mean(test_predict,axis=1) - test_head['id'] = test_head['order_id'] - test_head[['id','result']].to_csv('random_result/submission_{}.csv'.format(iter),index=False) - del X_train_r,X_test_r - gc.collect() -#merge -count = 0 -result = 1 -for name in os.listdir('random_result/'): - tmp = pd.read_csv('random_result/'+name) - if count == 0: - result = tmp[['id']] - tmp = tmp.rename(columns={'result':'result{}'.format(count)}) - result = result.merge(tmp,on='id',how='left') - count += 1 -result['result'] = result.drop('id',axis=1).sum(axis=1) -result['result'] = result['result']/count -result[['id','result']].to_csv('submission_merge.csv',index=False) +#coding=utf-8 +""" +Author: Aigege +Code: https://github.com/AiIsBetter +""" +# date 2021.08.01 +import pandas as pd +import numpy as np +import seaborn as sns +import matplotlib.pyplot as plt +from sklearn.model_selection import KFold +import lightgbm as lgb +from utils import reduce_mem_usage,reduce_mem_usage_parallel,lgb_score_mape,MAPE +import gc +import warnings +import os,random,pickle +import optuna +warnings.filterwarnings("ignore") +def slice_id_change(x): + hour = x * 5 / 60 + hour = np.floor(hour) + hour += 8 + if hour >= 24: + hour = hour - 24 + return hour +def optuna_print(tr_x, tr_y, te_x,te_y): + def objective(trial,tr_x, tr_y, te_x,te_y): + dtrain = lgb.Dataset(tr_x, label=tr_y) + dvalid = lgb.Dataset(te_x, label=te_y) + param = { + "objective": "regression", + "metric": "mape", + "verbosity": -1, + "boosting_type": "gbdt", + 'min_split_gain': 0, + 'random_state':2021, + 'max_bin':trial.suggest_int('max_bin',63,250), + 'subsample_for_bin': trial.suggest_int('subsample_for_bin', 40000, 300000), + "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0), + "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0), + 
"num_leaves": trial.suggest_int("num_leaves", 2, 256), + "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0), + "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0), + "bagging_freq": trial.suggest_int("bagging_freq", 1, 7), + "min_child_samples": trial.suggest_int("min_child_samples", 5, 100), + } + # Add a callback for pruning. + pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "mape") + gbm = lgb.train( + param, dtrain, valid_sets=[dvalid], verbose_eval=False, callbacks=[pruning_callback] + ) + + preds = gbm.predict(te_x) + pred_labels = np.rint(preds) + mape = MAPE(te_y, pred_labels) + return mape + study = optuna.create_study( + pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), direction="minimize" + ) + study.optimize(lambda trial: objective(trial, tr_x, tr_y, te_x, te_y), + n_trials=100) + + print("Number of finished trials: {}".format(len(study.trials))) + print("Best trial:") + trial = study.best_trial + print(" Value: {}".format(trial.value)) + print(" Params: ") + for key, value in trial.params.items(): + print(" {}: {}".format(key, value)) +head_columns = ['order_id', 'ata', 'distance', 'simple_eta', 'driver_id','slice_id'] +result = [] +result_time_weight = [] +result_dis_weight = [] +count = 0 +df = [] +nrows=None +root_path = '../data/giscup_2021/' +data_list = ['20200818', '20200819', '20200820', '20200821', '20200822', '20200823', '20200824', + '20200825', '20200826', '20200827', '20200828', '20200829', '20200830', '20200831'] +for name in os.listdir(root_path+'train/'): + data_time = name.split('.')[0] + if data_time not in data_list: + continue + train = pd.read_csv(root_path+'train/{}'.format(name),sep= ';;',header=None,nrows=nrows) + feature_cross = pd.read_csv(root_path+'feature/train/cross_fea_order_id_level_{}.csv'.format(data_time),nrows=nrows) + feature_link = pd.read_csv(root_path+'feature/train/link_fea_order_id_level_{}.csv'.format(data_time),nrows=nrows) + feature_head = pd.read_csv(root_path+'feature/train/head_link_{}.csv'.format(data_time),nrows=nrows) + feature_sqe = pd.read_csv(root_path + 'feature/train/{}.csv'.format(data_time),nrows=nrows) + feature_cross['order_id'] = feature_cross['order_id'].astype(str) + feature_link['order_id'] = feature_link['order_id'].astype(str) + feature_head['order_id'] = feature_head['order_id'].astype(str) + feature_sqe['order_id'] = feature_sqe['order_id'].astype(str) + + print("开始处理", data_time) + # train.columns = ['head','link','cross'] + # train['head'] = train['head'].apply(lambda x:x.split(' ')) + train_head = pd.DataFrame(train[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) + train_head['order_id'] = train_head['order_id'].astype(str) + train_head['ata'] = train_head['ata'].astype(float) + train_head['distance'] = train_head['distance'].astype(float) + train_head['simple_eta'] = train_head['simple_eta'].astype(float) + train_head['driver_id'] = train_head['driver_id'].astype(int) + train_head['slice_id'] = train_head['slice_id'].astype(int) + train_head['date_time'] = int(data_time) + train_head = train_head.merge(feature_cross,on='order_id',how='left') + train_head = train_head.merge(feature_link,on='order_id',how='left') + + feature_head = feature_head.drop(['ata', 'distance', 'simple_eta', 'driver_id', 'slice_id', 'index', + 'date_time', 'link_count', 'link_time_sum', 'link_ratio_sum', + 'date_time_dt', 'weekday', 'hour', 'weather', 'hightemp', 'lowtemp', + 'len_tmp', + 'link_time_mean', 
'link_time_std'], + axis=1) + feature_sqe = feature_sqe.drop(['pre_arrival_status', 'arrive_slice_id', 'slice_id'], axis=1) + train_head = train_head.merge(feature_sqe, on='order_id', how='left') + train_head = train_head.merge(feature_head, on='order_id', how='left') + print('merge finish!') + train_head = reduce_mem_usage_parallel(train_head,28) + df.append(train_head.drop('order_id',axis=1)) + del train + gc.collect() + count +=1 +df = pd.concat(df,axis=0) +test = pd.read_csv(root_path+'20200901_test.txt',sep= ';;',header=None,nrows=nrows) +test_head = pd.DataFrame(test[0].str.split(' ').tolist(),columns = ['order_id', 'ata', 'distance','simple_eta', 'driver_id', 'slice_id']) +test_head['order_id'] = test_head['order_id'].astype(str) +test_head['ata'] = test_head['ata'].astype(float) +test_head['distance'] = test_head['distance'].astype(float) +test_head['simple_eta'] = test_head['simple_eta'].astype(float) +test_head['driver_id'] = test_head['driver_id'].astype(int) +test_head['slice_id'] = test_head['slice_id'].astype(int) + +feature_cross = pd.read_csv(root_path + 'feature/test/cross_fea_order_id_level_{}.csv'.format('20200901'),nrows=nrows) +feature_link = pd.read_csv(root_path + 'feature/test/link_fea_order_id_level_{}.csv'.format('20200901'), nrows=nrows) +feature_head = pd.read_csv(root_path + 'feature/test/head_link_{}.csv'.format('20200901'),nrows=nrows) +feature_sqe = pd.read_csv(root_path + 'feature/test/{}.csv'.format('20200901'),nrows=nrows) +test_head['date_time'] = 20200901 + +feature_cross['order_id'] = feature_cross['order_id'].astype(str) +feature_link['order_id'] = feature_link['order_id'].astype(str) +feature_head['order_id'] = feature_head['order_id'].astype(str) +feature_sqe['order_id'] = feature_sqe['order_id'].astype(str) + +test_head = test_head.merge(feature_cross, on='order_id', how='left') +test_head = test_head.merge(feature_link,on='order_id',how='left') +feature_head = feature_head.drop(['ata', 'distance', 'simple_eta', 'driver_id', 'slice_id', 'index', + 'date_time', 'link_count', 'link_time_sum', 'link_ratio_sum', + 'date_time_dt', 'weekday', 'hour', 'weather', 'hightemp', 'lowtemp', + 'len_tmp', + 'link_time_mean', 'link_time_std'], + axis=1) +feature_sqe = feature_sqe.drop(['pre_arrival_status', 'arrive_slice_id', 'slice_id'], axis=1) +test_head = test_head.merge(feature_sqe, on='order_id', how='left') +test_head = test_head.merge(feature_head, on='order_id', how='left') + +test_head = reduce_mem_usage_parallel(test_head,28) +del feature_cross,feature_link +gc.collect() + +X_train = df.drop('ata',axis=1) +y_train = df['ata'] +X_test = test_head.drop(['order_id','ata'],axis=1) +#调参 +#tr_x, te_x,tr_y,te_y = train_test_split(X_train,y_train,test_size=0.2,random_state=2021) +#optuna_print(tr_x, tr_y, te_x,te_y) +#del tr_x, te_x,tr_y,te_y +#gc.collect() + +folds = 5 +skf = KFold(n_splits=folds, shuffle=True, random_state=2021) +train_mean = np.zeros(shape=[1,folds]) +test_predict = np.zeros(shape=[X_test.shape[0], folds],dtype=float) +k_fold_mape = [] +feature_importance_df = pd.DataFrame() +# Display/plot feature importance +def display_importances(feature_importance_df_): + feature_importance_df_.to_csv('feature_importances.csv',index=False) + cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:100].index + best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)] + best_features = best_features.groupby('feature',as_index = False)['importance'].mean() 
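# Overview of the rest of this script (below the display_importances helper): it reloads
# best_feature_importances.csv from an earlier single-model run, ranks features by mean
# importance, and then runs 15 randomised training rounds. Each round keeps the top 70% of
# the ranked features plus an extra slice whose size is drawn from n_feature, draws
# max_depth/num_leaves/max_bin/bagging/lambda settings from pre-shuffled lists (dumped to
# params.pkl for reproducibility), ties the number of boosting rounds to max_depth
# (deeper trees get fewer rounds), trains a 5-fold LightGBM regressor with early stopping
# on MAPE, and writes random_result/submission_{iter}.csv. The final block averages the
# result column across all files in random_result/ into submission_merge.csv.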
+ best_features = best_features.sort_values(by = 'importance',ascending=False) + plt.figure(figsize=(8, 10)) + sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False)) + plt.title('LightGBM Features (avg over folds)') + plt.tight_layout() + plt.savefig('feature_importances.jpg') + # plt.show() +#use single model feature importance as best_feature_importances +feature_importance_df_ = pd.read_csv('best_feature_importances.csv') +cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).index +best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)] +best_features = best_features.groupby('feature',as_index = False)['importance'].mean() +best_features = best_features.sort_values(by = 'importance',ascending=False) +data=best_features.sort_values(by="importance", ascending=False) +feature_select = list(data['feature'].values) +feature_cols = feature_select + +random_seed = list(range(2021)) +max_depth = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7] +lambd1 = np.arange(0, 1, 0.0001) +lambd2 = np.arange(0, 1, 0.0001) +bagging_fraction = [i / 1000.0 for i in range(700, 800)] +feature_fraction = [i / 1000.0 for i in range(700, 800)] +min_child_weight = [i / 100.0 for i in range(150, 250)] +n_feature = [i / 100.0 for i in range(1, 32,2)] +max_bin = list(range(130, 240)) +subsample_for_bin = list(range(50000, 220000,10000)) +bagging_freq = [1,2,3,4,5,6,7,8,9,10,1,2,3,4,5] +num_leaves = list(range(130, 250)) + + +random.shuffle(random_seed) +random.shuffle(max_depth) +random.shuffle(lambd1) +random.shuffle(lambd2) +random.shuffle(bagging_fraction) +random.shuffle(feature_fraction) +random.shuffle(min_child_weight) +random.shuffle(max_bin) +random.shuffle(subsample_for_bin) +random.shuffle(bagging_freq) +random.shuffle(num_leaves) +random.shuffle(n_feature) + + +with open('params.pkl', 'wb') as f: + pickle.dump((random_seed, max_depth, lambd1,lambd2, bagging_fraction, feature_fraction, min_child_weight, max_bin,subsample_for_bin,bagging_freq,num_leaves,n_feature), f) +for iter in range(15): + print('max_depth:',max_depth[iter],'random_seed:',random_seed[iter],'feature_fraction:',feature_fraction[iter], + 'bagging_fraction:',bagging_fraction[iter],'min_child_weight:',min_child_weight[iter], + 'lambd1:',lambd1[iter],'lambd2:',lambd2[iter],'max_bin:',max_bin[iter],'num_leaves:',num_leaves[iter] + ,'subsample_for_bin:',subsample_for_bin[iter],'bagging_freq:',bagging_freq[iter],'n_feature:',n_feature[iter]) +nround = 5000 +for iter in range(15): + if max_depth[iter]==4: + nround = 10000 + elif max_depth[iter]==5: + nround = 8000 + elif max_depth[iter]==6: + nround = 6000 + elif max_depth[iter] == 7: + nround = 5000 + X_train_r = X_train[feature_cols[:int(len(feature_cols)*0.7)]+ + feature_cols[int(len(feature_cols)*0.7):int(len(feature_cols)*0.7)+int(len(feature_cols)*n_feature[iter])]] + X_test_r = X_test[feature_cols[:int(len(feature_cols) * 0.7)] + + feature_cols[int(len(feature_cols) * 0.7):int(len(feature_cols) * 0.7) + int( + len(feature_cols) * n_feature[iter])]] + scores = 0 + threshold = 0 + print('start training......') + print('训练集维度:',X_train_r.shape) + print('测试集维度:',X_test_r.shape) + for i, (trn_idx, val_idx) in enumerate(skf.split(X_train_r, y_train)): + clf = lgb.LGBMRegressor( + boosting_type='gbdt', + objective='regression', + n_estimators=nround, + learning_rate=0.08, + num_leaves=num_leaves[iter], + max_bin=max_bin[iter], + max_depth=max_depth[iter], + 
random_state=random_seed[iter], + subsample_for_bin=subsample_for_bin[iter], + feature_fraction=feature_fraction[iter], + bagging_fraction=bagging_fraction[iter], + bagging_freq=bagging_freq[iter], + min_child_weight=min_child_weight[iter], + lambda_l1=lambd1[iter], + lambda_l2=lambd2[iter], + metric=None, + n_jobs=30, + device='gpu' + ) + clf.fit(X_train_r.iloc[trn_idx], y_train.iloc[trn_idx], eval_set=[(X_train_r.iloc[trn_idx], y_train.iloc[trn_idx]), (X_train_r.iloc[val_idx], y_train.iloc[val_idx])],eval_metric='mape',verbose=100, early_stopping_rounds=200) + + print('predicting') + val_predict = clf.predict(X_train_r.iloc[val_idx], num_iteration=clf.best_iteration_) + test_predict[:,i] = clf.predict(X_test_r, num_iteration=clf.best_iteration_) + k_fold_mape.append(MAPE(y_train.iloc[val_idx],val_predict)) + print("kfold_{}_mape_score:{} ".format(i, k_fold_mape[i])) + + print('Train set kfold {} mean mape:'.format(i), np.mean(k_fold_mape)) + #display_importances(feature_importance_df) + test_head['result'] = np.mean(test_predict,axis=1) + test_head['id'] = test_head['order_id'] + test_head[['id','result']].to_csv('random_result/submission_{}.csv'.format(iter),index=False) + del X_train_r,X_test_r + gc.collect() +#merge +count = 0 +result = 1 +for name in os.listdir('random_result/'): + tmp = pd.read_csv('random_result/'+name) + if count == 0: + result = tmp[['id']] + tmp = tmp.rename(columns={'result':'result{}'.format(count)}) + result = result.merge(tmp,on='id',how='left') + count += 1 +result['result'] = result.drop('id',axis=1).sum(axis=1) +result['result'] = result['result']/count +result[['id','result']].to_csv('submission_merge.csv',index=False) diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/README.MD b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/README.MD similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/README.MD rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/README.MD diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/node2vec b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/node2vec similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/node2vec rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/node2vec diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/utils.py b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/utils.py similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/LGB_13700/utils.py rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/LGB_13700/utils.py diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/README.md b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/README.md similarity index 87% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/README.md rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/README.md index 6768e7b..892170b 100644 --- a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/README.md +++ b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/README.md @@ -51,7 +51,7 @@ ### 4. 
模型说明 -- [DCN模型](https://github.com/ben1234560/AiLearning-Theory-Applying/tree/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AB%9E%E8%B5%9B%E5%AE%9E%E6%88%98_%E4%BC%98%E8%83%9C%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88/%E6%BB%B4%E6%BB%B4%E2%80%94%E2%80%94%E9%A2%84%E4%BC%B0%E5%88%B0%E8%BE%BE%E6%97%B6%E9%97%B4/DCN_12953) +- [DCN蒸馏模型](https://github.com/ben1234560/AiLearning-Theory-Applying/tree/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AB%9E%E8%B5%9B%E5%AE%9E%E6%88%98_%E4%BC%98%E8%83%9C%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88/%E6%BB%B4%E6%BB%B4%E2%80%94%E2%80%94%E9%A2%84%E4%BC%B0%E5%88%B0%E8%BE%BE%E6%97%B6%E9%97%B4/DCN_12953) - ![1628669063602](assets/1628669063602.png) - [WDR模型](https://github.com/ben1234560/AiLearning-Theory-Applying/tree/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AB%9E%E8%B5%9B%E5%AE%9E%E6%88%98_%E4%BC%98%E8%83%9C%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88/%E6%BB%B4%E6%BB%B4%E2%80%94%E2%80%94%E9%A2%84%E4%BC%B0%E5%88%B0%E8%BE%BE%E6%97%B6%E9%97%B4/WD_128544) - ![1628669073291](assets/1628669073291.png) @@ -81,8 +81,8 @@ ### 7. 文件说明 -- [DCN_12953](https://github.com/ben1234560/AiLearning-Theory-Applying/tree/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AB%9E%E8%B5%9B%E5%AE%9E%E6%88%98_%E4%BC%98%E8%83%9C%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88/%E6%BB%B4%E6%BB%B4%E2%80%94%E2%80%94%E9%A2%84%E4%BC%B0%E5%88%B0%E8%BE%BE%E6%97%B6%E9%97%B4/DCN_12953) - - DCN模型,线上分数0.12953 +- [DCN蒸馏_12953](https://github.com/ben1234560/AiLearning-Theory-Applying/tree/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AB%9E%E8%B5%9B%E5%AE%9E%E6%88%98_%E4%BC%98%E8%83%9C%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88/%E6%BB%B4%E6%BB%B4%E2%80%94%E2%80%94%E9%A2%84%E4%BC%B0%E5%88%B0%E8%BE%BE%E6%97%B6%E9%97%B4/DCN_12953) + - DCN蒸馏模型(利用“未来”数据),线上分数0.12953 - dcn_model/[dcn_model.py](https://github.com/ben1234560/AiLearning-Theory-Applying/blob/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AB%9E%E8%B5%9B%E5%AE%9E%E6%88%98_%E4%BC%98%E8%83%9C%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88/%E6%BB%B4%E6%BB%B4%E2%80%94%E2%80%94%E9%A2%84%E4%BC%B0%E5%88%B0%E8%BE%BE%E6%97%B6%E9%97%B4/DCN_12953/dcn_model/dcn_model.py):模型代码 - dcn_model/[main.py](https://github.com/ben1234560/AiLearning-Theory-Applying/blob/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AB%9E%E8%B5%9B%E5%AE%9E%E6%88%98_%E4%BC%98%E8%83%9C%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88/%E6%BB%B4%E6%BB%B4%E2%80%94%E2%80%94%E9%A2%84%E4%BC%B0%E5%88%B0%E8%BE%BE%E6%97%B6%E9%97%B4/DCN_12953/dcn_model/main.py):主函数,训练和预测 - dcn_model/[process.py](https://github.com/ben1234560/AiLearning-Theory-Applying/blob/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AB%9E%E8%B5%9B%E5%AE%9E%E6%88%98_%E4%BC%98%E8%83%9C%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88/%E6%BB%B4%E6%BB%B4%E2%80%94%E2%80%94%E9%A2%84%E4%BC%B0%E5%88%B0%E8%BE%BE%E6%97%B6%E9%97%B4/DCN_12953/dcn_model/process.py):特征预处理 diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/log/main_0730_2.log b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/log/main_0730_2.log similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/log/main_0730_2.log rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/log/main_0730_2.log diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/model_h5/wd_cross_cols_list_0730_2.npy b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/model_h5/wd_cross_cols_list_0730_2.npy similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/model_h5/wd_cross_cols_list_0730_2.npy rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/model_h5/wd_cross_cols_list_0730_2.npy diff --git 
a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/model_h5/wd_link_cols_list_0730_2.npy b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/model_h5/wd_link_cols_list_0730_2.npy similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/model_h5/wd_link_cols_list_0730_2.npy rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/model_h5/wd_link_cols_list_0730_2.npy diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/model_h5/wd_mk_cols_list_0730_2.npy b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/model_h5/wd_mk_cols_list_0730_2.npy similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/model_h5/wd_mk_cols_list_0730_2.npy rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/model_h5/wd_mk_cols_list_0730_2.npy diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/wd_model/__pycache__/process.cpython-36.pyc b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/wd_model/__pycache__/process.cpython-36.pyc similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/wd_model/__pycache__/process.cpython-36.pyc rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/wd_model/__pycache__/process.cpython-36.pyc diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/wd_model/__pycache__/process.cpython-38.pyc b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/wd_model/__pycache__/process.cpython-38.pyc similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/wd_model/__pycache__/process.cpython-38.pyc rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/wd_model/__pycache__/process.cpython-38.pyc diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/wd_model/__pycache__/wd_model.cpython-36.pyc b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/wd_model/__pycache__/wd_model.cpython-36.pyc similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/wd_model/__pycache__/wd_model.cpython-36.pyc rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/wd_model/__pycache__/wd_model.cpython-36.pyc diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/wd_model/main.py b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/wd_model/main.py similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/wd_model/main.py rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/wd_model/main.py diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/wd_model/process.py b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/wd_model/process.py similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/wd_model/process.py rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/wd_model/process.py diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/wd_model/wd_model.py b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/wd_model/wd_model.py similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/WD_128544/wd_model/wd_model.py rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/WD_128544/wd_model/wd_model.py diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/assets/1628602069041.png b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/assets/1628602069041.png similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/assets/1628602069041.png rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/assets/1628602069041.png diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/assets/1628602545539.png b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/assets/1628602545539.png similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/assets/1628602545539.png rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 
GISCUP/assets/1628602545539.png diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/assets/1628668115968.png b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/assets/1628668115968.png similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/assets/1628668115968.png rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/assets/1628668115968.png diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/assets/1628669063602.png b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/assets/1628669063602.png similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/assets/1628669063602.png rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/assets/1628669063602.png diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/assets/1628669073291.png b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/assets/1628669073291.png similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/assets/1628669073291.png rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/assets/1628669073291.png diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/assets/1628669152380.png b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/assets/1628669152380.png similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/assets/1628669152380.png rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/assets/1628669152380.png diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/assets/1628670144983.png b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/assets/1628670144983.png similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/assets/1628670144983.png rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/assets/1628670144983.png diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/assets/1628670345575.png b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/assets/1628670345575.png similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/assets/1628670345575.png rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/assets/1628670345575.png diff --git a/机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/预估到达时间解题思路.pdf b/机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/预估到达时间解题思路.pdf similarity index 100% rename from 机器学习竞赛实战_优胜解决方案/滴滴——预估到达时间/预估到达时间解题思路.pdf rename to 机器学习竞赛实战_优胜解决方案/ACM SIGSPATIAL 2021 GISCUP/预估到达时间解题思路.pdf diff --git a/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/1_数据预处理_建筑能源利用率预测.ipynb b/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/1_数据预处理_建筑能源利用率预测.ipynb index be93b6c..5295011 100644 --- a/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/1_数据预处理_建筑能源利用率预测.ipynb +++ b/机器学习竞赛实战_优胜解决方案/建筑能源利用率预测/1_数据预处理_建筑能源利用率预测.ipynb @@ -3349,7 +3349,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.8.8" } }, "nbformat": 4,