parent a272e2334c
commit 2b6f48b567
@@ -0,0 +1,431 @@
import pandas as pd
import numpy as np
import gc
import tensorflow as tf
import process
import dcn_model
import sys
import random
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)
tf.random.set_seed(42)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'

RANDOM_SEED = 42

# types of columns of the data_set DataFrame
CATEGORICAL_COLS = [
    'weather_le', 'hightemp', 'lowtemp', 'dayofweek',
    'slice_id', 'link_current_status_4'
]

NUMERIC_COLS = [
    'distance', 'simple_eta', 'link_time_sum', 'link_count',
    'cr_t_sum', 'link_current_status_4_percent', 'link_current_status_mean',
    'pr_mean', 'dc_mean', 'lk_arrival_0_percent', 'lk_arrival_1_percent',
    'lk_arrival_2_percent', 'lk_arrival_3_percent', 'lk_arrival_4_percent'
]

WIDE_COLS = [
    'weather_le', 'hightemp', 'lowtemp', 'dayofweek'
]

IGNORE_COLS = [
    'order_id', 'ata'
]

TRAINING = True
VAL_TO_TEST = False
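
# NOTE: the grouping above follows the wide & deep pattern: WIDE_COLS feed a
# linear "wide" branch, NUMERIC_COLS (plus engineered sequence aggregates)
# feed the "deep" MLP branch, and CATEGORICAL_COLS are label-encoded and
# embedded inside dcn_model. IGNORE_COLS are bookkeeping only: 'order_id'
# identifies a trip and 'ata' (actual time of arrival) is the regression
# target.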


def set_seed(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
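
# NOTE: set_seed() pins Python, hash and NumPy randomness, and
# tf.random.set_seed(42) above pins TensorFlow's graph-level seed. Some GPU
# kernels remain nondeterministic regardless, so bit-identical reruns are
# not guaranteed without additional deterministic-ops settings.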


if __name__ == '__main__':
    set_seed(RANDOM_SEED)
    print(dcn_model.get_available_gpus())  # returns a list such as ['/device:GPU:0', '/device:GPU:1']

    # LOAD DATA
    print('*-' * 40, 'LOAD DATA')
    making_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_order_xt/'
    link_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_170_link_sqe_for_order/'
    cross_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/for_0714_cross_sqe_for_order/'
    link_data_other_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/for_0714_link_sqe_for_order_other/'
    head_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_head_link_data_clear/'
    win_order_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/win_order_xw/'
    # pre_arrival_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/final_pre_arrival_data/'
    arrival_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_link_sqe_for_order_arrival/'
    zsl_arrival_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/zsl_arrival/'
    arrival_sqe_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_170_lk_arrival_sqe_for_order/'
    # h_s_for_link_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_hightmp_slice_for_link_eb/'
    pre_arrival_sqe_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/sqe_arrival_for_link/'
    zsl_link_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/zsl_train_link/'
    data, mk_cols_list, link_cols_list, cross_cols_list = process.load_data(making_data_dir,
                                                                            link_data_dir,
                                                                            cross_data_dir,
                                                                            link_data_other_dir,
                                                                            head_data_dir,
                                                                            win_order_data_dir,
                                                                            pre_arrival_sqe_dir,
                                                                            zsl_link_data_dir,
                                                                            # pre_arrival_data_dir,
                                                                            # h_s_for_link_dir,
                                                                            arrival_data_dir,
                                                                            zsl_arrival_data_dir,
                                                                            arrival_sqe_data_dir)

    # fd = dcn_model.FeatureDictionary(data, numeric_cols=NUMERIC_COLS, ignore_cols=IGNORE_COLS,
    #                                  cate_cols=CATEGORICAL_COLS)
    # PROCESSING DATA
    data['date_time'] = data['date_time'].astype(int)
    print("type(data['date_time']):", data['date_time'].dtype)
    data = data[data['date_time'] != 20200901]
    print('Here train_test_split..................')
    # all_train_data, _ = train_test_split(all_train_data, test_size=0.9, random_state=42)
    data = data.reset_index()
    del data['index']
    print('*-' * 40, 'The data.shape:', data.shape)
    train_data, val_data = train_test_split(data, test_size=0.15, random_state=RANDOM_SEED)
    train_data = train_data.reset_index()
    val_data = val_data.reset_index()
    del train_data['index']
    del val_data['index']
    print('Save End.................')
    fb_list = CATEGORICAL_COLS + NUMERIC_COLS + IGNORE_COLS
    data_bak = data[fb_list]
    del data
    data = data_bak.copy()
    del data_bak
    gc.collect()
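
    # NOTE: only the fb_list slice of the merged frame is kept alive from
    # here on; the heavy sequence columns already live inside
    # train_data/val_data, and this trimmed copy of `data` exists solely so
    # FeatureDictionary can be rebuilt from the categorical/numeric columns
    # further down.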

    print('*-' * 40, 'PROCESSING DATA FOR TRAIN')
    train_data = process.processing_data(train_data, link_cols_list, cross_cols_list, mk_cols_list, WIDE_COLS)
    # del data
    # fb_list = CATEGORICAL_COLS+NUMERIC_COLS+IGNORE_COLS
    # data = data[fb_list]
    # gc.collect()
    # print(train_data.columns.tolist())

    # PROCESSING INPUTS
    print('*-' * 40, 'PROCESSING INPUTS')
    # SAVE LIST
    a = np.array(mk_cols_list)
    np.save('../model_h5/mk_cols_list_0720_2.npy', a)
    a = np.array(link_cols_list)
    np.save('../model_h5/link_cols_list_0720_2.npy', a)
    a = np.array(cross_cols_list)
    np.save('../model_h5/cross_cols_list_0720_2.npy', a)
    a = np.array(CATEGORICAL_COLS)
    np.save('../model_h5/CATEGORICAL_COLS_0720_2.npy', a)
    del a
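
    # NOTE: the column lists are persisted as .npy files so that
    # (presumably) the inference-side scripts can restore the exact feature
    # order the model was trained with; a mismatch in ordering would
    # silently corrupt the inputs.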
    pred_cols = ['ata']
    print('*-' * 40, 'PROCESSING INPUTS FOR TRAIN_DATA', train_data.shape)
    train_link_inputs, train_cross_inputs, train_deep_input, train_wide_input, \
        train_inputs_slice, train_labels, train_arrival = process.processing_inputs(
            train_data, mk_cols_list, link_cols_list, cross_cols_list, WIDE_COLS)
    X_train = dcn_model.preprocess(train_data, CATEGORICAL_COLS, NUMERIC_COLS)
    train_pre = train_data[['order_id']]
    del train_data
    gc.collect()

    print('*-' * 40, 'PROCESSING DATA FOR VAL')
    val_data = process.processing_data(val_data, link_cols_list, cross_cols_list, mk_cols_list, WIDE_COLS, is_test=True)
    print('*-' * 40, 'PROCESSING INPUTS FOR VAL_DATA', val_data.shape)
    val_link_inputs, val_cross_inputs, val_deep_input, val_wide_input, \
        val_inputs_slice, val_labels, val_arrival = process.processing_inputs(
            val_data, mk_cols_list, link_cols_list, cross_cols_list, WIDE_COLS)
    X_val = dcn_model.preprocess(val_data, CATEGORICAL_COLS, NUMERIC_COLS)
    # val_data.to_csv('../model_h5/val_data.csv', index=0)  # saving csv for test running
    val_pre = val_data[['order_id']]
    del val_data
    gc.collect()

    # MODEL_INIT
    print('*-' * 40, 'T_MODEL_INIT')
    deep_col_len, wide_col_len = train_deep_input.values.shape[1], train_wide_input.shape[1]
    link_size = 639877 + 2
    cross_size = 44313 + 2
    link_nf_size, cross_nf_size = train_link_inputs.shape[2], train_cross_inputs.shape[2]
    slice_size = 288
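
    # NOTE: link_size and cross_size are the label-encoded vocabulary sizes
    # of link_id and cross_id, with 2 extra slots presumably reserved for
    # padding and unseen ids; slice_size = 288 matches one day split into
    # 5-minute departure slices (24 * 12). These constants must agree with
    # the fitted encoders or the embeddings will be mis-indexed.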
    # link_seqlen, cross_seqlen = 170, 12  # already the model defaults
    print("link_size:{},link_nf_size:{},cross_size:{},cross_nf_size:{},slice_size:{}".format(link_size, link_nf_size,
                                                                                             cross_size, cross_nf_size,
                                                                                             slice_size))
    print("deep_col_len:{}, wide_col_len:{}".format(deep_col_len, wide_col_len))

    fd = dcn_model.FeatureDictionary(data, numeric_cols=NUMERIC_COLS, ignore_cols=IGNORE_COLS,
                                     cate_cols=CATEGORICAL_COLS)
    inp_layer, inp_embed = dcn_model.embedding_layers(fd)
    autoencoder, encoder = dcn_model.create_autoencoder(train_deep_input.values.shape[-1], 1, noise=0.1)
    if TRAINING:
        autoencoder.fit(train_deep_input.values, (train_deep_input.values, train_labels.values),
                        epochs=1000,  # 1000
                        batch_size=2048,  # 1024
                        validation_split=0.1,
                        callbacks=[tf.keras.callbacks.EarlyStopping('val_ata_output_loss', patience=10, restore_best_weights=True)])
        encoder.save_weights('../model_h5/t_encoder.hdf5')
    else:
        encoder.load_weights('../model_h5/t_encoder.hdf5')
    encoder.trainable = False
    del autoencoder
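
    # NOTE: create_autoencoder presumably builds a denoising autoencoder
    # over the deep numeric features with two heads, reconstructing the
    # noised input and predicting 'ata' (hence the 'val_ata_output_loss'
    # monitor). Only the frozen encoder survives and is passed into
    # DCN_model below as a learned feature extractor, a trick popular in
    # tabular deep-learning competitions.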

    t_model = dcn_model.DCN_model(inp_layer, inp_embed, link_size, cross_size, slice_size, deep_col_len, wide_col_len,
                                  link_nf_size, cross_nf_size, encoder, conv=True, have_knowledge=False)
    # del encoder
    gc.collect()

    mc, es, lr = dcn_model.get_mc_es_lr('0720_2', patience=5, min_delta=1e-4)
    print('*-' * 40, 'MODEL_INIT END')

    print('*-' * 40, 'ARRIVAL_MODEL_FIT')
    t_history = t_model.fit(
        [
            X_train['weather_le'], X_train['hightemp'], X_train['lowtemp'], X_train['dayofweek'],
            X_train['slice_id'], X_train['link_current_status_4'],
            X_train['distance'], X_train['simple_eta'], X_train['link_time_sum'], X_train['link_count'],
            X_train['cr_t_sum'], X_train['link_current_status_4_percent'], X_train['link_current_status_mean'],
            X_train['pr_mean'], X_train['dc_mean'],
            X_train['lk_arrival_0_percent'], X_train['lk_arrival_1_percent'], X_train['lk_arrival_2_percent'],
            X_train['lk_arrival_3_percent'], X_train['lk_arrival_4_percent'],
            train_link_inputs, train_cross_inputs, train_deep_input.values, train_wide_input, train_inputs_slice],
        train_labels.values,
        validation_data=(
            [
                X_val['weather_le'], X_val['hightemp'], X_val['lowtemp'], X_val['dayofweek'],
                X_val['slice_id'], X_val['link_current_status_4'],
                X_val['distance'], X_val['simple_eta'], X_val['link_time_sum'], X_val['link_count'],
                X_val['cr_t_sum'], X_val['link_current_status_4_percent'], X_val['link_current_status_mean'],
                X_val['pr_mean'], X_val['dc_mean'],
                X_val['lk_arrival_0_percent'], X_val['lk_arrival_1_percent'], X_val['lk_arrival_2_percent'],
                X_val['lk_arrival_3_percent'], X_val['lk_arrival_4_percent'],
                val_link_inputs, val_cross_inputs, val_deep_input.values, val_wide_input, val_inputs_slice],
            val_labels.values),
        batch_size=2048,  # 2048, 1024
        epochs=100,  # 100
        verbose=1,
        callbacks=[es])  # lr
    np.save('../model_h5/t_model_0720_2.npy', t_history.history)
    t_model.save_weights("../model_h5/t_model_0720_2.h5")
    print('*-' * 40, 't_MODEL_PREDICT')
    y_knowledge_train = t_model.predict(
        [X_train['weather_le'], X_train['hightemp'], X_train['lowtemp'], X_train['dayofweek'],
         X_train['slice_id'], X_train['link_current_status_4'],
         X_train['distance'], X_train['simple_eta'], X_train['link_time_sum'], X_train['link_count'],
         X_train['cr_t_sum'], X_train['link_current_status_4_percent'], X_train['link_current_status_mean'],
         X_train['pr_mean'], X_train['dc_mean'],
         X_train['lk_arrival_0_percent'], X_train['lk_arrival_1_percent'], X_train['lk_arrival_2_percent'],
         X_train['lk_arrival_3_percent'], X_train['lk_arrival_4_percent'],
         train_link_inputs, train_cross_inputs, train_deep_input.values, train_wide_input, train_inputs_slice],
        batch_size=2048)
    y_knowledge_val = t_model.predict(
        [
            X_val['weather_le'], X_val['hightemp'], X_val['lowtemp'], X_val['dayofweek'],
            X_val['slice_id'], X_val['link_current_status_4'],
            X_val['distance'], X_val['simple_eta'], X_val['link_time_sum'], X_val['link_count'],
            X_val['cr_t_sum'], X_val['link_current_status_4_percent'], X_val['link_current_status_mean'],
            X_val['pr_mean'], X_val['dc_mean'],
            X_val['lk_arrival_0_percent'], X_val['lk_arrival_1_percent'], X_val['lk_arrival_2_percent'],
            X_val['lk_arrival_3_percent'], X_val['lk_arrival_4_percent'],
            val_link_inputs, val_cross_inputs, val_deep_input.values, val_wide_input, val_inputs_slice],
        batch_size=2048)
    print('*-' * 40, 'TRANSFORM')
    train_labels = pd.DataFrame(train_labels)
    train_labels['y_knowledge_train'] = np.squeeze(y_knowledge_train)
    print(np.squeeze(y_knowledge_train)[:2])
    print(train_labels['y_knowledge_train'].head(2))
    val_labels = pd.DataFrame(val_labels)
    val_labels['y_knowledge_val'] = np.squeeze(y_knowledge_val)
    print('*-' * 40, 't_MODEL_END')
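
    # NOTE: this appears to be a teacher/student (privileged information)
    # setup. The teacher t_model above sees the lk_arrival_* and zsl_*
    # arrival features, which exist only for historical trips; its
    # predictions are appended to the label frames as y_knowledge_*. The
    # student below drops those features and is trained on both the true
    # 'ata' and the teacher's soft prediction, so it can be served on test
    # orders where the privileged features are unavailable.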
    zsl_arrival_cols = ['zsl_link_arrival_status_mean', 'zsl_link_arrival_status_nunique', 'zsl_link_arrival_status0',
                        'zsl_link_arrival_status1', 'zsl_link_arrival_status2', 'zsl_link_arrival_status3']
    train_deep_input = train_deep_input.drop(['lk_arrival_0_percent', 'lk_arrival_1_percent', 'lk_arrival_2_percent',
                                              'lk_arrival_3_percent', 'lk_arrival_4_percent'], axis=1)
    train_deep_input = train_deep_input.drop(zsl_arrival_cols, axis=1)

    val_deep_input = val_deep_input.drop(['lk_arrival_0_percent', 'lk_arrival_1_percent', 'lk_arrival_2_percent',
                                          'lk_arrival_3_percent', 'lk_arrival_4_percent'], axis=1)
    val_deep_input = val_deep_input.drop(zsl_arrival_cols, axis=1)

    if 'ata' in train_deep_input.columns.tolist():
        print('The ata in the train_deep_input')
        print('*-' * 40, 'EXIT')
        sys.exit(0)
    if 'lk_arrival_0_percent' in train_deep_input.columns.tolist():
        print('The lk_arrival_0_percent in the train_deep_input')
        print('*-' * 40, 'EXIT')
        sys.exit(0)
    if 'lk_arrival_0_percent' in val_deep_input.columns.tolist():
        print('The lk_arrival_0_percent in the val_deep_input')
        print('*-' * 40, 'EXIT')
        sys.exit(0)
    if 'zsl_link_arrival_status_mean' in train_deep_input.columns.tolist():
        print('The zsl_link_arrival_status_mean in the train_deep_input')
        print('*-' * 40, 'EXIT')
        sys.exit(0)

    mk_cols_list = train_deep_input.columns.tolist()
    print('*-' * 40, 'MODEL_FIT')
    deep_col_len, wide_col_len = train_deep_input.values.shape[1], train_wide_input.shape[1]
    print("deep_col_len:{}, wide_col_len:{}".format(deep_col_len, wide_col_len))
    NUMERIC_COLS = list(set(NUMERIC_COLS) - set(['lk_arrival_0_percent', 'lk_arrival_1_percent', 'lk_arrival_2_percent',
                                                 'lk_arrival_3_percent', 'lk_arrival_4_percent']))
    fb_list = CATEGORICAL_COLS + NUMERIC_COLS + IGNORE_COLS
    if 'lk_arrival_0_percent' in fb_list:
        print('The lk_arrival_0_percent in the fb_list')
        print('*-' * 40, 'EXIT')
        sys.exit(0)
    data = data[fb_list]
    fd = dcn_model.FeatureDictionary(data, numeric_cols=NUMERIC_COLS, ignore_cols=IGNORE_COLS,
                                     cate_cols=CATEGORICAL_COLS)
    inp_layer, inp_embed = dcn_model.embedding_layers(fd)
    autoencoder, encoder = dcn_model.create_autoencoder(train_deep_input.values.shape[-1], 1, noise=0.1)
    if TRAINING:
        autoencoder.fit(train_deep_input.values, (train_deep_input.values, train_labels['ata'].values),
                        epochs=1000,  # 1000
                        batch_size=2048,  # 1024
                        validation_split=0.1,
                        callbacks=[tf.keras.callbacks.EarlyStopping('val_ata_output_loss', patience=10, restore_best_weights=True)])
        encoder.save_weights('../model_h5/main_encoder.hdf5')
    else:
        encoder.load_weights('../model_h5/main_encoder.hdf5')
    encoder.trainable = False
    del autoencoder

    # print(type(train_labels['y_knowledge_train']))
    # print(type(train_labels))
    # y_train = np.vstack((train_labels, train_pre['y_knowledge_train'])).T
    # y_valid = np.vstack((val_labels, val_pre['y_knowledge_val'])).T
    # print(train_labels.shape)
    print(train_labels.head(1))
    print(train_labels.values[0])

    print('*-' * 40, 'The shape of train_link_inputs before', train_link_inputs.shape)
    # drop link feature channel 5 (presumably the privileged link_arrival_status channel)
    train_link_inputs = np.concatenate((train_link_inputs[:, :, :5], train_link_inputs[:, :, 6:]), axis=2)

    print('*-' * 40, 'The shape of train_link_inputs after', train_link_inputs.shape)
    val_link_inputs = np.concatenate((val_link_inputs[:, :, :5], val_link_inputs[:, :, 6:]), axis=2)
    link_nf_size, cross_nf_size = train_link_inputs.shape[2], train_cross_inputs.shape[2]
    mc, es, lr = dcn_model.get_mc_es_lr_for_student('0720_2', patience=5, min_delta=1e-4)
    model = dcn_model.DCN_model(inp_layer, inp_embed, link_size, cross_size, slice_size, deep_col_len, wide_col_len,
                                link_nf_size, cross_nf_size, encoder, conv=True)
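
    # NOTE: from this point train_labels.values carries two columns,
    # ['ata', 'y_knowledge_train'], so the student fits against the true
    # target and the teacher's prediction together; accordingly the model's
    # output is two-dimensional, and later code reads column 1 as the
    # submitted ETA ('result') and column 0 as the auxiliary prediction
    # ('other_predict').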
    history = model.fit(
        [
            X_train['weather_le'], X_train['hightemp'], X_train['lowtemp'], X_train['dayofweek'],
            X_train['slice_id'], X_train['link_current_status_4'],
            X_train['distance'], X_train['simple_eta'], X_train['link_time_sum'], X_train['link_count'],
            X_train['cr_t_sum'], X_train['link_current_status_4_percent'], X_train['link_current_status_mean'],
            X_train['pr_mean'], X_train['dc_mean'],
            train_link_inputs, train_cross_inputs, train_deep_input.values, train_wide_input, train_inputs_slice],
        train_labels.values,
        validation_data=(
            [
                X_val['weather_le'], X_val['hightemp'], X_val['lowtemp'], X_val['dayofweek'],
                X_val['slice_id'], X_val['link_current_status_4'],
                X_val['distance'], X_val['simple_eta'], X_val['link_time_sum'], X_val['link_count'],
                X_val['cr_t_sum'], X_val['link_current_status_4_percent'], X_val['link_current_status_mean'],
                X_val['pr_mean'], X_val['dc_mean'],
                val_link_inputs, val_cross_inputs, val_deep_input.values, val_wide_input, val_inputs_slice],
            val_labels.values),
        batch_size=2048,  # 2048, 1024
        epochs=100,  # 100
        verbose=1,
        callbacks=[es])  # lr
    np.save('../model_h5/history_0720_2.npy', history.history)
    model.save_weights("../model_h5/dcn_model_0720_2.h5")
    # MODEL_PREDICT
    if VAL_TO_TEST:
        print('*-' * 40, 'val_to_test')
        val_pre = val_pre.rename(columns={'order_id': 'id'})
        print(val_link_inputs.shape, val_cross_inputs.shape, X_val.shape)
        print('*-' * 40, 'MODEL_PREDICT')
        val_pred = model.predict(
            [
                X_val['weather_le'], X_val['hightemp'], X_val['lowtemp'], X_val['dayofweek'],
                X_val['slice_id'], X_val['link_current_status_4'],
                X_val['distance'], X_val['simple_eta'], X_val['link_time_sum'], X_val['link_count'],
                X_val['cr_t_sum'], X_val['link_current_status_4_percent'], X_val['link_current_status_mean'],
                X_val['pr_mean'], X_val['dc_mean'],
                val_link_inputs, val_cross_inputs, val_deep_input.values, val_wide_input, val_inputs_slice],
            batch_size=2048)
        val_pre['val_predict'] = np.squeeze(val_pred[:, 1])
        val_pre['other_predict'] = np.squeeze(val_pred[:, 0])
        # val_pre['val_predict'] = val_pre['val_predict'].round(0)
        val_pre = val_pre.rename(columns={'val_predict': 'result'})  # rename the column
        val_pre = val_pre[['id', 'result', 'other_predict']]
        val_pre['ata'] = val_labels['ata'].values
        print(val_pre.head())
        result_save_path = '../result_csv/val_0720_2.csv'
        print('*-' * 40, 'CSV_SAVE_PATH:', result_save_path)
        print('..........Finish')

    del X_train, train_link_inputs, train_cross_inputs, train_deep_input, \
        train_wide_input, train_inputs_slice, train_labels
    del X_val, val_link_inputs, val_cross_inputs, val_deep_input, val_wide_input, val_inputs_slice, val_labels
    gc.collect()
    # print('*-' * 40, 'EXIT')
    # sys.exit(0)
    print('*-' * 40, 'LOAD TEST DATA')
    making_test_data_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/order_xt/'
    link_test_data_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/max_170_link_sqe_for_order/'
    cross_test_data_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/cross_sqe_for_order/'
    link_test_data_other_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/link_sqe_for_order_other/'
    head_test_data_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/head_link_data_clear/'
    win_order_test_data_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/win_order_xw/'
    pre_arrival_sqe_test_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/sqe_arrival_for_link/'
    # h_s_for_test_link_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/max_hightmp_slice_for_link_eb/'
    # pre_arrival_test_data_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/final_pre_arrival_data/'
    zsl_link_test_data_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/zsl_test_link/'
    # zsl_cross_test_data_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/zsl_test_cross_0703/'
    test_data, _, _, _ = process.load_data(making_test_data_dir,
                                           link_test_data_dir,
                                           cross_test_data_dir,
                                           link_test_data_other_dir,
                                           head_test_data_dir,
                                           win_order_test_data_dir,
                                           pre_arrival_sqe_test_dir,
                                           zsl_link_test_data_dir)  # ,
                                           # h_s_for_test_link_dir,
                                           # pre_arrival_test_data_dir
    print('*-' * 40, 'PROCESSING DATA')
    link_cols_list.remove('link_arrival_status')
    test_data = process.processing_data(test_data, link_cols_list, cross_cols_list, mk_cols_list, WIDE_COLS, is_test=True)
    gc.collect()
    print('*-' * 40, 'PROCESSING INPUTS FOR TEST_DATA', test_data.shape)
    test_link_inputs, test_cross_inputs, test_deep_input, test_wide_input, \
        test_inputs_slice, _ = process.processing_inputs(
            test_data, mk_cols_list, link_cols_list, cross_cols_list, WIDE_COLS, arrival=False)
    X_test = dcn_model.preprocess(test_data, CATEGORICAL_COLS, NUMERIC_COLS)
    test_pre = test_data[['order_id']]
    test_arrival_pre = test_data[['order_id']]
    gc.collect()

    test_pre = test_pre.rename(columns={'order_id': 'id'})
    print(test_link_inputs.shape, test_cross_inputs.shape, X_test.shape, test_deep_input.shape)
    print('*-' * 40, 'MODEL_PREDICT')
    test_pred = model.predict(
        [
            X_test['weather_le'], X_test['hightemp'], X_test['lowtemp'], X_test['dayofweek'],
            X_test['slice_id'], X_test['link_current_status_4'],
            X_test['distance'], X_test['simple_eta'], X_test['link_time_sum'], X_test['link_count'],
            X_test['cr_t_sum'], X_test['link_current_status_4_percent'], X_test['link_current_status_mean'],
            X_test['pr_mean'], X_test['dc_mean'],
            test_link_inputs, test_cross_inputs, test_deep_input.values, test_wide_input, test_inputs_slice],
        batch_size=2048)
    test_pre['test_predict'] = np.squeeze(test_pred[:, 1])
    test_pre['other_predict'] = np.squeeze(test_pred[:, 0])
    # test_pre['test_predict'] = test_pre['test_predict'].round(0)
    test_pre = test_pre.rename(columns={'test_predict': 'result'})  # rename the column
    test_pre = test_pre[['id', 'result', 'other_predict']]
    print(test_pre.head())
    result_save_path = '../result_csv/submit_0720_2.csv'
    print('*-' * 40, 'CSV_SAVE_PATH:', result_save_path)
    test_pre.to_csv(result_save_path, index=0)  # save

    print('..........Finish')
@@ -0,0 +1,3 @@
import joblib

cross_le = joblib.load('/data/didi_2021/model_h5/crossid_le')
print(len(cross_le.classes_.tolist()))
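
# NOTE: a quick sanity check of the fitted cross_id LabelEncoder vocabulary;
# its class count is presumably the source of the hard-coded
# cross_size = 44313 + 2 used by the training scripts.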
@@ -0,0 +1,154 @@
import pandas as pd
import numpy as np
import gc
import process
import wd_model
import time


RANDOM_SEED = 42

# types of columns of the data_set DataFrame
WIDE_COLS = [
    'weather_le', 'hightemp', 'lowtemp', 'dayofweek'
]

if __name__ == '__main__':
    t1 = time.time()
    print(wd_model.get_available_gpus())  # returns a list such as ['/device:GPU:0', '/device:GPU:1']

    # LOAD DATA
    print('*-' * 40, 'LOAD DATA')
    making_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_order_xt/'
    link_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_170_link_sqe_for_order/'
    cross_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/for_0714_cross_sqe_for_order/'
    head_link_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_head_link_data_clear/'
    win_order_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/win_order_xw/'
    pre_arrival_sqe_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/sqe_arrival_for_link/'
    data_for_driver_xw = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/data_for_driver_xw/'
    downstream_status_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/downstream_status_for_order/'
    data, mk_cols_list, link_cols_list, cross_cols_list = process.load_data(making_data_dir,
                                                                            link_data_dir,
                                                                            cross_data_dir,
                                                                            head_link_dir,
                                                                            win_order_data_dir,
                                                                            pre_arrival_sqe_dir,
                                                                            data_for_driver_xw,
                                                                            downstream_status_dir)

    # PROCESSING DATA
    print('*-' * 40, 'PROCESSING DATA')
    train_data, val_data = process.processing_data(data, mk_cols_list, link_cols_list, cross_cols_list,
                                                   WIDE_COLS)
    del data
    gc.collect()
    # print(train_data.columns.tolist())

    # PROCESSING INPUTS
    print('*-' * 40, 'PROCESSING INPUTS')
    # SAVE LIST
    a = np.array(mk_cols_list)
    np.save('../model_h5/wd_mk_cols_list_0730_5.npy', a)
    a = np.array(link_cols_list)
    np.save('../model_h5/wd_link_cols_list_0730_5.npy', a)
    a = np.array(cross_cols_list)
    np.save('../model_h5/wd_cross_cols_list_0730_5.npy', a)
    pred_cols = ['ata']
    print('*-' * 40, 'PROCESSING INPUTS FOR TRAIN_DATA', train_data.shape)
    train_link_inputs, train_cross_inputs, train_deep_input, train_wide_input, \
        train_inputs_slice, train_labels = process.processing_inputs(
            train_data, mk_cols_list, link_cols_list, cross_cols_list, WIDE_COLS)
    del train_data
    gc.collect()

    print('*-' * 40, 'PROCESSING INPUTS FOR VAL_DATA', val_data.shape)
    val_link_inputs, val_cross_inputs, val_deep_input, val_wide_input, \
        val_inputs_slice, val_labels = process.processing_inputs(
            val_data, mk_cols_list, link_cols_list, cross_cols_list, WIDE_COLS)
    del val_data
    gc.collect()

    # MODEL_INIT
    print('*-' * 40, 'MODEL_INIT')
    deep_col_len, wide_col_len = train_deep_input.shape[1], train_wide_input.shape[1]
    link_nf_size, cross_nf_size = train_link_inputs.shape[2], train_cross_inputs.shape[2]
    link_size = 639877 + 2
    cross_size = 44313 + 2
    slice_size = 288
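
    # NOTE: unlike the DCN script, this baseline has no teacher/student
    # stage: it trains a single wide & deep model end-to-end on the same
    # order, link and cross features and writes its own submission file.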
    # link_seqlen, cross_seqlen = 170, 12  # already the model defaults
    print("link_size:{},link_nf_size:{},cross_size:{},cross_nf_size:{},slice_size:{}".format(link_size, link_nf_size,
                                                                                             cross_size, cross_nf_size,
                                                                                             slice_size))
    print("deep_col_len:{}, wide_col_len:{}".format(deep_col_len, wide_col_len))

    model = wd_model.wd_model(link_size, cross_size, slice_size, deep_col_len, wide_col_len,
                              link_nf_size, cross_nf_size, conv='conv')

    mc, es, lr = wd_model.get_mc_es_lr('0730_5', patience=4, min_delta=1e-4)
    print('*-' * 40, 'MODEL_INIT END')
    # MODEL_FIT
    print('*-' * 40, 'MODEL_FIT_PREDICT')
    history = model.fit(
        [train_link_inputs, train_cross_inputs, train_deep_input, train_wide_input, train_inputs_slice], train_labels,
        validation_data=(
            [val_link_inputs, val_cross_inputs, val_deep_input, val_wide_input, val_inputs_slice], val_labels),
        batch_size=2048,  # 2048, 256
        epochs=100,
        verbose=1,
        callbacks=[es])
    np.save('../model_h5/history_0730_5.npy', history.history)
    model.save_weights("../model_h5/wd_model_0730_5.h5")

    del train_link_inputs, train_cross_inputs, train_deep_input, \
        train_wide_input, train_inputs_slice, train_labels
    del val_link_inputs, val_cross_inputs, val_deep_input, val_wide_input, val_inputs_slice, val_labels
    gc.collect()

    print('*-' * 40, 'LOAD TEST DATA')
    making_data_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/order_xt/'
    link_data_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/max_170_link_sqe_for_order/'
    cross_data_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/cross_sqe_for_order/'
    head_link_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/head_link_data_clear/'
    win_order_test_data_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/win_order_xw/'
    pre_arrival_sqe_test_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/sqe_arrival_for_link/'
    data_test_for_driver_xw = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/data_for_driver_xw/'
    downstream_status_test_dir = '/home/didi2021/didi2021/giscup_2021/final_test_data_0703/downstream_status_for_order/'
    test_data, _, _, _ = process.load_data(making_data_dir,
                                           link_data_dir,
                                           cross_data_dir,
                                           head_link_dir,
                                           win_order_test_data_dir,
                                           pre_arrival_sqe_test_dir,
                                           data_test_for_driver_xw,
                                           downstream_status_test_dir)

    # PROCESSING DATA
    print('*-' * 40, 'PROCESSING DATA')
    test_data = process.processing_data(test_data, mk_cols_list, link_cols_list, cross_cols_list,
                                        WIDE_COLS, is_test=True)
    print('*-' * 40, 'PROCESSING INPUTS FOR TEST_DATA', test_data.shape)
    test_link_inputs, test_cross_inputs, test_deep_input, test_wide_input, \
        test_inputs_slice, test_labels = process.processing_inputs(
            test_data, mk_cols_list, link_cols_list, cross_cols_list, WIDE_COLS)
    test_pre = test_data[['order_id']]
    del test_data
    gc.collect()

    # MODEL_PREDICT
    print('*-' * 40, 'MODEL_PREDICT')
    test_pre = test_pre.rename(columns={'order_id': 'id'})
    test_pred = model.predict(
        [test_link_inputs, test_cross_inputs, test_deep_input, test_wide_input, test_inputs_slice],
        batch_size=2048)
    test_pre['test_predict'] = test_pred
    # test_pre['test_predict'] = test_pre['test_predict'].round(0)
    test_pre = test_pre.rename(columns={'test_predict': 'result'})  # rename the column
    test_pre = test_pre[['id', 'result']]
    print(test_pre.head())
    result_save_path = '../result_csv/submit_w_0730_5.csv'
    print('*-' * 40, 'CSV_SAVE_PATH:', result_save_path)
    test_pre.to_csv(result_save_path, index=0)  # save
    print('..........Finish')

    t2 = time.time()
    print("Total time spent: {:.4f} hours".format((t2 - t1) / 3600))
|
@@ -0,0 +1,312 @@
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm import tqdm
from pandarallel import pandarallel
from sklearn.model_selection import train_test_split
# import random
import gc
import ast
import os
import warnings
import joblib


warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None
pandarallel.initialize()


def pandas_list_to_array(df):
    """
    Input: DataFrame of shape (x, y), where every cell holds a list of length l
    Return: np.array of shape (x, l, y)
    """
    return np.transpose(
        np.array(df.values.tolist()),
        (0, 2, 1)
    )
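
# NOTE: a worked example of the transform above. With two rows and two
# sequence columns whose cells are length-3 lists,
#
#     df = pd.DataFrame({'link_id':   [[1, 2, 3], [4, 5, 6]],
#                        'link_time': [[7, 8, 9], [1, 2, 3]]})
#
# df.values.tolist() is a (2, 2, 3) nested list (rows, columns, items), and
# the (0, 2, 1) transpose yields shape (2, 3, 2): one (seq_len, n_features)
# matrix per order, which is what the sequence branches of the models expect.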


def preprocess_inputs(df, cols: list):
    return pandas_list_to_array(
        df[cols]
    )


def append_all_data(files_list, file_head_path):
    """
    Concatenate all the per-day files into one DataFrame.
    :param files_list: the file names of the data
    :param file_head_path: the directory of the data
    :return: DataFrame with all rows appended
    """
    data_all_path = file_head_path + files_list[0]
    data_all = pd.read_csv(data_all_path)
    data_all = data_all.head(0)
    try:
        del data_all['Unnamed: 0']
    except KeyError:
        pass
    # loop over and append every file
    for i in files_list:
        data_path = file_head_path + i
        print("Current file:", data_path)
        data = pd.read_csv(data_path)
        try:
            del data['Unnamed: 0']
        except KeyError:
            pass
        # NOTE: DataFrame.append was current on the pandas this targeted;
        # newer pandas would need pd.concat instead.
        data_all = data_all.append(data)
    return data_all


def file_name(file_dir):
    files_list = []
    for root, dirs, files in os.walk(file_dir):
        # print("success")
        for name in files:
            files_list.append(name)
    return files_list


def load_data(making_data_dir, link_data_dir, cross_data_dir, head_link_dir,
              win_order_data_dir, pre_arrival_sqe_dir, data_for_driver_xw, downstream_status_dir):
    """
    Load the order-, link- and cross-level sources, then merge them.
    :return: all data at order level
    """
    print('-------------LOAD DATA for mk_data----------------')
    mk_list = file_name(making_data_dir)
    mk_list.sort()
    mk_data = append_all_data(mk_list, making_data_dir)
    # mk_data = pd.read_csv('/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_order_xt/join_20200825.csv')  # for test running
    mk_data['date_time'] = mk_data['date_time'].astype(str)
    mk_data['dayofweek'] = pd.to_datetime(mk_data['date_time'])
    mk_data['dayofweek'] = mk_data['dayofweek'].dt.dayofweek + 1

    weather_le = LabelEncoder()
    mk_data['weather_le'] = weather_le.fit_transform(mk_data['weather'])
    mk_data['driver_id'] = mk_data['driver_id'].astype(str)

    """
    print('-------------LOAD DATA for driver_data----------------')
    driver_list = file_name(data_for_driver_xw)
    driver_list.sort()
    driver_data = append_all_data(driver_list, data_for_driver_xw)
    # driver_data = pd.read_csv('/home/didi2021/didi2021/giscup_2021/final_train_data_0703/data_for_driver_xw/driver_20200825_head.txt')
    driver_data = driver_data[['driver_id', 'date_time', 'entropy', 'hour_mean', 'workday_order', 'weekend_order']]
    driver_data['date_time'] = driver_data['date_time'].astype(str)
    driver_data['driver_id'] = driver_data['driver_id'].astype(str)
    mk_data = mk_data.merge(driver_data, on=['driver_id', 'date_time'], how='left')
    del driver_data
    """

    """
    print('-------------LOAD DATA for downstream_status_for_order----------------')
    ds_data_list = file_name(downstream_status_dir)
    ds_data_list.sort()
    ds_link_data = append_all_data(ds_data_list, downstream_status_dir)
    # ds_link_data = pd.read_csv('/home/didi2021/didi2021/giscup_2021/final_train_data_0703/downstream_status_for_order/ds_for_order_20200825.csv')
    mk_data = mk_data.merge(ds_link_data, on=['order_id'], how='left')
    del ds_link_data
    """

    """
    print('-------------LOAD DATA for rate_status_for_order----------------')
    # rate_data_list = file_name(rate_status_for_order)
    # rate_data_list.sort()
    # rate_data = append_all_data(rate_data_list, rate_status_for_order)
    rate_data = pd.read_csv('/home/didi2021/didi2021/giscup_2021/final_train_data_0703/rate_status_for_order/rate_for_order_20200825.csv')
    mk_data = mk_data.merge(rate_data, on=['order_id'], how='left')
    del rate_data
    """

    print('Remove the wk2_ and m1_ and ratio columns')
    del_cols = []
    mk_cols = mk_data.columns.tolist()
    for i in range(len(mk_cols)):
        if 'wk2_' in mk_cols[i]:
            del_cols.append(mk_cols[i])
        if 'm1_' in mk_cols[i]:
            del_cols.append(mk_cols[i])
        if 'ratio' in mk_cols[i]:
            del_cols.append(mk_cols[i])
    del_cols = del_cols + ['date_time_mean', 'weather', 'driver_id', 'date_time_dt', 'link_time_sum', 'date_time_sum']
    print('*-' * 40, 'Will drop the list:', del_cols)
    mk_data.drop(columns=del_cols, axis=1, inplace=True)
    print('The init shape of mk_data:', mk_data.shape)

    print('-------------LOAD WIN DATA----------------')
    win_order_list = file_name(win_order_data_dir)
    win_order_list.sort()
    win_order_data = append_all_data(win_order_list, win_order_data_dir)
    # win_order_data = pd.read_csv('/home/didi2021/didi2021/giscup_2021/final_train_data_0703/win_order_xw/win_for_slice_20200825.csv')  # for test running
    del_win_order_cols = []
    win_order_cols = win_order_data.columns.tolist()
    for i in range(len(win_order_cols)):
        if 'last_wk_lk_current' in win_order_cols[i]:
            del_win_order_cols.append(win_order_cols[i])
        # if 'distance' in win_order_cols[i]:
        #     del_win_order_cols.append(win_order_cols[i])
        # if '1_percent' in win_order_cols[i]:
        #     del_win_order_cols.append(win_order_cols[i])
        # if '0_percent' in win_order_cols[i]:
        #     del_win_order_cols.append(win_order_cols[i])
    del_win_order_cols = del_win_order_cols + ['slice_id', 'date_time']
    win_order_data.drop(columns=del_win_order_cols, axis=1, inplace=True)
    print('win_order_data.shape', win_order_data.shape)
    mk_data = pd.merge(mk_data, win_order_data, how='left', on='order_id')
    print('mk_data.shape', mk_data.shape)
    del win_order_data
    gc.collect()

    print('-------------LOAD HEAD DATA----------------')
    head_list = file_name(head_link_dir)
    head_list.sort()
    head_data = append_all_data(head_list, head_link_dir)
    # head_data = pd.read_csv('/home/didi2021/didi2021/giscup_2021/final_train_data_0703/head_link_data_clear/head_link_20200825.csv')  # for test running
    get_head_cols = ['len_tmp', 'status_0', 'status_1', 'status_2', 'status_3', 'status_4',
                     'rate_0', 'rate_1', 'rate_2', 'rate_3', 'rate_4']
    get_head_cols.insert(0, 'order_id')
    print('head_data.shape:', head_data.shape)
    head_data = head_data[get_head_cols]
    print('mk_data.shape', mk_data.shape)
    mk_data = pd.merge(mk_data, head_data, how='left', on='order_id')
    print('mk_data.shape', mk_data.shape)
    del head_data
    gc.collect()

    print('-------------LOAD DATA for link_data----------------')
    link_list = file_name(link_data_dir)
    link_list.sort()
    link_data = append_all_data(link_list, link_data_dir)
    # link_data = pd.read_csv('/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_170_link_sqe_for_order/sqe_20200825_link.txt')  # for test running
    # del_link_cols = ['link_time_sub', 'link_time_sub_sum', 'link_time_sub_mean', 'link_time_sub_std', 'link_time_sub_skew']
    # link_data.drop(del_link_cols, axis=1, inplace=True)
    print('The init shape of link_data:', link_data.shape)
    gc.collect()

    print('-------------LOAD DATA for arrival_sqe_data----------------')
    arrival_sqe_list = file_name(pre_arrival_sqe_dir)
    arrival_sqe_list.sort()
    arrival_sqe_data = append_all_data(arrival_sqe_list, pre_arrival_sqe_dir)
    # arrival_sqe_data = pd.read_csv('/home/didi2021/didi2021/giscup_2021/final_train_data_0703/sqe_arrival_for_link/20200825.csv')  # for test running
    del arrival_sqe_data['slice_id']
    del arrival_sqe_data['pre_arrival_status']
    del arrival_sqe_data['arrive_slice_id']
    arrival_cols = arrival_sqe_data.columns.tolist()
    new_arrival_cols = ['future_' + i for i in arrival_cols if i != 'order_id']
    new_arrival_cols.insert(0, 'order_id')
    arrival_sqe_data.columns = new_arrival_cols
    print('The init shape of arrival_sqe_data:', arrival_sqe_data.shape)
    link_data = pd.merge(link_data, arrival_sqe_data, how='left', on='order_id')
    del arrival_sqe_data
    gc.collect()
    link_cols_list = ['link_id', 'link_time', 'link_current_status', 'pr', 'dc']

    print('-------------LOAD DATA for cross_data----------------')
    cross_list = file_name(cross_data_dir)
    cross_list.sort()
    cross_data = append_all_data(cross_list, cross_data_dir)
    # cross_data = pd.read_csv('/home/didi2021/didi2021/giscup_2021/final_train_data_0703/for_0714_cross_sqe_for_order/sqe_20200825_cross.txt')  # for test running
    del_cross_cols = ['cr_t_sub_by_min', 'cr_t_sub_by_q50', 'total_crosstime_std']
    cross_data.drop(columns=del_cross_cols, axis=1, inplace=True)
    print('The init shape of cross_data:', cross_data.shape)
    cross_cols_list = ['cross_id', 'cross_time']

    data = pd.merge(mk_data, link_data, how='left', on='order_id')
    del mk_data
    del link_data
    gc.collect()
    data = pd.merge(data, cross_data, how='left', on='order_id')
    del cross_data
    gc.collect()

    # remove the class-type, id and label columns; the rest feed the deep inputs
    mk_cols_list = data.columns.tolist()
    remove_mk_cols = ['order_id', 'slice_id', 'hightemp', 'lowtemp', 'weather_le', 'dayofweek', 'date_time', 'ata']
    mk_cols_list = list(set(mk_cols_list) - set(remove_mk_cols))
    mk_cols_list = list(set(mk_cols_list) - set(link_cols_list))
    mk_cols_list = list(set(mk_cols_list) - set(cross_cols_list))
    print('length of mk_cols_list', len(mk_cols_list))
    print('*-' * 40)
    print('The final shape of data is:', data.shape)

    return data, mk_cols_list, link_cols_list, cross_cols_list


def processing_data(data, mk_cols_list, link_cols_list, cross_cols_list, WIDE_COLS, is_test=False):
    """
    Fix up the merged frame: ast.literal_eval the sequence columns,
    StandardScaler the dense columns, then train_test_split.
    :return: train_data, val_data (or the transformed data itself when is_test=True)
    """
    print('Now, starting parallel_apply on the link columns..................')
    for i in tqdm(link_cols_list):
        data[i] = data[i].parallel_apply(ast.literal_eval)
    print('Now, starting parallel_apply on the cross columns..................')
    for i in tqdm(cross_cols_list):
        data[i] = data[i].parallel_apply(ast.literal_eval)
    # data = data.fillna(0)
    data.fillna(data.median(), inplace=True)
    ss_cols = mk_cols_list + WIDE_COLS

    # train, val
    if is_test is True:
        print('is_test is True')
        ss = joblib.load('../model_h5/ss_scaler')
        data[ss_cols] = ss.transform(data[ss_cols])
        return data
    else:
        ss = StandardScaler()
        ss.fit(data[ss_cols])
        data[ss_cols] = ss.transform(data[ss_cols])
        joblib.dump(ss, '../model_h5/ss_scaler')
        print('is_test is False')
        data['date_time'] = data['date_time'].astype(int)
        print("type(data['date_time']):", data['date_time'].dtype)
        # print('Here train_test_split..................')
        # all_train_data, _ = train_test_split(all_train_data, test_size=0.9, random_state=42)
        print('*-' * 40, 'The data.shape:', data.shape)
        train_data, val_data = train_test_split(data, test_size=0.15, random_state=42)
        train_data = train_data.reset_index()
        val_data = val_data.reset_index()
        del train_data['index']
        del val_data['index']
        return train_data, val_data
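
# NOTE: the scaler is fit on training data only and persisted via joblib,
# then reloaded and applied unchanged when is_test=True, so validation and
# test rows never leak into the scaling statistics. The median imputation
# above, by contrast, is recomputed on whatever frame is passed in, a mild
# train/test inconsistency the authors presumably accepted.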


def processing_inputs(data, mk_cols_list, link_cols_list, cross_cols_list, WIDE_COLS):
    """
    Reshape the processed frame into the tensors the model consumes.
    :return: link, cross, deep, wide and slice inputs, plus the 'ata' labels
    """
    if 'ata' in mk_cols_list:
        print('The ata in the mk_cols_list')
    if 'ata' in link_cols_list:
        print('The ata in the link_cols_list')
    if 'ata' in cross_cols_list:
        print('The ata in the cross_cols_list')
    if 'ata' in WIDE_COLS:
        print('The ata in the WIDE_COLS')
    # link_cols_list = ['link_id', 'link_time', 'link_id_count', 'pr', 'dc',
    #                   'top_a', 'link_current_status', 'link_ratio']
    # cross_cols_list = ['cross_id', 'cross_time']
    data_link_inputs = preprocess_inputs(data, cols=link_cols_list)
    data_cross_inputs = preprocess_inputs(data, cols=cross_cols_list)
    data_deep_input = data[mk_cols_list].values
    data_wide_input = data[WIDE_COLS].values
    data_inputs_slice = data['slice_id'].values
    # print('--------------------------------test, ', min(data['slice_id'].values.tolist()))
    data_labels = data['ata'].values

    return data_link_inputs, data_cross_inputs, data_deep_input, data_wide_input, data_inputs_slice, data_labels
@@ -0,0 +1,198 @@
import pandas as pd
import numpy as np
from tensorflow import keras
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow.keras.layers as L
# import tensorflow.keras.models as M
import tensorflow.keras.backend as K
from tensorflow.python.client import device_lib
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras.applications import ResNet50  # used only by the optional conv='resnet50' branch
from keras_radam.training import RAdamOptimizer
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D, Conv1D
from tensorflow.keras.layers import Input, Dense, Lambda, Layer
from tensorflow.keras.initializers import Constant
from tensorflow.keras.models import Model


def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']


def gru_layer(hidden_dim, dropout):
    return L.Bidirectional(L.GRU(
        hidden_dim, dropout=dropout, return_sequences=True, kernel_initializer='orthogonal'))


def lstm_layer(hidden_dim, dropout):
    return L.Bidirectional(L.LSTM(
        hidden_dim, dropout=dropout, return_sequences=True, kernel_initializer='orthogonal'))


def preprocess(df, cate_cols, numeric_cols):
    for cl in cate_cols:
        le = LabelEncoder()
        df[cl] = le.fit_transform(df[cl])
    cols = cate_cols + numeric_cols
    X_train = df[cols]
    return X_train
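
# NOTE: preprocess() fits a fresh LabelEncoder on whatever frame it is
# given, so calling it separately on train and test frames can assign
# different integer codes to the same category unless every category occurs
# in both; for these low-cardinality columns (weekday, weather, temperature)
# that risk appears to have been accepted.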


def wd_model(link_size, cross_size, slice_size, input_deep_col, input_wide_col,
             link_nf_size, cross_nf_size, link_seqlen=170, cross_seqlen=12, pred_len=1,
             dropout=0.25, sp_dropout=0.1, embed_dim=64, hidden_dim=128, n_layers=3, lr=0.001,
             kernel_size1=3, kernel_size2=2, conv_size=128, conv='conv'):
    link_inputs = L.Input(shape=(link_seqlen, link_nf_size))
    cross_inputs = L.Input(shape=(cross_seqlen, cross_nf_size))
    deep_inputs = L.Input(shape=(input_deep_col,), name='deep_input')
    slice_input = L.Input(shape=(1,))
    wide_inputs = keras.layers.Input(shape=(input_wide_col,), name='wide_input')

    # link ----------------------------
    categorical_fea1 = link_inputs[:, :, :1]
    numerical_fea1 = link_inputs[:, :, 1:5]

    embed = L.Embedding(input_dim=link_size, output_dim=embed_dim)(categorical_fea1)
    reshaped = tf.reshape(embed, shape=(-1, embed.shape[1], embed.shape[2] * embed.shape[3]))
    # reshaped = L.SpatialDropout1D(sp_dropout)(reshaped)

    hidden = L.concatenate([reshaped, numerical_fea1], axis=2)
    hidden = L.SpatialDropout1D(sp_dropout)(hidden)
    """
    categorical_ar_st = link_inputs[:, :, 5:6]
    categorical_ar_st = L.Masking(mask_value=-1, name='categorical_ar_st')(categorical_ar_st)
    embed_ar_st = L.Embedding(input_dim=(-1, 289), output_dim=8)(categorical_ar_st)
    reshaped_ar_st = tf.reshape(embed_ar_st, shape=(-1, embed_ar_st.shape[1], embed_ar_st.shape[2] * embed_ar_st.shape[3]))
    reshaped_ar_st = L.SpatialDropout1D(sp_dropout)(reshaped_ar_st)

    categorical_ar_sl = link_inputs[:, :, 6:7]
    categorical_ar_sl = L.Masking(mask_value=-1, name='categorical_ar_sl')(categorical_ar_sl)
    embed_ar_sl = L.Embedding(input_dim=(-1, 289), output_dim=8)(categorical_ar_sl)
    reshaped_ar_sl = tf.reshape(embed_ar_sl, shape=(-1, embed_ar_sl.shape[1], embed_ar_sl.shape[2] * embed_ar_sl.shape[3]))
    reshaped_ar_sl = L.SpatialDropout1D(sp_dropout)(reshaped_ar_sl)
    hidden = L.concatenate([reshaped, reshaped_ar_st, reshaped_ar_sl, numerical_fea1], axis=2)
    """
    for _ in range(n_layers):
        hidden = lstm_layer(hidden_dim, dropout)(hidden)

    if conv == 'conv':
        # x_conv1 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(hidden)
        avg_pool1_gru = GlobalAveragePooling1D()(hidden)
        max_pool1_gru = GlobalMaxPooling1D()(hidden)
        truncated_link = concatenate([avg_pool1_gru, max_pool1_gru])
    elif conv == 'resnet50':
        truncated_link = ResNet50(include_top=False, pooling='max', weights=None)(hidden)
    else:
        truncated_link = hidden[:, :pred_len]
        truncated_link = L.Flatten()(truncated_link)

    # cross ----------------------------
    categorical_fea2 = cross_inputs[:, :, :1]
    numerical_fea2 = cross_inputs[:, :, 1:]
    embed2 = L.Embedding(input_dim=cross_size, output_dim=embed_dim)(categorical_fea2)
    reshaped2 = tf.reshape(embed2, shape=(-1, embed2.shape[1], embed2.shape[2] * embed2.shape[3]))
    # reshaped2 = L.SpatialDropout1D(sp_dropout)(reshaped2)

    hidden2 = L.concatenate([reshaped2, numerical_fea2], axis=2)
    hidden2 = L.SpatialDropout1D(sp_dropout)(hidden2)
    for _ in range(n_layers):
        hidden2 = lstm_layer(hidden_dim, dropout)(hidden2)

    if conv == 'conv':
        # x_conv3 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(hidden2)
        avg_pool3_gru = GlobalAveragePooling1D()(hidden2)
        max_pool3_gru = GlobalMaxPooling1D()(hidden2)
        truncated_cross = concatenate([avg_pool3_gru, max_pool3_gru])
    elif conv == 'resnet50':
        truncated_cross = ResNet50(include_top=False, pooling='max', weights=None)(hidden2)
    else:
        truncated_cross = hidden2[:, :pred_len]
        truncated_cross = L.Flatten()(truncated_cross)

    # slice ----------------------------
    embed_slice = L.Embedding(input_dim=slice_size, output_dim=1)(slice_input)
    embed_slice = L.Flatten()(embed_slice)

    # deep_inputs
    """
    dense_hidden1 = L.Dense(256, activation="relu")(deep_inputs)
    dense_hidden1 = L.Dropout(dropout)(dense_hidden1)
    dense_hidden2 = L.Dense(256, activation="relu")(dense_hidden1)
    dense_hidden2 = L.Dropout(dropout)(dense_hidden2)
    dense_hidden3 = L.Dense(128, activation="relu")(dense_hidden2)
    """
    x = L.Dense(512, activation="relu")(deep_inputs)
    x = L.BatchNormalization()(x)
    x = L.Lambda(tf.keras.activations.swish)(x)
    x = L.Dropout(0.25)(x)
    for i in range(2):
        x = L.Dense(256)(x)
        x = L.BatchNormalization()(x)
        x = L.Lambda(tf.keras.activations.swish)(x)
        x = L.Dropout(0.25)(x)
    dense_hidden3 = L.Dense(64, activation='linear')(x)

    # main -------------------------------
    truncated = L.concatenate([truncated_link, truncated_cross, dense_hidden3, wide_inputs, embed_slice])  # WD
    """
    truncated = L.BatchNormalization()(truncated)
    truncated = L.Dropout(dropout)(L.Dense(512, activation='relu')(truncated))
    truncated = L.BatchNormalization()(truncated)
    truncated = L.Dropout(dropout)(L.Dense(256, activation='relu')(truncated))
    """
    truncated = L.BatchNormalization()(truncated)
    truncated = L.Dropout(dropout)(L.Dense(1024, activation='relu')(truncated))
    truncated = L.Dropout(dropout)(truncated)

    for i in range(2):
        truncated = L.Dense(512)(truncated)
        truncated = L.BatchNormalization()(truncated)
        truncated = L.Lambda(tf.keras.activations.swish)(truncated)
        truncated = L.Dropout(dropout)(truncated)

    out = L.Dense(1, activation='linear')(truncated)

    model = tf.keras.Model(inputs=[link_inputs, cross_inputs, deep_inputs, wide_inputs, slice_input],
                           outputs=out)  # WD
    print(model.summary())
    model.compile(loss='mape',
                  optimizer=RAdamOptimizer(learning_rate=1e-3),
                  metrics=['mape'])

    return model
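
# NOTE: a minimal usage sketch with illustrative sizes (not the production
# values). Inputs are, in order: link sequences of shape
# (batch, link_seqlen=170, link_nf_size) with the encoded link_id in
# channel 0, cross sequences of shape (batch, cross_seqlen=12,
# cross_nf_size), the dense deep features, the four wide features, and the
# departure slice id:
#
#     model = wd_model(link_size=639879, cross_size=44315, slice_size=288,
#                      input_deep_col=128, input_wide_col=4,
#                      link_nf_size=5, cross_nf_size=2)
#     model.fit([link_seq, cross_seq, deep_x, wide_x, slice_ids], ata, ...)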


def get_mc_es_lr(model_name: str, patience=5, min_delta=1e-4):
    mc = tf.keras.callbacks.ModelCheckpoint('../model_h5/model_{}.h5'.format(model_name))
    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min',
                                          restore_best_weights=True, patience=patience)
    lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=patience, mode='min',
                                              min_delta=min_delta)

    return mc, es, lr


class Mish(tf.keras.layers.Layer):

    def __init__(self, **kwargs):
        super(Mish, self).__init__(**kwargs)
        self.supports_masking = True

    def call(self, inputs):
        return inputs * K.tanh(K.softplus(inputs))

    def get_config(self):
        return super(Mish, self).get_config()

    def compute_output_shape(self, input_shape):
        return input_shape


def mish(x):
    return tf.keras.layers.Lambda(lambda x: x * K.tanh(K.softplus(x)))(x)


tf.keras.utils.get_custom_objects().update({'mish': tf.keras.layers.Activation(mish)})
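
# NOTE: a small smoke test of the model factory, kept under a main guard so
# that importing this module is unchanged; all sizes below are illustrative
# assumptions, not the competition values.
if __name__ == '__main__':
    demo_model = wd_model(link_size=1000, cross_size=100, slice_size=288,
                          input_deep_col=32, input_wide_col=4,
                          link_nf_size=5, cross_nf_size=2)
    print('demo model built with {} parameters'.format(demo_model.count_params()))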
|