You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
38 lines
1.5 KiB
38 lines
1.5 KiB
import zipfile
|
|
import os
|
|
import sys
|
|
import pandas as pd
|
|
|
|
# This function unzips the GEFCom2014 data zip file and extracts the 'extended'
|
|
# load forecasting competition data. Data is saved in energy.csv
|
|
def extract_data(data_dir):
    """Unpack the GEFCom2014 archive and write the load data to ``energy.csv``.

    The function expects ``GEFCom2014.zip`` to be present in ``data_dir``. It
    extracts that archive into ``<data_dir>/GEFCom2014``, then extracts the
    nested ``GEFCom2014-E_V2.zip`` (the "extended" load forecasting data) into
    ``<data_dir>/GEFCom2014-E``, reads ``GEFCom2014-E.xlsx`` from there, and
    saves a CSV with the columns ``timestamp``, ``load`` and ``temp`` to
    ``<data_dir>/energy.csv``.

    Args:
        data_dir: Directory that holds ``GEFCom2014.zip`` and receives the
            extracted files and the resulting ``energy.csv``.

    Raises:
        SystemExit: If ``GEFCom2014.zip`` is not found in ``data_dir``. The
            exit message tells the user where to download the archive.
    """
    gefcom_dir = os.path.join(data_dir, 'GEFCom2014', 'GEFCom2014 Data')
    gefcom_zipfile = os.path.join(data_dir, 'GEFCom2014.zip')

    if not os.path.exists(gefcom_zipfile):
        sys.exit("Download GEFCom2014.zip from https://www.dropbox.com/s/pqenrr2mcvl0hk9/GEFCom2014.zip?dl=0 and save it to the '{}' directory.".format(data_dir))

    # Unzip the root archive. The context manager guarantees the handle is
    # closed even if extraction fails part-way through.
    with zipfile.ZipFile(gefcom_zipfile, 'r') as zip_ref:
        zip_ref.extractall(os.path.join(data_dir, 'GEFCom2014'))

    # Extract the extended competition data from the nested archive.
    extended_zipfile = os.path.join(gefcom_dir, 'GEFCom2014-E_V2.zip')
    with zipfile.ZipFile(extended_zipfile, 'r') as zip_ref:
        zip_ref.extractall(os.path.join(data_dir, 'GEFCom2014-E'))

    # Load the data from the Excel file. The keyword is `parse_dates` (plural);
    # the misspelled `parse_date` is not a read_excel parameter.
    data = pd.read_excel(
        os.path.join(data_dir, 'GEFCom2014-E', 'GEFCom2014-E.xlsx'),
        parse_dates=['Date'],
    )

    # Create the timestamp from Date and Hour. One hour is subtracted from
    # Hour before it is added to the date (so Hour == 1 maps to 00:00).
    data['timestamp'] = data['Date'].add(pd.to_timedelta(data.Hour - 1, unit='h'))
    data = data[['timestamp', 'load', 'T']]
    data = data.rename(columns={'T': 'temp'})

    # Remove the time period with no load data.
    data = data[data.timestamp >= '2012-01-01']

    # Save to csv.
    data.to_csv(os.path.join(data_dir, 'energy.csv'), index=False)
|