You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
ML-For-Beginners/7-TimeSeries/2-ARIMA/working/common/extract_data.py

38 lines
1.5 KiB

import zipfile
import os
import sys
import pandas as pd
# This function unzips the GEFCom2014 data zip file and extracts the 'extended'
# load forecasting competition data. Data is saved in energy.csv
def extract_data(data_dir):
    """Unpack the GEFCom2014 archive and write the load data to energy.csv.

    The function expects ``GEFCom2014.zip`` to already be present in
    *data_dir*. It extracts that archive into ``<data_dir>/GEFCom2014``,
    extracts the nested ``GEFCom2014-E_V2.zip`` into
    ``<data_dir>/GEFCom2014-E``, reads ``GEFCom2014-E.xlsx`` and writes a
    CSV with the columns ``timestamp``, ``load`` and ``temp`` to
    ``<data_dir>/energy.csv``.

    Args:
        data_dir: Directory that holds ``GEFCom2014.zip``; all extracted
            files and the resulting ``energy.csv`` are written here too.

    Raises:
        SystemExit: If ``GEFCom2014.zip`` does not exist in *data_dir*. The
            exit message tells the user where to download the archive.
    """
    GEFCom_dir = os.path.join(data_dir, 'GEFCom2014', 'GEFCom2014 Data')
    GEFCom_zipfile = os.path.join(data_dir, 'GEFCom2014.zip')
    if not os.path.exists(GEFCom_zipfile):
        sys.exit("Download GEFCom2014.zip from https://www.dropbox.com/s/pqenrr2mcvl0hk9/GEFCom2014.zip?dl=0 and save it to the '{}' directory.".format(data_dir))

    # unzip root directory; the context manager closes the archive even if
    # extraction fails part-way through
    with zipfile.ZipFile(GEFCom_zipfile, 'r') as zip_ref:
        zip_ref.extractall(os.path.join(data_dir, 'GEFCom2014'))

    # extract the extended competition data
    with zipfile.ZipFile(os.path.join(GEFCom_dir, 'GEFCom2014-E_V2.zip'), 'r') as zip_ref:
        zip_ref.extractall(os.path.join(data_dir, 'GEFCom2014-E'))

    # load the data from Excel file; the keyword is `parse_dates` (plural) --
    # the former `parse_date` spelling is not a read_excel parameter
    data = pd.read_excel(
        os.path.join(data_dir, 'GEFCom2014-E', 'GEFCom2014-E.xlsx'),
        parse_dates=['Date'],
    )

    # create timestamp variable from Date and Hour
    # (Hour - 1 hours are added to Date; presumably Hour runs 1..24 -- confirm
    # against the spreadsheet)
    data['timestamp'] = data['Date'].add(pd.to_timedelta(data.Hour - 1, unit='h'))
    data = data[['timestamp', 'load', 'T']]
    data = data.rename(columns={'T': 'temp'})

    # remove time period with no load data
    data = data[data.timestamp >= '2012-01-01']

    # save to csv
    data.to_csv(os.path.join(data_dir, 'energy.csv'), index=False)