parent 008189ce2b
commit 94339ad9f3
@@ -0,0 +1,28 @@
# To create the conda environment:
# $ conda env create -f environment.yaml
#
# To update the conda environment:
# $ conda env update -f environment.yaml
#
# To register the conda environment in Jupyter:
# $ conda activate dlts
# $ python -m ipykernel install --user --name dlts --display-name "Python (dlts)"

name: dlts
channels:
  - defaults
dependencies:
  - python==3.6.6
  - pip>=19.1.1
  - ipykernel>=4.6.1
  - jupyter>=1.0.0
  - matplotlib==3.0.0
  - numpy==1.16.2
  - pandas==0.23.4
  - tensorflow==1.12.0
  - keras==2.2.4
  - scikit-learn==0.20.3
  - statsmodels==0.9.0
  - xlrd>=1.0.0
  - pip:
    - pyramid-arima==0.8.1
@@ -0,0 +1,37 @@
import zipfile
import os
import sys
import pandas as pd


# This function unzips the GEFCom2014 data zip file and extracts the 'extended'
# load forecasting competition data. The data is saved to energy.csv.
def extract_data(data_dir):
    GEFCom_dir = os.path.join(data_dir, 'GEFCom2014', 'GEFCom2014 Data')

    GEFCom_zipfile = os.path.join(data_dir, 'GEFCom2014.zip')
    if not os.path.exists(GEFCom_zipfile):
        sys.exit("Download GEFCom2014.zip from https://www.dropbox.com/s/pqenrr2mcvl0hk9/GEFCom2014.zip?dl=0 and save it to the '{}' directory.".format(data_dir))

    # unzip the root directory
    zip_ref = zipfile.ZipFile(GEFCom_zipfile, 'r')
    zip_ref.extractall(os.path.join(data_dir, 'GEFCom2014'))
    zip_ref.close()

    # extract the extended competition data
    zip_ref = zipfile.ZipFile(os.path.join(GEFCom_dir, 'GEFCom2014-E_V2.zip'), 'r')
    zip_ref.extractall(os.path.join(data_dir, 'GEFCom2014-E'))
    zip_ref.close()

    # load the data from the Excel file
    data = pd.read_excel(os.path.join(data_dir, 'GEFCom2014-E', 'GEFCom2014-E.xlsx'), parse_dates=['Date'])

    # create a timestamp variable from Date and Hour
    data['timestamp'] = data['Date'].add(pd.to_timedelta(data.Hour - 1, unit='h'))
    data = data[['timestamp', 'load', 'T']]
    data = data.rename(columns={'T': 'temp'})

    # remove the time period with no load data
    data = data[data.timestamp >= '2012-01-01']

    # save to csv
    data.to_csv(os.path.join(data_dir, 'energy.csv'), index=False)
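For orientation, a minimal usage sketch of the extraction step above; the 'data' directory name is an illustrative assumption, not a path fixed by this commit:

# Hypothetical driver: 'data' is an example directory, not one fixed by the commit.
if __name__ == '__main__':
    extract_data('data')   # expects data/GEFCom2014.zip to have been downloaded first
    # on success, data/energy.csv holds hourly 'timestamp', 'load' and 'temp' columns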
@@ -0,0 +1,145 @@
import numpy as np
import pandas as pd
import os
from collections import UserDict


def load_data(data_dir):
    """Load the GEFCom 2014 energy load data"""

    energy = pd.read_csv(os.path.join(data_dir, 'energy.csv'), parse_dates=['timestamp'])

    # Reindex the dataframe such that it has a record for every time point
    # between the minimum and maximum timestamp in the time series. This helps to
    # identify missing time periods in the data (there are none in this dataset).

    energy.index = energy['timestamp']
    energy = energy.reindex(pd.date_range(min(energy['timestamp']),
                                          max(energy['timestamp']),
                                          freq='H'))
    energy = energy.drop('timestamp', axis=1)

    return energy


def mape(predictions, actuals):
    """Mean absolute percentage error"""
    return ((predictions - actuals).abs() / actuals).mean()


def create_evaluation_df(predictions, test_inputs, H, scaler):
    """Create a data frame for easy evaluation"""
    eval_df = pd.DataFrame(predictions, columns=['t+'+str(t) for t in range(1, H+1)])
    eval_df['timestamp'] = test_inputs.dataframe.index
    eval_df = pd.melt(eval_df, id_vars='timestamp', value_name='prediction', var_name='h')
    eval_df['actual'] = np.transpose(test_inputs['target']).ravel()
    eval_df[['prediction', 'actual']] = scaler.inverse_transform(eval_df[['prediction', 'actual']])
    return eval_df


class TimeSeriesTensor(UserDict):
    """A dictionary of tensors for input into the RNN model.

    Use this class to:
    1. Shift the values of the time series to create a Pandas dataframe containing all the data
       for a single training example
    2. Discard any samples with missing values
    3. Transform this Pandas dataframe into a numpy array of shape
       (samples, time steps, features) for input into Keras

    The class takes the following parameters:
    - **dataset**: original time series
    - **target**: name of the target column
    - **H**: the forecast horizon
    - **tensor_structure**: a dictionary describing the tensor structure of the form
          { 'tensor_name' : (range(max_backward_shift, max_forward_shift), [feature, feature, ...]) }
      If features are non-sequential and should not be shifted, use the form
          { 'tensor_name' : (None, [feature, feature, ...]) }
    - **freq**: time series frequency (default 'H' - hourly)
    - **drop_incomplete**: (Boolean) whether to drop incomplete samples (default True)
    """

    def __init__(self, dataset, target, H, tensor_structure, freq='H', drop_incomplete=True):
        self.dataset = dataset
        self.target = target
        self.tensor_structure = tensor_structure
        self.tensor_names = list(tensor_structure.keys())

        self.dataframe = self._shift_data(H, freq, drop_incomplete)
        self.data = self._df2tensors(self.dataframe)

    def _shift_data(self, H, freq, drop_incomplete):

        # Use the tensor_structure definitions to shift the features in the original dataset.
        # The result is a Pandas dataframe with multi-index columns in the hierarchy
        #     tensor - the name of the input tensor
        #     feature - the input feature to be shifted
        #     time step - the time step for the RNN in which the data is input. These labels
        #         are centred on time t, the forecast creation time
        df = self.dataset.copy()

        idx_tuples = []
        for t in range(1, H+1):
            df['t+'+str(t)] = df[self.target].shift(t*-1, freq=freq)
            idx_tuples.append(('target', 'y', 't+'+str(t)))

        for name, structure in self.tensor_structure.items():
            rng = structure[0]
            dataset_cols = structure[1]

            for col in dataset_cols:

                # do not shift non-sequential 'static' features
                if rng is None:
                    df['context_'+col] = df[col]
                    idx_tuples.append((name, col, 'static'))

                else:
                    for t in rng:
                        sign = '+' if t > 0 else ''
                        shift = str(t) if t != 0 else ''
                        period = 't'+sign+shift
                        shifted_col = name+'_'+col+'_'+period
                        df[shifted_col] = df[col].shift(t*-1, freq=freq)
                        idx_tuples.append((name, col, period))

        df = df.drop(self.dataset.columns, axis=1)
        idx = pd.MultiIndex.from_tuples(idx_tuples, names=['tensor', 'feature', 'time step'])
        df.columns = idx

        if drop_incomplete:
            df = df.dropna(how='any')

        return df

    def _df2tensors(self, dataframe):

        # Transform the shifted Pandas dataframe into multidimensional numpy arrays. These
        # arrays can be input into the Keras model and can be accessed by tensor name.
        # For example, for a TimeSeriesTensor object named "model_inputs" and a tensor named
        # "target", the input tensor can be accessed with model_inputs['target']

        inputs = {}
        y = dataframe['target']
        y = y.values
        inputs['target'] = y

        for name, structure in self.tensor_structure.items():
            rng = structure[0]
            cols = structure[1]
            tensor = dataframe[name][cols].values
            if rng is None:
                tensor = tensor.reshape(tensor.shape[0], len(cols))
            else:
                tensor = tensor.reshape(tensor.shape[0], len(cols), len(rng))
                tensor = np.transpose(tensor, axes=[0, 2, 1])
            inputs[name] = tensor

        return inputs

    def subset_data(self, new_dataframe):

        # Use this function to recreate the input tensors if the shifted dataframe
        # has been filtered.

        self.dataframe = new_dataframe
        self.data = self._df2tensors(self.dataframe)
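To make the tensor_structure convention concrete, here is a minimal sketch of how these utilities fit together; the lookback T, the horizon, and the tensor name 'X' are illustrative choices, not values taken from this commit:

# Illustrative only: T, HORIZON and the tensor name 'X' are example choices.
energy = load_data('data')                                # hourly 'load' and 'temp'
T, HORIZON = 6, 3
tensor_structure = {'X': (range(-T + 1, 1), ['load'])}    # lags t-5 ... t of the load
inputs = TimeSeriesTensor(energy, target='load', H=HORIZON,
                          tensor_structure=tensor_structure)
print(inputs['X'].shape)        # (samples, T, 1) - ready for a Keras RNN
print(inputs['target'].shape)   # (samples, HORIZON) - y values for t+1 ... t+HORIZON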
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
@@ -0,0 +1,49 @@
{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": 3
  },
  "orig_nbformat": 2
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "source": [
    "# Time series forecasting with ARIMA\n",
    "\n",
    "In this notebook, we demonstrate how to:\n",
    "- prepare time series data for training an ARIMA time series forecasting model\n",
    "- implement a simple ARIMA model to forecast the next HORIZON steps ahead (time *t+1* through *t+HORIZON*) in the time series\n",
    "- evaluate the model\n",
    "\n",
    "\n",
    "The data in this example is taken from the GEFCom2014 forecasting competition<sup>1</sup>. It consists of 3 years of hourly electricity load and temperature values between 2012 and 2014. The task is to forecast future values of electricity load. In this example, we show how to forecast one time step ahead, using historical load data only.\n",
    "\n",
    "<sup>1</sup>Tao Hong, Pierre Pinson, Shu Fan, Hamidreza Zareipour, Alberto Troccoli and Rob J. Hyndman, \"Probabilistic energy forecasting: Global Energy Forecasting Competition 2014 and beyond\", International Journal of Forecasting, vol.32, no.3, pp 896-913, July-September, 2016."
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install statsmodels"
   ]
  }
 ]
}
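The notebook body beyond this preamble is suppressed above, but its stated goal is a simple ARIMA forecast of the load series. A minimal sketch of that idea using the utilities from this commit; the ARIMA order and the train/test split dates are assumptions for illustration, not values taken from the notebook:

# Illustrative ARIMA sketch; order (2, 1, 0) and the split dates are assumptions.
from statsmodels.tsa.statespace.sarimax import SARIMAX

energy = load_data('data')                          # load_data from the utilities above
train = energy['load']['2012-01-01':'2014-10-31']
test = energy['load']['2014-11-01':]

model = SARIMAX(train, order=(2, 1, 0))             # ARIMA(p=2, d=1, q=0)
results = model.fit(disp=False)
forecast = results.forecast(steps=len(test))        # predict the held-out period
print(mape(forecast, test))                         # mape helper defined above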