import os
from collections import UserDict

import numpy as np
import pandas as pd


def _normalize_freq(freq):
    """Return a frequency that pandas accepts regardless of version.

    The upper-case hourly alias 'H' (this module's historical default) is
    deprecated in recent pandas releases, so it is mapped to the equivalent
    offset object. Every other value is passed through untouched.
    """
    if isinstance(freq, str) and freq == 'H':
        return pd.offsets.Hour()
    return freq


def load_data(data_dir):
    """Load the GEFCom 2014 energy load data.

    Reads ``energy.csv`` from *data_dir*, parsing the ``timestamp`` column
    as dates, and returns a dataframe indexed by an hourly date range.
    """
    energy = pd.read_csv(os.path.join(data_dir, 'energy.csv'),
                         parse_dates=['timestamp'])

    # Reindex the dataframe such that the dataframe has a record for every
    # time point between the minimum and maximum timestamp in the time
    # series. This helps to identify missing time periods in the data
    # (there are none in this dataset).
    energy.index = energy['timestamp']
    full_range = pd.date_range(energy['timestamp'].min(),
                               energy['timestamp'].max(),
                               freq=_normalize_freq('H'))
    energy = energy.reindex(full_range)
    energy = energy.drop('timestamp', axis=1)

    return energy


def mape(predictions, actuals):
    """Mean absolute percentage error.

    Computes ``mean(|predictions - actuals| / actuals)``. ``np.abs`` is used
    so that numpy arrays are accepted as well as pandas objects. Note that
    the denominator is *actuals* as given (not its absolute value).
    """
    return (np.abs(predictions - actuals) / actuals).mean()


def create_evaluation_df(predictions, test_inputs, H, scaler):
    """Create a data frame for easy evaluation.

    Parameters:
    - **predictions**: 2-D array-like with one column per horizon step
      (``H`` columns) and one row per sample
    - **test_inputs**: object exposing a ``dataframe`` attribute (whose index
      supplies the timestamps) and a ``'target'`` item, e.g. a
      ``TimeSeriesTensor``
    - **H**: the forecast horizon
    - **scaler**: object with an ``inverse_transform`` method, applied to the
      ``prediction`` and ``actual`` columns together

    Returns a long-format dataframe with the columns ``timestamp``, ``h``
    (``'t+1'`` ... ``'t+H'``), ``prediction`` and ``actual``.
    """
    horizon_labels = ['t+' + str(t) for t in range(1, H + 1)]
    eval_df = pd.DataFrame(predictions, columns=horizon_labels)
    eval_df['timestamp'] = test_inputs.dataframe.index
    eval_df = pd.melt(eval_df, id_vars='timestamp',
                      value_name='prediction', var_name='h')
    # melt stacks the horizon columns one after another, so the targets are
    # transposed before flattening to obtain the same ordering.
    eval_df['actual'] = np.transpose(test_inputs['target']).ravel()
    eval_df[['prediction', 'actual']] = scaler.inverse_transform(
        eval_df[['prediction', 'actual']])
    return eval_df


class TimeSeriesTensor(UserDict):
    """A dictionary of tensors for input into the RNN model.

    Use this class to:
      1. Shift the values of the time series to create a Pandas dataframe
         containing all the data for a single training example
      2. Discard any samples with missing values
      3. Transform this Pandas dataframe into a numpy array of shape
         (samples, time steps, features) for input into Keras

    The class takes the following parameters:

    - **dataset**: original time series
    - **target** name of the target column
    - **H**: the forecast horizon
    - **tensor_structure**: a dictionary describing the tensor structure of
      the form
          { 'tensor_name' : (range(max_backward_shift, max_forward_shift),
                             [feature, feature, ...] ) }
      if features are non-sequential and should not be shifted, use the form
          { 'tensor_name' : (None, [feature, feature, ...])}
    - **freq**: time series frequency (default 'H' - hourly)
    - **drop_incomplete**: (Boolean) whether to drop incomplete samples
      (default True)
    """

    def __init__(self, dataset, target, H, tensor_structure, freq='H',
                 drop_incomplete=True):
        super().__init__()
        self.dataset = dataset
        self.target = target
        self.tensor_structure = tensor_structure
        self.tensor_names = list(tensor_structure.keys())

        self.dataframe = self._shift_data(H, freq, drop_incomplete)
        # UserDict serves item access from ``self.data``.
        self.data = self._df2tensors(self.dataframe)

    def _shift_data(self, H, freq, drop_incomplete):
        # Use the tensor_structure definitions to shift the features in the
        # original dataset. The result is a Pandas dataframe with multi-index
        # columns in the hierarchy
        #    tensor - the name of the input tensor
        #    feature - the input feature to be shifted
        #    time step - the time step for the RNN in which the data is
        #                input. These labels are centred on time t, the
        #                forecast creation time
        freq = _normalize_freq(freq)
        df = self.dataset.copy()

        idx_tuples = []
        for t in range(1, H + 1):
            df['t+' + str(t)] = df[self.target].shift(t * -1, freq=freq)
            idx_tuples.append(('target', 'y', 't+' + str(t)))

        for name, structure in self.tensor_structure.items():
            rng = structure[0]
            dataset_cols = structure[1]

            for col in dataset_cols:
                # do not shift non-sequential 'static' features
                if rng is None:
                    # The temporary column is keyed by the tensor name so
                    # that two static tensors sharing a feature each get
                    # their own column (one per index tuple).
                    df[name + '_' + col] = df[col]
                    idx_tuples.append((name, col, 'static'))
                else:
                    for t in rng:
                        sign = '+' if t > 0 else ''
                        shift = str(t) if t != 0 else ''
                        period = 't' + sign + shift
                        shifted_col = name + '_' + col + '_' + period
                        df[shifted_col] = df[col].shift(t * -1, freq=freq)
                        idx_tuples.append((name, col, period))

        # Only the derived columns remain, in the order they were created,
        # which is the order of idx_tuples.
        df = df.drop(self.dataset.columns, axis=1)
        idx = pd.MultiIndex.from_tuples(
            idx_tuples, names=['tensor', 'feature', 'time step'])
        df.columns = idx

        if drop_incomplete:
            df = df.dropna(how='any')

        return df

    def _df2tensors(self, dataframe):
        # Transform the shifted Pandas dataframe into the multidimensional
        # numpy arrays. These arrays can be used to input into the keras
        # model and can be accessed by tensor name. For example, for a
        # TimeSeriesTensor object named "model_inputs" and a tensor named
        # "target", the input tensor can be accessed with
        # model_inputs['target']
        inputs = {}
        # ``.values`` replaces DataFrame.as_matrix(), which no longer exists
        # in current pandas.
        inputs['target'] = dataframe['target'].values

        for name, structure in self.tensor_structure.items():
            rng = structure[0]
            cols = structure[1]
            tensor = dataframe[name][cols].values
            if rng is None:
                tensor = tensor.reshape(tensor.shape[0], len(cols))
            else:
                # Columns are laid out feature-major (all time steps of the
                # first feature, then the next), so reshape to
                # (samples, features, time steps) and swap the last two axes.
                tensor = tensor.reshape(tensor.shape[0], len(cols), len(rng))
                tensor = np.transpose(tensor, axes=[0, 2, 1])
            inputs[name] = tensor

        return inputs

    def subset_data(self, new_dataframe):
        # Use this function to recreate the input tensors if the shifted
        # dataframe has been filtered.
        self.dataframe = new_dataframe
        self.data = self._df2tensors(self.dataframe)