From 91e87678365f0796065b3763db964536ceb61df1 Mon Sep 17 00:00:00 2001 From: Anirban Mukherjee Date: Wed, 6 Oct 2021 00:50:48 +0530 Subject: [PATCH] Create utils.py --- 7-TimeSeries/3-SVR/common/utils.py | 147 +++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 7-TimeSeries/3-SVR/common/utils.py diff --git a/7-TimeSeries/3-SVR/common/utils.py b/7-TimeSeries/3-SVR/common/utils.py new file mode 100644 index 00000000..4ab7623f --- /dev/null +++ b/7-TimeSeries/3-SVR/common/utils.py @@ -0,0 +1,147 @@ +import numpy as np +import pandas as pd +import os +from collections import UserDict + +def load_data(data_dir): + """Load the GEFCom 2014 energy load data""" + + energy = pd.read_csv(os.path.join(data_dir, 'energy.csv'), parse_dates=['timestamp']) + + # Reindex the dataframe such that the dataframe has a record for every time point + # between the minimum and maximum timestamp in the time series. This helps to + # identify missing time periods in the data (there are none in this dataset). + + energy.index = energy['timestamp'] + energy = energy.reindex(pd.date_range(min(energy['timestamp']), + max(energy['timestamp']), + freq='H')) + energy = energy.drop('timestamp', axis=1) + + return energy + + +def mape(predictions, actuals): + """Mean absolute percentage error""" + predictions = np.array(predictions) + actuals = np.array(actuals) + return (np.absolute(predictions - actuals) / actuals).mean() + + +def create_evaluation_df(predictions, test_inputs, H, scaler): + """Create a data frame for easy evaluation""" + eval_df = pd.DataFrame(predictions, columns=['t+'+str(t) for t in range(1, H+1)]) + eval_df['timestamp'] = test_inputs.dataframe.index + eval_df = pd.melt(eval_df, id_vars='timestamp', value_name='prediction', var_name='h') + eval_df['actual'] = np.transpose(test_inputs['target']).ravel() + eval_df[['prediction', 'actual']] = scaler.inverse_transform(eval_df[['prediction', 'actual']]) + return eval_df + + +class TimeSeriesTensor(UserDict): + """A dictionary of tensors for input into the RNN model. + + Use this class to: + 1. Shift the values of the time series to create a Pandas dataframe containing all the data + for a single training example + 2. Discard any samples with missing values + 3. Transform this Pandas dataframe into a numpy array of shape + (samples, time steps, features) for input into Keras + + The class takes the following parameters: + - **dataset**: original time series + - **target** name of the target column + - **H**: the forecast horizon + - **tensor_structures**: a dictionary describing the tensor structure of the form + { 'tensor_name' : (range(max_backward_shift, max_forward_shift), [feature, feature, ...] ) } + if features are non-sequential and should not be shifted, use the form + { 'tensor_name' : (None, [feature, feature, ...])} + - **freq**: time series frequency (default 'H' - hourly) + - **drop_incomplete**: (Boolean) whether to drop incomplete samples (default True) + """ + + def __init__(self, dataset, target, H, tensor_structure, freq='H', drop_incomplete=True): + self.dataset = dataset + self.target = target + self.tensor_structure = tensor_structure + self.tensor_names = list(tensor_structure.keys()) + + self.dataframe = self._shift_data(H, freq, drop_incomplete) + self.data = self._df2tensors(self.dataframe) + + def _shift_data(self, H, freq, drop_incomplete): + + # Use the tensor_structures definitions to shift the features in the original dataset. + # The result is a Pandas dataframe with multi-index columns in the hierarchy + # tensor - the name of the input tensor + # feature - the input feature to be shifted + # time step - the time step for the RNN in which the data is input. These labels + # are centred on time t. the forecast creation time + df = self.dataset.copy() + + idx_tuples = [] + for t in range(1, H+1): + df['t+'+str(t)] = df[self.target].shift(t*-1, freq=freq) + idx_tuples.append(('target', 'y', 't+'+str(t))) + + for name, structure in self.tensor_structure.items(): + rng = structure[0] + dataset_cols = structure[1] + + for col in dataset_cols: + + # do not shift non-sequential 'static' features + if rng is None: + df['context_'+col] = df[col] + idx_tuples.append((name, col, 'static')) + + else: + for t in rng: + sign = '+' if t > 0 else '' + shift = str(t) if t != 0 else '' + period = 't'+sign+shift + shifted_col = name+'_'+col+'_'+period + df[shifted_col] = df[col].shift(t*-1, freq=freq) + idx_tuples.append((name, col, period)) + + df = df.drop(self.dataset.columns, axis=1) + idx = pd.MultiIndex.from_tuples(idx_tuples, names=['tensor', 'feature', 'time step']) + df.columns = idx + + if drop_incomplete: + df = df.dropna(how='any') + + return df + + def _df2tensors(self, dataframe): + + # Transform the shifted Pandas dataframe into the multidimensional numpy arrays. These + # arrays can be used to input into the keras model and can be accessed by tensor name. + # For example, for a TimeSeriesTensor object named "model_inputs" and a tensor named + # "target", the input tensor can be acccessed with model_inputs['target'] + + inputs = {} + y = dataframe['target'] + y = y.as_matrix() + inputs['target'] = y + + for name, structure in self.tensor_structure.items(): + rng = structure[0] + cols = structure[1] + tensor = dataframe[name][cols].as_matrix() + if rng is None: + tensor = tensor.reshape(tensor.shape[0], len(cols)) + else: + tensor = tensor.reshape(tensor.shape[0], len(cols), len(rng)) + tensor = np.transpose(tensor, axes=[0, 2, 1]) + inputs[name] = tensor + + return inputs + + def subset_data(self, new_dataframe): + + # Use this function to recreate the input tensors if the shifted dataframe + # has been filtered. + + self.dataframe = new_dataframe + self.data = self._df2tensors(self.dataframe)