From 91e87678365f0796065b3763db964536ceb61df1 Mon Sep 17 00:00:00 2001
From: Anirban Mukherjee <raji08xd@gmail.com>
Date: Wed, 6 Oct 2021 00:50:48 +0530
Subject: [PATCH] Create utils.py

---
 7-TimeSeries/3-SVR/common/utils.py | 147 +++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 7-TimeSeries/3-SVR/common/utils.py

diff --git a/7-TimeSeries/3-SVR/common/utils.py b/7-TimeSeries/3-SVR/common/utils.py
new file mode 100644
index 00000000..4ab7623f
--- /dev/null
+++ b/7-TimeSeries/3-SVR/common/utils.py
@@ -0,0 +1,147 @@
+import numpy as np
+import pandas as pd
+import os
+from collections import UserDict
+
+def load_data(data_dir):
+    """Load the GEFCom 2014 energy load data from ``energy.csv`` inside ``data_dir``.
+
+    The ``timestamp`` column is parsed as dates and used to reindex the frame onto
+    every hourly time point between the earliest and latest timestamp, so a missing
+    hour shows up as a row of NaN.  The ``timestamp`` column is dropped from the
+    returned dataframe.
+    """
+    csv_path = os.path.join(data_dir, 'energy.csv')
+    raw = pd.read_csv(csv_path, parse_dates=['timestamp'])
+    stamps = raw['timestamp']
+
+    raw.index = stamps
+    hourly_index = pd.date_range(min(stamps), max(stamps), freq='H')
+    hourly = raw.reindex(hourly_index)
+    return hourly.drop('timestamp', axis=1)
+
+
+def mape(predictions, actuals):
+    """Return mean(|predictions - actuals| / actuals), as a fraction rather than a percent."""
+    predicted, observed = np.array(predictions), np.array(actuals)
+    relative_error = np.abs(predicted - observed) / observed
+    return relative_error.mean()
+
+
+def create_evaluation_df(predictions, test_inputs, H, scaler):
+    """Return a long-format frame (timestamp, h, prediction, actual), inverse-transformed by ``scaler``."""
+    wide = pd.DataFrame(predictions, columns=['t+{}'.format(step) for step in range(1, H+1)])
+    wide['timestamp'] = test_inputs.dataframe.index
+    long_df = wide.melt(id_vars='timestamp', var_name='h', value_name='prediction')
+    long_df['actual'] = np.transpose(test_inputs['target']).ravel()
+    long_df[['prediction', 'actual']] = scaler.inverse_transform(long_df[['prediction', 'actual']])
+    return long_df
+
+
+class TimeSeriesTensor(UserDict):
+    """A dictionary of tensors for input into the RNN model.
+
+    Use this class to:
+      1. Shift the values of the time series to create a Pandas dataframe containing all the data
+         for a single training example
+      2. Discard any samples with missing values
+      3. Transform this Pandas dataframe into numpy arrays of shape (samples, time steps, features)
+         for input into Keras; non-sequential tensors have shape (samples, features)
+
+    The class takes the following parameters:
+       - **dataset**: original time series (a Pandas dataframe; its index is shifted by ``freq``)
+       - **target**: name of the target column
+       - **H**: the forecast horizon
+       - **tensor_structure**: a dictionary describing the tensor structure of the form
+             { 'tensor_name' : (range(max_backward_shift, max_forward_shift), [feature, feature, ...] ) }
+             if features are non-sequential and should not be shifted, use the form
+             { 'tensor_name' : (None, [feature, feature, ...])}
+       - **freq**: time series frequency (default 'H' - hourly)
+       - **drop_incomplete**: (Boolean) whether to drop incomplete samples (default True)
+    """
+
+    def __init__(self, dataset, target, H, tensor_structure, freq='H', drop_incomplete=True):
+        self.dataset = dataset
+        self.target = target
+        self.tensor_structure = tensor_structure
+        self.tensor_names = list(tensor_structure.keys())
+        # self.data is the mapping UserDict serves lookups from, so obj['name'] returns a tensor.
+        self.dataframe = self._shift_data(H, freq, drop_incomplete)
+        self.data = self._df2tensors(self.dataframe)
+
+    def _shift_data(self, H, freq, drop_incomplete):
+        """Return a copy of the dataset holding only the shifted target and feature columns."""
+        # Use the tensor_structure definitions to shift the features in the original dataset.
+        # The result is a Pandas dataframe with multi-index columns in the hierarchy
+        #     tensor - the name of the input tensor
+        #     feature - the input feature to be shifted
+        #     time step - the time step for the RNN in which the data is input. These labels
+        #         are centred on time t, the forecast creation time
+        df = self.dataset.copy()
+
+        idx_tuples = []
+        for t in range(1, H+1):
+            df['t+'+str(t)] = df[self.target].shift(t*-1, freq=freq)
+            idx_tuples.append(('target', 'y', 't+'+str(t)))
+
+        for name, structure in self.tensor_structure.items():
+            rng = structure[0]
+            dataset_cols = structure[1]
+
+            for col in dataset_cols:
+                # Do not shift non-sequential 'static' features. The temporary column name includes
+                # the tensor name so that two static tensors can use the same feature.
+                if rng is None:
+                    df[name+'_'+col+'_static'] = df[col]
+                    idx_tuples.append((name, col, 'static'))
+
+                else:
+                    for t in rng:
+                        sign = '+' if t > 0 else ''
+                        shift = str(t) if t != 0 else ''
+                        period = 't'+sign+shift
+                        shifted_col = name+'_'+col+'_'+period
+                        df[shifted_col] = df[col].shift(t*-1, freq=freq)
+                        idx_tuples.append((name, col, period))
+        # The remaining columns were added in the same order as idx_tuples, so relabel by position.
+        df = df.drop(self.dataset.columns, axis=1)
+        idx = pd.MultiIndex.from_tuples(idx_tuples, names=['tensor', 'feature', 'time step'])
+        df.columns = idx
+
+        if drop_incomplete:
+            df = df.dropna(how='any')
+
+        return df
+
+    def _df2tensors(self, dataframe):
+        """Return a dict mapping 'target' and each tensor name to a numpy array."""
+        # Transform the shifted Pandas dataframe into the multidimensional numpy arrays. These
+        # arrays can be used to input into the keras model and can be accessed by tensor name.
+        # For example, for a TimeSeriesTensor object named "model_inputs" and a tensor named
+        # "target", the input tensor can be accessed with model_inputs['target']
+        # .values is used because DataFrame.as_matrix() was removed in pandas 1.0.
+        inputs = {}
+        y = dataframe['target']
+        y = y.values
+        inputs['target'] = y
+        # Columns come out feature-major; reshape then swap axes to (samples, time steps, features).
+        for name, structure in self.tensor_structure.items():
+            rng = structure[0]
+            cols = structure[1]
+            tensor = dataframe[name][cols].values
+            if rng is None:
+                tensor = tensor.reshape(tensor.shape[0], len(cols))
+            else:
+                tensor = tensor.reshape(tensor.shape[0], len(cols), len(rng))
+                tensor = np.transpose(tensor, axes=[0, 2, 1])
+            inputs[name] = tensor
+
+        return inputs
+
+    def subset_data(self, new_dataframe):
+        """Replace the shifted dataframe with new_dataframe and rebuild the tensors from it."""
+        # Use this function to recreate the input tensors if the shifted dataframe
+        # has been filtered.
+
+        self.dataframe = new_dataframe
+        self.data = self._df2tensors(self.dataframe)