You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							146 lines
						
					
					
						
							6.0 KiB
						
					
					
				
			
		
		
	
	
							146 lines
						
					
					
						
							6.0 KiB
						
					
					
				import numpy as np
 | 
						|
import pandas as pd
 | 
						|
import os
 | 
						|
from collections import UserDict
 | 
						|
 | 
						|
def load_data(data_dir):
    """Load the GEFCom 2014 energy load data.

    Reads ``energy.csv`` from ``data_dir``, parses its ``timestamp`` column as
    datetimes and returns the remaining columns indexed by a gap-free hourly
    range.

    Parameters:
        data_dir: directory that contains ``energy.csv``.

    Returns:
        A dataframe with one row for every hour between the earliest and the
        latest timestamp in the file. Hours that are absent from the file are
        present in the result with NaN values. The ``timestamp`` column itself
        is dropped because the index carries the same information.
    """
    energy = pd.read_csv(os.path.join(data_dir, 'energy.csv'), parse_dates=['timestamp'])

    # Reindex the dataframe such that it has a record for every time point
    # between the minimum and maximum timestamp in the time series. This makes
    # missing time periods show up explicitly as NaN rows.
    energy.index = energy['timestamp']

    # Series.min()/max() are vectorised and skip NaT, unlike the builtins.
    # The lowercase 'h' alias is used because uppercase 'H' is deprecated in
    # recent pandas releases.
    full_range = pd.date_range(energy['timestamp'].min(),
                               energy['timestamp'].max(),
                               freq='h')
    energy = energy.reindex(full_range)

    return energy.drop('timestamp', axis=1)
 | 
						|
 | 
						|
 | 
						|
def mape(predictions, actuals):
    """Mean absolute percentage error.

    Computes ``mean(|(predictions - actuals) / actuals|)``.

    Parameters:
        predictions: forecast values (pandas Series/DataFrame, NumPy array,
            or a plain sequence of numbers).
        actuals: observed values, same shape as ``predictions``.

    Returns:
        The error as a fraction (0.1 means 10%). For pandas and 1-D inputs
        this is a scalar; for a DataFrame it is the per-column mean, as
        produced by ``DataFrame.mean()``.
    """
    # Plain sequences do not support element-wise arithmetic, so promote
    # anything that is not already a pandas object to a float array.
    if not isinstance(predictions, (pd.Series, pd.DataFrame)):
        predictions = np.asarray(predictions, dtype=float)
    if not isinstance(actuals, (pd.Series, pd.DataFrame)):
        actuals = np.asarray(actuals, dtype=float)

    # Take the absolute value of the whole ratio so that negative actuals do
    # not produce negative "absolute" errors.
    return np.abs((predictions - actuals) / actuals).mean()
 | 
						|
 | 
						|
 | 
						|
def create_evaluation_df(predictions, test_inputs, H, scaler):
    """Build a long-format data frame pairing each prediction with its actual.

    Parameters:
        predictions: 2-D array-like with one column per horizon step
            (``H`` columns in total).
        test_inputs: object exposing ``.dataframe`` (its index supplies the
            timestamps) and ``['target']`` (the actual values, laid out like
            ``predictions``).
        H: the forecast horizon, i.e. the number of 't+k' columns.
        scaler: object whose ``inverse_transform`` maps the scaled
            prediction/actual columns back to their original units.

    Returns:
        A data frame with the columns ``timestamp``, ``h``, ``prediction``
        and ``actual``, holding one row per (timestamp, horizon) pair.
    """
    horizon_labels = ['t+{}'.format(step) for step in range(1, H+1)]
    wide = pd.DataFrame(predictions, columns=horizon_labels)
    wide['timestamp'] = test_inputs.dataframe.index

    # Melting stacks the horizon columns one below the other ...
    long_df = wide.melt(id_vars='timestamp', var_name='h', value_name='prediction')
    # ... so the actuals are flattened horizon by horizon to line up with it.
    long_df['actual'] = np.transpose(test_inputs['target']).ravel()

    value_cols = ['prediction', 'actual']
    long_df[value_cols] = scaler.inverse_transform(long_df[value_cols])
    return long_df
 | 
						|
 | 
						|
 | 
						|
class TimeSeriesTensor(UserDict):
    """A dictionary of tensors for input into the RNN model.

    Use this class to:
      1. Shift the values of the time series to create a Pandas dataframe containing all the data
         for a single training example
      2. Discard any samples with missing values
      3. Transform this Pandas dataframe into a numpy array of shape
         (samples, time steps, features) for input into Keras

    The class takes the following parameters:
       - **dataset**: original time series (a Pandas dataframe; its index must support
             shifting by ``freq``)
       - **target**: name of the target column
       - **H**: the forecast horizon
       - **tensor_structure**: a dictionary describing the tensor structure of the form
             { 'tensor_name' : (range(max_backward_shift, max_forward_shift), [feature, feature, ...] ) }
             if features are non-sequential and should not be shifted, use the form
             { 'tensor_name' : (None, [feature, feature, ...])}
       - **freq**: time series frequency (default 'H' - hourly)
       - **drop_incomplete**: (Boolean) whether to drop incomplete samples (default True)

    After construction the tensors are available by name, e.g. ``inputs['target']``,
    and the shifted dataframe they were built from is kept in ``inputs.dataframe``.
    """

    def __init__(self, dataset, target, H, tensor_structure, freq='H', drop_incomplete=True):
        super().__init__()
        self.dataset = dataset
        self.target = target
        self.tensor_structure = tensor_structure
        self.tensor_names = list(tensor_structure.keys())

        self.dataframe = self._shift_data(H, freq, drop_incomplete)
        # ``data`` is the mapping UserDict serves item access from.
        self.data = self._df2tensors(self.dataframe)

    def _shift_data(self, H, freq, drop_incomplete):
        """Build the shifted dataframe that holds one training sample per row.

        Uses the tensor_structure definitions to shift the features in the original
        dataset. The result is a Pandas dataframe with multi-index columns in the hierarchy
            tensor - the name of the input tensor
            feature - the input feature to be shifted
            time step - the time step for the RNN in which the data is input. These labels
                are centred on time t, the forecast creation time

        Rows containing any missing value are removed when ``drop_incomplete`` is true.
        """
        df = self.dataset.copy()

        idx_tuples = []
        # Targets: the value of the target column 1..H steps after time t.
        for t in range(1, H+1):
            df['t+'+str(t)] = df[self.target].shift(t*-1, freq=freq)
            idx_tuples.append(('target', 'y', 't+'+str(t)))

        for name, structure in self.tensor_structure.items():
            rng = structure[0]
            dataset_cols = structure[1]

            for col in dataset_cols:

                # do not shift non-sequential 'static' features
                if rng is None:
                    # The staging column name includes the tensor name so that the
                    # same feature can appear in several static tensors without one
                    # assignment overwriting the other (which would leave fewer
                    # columns than index tuples).
                    df[name+'_'+col+'_static'] = df[col]
                    idx_tuples.append((name, col, 'static'))

                else:
                    for t in rng:
                        # Label steps as 't-2', 't-1', 't', 't+1', ...
                        sign = '+' if t > 0 else ''
                        shift = str(t) if t != 0 else ''
                        period = 't'+sign+shift
                        shifted_col = name+'_'+col+'_'+period
                        df[shifted_col] = df[col].shift(t*-1, freq=freq)
                        idx_tuples.append((name, col, period))

        # Only the derived columns remain; they are in the same order as idx_tuples.
        df = df.drop(self.dataset.columns, axis=1)
        idx = pd.MultiIndex.from_tuples(idx_tuples, names=['tensor', 'feature', 'time step'])
        df.columns = idx

        if drop_incomplete:
            df = df.dropna(how='any')

        return df

    def _df2tensors(self, dataframe):
        """Transform the shifted Pandas dataframe into multidimensional numpy arrays.

        These arrays can be used to input into the keras model and can be accessed by
        tensor name. For example, for a TimeSeriesTensor object named "model_inputs" and
        a tensor named "target", the input tensor can be accessed with
        model_inputs['target'].

        Returns a dict mapping 'target' and every tensor name to an array: sequential
        tensors have shape (samples, time steps, features), static tensors have shape
        (samples, features).
        """
        inputs = {}
        # ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0.
        inputs['target'] = dataframe['target'].values

        for name, structure in self.tensor_structure.items():
            rng = structure[0]
            cols = structure[1]
            tensor = dataframe[name][cols].values
            if rng is None:
                tensor = tensor.reshape(tensor.shape[0], len(cols))
            else:
                # Columns are laid out feature-major (all time steps of the first
                # feature, then the next feature, ...), so reshape to
                # (samples, features, time steps) and swap the last two axes.
                tensor = tensor.reshape(tensor.shape[0], len(cols), len(rng))
                tensor = np.transpose(tensor, axes=[0, 2, 1])
            inputs[name] = tensor

        return inputs

    def subset_data(self, new_dataframe):
        """Recreate the input tensors after the shifted dataframe has been filtered.

        ``new_dataframe`` must keep the multi-index column layout of ``self.dataframe``;
        it replaces ``self.dataframe`` and the tensors are rebuilt from it.
        """
        self.dataframe = new_dataframe
        self.data = self._df2tensors(self.dataframe)
 |