from typing import Any, List, Mapping, Optional, Tuple
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from citylearn.building import Building
from citylearn.data import get_settings
from citylearn.preprocessing import Normalize, PeriodicNormalization
def preprocess_df(
    config: Mapping[str, Any],
    df: pd.DataFrame,
    train_references: Optional[List[int]] = None,
    validation_references: Optional[List[int]] = None,
    test_references: Optional[List[int]] = None,
) -> Mapping[str, Any]:
    """Prepare building load data for LSTM temperature-dynamics training.

    Applies periodic (sin-cos) and min-max normalization to the schema's
    input observations, splits the records into training, validation and
    test sets by simulation reference and month, then returns sliding-window
    arrays and DataLoaders for each split, along with normalization metadata
    and the target's min-max limits.
    """
    ideal_reference = 0
    # Free-float data (reference 1) are excluded: including them in training
    # made the model worse, yet free-float predictions remain decent even
    # without them.
    # free_float_reference = 1
    partial_references = [2, 3, 4, 5]
    train_references = partial_references[:2] if train_references is None else train_references
    validation_references = [partial_references[2]] if validation_references is None else validation_references
    test_references = [partial_references[3]] if test_references is None else test_references
# observation names
observation_names = get_settings()['schema']['template']['buildings']['Building_1']['dynamics']['attributes']['input_observation_names']
target = observation_names[-1]
periodic_observations = Building.get_periodic_observation_metadata()
# periodic normalization
for k, v in periodic_observations.items():
result = df[k]*PeriodicNormalization(x_max=v[-1])
result = pd.DataFrame(result.tolist(), index=result.index)
df[f'{k}_sin'] = result[0].tolist()
df[f'{k}_cos'] = result[1].tolist()
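    # each periodic feature is now encoded as a point on the unit circle so
    # that, e.g., hour 24 and hour 1 are adjacent rather than maximally distant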
# set min-max normalization limits
normalization_minimum = df[observation_names].min().values.tolist()
normalization_maximum = df[observation_names].max().values.tolist()
# min-max normalization
for c in observation_names:
df[c] = df[c]*Normalize(df[c].min(), df[c].max())
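    # inputs are now scaled to [0, 1]; the limits saved above allow the target
    # to be mapped back to its original units after prediction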
    # check to make sure there is one year's worth of data
months = list(sorted(df['month'].unique()))
assert len(months) == 12, f'Expected 12 months, got {len(months)} months.'
    # training data are the ideal load data for every third month starting in
    # January, plus 2 partial load datasets for the entire year
train_df = df[
((df['reference']==ideal_reference) & (df['month'].isin([months[i] for i in range(0, len(months), 3)])))
| (df['reference'].isin(train_references))
][observation_names].copy()
    X_train, y_train = sliding_windows(train_df.to_numpy(), config['lb'], 1)
    train_dataset, train_loader = dataset_dataloader(X_train, y_train, config['batch_size'])
    # validation data are the ideal load data for every third month starting in
    # February, plus 1 partial load dataset for the entire year
validation_df = df[
((df['reference']==ideal_reference) & (df['month'].isin([months[i] for i in range(1, len(months), 3)])))
| (df['reference'].isin(validation_references))
][observation_names].copy()
    X_val, y_val = sliding_windows(validation_df.to_numpy(), config['lb'], 1)
    val_dataset, val_loader = dataset_dataloader(X_val, y_val, config['batch_size'])
    # test data are the ideal load data for every third month starting in
    # March, plus 1 partial load dataset for the entire year
test_df = df[
((df['reference']==ideal_reference) & (df['month'].isin([months[i] for i in range(2, len(months), 3)])))
| (df['reference'].isin(test_references))
].copy()
test_df_by_season = test_df.copy()
test_df = test_df[observation_names].copy()
    X_test, y_test = sliding_windows(test_df.to_numpy(), config['lb'], 1)
    test_dataset, test_loader = dataset_dataloader(X_test, y_test, config['batch_size'])
    # seasonal test data are the same as the test data but split into 3-month (seasonal) chunks
test_df_by_season = [
test_df_by_season[test_df_by_season['month'].isin(months[i:i+3])][observation_names]
for i in range(0, len(months), 3)
]
    test_loader_by_season = [dataset_dataloader(
        *sliding_windows(season_df.to_numpy(), config['lb'], 1),
        config['batch_size']
    )[1] for season_df in test_df_by_season]
return {
'temp_limits': {
'min': normalization_minimum[observation_names.index(target)],
'max': normalization_maximum[observation_names.index(target)]
},
'loaders': {
'train': train_loader,
'val': val_loader,
'test': test_loader,
'test_by_season': test_loader_by_season
},
'train': {
'X': X_train,
'y': y_train
},
'val': {
'X': X_val,
'y': y_val
},
'test': {
'X': X_test,
'y': y_test
},
'observation_metadata': {
'input_observation_names': observation_names,
'input_normalization_minimum': normalization_minimum,
'input_normalization_maximum': normalization_maximum,
}
}
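
# Example usage of preprocess_df (a minimal sketch: the lookback of 12 and
# batch size of 64 are hypothetical, and `df` is assumed to hold 'reference'
# and 'month' columns plus every column named in the schema's
# input_observation_names):
#
#     data = preprocess_df({'lb': 12, 'batch_size': 64}, df)
#     for X, y in data['loaders']['train']:
#         pass  # X: (batch_size, lb, n_inputs), y: (batch_size, 1)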
def dataset_dataloader(x: np.ndarray, y: np.ndarray, batch_size: int, shuffle: bool = True, drop_last: bool = True) -> Tuple[TensorDataset, DataLoader]:
    """Wrap feature and target arrays in a TensorDataset and a DataLoader."""
    dataset = TensorDataset(torch.from_numpy(x.astype(np.float32)), torch.from_numpy(y.astype(np.float32)))
    loader = DataLoader(dataset, shuffle=shuffle, batch_size=batch_size, drop_last=drop_last)
    return dataset, loader
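
# Example usage of dataset_dataloader (hypothetical arrays; any feature and
# target arrays of equal length work):
#
#     X = np.random.rand(100, 12, 5)
#     y = np.random.rand(100, 1)
#     dataset, loader = dataset_dataloader(X, y, batch_size=16)
#     for batch_x, batch_y in loader:
#         pass  # batch_x: (16, 12, 5), batch_y: (16, 1)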
def sliding_windows(data: np.ndarray, seq_length: int, output_len: int) -> Tuple[np.ndarray, np.ndarray]:
    """Build lookback windows for sequence-to-one prediction.

    The variable to be predicted must be the last column of `data`.

    :param data: 2-D array of input observations, target in the last column
    :param seq_length: lookback, i.e. number of past timesteps per window
    :param output_len: how many timesteps ahead will be predicted
    :return: x, matrix [number of timesteps - lookback, lookback, number of
        input variables]; y, matrix [number of timesteps - lookback, number
        of output variables]
    """
x = []
y = []
    for i in range(len(data) - seq_length - output_len + 1):
        # exogenous inputs, shifted one step forward so the window ends at the
        # prediction timestep
        _x = data[(i + 1):(i + 1 + seq_length), :-1]
        # lagged values of the target over the lookback window
        T_lag = data[i:(i + seq_length), -1]
        # target value(s) at the output_len timestep(s) following the window
        _y = data[(i + seq_length):(i + seq_length + output_len), -1]
        _x = np.column_stack([_x, T_lag])
        x.append(_x)
        y.append(_y)
return np.array(x), np.array(y)
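
# Example usage of sliding_windows (a toy two-column array; the last column is
# the prediction target):
#
#     data = np.arange(20, dtype=float).reshape(10, 2)
#     X, y = sliding_windows(data, seq_length=3, output_len=1)
#     # X.shape == (7, 3, 2): the exogenous column over timesteps t+1..t+3
#     #     stacked with the lagged target over timesteps t..t+2
#     # y.shape == (7, 1): the target at timestep t+3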