Source code for citylearn.end_use_load_profiles.lstm_model.preprocessing

from typing import Any, List, Mapping, Tuple
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from citylearn.building import Building
from citylearn.data import get_settings
from citylearn.preprocessing import Normalize, PeriodicNormalization

def preprocess_df(config: Mapping[str, Any], df: pd.DataFrame, train_references: List[int] = None, validation_references: List[int] = None, test_references: List[int] = None) -> Mapping[str, Any]:
    ideal_reference = 0
    # # including free-float in training makes the model worse so it is excluded for now.
    # # Nevertheless, prediction for free-float is still decent despite not including it in the training.
    # free_float_reference = 1
    partial_references = [2, 3, 4, 5]
    train_references = partial_references[:2] if train_references is None else train_references
    validation_references = [partial_references[2]] if validation_references is None else validation_references
    test_references = [partial_references[3]] if test_references is None else test_references

    # observation names
    observation_names = get_settings()['schema']['template']['buildings']['Building_1']['dynamics']['attributes']['input_observation_names']
    target = observation_names[-1]
    periodic_observations = Building.get_periodic_observation_metadata()

    # periodic normalization
    for k, v in periodic_observations.items():
        result = df[k]*PeriodicNormalization(x_max=v[-1])
        result = pd.DataFrame(result.tolist(), index=result.index)
        df[f'{k}_sin'] = result[0].tolist()
        df[f'{k}_cos'] = result[1].tolist()

    # set min-max normalization limits
    normalization_minimum = df[observation_names].min().values.tolist()
    normalization_maximum = df[observation_names].max().values.tolist()

    # min-max normalization
    for c in observation_names:
        df[c] = df[c]*Normalize(df[c].min(), df[c].max())

    # check to make sure there is one year's worth of data
    months = list(sorted(df['month'].unique()))
    assert len(months) == 12, f'Expected 12 months, got {len(months)} months.'

    # training data are ideal load data for every third month beginning from January,
    # plus 2 partial load datasets for the entire year (free-float is excluded; see note above)
    train_df = df[
        ((df['reference']==ideal_reference) & (df['month'].isin([months[i] for i in range(0, len(months), 3)])))
        | (df['reference'].isin(train_references))
    ][observation_names].copy()
    X_train, y_train = sliding_windows(train_df.to_numpy(), config['lb'], 1)
    train_df, train_loader = dataset_dataloader(X_train, y_train, config['batch_size'])

    # validation data are ideal load data for every third month beginning from February,
    # plus 1 partial load dataset for the entire year
    validation_df = df[
        ((df['reference']==ideal_reference) & (df['month'].isin([months[i] for i in range(1, len(months), 3)])))
        | (df['reference'].isin(validation_references))
    ][observation_names].copy()
    X_val, y_val = sliding_windows(validation_df.to_numpy(), config['lb'], 1)
    val_df, val_loader = dataset_dataloader(X_val, y_val, config['batch_size'])

    # test data are ideal load data for every third month beginning from March,
    # plus 1 partial load dataset for the entire year
    test_df = df[
        ((df['reference']==ideal_reference) & (df['month'].isin([months[i] for i in range(2, len(months), 3)])))
        | (df['reference'].isin(test_references))
    ].copy()
    test_df_by_season = test_df.copy()
    test_df = test_df[observation_names].copy()
    X_test, y_test = sliding_windows(test_df.to_numpy(), config['lb'], 1)
    test_df, test_loader = dataset_dataloader(X_test, y_test, config['batch_size'])

    # seasonal test data are the same as the test data but in 3-month sequences
    test_df_by_season = [
        test_df_by_season[test_df_by_season['month'].isin(months[i:i+3])][observation_names]
        for i in range(0, len(months), 3)
    ]
    test_loader_by_season = [dataset_dataloader(
        *sliding_windows(df.to_numpy(), config['lb'], 1),
        config['batch_size']
    )[1] for df in test_df_by_season]

    return {
        'temp_limits': {
            'min': normalization_minimum[observation_names.index(target)],
            'max': normalization_maximum[observation_names.index(target)]
        },
        'loaders': {
            'train': train_loader,
            'val': val_loader,
            'test': test_loader,
            'test_by_season': test_loader_by_season
        },
        'train': {'X': X_train, 'y': y_train},
        'val': {'X': X_val, 'y': y_val},
        'test': {'X': X_test, 'y': y_test},
        'observation_metadata': {
            'input_observation_names': observation_names,
            'input_normalization_minimum': normalization_minimum,
            'input_normalization_maximum': normalization_maximum,
        }
    }
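A minimal usage sketch of preprocess_df, not part of the module: the config keys 'lb' (lookback) and 'batch_size' are the ones read above, but their values and the CSV path below are assumptions; df must hold a full year of data with 'reference' and 'month' columns plus the schema's input observation columns.

# Hypothetical call; config values and file path are placeholders.
config = {'lb': 12, 'batch_size': 256}
df = pd.read_csv('load_profiles.csv')  # hypothetical file with reference/month/observation columns
data = preprocess_df(config, df)
train_loader = data['loaders']['train']
temp_min, temp_max = data['temp_limits']['min'], data['temp_limits']['max']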
def dataset_dataloader(x: np.ndarray, y: np.ndarray, batch_size: int, shuffle: bool = None, drop_last: bool = None) -> Tuple[TensorDataset, DataLoader]:
    shuffle = True if shuffle is None else shuffle
    drop_last = True if drop_last is None else drop_last
    tensor = TensorDataset(torch.from_numpy(x.astype(np.float32)), torch.from_numpy(y.astype(np.float32)))
    loader = DataLoader(tensor, shuffle=shuffle, batch_size=batch_size, drop_last=drop_last)

    return tensor, loader
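A minimal sketch of dataset_dataloader on toy arrays; the shapes follow the sliding_windows output convention and the values are random placeholders. With the default shuffle=True and drop_last=True, 100 samples at batch size 16 yield 6 full batches.

# Toy arrays shaped like sliding_windows output: (samples, lookback, features) and (samples, output_len).
x = np.random.rand(100, 24, 5)
y = np.random.rand(100, 1)
dataset, loader = dataset_dataloader(x, y, batch_size=16)
x_batch, y_batch = next(iter(loader))  # batches of torch.Size([16, 24, 5]) and torch.Size([16, 1])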
def sliding_windows(data: np.ndarray, seq_length: int, output_len: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Note: the variable to be predicted must be the last column of ``data``.

    :param data: array of shape (timesteps, variables) with the target in the last column
    :param seq_length: lookback
    :param output_len: how many timesteps ahead will be predicted
    :return: x = array [timesteps - lookback, lookback, number of variables]
        (the input variables plus the lagged target);
        y = array [timesteps - lookback, output_len]
    """

    x = []
    y = []

    for i in range(len(data) - seq_length):
        _x = data[(i + 1):(i + 1 + seq_length), :-1]
        T_lag = data[i:(i + seq_length), -1]
        _y = data[(i + seq_length):(i + seq_length + output_len), -1]  # output_len > 1 predicts more than one timestep ahead
        _x = np.column_stack([_x, T_lag])
        x.append(_x)
        y.append(_y)

    return np.array(x), np.array(y)
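A worked toy example of the windowing, assuming a random array whose last column is the target as the docstring requires: 100 timesteps with a lookback of 24 produce 76 windows.

# 100 timesteps, 4 input columns plus the target in the last column.
data = np.random.rand(100, 5)
x, y = sliding_windows(data, seq_length=24, output_len=1)
print(x.shape, y.shape)  # (76, 24, 5) (76, 1); the lagged target is stacked as the last input feature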