Source code for citylearn.agents.q_learning

import math
from typing import Any, List, Tuple
import numpy as np
from citylearn.agents.base import Agent
from citylearn.citylearn import CityLearnEnv

class TabularQLearning(Agent):
    """Implementation of the Tabular Q-Learning algorithm for discrete observation and action spaces with epsilon-greedy action selection.

    Parameters
    ----------
    env: CityLearnEnv
        CityLearn environment.
    epsilon: float, default: 1.0
        Exploration rate.
    minimum_epsilon: float, default: 0.01
        Minimum value the exploration rate can decay to.
    epsilon_decay: float, default: 0.0001
        :code:`epsilon` exponential decay rate.
    learning_rate: float, default: 0.05
        Defines to what degree new knowledge overrides old knowledge: for :code:`learning_rate` = 0, no learning happens, while for :code:`learning_rate` = 1, all prior knowledge is lost.
    discount_factor: float, default: 0.90
        Balance between an agent that considers only immediate rewards (:code:`discount_factor` = 0) and one that strives towards long term rewards (:code:`discount_factor` = 1).
    q_init_value: float, default: np.nan
        Q-Table initialization value.

    Other Parameters
    ----------------
    **kwargs: Any
        Other keyword arguments used to initialize :py:class:`citylearn.agents.base.Agent` super class.
    """

    def __init__(
        self, env: CityLearnEnv, epsilon: float = None, minimum_epsilon: float = None,
        epsilon_decay: float = None, learning_rate: float = None, discount_factor: float = None,
        q_init_value: float = None, **kwargs: Any,
    ):
        super().__init__(env, **kwargs)
        self.epsilon = 1.0 if epsilon is None else epsilon
        self.epsilon_init = self.epsilon
        self.minimum_epsilon = 0.01 if minimum_epsilon is None else minimum_epsilon
        self.epsilon_decay = 0.0001 if epsilon_decay is None else epsilon_decay
        self.learning_rate = 0.05 if learning_rate is None else learning_rate
        self.discount_factor = 0.90 if discount_factor is None else discount_factor
        self.q_init_value = np.nan if q_init_value is None else q_init_value
        self.q, self.q_exploration, self.q_exploitation = self.__initialize_q()
        self.__explored = False
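    # Usage sketch (illustrative only, not part of the class): the agent expects
    # discrete observation and action spaces, so the environment is typically
    # discretized before construction. The schema path below is hypothetical.
    #
    #   env = CityLearnEnv('path/to/schema.json', central_agent=False)
    #   model = TabularQLearning(
    #       env, epsilon=1.0, minimum_epsilon=0.01, epsilon_decay=0.0001,
    #       learning_rate=0.05, discount_factor=0.90, q_init_value=np.nan,
    #   )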
    def predict(self, observations: List[List[float]], deterministic: bool = None) -> List[List[float]]:
        """Provide actions for the current time step.

        If `deterministic` = True, or a randomly generated number is greater than `epsilon`, return the deterministic action from the Q-Table, i.e. the action with the maximum Q-value for the given observations. Otherwise, return a randomly sampled action.

        Parameters
        ----------
        observations: List[List[float]]
            Environment observations.
        deterministic: bool, default: False
            Whether to return purely exploitative deterministic actions.

        Returns
        -------
        actions: List[List[float]]
            Action values.
        """

        deterministic = False if deterministic is None else deterministic
        actions = None
        seed = self.random_seed if self.random_seed is None else self.random_seed + self.time_step
        nprs = np.random.RandomState(seed)

        if deterministic or nprs.random() > self.epsilon:
            # use q-function to decide action
            actions = self.__exploit(observations)
            self.__explored = False

        else:
            # explore random action
            actions = [[s.sample()] for s in self.action_space]
            self.__explored = True

        # exponential decay
        episode = int(self.time_step/self.episode_time_steps)
        self.epsilon = max(self.minimum_epsilon, self.epsilon_init*np.exp(-self.epsilon_decay*episode))

        self.actions = actions
        self.next_time_step()

        return actions
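    # Note on the decay schedule (sketch): epsilon is recomputed from the episode
    # index at every call as
    #   epsilon = max(minimum_epsilon, epsilon_init*exp(-epsilon_decay*episode))
    # so with the defaults (epsilon_init=1.0, epsilon_decay=0.0001), epsilon is
    # roughly exp(-0.1) ≈ 0.90 after 1,000 episodes and only reaches
    # minimum_epsilon=0.01 after about 46,000 episodes.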
    def __exploit(self, observations: List[List[float]]) -> List[List[float]]:
        """Select deterministic actions from Q-Table i.e. action with max Q-value for given observations."""

        actions = []

        for i, o in enumerate(observations):
            o = o[0]

            try:
                a = np.nanargmax(self.q[i][o])

            except ValueError:
                # when all values for observation are still NaN
                a = self.action_space[i].sample()

            actions.append([a])

        return actions
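    # Note (sketch): because the Q-Table defaults to np.nan for unvisited
    # state-action pairs, np.nanargmax above only ranks actions that have been
    # updated at least once; a state whose row is entirely NaN raises ValueError
    # and falls back to a randomly sampled action.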
    def update(self, observations: List[List[float]], actions: List[List[float]], reward: List[float], next_observations: List[List[float]], terminated: bool, truncated: bool):
        r"""Update Q-Table using Bellman equation.

        Parameters
        ----------
        observations : List[List[float]]
            Previous time step observations.
        actions : List[List[float]]
            Previous time step actions.
        reward : List[float]
            Current time step reward.
        next_observations : List[List[float]]
            Current time step observations.
        terminated : bool
            Indication that episode has ended.
        truncated : bool
            If episode truncates due to a time limit or a reason that is not defined as part of the task MDP.
        """

        # compute temporal difference target and error to update q-function
        for i, (o, a, r, n) in enumerate(zip(observations, actions, reward, next_observations)):
            o, n, a = o[0], n[0], a[0]
            current_q = self.q[i][o, a]
            current_q = 0.0 if math.isnan(current_q) else current_q

            try:
                # bootstrap with the maximum Q-value (not its index) for the next observation
                next_max_q = self.q[i][n, np.nanargmax(self.q[i][n])]

            except ValueError:
                # when all values for observation are still NaN
                next_max_q = 0.0

            # update q
            new_q = current_q + self.learning_rate*(r + self.discount_factor*next_max_q - current_q)
            self.q[i][o, a] = new_q

            # update exploration-exploitation count
            if self.__explored:
                self.q_exploration[i][o, a] += 1

            else:
                self.q_exploitation[i][o, a] += 1
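    # Worked form of the update above (sketch): with alpha = learning_rate and
    # gamma = discount_factor, each entry is adjusted as
    #   Q(s, a) <- Q(s, a) + alpha*(r + gamma*max_a' Q(s', a') - Q(s, a))
    # e.g. Q(s, a)=0, r=1, max_a' Q(s', a')=2, alpha=0.05, gamma=0.90 gives
    # Q(s, a) = 0 + 0.05*(1 + 0.9*2 - 0) = 0.14.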
    def __initialize_q(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Initialize all values in Q-Table with `q_init_value`."""

        q = [None for _ in self.observation_space]
        q_exploration = [None for _ in self.observation_space]
        q_exploitation = [None for _ in self.observation_space]

        for i, (od, ad) in enumerate(zip(self.observation_space, self.action_space)):
            shape = (od.n, ad.n)
            q[i] = np.ones(shape=shape)*self.q_init_value
            q_exploration[i] = np.zeros(shape=shape)
            q_exploitation[i] = np.zeros(shape=shape)

        return q, q_exploration, q_exploitation
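# Example training loop (a minimal sketch; assumes a schema with discrete
# observation and action spaces and a Gymnasium-style reset()/step() API, which
# the terminated/truncated arguments of update() imply; the schema path is
# hypothetical):
#
#   env = CityLearnEnv('path/to/schema.json', central_agent=False)
#   model = TabularQLearning(env)
#
#   observations, _ = env.reset()
#   terminated = False
#
#   while not terminated:
#       actions = model.predict(observations)
#       next_observations, reward, terminated, truncated, _ = env.step(actions)
#       model.update(observations, actions, reward, next_observations, terminated, truncated)
#       observations = next_observations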