mirror of https://github.com/gryf/coach.git (synced 2026-02-16 05:55:46 +01:00)
update of api docstrings across coach and tutorials [WIP] (#91)
* updating the documentation website
* adding the built docs
* update of api docstrings across coach and tutorials 0-2
* added some missing api documentation
* New Sphinx based documentation
@@ -36,25 +36,25 @@ from rl_coach.utils import last_sample
class ActorCriticAlgorithmParameters(AlgorithmParameters):
"""
:param policy_gradient_rescaler: (PolicyGradientRescaler)
The value that will be used to rescale the policy gradient

:param apply_gradients_every_x_episodes: (int)
The number of episodes to wait before applying the accumulated gradients to the network.
The training iterations only accumulate gradients without actually applying them.

:param beta_entropy: (float)
The weight that will be given to the entropy regularization, which is used in order to improve exploration.

:param num_steps_between_gradient_updates: (int)
Every num_steps_between_gradient_updates transitions will be considered as a single batch and used for
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.

:param gae_lambda: (float)
If the policy gradient rescaler was defined as PolicyGradientRescaler.GAE, the generalized advantage estimation
scheme will be used, in which case the lambda value controls the decay for the different n-step lengths.

:param estimate_state_value_using_gae: (bool)
If set to True, the state value targets for the V head will be estimated using the GAE scheme.
"""
def __init__(self):
super().__init__()
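
As a quick illustration of how gae_lambda weights the different n-step estimates, here is a minimal NumPy sketch of generalized advantage estimation (illustrative only, not part of this commit; the function and variable names are made up):

import numpy as np

def gae_advantages(rewards, values, bootstrap_value, discount=0.99, gae_lambda=0.95):
    rewards = np.asarray(rewards, dtype=float)
    values = np.append(np.asarray(values, dtype=float), bootstrap_value)
    deltas = rewards + discount * values[1:] - values[:-1]      # one-step TD errors
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + discount * gae_lambda * running   # lambda-weighted sum of n-step errors
        advantages[t] = running
    return advantages

With gae_lambda = 1 this reduces to full Monte Carlo advantages, while gae_lambda = 0 keeps only the one-step TD error.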

@@ -39,7 +39,7 @@ from rl_coach.memories.backend.memory_impl import get_memory_backend
class Agent(AgentInterface):
def __init__(self, agent_parameters: AgentParameters, parent: Union['LevelManager', 'CompositeAgent']=None):
"""
-:param agent_parameters: A Preset class instance with all the running paramaters
+:param agent_parameters: An AgentParameters class instance with all the agent parameters
"""
super().__init__()
self.ap = agent_parameters
@@ -175,18 +175,20 @@ class Agent(AgentInterface):
np.random.seed()

@property
-def parent(self):
+def parent(self) -> 'LevelManager':
"""
Get the parent class of the agent

:return: the current parent
"""
return self._parent

@parent.setter
-def parent(self, val):
+def parent(self, val) -> None:
"""
Change the parent class of the agent.
Additionally, updates the full name of the agent

:param val: the new parent
:return: None
"""
@@ -196,7 +198,12 @@
raise ValueError("The parent of an agent must have a name")
self.full_name_id = self.ap.full_name_id = "{}/{}".format(self._parent.name, self.name)

-def setup_logger(self):
+def setup_logger(self) -> None:
"""
Setup the logger for the agent

:return: None
"""
# dump documentation
logger_prefix = "{graph_name}.{level_name}.{agent_full_id}".\
format(graph_name=self.parent_level_manager.parent_graph_manager.name,
@@ -212,6 +219,7 @@
def set_session(self, sess) -> None:
"""
Set the deep learning framework session for all the agents in the composite agent

:return: None
"""
self.input_filter.set_session(sess)
@@ -223,6 +231,7 @@
dump_one_value_per_step: bool=False) -> Signal:
"""
Register a signal such that its statistics will be dumped and be viewable through dashboard

:param signal_name: the name of the signal as it will appear in dashboard
:param dump_one_value_per_episode: should the signal value be written for each episode?
:param dump_one_value_per_step: should the signal value be written for each step?
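
A hypothetical usage sketch for this API (the signal name and values are made up; it assumes the returned Signal object exposes add_sample, as used elsewhere in Coach):

# inside a custom agent's __init__:
self.q_values_signal = self.register_signal('Q Values',
                                            dump_one_value_per_episode=True,
                                            dump_one_value_per_step=False)

# later, e.g. while acting or training:
self.q_values_signal.add_sample(0.42)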

@@ -239,6 +248,7 @@
"""
Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
dependent on those values, by calling init_environment_dependent_modules

:param spaces: the environment spaces definition
:return: None
"""
@@ -274,6 +284,7 @@
Create all the networks of the agent.
The network creation will be done after setting the environment parameters for the agent, since they are needed
for creating the network.

:return: A list containing all the networks
"""
networks = {}
@@ -295,6 +306,7 @@
"""
Initialize any modules that depend on knowing information about the environment such as the action space or
the observation space

:return: None
"""
# initialize exploration policy
@@ -314,13 +326,19 @@

@property
def phase(self) -> RunPhase:
"""
The current running phase of the agent

:return: RunPhase
"""
return self._phase

@phase.setter
def phase(self, val: RunPhase) -> None:
"""
Change the phase of the run for the agent and all the sub components
-:param phase: the new run phase (TRAIN, TEST, etc.)

+:param val: the new run phase (TRAIN, TEST, etc.)
:return: None
"""
self.reset_evaluation_state(val)
@@ -328,6 +346,14 @@
self.exploration_policy.change_phase(val)

def reset_evaluation_state(self, val: RunPhase) -> None:
"""
Perform accumulators initialization when entering an evaluation phase, and signal dumping when exiting an
evaluation phase. Entering or exiting the evaluation phase is determined according to the new phase given
by val, and by the current phase set in self.phase.

:param val: The new phase to change to
:return: None
"""
starting_evaluation = (val == RunPhase.TEST)
ending_evaluation = (self.phase == RunPhase.TEST)

@@ -363,6 +389,7 @@
This function is a wrapper to allow having the same calls for shared or unshared memories.
It should be used instead of calling the memory directly in order to allow different algorithms to work
both with a shared and a local memory.

:param func: the name of the memory function to call
:param args: the arguments to supply to the function
:return: the return value of the function
@@ -375,7 +402,12 @@
result = getattr(self.memory, func)(*args)
return result

-def log_to_screen(self):
+def log_to_screen(self) -> None:
"""
Write an episode summary line to the terminal

:return: None
"""
# log to screen
log = OrderedDict()
log["Name"] = self.full_name_id
@@ -388,9 +420,10 @@
log["Training iteration"] = self.training_iteration
screen.log_dict(log, prefix=self.phase.value)

-def update_step_in_episode_log(self):
+def update_step_in_episode_log(self) -> None:
"""
-Writes logging messages to screen and updates the log file with all the signal values.
+Updates the in-episode log file with all the signal values from the most recent step.

:return: None
"""
# log all the signals to file
@@ -411,9 +444,12 @@
# dump
self.agent_episode_logger.dump_output_csv()

-def update_log(self):
+def update_log(self) -> None:
"""
-Writes logging messages to screen and updates the log file with all the signal values.
+Updates the episodic log file with all the signal values from the most recent episode.
+Additional signals for logging can be set by creating a new signal using self.register_signal,
+and then updating it with some internal agent values.

:return: None
"""
# log all the signals to file
@@ -438,7 +474,6 @@
self.agent_logger.create_signal_value('Shaped Evaluation Reward', np.nan, overwrite=False)
self.agent_logger.create_signal_value('Success Rate', np.nan, overwrite=False)

for signal in self.episode_signals:
self.agent_logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
self.agent_logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
@@ -452,7 +487,10 @@

def handle_episode_ended(self) -> None:
"""
-End an episode
+Make any changes needed when each episode is ended.
+This includes incrementing counters, updating full episode dependent values, updating logs, etc.
+This function is called right after each episode is ended.

:return: None
"""
self.current_episode_buffer.is_complete = True
@@ -486,9 +524,10 @@
if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
self.log_to_screen()

-def reset_internal_state(self):
+def reset_internal_state(self) -> None:
"""
-Reset all the episodic parameters
+Reset all the episodic parameters. This function is called right before each episode starts.

:return: None
"""
for signal in self.episode_signals:
@@ -516,6 +555,7 @@
def learn_from_batch(self, batch) -> Tuple[float, List, List]:
"""
Given a batch of transitions, calculates their target values and updates the network.

:param batch: A list of transitions
:return: The total loss of the training, the loss per head and the unclipped gradients
"""
@@ -524,6 +564,7 @@
def _should_update_online_weights_to_target(self):
"""
Determine if online weights should be copied to the target.

:return: boolean: True if the online weights should be copied to the target.
"""

@@ -542,9 +583,10 @@
"EnvironmentSteps or TrainingSteps. Instead it is {}".format(step_method.__class__))
return should_update

-def _should_train(self, wait_for_full_episode=False):
+def _should_train(self, wait_for_full_episode=False) -> bool:
"""
Determine if we should start a training phase according to the number of steps passed since the last training

:return: boolean: True if we should start a training phase
"""

@@ -580,11 +622,12 @@

return should_update

-def train(self):
+def train(self) -> float:
"""
Check if a training phase should be done as configured by num_consecutive_playing_steps.
If it should, then do several training steps as configured by num_consecutive_training_steps.
A single training iteration: Sample a batch, train on it and update target networks.

:return: The total training loss during the training iterations.
"""
loss = 0
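
To make the control flow described in this docstring concrete, a schematic sketch (illustrative only; the helper names are placeholders, not Coach's actual internals):

def train_schematic(should_train, sample_batch, learn_from_batch,
                    should_update_target, update_target, num_consecutive_training_steps):
    loss = 0.0
    if should_train():
        for _ in range(num_consecutive_training_steps):
            batch = sample_batch()                       # sample a batch from the memory
            batch_loss, _, _ = learn_from_batch(batch)   # (total loss, loss per head, unclipped gradients)
            loss += batch_loss
            if should_update_target():
                update_target()                          # copy the online weights to the target network
    return loss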

@@ -641,14 +684,12 @@
# run additional commands after the training is done
self.post_training_commands()

return loss

def choose_action(self, curr_state):
"""
-choose an action to act with in the current episode being played. Different behavior might be exhibited when training
-or testing.
+Choose an action to act with in the current episode being played. Different behavior might be exhibited when
+training or testing.

:param curr_state: the current state to act upon.
:return: chosen action, some action value describing the action (q-value, probability, etc.)
@@ -656,10 +697,16 @@
pass

def prepare_batch_for_inference(self, states: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
-network_name: str):
+network_name: str) -> Dict[str, np.array]:
"""
-convert curr_state into input tensors tensorflow is expecting. i.e. if we have several inputs states, stack all
+Convert curr_state into input tensors tensorflow is expecting, i.e. if we have several input states, stack all
observations together, measurements together, etc.

:param states: A list of environment states, where each one is a dict mapping from an observation name to its
corresponding observation
:param network_name: The agent network name to prepare the batch for. This is needed in order to extract only
the observation relevant for the network from the states.
:return: A dictionary containing a list of values from all the given states for each of the observations
"""
# convert to batch so we can run it through the network
states = force_list(states)
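
The stacking described above can be pictured with a small NumPy sketch (illustrative only; real Coach states may also need observation filtering and dtype handling):

import numpy as np

def stack_states(states, allowed_inputs):
    # states: list of dicts mapping observation name -> np.ndarray
    # allowed_inputs: the observation names that the chosen network actually consumes
    return {name: np.array([state[name] for state in states]) for name in allowed_inputs}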

@@ -676,7 +723,8 @@
def act(self) -> ActionInfo:
"""
Given the agent's current knowledge, decide on the next action to apply to the environment
-:return: an action and a dictionary containing any additional info from the action decision process

+:return: An ActionInfo object, which contains the action and any additional info from the action decision process
"""
if self.phase == RunPhase.TRAIN and self.ap.algorithm.num_consecutive_playing_steps.num_steps == 0:
# This agent never plays while training (e.g. behavioral cloning)
@@ -705,13 +753,20 @@

return filtered_action_info

-def run_pre_network_filter_for_inference(self, state: StateType):
+def run_pre_network_filter_for_inference(self, state: StateType) -> StateType:
"""
Run filters which were defined for being applied right before using the state for inference.

:param state: The state to run the filters on
:return: The filtered state
"""
dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False)
return self.pre_network_filter.filter(dummy_env_response)[0].next_state

def get_state_embedding(self, state: dict) -> np.ndarray:
"""
Given a state, get the corresponding state embedding from the main network

:param state: a state dict
:return: a numpy embedding vector
"""
@@ -726,6 +781,7 @@
"""
Allows agents to update the transition just before adding it to the replay buffer.
Can be useful for agents that want to tweak the reward, termination signal, etc.

:param transition: the transition to update
:return: the updated transition
"""
@@ -736,8 +792,10 @@
Given a response from the environment, distill the observation from it and store it for later use.
The response should be a dictionary containing the performed action, the new observation and measurements,
the reward, a game over flag and any additional information necessary.

:param env_response: result of call from environment.step(action)
-:return:
+:return: a boolean value which determines if the agent has decided to terminate the episode after seeing the
+given observation
"""

# filter the env_response
@@ -801,7 +859,12 @@

return transition.game_over

-def post_training_commands(self):
+def post_training_commands(self) -> None:
"""
A function which allows adding any functionality that is required to run right after the training phase ends.

:return: None
"""
pass

def get_predictions(self, states: List[Dict[str, np.ndarray]], prediction_type: PredictionType):
@@ -809,9 +872,10 @@
Get a prediction from the agent with regard to the requested prediction_type.
If the agent cannot predict this type of prediction_type, or if there is more than one possible way to do so,
raise a ValueException.
-:param states:
-:param prediction_type:
-:return:

+:param states: The states to get a prediction for
+:param prediction_type: The type of prediction to get for the states. For example, the state-value prediction.
+:return: the predicted values
"""

predictions = self.networks['main'].online_network.predict_with_prediction_type(
@@ -824,6 +888,15 @@
return list(predictions.values())[0]

def set_incoming_directive(self, action: ActionType) -> None:
"""
Allows setting a directive for the agent to follow. This is useful in hierarchy structures, where the agent
has another master agent that is controlling it. In such cases, the master agent can define the goals for the
slave agent, define its observation, possible actions, etc. The directive type is defined by the agent
in-action-space.

:param action: The action that should be set as the directive
:return:
"""
if isinstance(self.in_action_space, GoalsSpace):
self.current_hrl_goal = action
elif isinstance(self.in_action_space, AttentionActionSpace):
@@ -834,6 +907,7 @@
def save_checkpoint(self, checkpoint_id: int) -> None:
"""
Allows agents to store additional information when saving checkpoints.

:param checkpoint_id: the id of the checkpoint
:return: None
"""
@@ -842,6 +916,7 @@
def sync(self) -> None:
"""
Sync the global network parameters to local networks

:return: None
"""
for network in self.networks.values():

@@ -32,7 +32,6 @@ from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayPar
class BCAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.collect_new_data = False


class BCNetworkParameters(NetworkParameters):

@@ -33,6 +33,19 @@ class CategoricalDQNNetworkParameters(DQNNetworkParameters):


class CategoricalDQNAlgorithmParameters(DQNAlgorithmParameters):
"""
:param v_min: (float)
The minimal value that will be represented in the network output for predicting the Q value.
Corresponds to :math:`v_{min}` in the paper.

:param v_max: (float)
The maximum value that will be represented in the network output for predicting the Q value.
Corresponds to :math:`v_{max}` in the paper.

:param atoms: (int)
The number of atoms that will be used to discretize the range between v_min and v_max.
For the C51 algorithm described in the paper, the number of atoms is 51.
"""
def __init__(self):
super().__init__()
self.v_min = -10.0
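
For orientation, the discretized support implied by these parameters can be computed as follows (illustrative only; v_max = 10.0 is an assumed symmetric default, since only v_min = -10.0 is visible in this hunk):

import numpy as np

v_min, v_max, atoms = -10.0, 10.0, 51        # 51 atoms corresponds to C51
support = np.linspace(v_min, v_max, atoms)   # fixed return values the head predicts probabilities over
delta_z = (v_max - v_min) / (atoms - 1)      # spacing between adjacent atoms
# Q(s, a) is then the expectation of the predicted distribution: np.dot(probabilities[a], support)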

@@ -26,9 +26,12 @@ from rl_coach.memories.non_episodic.balanced_experience_replay import BalancedEx


class CILAlgorithmParameters(AlgorithmParameters):
"""
:param state_key_with_the_class_index: (str)
The key of the state dictionary which corresponds to the value that will be used to control the class index.
"""
def __init__(self):
super().__init__()
self.collect_new_data = False
self.state_key_with_the_class_index = 'high_level_command'



@@ -58,6 +58,47 @@ class ClippedPPONetworkParameters(NetworkParameters):


class ClippedPPOAlgorithmParameters(AlgorithmParameters):
"""
:param policy_gradient_rescaler: (PolicyGradientRescaler)
This represents how the critic will be used to update the actor. The critic value function is typically used
to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
advantage of the action, or the generalized advantage estimation (GAE) value.

:param gae_lambda: (float)
The :math:`\lambda` value is used within the GAE function in order to weight different bootstrap length
estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
n-step estimations.

:param clip_likelihood_ratio_using_epsilon: (float)
If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
implementations.

:param value_targets_mix_fraction: (float)
The targets for the value network are an exponential weighted moving average which uses this mix fraction to
define how much of the new targets will be taken into account when calculating the loss.
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.

:param estimate_state_value_using_gae: (bool)
If set to True, the state value will be estimated using the GAE technique.

:param use_kl_regularization: (bool)
If set to True, the loss function will be regularized using the KL divergence between the current and new
policy, to bound the change of the policy during the network update.

:param beta_entropy: (float)
An entropy regularization term can be added to the loss function in order to control exploration. This term
is weighted using the :math:`\beta` value defined by beta_entropy.

:param optimization_epochs: (int)
For each training phase, the collected dataset will be used for multiple epochs, which are defined by the
optimization_epochs value.

:param optimization_epochs: (Schedule)
Can be used to define a schedule over the clipping of the likelihood ratio.

"""
def __init__(self):
super().__init__()
self.num_episodes_in_experience_replay = 1000000
@@ -66,7 +107,6 @@
self.use_kl_regularization = False
self.clip_likelihood_ratio_using_epsilon = 0.2
self.estimate_state_value_using_gae = True
self.step_until_collecting_full_episodes = True
self.beta_entropy = 0.01 # should be 0 for mujoco
self.num_consecutive_playing_steps = EnvironmentSteps(2048)
self.optimization_epochs = 10
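
The role of clip_likelihood_ratio_using_epsilon can be illustrated with a minimal NumPy version of the clipped surrogate objective (illustrative only, not Coach's implementation):

import numpy as np

def clipped_surrogate_loss(ratio, advantages, epsilon=0.2):
    # ratio = pi_new(a|s) / pi_old(a|s); epsilon plays the role of clip_likelihood_ratio_using_epsilon
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
    return -np.minimum(unclipped, clipped).mean()   # negated, since the optimizer minimizes the loss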

@@ -65,6 +65,33 @@ class DDPGActorNetworkParameters(NetworkParameters):


class DDPGAlgorithmParameters(AlgorithmParameters):
"""
:param num_steps_between_copying_online_weights_to_target: (StepMethod)
The number of steps between copying the online network weights to the target network weights.

:param rate_for_copying_weights_to_target: (float)
When copying the online network weights to the target network weights, a soft update will be used, which
weights the new online network weights by rate_for_copying_weights_to_target

:param num_consecutive_playing_steps: (StepMethod)
The number of consecutive steps to act between every two training iterations

:param use_target_network_for_evaluation: (bool)
If set to True, the target network will be used for predicting the actions when choosing actions to act.
Since the target network weights change more slowly, the predicted actions will be more consistent.

:param action_penalty: (float)
The amount by which to penalize the network on high action feature (pre-activation) values.
This can prevent the action features from saturating the TanH activation function, and therefore prevent the
gradients from becoming very low.

:param clip_critic_targets: (Tuple[float, float] or None)
The range to clip the critic target to in order to prevent overestimation of the action values.

:param use_non_zero_discount_for_terminal_states: (bool)
If set to True, the discount factor will be used for terminal states to bootstrap the next predicted state
values. If set to False, the terminal states reward will be taken as the target return for the network.
"""
def __init__(self):
super().__init__()
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)

@@ -81,6 +81,35 @@ class DFPMemoryParameters(EpisodicExperienceReplayParameters):


class DFPAlgorithmParameters(AlgorithmParameters):
"""
:param num_predicted_steps_ahead: (int)
Number of future steps to predict measurements for. The future steps won't be sequential, but rather jump
in multiples of 2. For example, if num_predicted_steps_ahead = 3, then the steps will be: t+1, t+2, t+4

:param goal_vector: (List[float])
The goal vector will weight each of the measurements to form an optimization goal. The vector should have
the same length as the number of measurements, and it will be vector multiplied by the measurements.
Positive values correspond to trying to maximize the particular measurement, and negative values
correspond to trying to minimize the particular measurement.

:param future_measurements_weights: (List[float])
The future_measurements_weights weight the contribution of each of the predicted timesteps to the optimization
goal. For example, if there are 6 steps predicted ahead, and a future_measurements_weights vector with 3 values,
then only the 3 last timesteps will be taken into account, according to the weights in the
future_measurements_weights vector.

:param use_accumulated_reward_as_measurement: (bool)
If set to True, the accumulated reward from the beginning of the episode will be added as a measurement to
the measurements vector in the state. This can be useful in environments where the given measurements don't
include enough information for the particular goal the agent should achieve.

:param handling_targets_after_episode_end: (HandlingTargetsAfterEpisodeEnd)
Dictates how to handle measurements that are outside the episode length.

:param scale_measurements_targets: (Dict[str, float])
Allows rescaling the values of each of the measurements available. This can be useful when the measurements
have a different scale and you want to normalize them to the same scale.
"""
def __init__(self):
super().__init__()
self.num_predicted_steps_ahead = 6
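
Following the doubling pattern described in the num_predicted_steps_ahead docstring, the predicted time offsets can be derived as (illustrative only):

num_predicted_steps_ahead = 6
prediction_offsets = [2 ** i for i in range(num_predicted_steps_ahead)]
# -> [1, 2, 4, 8, 16, 32], i.e. measurements are predicted for t+1, t+2, t+4, t+8, t+16 and t+32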

@@ -24,6 +24,13 @@ from rl_coach.spaces import SpacesDefinition


class HACDDPGAlgorithmParameters(DDPGAlgorithmParameters):
"""
:param time_limit: (int)
The number of steps the agent is allowed to act for while trying to achieve its goal

:param sub_goal_testing_rate: (float)
The percent of episodes that will be used for testing the sub goals generated by the upper level agents.
"""
def __init__(self):
super().__init__()
self.time_limit = 40
@@ -91,7 +98,7 @@ class HACDDPGAgent(DDPGAgent):
sub_goal_is_missed = not sub_goal_reached

if sub_goal_is_missed:
transition.reward = -self.ap.algorithm.time_limit
return transition

def set_environment_parameters(self, spaces: SpacesDefinition):

@@ -24,6 +24,11 @@ from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperi


class MixedMonteCarloAlgorithmParameters(DQNAlgorithmParameters):
"""
:param monte_carlo_mixing_rate: (float)
The mixing rate is used for setting the amount of monte carlo estimate (full return) that will be mixed into
the single-step bootstrapped targets.
"""
def __init__(self):
super().__init__()
self.monte_carlo_mixing_rate = 0.1

@@ -44,6 +44,26 @@ class NStepQNetworkParameters(NetworkParameters):


class NStepQAlgorithmParameters(AlgorithmParameters):
"""
:param num_steps_between_copying_online_weights_to_target: (StepMethod)
The number of steps between copying the online network weights to the target network weights.

:param apply_gradients_every_x_episodes: (int)
The number of episodes between applying the accumulated gradients to the network. After every
num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data,
it will then accumulate it in internal accumulators, and will only apply them to the network once in every
apply_gradients_every_x_episodes episodes.

:param num_steps_between_gradient_updates: (int)
The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
called t_max. Since this algorithm is on-policy, only the steps collected between each two gradient calculations
are used in the batch.

:param targets_horizon: (str)
Should be either 'N-Step' or '1-Step', and defines the length for which to bootstrap the network values over.
Essentially, 1-Step follows the regular 1-step bootstrapping Q-learning update. For more information,
please refer to the original paper (https://arxiv.org/abs/1602.01783)
"""
def __init__(self):
super().__init__()
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)
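
For the 'N-Step' targets_horizon, the bootstrapped target described above can be sketched as follows (illustrative only; the reward list and Q value are placeholders):

def n_step_target(rewards, bootstrap_q, discount=0.99):
    # rewards: the N rewards following the state; bootstrap_q: max_a Q_target(s_{t+N}, a)
    target = bootstrap_q
    for r in reversed(rewards):
        target = r + discount * target
    return target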

@@ -43,6 +43,39 @@ class NECNetworkParameters(NetworkParameters):


class NECAlgorithmParameters(AlgorithmParameters):
"""
:param dnd_size: (int)
Defines the number of transitions that will be stored in each one of the DNDs. Note that the total number
of transitions that will be stored is dnd_size x num_actions.

:param l2_norm_added_delta: (float)
A small value that will be added when calculating the weight of each of the DND entries. This follows the
:math:`\delta` parameter defined in the paper.

:param new_value_shift_coefficient: (float)
In the case where a new embedding that was added to the DND was already present, the value that will be stored
in the DND is a mix between the existing value and the new value. The mix rate is defined by
new_value_shift_coefficient.

:param number_of_knn: (int)
The number of neighbors that will be retrieved for each DND query.

:param DND_key_error_threshold: (float)
When the DND is queried for a specific embedding, this threshold will be used to determine if the embedding
exists in the DND, since exact matches of embeddings are very rare.

:param propagate_updates_to_DND: (bool)
If set to True, when the gradients of the network are calculated, the gradients will also be
backpropagated through the keys of the DND. The keys will then be updated as well, as if they were regular
network weights.

:param n_step: (int)
The bootstrap length that will be used when calculating the state values to store in the DND.

:param bootstrap_total_return_from_old_policy: (bool)
If set to True, the bootstrap that will be used to calculate each state-action value, is the network value
when the state was first seen, and not the latest, most up-to-date network value.
"""
def __init__(self):
super().__init__()
self.dnd_size = 500000

@@ -24,6 +24,19 @@ from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperi


class PALAlgorithmParameters(DQNAlgorithmParameters):
"""
:param pal_alpha: (float)
A factor that weights the amount by which the advantage learning update will be taken into account.

:param persistent_advantage_learning: (bool)
If set to True, the persistent mode of advantage learning will be used, which encourages the agent to take
the same actions one after the other instead of changing actions.

:param monte_carlo_mixing_rate: (float)
The amount of monte carlo values to mix into the targets of the network. The monte carlo values are just the
total discounted returns, and they can help reduce the time it takes for the network to update to the newly
seen values, since it is not based on bootstrapping the current network values.
"""
def __init__(self):
super().__init__()
self.pal_alpha = 0.9

@@ -42,6 +42,27 @@ class PolicyGradientNetworkParameters(NetworkParameters):


class PolicyGradientAlgorithmParameters(AlgorithmParameters):
"""
:param policy_gradient_rescaler: (PolicyGradientRescaler)
The rescaler type to use for the policy gradient loss. For policy gradients, we calculate log probability of
the action and then multiply it by the policy gradient rescaler. The most basic rescaler is the discounted
return, but there are other rescalers that are intended for reducing the variance of the updates.

:param apply_gradients_every_x_episodes: (int)
The number of episodes between applying the accumulated gradients to the network. After every
num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data,
it will then accumulate it in internal accumulators, and will only apply them to the network once in every
apply_gradients_every_x_episodes episodes.

:param beta_entropy: (float)
A factor which defines the amount of entropy regularization to apply to the network. The entropy of the actions
will be added to the loss and scaled by the given beta factor.

:param num_steps_between_gradient_updates: (int)
The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
called t_max. Since this algorithm is on-policy, only the steps collected between each two gradient calculations
are used in the batch.
"""
def __init__(self):
super().__init__()
self.policy_gradient_rescaler = PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP
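
A minimal sketch of the loss implied by this docstring, with the rescaler taken to be the discounted return or any of the lower-variance alternatives listed above (illustrative only):

import numpy as np

def policy_gradient_loss(log_probs, rescaler):
    # log_probs: log pi(a_t|s_t) for the taken actions; rescaler: e.g. discounted returns or advantages
    return -np.mean(log_probs * rescaler)   # descending on this loss ascends the expected return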

@@ -63,6 +63,51 @@ class PPOActorNetworkParameters(NetworkParameters):


class PPOAlgorithmParameters(AlgorithmParameters):
"""
:param policy_gradient_rescaler: (PolicyGradientRescaler)
This represents how the critic will be used to update the actor. The critic value function is typically used
to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
advantage of the action, or the generalized advantage estimation (GAE) value.

:param gae_lambda: (float)
The :math:`\lambda` value is used within the GAE function in order to weight different bootstrap length
estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
n-step estimations.

:param target_kl_divergence: (float)
The target KL divergence between the current policy distribution and the new policy. PPO uses a heuristic to
bring the KL divergence to this value, by adding a penalty if the KL divergence is higher.

:param initial_kl_coefficient: (float)
The initial weight that will be given to the KL divergence between the current and the new policy in the
regularization factor.

:param high_kl_penalty_coefficient: (float)
The penalty that will be given for KL divergence values which are higher than what was defined as the target.

:param clip_likelihood_ratio_using_epsilon: (float)
If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
implementations.

:param value_targets_mix_fraction: (float)
The targets for the value network are an exponential weighted moving average which uses this mix fraction to
define how much of the new targets will be taken into account when calculating the loss.
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.

:param estimate_state_value_using_gae: (bool)
If set to True, the state value will be estimated using the GAE technique.

:param use_kl_regularization: (bool)
If set to True, the loss function will be regularized using the KL divergence between the current and new
policy, to bound the change of the policy during the network update.

:param beta_entropy: (float)
An entropy regularization term can be added to the loss function in order to control exploration. This term
is weighted using the :math:`\beta` value defined by beta_entropy.

"""
def __init__(self):
super().__init__()
self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
@@ -73,7 +118,6 @@
self.clip_likelihood_ratio_using_epsilon = None
self.value_targets_mix_fraction = 0.1
self.estimate_state_value_using_gae = True
self.step_until_collecting_full_episodes = True
self.use_kl_regularization = True
self.beta_entropy = 0.01
self.num_consecutive_playing_steps = EnvironmentSteps(5000)
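
A sketch of the kind of heuristic referred to by target_kl_divergence and high_kl_penalty_coefficient, based on the adaptive-KL rule from the PPO paper (illustrative only; Coach's exact update rule may differ):

def adapt_kl_coefficient(measured_kl, kl_coefficient, target_kl=0.01):
    if measured_kl > 1.5 * target_kl:
        kl_coefficient *= 2.0    # the policy moved too far, penalize the KL term more strongly
    elif measured_kl < target_kl / 1.5:
        kl_coefficient *= 0.5    # the policy barely moved, relax the penalty
    return kl_coefficient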

@@ -34,6 +34,14 @@ class QuantileRegressionDQNNetworkParameters(DQNNetworkParameters):


class QuantileRegressionDQNAlgorithmParameters(DQNAlgorithmParameters):
"""
:param atoms: (int)
The number of atoms to predict for each action

:param huber_loss_interval: (float)
One of the huber loss parameters, referred to as :math:`\kappa` in the paper.
It describes the interval [-k, k] in which the huber loss acts as an MSE loss.
"""
def __init__(self):
super().__init__()
self.atoms = 200
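
The interval behaviour of huber_loss_interval (kappa) can be written out directly (illustrative only; the full quantile-regression loss additionally weights this term by the quantile fractions):

import numpy as np

def huber(u, kappa=1.0):
    # quadratic inside [-kappa, kappa], linear outside
    return np.where(np.abs(u) <= kappa,
                    0.5 * u ** 2,
                    kappa * (np.abs(u) - 0.5 * kappa))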

@@ -37,6 +37,17 @@ class RainbowDQNNetworkParameters(DQNNetworkParameters):


class RainbowDQNAlgorithmParameters(CategoricalDQNAlgorithmParameters):
"""
:param n_step: (int)
The number of steps to bootstrap the network over. The first N-1 steps' actual rewards will be accumulated
using an exponentially growing discount factor, and the Nth step will be bootstrapped from the network
prediction.

:param store_transitions_only_when_episodes_are_terminated: (bool)
If set to True, the transitions will be stored in an Episode object until the episode ends, and only then
written to the memory. This is useful since we want to calculate the N-step discounted rewards before saving the
transitions into the memory, and to do so we need the entire episode first.
"""
def __init__(self):
super().__init__()
self.n_step = 3

@@ -57,7 +57,7 @@ class Architecture(object):
:param initial_feed_dict: a dictionary of extra inputs for forward pass.
:return: predictions of action or value of shape (batch_size, action_space_size) for action predictions
"""
-pass
+raise NotImplementedError

@staticmethod
def parallel_predict(sess: Any,
@@ -68,7 +68,7 @@
:param network_input_tuples: tuple of network and corresponding input
:return: list or tuple of outputs from all networks
"""
-pass
+raise NotImplementedError

def train_on_batch(self,
inputs: Dict[str, np.ndarray],
@@ -102,7 +102,7 @@
norm_unclippsed_grads (float): global norm of all gradients before any gradient clipping is applied
fetched_tensors: all values for additional_fetches
"""
-pass
+raise NotImplementedError

def get_weights(self) -> List[np.ndarray]:
"""
@@ -110,7 +110,7 @@

:return: a list of weights as ndarrays
"""
-pass
+raise NotImplementedError

def set_weights(self, weights: List[np.ndarray], rate: float=1.0) -> None:
"""
@@ -121,7 +121,7 @@
i.e. new_weight = rate * given_weight + (1 - rate) * old_weight
:return: None
"""
-pass
raise NotImplementedError
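
The update formula quoted in this docstring amounts to the following (illustrative only; the weight lists are placeholders):

def soft_update(old_weights, given_weights, rate=1.0):
    # new_weight = rate * given_weight + (1 - rate) * old_weight
    return [rate * given + (1.0 - rate) * old
            for old, given in zip(old_weights, given_weights)]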

def reset_accumulated_gradients(self) -> None:
"""
@@ -130,7 +130,7 @@
Once gradients are reset, they must be accessible by the `accumulated_gradients` property of this class,
which must return a list of numpy ndarrays. Child class must ensure that `accumulated_gradients` is set.
"""
-pass
+raise NotImplementedError

def accumulate_gradients(self,
inputs: Dict[str, np.ndarray],
@@ -166,7 +166,7 @@
norm_unclippsed_grads (float): global norm of all gradients before any gradient clipping is applied
fetched_tensors: all values for additional_fetches
"""
-pass
+raise NotImplementedError

def apply_and_reset_gradients(self, gradients: List[np.ndarray], scaler: float=1.) -> None:
"""
@@ -177,7 +177,7 @@
of an identical network (either self or another identical network)
:param scaler: A scaling factor that allows rescaling the gradients before applying them
"""
-pass
+raise NotImplementedError

def apply_gradients(self, gradients: List[np.ndarray], scaler: float=1.) -> None:
"""
@@ -188,7 +188,7 @@
of an identical network (either self or another identical network)
:param scaler: A scaling factor that allows rescaling the gradients before applying them
"""
-pass
+raise NotImplementedError

def get_variable_value(self, variable: Any) -> np.ndarray:
"""
@@ -199,7 +199,7 @@
:param variable: variable of interest
:return: value of the specified variable
"""
-pass
+raise NotImplementedError

def set_variable_value(self, assign_op: Any, value: np.ndarray, placeholder: Any):
"""
@@ -212,4 +212,4 @@
:param value: value of the specified variable used for update
:param placeholder: a placeholder for binding the value to assign_op.
"""
-pass
+raise NotImplementedError

@@ -34,7 +34,11 @@ except ImportError:

class NetworkWrapper(object):
"""
-Contains multiple networks and managers syncing and gradient updates
+The network wrapper contains multiple copies of the same network, each one with a different set of weights which is
+updated on a different time scale. The network wrapper will always contain an online network.
+It will contain an additional slow updating target network if it was requested by the user,
+and it will contain a global network shared between different workers, if Coach is run in a single-node
+multi-process distributed mode. The network wrapper contains functionality for managing these networks and syncing
+between them.
"""
def __init__(self, agent_parameters: AgentParameters, has_target: bool, has_global: bool, name: str,
@@ -98,6 +102,7 @@
def sync(self):
"""
Initializes the weights of the networks to match each other

:return:
"""
self.update_online_network()
@@ -106,6 +111,7 @@
def update_target_network(self, rate=1.0):
"""
Copy weights: online network >>> target network

:param rate: the rate of copying the weights - 1 for copying exactly
"""
if self.target_network:
@@ -114,6 +120,7 @@
def update_online_network(self, rate=1.0):
"""
Copy weights: global network >>> online network

:param rate: the rate of copying the weights - 1 for copying exactly
"""
if self.global_network:
@@ -122,6 +129,7 @@
def apply_gradients_to_global_network(self, gradients=None):
"""
Apply gradients from the online network on the global network

:param gradients: optional gradients that will be used instead of the accumulated gradients
:return:
"""
@@ -135,6 +143,7 @@
def apply_gradients_to_online_network(self, gradients=None):
"""
Apply gradients from the online network on itself

:return:
"""
if gradients is None:
@@ -144,6 +153,7 @@
def train_and_sync_networks(self, inputs, targets, additional_fetches=[], importance_weights=None):
"""
A generic training function that enables multi-threaded training using a global network if necessary.

:param inputs: The inputs for the network.
:param targets: The targets corresponding to the given inputs
:param additional_fetches: Any additional tensor the user wants to fetch
@@ -160,6 +170,7 @@
"""
Applies the gradients accumulated in the online network to the global network or to itself and syncs the
networks if necessary

:param reset_gradients: If set to True, the accumulated gradients won't be reset to 0 after applying them to
the network. This is useful when the accumulated gradients are overwritten instead
of accumulated by the accumulate_gradients function. This allows reducing time
@@ -179,6 +190,7 @@
def parallel_prediction(self, network_input_tuples: List[Tuple]):
"""
Run several network predictions in parallel. Currently this only supports running each of the networks once.

:param network_input_tuples: a list of tuples where the first element is the network (online_network,
target_network or global_network) and the second element is the inputs
:return: the outputs of all the networks in the same order as the inputs were given
@@ -188,6 +200,7 @@
def get_local_variables(self):
"""
Get all the variables that are local to the thread

:return: a list of all the variables that are local to the thread
"""
local_variables = [v for v in tf.local_variables() if self.online_network.name in v.name]
@@ -198,6 +211,7 @@
def get_global_variables(self):
"""
Get all the variables that are shared between threads

:return: a list of all the variables that are shared between threads
"""
global_variables = [v for v in tf.global_variables() if self.global_network.name in v.name]
@@ -206,6 +220,7 @@
def set_is_training(self, state: bool):
"""
Set the phase of the network between training and testing

:param state: The current state (True = Training, False = Testing)
:return: None
"""

@@ -14,7 +14,7 @@
# limitations under the License.
#

-from typing import List, Union
+from typing import List, Union, Tuple
import copy

import numpy as np
@@ -74,7 +74,12 @@ class InputEmbedder(object):
activation_function=self.activation_function,
dropout_rate=self.dropout_rate))

-def __call__(self, prev_input_placeholder=None):
+def __call__(self, prev_input_placeholder: tf.placeholder=None) -> Tuple[tf.Tensor, tf.Tensor]:
"""
Wrapper for building the module graph including scoping and loss creation
:param prev_input_placeholder: the input to the graph
:return: the input placeholder and the output of the last layer
"""
with tf.variable_scope(self.get_name()):
if prev_input_placeholder is None:
self.input = tf.placeholder("float", shape=[None] + self.input_size, name=self.get_name())
@@ -84,7 +89,13 @@

return self.input, self.output

-def _build_module(self):
+def _build_module(self) -> None:
"""
Builds the graph of the module
This method is called early on from __call__. It is expected to store the graph
in self.output.
:return: None
"""
# NOTE: for image inputs, we expect the data format to be of type uint8, so to be memory efficient. we chose not
# to implement the rescaling as an input filters.observation.observation_filter, as this would have caused the
# input to the network to be float, which is 4x more expensive in memory.
@@ -127,7 +138,11 @@
raise NotImplementedError("Inheriting embedder must define schemes matching its allowed default "
"configurations.")

-def get_name(self):
+def get_name(self) -> str:
"""
Get a formatted name for the module
:return: the formatted name
"""
return self.name

def __str__(self):

@@ -14,7 +14,7 @@
# limitations under the License.
#
import copy
-from typing import Union
+from typing import Union, Tuple

import tensorflow as tf

@@ -64,17 +64,33 @@ class Middleware(object):
activation_function=self.activation_function,
dropout_rate=self.dropout_rate))

-def __call__(self, input_layer):
+def __call__(self, input_layer: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
"""
Wrapper for building the module graph including scoping and loss creation
:param input_layer: the input to the graph
:return: the input placeholder and the output of the last layer
"""
with tf.variable_scope(self.get_name()):
self.input = input_layer
self._build_module()

return self.input, self.output

-def _build_module(self):
+def _build_module(self) -> None:
"""
Builds the graph of the module
This method is called early on from __call__. It is expected to store the graph
in self.output.
:param input_layer: the input to the graph
:return: None
"""
pass

-def get_name(self):
+def get_name(self) -> str:
"""
Get a formatted name for the module
:return: the formatted name
"""
return self.name

@property

@@ -154,7 +154,6 @@ class AlgorithmParameters(Parameters):
self.num_steps_between_copying_online_weights_to_target = TrainingSteps(0)
self.rate_for_copying_weights_to_target = 1.0
self.load_memory_from_file_path = None
self.collect_new_data = True
self.store_transitions_only_when_episodes_are_terminated = False

# HRL / HER related params
@@ -174,7 +173,38 @@


class PresetValidationParameters(Parameters):
-def __init__(self):
+def __init__(self,
+test=False,
+min_reward_threshold=0,
+max_episodes_to_achieve_reward=1,
+num_workers=1,
+reward_test_level=None,
+test_using_a_trace_test=True,
+trace_test_levels=None,
+trace_max_env_steps=5000):
"""
:param test:
A flag which specifies if the preset should be tested as part of the validation process.
:param min_reward_threshold:
The minimum reward that the agent should pass after max_episodes_to_achieve_reward episodes when the
preset is run.
:param max_episodes_to_achieve_reward:
The maximum number of episodes that the agent should train using the preset in order to achieve the
reward specified by min_reward_threshold.
:param num_workers:
The number of workers that should be used when running this preset in the test suite for validation.
:param reward_test_level:
The environment level or levels, given by a list of strings, that should be tested as part of the
reward tests suite.
:param test_using_a_trace_test:
A flag that specifies if the preset should be run as part of the trace tests suite.
:param trace_test_levels:
The environment level or levels, given by a list of strings, that should be tested as part of the
trace tests suite.
:param trace_max_env_steps:
An integer representing the maximum number of environment steps to run when running this preset as part
of the trace tests suite.
"""
super().__init__()
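
A hypothetical preset snippet using the new keyword arguments introduced above (the values are made up):

preset_validation_params = PresetValidationParameters(
    test=True,
    min_reward_threshold=150,
    max_episodes_to_achieve_reward=250,
    trace_max_env_steps=2000)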
|
||||
|
||||
# setting a seed will only work for non-parallel algorithms. Parallel algorithms add uncontrollable noise in
|
||||
@@ -182,42 +212,42 @@ class PresetValidationParameters(Parameters):
|
||||
# time from the OS.
|
||||
|
||||
# Testing parameters
|
||||
self.test = False
|
||||
self.min_reward_threshold = 0
|
||||
self.max_episodes_to_achieve_reward = 1
|
||||
self.num_workers = 1
|
||||
self.reward_test_level = None
|
||||
self.test_using_a_trace_test = True
|
||||
self.trace_test_levels = None
|
||||
self.trace_max_env_steps = 5000
|
||||
self.test = test
|
||||
self.min_reward_threshold = min_reward_threshold
|
||||
self.max_episodes_to_achieve_reward = max_episodes_to_achieve_reward
|
||||
self.num_workers = num_workers
|
||||
self.reward_test_level = reward_test_level
|
||||
self.test_using_a_trace_test = test_using_a_trace_test
|
||||
self.trace_test_levels = trace_test_levels
|
||||
self.trace_max_env_steps = trace_max_env_steps
|
||||
|
||||
|
||||
class NetworkParameters(Parameters):
|
||||
def __init__(self,
|
||||
force_cpu = False,
|
||||
async_training = False,
|
||||
shared_optimizer = True,
|
||||
scale_down_gradients_by_number_of_workers_for_sync_training = True,
|
||||
clip_gradients = None,
|
||||
gradients_clipping_method = GradientClippingMethod.ClipByGlobalNorm,
|
||||
l2_regularization = 0,
|
||||
learning_rate = 0.00025,
|
||||
learning_rate_decay_rate = 0,
|
||||
learning_rate_decay_steps = 0,
|
||||
input_embedders_parameters = {},
|
||||
embedding_merger_type = EmbeddingMergerType.Concat,
|
||||
middleware_parameters = None,
|
||||
heads_parameters = [],
|
||||
use_separate_networks_per_head = False,
|
||||
optimizer_type = 'Adam',
|
||||
optimizer_epsilon = 0.0001,
|
||||
adam_optimizer_beta1 = 0.9,
|
||||
adam_optimizer_beta2 = 0.99,
|
||||
rms_prop_optimizer_decay = 0.9,
|
||||
batch_size = 32,
|
||||
replace_mse_with_huber_loss = False,
|
||||
create_target_network = False,
|
||||
tensorflow_support = True):
|
||||
force_cpu=False,
|
||||
async_training=False,
|
||||
shared_optimizer=True,
|
||||
scale_down_gradients_by_number_of_workers_for_sync_training=True,
|
||||
clip_gradients=None,
|
||||
gradients_clipping_method=GradientClippingMethod.ClipByGlobalNorm,
|
||||
l2_regularization=0,
|
||||
learning_rate=0.00025,
|
||||
learning_rate_decay_rate=0,
|
||||
learning_rate_decay_steps=0,
|
||||
input_embedders_parameters={},
|
||||
embedding_merger_type=EmbeddingMergerType.Concat,
|
||||
middleware_parameters=None,
|
||||
heads_parameters=[],
|
||||
use_separate_networks_per_head=False,
|
||||
optimizer_type='Adam',
|
||||
optimizer_epsilon=0.0001,
|
||||
adam_optimizer_beta1=0.9,
|
||||
adam_optimizer_beta2=0.99,
|
||||
rms_prop_optimizer_decay=0.9,
|
||||
batch_size=32,
|
||||
replace_mse_with_huber_loss=False,
|
||||
create_target_network=False,
|
||||
tensorflow_support=True):
|
||||
"""
|
||||
:param force_cpu:
|
||||
Force the neural networks to run on the CPU even if a GPU is available
|
||||
@@ -240,63 +270,106 @@ class NetworkParameters(Parameters):
|
||||
gradients of the network. This will only be used if the clip_gradients value is defined as a value other
|
||||
than None.
|
||||
:param l2_regularization:
|
||||
An L2 regularization weight that will be applied to the network weights while calculating the loss function
|
||||
:param learning_rate:
|
||||
The learning rate for the network
|
||||
:param learning_rate_decay_rate:
|
||||
If this value is larger than 0, an exponential decay will be applied to the network learning rate.
|
||||
The rate of the decay is defined by this parameter, and the number of training steps the decay will be
|
||||
applied is defined by learning_rate_decay_steps. Notice that both parameters should be defined in order
|
||||
for this to work correctly.
|
||||
:param learning_rate_decay_steps:
|
||||
If the learning_rate_decay_rate of the network is larger than 0, an exponential decay will be applied to
|
||||
the network learning rate. The number of steps the decay will be applied is defined by this parameter.
|
||||
Notice that both this parameter, as well as learning_rate_decay_rate should be defined in order for the
|
||||
learning rate decay to work correctly.
|
||||
:param input_embedders_parameters:
|
||||
A dictionary mapping between input names and input embedders (InputEmbedderParameters) to use for the
|
||||
network. Each of the keys is an input name as returned from the environment in the state.
|
||||
For example, if the environment returns a state containing 'observation' and 'measurements', then
|
||||
the keys for the input embedders dictionary can be either 'observation' to use the observation as input,
|
||||
'measurements' to use the measurements as input, or both.
|
||||
The embedder type will be automatically selected according to the input type. Vector inputs will
|
||||
produce a fully connected embedder, and image inputs will produce a convolutional embedder.
|
||||
:param embedding_merger_type:
|
||||
The type of embedding merging to use, given by one of the EmbeddingMergerType enum values.
|
||||
This will be used to merge the outputs of all the input embedders into a single embedding.
|
||||
:param middleware_parameters:
|
||||
The parameters of the middleware to use, given by a MiddlewareParameters object.
|
||||
Each network will have only a single middleware embedder which will take the merged embeddings from the
|
||||
input embedders and pass them through more neural network layers.
|
||||
:param heads_parameters:
|
||||
A list of heads for the network given by their corresponding HeadParameters.
|
||||
Each network can have one or multiple network heads, where each one will take the output of the middleware
|
||||
and make some additional computation on top of it. Additionally, each head calculates a weighted loss value,
|
||||
and the loss values from all the heads will be summed later on.
|
||||
:param use_separate_networks_per_head:
|
||||
A flag that allows using different copies of the input embedders and middleware for each one of the heads.
|
||||
Regularly, the heads will have a shared input, but in the case where use_separate_networks_per_head is set
|
||||
to True, each one of the heads will get a different input.
|
||||
:param optimizer_type:
|
||||
A string specifying the optimizer type to use for updating the network. The available optimizers are
|
||||
Adam, RMSProp and LBFGS.
|
||||
:param optimizer_epsilon:
|
||||
An internal optimizer parameter used for Adam and RMSProp.
|
||||
:param adam_optimizer_beta1:
|
||||
A beta1 internal optimizer parameter used for Adam. It will be used only if Adam was selected as the
|
||||
optimizer for the network.
|
||||
:param adam_optimizer_beta2:
|
||||
A beta2 internal optimizer parameter used for Adam. It will be used only if Adam was selected as the
|
||||
optimizer for the network.
|
||||
:param rms_prop_optimizer_decay:
|
||||
The decay value for the RMSProp optimizer, which will be used only in case the RMSProp optimizer was
|
||||
selected for this network.
|
||||
:param batch_size:
|
||||
The batch size to use when updating the network.
|
||||
:param replace_mse_with_huber_loss:
A flag that specifies if the standard MSE loss should be replaced with a Huber loss, which is less sensitive to outliers.
:param create_target_network:
|
||||
If this flag is set to True, an additional copy of the network will be created and initialized with the
|
||||
same weights as the online network. It can then be queried, and its weights can be synced from the
|
||||
online network at will.
|
||||
:param tensorflow_support:
|
||||
A flag which specifies if the network is supported by the TensorFlow framework.
|
||||
"""
|
||||
super().__init__()
|
||||
self.framework = Frameworks.tensorflow
|
||||
self.sess = None
|
||||
|
||||
# hardware parameters
|
||||
self.force_cpu = False
|
||||
self.force_cpu = force_cpu
|
||||
|
||||
# distributed training options
|
||||
self.async_training = False
|
||||
self.shared_optimizer = True
|
||||
self.scale_down_gradients_by_number_of_workers_for_sync_training = True
|
||||
self.async_training = async_training
|
||||
self.shared_optimizer = shared_optimizer
|
||||
self.scale_down_gradients_by_number_of_workers_for_sync_training = scale_down_gradients_by_number_of_workers_for_sync_training
|
||||
|
||||
# regularization
|
||||
self.clip_gradients = None
|
||||
self.gradients_clipping_method = GradientClippingMethod.ClipByGlobalNorm
|
||||
self.l2_regularization = 0
|
||||
self.clip_gradients = clip_gradients
|
||||
self.gradients_clipping_method = gradients_clipping_method
|
||||
self.l2_regularization = l2_regularization
|
||||
|
||||
# learning rate
|
||||
self.learning_rate = 0.00025
|
||||
self.learning_rate_decay_rate = 0
|
||||
self.learning_rate_decay_steps = 0
|
||||
self.learning_rate = learning_rate
|
||||
self.learning_rate_decay_rate = learning_rate_decay_rate
|
||||
self.learning_rate_decay_steps = learning_rate_decay_steps
|
||||
|
||||
# structure
|
||||
self.input_embedders_parameters = {}
|
||||
self.embedding_merger_type = EmbeddingMergerType.Concat
|
||||
self.middleware_parameters = None
|
||||
self.heads_parameters = []
|
||||
self.use_separate_networks_per_head = False
|
||||
self.optimizer_type = 'Adam'
|
||||
self.optimizer_epsilon = 0.0001
|
||||
self.adam_optimizer_beta1 = 0.9
|
||||
self.adam_optimizer_beta2 = 0.99
|
||||
self.rms_prop_optimizer_decay = 0.9
|
||||
self.batch_size = 32
|
||||
self.replace_mse_with_huber_loss = False
|
||||
self.create_target_network = False
|
||||
self.input_embedders_parameters = input_embedders_parameters
|
||||
self.embedding_merger_type = embedding_merger_type
|
||||
self.middleware_parameters = middleware_parameters
|
||||
self.heads_parameters = heads_parameters
|
||||
self.use_separate_networks_per_head = use_separate_networks_per_head
|
||||
self.optimizer_type = optimizer_type
|
||||
self.optimizer_epsilon = optimizer_epsilon
|
||||
self.adam_optimizer_beta1 = adam_optimizer_beta1
|
||||
self.adam_optimizer_beta2 = adam_optimizer_beta2
|
||||
self.rms_prop_optimizer_decay = rms_prop_optimizer_decay
|
||||
self.batch_size = batch_size
|
||||
self.replace_mse_with_huber_loss = replace_mse_with_huber_loss
|
||||
self.create_target_network = create_target_network
|
||||
|
||||
# Framework support
self.tensorflow_support = True
self.tensorflow_support = tensorflow_support
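In practice these network parameters are usually adjusted on an agent's preset rather than by instantiating NetworkParameters directly. A minimal sketch, assuming a DQN-style agent whose main network is registered under the 'main' key:

```python
# Hedged sketch: tuning the network parameters held by an agent preset.
from rl_coach.agents.dqn_agent import DQNAgentParameters

agent_params = DQNAgentParameters()
network = agent_params.network_wrappers['main']
network.learning_rate = 0.0001
network.batch_size = 64
network.l2_regularization = 1e-4
network.replace_mse_with_huber_loss = True
network.create_target_network = True
```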
|
||||
|
||||
|
||||
class NetworkComponentParameters(Parameters):
|
||||
|
||||
@@ -83,91 +83,91 @@ def start_graph(graph_manager: 'GraphManager', task_parameters: 'TaskParameters'
|
||||
|
||||
|
||||
def handle_distributed_coach_tasks(graph_manager, args):
|
||||
ckpt_inside_container = "/checkpoint"
|
||||
ckpt_inside_container = "/checkpoint"
|
||||
|
||||
memory_backend_params = None
|
||||
if args.memory_backend_params:
|
||||
memory_backend_params = json.loads(args.memory_backend_params)
|
||||
memory_backend_params['run_type'] = str(args.distributed_coach_run_type)
|
||||
graph_manager.agent_params.memory.register_var('memory_backend_params', construct_memory_params(memory_backend_params))
|
||||
memory_backend_params = None
|
||||
if args.memory_backend_params:
|
||||
memory_backend_params = json.loads(args.memory_backend_params)
|
||||
memory_backend_params['run_type'] = str(args.distributed_coach_run_type)
|
||||
graph_manager.agent_params.memory.register_var('memory_backend_params', construct_memory_params(memory_backend_params))
|
||||
|
||||
data_store_params = None
|
||||
data_store_params = None
|
||||
if args.data_store_params:
|
||||
data_store_params = construct_data_store_params(json.loads(args.data_store_params))
|
||||
data_store_params.checkpoint_dir = ckpt_inside_container
|
||||
graph_manager.data_store_params = data_store_params
|
||||
|
||||
if args.distributed_coach_run_type == RunType.TRAINER:
|
||||
training_worker(
|
||||
graph_manager=graph_manager,
|
||||
checkpoint_dir=ckpt_inside_container
|
||||
)
|
||||
|
||||
if args.distributed_coach_run_type == RunType.ROLLOUT_WORKER:
|
||||
data_store = None
|
||||
if args.data_store_params:
|
||||
data_store_params = construct_data_store_params(json.loads(args.data_store_params))
|
||||
data_store_params.checkpoint_dir = ckpt_inside_container
|
||||
graph_manager.data_store_params = data_store_params
|
||||
data_store = get_data_store(data_store_params)
|
||||
wait_for_checkpoint(checkpoint_dir=ckpt_inside_container, data_store=data_store)
|
||||
|
||||
if args.distributed_coach_run_type == RunType.TRAINER:
|
||||
training_worker(
|
||||
graph_manager=graph_manager,
|
||||
checkpoint_dir=ckpt_inside_container
|
||||
)
|
||||
|
||||
if args.distributed_coach_run_type == RunType.ROLLOUT_WORKER:
|
||||
data_store = None
|
||||
if args.data_store_params:
|
||||
data_store = get_data_store(data_store_params)
|
||||
wait_for_checkpoint(checkpoint_dir=ckpt_inside_container, data_store=data_store)
|
||||
|
||||
rollout_worker(
|
||||
graph_manager=graph_manager,
|
||||
checkpoint_dir=ckpt_inside_container,
|
||||
data_store=data_store,
|
||||
num_workers=args.num_workers
|
||||
)
|
||||
rollout_worker(
|
||||
graph_manager=graph_manager,
|
||||
checkpoint_dir=ckpt_inside_container,
|
||||
data_store=data_store,
|
||||
num_workers=args.num_workers
|
||||
)
|
||||
|
||||
|
||||
def handle_distributed_coach_orchestrator(graph_manager, args):
|
||||
ckpt_inside_container = "/checkpoint"
|
||||
rollout_command = ['python3', 'rl_coach/coach.py', '--distributed_coach_run_type', str(RunType.ROLLOUT_WORKER)] + sys.argv[1:]
|
||||
trainer_command = ['python3', 'rl_coach/coach.py', '--distributed_coach_run_type', str(RunType.TRAINER)] + sys.argv[1:]
|
||||
ckpt_inside_container = "/checkpoint"
|
||||
rollout_command = ['python3', 'rl_coach/coach.py', '--distributed_coach_run_type', str(RunType.ROLLOUT_WORKER)] + sys.argv[1:]
|
||||
trainer_command = ['python3', 'rl_coach/coach.py', '--distributed_coach_run_type', str(RunType.TRAINER)] + sys.argv[1:]
|
||||
|
||||
if '--experiment_name' not in rollout_command:
|
||||
rollout_command = rollout_command + ['--experiment_name', args.experiment_name]
|
||||
if '--experiment_name' not in rollout_command:
|
||||
rollout_command = rollout_command + ['--experiment_name', args.experiment_name]
|
||||
|
||||
if '--experiment_name' not in trainer_command:
|
||||
trainer_command = trainer_command + ['--experiment_name', args.experiment_name]
|
||||
if '--experiment_name' not in trainer_command:
|
||||
trainer_command = trainer_command + ['--experiment_name', args.experiment_name]
|
||||
|
||||
memory_backend_params = None
|
||||
if args.memory_backend == "redispubsub":
|
||||
memory_backend_params = RedisPubSubMemoryBackendParameters()
|
||||
memory_backend_params = None
|
||||
if args.memory_backend == "redispubsub":
|
||||
memory_backend_params = RedisPubSubMemoryBackendParameters()
|
||||
|
||||
ds_params_instance = None
|
||||
if args.data_store == "s3":
|
||||
ds_params = DataStoreParameters("s3", "", "")
|
||||
ds_params_instance = S3DataStoreParameters(ds_params=ds_params, end_point=args.s3_end_point, bucket_name=args.s3_bucket_name,
|
||||
creds_file=args.s3_creds_file, checkpoint_dir=ckpt_inside_container)
|
||||
ds_params_instance = None
|
||||
if args.data_store == "s3":
|
||||
ds_params = DataStoreParameters("s3", "", "")
|
||||
ds_params_instance = S3DataStoreParameters(ds_params=ds_params, end_point=args.s3_end_point, bucket_name=args.s3_bucket_name,
|
||||
creds_file=args.s3_creds_file, checkpoint_dir=ckpt_inside_container)
|
||||
|
||||
worker_run_type_params = RunTypeParameters(args.image, rollout_command, run_type=str(RunType.ROLLOUT_WORKER), num_replicas=args.num_workers)
|
||||
trainer_run_type_params = RunTypeParameters(args.image, trainer_command, run_type=str(RunType.TRAINER))
|
||||
worker_run_type_params = RunTypeParameters(args.image, rollout_command, run_type=str(RunType.ROLLOUT_WORKER), num_replicas=args.num_workers)
|
||||
trainer_run_type_params = RunTypeParameters(args.image, trainer_command, run_type=str(RunType.TRAINER))
|
||||
|
||||
orchestration_params = KubernetesParameters([worker_run_type_params, trainer_run_type_params],
|
||||
kubeconfig='~/.kube/config',
|
||||
memory_backend_parameters=memory_backend_params,
|
||||
data_store_params=ds_params_instance)
|
||||
orchestrator = Kubernetes(orchestration_params)
|
||||
if not orchestrator.setup():
|
||||
print("Could not setup.")
|
||||
return
|
||||
orchestration_params = KubernetesParameters([worker_run_type_params, trainer_run_type_params],
|
||||
kubeconfig='~/.kube/config',
|
||||
memory_backend_parameters=memory_backend_params,
|
||||
data_store_params=ds_params_instance)
|
||||
orchestrator = Kubernetes(orchestration_params)
|
||||
if not orchestrator.setup():
|
||||
print("Could not setup.")
|
||||
return
|
||||
|
||||
if orchestrator.deploy_trainer():
|
||||
print("Successfully deployed trainer.")
|
||||
else:
|
||||
print("Could not deploy trainer.")
|
||||
return
|
||||
if orchestrator.deploy_trainer():
|
||||
print("Successfully deployed trainer.")
|
||||
else:
|
||||
print("Could not deploy trainer.")
|
||||
return
|
||||
|
||||
if orchestrator.deploy_worker():
|
||||
print("Successfully deployed rollout worker(s).")
|
||||
else:
|
||||
print("Could not deploy rollout worker(s).")
|
||||
return
|
||||
if orchestrator.deploy_worker():
|
||||
print("Successfully deployed rollout worker(s).")
|
||||
else:
|
||||
print("Could not deploy rollout worker(s).")
|
||||
return
|
||||
|
||||
try:
|
||||
orchestrator.trainer_logs()
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
try:
|
||||
orchestrator.trainer_logs()
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
|
||||
orchestrator.undeploy()
|
||||
orchestrator.undeploy()
|
||||
|
||||
|
||||
class CoachLauncher(object):
|
||||
@@ -192,7 +192,6 @@ class CoachLauncher(object):
|
||||
graph_manager = self.get_graph_manager_from_args(args)
|
||||
self.run_graph_manager(graph_manager, args)
|
||||
|
||||
|
||||
def get_graph_manager_from_args(self, args: argparse.Namespace) -> 'GraphManager':
|
||||
"""
|
||||
Return the graph manager according to the command line arguments given by the user.
|
||||
@@ -251,7 +250,6 @@ class CoachLauncher(object):
|
||||
|
||||
return graph_manager
|
||||
|
||||
|
||||
def display_all_presets_and_exit(self):
|
||||
# list available presets
|
||||
screen.log_title("Available Presets:")
|
||||
@@ -259,7 +257,6 @@ class CoachLauncher(object):
|
||||
print(preset)
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
def expand_preset(self, preset):
|
||||
"""
|
||||
Replace a short preset name with the full python path, and verify that it can be imported.
|
||||
@@ -287,7 +284,6 @@ class CoachLauncher(object):
|
||||
|
||||
return preset
|
||||
|
||||
|
||||
def get_config_args(self, parser: argparse.ArgumentParser) -> argparse.Namespace:
|
||||
"""
|
||||
Returns a Namespace object with all the user-specified configuration options needed to launch.
|
||||
@@ -317,7 +313,6 @@ class CoachLauncher(object):
|
||||
if args.list:
|
||||
self.display_all_presets_and_exit()
|
||||
|
||||
|
||||
# Read args from config file for distributed Coach.
|
||||
if args.distributed_coach and args.distributed_coach_run_type == RunType.ORCHESTRATOR:
|
||||
coach_config = ConfigParser({
|
||||
@@ -401,7 +396,6 @@ class CoachLauncher(object):
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def get_argument_parser(self) -> argparse.ArgumentParser:
|
||||
"""
|
||||
This returns an ArgumentParser object which defines the set of options that customers are expected to supply in order
|
||||
@@ -545,7 +539,6 @@ class CoachLauncher(object):
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def run_graph_manager(self, graph_manager: 'GraphManager', args: argparse.Namespace):
|
||||
if args.distributed_coach and not graph_manager.agent_params.algorithm.distributed_coach_synchronization_type:
|
||||
screen.error("{} algorithm is not supported using distributed Coach.".format(graph_manager.agent_params.algorithm))
|
||||
@@ -581,7 +574,6 @@ class CoachLauncher(object):
|
||||
else:
|
||||
self.start_multi_threaded(graph_manager, args)
|
||||
|
||||
|
||||
def start_single_threaded(self, graph_manager: 'GraphManager', args: argparse.Namespace):
|
||||
# Start the training or evaluation
|
||||
task_parameters = TaskParameters(
|
||||
@@ -598,7 +590,6 @@ class CoachLauncher(object):
|
||||
|
||||
start_graph(graph_manager=graph_manager, task_parameters=task_parameters)
|
||||
|
||||
|
||||
def start_multi_threaded(self, graph_manager: 'GraphManager', args: argparse.Namespace):
|
||||
total_tasks = args.num_workers
|
||||
if args.evaluation_worker:
|
||||
|
||||
@@ -260,6 +260,7 @@ class EnvResponse(object):
|
||||
"""
|
||||
An env response is a collection containing the information returned from the environment after a single action
|
||||
has been performed on it.
|
||||
|
||||
:param next_state: The new state that the environment has transitioned into. Assumed to be a dictionary where the
|
||||
observation is located at state['observation']
|
||||
:param reward: The reward received from the environment
|
||||
@@ -350,11 +351,13 @@ class ActionInfo(object):
|
||||
|
||||
|
||||
class Batch(object):
"""
A wrapper around a list of transitions that helps with extracting batches of parameters from it.
For example, one can extract a list of states corresponding to the list of transitions.
The class uses lazy evaluation in order to return each of the available parameters.
"""
def __init__(self, transitions: List[Transition]):
"""
A wrapper around a list of transitions that helps with extracting batches of parameters from it.
For example, one can extract a list of states corresponding to the list of transitions.
The class uses lazy evaluation in order to return each of the available parameters.
:param transitions: a list of transitions to extract the batch from
"""
self.transitions = transitions
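A minimal usage sketch of this wrapper, assuming the Transition keyword arguments and the batch accessor names that appear elsewhere in this file:

```python
# Hedged sketch: building a Batch from transitions and lazily extracting arrays.
import numpy as np
from rl_coach.core_types import Transition, Batch

transitions = [
    Transition(state={'observation': np.random.rand(4)},
               action=0,
               reward=1.0,
               next_state={'observation': np.random.rand(4)},
               game_over=False)
    for _ in range(32)
]

batch = Batch(transitions)
states = batch.states(['observation'])   # dict with a stacked 'observation' array
rewards = batch.rewards()                # np.ndarray of shape (32,)
game_overs = batch.game_overs()          # np.ndarray of terminal flags
```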
|
||||
@@ -370,6 +373,7 @@ class Batch(object):
|
||||
def slice(self, start, end) -> None:
|
||||
"""
|
||||
Keep a slice from the batch and discard the rest of the batch
|
||||
|
||||
:param start: the start index in the slice
|
||||
:param end: the end index in the slice
|
||||
:return: None
|
||||
@@ -396,6 +400,7 @@ class Batch(object):
|
||||
def shuffle(self) -> None:
|
||||
"""
|
||||
Shuffle all the transitions in the batch
|
||||
|
||||
:return: None
|
||||
"""
|
||||
batch_order = list(range(self.size))
|
||||
@@ -432,6 +437,7 @@ class Batch(object):
|
||||
"""
|
||||
follow the keys in fetches to extract the corresponding items from the states in the batch
|
||||
if these keys were not already extracted before. return only the values corresponding to those keys
|
||||
|
||||
:param fetches: the keys of the state dictionary to extract
|
||||
:param expand_dims: add an extra dimension to each of the value batches
|
||||
:return: a dictionary containing a batch of values corresponding to each of the given fetches keys
|
||||
@@ -452,6 +458,7 @@ class Batch(object):
|
||||
def actions(self, expand_dims=False) -> np.ndarray:
|
||||
"""
|
||||
if the actions were not converted to a batch before, extract them to a batch and then return the batch
|
||||
|
||||
:param expand_dims: add an extra dimension to the actions batch
|
||||
:return: a numpy array containing all the actions of the batch
|
||||
"""
|
||||
@@ -464,6 +471,7 @@ class Batch(object):
|
||||
def rewards(self, expand_dims=False) -> np.ndarray:
|
||||
"""
|
||||
if the rewards were not converted to a batch before, extract them to a batch and then return the batch
|
||||
|
||||
:param expand_dims: add an extra dimension to the rewards batch
|
||||
:return: a numpy array containing all the rewards of the batch
|
||||
"""
|
||||
@@ -491,6 +499,7 @@ class Batch(object):
|
||||
def game_overs(self, expand_dims=False) -> np.ndarray:
|
||||
"""
|
||||
if the game_overs were not converted to a batch before, extract them to a batch and then return the batch
|
||||
|
||||
:param expand_dims: add an extra dimension to the game_overs batch
|
||||
:return: a numpy array containing all the game over flags of the batch
|
||||
"""
|
||||
@@ -504,6 +513,7 @@ class Batch(object):
|
||||
"""
|
||||
follow the keys in fetches to extract the corresponding items from the next states in the batch
|
||||
if these keys were not already extracted before. return only the values corresponding to those keys
|
||||
|
||||
:param fetches: the keys of the state dictionary to extract
|
||||
:param expand_dims: add an extra dimension to each of the value batches
|
||||
:return: a dictionary containing a batch of values corresponding to each of the given fetches keys
|
||||
@@ -526,6 +536,7 @@ class Batch(object):
|
||||
"""
|
||||
if the goals were not converted to a batch before, extract them to a batch and then return the batch
|
||||
if the goal was not filled, this will raise an exception
|
||||
|
||||
:param expand_dims: add an extra dimension to the goals batch
|
||||
:return: a numpy array containing all the goals of the batch
|
||||
"""
|
||||
@@ -549,6 +560,7 @@ class Batch(object):
|
||||
"""
|
||||
if the given info dictionary key was not converted to a batch before, extract it to a batch and then return the
|
||||
batch. if the key is not part of the keys in the info dictionary, this will raise an exception
|
||||
|
||||
:param expand_dims: add an extra dimension to the info batch
|
||||
:return: a numpy array containing all the info values of the batch corresponding to the given key
|
||||
"""
|
||||
@@ -568,6 +580,7 @@ class Batch(object):
|
||||
def __getitem__(self, key):
|
||||
"""
|
||||
get an item from the transitions list
|
||||
|
||||
:param key: index of the transition in the batch
|
||||
:return: the transition corresponding to the given index
|
||||
"""
|
||||
@@ -576,6 +589,7 @@ class Batch(object):
|
||||
def __setitem__(self, key, item):
|
||||
"""
|
||||
set an item in the transition list
|
||||
|
||||
:param key: index of the transition in the batch
|
||||
:param item: the transition to place in the given index
|
||||
:return: None
|
||||
@@ -598,6 +612,7 @@ class TotalStepsCounter(object):
|
||||
def __getitem__(self, key: Type[StepMethod]) -> int:
|
||||
"""
|
||||
get counter value
|
||||
|
||||
:param key: counter type
|
||||
:return: the counter value
|
||||
"""
|
||||
@@ -606,6 +621,7 @@ class TotalStepsCounter(object):
|
||||
def __setitem__(self, key: StepMethod, item: int) -> None:
|
||||
"""
|
||||
set a counter value
|
||||
|
||||
:param key: counter type
|
||||
:param item: an integer representing the new counter value
|
||||
:return: None
|
||||
@@ -626,6 +642,9 @@ class GradientClippingMethod(Enum):
|
||||
|
||||
|
||||
class Episode(object):
|
||||
"""
|
||||
An Episode represents a set of sequential transitions, that end with a terminal state.
|
||||
"""
|
||||
def __init__(self, discount: float=0.99, bootstrap_total_return_from_old_policy: bool=False, n_step: int=-1):
|
||||
"""
|
||||
:param discount: the discount factor to use when calculating total returns
|
||||
@@ -634,38 +653,78 @@ class Episode(object):
|
||||
:param n_step: the number of future steps to sum the reward over before bootstrapping
|
||||
"""
|
||||
self.transitions = []
|
||||
# a num_transitions x num_transitions table with the n step return in the n'th row
|
||||
self._length = 0
|
||||
self.discount = discount
|
||||
self.bootstrap_total_return_from_old_policy = bootstrap_total_return_from_old_policy
|
||||
self.n_step = n_step
|
||||
self.is_complete = False
|
||||
|
||||
def insert(self, transition):
|
||||
def insert(self, transition: Transition) -> None:
|
||||
"""
|
||||
Insert a new transition to the episode. If the game_over flag in the transition is set to True,
|
||||
the episode will be marked as complete.
|
||||
|
||||
:param transition: The new transition to insert to the episode
|
||||
:return: None
|
||||
"""
|
||||
self.transitions.append(transition)
|
||||
self._length += 1
|
||||
if transition.game_over:
|
||||
self.is_complete = True
|
||||
|
||||
def is_empty(self):
|
||||
def is_empty(self) -> bool:
|
||||
"""
|
||||
Check if the episode is empty
|
||||
|
||||
:return: A boolean value determining if the episode is empty or not
|
||||
"""
|
||||
return self.length() == 0
|
||||
|
||||
def length(self):
|
||||
def length(self) -> int:
|
||||
"""
|
||||
Return the length of the episode, which is the number of transitions it holds.
|
||||
|
||||
:return: The number of transitions in the episode
|
||||
"""
|
||||
return self._length
|
||||
|
||||
def __len__(self):
|
||||
return self.length()
|
||||
|
||||
def get_transition(self, transition_idx):
|
||||
def get_transition(self, transition_idx: int) -> Transition:
|
||||
"""
|
||||
Get a specific transition by its index.
|
||||
|
||||
:param transition_idx: The index of the transition to get
|
||||
:return: The transition which is stored in the given index
|
||||
"""
|
||||
return self.transitions[transition_idx]
|
||||
|
||||
def get_last_transition(self):
def get_last_transition(self) -> Transition:
"""
Get the last transition in the episode, or None if there are no transitions available

:return: The last transition in the episode
"""
return self.get_transition(-1) if self.length() > 0 else None

def get_first_transition(self):
def get_first_transition(self) -> Transition:
"""
Get the first transition in the episode, or None if there are no transitions available

:return: The first transition in the episode
"""
return self.get_transition(0) if self.length() > 0 else None
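A short usage sketch of the Episode API, also illustrating the n-step bootstrapping described for update_discounted_rewards further below (Transition keyword arguments are assumed from the class name):

```python
# Hedged sketch: filling an Episode and computing its discounted returns.
from rl_coach.core_types import Transition, Episode

episode = Episode(discount=0.99, n_step=-1)   # n_step=-1 -> full-episode return
for step in range(5):
    episode.insert(Transition(state={'observation': [step]},
                              action=0,
                              reward=1.0,
                              game_over=(step == 4)))

assert episode.is_complete and episode.length() == 5
episode.update_discounted_rewards()

# For reference, the full discounted return of the first transition is
# sum(0.99 ** k * 1.0 for k in range(5)) ~= 4.901
```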
|
||||
|
||||
def update_discounted_rewards(self):
|
||||
"""
|
||||
Update the discounted returns for all the transitions in the episode.
|
||||
The returns will be calculated according to the rewards of each transition, together with the number of steps
|
||||
to bootstrap from and the discount factor, as defined by n_step and discount respectively when initializing
|
||||
the episode.
|
||||
|
||||
:return: None
|
||||
"""
|
||||
if self.n_step == -1 or self.n_step > self.length():
|
||||
curr_n_step = self.length()
|
||||
else:
|
||||
@@ -708,15 +767,17 @@ class Episode(object):
|
||||
|
||||
self.update_discounted_rewards()
|
||||
|
||||
def update_actions_probabilities(self):
|
||||
probability_product = 1
|
||||
for transition_idx, transition in enumerate(self.transitions):
|
||||
if 'action_probabilities' in transition.info.keys():
|
||||
probability_product *= transition.info['action_probabilities']
|
||||
for transition_idx, transition in enumerate(self.transitions):
|
||||
transition.info['probability_product'] = probability_product
|
||||
|
||||
def get_transitions_attribute(self, attribute_name):
|
||||
|
||||
def get_transitions_attribute(self, attribute_name: str) -> List[Any]:
|
||||
"""
|
||||
Get the values for some transition attribute from all the transitions in the episode.
|
||||
For example, this allows getting the rewards for all the transitions as a list by calling
|
||||
get_transitions_attribute('reward')
|
||||
|
||||
:param attribute_name: The name of the attribute to extract from all the transitions
|
||||
:return: A list of values from all the transitions according to the attribute given in attribute_name
|
||||
"""
|
||||
if len(self.transitions) > 0 and hasattr(self.transitions[0], attribute_name):
|
||||
return [getattr(t, attribute_name) for t in self.transitions]
|
||||
elif len(self.transitions) == 0:
|
||||
@@ -724,12 +785,6 @@ class Episode(object):
|
||||
else:
|
||||
raise ValueError("The transitions have no such attribute name")
|
||||
|
||||
def to_batch(self):
|
||||
batch = []
|
||||
for i in range(self.length()):
|
||||
batch.append(self.get_transition(i))
|
||||
return batch
|
||||
|
||||
def __getitem__(self, sliced):
|
||||
return self.transitions[sliced]
|
||||
|
||||
|
||||
@@ -69,6 +69,38 @@ class ControlSuiteEnvironment(Environment):
|
||||
target_success_rate: float=1.0, seed: Union[None, int]=None, human_control: bool=False,
|
||||
observation_type: ObservationType=ObservationType.Measurements,
|
||||
custom_reward_threshold: Union[int, float]=None, **kwargs):
|
||||
"""
|
||||
:param level: (str)
|
||||
A string representing the control suite level to run. This can also be a LevelSelection object.
|
||||
For example, cartpole:swingup.
|
||||
|
||||
:param frame_skip: (int)
|
||||
The number of frames to skip between any two actions given by the agent. The action will be repeated
|
||||
for all the skipped frames.
|
||||
|
||||
:param visualization_parameters: (VisualizationParameters)
|
||||
The parameters used for visualizing the environment, such as the render flag, storing videos etc.
|
||||
|
||||
:param target_success_rate: (float)
|
||||
Stop experiment if given target success rate was achieved.
|
||||
|
||||
:param seed: (int)
|
||||
A seed to use for the random number generator when running the environment.
|
||||
|
||||
:param human_control: (bool)
|
||||
A flag that allows controlling the environment using the keyboard keys.
|
||||
|
||||
:param observation_type: (ObservationType)
|
||||
An enum which defines which observation to use. The current options are to use:
|
||||
* Measurements only - a vector of joint torques and similar measurements
|
||||
* Image only - an image of the environment as seen by a camera attached to the simulator
|
||||
* Measurements & Image - both types of observations will be returned in the state using the keys
|
||||
'measurements' and 'pixels' respectively.
|
||||
|
||||
:param custom_reward_threshold: (float)
|
||||
Allows defining a custom reward threshold that will be used to decide when the agent succeeded in passing the environment.
|
||||
|
||||
"""
|
||||
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters, target_success_rate)
|
||||
|
||||
self.observation_type = observation_type
|
||||
|
||||
@@ -125,6 +125,36 @@ class DoomEnvironment(Environment):
|
||||
def __init__(self, level: LevelSelection, seed: int, frame_skip: int, human_control: bool,
|
||||
custom_reward_threshold: Union[int, float], visualization_parameters: VisualizationParameters,
|
||||
cameras: List[CameraTypes], target_success_rate: float=1.0, **kwargs):
|
||||
"""
|
||||
:param level: (str)
|
||||
A string representing the doom level to run. This can also be a LevelSelection object.
|
||||
This should be one of the levels defined in the DoomLevel enum. For example, HEALTH_GATHERING.
|
||||
|
||||
:param seed: (int)
|
||||
A seed to use for the random number generator when running the environment.
|
||||
|
||||
:param frame_skip: (int)
|
||||
The number of frames to skip between any two actions given by the agent. The action will be repeated
|
||||
for all the skipped frames.
|
||||
|
||||
:param human_control: (bool)
|
||||
A flag that allows controlling the environment using the keyboard keys.
|
||||
|
||||
:param custom_reward_threshold: (float)
|
||||
Allows defining a custom reward threshold that will be used to decide when the agent succeeded in passing the environment.
|
||||
|
||||
:param visualization_parameters: (VisualizationParameters)
|
||||
The parameters used for visualizing the environment, such as the render flag, storing videos etc.
|
||||
|
||||
:param cameras: (List[CameraTypes])
|
||||
A list of camera types to use as observation in the state returned from the environment.
|
||||
Each camera should be an enum from CameraTypes, and there are several options like an RGB observation,
|
||||
a depth map, a segmentation map, and a top-down map of the environment.
|
||||
|
||||
:param target_success_rate: (float)
|
||||
Stop experiment if given target success rate was achieved.
|
||||
|
||||
"""
|
||||
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters, target_success_rate)
|
||||
|
||||
self.cameras = cameras
|
||||
|
||||
@@ -176,6 +176,7 @@ class Environment(EnvironmentInterface):
|
||||
def action_space(self) -> Union[List[ActionSpace], ActionSpace]:
|
||||
"""
|
||||
Get the action space of the environment
|
||||
|
||||
:return: the action space
|
||||
"""
|
||||
return self._action_space
|
||||
@@ -184,6 +185,7 @@ class Environment(EnvironmentInterface):
|
||||
def action_space(self, val: Union[List[ActionSpace], ActionSpace]):
|
||||
"""
|
||||
Set the action space of the environment
|
||||
|
||||
:return: None
|
||||
"""
|
||||
self._action_space = val
|
||||
@@ -192,6 +194,7 @@ class Environment(EnvironmentInterface):
|
||||
def state_space(self) -> Union[List[StateSpace], StateSpace]:
|
||||
"""
|
||||
Get the state space of the environment
|
||||
|
||||
:return: the observation space
|
||||
"""
|
||||
return self._state_space
|
||||
@@ -200,6 +203,7 @@ class Environment(EnvironmentInterface):
|
||||
def state_space(self, val: Union[List[StateSpace], StateSpace]):
|
||||
"""
|
||||
Set the state space of the environment
|
||||
|
||||
:return: None
|
||||
"""
|
||||
self._state_space = val
|
||||
@@ -208,6 +212,7 @@ class Environment(EnvironmentInterface):
|
||||
def goal_space(self) -> Union[List[ObservationSpace], ObservationSpace]:
|
||||
"""
|
||||
Get the goal space of the environment

:return: the goal space
|
||||
"""
|
||||
return self._goal_space
|
||||
@@ -216,6 +221,7 @@ class Environment(EnvironmentInterface):
|
||||
def goal_space(self, val: Union[List[ObservationSpace], ObservationSpace]):
|
||||
"""
|
||||
Set the goal space of the environment
|
||||
|
||||
:return: None
|
||||
"""
|
||||
self._goal_space = val
|
||||
@@ -223,6 +229,7 @@ class Environment(EnvironmentInterface):
|
||||
def get_action_from_user(self) -> ActionType:
|
||||
"""
|
||||
Get an action from the user keyboard
|
||||
|
||||
:return: action index
|
||||
"""
|
||||
if self.wait_for_explicit_human_action:
|
||||
@@ -250,6 +257,7 @@ class Environment(EnvironmentInterface):
|
||||
def last_env_response(self) -> Union[List[EnvResponse], EnvResponse]:
|
||||
"""
|
||||
Get the last environment response
|
||||
|
||||
:return: a dictionary that contains the state, reward, etc.
|
||||
"""
|
||||
return squeeze_list(self._last_env_response)
|
||||
@@ -258,6 +266,7 @@ class Environment(EnvironmentInterface):
|
||||
def last_env_response(self, val: Union[List[EnvResponse], EnvResponse]):
|
||||
"""
|
||||
Set the last environment response
|
||||
|
||||
:param val: the last environment response
|
||||
"""
|
||||
self._last_env_response = force_list(val)
|
||||
@@ -265,6 +274,7 @@ class Environment(EnvironmentInterface):
|
||||
def step(self, action: ActionType) -> EnvResponse:
|
||||
"""
|
||||
Make a single step in the environment using the given action
|
||||
|
||||
:param action: an action to use for stepping the environment. Should follow the definition of the action space.
|
||||
:return: the environment response as returned in get_last_env_response
|
||||
"""
|
||||
@@ -317,6 +327,8 @@ class Environment(EnvironmentInterface):
|
||||
def render(self) -> None:
|
||||
"""
|
||||
Call the environment function for rendering to the screen
|
||||
|
||||
:return: None
|
||||
"""
|
||||
if self.native_rendering:
|
||||
self._render()
|
||||
@@ -326,6 +338,7 @@ class Environment(EnvironmentInterface):
|
||||
def handle_episode_ended(self) -> None:
|
||||
"""
|
||||
End an episode
|
||||
|
||||
:return: None
|
||||
"""
|
||||
self.dump_video_of_last_episode_if_needed()
|
||||
@@ -333,6 +346,7 @@ class Environment(EnvironmentInterface):
|
||||
def reset_internal_state(self, force_environment_reset=False) -> EnvResponse:
|
||||
"""
|
||||
Reset the environment and all the variable of the wrapper
|
||||
|
||||
:param force_environment_reset: forces environment reset even when the game did not end
|
||||
:return: A dictionary containing the observation, reward, done flag, action and measurements
|
||||
"""
|
||||
@@ -368,6 +382,7 @@ class Environment(EnvironmentInterface):
|
||||
def get_random_action(self) -> ActionType:
|
||||
"""
|
||||
Returns an action picked uniformly from the available actions
|
||||
|
||||
:return: a numpy array with a random action
|
||||
"""
|
||||
return self.action_space.sample()
|
||||
@@ -375,6 +390,7 @@ class Environment(EnvironmentInterface):
|
||||
def get_available_keys(self) -> List[Tuple[str, ActionType]]:
|
||||
"""
|
||||
Return a list of tuples mapping between action names and the keyboard key that triggers them
|
||||
|
||||
:return: a list of tuples mapping between action names and the keyboard key that triggers them
|
||||
"""
|
||||
available_keys = []
|
||||
@@ -391,6 +407,7 @@ class Environment(EnvironmentInterface):
|
||||
def get_goal(self) -> GoalType:
|
||||
"""
|
||||
Get the current goal that the agents needs to achieve in the environment
|
||||
|
||||
:return: The goal
|
||||
"""
|
||||
return self.goal
|
||||
@@ -398,6 +415,7 @@ class Environment(EnvironmentInterface):
|
||||
def set_goal(self, goal: GoalType) -> None:
|
||||
"""
|
||||
Set the current goal that the agent needs to achieve in the environment
|
||||
|
||||
:param goal: the goal that needs to be achieved
|
||||
:return: None
|
||||
"""
|
||||
@@ -424,14 +442,6 @@ class Environment(EnvironmentInterface):
|
||||
if self.visualization_parameters.dump_mp4:
|
||||
logger.create_mp4(self.last_episode_images[::frame_skipping], name=file_name, fps=fps)
|
||||
|
||||
def log_to_screen(self):
|
||||
# log to screen
|
||||
log = OrderedDict()
|
||||
log["Episode"] = self.episode_idx
|
||||
log["Total reward"] = np.round(self.total_reward_in_current_episode, 2)
|
||||
log["Steps"] = self.total_steps_counter
|
||||
screen.log_dict(log, prefix=self.phase.value)
|
||||
|
||||
# The following functions define the interaction with the environment.
|
||||
# Any new environment that inherits the Environment class should use these signatures.
|
||||
# Some of these functions are optional - please read their description for more details.
|
||||
@@ -439,6 +449,7 @@ class Environment(EnvironmentInterface):
|
||||
def _take_action(self, action_idx: ActionType) -> None:
|
||||
"""
|
||||
An environment dependent function that sends an action to the simulator.
|
||||
|
||||
:param action_idx: the action to perform on the environment
|
||||
:return: None
|
||||
"""
|
||||
@@ -448,6 +459,7 @@ class Environment(EnvironmentInterface):
|
||||
"""
|
||||
Updates the state from the environment.
|
||||
Should update self.observation, self.reward, self.done, self.measurements and self.info
|
||||
|
||||
:return: None
|
||||
"""
|
||||
raise NotImplementedError("")
|
||||
@@ -455,6 +467,7 @@ class Environment(EnvironmentInterface):
|
||||
def _restart_environment_episode(self, force_environment_reset=False) -> None:
|
||||
"""
|
||||
Restarts the simulator episode
|
||||
|
||||
:param force_environment_reset: Force the environment to reset even if the episode is not done yet.
|
||||
:return: None
|
||||
"""
|
||||
@@ -463,6 +476,7 @@ class Environment(EnvironmentInterface):
|
||||
def _render(self) -> None:
|
||||
"""
|
||||
Renders the environment using the native simulator renderer
|
||||
|
||||
:return: None
|
||||
"""
|
||||
pass
|
||||
@@ -471,6 +485,7 @@ class Environment(EnvironmentInterface):
|
||||
"""
|
||||
Return a numpy array containing the image that will be rendered to the screen.
|
||||
This can be different from the observation. For example, mujoco's observation is a measurements vector.
|
||||
|
||||
:return: numpy array containing the image that will be rendered to the screen
|
||||
"""
|
||||
return np.transpose(self.state['observation'], [1, 2, 0])
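The private methods documented above are the integration points a new environment is expected to override. A minimal sketch under stated assumptions (a toy counter stands in for a real simulator, and a real subclass would also provide an __init__ and matching environment parameters):

```python
# Hedged sketch: the minimal overrides a custom environment would provide,
# following the signatures documented above. The "simulator" is just a counter.
import numpy as np
from rl_coach.environments.environment import Environment

class CountingEnvironment(Environment):
    def _take_action(self, action_idx):
        # apply the action to the (toy) simulator state
        self._counter = getattr(self, '_counter', 0) + int(action_idx)

    def _update_state(self):
        # expose the simulator outcome through the fields Coach reads
        self.state = {'observation': np.array([self._counter], dtype=np.float32)}
        self.reward = 1.0 if self._counter % 2 == 0 else 0.0
        self.done = self._counter >= 10

    def _restart_environment_episode(self, force_environment_reset=False):
        self._counter = 0

    def _render(self):
        print("counter =", getattr(self, '_counter', 0))
```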
|
||||
|
||||
@@ -140,7 +140,7 @@ atari_schedule = ScheduleParameters()
|
||||
atari_schedule.improve_steps = EnvironmentSteps(50000000)
|
||||
atari_schedule.steps_between_evaluation_periods = EnvironmentSteps(250000)
|
||||
atari_schedule.evaluation_steps = EnvironmentSteps(135000)
|
||||
atari_schedule.heatup_steps = EnvironmentSteps(50000)
|
||||
atari_schedule.heatup_steps = EnvironmentSteps(1)
|
||||
|
||||
|
||||
class MaxOverFramesAndFrameskipEnvWrapper(gym.Wrapper):
|
||||
@@ -181,6 +181,41 @@ class GymEnvironment(Environment):
|
||||
target_success_rate: float=1.0, additional_simulator_parameters: Dict[str, Any] = {}, seed: Union[None, int]=None,
|
||||
human_control: bool=False, custom_reward_threshold: Union[int, float]=None,
|
||||
random_initialization_steps: int=1, max_over_num_frames: int=1, **kwargs):
|
||||
"""
|
||||
:param level: (str)
|
||||
A string representing the gym level to run. This can also be a LevelSelection object.
|
||||
For example, BreakoutDeterministic-v0
|
||||
|
||||
:param frame_skip: (int)
|
||||
The number of frames to skip between any two actions given by the agent. The action will be repeated
|
||||
for all the skipped frames.
|
||||
|
||||
:param visualization_parameters: (VisualizationParameters)
|
||||
The parameters used for visualizing the environment, such as the render flag, storing videos etc.
|
||||
|
||||
:param additional_simulator_parameters: (Dict[str, Any])
|
||||
Any additional parameters that the user can pass to the Gym environment. These parameters should be
|
||||
accepted by the __init__ function of the implemented Gym environment.
|
||||
|
||||
:param seed: (int)
|
||||
A seed to use for the random number generator when running the environment.
|
||||
|
||||
:param human_control: (bool)
|
||||
A flag that allows controlling the environment using the keyboard keys.
|
||||
|
||||
:param custom_reward_threshold: (float)
|
||||
Allows defining a custom reward threshold that will be used to decide when the agent succeeded in passing the environment.
|
||||
If not set, this value will be taken from the Gym environment definition.
|
||||
|
||||
:param random_initialization_steps: (int)
|
||||
The number of random steps that will be taken in the environment after each reset.
|
||||
This is a feature presented in the DQN paper, which improves the variability of the episodes the agent sees.
|
||||
|
||||
:param max_over_num_frames: (int)
|
||||
This value will be used for merging multiple frames into a single frame by taking the maximum value for each
|
||||
of the pixels in the frame. This is particularly used in Atari games, where the frames flicker, and objects
|
||||
can be seen in one frame but disappear in the next.
|
||||
"""
|
||||
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold,
|
||||
visualization_parameters, target_success_rate)
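In a preset, these constructor parameters are normally supplied through an environment parameters object rather than by instantiating GymEnvironment directly. A minimal sketch, assuming the GymVectorEnvironment helper and an illustrative level name:

```python
# Hedged sketch: selecting a Gym level in a preset.
from rl_coach.environments.gym_environment import GymVectorEnvironment

env_params = GymVectorEnvironment(level='CartPole-v0')
env_params.frame_skip = 1
env_params.seed = 123
# env_params is later handed to a GraphManager together with the agent parameters
```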
|
||||
|
||||
|
||||
@@ -13,3 +13,43 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from .additive_noise import AdditiveNoiseParameters, AdditiveNoise
|
||||
from .boltzmann import BoltzmannParameters, Boltzmann
|
||||
from .bootstrapped import BootstrappedParameters, Bootstrapped
|
||||
from .categorical import CategoricalParameters, Categorical
|
||||
from .continuous_entropy import ContinuousEntropyParameters, ContinuousEntropy
|
||||
from .e_greedy import EGreedyParameters, EGreedy
|
||||
from .exploration_policy import ExplorationParameters, ExplorationPolicy
|
||||
from .greedy import GreedyParameters, Greedy
|
||||
from .ou_process import OUProcessParameters, OUProcess
|
||||
from .parameter_noise import ParameterNoiseParameters, ParameterNoise
|
||||
from .truncated_normal import TruncatedNormalParameters, TruncatedNormal
|
||||
from .ucb import UCBParameters, UCB
|
||||
|
||||
__all__ = [
|
||||
'AdditiveNoiseParameters',
|
||||
'AdditiveNoise',
|
||||
'BoltzmannParameters',
|
||||
'Boltzmann',
|
||||
'BootstrappedParameters',
|
||||
'Bootstrapped',
|
||||
'CategoricalParameters',
|
||||
'Categorical',
|
||||
'ContinuousEntropyParameters',
|
||||
'ContinuousEntropy',
|
||||
'EGreedyParameters',
|
||||
'EGreedy',
|
||||
'ExplorationParameters',
|
||||
'ExplorationPolicy',
|
||||
'GreedyParameters',
|
||||
'Greedy',
|
||||
'OUProcessParameters',
|
||||
'OUProcess',
|
||||
'ParameterNoiseParameters',
|
||||
'ParameterNoise',
|
||||
'TruncatedNormalParameters',
|
||||
'TruncatedNormal',
|
||||
'UCBParameters',
|
||||
'UCB'
|
||||
]
|
||||
|
||||
@@ -37,6 +37,14 @@ class AdditiveNoiseParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class AdditiveNoise(ExplorationPolicy):
|
||||
"""
|
||||
AdditiveNoise is an exploration policy intended for continuous action spaces. It takes the action from the agent
and adds Gaussian-distributed noise to it. The amount of noise added to the action can be given in two different ways:
1. Specified by the user as a noise schedule which is taken in percentiles out of the action space size
2. Specified by the agent's action. In case the agent's action is a list with 2 values, the 1st one is assumed to
be the mean of the action, and the 2nd is assumed to be its standard deviation.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
|
||||
evaluation_noise_percentage: float):
|
||||
"""
|
||||
|
||||
@@ -36,6 +36,12 @@ class BoltzmannParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class Boltzmann(ExplorationPolicy):
|
||||
"""
|
||||
The Boltzmann exploration policy is intended for discrete action spaces. It assumes that each of the possible
|
||||
actions has some value assigned to it (such as the Q value), and uses a softmax function to convert these values
|
||||
into a distribution over the actions. It then samples the action for playing out of the calculated distribution.
|
||||
An additional temperature schedule can be given by the user, and will control the steepness of the softmax function.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, temperature_schedule: Schedule):
|
||||
"""
|
||||
:param action_space: the action space used by the environment
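The temperature-controlled softmax described above can be illustrated with plain numpy; this is only a sketch of the math, not the policy's actual implementation:

```python
# Hedged sketch of the Boltzmann selection rule: a softmax over action values,
# where a higher temperature flattens the resulting distribution.
import numpy as np

def boltzmann_probabilities(action_values, temperature):
    logits = np.asarray(action_values, dtype=np.float64) / temperature
    logits -= logits.max()              # numerical stability
    exp_logits = np.exp(logits)
    return exp_logits / exp_logits.sum()

q_values = [1.0, 2.0, 0.5]
print(boltzmann_probabilities(q_values, temperature=1.0))   # peaked on action 1
print(boltzmann_probabilities(q_values, temperature=10.0))  # close to uniform
```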
|
||||
|
||||
@@ -39,6 +39,17 @@ class BootstrappedParameters(EGreedyParameters):
|
||||
|
||||
|
||||
class Bootstrapped(EGreedy):
|
||||
"""
|
||||
Bootstrapped exploration policy is currently only used for discrete action spaces along with the
|
||||
Bootstrapped DQN agent. It assumes that there is an ensemble of network heads, where each one predicts the
|
||||
values for all the possible actions. For each episode, a single head is selected to lead the agent, according
|
||||
to its value predictions. In evaluation, the action is selected using a majority vote over all the heads'
|
||||
predictions.
|
||||
|
||||
.. note::
|
||||
This exploration policy will only work for Discrete action spaces with Bootstrapped DQN style agents,
|
||||
since it requires the agent to have a network with multiple heads.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
|
||||
architecture_num_q_heads: int,
|
||||
continuous_exploration_policy_parameters: ExplorationParameters = AdditiveNoiseParameters(),):
|
||||
|
||||
@@ -30,6 +30,12 @@ class CategoricalParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class Categorical(ExplorationPolicy):
|
||||
"""
|
||||
Categorical exploration policy is intended for discrete action spaces. It expects the action values to
|
||||
represent a probability distribution over the actions, from which a single action will be sampled.
In evaluation, the action that has the highest probability will be selected. This is particularly useful for
actor-critic schemes, where the actor's output is a probability distribution over the actions.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace):
|
||||
"""
|
||||
:param action_space: the action space used by the environment
|
||||
|
||||
@@ -24,4 +24,15 @@ class ContinuousEntropyParameters(AdditiveNoiseParameters):
|
||||
|
||||
|
||||
class ContinuousEntropy(AdditiveNoise):
|
||||
"""
|
||||
Continuous entropy is an exploration policy that is actually implemented as part of the network.
|
||||
The exploration policy class is only a placeholder for choosing this policy. The exploration policy is
|
||||
implemented by adding a regularization factor to the network loss, which regularizes the entropy of the action.
|
||||
This exploration policy is only intended for continuous action spaces, and assumes that the entire calculation
|
||||
is implemented as part of the head.
|
||||
|
||||
.. warning::
|
||||
This exploration policy expects the agent or the network to implement the exploration functionality.
|
||||
There are only a few heads that actually are relevant and implement the entropy regularization factor.
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -43,6 +43,19 @@ class EGreedyParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class EGreedy(ExplorationPolicy):
|
||||
"""
|
||||
e-greedy is an exploration policy that is intended for both discrete and continuous action spaces.
|
||||
|
||||
For discrete action spaces, it assumes that each action is assigned a value, and it selects the action with the
highest value with probability 1 - epsilon. Otherwise, it selects an action sampled uniformly out of all the
possible actions. The epsilon value is given by the user and can be given as a schedule.
In evaluation, a different epsilon value can be specified.

For continuous action spaces, it assumes that the mean action is given by the agent. With probability epsilon,
it samples a random action from within the action space bounds. Otherwise, it selects the action according to a
given continuous exploration policy, which is set to AdditiveNoise by default. In evaluation, the action is
always selected according to the given continuous exploration policy (where its phase is set to evaluation as well).
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule,
|
||||
evaluation_epsilon: float,
|
||||
continuous_exploration_policy_parameters: ExplorationParameters=AdditiveNoiseParameters()):
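A minimal sketch of configuring e-greedy exploration in a preset, assuming LinearSchedule from rl_coach.schedules and illustrative decay values:

```python
# Hedged sketch: epsilon decays linearly from 1.0 to 0.01 over 10000 steps,
# with a fixed 0.001 epsilon during evaluation.
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.schedules import LinearSchedule

exploration_params = EGreedyParameters()
exploration_params.epsilon_schedule = LinearSchedule(1.0, 0.01, 10000)
exploration_params.evaluation_epsilon = 0.001
# typically assigned to an agent via: agent_params.exploration = exploration_params
```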
|
||||
|
||||
@@ -31,6 +31,10 @@ class ExplorationParameters(Parameters):
|
||||
|
||||
|
||||
class ExplorationPolicy(object):
|
||||
"""
|
||||
An exploration policy takes the predicted actions or action values from the agent, and selects the action to
|
||||
actually apply to the environment using some predefined algorithm.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace):
|
||||
"""
|
||||
:param action_space: the action space used by the environment
|
||||
|
||||
@@ -30,6 +30,11 @@ class GreedyParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class Greedy(ExplorationPolicy):
|
||||
"""
|
||||
The Greedy exploration policy is intended for both discrete and continuous action spaces.
|
||||
For discrete action spaces, it always selects the action with the maximum value, as given by the agent.
|
||||
For continuous action spaces, it always returns the exact action, as it was given by the agent.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace):
|
||||
"""
|
||||
:param action_space: the action space used by the environment
|
||||
|
||||
@@ -40,6 +40,11 @@ class OUProcessParameters(ExplorationParameters):
|
||||
|
||||
# Ornstein-Uhlenbeck process
|
||||
class OUProcess(ExplorationPolicy):
|
||||
"""
|
||||
OUProcess exploration policy is intended for continuous action spaces, and selects the action according to
|
||||
an Ornstein-Uhlenbeck process. The Ornstein-Uhlenbeck process implements the action as a Gaussian process, where
|
||||
the samples are correlated between consecutive time steps.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, mu: float=0, theta: float=0.15, sigma: float=0.2, dt: float=0.01):
|
||||
"""
|
||||
:param action_space: the action space used by the environment
|
||||
|
||||
@@ -42,10 +42,18 @@ class ParameterNoiseParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class ParameterNoise(ExplorationPolicy):
|
||||
"""
|
||||
The ParameterNoise exploration policy is intended for both discrete and continuous action spaces.
|
||||
It applies the exploration policy by replacing all the dense network layers with noisy layers.
|
||||
The noisy layers have both weight means and weight standard deviations, and for each forward pass of the network
|
||||
the weights are sampled from a normal distribution that follows the learned weights mean and standard deviation
|
||||
values.
|
||||
|
||||
Warning: currently supported only by DQN variants
|
||||
"""
|
||||
def __init__(self, network_params: Dict[str, NetworkParameters], action_space: ActionSpace):
|
||||
"""
|
||||
:param action_space: the action space used by the environment
|
||||
:param alpha0:
|
||||
"""
|
||||
super().__init__(action_space)
|
||||
self.network_params = network_params
|
||||
|
||||
@@ -39,6 +39,16 @@ class TruncatedNormalParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class TruncatedNormal(ExplorationPolicy):
|
||||
"""
|
||||
The TruncatedNormal exploration policy is intended for continuous action spaces. It samples the action from a
normal distribution, where the mean action is given by the agent, and the standard deviation can be given in
two different ways:
1. Specified by the user as a noise schedule which is taken in percentiles out of the action space size
2. Specified by the agent's action. In case the agent's action is a list with 2 values, the 1st one is assumed to
be the mean of the action, and the 2nd is assumed to be its standard deviation.
When the sampled action is outside of the action bounds given by the user, it is sampled again and again, until it
is within the bounds.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
|
||||
evaluation_noise_percentage: float, clip_low: float, clip_high: float):
|
||||
"""
|
||||
|
||||
@@ -43,6 +43,15 @@ class UCBParameters(EGreedyParameters):


class UCB(EGreedy):
"""
The UCB exploration policy follows the upper confidence bound heuristic to sample actions in discrete action spaces.
It assumes that there are multiple network heads that are predicting action values, and that the standard deviation
between the heads' predictions represents the uncertainty of the agent in each of the actions.
It then updates the action value estimates to be mean(actions)+lambda*stdev(actions), where lambda is
given by the user. This exploration policy aims to take advantage of the uncertainty of the agent in its predictions,
and selects the action according to the tradeoff between how uncertain the agent is, and how large it predicts
the outcome from those actions to be.
"""
def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
architecture_num_q_heads: int, lamb: int,
continuous_exploration_policy_parameters: ExplorationParameters = AdditiveNoiseParameters()):

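The mean(actions)+lambda*stdev(actions) update can be illustrated with a small numpy sketch (not the Coach implementation; `q_heads` is a hypothetical matrix of per-head value predictions):

```python
import numpy as np

q_heads = np.random.randn(5, 4)  # 5 network heads, each predicting values for 4 actions
lamb = 1.0                       # the 'lamb' tradeoff coefficient from the constructor above

ucb_values = q_heads.mean(axis=0) + lamb * q_heads.std(axis=0)
action = int(np.argmax(ucb_values))  # prefer actions that are promising and/or uncertain
```
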
@@ -0,0 +1,14 @@
from .attention_discretization import AttentionDiscretization
from .box_discretization import BoxDiscretization
from .box_masking import BoxMasking
from .full_discrete_action_space_map import FullDiscreteActionSpaceMap
from .linear_box_to_box_map import LinearBoxToBoxMap
from .partial_discrete_action_space_map import PartialDiscreteActionSpaceMap
__all__ = [
'AttentionDiscretization',
'BoxDiscretization',
'BoxMasking',
'FullDiscreteActionSpaceMap',
'LinearBoxToBoxMap',
'PartialDiscreteActionSpaceMap'
]
@@ -25,11 +25,18 @@ from rl_coach.spaces import AttentionActionSpace, BoxActionSpace, DiscreteAction

class AttentionDiscretization(PartialDiscreteActionSpaceMap):
"""
Given a box action space, this is used to discretize the space.
The discretization is achieved by creating a grid in the space with num_bins_per_dimension bins per dimension in the
space. Each discrete action is mapped to a single sub-box in the BoxActionSpace action space.
Discretizes an **AttentionActionSpace**. The attention action space defines the actions
as choosing sub-boxes in a given box. For example, consider an image of size 100x100, where the action is choosing
a crop window of size 20x20 to attend to in the image. AttentionDiscretization allows discretizing the possible crop
windows to choose from into a finite number of options, and maps a discrete action space into those crop windows.

Warning! This will currently only work for attention spaces with 2 dimensions.
"""
def __init__(self, num_bins_per_dimension: Union[int, List[int]], force_int_bins=False):
"""
:param num_bins_per_dimension: Number of discrete bins to use for each dimension of the action space
:param force_int_bins: If set to True, all the bins will represent integer coordinates in space.
"""
# we allow specifying either a single number for all dimensions, or a single number per dimension in the target
# action space
self.num_bins_per_dimension = num_bins_per_dimension

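For the 100x100 image with 20x20 crop windows mentioned above, the discretization can be pictured roughly as follows (an illustrative sketch, not the Coach implementation):

```python
import numpy as np

bins_per_dim = 5                                  # 5x5 grid -> 25 discrete actions
corners = np.linspace(0, 100 - 20, bins_per_dim)  # top-left corners: [0, 20, 40, 60, 80]
crop_windows = [(int(y), int(x), int(y) + 20, int(x) + 20)  # (top, left, bottom, right)
                for y in corners for x in corners]

crop_windows[12]  # the crop window chosen by discrete action 12 -> (40, 40, 60, 60)
```
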
@@ -25,9 +25,12 @@ from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace

class BoxDiscretization(PartialDiscreteActionSpaceMap):
"""
Given a box action space, this is used to discretize the space.
The discretization is achieved by creating a grid in the space with num_bins_per_dimension bins per dimension in the
space. Each discrete action is mapped to a single N dimensional action in the BoxActionSpace action space.
Discretizes a continuous action space into a discrete action space, allowing the usage of
agents such as DQN for continuous environments such as MuJoCo. Given the number of bins to discretize into, the
original continuous action space is uniformly separated into the given number of bins, each mapped to a discrete
action index. Each discrete action is mapped to a single N dimensional action in the BoxActionSpace action space.
For example, if the original action space is between -1 and 1 and 5 bins were selected, the new action
space will consist of 5 actions mapped to -1, -0.5, 0, 0.5 and 1.
"""
def __init__(self, num_bins_per_dimension: Union[int, List[int]], force_int_bins=False):
"""

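The -1 to 1 example above can be sketched as follows (illustrative only):

```python
import numpy as np

bins = np.linspace(-1.0, 1.0, num=5)   # [-1. , -0.5,  0. ,  0.5,  1. ]

def discrete_to_continuous(discrete_action: int) -> float:
    # each discrete action index maps to one point on the uniform grid
    return float(bins[discrete_action])

discrete_to_continuous(3)  # -> 0.5
```
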
@@ -25,12 +25,10 @@ from rl_coach.spaces import BoxActionSpace

class BoxMasking(ActionFilter):
"""
Masks a box action space by allowing only selecting a subset of the space
For example,
- the target action space has actions of shape 1 with values between 10 and 32
- we mask the target action space so that only the actions 20 to 25 can be chosen
The actions will be between 0 and 5 and the mapping will add an offset of 20 to the incoming actions
The shape of the source and target action spaces is always the same
Masks part of the action space to force the agent to work in a defined part of the space. For example,
if the original action space is between -1 and 1, then this filter can be used in order to constrain the agent actions
to the range 0 and 1 instead. This essentially masks the range -1 and 0 from the agent.
The resulting action space will be shifted and will always start from 0 and have the size of the unmasked area.
"""
def __init__(self,
masked_target_space_low: Union[None, int, float, np.ndarray],

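Using the 10-32 / 20-25 example from the original docstring, the mapping amounts to a simple shift (an illustrative sketch, not the Coach implementation):

```python
import numpy as np

masked_low, masked_high = 20.0, 25.0   # the allowed sub-range of the target space

def agent_to_env_action(agent_action: float) -> float:
    # the agent acts in [0, 5]; shift it back into the unmasked range [20, 25]
    return float(np.clip(agent_action + masked_low, masked_low, masked_high))

agent_to_env_action(3.0)  # -> 23.0
```
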
@@ -20,7 +20,9 @@ from rl_coach.spaces import ActionSpace, DiscreteActionSpace

class FullDiscreteActionSpaceMap(PartialDiscreteActionSpaceMap):
"""
Maps all the actions in the output space to discrete actions in the action space.
Full map of two countable action spaces. This works in a similar way to the
PartialDiscreteActionSpaceMap, but maps the entire source action space into the entire target action space, without
masking any actions.
For example, if there are 10 multiselect actions in the output space, the actions 0-9 will be mapped to those
multiselect actions.
"""

@@ -25,17 +25,19 @@ from rl_coach.spaces import BoxActionSpace

class LinearBoxToBoxMap(ActionFilter):
"""
Maps a box action space to a box action space.
For example,
- the source action space has actions of shape 1 with values between -42 and -10,
- the target action space has actions of shape 1 with values between 10 and 32
The mapping will add an offset of 52 to the incoming actions and then multiply them by 22/32 to scale them to the
target action space
The shape of the source and target action spaces is always the same
A linear mapping of two box action spaces. For example, if the action space of the
environment consists of continuous actions between 0 and 1, and we want the agent to choose actions between -1 and 1,
the LinearBoxToBoxMap can be used to map the range -1 and 1 to the range 0 and 1 in a linear way. This means that the
action -1 will be mapped to 0, the action 1 will be mapped to 1, and the rest of the actions will be linearly mapped
between those values.
"""
def __init__(self,
input_space_low: Union[None, int, float, np.ndarray],
input_space_high: Union[None, int, float, np.ndarray]):
"""
:param input_space_low: the low values of the desired action space
:param input_space_high: the high values of the desired action space
"""
self.input_space_low = input_space_low
self.input_space_high = input_space_high
self.rescale = None

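The linear mapping itself is the usual rescale-and-shift formula; a sketch under the example ranges given above (not the Coach implementation):

```python
def linear_map(action, in_low, in_high, out_low, out_high):
    # rescale from the agent's range [in_low, in_high] to the environment's range [out_low, out_high]
    return out_low + (action - in_low) * (out_high - out_low) / (in_high - in_low)

linear_map(-1.0, in_low=-1.0, in_high=1.0, out_low=0.0, out_high=1.0)  # -> 0.0
linear_map(0.0, in_low=-1.0, in_high=1.0, out_low=0.0, out_high=1.0)   # -> 0.5
```
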
@@ -23,11 +23,17 @@ from rl_coach.spaces import DiscreteActionSpace, ActionSpace

class PartialDiscreteActionSpaceMap(ActionFilter):
"""
Maps the given actions from the output space to discrete actions in the action space.
For example, if there are 10 multiselect actions in the output space, the actions 0-9 will be mapped to those
multiselect actions.
Partial map of two countable action spaces. For example, consider an environment
with a MultiSelect action space (select multiple actions at the same time, such as jump and go right), with 8 actual
MultiSelect actions. If we want the agent to be able to select only 5 of those actions by their index (0-4), we can
map a discrete action space with 5 actions into the 5 selected MultiSelect actions. This will both allow the agent to
use regular discrete actions, and mask 3 of the actions from the agent.
"""
def __init__(self, target_actions: List[ActionType]=None, descriptions: List[str]=None):
"""
:param target_actions: A partial list of actions from the target space to map to.
:param descriptions: a list of descriptions of each of the actions
"""
self.target_actions = target_actions
self.descriptions = descriptions
super().__init__()

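A toy sketch of the MultiSelect example above, with hypothetical action encodings (illustrative only):

```python
# 8 MultiSelect actions exist in the environment; expose only 5 of them as discrete actions 0-4
target_actions = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 0], [1, 0, 1]]

def map_discrete_action(discrete_action: int):
    # the agent outputs an index; the filter forwards the corresponding target action
    return target_actions[discrete_action]

map_discrete_action(3)  # -> [1, 1, 0]
```
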
@@ -0,0 +1,25 @@
from .observation_clipping_filter import ObservationClippingFilter
from .observation_crop_filter import ObservationCropFilter
from .observation_move_axis_filter import ObservationMoveAxisFilter
from .observation_normalization_filter import ObservationNormalizationFilter
from .observation_reduction_by_sub_parts_name_filter import ObservationReductionBySubPartsNameFilter
from .observation_rescale_size_by_factor_filter import ObservationRescaleSizeByFactorFilter
from .observation_rescale_to_size_filter import ObservationRescaleToSizeFilter
from .observation_rgb_to_y_filter import ObservationRGBToYFilter
from .observation_squeeze_filter import ObservationSqueezeFilter
from .observation_stacking_filter import ObservationStackingFilter
from .observation_to_uint8_filter import ObservationToUInt8Filter

__all__ = [
'ObservationClippingFilter',
'ObservationCropFilter',
'ObservationMoveAxisFilter',
'ObservationNormalizationFilter',
'ObservationReductionBySubPartsNameFilter',
'ObservationRescaleSizeByFactorFilter',
'ObservationRescaleToSizeFilter',
'ObservationRGBToYFilter',
'ObservationSqueezeFilter',
'ObservationStackingFilter',
'ObservationToUInt8Filter'
]
@@ -24,7 +24,10 @@ from rl_coach.spaces import ObservationSpace

class ObservationClippingFilter(ObservationFilter):
"""
Clip the observation values using the given ranges
Clips the observation values to a given range of values.
For example, if the observation consists of measurements in an arbitrary range,
and we want to control the minimum and maximum values of these observations,
we can define a range and clip the values of the measurements.
"""
def __init__(self, clipping_low: float=-np.inf, clipping_high: float=np.inf):
"""

@@ -24,7 +24,9 @@ from rl_coach.spaces import ObservationSpace

class ObservationCropFilter(ObservationFilter):
"""
Crops the current state observation to a given shape
Crops the size of the observation to a given crop window. For example, in Atari, the
observations are images with a shape of 210x160. Usually, we will want to crop the size of the observation to a
square of 160x160 before rescaling them.
"""
def __init__(self, crop_low: np.ndarray=None, crop_high: np.ndarray=None):
"""

@@ -23,9 +23,14 @@ from rl_coach.spaces import ObservationSpace, PlanarMapsObservationSpace

class ObservationMoveAxisFilter(ObservationFilter):
"""
Move an axis of the observation to a different place.
Reorders the axes of the observation. This can be useful when the observation is an
image, and we want to move the channel axis to be the last axis instead of the first axis.
"""
def __init__(self, axis_origin: int = None, axis_target: int=None):
"""
:param axis_origin: The axis to move
:param axis_target: Where to move the selected axis to
"""
super().__init__()
self.axis_origin = axis_origin
self.axis_target = axis_target

@@ -25,8 +25,9 @@ from rl_coach.spaces import ObservationSpace

class ObservationNormalizationFilter(ObservationFilter):
"""
Normalize the observation with a running standard deviation and mean of the observations seen so far
If there is more than a single worker, the statistics of the observations are shared between all the workers
Normalizes the observation values with a running mean and standard deviation of
all the observations seen so far. The normalization is performed element-wise. Additionally, when working with
multiple workers, the statistics used for the normalization operation are accumulated over all the workers.
"""
def __init__(self, clip_min: float=-5.0, clip_max: float=5.0, name='observation_stats'):
"""

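Conceptually, the filter maintains running statistics and normalizes element-wise; a single-worker sketch (illustrative, not the shared-statistics implementation used by Coach):

```python
import numpy as np

class RunningNormalizer:
    def __init__(self, shape, clip_min=-5.0, clip_max=5.0):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = 1e-4
        self.clip_min, self.clip_max = clip_min, clip_max

    def filter(self, observation):
        # update the running mean/variance, then normalize and clip the observation
        self.count += 1
        delta = observation - self.mean
        self.mean += delta / self.count
        self.var += (delta * (observation - self.mean) - self.var) / self.count
        return np.clip((observation - self.mean) / (np.sqrt(self.var) + 1e-8),
                       self.clip_min, self.clip_max)

normalizer = RunningNormalizer(shape=(3,))
normalizer.filter(np.array([1.0, 2.0, 3.0]))
```
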
@@ -26,9 +26,11 @@ from rl_coach.spaces import ObservationSpace, VectorObservationSpace

class ObservationReductionBySubPartsNameFilter(ObservationFilter):
"""
Choose sub parts of the observation to remove or keep using their name.
This is useful when the environment has a measurements vector as observation which includes several different
Allows keeping only parts of the observation, by specifying their
name. This is useful when the environment has a measurements vector as observation which includes several different
measurements, but you want the agent to only see some of the measurements and not all.
For example, the CARLA environment extracts multiple measurements that can be used by the agent, such as
speed and location. If we want to only use the speed, it can be done using this filter.
This will currently work only for VectorObservationSpace observations
"""
class ReductionMethod(Enum):

@@ -35,7 +35,8 @@ class RescaleInterpolationType(Enum):

class ObservationRescaleSizeByFactorFilter(ObservationFilter):
"""
Scales the current state observation size by a given factor
Rescales an image observation by some factor. For example, the image size
can be reduced by a factor of 2.
Warning: this requires the input observation to be of type uint8 due to scipy requirements!
"""
def __init__(self, rescale_factor: float, rescaling_interpolation_type: RescaleInterpolationType):

@@ -37,7 +37,8 @@ class RescaleInterpolationType(Enum):

class ObservationRescaleToSizeFilter(ObservationFilter):
"""
Scales the current state observation to a given shape
Rescales an image observation to a given size. The target size does not
necessarily keep the aspect ratio of the original observation.
Warning: this requires the input observation to be of type uint8 due to scipy requirements!
"""
def __init__(self, output_observation_space: PlanarMapsObservationSpace,

@@ -21,7 +21,9 @@ from rl_coach.spaces import ObservationSpace

class ObservationRGBToYFilter(ObservationFilter):
"""
Converts the observation in the current state to gray scale (Y channel).
Converts a color image observation specified using the RGB encoding into a grayscale
image observation, by keeping only the luminance (Y) channel of the YUV encoding. This can be useful if the colors
in the original image are not relevant for solving the task at hand.
The channels axis is assumed to be the last axis
"""
def __init__(self):

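The conversion keeps only the luma component; a sketch using the standard BT.601 weights (illustrative, assuming channels-last images):

```python
import numpy as np

def rgb_to_y(observation: np.ndarray) -> np.ndarray:
    # weighted sum of the R, G and B channels gives the luminance (Y) channel
    return np.dot(observation[..., :3], [0.299, 0.587, 0.114])

rgb_to_y(np.random.randint(0, 256, (84, 84, 3)).astype(np.float32)).shape  # -> (84, 84)
```
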
@@ -23,9 +23,12 @@ from rl_coach.spaces import ObservationSpace

class ObservationSqueezeFilter(ObservationFilter):
"""
Squeezes the observation to eliminate redundant axes.
Removes redundant axes from the observation, which are axes with a dimension of 1.
"""
def __init__(self, axis: int = None):
"""
:param axis: Specifies which axis to remove. If set to None, all the axes of size 1 will be removed.
"""
super().__init__()
self.axis = axis

@@ -43,7 +43,10 @@ class LazyStack(object):

class ObservationStackingFilter(ObservationFilter):
"""
Stack the current state observation on top of several previous observations.
Stacks several observations on top of each other. For image observations this will
create a 3D blob. The stacking is done in a lazy manner in order to reduce memory consumption. To achieve this,
a LazyStack object is used in order to wrap the observations in the stack. For this reason, the
ObservationStackingFilter **must** be the last filter in the inputs filters stack.
This filter is stateful since it stores the previous step result and depends on it.
The filter adds an additional dimension to the output observation.

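Ignoring the LazyStack optimization, the behaviour can be pictured with a plain deque (an illustrative sketch, not the Coach implementation):

```python
import numpy as np
from collections import deque

stack_size = 4
stack = deque(maxlen=stack_size)

def filter_observation(observation: np.ndarray) -> np.ndarray:
    if not stack:                        # on the first step, fill the stack with copies
        stack.extend([observation] * stack_size)
    else:
        stack.append(observation)        # the oldest observation falls off automatically
    return np.stack(stack, axis=-1)      # the stacking adds a new last dimension

filter_observation(np.zeros((84, 84))).shape  # -> (84, 84, 4)
```
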
@@ -23,10 +23,15 @@ from rl_coach.spaces import ObservationSpace

class ObservationToUInt8Filter(ObservationFilter):
"""
Converts the observation values to be uint8 values between 0 and 255.
It first scales the observation values to fit in the range and then converts them to uint8.
Converts a floating point observation into an unsigned int 8 bit observation. This is
mostly useful for reducing memory consumption and is usually used for image observations. The filter will first
spread the observation values over the range 0-255 and then discretize them into integer values.
"""
def __init__(self, input_low: float, input_high: float):
"""
:param input_low: The lowest value currently present in the observation
:param input_high: The highest value currently present in the observation
"""
super().__init__()
self.input_low = input_low
self.input_high = input_high

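The rescale-then-cast step looks roughly like this (illustrative only):

```python
import numpy as np

def to_uint8(observation: np.ndarray, input_low: float, input_high: float) -> np.ndarray:
    # spread the values over 0-255, then cast down to unsigned 8-bit integers
    rescaled = (observation - input_low) / (input_high - input_low) * 255.0
    return rescaled.astype(np.uint8)

to_uint8(np.array([0.0, 0.5, 1.0]), input_low=0.0, input_high=1.0)  # -> [0, 127, 255]
```
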
@@ -0,0 +1,8 @@
from .reward_rescale_filter import RewardRescaleFilter
from .reward_clipping_filter import RewardClippingFilter
from .reward_normalization_filter import RewardNormalizationFilter
__all__ = [
'RewardRescaleFilter',
'RewardClippingFilter',
'RewardNormalizationFilter'
]
@@ -23,7 +23,8 @@ from rl_coach.spaces import RewardSpace

class RewardClippingFilter(RewardFilter):
"""
Clips the reward to some range
Clips the reward values into a given range. For example, in DQN, the Atari rewards are
clipped into the range -1 and 1 in order to control the scale of the returns.
"""
def __init__(self, clipping_low: float=-np.inf, clipping_high: float=np.inf):
"""

@@ -25,8 +25,9 @@ from rl_coach.spaces import RewardSpace

class RewardNormalizationFilter(RewardFilter):
"""
Normalize the reward with a running standard deviation and mean of the rewards seen so far
If there is more than a single worker, the statistics of the rewards are shared between all the workers
Normalizes the reward values with a running mean and standard deviation of
all the rewards seen so far. When working with multiple workers, the statistics used for the normalization operation
are accumulated over all the workers.
"""
def __init__(self, clip_min: float=-5.0, clip_max: float=5.0):
"""

@@ -21,7 +21,8 @@ from rl_coach.spaces import RewardSpace

class RewardRescaleFilter(RewardFilter):
"""
Rescales the reward by multiplying with some factor
Rescales the reward by a given factor. Rescaling the rewards of the environment has been
observed to have a large effect (negative or positive) on the behavior of the learning process.
"""
def __init__(self, rescale_factor: float):
"""

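Taken together, the reward filters above boil down to simple element-wise operations; a sketch of rescaling followed by clipping (illustrative only; the factor and range are hypothetical):

```python
import numpy as np

def filter_reward(reward: float, rescale_factor: float = 0.1,
                  clip_low: float = -1.0, clip_high: float = 1.0) -> float:
    # rescale the raw reward first, then clip it into the allowed range
    return float(np.clip(reward * rescale_factor, clip_low, clip_high))

filter_reward(25.0)  # -> 1.0
```
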
@@ -504,6 +504,8 @@ class GraphManager(object):
:return: None
"""

self.verify_graph_was_created()

# initialize the network parameters from the global network
self.sync()

@@ -0,0 +1,14 @@
from .episodic_experience_replay import EpisodicExperienceReplayParameters, EpisodicExperienceReplay
from .episodic_hindsight_experience_replay import EpisodicHindsightExperienceReplayParameters, EpisodicHindsightExperienceReplay
from .episodic_hrl_hindsight_experience_replay import EpisodicHRLHindsightExperienceReplayParameters, EpisodicHRLHindsightExperienceReplay
from .single_episode_buffer import SingleEpisodeBufferParameters, SingleEpisodeBuffer
__all__ = [
'EpisodicExperienceReplayParameters',
'EpisodicHindsightExperienceReplayParameters',
'EpisodicHRLHindsightExperienceReplayParameters',
'SingleEpisodeBufferParameters',
'EpisodicExperienceReplay',
'EpisodicHindsightExperienceReplay',
'EpisodicHRLHindsightExperienceReplay',
'SingleEpisodeBuffer'
]

@@ -0,0 +1,13 @@
from .balanced_experience_replay import BalancedExperienceReplayParameters, BalancedExperienceReplay
from .differentiable_neural_dictionary import QDND
from .experience_replay import ExperienceReplayParameters, ExperienceReplay
from .prioritized_experience_replay import PrioritizedExperienceReplayParameters, PrioritizedExperienceReplay
from .transition_collection import TransitionCollection
__all__ = [
'BalancedExperienceReplayParameters',
'BalancedExperienceReplay',
'QDND',
'ExperienceReplay',
'PrioritizedExperienceReplay',
'TransitionCollection'
]

@@ -120,6 +120,7 @@ class Space(object):
def val_matches_space_definition(self, val: Union[int, float, np.ndarray]) -> bool:
"""
Checks if the given value matches the space definition in terms of shape and values

:param val: a value to check
:return: True / False depending on if the val matches the space definition
"""
@@ -136,6 +137,7 @@ class Space(object):
def is_point_in_space_shape(self, point: np.ndarray) -> bool:
"""
Checks if a given multidimensional point is within the bounds of the shape of the space

:param point: a multidimensional point
:return: True if the point is within the shape of the space. False otherwise
"""
@@ -146,6 +148,12 @@ class Space(object):
return True

def sample(self) -> np.ndarray:
"""
Sample the defined space, either uniformly, if space bounds are defined, or normally distributed if no
bounds are defined

:return: A numpy array sampled from the space
"""
# if there are infinite bounds, we sample using gaussian noise with mean 0 and std 1
if np.any(self.low == -np.inf) or np.any(self.high == np.inf):
return np.random.normal(0, 1, self.shape)
@@ -173,6 +181,10 @@ class ObservationSpace(Space):


class VectorObservationSpace(ObservationSpace):
"""
An observation space which is defined as a vector of elements. This can be particularly useful for environments
which return measurements, such as in robotic environments.
"""
def __init__(self, shape: int, low: Union[None, int, float, np.ndarray]=-np.inf,
high: Union[None, int, float, np.ndarray]=np.inf, measurements_names: List[str]=None):
if measurements_names is None:
@@ -186,6 +198,10 @@ class VectorObservationSpace(ObservationSpace):


class PlanarMapsObservationSpace(ObservationSpace):
"""
An observation space which defines a stack of 2D observations. For example, an environment which returns
a stack of segmentation maps like in Starcraft.
"""
def __init__(self, shape: Union[np.ndarray], low: int, high: int, channels_axis: int=-1):
super().__init__(shape, low, high)
self.channels_axis = channels_axis
@@ -200,6 +216,10 @@ class PlanarMapsObservationSpace(ObservationSpace):


class ImageObservationSpace(PlanarMapsObservationSpace):
"""
An observation space which is a private case of the PlanarMapsObservationSpace, where the stack of 2D observations
represents an RGB image, or a grayscale image.
"""
def __init__(self, shape: Union[np.ndarray], high: int, channels_axis: int=-1):
# TODO: consider allowing arbitrary low values for images
super().__init__(shape, 0, high, channels_axis)
@@ -245,6 +265,7 @@ class ActionSpace(Space):
def sample_with_info(self) -> ActionInfo:
"""
Get a random action with additional "fake" info

:return: An action info instance
"""
return ActionInfo(self.sample())
@@ -252,6 +273,7 @@ class ActionSpace(Space):
def clip_action_to_space(self, action: ActionType) -> ActionType:
"""
Given an action, clip its values to fit to the action space ranges

:param action: a given action
:return: the clipped action
"""
@@ -460,6 +482,7 @@ class GoalToRewardConversion(object):
def convert_distance_to_reward(self, distance: Union[float, np.ndarray]) -> Tuple[float, bool]:
"""
Given a distance from the goal, return a reward and a flag representing if the goal was reached

:param distance: the distance from the goal
:return: the reward and a flag representing if the goal was reached
"""
@@ -543,6 +566,7 @@ class GoalsSpace(VectorObservationSpace, ActionSpace):
def goal_from_state(self, state: Dict):
"""
Given a state, extract an observation according to the goal_name

:param state: a dictionary of observations
:return: the observation corresponding to the goal_name
"""
@@ -551,6 +575,7 @@ class GoalsSpace(VectorObservationSpace, ActionSpace):
def distance_from_goal(self, goal: np.ndarray, state: dict) -> float:
"""
Given a state, check its distance from the goal

:param goal: a numpy array representing the goal
:param state: a dict representing the state
:return: the distance from the goal
@@ -574,6 +599,7 @@ class GoalsSpace(VectorObservationSpace, ActionSpace):
def get_reward_for_goal_and_state(self, goal: np.ndarray, state: dict) -> Tuple[float, bool]:
"""
Given a state, check if the goal was reached and return a reward accordingly

:param goal: a numpy array representing the goal
:param state: a dict representing the state
:return: the reward for the current goal and state pair and a boolean representing if the goal was reached
