mirror of https://github.com/gryf/coach.git

update of api docstrings across coach and tutorials [WIP] (#91)

* updating the documentation website
* adding the built docs
* update of api docstrings across coach and tutorials 0-2
* added some missing api documentation
* New Sphinx based documentation
This commit is contained in:
Itai Caspi
2018-11-15 15:00:13 +02:00
committed by Gal Novik
parent 524f8436a2
commit 6d40ad1650
517 changed files with 71034 additions and 12834 deletions

View File

@@ -36,25 +36,25 @@ from rl_coach.utils import last_sample
class ActorCriticAlgorithmParameters(AlgorithmParameters):
"""
:param policy_gradient_rescaler: (PolicyGradientRescaler)
The value that will be used to rescale the policy gradient
:param apply_gradients_every_x_episodes: (int)
The number of episodes to wait before applying the accumulated gradients to the network.
The training iterations only accumulate gradients without actually applying them.
:param beta_entropy: (float)
The weight that will be given to the entropy regularization which is used in order to improve exploration.
:param num_steps_between_gradient_updates: (int)
Every num_steps_between_gradient_updates transitions will be considered as a single batch and used for
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.
:param gae_lambda: (float)
If the policy gradient rescaler was defined as PolicyGradientRescaler.GAE, the generalized advantage estimation
scheme will be used, in which case the lambda value controls the decay for the different n-step lengths.
:param estimate_state_value_using_gae: (bool)
If set to True, the state value targets for the V head will be estimated using the GAE scheme.
"""
def __init__(self):
super().__init__()
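
A minimal preset-style sketch of overriding the fields documented above; the import paths are assumptions based on coach's usual module layout, and the attribute names come from the docstring.

# Hedged sketch, not part of this commit: overriding the documented fields in a preset.
# Import paths are assumed; the attribute names are taken from the docstring above.
from rl_coach.agents.actor_critic_agent import ActorCriticAlgorithmParameters
from rl_coach.agents.policy_optimization_agent import PolicyGradientRescaler

algorithm = ActorCriticAlgorithmParameters()
algorithm.policy_gradient_rescaler = PolicyGradientRescaler.GAE   # rescale gradients with GAE
algorithm.gae_lambda = 0.96                                       # decay over the n-step lengths
algorithm.beta_entropy = 0.01                                     # entropy regularization weight
algorithm.num_steps_between_gradient_updates = 20                 # batch / bootstrapping length
algorithm.apply_gradients_every_x_episodes = 5                    # accumulate gradients before applying
algorithm.estimate_state_value_using_gae = True                   # GAE targets for the V head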

View File

@@ -39,7 +39,7 @@ from rl_coach.memories.backend.memory_impl import get_memory_backend
class Agent(AgentInterface):
def __init__(self, agent_parameters: AgentParameters, parent: Union['LevelManager', 'CompositeAgent']=None):
"""
:param agent_parameters: A Preset class instance with all the running paramaters
:param agent_parameters: An AgentParameters class instance with all the agent parameters
"""
super().__init__()
self.ap = agent_parameters
@@ -175,18 +175,20 @@ class Agent(AgentInterface):
np.random.seed()
@property
def parent(self):
def parent(self) -> 'LevelManager':
"""
Get the parent class of the agent
:return: the parent of the agent
"""
return self._parent
@parent.setter
def parent(self, val):
def parent(self, val) -> None:
"""
Change the parent class of the agent.
Additionally, updates the full name of the agent
:param val: the new parent
:return: None
"""
@@ -196,7 +198,12 @@ class Agent(AgentInterface):
raise ValueError("The parent of an agent must have a name")
self.full_name_id = self.ap.full_name_id = "{}/{}".format(self._parent.name, self.name)
def setup_logger(self):
def setup_logger(self) -> None:
"""
Setup the logger for the agent
:return: None
"""
# dump documentation
logger_prefix = "{graph_name}.{level_name}.{agent_full_id}".\
format(graph_name=self.parent_level_manager.parent_graph_manager.name,
@@ -212,6 +219,7 @@ class Agent(AgentInterface):
def set_session(self, sess) -> None:
"""
Set the deep learning framework session for all the agents in the composite agent
:return: None
"""
self.input_filter.set_session(sess)
@@ -223,6 +231,7 @@ class Agent(AgentInterface):
dump_one_value_per_step: bool=False) -> Signal:
"""
Register a signal such that its statistics will be dumped and be viewable through dashboard
:param signal_name: the name of the signal as it will appear in dashboard
:param dump_one_value_per_episode: should the signal value be written for each episode?
:param dump_one_value_per_step: should the signal value be written for each step?
@@ -239,6 +248,7 @@ class Agent(AgentInterface):
"""
Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
dependent on those values, by calling init_environment_dependent_modules
:param spaces: the environment spaces definition
:return: None
"""
@@ -274,6 +284,7 @@ class Agent(AgentInterface):
Create all the networks of the agent.
The network creation will be done after setting the environment parameters for the agent, since they are needed
for creating the network.
:return: A dictionary containing all the networks
"""
networks = {}
@@ -295,6 +306,7 @@ class Agent(AgentInterface):
"""
Initialize any modules that depend on knowing information about the environment such as the action space or
the observation space
:return: None
"""
# initialize exploration policy
@@ -314,13 +326,19 @@ class Agent(AgentInterface):
@property
def phase(self) -> RunPhase:
"""
The current running phase of the agent
:return: RunPhase
"""
return self._phase
@phase.setter
def phase(self, val: RunPhase) -> None:
"""
Change the phase of the run for the agent and all the sub components
:param phase: the new run phase (TRAIN, TEST, etc.)
:param val: the new run phase (TRAIN, TEST, etc.)
:return: None
"""
self.reset_evaluation_state(val)
@@ -328,6 +346,14 @@ class Agent(AgentInterface):
self.exploration_policy.change_phase(val)
def reset_evaluation_state(self, val: RunPhase) -> None:
"""
Perform accumulators initialization when entering an evaluation phase, and signal dumping when exiting an
evaluation phase. Entering or exiting the evaluation phase is determined according to the new phase given
by val, and by the current phase set in self.phase.
:param val: The new phase to change to
:return: None
"""
starting_evaluation = (val == RunPhase.TEST)
ending_evaluation = (self.phase == RunPhase.TEST)
@@ -363,6 +389,7 @@ class Agent(AgentInterface):
This function is a wrapper to allow having the same calls for shared or unshared memories.
It should be used instead of calling the memory directly in order to allow different algorithms to work
both with a shared and a local memory.
:param func: the name of the memory function to call
:param args: the arguments to supply to the function
:return: the return value of the function
@@ -375,7 +402,12 @@ class Agent(AgentInterface):
result = getattr(self.memory, func)(*args)
return result
def log_to_screen(self):
def log_to_screen(self) -> None:
"""
Write an episode summary line to the terminal
:return: None
"""
# log to screen
log = OrderedDict()
log["Name"] = self.full_name_id
@@ -388,9 +420,10 @@ class Agent(AgentInterface):
log["Training iteration"] = self.training_iteration
screen.log_dict(log, prefix=self.phase.value)
def update_step_in_episode_log(self):
def update_step_in_episode_log(self) -> None:
"""
Writes logging messages to screen and updates the log file with all the signal values.
Updates the in-episode log file with all the signal values from the most recent step.
:return: None
"""
# log all the signals to file
@@ -411,9 +444,12 @@ class Agent(AgentInterface):
# dump
self.agent_episode_logger.dump_output_csv()
def update_log(self):
def update_log(self) -> None:
"""
Writes logging messages to screen and updates the log file with all the signal values.
Updates the episodic log file with all the signal values from the most recent episode.
Additional signals for logging can be added by creating a new signal using self.register_signal,
and then updating it with some internal agent values.
:return: None
"""
# log all the signals to file
@@ -438,7 +474,6 @@ class Agent(AgentInterface):
self.agent_logger.create_signal_value('Shaped Evaluation Reward', np.nan, overwrite=False)
self.agent_logger.create_signal_value('Success Rate', np.nan, overwrite=False)
for signal in self.episode_signals:
self.agent_logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
self.agent_logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
@@ -452,7 +487,10 @@ class Agent(AgentInterface):
def handle_episode_ended(self) -> None:
"""
End an episode
Make any changes needed when each episode is ended.
This includes incrementing counters, updating full episode dependent values, updating logs, etc.
This function is called right after each episode is ended.
:return: None
"""
self.current_episode_buffer.is_complete = True
@@ -486,9 +524,10 @@ class Agent(AgentInterface):
if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
self.log_to_screen()
def reset_internal_state(self):
def reset_internal_state(self) -> None:
"""
Reset all the episodic parameters
Reset all the episodic parameters. This function is called right before each episode starts.
:return: None
"""
for signal in self.episode_signals:
@@ -516,6 +555,7 @@ class Agent(AgentInterface):
def learn_from_batch(self, batch) -> Tuple[float, List, List]:
"""
Given a batch of transitions, calculates their target values and updates the network.
:param batch: A list of transitions
:return: The total loss of the training, the loss per head and the unclipped gradients
"""
@@ -524,6 +564,7 @@ class Agent(AgentInterface):
def _should_update_online_weights_to_target(self):
"""
Determine if online weights should be copied to the target.
:return: boolean: True if the online weights should be copied to the target.
"""
@@ -542,9 +583,10 @@ class Agent(AgentInterface):
"EnvironmentSteps or TrainingSteps. Instead it is {}".format(step_method.__class__))
return should_update
def _should_train(self, wait_for_full_episode=False):
def _should_train(self, wait_for_full_episode=False) -> bool:
"""
Determine if we should start a training phase according to the number of steps passed since the last training
:return: boolean: True if we should start a training phase
"""
@@ -580,11 +622,12 @@ class Agent(AgentInterface):
return should_update
def train(self):
def train(self) -> float:
"""
Check if a training phase should be done as configured by num_consecutive_playing_steps.
If it should, then do several training steps as configured by num_consecutive_training_steps.
A single training iteration: Sample a batch, train on it and update target networks.
:return: The total training loss during the training iterations.
"""
loss = 0
@@ -641,14 +684,12 @@ class Agent(AgentInterface):
# run additional commands after the training is done
self.post_training_commands()
return loss
def choose_action(self, curr_state):
"""
Choose an action to act with in the current episode being played. Different behavior might be exhibited when
training or testing.
:param curr_state: the current state to act upon.
:return: chosen action, some action value describing the action (q-value, probability, etc)
@@ -656,10 +697,16 @@ class Agent(AgentInterface):
pass
def prepare_batch_for_inference(self, states: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
network_name: str):
network_name: str) -> Dict[str, np.array]:
"""
Convert curr_state into the input tensors tensorflow is expecting, i.e. if we have several input states, stack all
observations together, measurements together, etc.
:param states: A list of environment states, where each one is a dict mapping from an observation name to its
corresponding observation
:param network_name: The agent network name to prepare the batch for. This is needed in order to extract only
the observation relevant for the network from the states.
:return: A dictionary containing a list of values from all the given states for each of the observations
"""
# convert to batch so we can run it through the network
states = force_list(states)
@@ -676,7 +723,8 @@ class Agent(AgentInterface):
def act(self) -> ActionInfo:
"""
Given the agents current knowledge, decide on the next action to apply to the environment
:return: an action and a dictionary containing any additional info from the action decision process
:return: An ActionInfo object, which contains the action and any additional info from the action decision process
"""
if self.phase == RunPhase.TRAIN and self.ap.algorithm.num_consecutive_playing_steps.num_steps == 0:
# This agent never plays while training (e.g. behavioral cloning)
@@ -705,13 +753,20 @@ class Agent(AgentInterface):
return filtered_action_info
def run_pre_network_filter_for_inference(self, state: StateType):
def run_pre_network_filter_for_inference(self, state: StateType) -> StateType:
"""
Run filters which were defined to be applied right before using the state for inference.
:param state: The state to run the filters on
:return: The filtered state
"""
dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False)
return self.pre_network_filter.filter(dummy_env_response)[0].next_state
def get_state_embedding(self, state: dict) -> np.ndarray:
"""
Given a state, get the corresponding state embedding from the main network
:param state: a state dict
:return: a numpy embedding vector
"""
@@ -726,6 +781,7 @@ class Agent(AgentInterface):
"""
Allows agents to update the transition just before adding it to the replay buffer.
Can be useful for agents that want to tweak the reward, termination signal, etc.
:param transition: the transition to update
:return: the updated transition
"""
@@ -736,8 +792,10 @@ class Agent(AgentInterface):
Given a response from the environment, distill the observation from it and store it for later use.
The response should be a dictionary containing the performed action, the new observation and measurements,
the reward, a game over flag and any additional information necessary.
:param env_response: result of call from environment.step(action)
:return: a boolean value which determines if the agent has decided to terminate the episode after seeing the
given observation
"""
# filter the env_response
@@ -801,7 +859,12 @@ class Agent(AgentInterface):
return transition.game_over
def post_training_commands(self):
def post_training_commands(self) -> None:
"""
A function which allows adding any functionality that is required to run right after the training phase ends.
:return: None
"""
pass
def get_predictions(self, states: List[Dict[str, np.ndarray]], prediction_type: PredictionType):
@@ -809,9 +872,10 @@ class Agent(AgentInterface):
Get a prediction from the agent with regard to the requested prediction_type.
If the agent cannot predict this type of prediction_type, or if there is more than one possible way to do so,
raise a ValueError.
:param states: The states to get a prediction for
:param prediction_type: The type of prediction to get for the states. For example, the state-value prediction.
:return: the predicted values
"""
predictions = self.networks['main'].online_network.predict_with_prediction_type(
@@ -824,6 +888,15 @@ class Agent(AgentInterface):
return list(predictions.values())[0]
def set_incoming_directive(self, action: ActionType) -> None:
"""
Allows setting a directive for the agent to follow. This is useful in hierarchy structures, where the agent
has another master agent that is controlling it. In such cases, the master agent can define the goals for the
slave agent, define its observation, possible actions, etc. The directive type is defined by the agent
in-action-space.
:param action: The action that should be set as the directive
:return:
"""
if isinstance(self.in_action_space, GoalsSpace):
self.current_hrl_goal = action
elif isinstance(self.in_action_space, AttentionActionSpace):
@@ -834,6 +907,7 @@ class Agent(AgentInterface):
def save_checkpoint(self, checkpoint_id: int) -> None:
"""
Allows agents to store additional information when saving checkpoints.
:param checkpoint_id: the id of the checkpoint
:return: None
"""
@@ -842,6 +916,7 @@ class Agent(AgentInterface):
def sync(self) -> None:
"""
Sync the global network parameters to local networks
:return: None
"""
for network in self.networks.values():
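
As a usage illustration for the register_signal and handle_episode_ended docstrings above, a hedged sketch of a custom agent; the import path, the Signal.add_sample call and the current_episode_steps_counter attribute are assumptions, while the method names and arguments come from this diff.

# Hedged sketch, not part of this commit: using register_signal as documented above.
# The import path, Signal.add_sample and current_episode_steps_counter are assumptions.
from rl_coach.agents.agent import Agent


class MySignalLoggingAgent(Agent):
    def __init__(self, agent_parameters, parent=None):
        super().__init__(agent_parameters, parent)
        # the signal statistics will be dumped and become viewable through dashboard
        self.episode_length_signal = self.register_signal('Episode Length',
                                                          dump_one_value_per_episode=True,
                                                          dump_one_value_per_step=False)

    def handle_episode_ended(self) -> None:
        # record one value per episode, then let the base class update counters and logs
        self.episode_length_signal.add_sample(self.current_episode_steps_counter)
        super().handle_episode_ended()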

View File

@@ -32,7 +32,6 @@ from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayPar
class BCAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.collect_new_data = False
class BCNetworkParameters(NetworkParameters):

View File

@@ -33,6 +33,19 @@ class CategoricalDQNNetworkParameters(DQNNetworkParameters):
class CategoricalDQNAlgorithmParameters(DQNAlgorithmParameters):
"""
:param v_min: (float)
The minimum value that will be represented in the network output for predicting the Q value.
Corresponds to :math:`v_{min}` in the paper.
:param v_max: (float)
The maximum value that will be represented in the network output for predicting the Q value.
Corresponds to :math:`v_{max}` in the paper.
:param atoms: (int)
The number of atoms that will be used to discretize the range between v_min and v_max.
For the C51 algorithm described in the paper, the number of atoms is 51.
"""
def __init__(self):
super().__init__()
self.v_min = -10.0
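
To make the relation between v_min, v_max and atoms concrete, a small numpy sketch of the discrete support the docstring describes; v_max = 10.0 is assumed here for symmetry, since only v_min = -10.0 appears in this hunk.

# Hedged sketch: the discrete support implied by v_min, v_max and atoms (v_max assumed).
import numpy as np

v_min, v_max, atoms = -10.0, 10.0, 51
z = np.linspace(v_min, v_max, atoms)        # atom locations of the value distribution
delta_z = (v_max - v_min) / (atoms - 1)     # spacing between neighboring atoms
p = np.full(atoms, 1.0 / atoms)             # a (uniform) predicted distribution over atoms
q_value = np.sum(z * p)                     # the Q value is the expectation over the support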

View File

@@ -26,9 +26,12 @@ from rl_coach.memories.non_episodic.balanced_experience_replay import BalancedEx
class CILAlgorithmParameters(AlgorithmParameters):
"""
:param state_key_with_the_class_index: (str)
The key of the state dictionary which corresponds to the value that will be used to control the class index.
"""
def __init__(self):
super().__init__()
self.collect_new_data = False
self.state_key_with_the_class_index = 'high_level_command'

View File

@@ -58,6 +58,47 @@ class ClippedPPONetworkParameters(NetworkParameters):
class ClippedPPOAlgorithmParameters(AlgorithmParameters):
"""
:param policy_gradient_rescaler: (PolicyGradientRescaler)
This represents how the critic will be used to update the actor. The critic value function is typically used
to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
advantage of the action, or the generalized advantage estimation (GAE) value.
:param gae_lambda: (float)
The :math:`\lambda` value is used within the GAE function in order to weight different bootstrap length
estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
n-step estimations.
:param clip_likelihood_ratio_using_epsilon: (float)
If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
implementations.
:param value_targets_mix_fraction: (float)
The targets for the value network are an exponentially weighted moving average which uses this mix fraction to
define how much of the new targets will be taken into account when calculating the loss.
This value should be set in the range (0, 1], where 1 means that only the new targets will be taken into account.
:param estimate_state_value_using_gae: (bool)
If set to True, the state value will be estimated using the GAE technique.
:param use_kl_regularization: (bool)
If set to True, the loss function will be regularized using the KL divergence between the current and new
policy, to bound the change of the policy during the network update.
:param beta_entropy: (float)
An entropy regularization term can be added to the loss function in order to control exploration. This term
is weighted using the :math:`\beta` value defined by beta_entropy.
:param optimization_epochs: (int)
For each training phase, the collected dataset will be used for multiple epochs, which are defined by the
optimization_epochs value.
:param clipping_decay_schedule: (Schedule)
Can be used to define a schedule over the clipping of the likelihood ratio.
"""
def __init__(self):
super().__init__()
self.num_episodes_in_experience_replay = 1000000
@@ -66,7 +107,6 @@ class ClippedPPOAlgorithmParameters(AlgorithmParameters):
self.use_kl_regularization = False
self.clip_likelihood_ratio_using_epsilon = 0.2
self.estimate_state_value_using_gae = True
self.step_until_collecting_full_episodes = True
self.beta_entropy = 0.01 # should be 0 for mujoco
self.num_consecutive_playing_steps = EnvironmentSteps(2048)
self.optimization_epochs = 10
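
A hedged numpy sketch of the likelihood-ratio clipping that clip_likelihood_ratio_using_epsilon controls, as described above; this is the standard clipped surrogate objective, not coach's internal head implementation.

# Hedged sketch: the clipped surrogate objective controlled by clip_likelihood_ratio_using_epsilon.
import numpy as np

def clipped_surrogate_objective(new_log_probs, old_log_probs, advantages, epsilon=0.2):
    ratio = np.exp(new_log_probs - old_log_probs)                  # likelihood ratio
    clipped_ratio = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon)   # clip to [1-eps, 1+eps]
    # the objective takes the element-wise minimum of the two surrogate terms
    return np.mean(np.minimum(ratio * advantages, clipped_ratio * advantages))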

View File

@@ -65,6 +65,33 @@ class DDPGActorNetworkParameters(NetworkParameters):
class DDPGAlgorithmParameters(AlgorithmParameters):
"""
:param num_steps_between_copying_online_weights_to_target: (StepMethod)
The number of steps between copying the online network weights to the target network weights.
:param rate_for_copying_weights_to_target: (float)
When copying the online network weights to the target network weights, a soft update will be used, which
weights the new online network weights by rate_for_copying_weights_to_target
:param num_consecutive_playing_steps: (StepMethod)
The number of consecutive steps to act between every two training iterations
:param use_target_network_for_evaluation: (bool)
If set to True, the target network will be used for predicting the actions when choosing actions to act.
Since the target network weights change more slowly, the predicted actions will be more consistent.
:param action_penalty: (float)
The amount by which to penalize the network on high action feature (pre-activation) values.
This can prevent the action features from saturating the TanH activation function, and therefore prevent the
gradients from becoming very low.
:param clip_critic_targets: (Tuple[float, float] or None)
The range to clip the critic target to in order to prevent overestimation of the action values.
:param use_non_zero_discount_for_terminal_states: (bool)
If set to True, the discount factor will be used for terminal states to bootstrap the next predicted state
values. If set to False, the terminal states reward will be taken as the target return for the network.
"""
def __init__(self):
super().__init__()
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)
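
A hedged sketch of the soft target update that rate_for_copying_weights_to_target weighs, per the docstring above; plain Python, not coach's network code.

# Hedged sketch: the soft target update weighted by rate_for_copying_weights_to_target.
def soft_update(target_weights, online_weights, rate):
    # new target = rate * online + (1 - rate) * previous target, applied per weight tensor
    return [rate * w_online + (1.0 - rate) * w_target
            for w_online, w_target in zip(online_weights, target_weights)]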

View File

@@ -81,6 +81,35 @@ class DFPMemoryParameters(EpisodicExperienceReplayParameters):
class DFPAlgorithmParameters(AlgorithmParameters):
"""
:param num_predicted_steps_ahead: (int)
Number of future steps to predict measurements for. The future steps won't be sequential, but rather the step
size doubles each time. For example, if num_predicted_steps_ahead = 3, then the steps will be: t+1, t+2, t+4
:param goal_vector: (List[float])
The goal vector will weight each of the measurements to form an optimization goal. The vector should have
the same length as the number of measurements, and it will be vector multiplied by the measurements.
Positive values correspond to trying to maximize the particular measurement, and negative values
correspond to trying to minimize the particular measurement.
:param future_measurements_weights: (List[float])
The future_measurements_weights weight the contribution of each of the predicted timesteps to the optimization
goal. For example, if there are 6 steps predicted ahead, and a future_measurements_weights vector with 3 values,
then only the 3 last timesteps will be taken into account, according to the weights in the
future_measurements_weights vector.
:param use_accumulated_reward_as_measurement: (bool)
If set to True, the accumulated reward from the beginning of the episode will be added as a measurement to
the measurements vector in the state. This can be useful in environments where the given measurements don't
include enough information for the particular goal the agent should achieve.
:param handling_targets_after_episode_end: (HandlingTargetsAfterEpisodeEnd)
Dictates how to handle measurements that are outside the episode length.
:param scale_measurements_targets: (Dict[str, float])
Allows rescaling the values of each of the measurements available. This can be useful when the measurements
have different scales and you want to normalize them to the same scale.
"""
def __init__(self):
super().__init__()
self.num_predicted_steps_ahead = 6
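
A hedged numpy sketch of how goal_vector and future_measurements_weights combine, following the example in the docstring (6 predicted steps, a 3-element weight vector); the shapes and values here are illustrative assumptions.

# Hedged sketch: weighting predicted measurements by goal_vector and future_measurements_weights.
import numpy as np

goal_vector = np.array([1.0, -0.5])                        # maximize m0, mildly minimize m1
future_measurements_weights = np.array([0.5, 0.5, 1.0])    # only the last 3 predicted steps count

predicted = np.random.randn(6, 2)                          # [num_predicted_steps_ahead, num_measurements]
per_step_objective = predicted @ goal_vector               # weight each measurement by the goal
objective = per_step_objective[-3:] @ future_measurements_weights   # weight the last timesteps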

View File

@@ -24,6 +24,13 @@ from rl_coach.spaces import SpacesDefinition
class HACDDPGAlgorithmParameters(DDPGAlgorithmParameters):
"""
:param time_limit: (int)
The number of steps the agent is allowed to act for while trying to achieve its goal
:param sub_goal_testing_rate: (float)
The percent of episodes that will be used for testing the sub goals generated by the upper level agents.
"""
def __init__(self):
super().__init__()
self.time_limit = 40
@@ -91,7 +98,7 @@ class HACDDPGAgent(DDPGAgent):
sub_goal_is_missed = not sub_goal_reached
if sub_goal_is_missed:
transition.reward = -self.ap.algorithm.time_limit
return transition
def set_environment_parameters(self, spaces: SpacesDefinition):

View File

@@ -24,6 +24,11 @@ from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperi
class MixedMonteCarloAlgorithmParameters(DQNAlgorithmParameters):
"""
:param monte_carlo_mixing_rate: (float)
The mixing rate is used for setting the amount of the Monte Carlo estimate (the full return) that will be mixed into
the single-step bootstrapped targets.
"""
def __init__(self):
super().__init__()
self.monte_carlo_mixing_rate = 0.1
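
A hedged one-line sketch of the target mixing that monte_carlo_mixing_rate controls, per the docstring above.

# Hedged sketch: mixing the Monte Carlo return into the single-step bootstrapped target.
def mixed_target(bootstrapped_target, monte_carlo_return, monte_carlo_mixing_rate=0.1):
    return ((1.0 - monte_carlo_mixing_rate) * bootstrapped_target
            + monte_carlo_mixing_rate * monte_carlo_return)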

View File

@@ -44,6 +44,26 @@ class NStepQNetworkParameters(NetworkParameters):
class NStepQAlgorithmParameters(AlgorithmParameters):
"""
:param num_steps_between_copying_online_weights_to_target: (StepMethod)
The number of steps between copying the online network weights to the target network weights.
:param apply_gradients_every_x_episodes: (int)
The number of episodes between applying the accumulated gradients to the network. After every
num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data,
accumulate them in internal accumulators, and only apply them to the network once every
apply_gradients_every_x_episodes episodes.
:param num_steps_between_gradient_updates: (int)
The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
called t_max. Since this algorithm is on-policy, only the steps collected between each two gradient calculations
are used in the batch.
:param targets_horizon: (str)
Should be either 'N-Step' or '1-Step', and defines the length for which to bootstrap the network values over.
Essentially, 1-Step follows the regular 1 step bootstrapping Q learning update. For more information,
please refer to the original paper (https://arxiv.org/abs/1602.01783)
"""
def __init__(self):
super().__init__()
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)
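
A hedged preset-style sketch of overriding the N-Step Q fields documented above; the agent import path is an assumption, while the field names and EnvironmentSteps come from this diff.

# Hedged sketch, not part of this commit: overriding the documented N-Step Q fields.
from rl_coach.agents.n_step_q_agent import NStepQAlgorithmParameters   # path assumed
from rl_coach.core_types import EnvironmentSteps

algorithm = NStepQAlgorithmParameters()
algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)
algorithm.num_steps_between_gradient_updates = 5      # t_max in the A3C paper
algorithm.apply_gradients_every_x_episodes = 1        # apply accumulated gradients every episode
algorithm.targets_horizon = 'N-Step'                  # or '1-Step' for regular Q learning updates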

View File

@@ -43,6 +43,39 @@ class NECNetworkParameters(NetworkParameters):
class NECAlgorithmParameters(AlgorithmParameters):
"""
:param dnd_size: (int)
Defines the number of transitions that will be stored in each one of the DNDs. Note that the total number
of transitions that will be stored is dnd_size x num_actions.
:param l2_norm_added_delta: (float)
A small value that will be added when calculating the weight of each of the DND entries. This follows the
:math:`\delta` parameter defined in the paper.
:param new_value_shift_coefficient: (float)
In the case where a new embedding that is added to the DND was already present, the value that will be stored
in the DND is a mix between the existing value and the new value. The mix rate is defined by
new_value_shift_coefficient.
:param number_of_knn: (int)
The number of neighbors that will be retrieved for each DND query.
:param DND_key_error_threshold: (float)
When the DND is queried for a specific embedding, this threshold will be used to determine if the embedding
exists in the DND, since exact matches of embeddings are very rare.
:param propagate_updates_to_DND: (bool)
If set to True, when the gradients of the network will be calculated, the gradients will also be
backpropagated through the keys of the DND. The keys will then be updated as well, as if they were regular
network weights.
:param n_step: (int)
The bootstrap length that will be used when calculating the state values to store in the DND.
:param bootstrap_total_return_from_old_policy: (bool)
If set to True, the bootstrap value used to calculate each state-action value is the network value from
when the state was first seen, and not the latest, most up-to-date network value.
"""
def __init__(self):
super().__init__()
self.dnd_size = 500000
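
A hedged numpy sketch of how number_of_knn and l2_norm_added_delta enter a DND lookup, following the inverse-distance kernel from the NEC paper; this is not necessarily coach's exact implementation, and the default values in the signature are assumptions.

# Hedged sketch: a DND lookup using number_of_knn neighbors and the l2_norm_added_delta term.
import numpy as np

def dnd_lookup(query_key, keys, values, number_of_knn=50, l2_norm_added_delta=0.001):
    dists = np.sum((keys - query_key) ** 2, axis=1)           # squared L2 distances to all entries
    nearest = np.argsort(dists)[:number_of_knn]               # retrieve the k nearest entries
    kernel = 1.0 / (dists[nearest] + l2_norm_added_delta)     # inverse-distance kernel with delta
    weights = kernel / kernel.sum()                           # normalized neighbor weights
    return np.dot(weights, values[nearest])                   # weighted value estimate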

View File

@@ -24,6 +24,19 @@ from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperi
class PALAlgorithmParameters(DQNAlgorithmParameters):
"""
:param pal_alpha: (float)
A factor that weights the amount by which the advantage learning update will be taken into account.
:param persistent_advantage_learning: (bool)
If set to True, the persistent mode of advantage learning will be used, which encourages the agent to take
the same actions one after the other instead of changing actions.
:param monte_carlo_mixing_rate: (float)
The amount of Monte Carlo values to mix into the targets of the network. The Monte Carlo values are just the
total discounted returns, and they can help reduce the time it takes for the network to update to the newly
seen values, since it is not based on bootstrapping the current network values.
"""
def __init__(self):
super().__init__()
self.pal_alpha = 0.9

View File

@@ -42,6 +42,27 @@ class PolicyGradientNetworkParameters(NetworkParameters):
class PolicyGradientAlgorithmParameters(AlgorithmParameters):
"""
:param policy_gradient_rescaler: (PolicyGradientRescaler)
The rescaler type to use for the policy gradient loss. For policy gradients, we calculate log probability of
the action and then multiply it by the policy gradient rescaler. The most basic rescaler is the discounted
return, but there are other rescalers that are intended for reducing the variance of the updates.
:param apply_gradients_every_x_episodes: (int)
The number of episodes between applying the accumulated gradients to the network. After every
num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data,
accumulate them in internal accumulators, and only apply them to the network once every
apply_gradients_every_x_episodes episodes.
:param beta_entropy: (float)
A factor which defines the amount of entropy regularization to apply to the network. The entropy of the actions
will be added to the loss and scaled by the given beta factor.
:param num_steps_between_gradient_updates: (int)
The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
called t_max. Since this algorithm is on-policy, only the steps collected between each two gradient calculations
are used in the batch.
"""
def __init__(self):
super().__init__()
self.policy_gradient_rescaler = PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP
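
A hedged numpy sketch of the loss the policy_gradient_rescaler and beta_entropy docstrings describe: the log probability of the taken action scaled by the rescaler, with a beta-weighted entropy term; plain numpy, not coach's TensorFlow head.

# Hedged sketch: the rescaled policy gradient loss with a beta_entropy-weighted entropy term.
import numpy as np

def policy_gradient_loss(log_probs_of_taken_actions, rescaler_values, action_probs, beta_entropy=0.01):
    # log probability of each taken action, multiplied by the rescaler (e.g. the discounted return)
    pg_loss = -np.mean(log_probs_of_taken_actions * rescaler_values)
    # entropy of the action distributions; subtracting it from the loss encourages exploration
    entropy = -np.mean(np.sum(action_probs * np.log(action_probs + 1e-8), axis=1))
    return pg_loss - beta_entropy * entropy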

View File

@@ -63,6 +63,51 @@ class PPOActorNetworkParameters(NetworkParameters):
class PPOAlgorithmParameters(AlgorithmParameters):
"""
:param policy_gradient_rescaler: (PolicyGradientRescaler)
This represents how the critic will be used to update the actor. The critic value function is typically used
to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
advantage of the action, or the generalized advantage estimation (GAE) value.
:param gae_lambda: (float)
The :math:`\lambda` value is used within the GAE function in order to weight different bootstrap length
estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
n-step estimations.
:param target_kl_divergence: (float)
The target KL divergence between the current policy distribution and the new policy. PPO uses a heuristic to
bring the KL divergence to this value, by adding a penalty if the KL divergence is higher.
:param initial_kl_coefficient: (float)
The initial weight that will be given to the KL divergence between the current and the new policy in the
regularization factor.
:param high_kl_penalty_coefficient: (float)
The penalty that will be given for KL divergence values which are higher than what was defined as the target.
:param clip_likelihood_ratio_using_epsilon: (float)
If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
implementations.
:param value_targets_mix_fraction: (float)
The targets for the value network are an exponentially weighted moving average which uses this mix fraction to
define how much of the new targets will be taken into account when calculating the loss.
This value should be set in the range (0, 1], where 1 means that only the new targets will be taken into account.
:param estimate_state_value_using_gae: (bool)
If set to True, the state value will be estimated using the GAE technique.
:param use_kl_regularization: (bool)
If set to True, the loss function will be regularized using the KL divergence between the current and new
policy, to bound the change of the policy during the network update.
:param beta_entropy: (float)
An entropy regularization term can be added to the loss function in order to control exploration. This term
is weighted using the :math:`\beta` value defined by beta_entropy.
"""
def __init__(self):
super().__init__()
self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
@@ -73,7 +118,6 @@ class PPOAlgorithmParameters(AlgorithmParameters):
self.clip_likelihood_ratio_using_epsilon = None
self.value_targets_mix_fraction = 0.1
self.estimate_state_value_using_gae = True
self.step_until_collecting_full_episodes = True
self.use_kl_regularization = True
self.beta_entropy = 0.01
self.num_consecutive_playing_steps = EnvironmentSteps(5000)
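
Since both PPO docstrings above lean on gae_lambda, a hedged numpy sketch of generalized advantage estimation and the exponential decay it controls; not coach's implementation.

# Hedged sketch: generalized advantage estimation, with gae_lambda controlling the decay.
import numpy as np

def gae(rewards, values, bootstrapped_value, gamma=0.99, gae_lambda=0.95):
    values = np.append(values, bootstrapped_value)             # V(s_0) ... V(s_T), plus bootstrap
    deltas = rewards + gamma * values[1:] - values[:-1]        # one-step TD errors
    advantages = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = deltas[t] + gamma * gae_lambda * running     # exponential decay over n-step estimates
        advantages[t] = running
    return advantages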

View File

@@ -34,6 +34,14 @@ class QuantileRegressionDQNNetworkParameters(DQNNetworkParameters):
class QuantileRegressionDQNAlgorithmParameters(DQNAlgorithmParameters):
"""
:param atoms: (int)
The number of atoms to predict for each action
:param huber_loss_interval: (float)
One of the Huber loss parameters, referred to as :math:`\kappa` in the paper.
It describes the interval [-k, k] in which the Huber loss acts as an MSE loss.
"""
def __init__(self):
super().__init__()
self.atoms = 200
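
A hedged numpy sketch of the behaviour huber_loss_interval describes: quadratic (MSE-like) inside [-k, k], linear outside.

# Hedged sketch: the Huber loss with interval k (the huber_loss_interval parameter).
import numpy as np

def huber_loss(td_errors, k=1.0):
    quadratic = np.minimum(np.abs(td_errors), k)     # the part of the error inside [-k, k]
    linear = np.abs(td_errors) - quadratic           # the remainder outside the interval
    return 0.5 * quadratic ** 2 + k * linear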

View File

@@ -37,6 +37,17 @@ class RainbowDQNNetworkParameters(DQNNetworkParameters):
class RainbowDQNAlgorithmParameters(CategoricalDQNAlgorithmParameters):
"""
:param n_step: (int)
The number of steps to bootstrap the network over. The first N-1 steps actual rewards will be accumulated
using an exponentially growing discount factor, and the Nth step will be bootstrapped from the network
prediction.
:param store_transitions_only_when_episodes_are_terminated: (bool)
If set to True, the transitions will be stored in an Episode object until the episode ends, and just then
written to the memory. This is useful since we want to calculate the N-step discounted rewards before saving the
transitions into the memory, and to do so we need the entire episode first.
"""
def __init__(self):
super().__init__()
self.n_step = 3
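
A hedged sketch of the n_step target the docstring above describes: accumulate the actual rewards with an exponentially growing discount factor and bootstrap the rest from the network prediction; indexing conventions may differ slightly from coach's.

# Hedged sketch: an n-step target with exponentially growing discounts and a bootstrapped tail.
def n_step_target(rewards, bootstrapped_value, gamma=0.99):
    target = 0.0
    for i, reward in enumerate(rewards):                     # actual rewards collected over the n steps
        target += (gamma ** i) * reward
    target += (gamma ** len(rewards)) * bootstrapped_value   # bootstrap from the network prediction
    return target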