Mirror of https://github.com/gryf/coach.git (synced 2025-12-17 19:20:19 +01:00)
update of api docstrings across coach and tutorials [WIP] (#91)
* updating the documentation website
* adding the built docs
* update of api docstrings across coach and tutorials 0-2
* added some missing api documentation
* New Sphinx based documentation
@@ -63,6 +63,51 @@ class PPOActorNetworkParameters(NetworkParameters):


class PPOAlgorithmParameters(AlgorithmParameters):
    """
    :param policy_gradient_rescaler: (PolicyGradientRescaler)
        This represents how the critic will be used to update the actor. The critic value function is typically used
        to rescale the gradients calculated by the actor. There are several ways of doing this, such as using the
        advantage of the action, or the generalized advantage estimation (GAE) value.

    :param gae_lambda: (float)
        The :math:`\lambda` value is used within the GAE function in order to weight different bootstrap length
        estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
        n-step estimations.

    :param target_kl_divergence: (float)
        The target KL divergence between the current policy distribution and the new policy. PPO uses a heuristic to
        bring the KL divergence to this value, by adding a penalty if the KL divergence is higher.

    :param initial_kl_coefficient: (float)
        The initial weight that will be given to the KL divergence between the current and the new policy in the
        regularization factor.

    :param high_kl_penalty_coefficient: (float)
        The penalty that will be given for KL divergence values which are higher than what was defined as the target.

    :param clip_likelihood_ratio_using_epsilon: (float)
        If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
        clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
        This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
        implementations.

    :param value_targets_mix_fraction: (float)
        The targets for the value network are an exponentially weighted moving average which uses this mix fraction
        to define how much of the new targets will be taken into account when calculating the loss.
        This value should be in the range (0, 1], where 1 means that only the new targets will be taken into account.

    :param estimate_state_value_using_gae: (bool)
        If set to True, the state value will be estimated using the GAE technique.

    :param use_kl_regularization: (bool)
        If set to True, the loss function will be regularized using the KL divergence between the current and new
        policy, to bound the change of the policy during the network update.

    :param beta_entropy: (float)
        An entropy regularization term can be added to the loss function in order to control exploration. This term
        is weighted using the :math:`\beta` value defined by beta_entropy.
    """
    def __init__(self):
        super().__init__()
        self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
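The policy_gradient_rescaler and gae_lambda docstrings above describe how n-step return estimates are blended into an advantage signal. As a minimal illustration of that mechanism (a generic NumPy sketch of generalized advantage estimation, not the code path Coach itself uses), the exponential weighting controlled by lambda looks roughly like this:

import numpy as np

def generalized_advantage_estimation(rewards, values, bootstrap_value,
                                     discount=0.99, gae_lambda=0.95):
    # values holds the critic's estimates V(s_0)..V(s_{T-1}); bootstrap_value is V(s_T)
    values = np.append(values, bootstrap_value)
    # one-step TD errors: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = rewards + discount * values[1:] - values[:-1]
    advantages = np.zeros(len(rewards))
    running = 0.0
    # backward recursion: A_t = delta_t + gamma * lambda * A_{t+1}
    # lambda near 1 keeps long n-step estimates alive, lambda near 0 decays them quickly
    for t in reversed(range(len(rewards))):
        running = deltas[t] + discount * gae_lambda * running
        advantages[t] = running
    return advantages

# tiny three-step rollout, values chosen arbitrarily for the example
print(generalized_advantage_estimation(rewards=np.array([1.0, 0.0, 1.0]),
                                       values=np.array([0.5, 0.4, 0.6]),
                                       bootstrap_value=0.0))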
@@ -73,7 +118,6 @@ class PPOAlgorithmParameters(AlgorithmParameters):
        self.clip_likelihood_ratio_using_epsilon = None
        self.value_targets_mix_fraction = 0.1
        self.estimate_state_value_using_gae = True
        self.step_until_collecting_full_episodes = True
        self.use_kl_regularization = True
        self.beta_entropy = 0.01
        self.num_consecutive_playing_steps = EnvironmentSteps(5000)
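The clip_likelihood_ratio_using_epsilon parameter refers to the clipped surrogate objective of Clipped PPO. A small NumPy sketch of that objective (a generic illustration of the clipping, not the loss head defined in this commit) makes the [1-epsilon, 1+epsilon] range concrete:

import numpy as np

def clipped_surrogate_loss(new_log_probs, old_log_probs, advantages, epsilon=0.2):
    # likelihood ratio between the new and old policies for the sampled actions
    ratio = np.exp(new_log_probs - old_log_probs)
    unclipped = ratio * advantages
    if epsilon is None:
        # regular PPO: no clipping, the policy change is bounded by a KL penalty instead
        return -np.mean(unclipped)
    # Clipped PPO: bound the ratio to [1 - epsilon, 1 + epsilon] and take the
    # pessimistic (minimum) term, so overly large policy updates stop being rewarded
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
    return -np.mean(np.minimum(unclipped, clipped))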
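The KL-related knobs (target_kl_divergence, initial_kl_coefficient, high_kl_penalty_coefficient, use_kl_regularization) and beta_entropy combine into a penalized loss. The sketch below shows one common way such an adaptive KL penalty is implemented; the update schedule and the use of high_kl_penalty_coefficient as a cap are assumptions made for illustration, not Coach's exact rule.

def penalized_loss(surrogate_loss, kl_divergence, entropy,
                   kl_coefficient, beta_entropy=0.01):
    # the KL term keeps the new policy close to the old one,
    # the entropy bonus keeps the policy from collapsing too early
    return surrogate_loss + kl_coefficient * kl_divergence - beta_entropy * entropy

def adapt_kl_coefficient(kl_coefficient, measured_kl, target_kl_divergence=0.01,
                         high_kl_penalty_coefficient=1000.0):
    # Heuristic: strengthen the penalty when the measured KL overshoots the target,
    # relax it when the policy barely moved. Capping the coefficient at
    # high_kl_penalty_coefficient is an assumption of this sketch.
    if measured_kl > 1.5 * target_kl_divergence:
        kl_coefficient = min(kl_coefficient * 2.0, high_kl_penalty_coefficient)
    elif measured_kl < target_kl_divergence / 1.5:
        kl_coefficient = max(kl_coefficient / 2.0, 1e-4)
    return kl_coefficient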
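Finally, value_targets_mix_fraction describes an exponentially weighted moving average over the value-network regression targets. A hypothetical helper (name and signature invented for this note) shows what the (0, 1] range means in practice:

import numpy as np

def mix_value_targets(previous_targets, new_targets, mix_fraction=0.1):
    # mix_fraction = 1.0 keeps only the freshly computed targets;
    # smaller values move the regression targets more gradually between updates
    if not 0.0 < mix_fraction <= 1.0:
        raise ValueError("mix_fraction must be in the range (0, 1]")
    return ((1.0 - mix_fraction) * np.asarray(previous_targets)
            + mix_fraction * np.asarray(new_targets))

print(mix_value_targets([1.0, 2.0], [2.0, 0.0], mix_fraction=0.1))  # -> [1.1 1.8]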