1
0
mirror of https://github.com/gryf/coach.git synced 2026-04-04 03:03:32 +02:00

update of api docstrings across coach and tutorials [WIP] (#91)

* updating the documentation website
* adding the built docs
* update of api docstrings across coach and tutorials 0-2
* added some missing api documentation
* New Sphinx based documentation
This commit is contained in:
Itai Caspi
2018-11-15 15:00:13 +02:00
committed by Gal Novik
parent 524f8436a2
commit 6d40ad1650
517 changed files with 71034 additions and 12834 deletions

View File

@@ -13,3 +13,43 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Each exploration policy module exports a pair: the policy class itself and a
# matching <Policy>Parameters class; both are re-exported here so callers can
# import them directly from the exploration package.
from .additive_noise import AdditiveNoiseParameters, AdditiveNoise
from .boltzmann import BoltzmannParameters, Boltzmann
from .bootstrapped import BootstrappedParameters, Bootstrapped
from .categorical import CategoricalParameters, Categorical
from .continuous_entropy import ContinuousEntropyParameters, ContinuousEntropy
from .e_greedy import EGreedyParameters, EGreedy
from .exploration_policy import ExplorationParameters, ExplorationPolicy
from .greedy import GreedyParameters, Greedy
from .ou_process import OUProcessParameters, OUProcess
from .parameter_noise import ParameterNoiseParameters, ParameterNoise
from .truncated_normal import TruncatedNormalParameters, TruncatedNormal
from .ucb import UCBParameters, UCB
# Explicit public API of the package: every policy together with its
# parameters class, mirroring the imports above.
__all__ = [
    'AdditiveNoiseParameters',
    'AdditiveNoise',
    'BoltzmannParameters',
    'Boltzmann',
    'BootstrappedParameters',
    'Bootstrapped',
    'CategoricalParameters',
    'Categorical',
    'ContinuousEntropyParameters',
    'ContinuousEntropy',
    'EGreedyParameters',
    'EGreedy',
    'ExplorationParameters',
    'ExplorationPolicy',
    'GreedyParameters',
    'Greedy',
    'OUProcessParameters',
    'OUProcess',
    'ParameterNoiseParameters',
    'ParameterNoise',
    'TruncatedNormalParameters',
    'TruncatedNormal',
    'UCBParameters',
    'UCB'
]

View File

@@ -37,6 +37,14 @@ class AdditiveNoiseParameters(ExplorationParameters):
class AdditiveNoise(ExplorationPolicy):
"""
AdditiveNoise is an exploration policy intended for continuous action spaces. It takes the action from the agent
and adds a Gaussian distributed noise to it. The amount of noise added to the action follows the noise amount that
can be given in two different ways:
1. Specified by the user as a noise schedule which is taken in percentiles out of the action space size
2. Specified by the agent's action. In case the agent's action is a list with 2 values, the 1st one is assumed to
be the mean of the action, and the 2nd is assumed to be its standard deviation.
"""
def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
evaluation_noise_percentage: float):
"""

View File

@@ -36,6 +36,12 @@ class BoltzmannParameters(ExplorationParameters):
class Boltzmann(ExplorationPolicy):
"""
The Boltzmann exploration policy is intended for discrete action spaces. It assumes that each of the possible
actions has some value assigned to it (such as the Q value), and uses a softmax function to convert these values
into a distribution over the actions. It then samples the action for playing out of the calculated distribution.
An additional temperature schedule can be given by the user, and will control the steepness of the softmax function.
"""
def __init__(self, action_space: ActionSpace, temperature_schedule: Schedule):
"""
:param action_space: the action space used by the environment

View File

@@ -39,6 +39,17 @@ class BootstrappedParameters(EGreedyParameters):
class Bootstrapped(EGreedy):
"""
Bootstrapped exploration policy is currently only used for discrete action spaces along with the
Bootstrapped DQN agent. It assumes that there is an ensemble of network heads, where each one predicts the
values for all the possible actions. For each episode, a single head is selected to lead the agent, according
to its value predictions. In evaluation, the action is selected using a majority vote over all the heads'
predictions.
.. note::
This exploration policy will only work for Discrete action spaces with Bootstrapped DQN style agents,
since it requires the agent to have a network with multiple heads.
"""
def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
architecture_num_q_heads: int,
continuous_exploration_policy_parameters: ExplorationParameters = AdditiveNoiseParameters(),):

View File

@@ -30,6 +30,12 @@ class CategoricalParameters(ExplorationParameters):
class Categorical(ExplorationPolicy):
"""
Categorical exploration policy is intended for discrete action spaces. It expects the action values to
represent a probability distribution over the action, from which a single action will be sampled.
In evaluation, the action that has the highest probability will be selected. This is particularly useful for
actor-critic schemes, where the actor's output is a probability distribution over the actions.
"""
def __init__(self, action_space: ActionSpace):
"""
:param action_space: the action space used by the environment

View File

@@ -24,4 +24,15 @@ class ContinuousEntropyParameters(AdditiveNoiseParameters):
class ContinuousEntropy(AdditiveNoise):
    """
    A placeholder exploration policy for entropy regularization, which is actually
    implemented inside the network: selecting this policy causes a regularization
    factor to be added to the network loss, penalizing the entropy of the action.
    It is intended for continuous action spaces only, and assumes the entire
    calculation is implemented as part of the network head.

    .. warning::
        This exploration policy expects the agent or the network to implement the
        exploration functionality. Only a few heads are actually relevant and
        implement the entropy regularization factor.
    """

View File

@@ -43,6 +43,19 @@ class EGreedyParameters(ExplorationParameters):
class EGreedy(ExplorationPolicy):
"""
e-greedy is an exploration policy that is intended for both discrete and continuous action spaces.
For discrete action spaces, it assumes that each action is assigned a value, and it selects the action with the
highest value with probability 1 - epsilon. Otherwise, it selects an action sampled uniformly out of all the
possible actions. The epsilon value is given by the user and can be given as a schedule.
In evaluation, a different epsilon value can be specified.
For continuous action spaces, it assumes that the mean action is given by the agent. With probability epsilon,
it samples a random action out of the action space bounds. Otherwise, it selects the action according to a
given continuous exploration policy, which is set to AdditiveNoise by default. In evaluation, the action is
always selected according to the given continuous exploration policy (where its phase is set to evaluation as well).
"""
def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule,
evaluation_epsilon: float,
continuous_exploration_policy_parameters: ExplorationParameters=AdditiveNoiseParameters()):

View File

@@ -31,6 +31,10 @@ class ExplorationParameters(Parameters):
class ExplorationPolicy(object):
"""
An exploration policy takes the predicted actions or action values from the agent, and selects the action to
actually apply to the environment using some predefined algorithm.
"""
def __init__(self, action_space: ActionSpace):
"""
:param action_space: the action space used by the environment

View File

@@ -30,6 +30,11 @@ class GreedyParameters(ExplorationParameters):
class Greedy(ExplorationPolicy):
"""
The Greedy exploration policy is intended for both discrete and continuous action spaces.
For discrete action spaces, it always selects the action with the maximum value, as given by the agent.
For continuous action spaces, it always returns the exact action, as it was given by the agent.
"""
def __init__(self, action_space: ActionSpace):
"""
:param action_space: the action space used by the environment

View File

@@ -40,6 +40,11 @@ class OUProcessParameters(ExplorationParameters):
# Ornstein-Uhlenbeck process
class OUProcess(ExplorationPolicy):
"""
OUProcess exploration policy is intended for continuous action spaces, and selects the action according to
an Ornstein-Uhlenbeck process. The Ornstein-Uhlenbeck process implements the action as a Gaussian process, where
the samples are correlated between consequent time steps.
"""
def __init__(self, action_space: ActionSpace, mu: float=0, theta: float=0.15, sigma: float=0.2, dt: float=0.01):
"""
:param action_space: the action space used by the environment

View File

@@ -42,10 +42,18 @@ class ParameterNoiseParameters(ExplorationParameters):
class ParameterNoise(ExplorationPolicy):
"""
The ParameterNoise exploration policy is intended for both discrete and continuous action spaces.
It applies the exploration policy by replacing all the dense network layers with noisy layers.
The noisy layers have both weight means and weight standard deviations, and for each forward pass of the network
the weights are sampled from a normal distribution that follows the learned weights mean and standard deviation
values.
Warning: currently supported only by DQN variants
"""
def __init__(self, network_params: Dict[str, NetworkParameters], action_space: ActionSpace):
    """
    :param network_params: a dictionary mapping network names to their parameters;
                           the dense layers of these networks are replaced with
                           noisy layers to implement the exploration
    :param action_space: the action space used by the environment
    """
    super().__init__(action_space)
    self.network_params = network_params

View File

@@ -39,6 +39,16 @@ class TruncatedNormalParameters(ExplorationParameters):
class TruncatedNormal(ExplorationPolicy):
"""
The TruncatedNormal exploration policy is intended for continuous action spaces. It samples the action from a
normal distribution, where the mean action is given by the agent, and the standard deviation can be given in
two different ways:
1. Specified by the user as a noise schedule which is taken in percentiles out of the action space size
2. Specified by the agent's action. In case the agent's action is a list with 2 values, the 1st one is assumed to
be the mean of the action, and the 2nd is assumed to be its standard deviation.
When the sampled action is outside of the action bounds given by the user, it is sampled again and again, until it
is within the bounds.
"""
def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
evaluation_noise_percentage: float, clip_low: float, clip_high: float):
"""

View File

@@ -43,6 +43,15 @@ class UCBParameters(EGreedyParameters):
class UCB(EGreedy):
"""
The UCB exploration policy follows the upper confidence bound heuristic to sample actions in discrete action spaces.
It assumes that there are multiple network heads that are predicting action values, and that the standard deviation
between the heads' predictions represents the uncertainty of the agent in each of the actions.
It then updates the action value estimates to be mean(actions)+lambda*stdev(actions), where lambda is
given by the user. This exploration policy aims to take advantage of the uncertainty of the agent in its predictions,
and select the action according to the tradeoff between how uncertain the agent is, and how large it predicts
the outcome from those actions to be.
"""
def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
architecture_num_q_heads: int, lamb: int,
continuous_exploration_policy_parameters: ExplorationParameters = AdditiveNoiseParameters()):