mirror of
https://github.com/gryf/coach.git
synced 2026-04-04 03:03:32 +02:00
update of api docstrings across coach and tutorials [WIP] (#91)
* updating the documentation website * adding the built docs * update of api docstrings across coach and tutorials 0-2 * added some missing api documentation * New Sphinx based documentation
This commit is contained in:
@@ -13,3 +13,43 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from .additive_noise import AdditiveNoiseParameters, AdditiveNoise
|
||||
from .boltzmann import BoltzmannParameters, Boltzmann
|
||||
from .bootstrapped import BootstrappedParameters, Bootstrapped
|
||||
from .categorical import CategoricalParameters, Categorical
|
||||
from .continuous_entropy import ContinuousEntropyParameters, ContinuousEntropy
|
||||
from .e_greedy import EGreedyParameters, EGreedy
|
||||
from .exploration_policy import ExplorationParameters, ExplorationPolicy
|
||||
from .greedy import GreedyParameters, Greedy
|
||||
from .ou_process import OUProcessParameters, OUProcess
|
||||
from .parameter_noise import ParameterNoiseParameters, ParameterNoise
|
||||
from .truncated_normal import TruncatedNormalParameters, TruncatedNormal
|
||||
from .ucb import UCBParameters, UCB
|
||||
|
||||
# Public API of the exploration_policies package: each policy class together
# with its corresponding parameters class.
__all__ = [
    'AdditiveNoiseParameters', 'AdditiveNoise',
    'BoltzmannParameters', 'Boltzmann',
    'BootstrappedParameters', 'Bootstrapped',
    'CategoricalParameters', 'Categorical',
    'ContinuousEntropyParameters', 'ContinuousEntropy',
    'EGreedyParameters', 'EGreedy',
    'ExplorationParameters', 'ExplorationPolicy',
    'GreedyParameters', 'Greedy',
    'OUProcessParameters', 'OUProcess',
    'ParameterNoiseParameters', 'ParameterNoise',
    'TruncatedNormalParameters', 'TruncatedNormal',
    'UCBParameters', 'UCB',
]
|
||||
|
||||
@@ -37,6 +37,14 @@ class AdditiveNoiseParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class AdditiveNoise(ExplorationPolicy):
|
||||
"""
|
||||
AdditiveNoise is an exploration policy intended for continuous action spaces. It takes the action from the agent
|
||||
and adds a Gaussian distributed noise to it. The amount of noise added to the action follows the noise amount that
|
||||
can be given in two different ways:
|
||||
1. Specified by the user as a noise schedule which is taken in percentiles out of the action space size
|
||||
2. Specified by the agents action. In case the agents action is a list with 2 values, the 1st one is assumed to
|
||||
be the mean of the action, and 2nd is assumed to be its standard deviation.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
|
||||
evaluation_noise_percentage: float):
|
||||
"""
|
||||
|
||||
@@ -36,6 +36,12 @@ class BoltzmannParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class Boltzmann(ExplorationPolicy):
|
||||
"""
|
||||
The Boltzmann exploration policy is intended for discrete action spaces. It assumes that each of the possible
|
||||
actions has some value assigned to it (such as the Q value), and uses a softmax function to convert these values
|
||||
into a distribution over the actions. It then samples the action for playing out of the calculated distribution.
|
||||
An additional temperature schedule can be given by the user, and will control the steepness of the softmax function.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, temperature_schedule: Schedule):
|
||||
"""
|
||||
:param action_space: the action space used by the environment
|
||||
|
||||
@@ -39,6 +39,17 @@ class BootstrappedParameters(EGreedyParameters):
|
||||
|
||||
|
||||
class Bootstrapped(EGreedy):
|
||||
"""
|
||||
Bootstrapped exploration policy is currently only used for discrete action spaces along with the
|
||||
Bootstrapped DQN agent. It assumes that there is an ensemble of network heads, where each one predicts the
|
||||
values for all the possible actions. For each episode, a single head is selected to lead the agent, according
|
||||
to its value predictions. In evaluation, the action is selected using a majority vote over all the heads
|
||||
predictions.
|
||||
|
||||
.. note::
|
||||
This exploration policy will only work for Discrete action spaces with Bootstrapped DQN style agents,
|
||||
since it requires the agent to have a network with multiple heads.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
|
||||
architecture_num_q_heads: int,
|
||||
continuous_exploration_policy_parameters: ExplorationParameters = AdditiveNoiseParameters(),):
|
||||
|
||||
@@ -30,6 +30,12 @@ class CategoricalParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class Categorical(ExplorationPolicy):
|
||||
"""
|
||||
Categorical exploration policy is intended for discrete action spaces. It expects the action values to
|
||||
represent a probability distribution over the action, from which a single action will be sampled.
|
||||
In evaluation, the action that has the highest probability will be selected. This is particularly useful for
|
||||
actor-critic schemes, where the actors output is a probability distribution over the actions.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace):
|
||||
"""
|
||||
:param action_space: the action space used by the environment
|
||||
|
||||
@@ -24,4 +24,15 @@ class ContinuousEntropyParameters(AdditiveNoiseParameters):
|
||||
|
||||
|
||||
class ContinuousEntropy(AdditiveNoise):
    """
    Placeholder exploration policy for continuous action spaces whose actual
    exploration mechanism lives in the network itself: an entropy
    regularization term is added to the network loss, which regularizes the
    entropy of the action distribution. Selecting this class only signals
    that choice — the class body intentionally adds no behavior on top of
    :class:`AdditiveNoise`, since the entire calculation is assumed to be
    implemented as part of the network head.

    .. warning::
        This exploration policy expects the agent or the network to implement
        the exploration functionality. Only a few heads are relevant and
        actually implement the entropy regularization factor.
    """
    pass
|
||||
|
||||
@@ -43,6 +43,19 @@ class EGreedyParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class EGreedy(ExplorationPolicy):
|
||||
"""
|
||||
e-greedy is an exploration policy that is intended for both discrete and continuous action spaces.
|
||||
|
||||
For discrete action spaces, it assumes that each action is assigned a value, and it selects the action with the
|
||||
highest value with probability 1 - epsilon. Otherwise, it selects a action sampled uniformly out of all the
|
||||
possible actions. The epsilon value is given by the user and can be given as a schedule.
|
||||
In evaluation, a different epsilon value can be specified.
|
||||
|
||||
For continuous action spaces, it assumes that the mean action is given by the agent. With probability epsilon,
|
||||
it samples a random action out of the action space bounds. Otherwise, it selects the action according to a
|
||||
given continuous exploration policy, which is set to AdditiveNoise by default. In evaluation, the action is
|
||||
always selected according to the given continuous exploration policy (where its phase is set to evaluation as well).
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule,
|
||||
evaluation_epsilon: float,
|
||||
continuous_exploration_policy_parameters: ExplorationParameters=AdditiveNoiseParameters()):
|
||||
|
||||
@@ -31,6 +31,10 @@ class ExplorationParameters(Parameters):
|
||||
|
||||
|
||||
class ExplorationPolicy(object):
|
||||
"""
|
||||
An exploration policy takes the predicted actions or action values from the agent, and selects the action to
|
||||
actually apply to the environment using some predefined algorithm.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace):
|
||||
"""
|
||||
:param action_space: the action space used by the environment
|
||||
|
||||
@@ -30,6 +30,11 @@ class GreedyParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class Greedy(ExplorationPolicy):
|
||||
"""
|
||||
The Greedy exploration policy is intended for both discrete and continuous action spaces.
|
||||
For discrete action spaces, it always selects the action with the maximum value, as given by the agent.
|
||||
For continuous action spaces, it always return the exact action, as it was given by the agent.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace):
|
||||
"""
|
||||
:param action_space: the action space used by the environment
|
||||
|
||||
@@ -40,6 +40,11 @@ class OUProcessParameters(ExplorationParameters):
|
||||
|
||||
# Ornstein-Uhlenbeck process
|
||||
class OUProcess(ExplorationPolicy):
|
||||
"""
|
||||
OUProcess exploration policy is intended for continuous action spaces, and selects the action according to
|
||||
an Ornstein-Uhlenbeck process. The Ornstein-Uhlenbeck process implements the action as a Gaussian process, where
|
||||
the samples are correlated between consequent time steps.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, mu: float=0, theta: float=0.15, sigma: float=0.2, dt: float=0.01):
|
||||
"""
|
||||
:param action_space: the action space used by the environment
|
||||
|
||||
@@ -42,10 +42,18 @@ class ParameterNoiseParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class ParameterNoise(ExplorationPolicy):
|
||||
"""
|
||||
The ParameterNoise exploration policy is intended for both discrete and continuous action spaces.
|
||||
It applies the exploration policy by replacing all the dense network layers with noisy layers.
|
||||
The noisy layers have both weight means and weight standard deviations, and for each forward pass of the network
|
||||
the weights are sampled from a normal distribution that follows the learned weights mean and standard deviation
|
||||
values.
|
||||
|
||||
Warning: currently supported only by DQN variants
|
||||
"""
|
||||
def __init__(self, network_params: Dict[str, NetworkParameters], action_space: ActionSpace):
    """
    :param network_params: a dict mapping network names to their
        NetworkParameters; stored on the instance so the noisy layers can be
        applied to the relevant networks
    :param action_space: the action space used by the environment
    """
    super().__init__(action_space)
    self.network_params = network_params
|
||||
|
||||
@@ -39,6 +39,16 @@ class TruncatedNormalParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class TruncatedNormal(ExplorationPolicy):
|
||||
"""
|
||||
The TruncatedNormal exploration policy is intended for continuous action spaces. It samples the action from a
|
||||
normal distribution, where the mean action is given by the agent, and the standard deviation can be given in t
|
||||
wo different ways:
|
||||
1. Specified by the user as a noise schedule which is taken in percentiles out of the action space size
|
||||
2. Specified by the agents action. In case the agents action is a list with 2 values, the 1st one is assumed to
|
||||
be the mean of the action, and 2nd is assumed to be its standard deviation.
|
||||
When the sampled action is outside of the action bounds given by the user, it is sampled again and again, until it
|
||||
is within the bounds.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
|
||||
evaluation_noise_percentage: float, clip_low: float, clip_high: float):
|
||||
"""
|
||||
|
||||
@@ -43,6 +43,15 @@ class UCBParameters(EGreedyParameters):
|
||||
|
||||
|
||||
class UCB(EGreedy):
|
||||
"""
|
||||
UCB exploration policy is following the upper confidence bound heuristic to sample actions in discrete action spaces.
|
||||
It assumes that there are multiple network heads that are predicting action values, and that the standard deviation
|
||||
between the heads predictions represents the uncertainty of the agent in each of the actions.
|
||||
It then updates the action value estimates to by mean(actions)+lambda*stdev(actions), where lambda is
|
||||
given by the user. This exploration policy aims to take advantage of the uncertainty of the agent in its predictions,
|
||||
and select the action according to the tradeoff between how uncertain the agent is, and how large it predicts
|
||||
the outcome from those actions to be.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
|
||||
architecture_num_q_heads: int, lamb: int,
|
||||
continuous_exploration_policy_parameters: ExplorationParameters = AdditiveNoiseParameters()):
|
||||
|
||||
Reference in New Issue
Block a user