mirror of
https://github.com/gryf/coach.git
synced 2026-02-18 23:45:48 +01:00
update of api docstrings across coach and tutorials [WIP] (#91)
* updating the documentation website
* adding the built docs
* update of api docstrings across coach and tutorials 0-2
* added some missing api documentation
* New Sphinx based documentation
@@ -36,25 +36,25 @@ from rl_coach.utils import last_sample
class ActorCriticAlgorithmParameters(AlgorithmParameters):
    """
    :param policy_gradient_rescaler: (PolicyGradientRescaler)
        The value that will be used to rescale the policy gradient.

    :param apply_gradients_every_x_episodes: (int)
        The number of episodes to wait before applying the accumulated gradients to the network.
        The training iterations only accumulate gradients without actually applying them.

    :param beta_entropy: (float)
        The weight that will be given to the entropy regularization, which is used in order to improve exploration.

    :param num_steps_between_gradient_updates: (int)
        Every num_steps_between_gradient_updates transitions will be considered as a single batch and used for
        accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step
        formulation.

    :param gae_lambda: (float)
        If the policy gradient rescaler was defined as PolicyGradientRescaler.GAE, the generalized advantage
        estimation scheme will be used, in which case the lambda value controls the decay for the different
        n-step lengths.

    :param estimate_state_value_using_gae: (bool)
        If set to True, the state value targets for the V head will be estimated using the GAE scheme.
    """
    def __init__(self):
        super().__init__()
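For illustration (outside the diff), a minimal NumPy sketch of the generalized advantage estimation that gae_lambda controls; the function name and signature are hypothetical, not Coach's API:

    import numpy as np

    def generalized_advantage_estimation(rewards, values, discount=0.99, gae_lambda=0.95):
        # values holds V(s_t) for every visited state plus a bootstrap value for the final state
        deltas = rewards + discount * values[1:] - values[:-1]
        advantages = np.zeros_like(deltas)
        running = 0.0
        for t in reversed(range(len(deltas))):
            # longer n-step estimates decay exponentially with discount * gae_lambda
            running = deltas[t] + discount * gae_lambda * running
            advantages[t] = running
        return advantages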
@@ -39,7 +39,7 @@ from rl_coach.memories.backend.memory_impl import get_memory_backend
class Agent(AgentInterface):
    def __init__(self, agent_parameters: AgentParameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        """
        :param agent_parameters: An AgentParameters class instance with all the agent parameters
        """
        super().__init__()
        self.ap = agent_parameters
@@ -175,18 +175,20 @@ class Agent(AgentInterface):
        np.random.seed()

    @property
    def parent(self) -> 'LevelManager':
        """
        Get the parent class of the agent

        :return: the parent of the agent
        """
        return self._parent

    @parent.setter
    def parent(self, val) -> None:
        """
        Change the parent class of the agent.
        Additionally, updates the full name of the agent

        :param val: the new parent
        :return: None
        """
@@ -196,7 +198,12 @@ class Agent(AgentInterface):
            raise ValueError("The parent of an agent must have a name")
            self.full_name_id = self.ap.full_name_id = "{}/{}".format(self._parent.name, self.name)

    def setup_logger(self) -> None:
        """
        Setup the logger for the agent

        :return: None
        """
        # dump documentation
        logger_prefix = "{graph_name}.{level_name}.{agent_full_id}".\
            format(graph_name=self.parent_level_manager.parent_graph_manager.name,
@@ -212,6 +219,7 @@ class Agent(AgentInterface):
    def set_session(self, sess) -> None:
        """
        Set the deep learning framework session for all the agents in the composite agent

        :return: None
        """
        self.input_filter.set_session(sess)
@@ -223,6 +231,7 @@ class Agent(AgentInterface):
                        dump_one_value_per_step: bool=False) -> Signal:
        """
        Register a signal such that its statistics will be dumped and be viewable through dashboard

        :param signal_name: the name of the signal as it will appear in dashboard
        :param dump_one_value_per_episode: should the signal value be written for each episode?
        :param dump_one_value_per_step: should the signal value be written for each step?
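For illustration (outside the diff), a hedged usage fragment showing the typical pattern for a custom signal inside an Agent subclass; Signal.add_sample is assumed to behave as it does elsewhere in Coach:

    # in the agent's __init__ (hypothetical subclass)
    self.my_q_signal = self.register_signal("My Q Value", dump_one_value_per_step=True)

    # later, e.g. while learning from a batch
    self.my_q_signal.add_sample(mean_q_value)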
@@ -239,6 +248,7 @@ class Agent(AgentInterface):
        """
        Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
        dependent on those values, by calling init_environment_dependent_modules

        :param spaces: the environment spaces definition
        :return: None
        """
@@ -274,6 +284,7 @@ class Agent(AgentInterface):
        Create all the networks of the agent.
        The network creation will be done after setting the environment parameters for the agent, since they are needed
        for creating the network.

        :return: A list containing all the networks
        """
        networks = {}
@@ -295,6 +306,7 @@ class Agent(AgentInterface):
        """
        Initialize any modules that depend on knowing information about the environment such as the action space or
        the observation space

        :return: None
        """
        # initialize exploration policy
@@ -314,13 +326,19 @@ class Agent(AgentInterface):

    @property
    def phase(self) -> RunPhase:
        """
        The current running phase of the agent

        :return: RunPhase
        """
        return self._phase

    @phase.setter
    def phase(self, val: RunPhase) -> None:
        """
        Change the phase of the run for the agent and all the sub components

        :param val: the new run phase (TRAIN, TEST, etc.)
        :return: None
        """
        self.reset_evaluation_state(val)
@@ -328,6 +346,14 @@ class Agent(AgentInterface):
        self.exploration_policy.change_phase(val)

    def reset_evaluation_state(self, val: RunPhase) -> None:
        """
        Perform accumulators initialization when entering an evaluation phase, and signal dumping when exiting an
        evaluation phase. Entering or exiting the evaluation phase is determined according to the new phase given
        by val, and by the current phase set in self.phase.

        :param val: The new phase to change to
        :return: None
        """
        starting_evaluation = (val == RunPhase.TEST)
        ending_evaluation = (self.phase == RunPhase.TEST)

@@ -363,6 +389,7 @@ class Agent(AgentInterface):
        This function is a wrapper to allow having the same calls for shared or unshared memories.
        It should be used instead of calling the memory directly in order to allow different algorithms to work
        both with a shared and a local memory.

        :param func: the name of the memory function to call
        :param args: the arguments to supply to the function
        :return: the return value of the function
@@ -375,7 +402,12 @@ class Agent(AgentInterface):
            result = getattr(self.memory, func)(*args)
        return result

    def log_to_screen(self) -> None:
        """
        Write an episode summary line to the terminal

        :return: None
        """
        # log to screen
        log = OrderedDict()
        log["Name"] = self.full_name_id
@@ -388,9 +420,10 @@ class Agent(AgentInterface):
        log["Training iteration"] = self.training_iteration
        screen.log_dict(log, prefix=self.phase.value)

    def update_step_in_episode_log(self) -> None:
        """
        Updates the in-episode log file with all the signal values from the most recent step.

        :return: None
        """
        # log all the signals to file
@@ -411,9 +444,12 @@ class Agent(AgentInterface):
        # dump
        self.agent_episode_logger.dump_output_csv()

    def update_log(self) -> None:
        """
        Updates the episodic log file with all the signal values from the most recent episode.
        Additional signals for logging can be set by creating a new signal using self.register_signal,
        and then updating it with some internal agent values.

        :return: None
        """
        # log all the signals to file
@@ -438,7 +474,6 @@ class Agent(AgentInterface):
        self.agent_logger.create_signal_value('Shaped Evaluation Reward', np.nan, overwrite=False)
        self.agent_logger.create_signal_value('Success Rate', np.nan, overwrite=False)

        for signal in self.episode_signals:
            self.agent_logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
            self.agent_logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
@@ -452,7 +487,10 @@ class Agent(AgentInterface):

    def handle_episode_ended(self) -> None:
        """
        Make any changes needed when each episode is ended.
        This includes incrementing counters, updating full episode dependent values, updating logs, etc.
        This function is called right after each episode is ended.

        :return: None
        """
        self.current_episode_buffer.is_complete = True
@@ -486,9 +524,10 @@ class Agent(AgentInterface):
        if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
            self.log_to_screen()

    def reset_internal_state(self) -> None:
        """
        Reset all the episodic parameters. This function is called right before each episode starts.

        :return: None
        """
        for signal in self.episode_signals:
@@ -516,6 +555,7 @@ class Agent(AgentInterface):
    def learn_from_batch(self, batch) -> Tuple[float, List, List]:
        """
        Given a batch of transitions, calculates their target values and updates the network.

        :param batch: A list of transitions
        :return: The total loss of the training, the loss per head and the unclipped gradients
        """
@@ -524,6 +564,7 @@ class Agent(AgentInterface):
    def _should_update_online_weights_to_target(self):
        """
        Determine if online weights should be copied to the target.

        :return: boolean: True if the online weights should be copied to the target.
        """

@@ -542,9 +583,10 @@ class Agent(AgentInterface):
                "EnvironmentSteps or TrainingSteps. Instead it is {}".format(step_method.__class__))
        return should_update

    def _should_train(self, wait_for_full_episode=False) -> bool:
        """
        Determine if we should start a training phase according to the number of steps passed since the last training

        :return: boolean: True if we should start a training phase
        """

@@ -580,11 +622,12 @@ class Agent(AgentInterface):

        return should_update

    def train(self) -> float:
        """
        Check if a training phase should be done as configured by num_consecutive_playing_steps.
        If it should, then do several training steps as configured by num_consecutive_training_steps.
        A single training iteration: Sample a batch, train on it and update target networks.

        :return: The total training loss during the training iterations.
        """
        loss = 0
@@ -641,14 +684,12 @@ class Agent(AgentInterface):
        # run additional commands after the training is done
        self.post_training_commands()

        return loss

    def choose_action(self, curr_state):
        """
        Choose an action to act with in the current episode being played. Different behavior might be exhibited when
        training or testing.

        :param curr_state: the current state to act upon.
        :return: chosen action, some action value describing the action (q-value, probability, etc)
@@ -656,10 +697,16 @@ class Agent(AgentInterface):
        pass

    def prepare_batch_for_inference(self, states: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
                                    network_name: str) -> Dict[str, np.array]:
        """
        Convert curr_state into input tensors tensorflow is expecting. i.e. if we have several input states, stack all
        observations together, measurements together, etc.

        :param states: A list of environment states, where each one is a dict mapping from an observation name to its
            corresponding observation
        :param network_name: The agent network name to prepare the batch for. This is needed in order to extract only
            the observation relevant for the network from the states.
        :return: A dictionary containing a list of values from all the given states for each of the observations
        """
        # convert to batch so we can run it through the network
        states = force_list(states)
@@ -676,7 +723,8 @@ class Agent(AgentInterface):
    def act(self) -> ActionInfo:
        """
        Given the agent's current knowledge, decide on the next action to apply to the environment

        :return: An ActionInfo object, which contains the action and any additional info from the action decision process
        """
        if self.phase == RunPhase.TRAIN and self.ap.algorithm.num_consecutive_playing_steps.num_steps == 0:
            # This agent never plays while training (e.g. behavioral cloning)
@@ -705,13 +753,20 @@ class Agent(AgentInterface):

        return filtered_action_info

    def run_pre_network_filter_for_inference(self, state: StateType) -> StateType:
        """
        Run filters which were defined for being applied right before using the state for inference.

        :param state: The state to run the filters on
        :return: The filtered state
        """
        dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False)
        return self.pre_network_filter.filter(dummy_env_response)[0].next_state

    def get_state_embedding(self, state: dict) -> np.ndarray:
        """
        Given a state, get the corresponding state embedding from the main network

        :param state: a state dict
        :return: a numpy embedding vector
        """
@@ -726,6 +781,7 @@ class Agent(AgentInterface):
        """
        Allows agents to update the transition just before adding it to the replay buffer.
        Can be useful for agents that want to tweak the reward, termination signal, etc.

        :param transition: the transition to update
        :return: the updated transition
        """
@@ -736,8 +792,10 @@ class Agent(AgentInterface):
        Given a response from the environment, distill the observation from it and store it for later use.
        The response should be a dictionary containing the performed action, the new observation and measurements,
        the reward, a game over flag and any additional information necessary.

        :param env_response: result of call from environment.step(action)
        :return: a boolean value which determines if the agent has decided to terminate the episode after seeing the
            given observation
        """

        # filter the env_response
@@ -801,7 +859,12 @@ class Agent(AgentInterface):

        return transition.game_over

    def post_training_commands(self) -> None:
        """
        A function which allows adding any functionality that is required to run right after the training phase ends.

        :return: None
        """
        pass

    def get_predictions(self, states: List[Dict[str, np.ndarray]], prediction_type: PredictionType):
@@ -809,9 +872,10 @@ class Agent(AgentInterface):
        Get a prediction from the agent with regard to the requested prediction_type.
        If the agent cannot predict this type of prediction_type, or if there is more than one possible way to do so,
        raise a ValueError.

        :param states: The states to get a prediction for
        :param prediction_type: The type of prediction to get for the states. For example, the state-value prediction.
        :return: the predicted values
        """

        predictions = self.networks['main'].online_network.predict_with_prediction_type(
@@ -824,6 +888,15 @@ class Agent(AgentInterface):
        return list(predictions.values())[0]

    def set_incoming_directive(self, action: ActionType) -> None:
        """
        Allows setting a directive for the agent to follow. This is useful in hierarchy structures, where the agent
        has another master agent that is controlling it. In such cases, the master agent can define the goals for the
        slave agent, define its observation, possible actions, etc. The directive type is defined by the agent
        in-action-space.

        :param action: The action that should be set as the directive
        :return: None
        """
        if isinstance(self.in_action_space, GoalsSpace):
            self.current_hrl_goal = action
        elif isinstance(self.in_action_space, AttentionActionSpace):
@@ -834,6 +907,7 @@ class Agent(AgentInterface):
    def save_checkpoint(self, checkpoint_id: int) -> None:
        """
        Allows agents to store additional information when saving checkpoints.

        :param checkpoint_id: the id of the checkpoint
        :return: None
        """
@@ -842,6 +916,7 @@ class Agent(AgentInterface):
    def sync(self) -> None:
        """
        Sync the global network parameters to local networks

        :return: None
        """
        for network in self.networks.values():

@@ -32,7 +32,6 @@ from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayPar
class BCAlgorithmParameters(AlgorithmParameters):
    def __init__(self):
        super().__init__()
        self.collect_new_data = False


class BCNetworkParameters(NetworkParameters):

@@ -33,6 +33,19 @@ class CategoricalDQNNetworkParameters(DQNNetworkParameters):

class CategoricalDQNAlgorithmParameters(DQNAlgorithmParameters):
    """
    :param v_min: (float)
        The minimal value that will be represented in the network output for predicting the Q value.
        Corresponds to :math:`v_{min}` in the paper.

    :param v_max: (float)
        The maximum value that will be represented in the network output for predicting the Q value.
        Corresponds to :math:`v_{max}` in the paper.

    :param atoms: (int)
        The number of atoms that will be used to discretize the range between v_min and v_max.
        For the C51 algorithm described in the paper, the number of atoms is 51.
    """
    def __init__(self):
        super().__init__()
        self.v_min = -10.0

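For illustration (outside the diff), how v_min, v_max and atoms define the discrete support of the predicted return distribution, and how a predicted atom distribution maps back to a scalar Q value:

    import numpy as np

    v_min, v_max, atoms = -10.0, 10.0, 51
    support = np.linspace(v_min, v_max, atoms)       # the z_i atom locations
    probabilities = np.full(atoms, 1.0 / atoms)      # placeholder softmax output for one action
    q_value = float(np.dot(probabilities, support))  # expected return under the distribution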
@@ -26,9 +26,12 @@ from rl_coach.memories.non_episodic.balanced_experience_replay import BalancedEx

class CILAlgorithmParameters(AlgorithmParameters):
    """
    :param state_key_with_the_class_index: (str)
        The key of the state dictionary which corresponds to the value that will be used to control the class index.
    """
    def __init__(self):
        super().__init__()
        self.collect_new_data = False
        self.state_key_with_the_class_index = 'high_level_command'

@@ -58,6 +58,47 @@ class ClippedPPONetworkParameters(NetworkParameters):

class ClippedPPOAlgorithmParameters(AlgorithmParameters):
    """
    :param policy_gradient_rescaler: (PolicyGradientRescaler)
        This represents how the critic will be used to update the actor. The critic value function is typically used
        to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
        advantage of the action, or the generalized advantage estimation (GAE) value.

    :param gae_lambda: (float)
        The :math:`\lambda` value is used within the GAE function in order to weight different bootstrap length
        estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
        n-step estimations.

    :param clip_likelihood_ratio_using_epsilon: (float)
        If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
        clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
        This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
        implementations.

    :param value_targets_mix_fraction: (float)
        The targets for the value network are an exponential weighted moving average which uses this mix fraction to
        define how much of the new targets will be taken into account when calculating the loss.
        This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.

    :param estimate_state_value_using_gae: (bool)
        If set to True, the state value will be estimated using the GAE technique.

    :param use_kl_regularization: (bool)
        If set to True, the loss function will be regularized using the KL divergence between the current and new
        policy, to bound the change of the policy during the network update.

    :param beta_entropy: (float)
        An entropy regularization term can be added to the loss function in order to control exploration. This term
        is weighted using the :math:`\beta` value defined by beta_entropy.

    :param optimization_epochs: (int)
        For each training phase, the collected dataset will be used for multiple epochs, which are defined by the
        optimization_epochs value.

    :param clipping_decay_schedule: (Schedule)
        Can be used to define a schedule over the clipping of the likelihood ratio.
    """
    def __init__(self):
        super().__init__()
        self.num_episodes_in_experience_replay = 1000000
@@ -66,7 +107,6 @@ class ClippedPPOAlgorithmParameters(AlgorithmParameters):
        self.use_kl_regularization = False
        self.clip_likelihood_ratio_using_epsilon = 0.2
        self.estimate_state_value_using_gae = True
        self.step_until_collecting_full_episodes = True
        self.beta_entropy = 0.01  # should be 0 for mujoco
        self.num_consecutive_playing_steps = EnvironmentSteps(2048)
        self.optimization_epochs = 10

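For illustration (outside the diff), a NumPy sketch of the clipped surrogate objective that clip_likelihood_ratio_using_epsilon controls; names are hypothetical:

    import numpy as np

    def clipped_surrogate_loss(log_prob_new, log_prob_old, advantages, epsilon=0.2):
        # likelihood ratio between the new and the old policy
        ratio = np.exp(log_prob_new - log_prob_old)
        clipped_ratio = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon)
        # take the pessimistic (minimum) objective and negate it to get a loss
        return -np.mean(np.minimum(ratio * advantages, clipped_ratio * advantages))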
@@ -65,6 +65,33 @@ class DDPGActorNetworkParameters(NetworkParameters):

class DDPGAlgorithmParameters(AlgorithmParameters):
    """
    :param num_steps_between_copying_online_weights_to_target: (StepMethod)
        The number of steps between copying the online network weights to the target network weights.

    :param rate_for_copying_weights_to_target: (float)
        When copying the online network weights to the target network weights, a soft update will be used, which
        weights the new online network weights by rate_for_copying_weights_to_target.

    :param num_consecutive_playing_steps: (StepMethod)
        The number of consecutive steps to act between every two training iterations

    :param use_target_network_for_evaluation: (bool)
        If set to True, the target network will be used for predicting the actions when choosing actions to act.
        Since the target network weights change more slowly, the predicted actions will be more consistent.

    :param action_penalty: (float)
        The amount by which to penalize the network on high action feature (pre-activation) values.
        This can prevent the action features from saturating the TanH activation function, and therefore prevent the
        gradients from becoming very low.

    :param clip_critic_targets: (Tuple[float, float] or None)
        The range to clip the critic target to in order to prevent overestimation of the action values.

    :param use_non_zero_discount_for_terminal_states: (bool)
        If set to True, the discount factor will be used for terminal states to bootstrap the next predicted state
        values. If set to False, the terminal states reward will be taken as the target return for the network.
    """
    def __init__(self):
        super().__init__()
        self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)

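For illustration (outside the diff), the soft (Polyak) target update that rate_for_copying_weights_to_target controls, sketched with NumPy arrays standing in for network weights:

    import numpy as np

    def soft_update(online_weights, target_weights, rate):
        # mix a small fraction of the online weights into the target weights
        return [rate * w_online + (1.0 - rate) * w_target
                for w_online, w_target in zip(online_weights, target_weights)]

    target = soft_update([np.ones(4)], [np.zeros(4)], rate=0.001)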
@@ -81,6 +81,35 @@ class DFPMemoryParameters(EpisodicExperienceReplayParameters):

class DFPAlgorithmParameters(AlgorithmParameters):
    """
    :param num_predicted_steps_ahead: (int)
        Number of future steps to predict measurements for. The future steps won't be sequential, but rather jump
        in multiples of 2. For example, if num_predicted_steps_ahead = 3, then the steps will be: t+1, t+2, t+4

    :param goal_vector: (List[float])
        The goal vector will weight each of the measurements to form an optimization goal. The vector should have
        the same length as the number of measurements, and it will be vector multiplied by the measurements.
        Positive values correspond to trying to maximize the particular measurement, and negative values
        correspond to trying to minimize the particular measurement.

    :param future_measurements_weights: (List[float])
        The future_measurements_weights weight the contribution of each of the predicted timesteps to the optimization
        goal. For example, if there are 6 steps predicted ahead, and a future_measurements_weights vector with 3 values,
        then only the 3 last timesteps will be taken into account, according to the weights in the
        future_measurements_weights vector.

    :param use_accumulated_reward_as_measurement: (bool)
        If set to True, the accumulated reward from the beginning of the episode will be added as a measurement to
        the measurements vector in the state. This can be useful in environments where the given measurements don't
        include enough information for the particular goal the agent should achieve.

    :param handling_targets_after_episode_end: (HandlingTargetsAfterEpisodeEnd)
        Dictates how to handle measurements that are outside the episode length.

    :param scale_measurements_targets: (Dict[str, float])
        Allows rescaling the values of each of the measurements available. This can be useful when the measurements
        have a different scale and you want to normalize them to the same scale.
    """
    def __init__(self):
        super().__init__()
        self.num_predicted_steps_ahead = 6

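For illustration (outside the diff), how a goal vector and future_measurements_weights combine predicted future measurements into a single objective; the array shapes are hypothetical:

    import numpy as np

    predicted = np.random.rand(6, 3)             # 6 predicted future timesteps x 3 measurements
    goal_vector = np.array([1.0, -0.5, 0.0])     # maximize m0, minimize m1, ignore m2
    future_weights = np.array([0.5, 0.5, 1.0])   # only the last 3 predicted steps contribute

    # weight each measurement by the goal, then each timestep by its future weight
    objective = float(np.sum(future_weights * (predicted[-len(future_weights):] @ goal_vector)))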
@@ -24,6 +24,13 @@ from rl_coach.spaces import SpacesDefinition

class HACDDPGAlgorithmParameters(DDPGAlgorithmParameters):
    """
    :param time_limit: (int)
        The number of steps the agent is allowed to act for while trying to achieve its goal

    :param sub_goal_testing_rate: (float)
        The percent of episodes that will be used for testing the sub goals generated by the upper level agents.
    """
    def __init__(self):
        super().__init__()
        self.time_limit = 40
@@ -91,7 +98,7 @@ class HACDDPGAgent(DDPGAgent):
        sub_goal_is_missed = not sub_goal_reached

        if sub_goal_is_missed:
            transition.reward = -self.ap.algorithm.time_limit
        return transition

    def set_environment_parameters(self, spaces: SpacesDefinition):

@@ -24,6 +24,11 @@ from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperi

class MixedMonteCarloAlgorithmParameters(DQNAlgorithmParameters):
    """
    :param monte_carlo_mixing_rate: (float)
        The mixing rate is used for setting the amount of the monte carlo estimate (full return) that will be mixed
        into the single-step bootstrapped targets.
    """
    def __init__(self):
        super().__init__()
        self.monte_carlo_mixing_rate = 0.1

@@ -44,6 +44,26 @@ class NStepQNetworkParameters(NetworkParameters):

class NStepQAlgorithmParameters(AlgorithmParameters):
    """
    :param num_steps_between_copying_online_weights_to_target: (StepMethod)
        The number of steps between copying the online network weights to the target network weights.

    :param apply_gradients_every_x_episodes: (int)
        The number of episodes between applying the accumulated gradients to the network. After every
        num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data,
        it will then accumulate it in internal accumulators, and will only apply them to the network once in every
        apply_gradients_every_x_episodes episodes.

    :param num_steps_between_gradient_updates: (int)
        The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
        called t_max. Since this algorithm is on-policy, only the steps collected between each two gradient calculations
        are used in the batch.

    :param targets_horizon: (str)
        Should be either 'N-Step' or '1-Step', and defines the length for which to bootstrap the network values over.
        Essentially, 1-Step follows the regular 1 step bootstrapping Q learning update. For more information,
        please refer to the original paper (https://arxiv.org/abs/1602.01783)
    """
    def __init__(self):
        super().__init__()
        self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)

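For illustration (outside the diff), how an N-step bootstrapped Q target is formed from the collected rewards; names are hypothetical:

    def n_step_q_target(rewards, bootstrap_value, discount=0.99):
        # rewards are the N rewards following the state; bootstrap_value is max_a Q_target(s_{t+N}, a)
        target = bootstrap_value
        for reward in reversed(rewards):
            target = reward + discount * target
        return target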
@@ -43,6 +43,39 @@ class NECNetworkParameters(NetworkParameters):

class NECAlgorithmParameters(AlgorithmParameters):
    """
    :param dnd_size: (int)
        Defines the number of transitions that will be stored in each one of the DNDs. Note that the total number
        of transitions that will be stored is dnd_size x num_actions.

    :param l2_norm_added_delta: (float)
        A small value that will be added when calculating the weight of each of the DND entries. This follows the
        :math:`\delta` parameter defined in the paper.

    :param new_value_shift_coefficient: (float)
        In the case where a new embedding that is added to the DND is already present, the value that will be stored
        in the DND is a mix between the existing value and the new value. The mix rate is defined by
        new_value_shift_coefficient.

    :param number_of_knn: (int)
        The number of neighbors that will be retrieved for each DND query.

    :param DND_key_error_threshold: (float)
        When the DND is queried for a specific embedding, this threshold will be used to determine if the embedding
        exists in the DND, since exact matches of embeddings are very rare.

    :param propagate_updates_to_DND: (bool)
        If set to True, when the gradients of the network are calculated, the gradients will also be
        backpropagated through the keys of the DND. The keys will then be updated as well, as if they were regular
        network weights.

    :param n_step: (int)
        The bootstrap length that will be used when calculating the state values to store in the DND.

    :param bootstrap_total_return_from_old_policy: (bool)
        If set to True, the bootstrap that will be used to calculate each state-action value is the network value
        when the state was first seen, and not the latest, most up-to-date network value.
    """
    def __init__(self):
        super().__init__()
        self.dnd_size = 500000

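For illustration (outside the diff), a NumPy sketch of a DND lookup in which the retrieved neighbors are weighted with the inverse-distance kernel that l2_norm_added_delta regularizes; names are hypothetical:

    import numpy as np

    def dnd_lookup(query_key, dnd_keys, dnd_values, number_of_knn=50, l2_norm_added_delta=0.001):
        # squared L2 distance between the query embedding and every stored key
        distances = np.sum((dnd_keys - query_key) ** 2, axis=1)
        nearest = np.argsort(distances)[:number_of_knn]
        # inverse-distance kernel weights, normalized over the retrieved neighbors
        weights = 1.0 / (distances[nearest] + l2_norm_added_delta)
        weights /= weights.sum()
        return float(np.dot(weights, dnd_values[nearest]))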
@@ -24,6 +24,19 @@ from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperi

class PALAlgorithmParameters(DQNAlgorithmParameters):
    """
    :param pal_alpha: (float)
        A factor that weights the amount by which the advantage learning update will be taken into account.

    :param persistent_advantage_learning: (bool)
        If set to True, the persistent mode of advantage learning will be used, which encourages the agent to take
        the same actions one after the other instead of changing actions.

    :param monte_carlo_mixing_rate: (float)
        The amount of monte carlo values to mix into the targets of the network. The monte carlo values are just the
        total discounted returns, and they can help reduce the time it takes for the network to update to the newly
        seen values, since it is not based on bootstrapping the current network values.
    """
    def __init__(self):
        super().__init__()
        self.pal_alpha = 0.9

@@ -42,6 +42,27 @@ class PolicyGradientNetworkParameters(NetworkParameters):

class PolicyGradientAlgorithmParameters(AlgorithmParameters):
    """
    :param policy_gradient_rescaler: (PolicyGradientRescaler)
        The rescaler type to use for the policy gradient loss. For policy gradients, we calculate the log probability
        of the action and then multiply it by the policy gradient rescaler. The most basic rescaler is the discounted
        return, but there are other rescalers that are intended for reducing the variance of the updates.

    :param apply_gradients_every_x_episodes: (int)
        The number of episodes between applying the accumulated gradients to the network. After every
        num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data,
        it will then accumulate it in internal accumulators, and will only apply them to the network once in every
        apply_gradients_every_x_episodes episodes.

    :param beta_entropy: (float)
        A factor which defines the amount of entropy regularization to apply to the network. The entropy of the actions
        will be added to the loss and scaled by the given beta factor.

    :param num_steps_between_gradient_updates: (int)
        The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
        called t_max. Since this algorithm is on-policy, only the steps collected between each two gradient calculations
        are used in the batch.
    """
    def __init__(self):
        super().__init__()
        self.policy_gradient_rescaler = PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP

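For illustration (outside the diff), the most basic rescaler, the discounted future return of each step, sketched with NumPy; names are hypothetical:

    import numpy as np

    def discounted_future_returns(rewards, discount=0.99):
        returns = np.zeros(len(rewards))
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + discount * running
            returns[t] = running
        return returns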
@@ -63,6 +63,51 @@ class PPOActorNetworkParameters(NetworkParameters):

class PPOAlgorithmParameters(AlgorithmParameters):
    """
    :param policy_gradient_rescaler: (PolicyGradientRescaler)
        This represents how the critic will be used to update the actor. The critic value function is typically used
        to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
        advantage of the action, or the generalized advantage estimation (GAE) value.

    :param gae_lambda: (float)
        The :math:`\lambda` value is used within the GAE function in order to weight different bootstrap length
        estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
        n-step estimations.

    :param target_kl_divergence: (float)
        The target KL divergence between the current policy distribution and the new policy. PPO uses a heuristic to
        bring the KL divergence to this value, by adding a penalty if the KL divergence is higher.

    :param initial_kl_coefficient: (float)
        The initial weight that will be given to the KL divergence between the current and the new policy in the
        regularization factor.

    :param high_kl_penalty_coefficient: (float)
        The penalty that will be given for KL divergence values which are higher than what was defined as the target.

    :param clip_likelihood_ratio_using_epsilon: (float)
        If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
        clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
        This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
        implementations.

    :param value_targets_mix_fraction: (float)
        The targets for the value network are an exponential weighted moving average which uses this mix fraction to
        define how much of the new targets will be taken into account when calculating the loss.
        This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.

    :param estimate_state_value_using_gae: (bool)
        If set to True, the state value will be estimated using the GAE technique.

    :param use_kl_regularization: (bool)
        If set to True, the loss function will be regularized using the KL divergence between the current and new
        policy, to bound the change of the policy during the network update.

    :param beta_entropy: (float)
        An entropy regularization term can be added to the loss function in order to control exploration. This term
        is weighted using the :math:`\beta` value defined by beta_entropy.
    """
    def __init__(self):
        super().__init__()
        self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
@@ -73,7 +118,6 @@ class PPOAlgorithmParameters(AlgorithmParameters):
        self.clip_likelihood_ratio_using_epsilon = None
        self.value_targets_mix_fraction = 0.1
        self.estimate_state_value_using_gae = True
        self.step_until_collecting_full_episodes = True
        self.use_kl_regularization = True
        self.beta_entropy = 0.01
        self.num_consecutive_playing_steps = EnvironmentSteps(5000)

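For illustration (outside the diff), one common heuristic for adapting the KL penalty coefficient toward target_kl_divergence, as described in the PPO paper; Coach's exact update rule may differ:

    def adapt_kl_coefficient(measured_kl, kl_coefficient, target_kl_divergence):
        if measured_kl > 1.5 * target_kl_divergence:
            kl_coefficient *= 2.0   # the policy moved too far, penalize harder
        elif measured_kl < target_kl_divergence / 1.5:
            kl_coefficient *= 0.5   # the policy barely moved, relax the penalty
        return kl_coefficient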
@@ -34,6 +34,14 @@ class QuantileRegressionDQNNetworkParameters(DQNNetworkParameters):

class QuantileRegressionDQNAlgorithmParameters(DQNAlgorithmParameters):
    """
    :param atoms: (int)
        the number of atoms to predict for each action

    :param huber_loss_interval: (float)
        One of the huber loss parameters, and is referred to as :math:`\kappa` in the paper.
        It describes the interval [-k, k] in which the huber loss acts as an MSE loss.
    """
    def __init__(self):
        super().__init__()
        self.atoms = 200

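For illustration (outside the diff), a simplified NumPy sketch of the quantile Huber loss in which huber_loss_interval plays the role of :math:`\kappa`; names are hypothetical:

    import numpy as np

    def quantile_huber_loss(td_errors, tau, kappa=1.0):
        abs_err = np.abs(td_errors)
        # quadratic inside [-kappa, kappa], linear outside of it
        huber = np.where(abs_err <= kappa,
                         0.5 * td_errors ** 2,
                         kappa * (abs_err - 0.5 * kappa))
        # asymmetric weighting by the quantile fraction tau
        return float(np.mean(np.abs(tau - (td_errors < 0)) * huber))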
@@ -37,6 +37,17 @@ class RainbowDQNNetworkParameters(DQNNetworkParameters):

class RainbowDQNAlgorithmParameters(CategoricalDQNAlgorithmParameters):
    """
    :param n_step: (int)
        The number of steps to bootstrap the network over. The first N-1 steps' actual rewards will be accumulated
        using an exponentially growing discount factor, and the Nth step will be bootstrapped from the network
        prediction.

    :param store_transitions_only_when_episodes_are_terminated: (bool)
        If set to True, the transitions will be stored in an Episode object until the episode ends, and only then
        written to the memory. This is useful since we want to calculate the N-step discounted rewards before saving the
        transitions into the memory, and to do so we need the entire episode first.
    """
    def __init__(self):
        super().__init__()
        self.n_step = 3
