mirror of https://github.com/gryf/coach.git (synced 2026-02-16 05:55:46 +01:00)
update of api docstrings across coach and tutorials [WIP] (#91)
* updating the documentation website
* adding the built docs
* update of api docstrings across coach and tutorials 0-2
* added some missing api documentation
* New Sphinx based documentation
@@ -36,25 +36,25 @@ from rl_coach.utils import last_sample
class ActorCriticAlgorithmParameters(AlgorithmParameters):
"""
:param policy_gradient_rescaler: (PolicyGradientRescaler)
The value that will be used to rescale the policy gradient

:param apply_gradients_every_x_episodes: (int)
The number of episodes to wait before applying the accumulated gradients to the network.
The training iterations only accumulate gradients without actually applying them.

:param beta_entropy: (float)
The weight that will be given to the entropy regularization, which is used in order to improve exploration.

:param num_steps_between_gradient_updates: (int)
Every num_steps_between_gradient_updates transitions will be considered as a single batch and used for
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.

:param gae_lambda: (float)
If the policy gradient rescaler was defined as PolicyGradientRescaler.GAE, the generalized advantage estimation
scheme will be used, in which case the lambda value controls the decay for the different n-step lengths.

:param estimate_state_value_using_gae: (bool)
If set to True, the state value targets for the V head will be estimated using the GAE scheme.
"""
def __init__(self):
super().__init__()
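
As a quick illustration of how gae_lambda weights the different n-step estimates, here is a minimal NumPy sketch of generalized advantage estimation (illustrative only, not part of this commit; the function and variable names are made up):

import numpy as np

def gae_advantages(rewards, values, bootstrap_value, discount=0.99, gae_lambda=0.95):
    rewards = np.asarray(rewards, dtype=float)
    values = np.append(np.asarray(values, dtype=float), bootstrap_value)
    deltas = rewards + discount * values[1:] - values[:-1]      # one-step TD errors
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + discount * gae_lambda * running   # lambda-weighted sum of n-step errors
        advantages[t] = running
    return advantages

With gae_lambda = 1 this reduces to full Monte Carlo advantages, while gae_lambda = 0 keeps only the one-step TD error.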

@@ -39,7 +39,7 @@ from rl_coach.memories.backend.memory_impl import get_memory_backend
class Agent(AgentInterface):
def __init__(self, agent_parameters: AgentParameters, parent: Union['LevelManager', 'CompositeAgent']=None):
"""
-:param agent_parameters: A Preset class instance with all the running paramaters
+:param agent_parameters: An AgentParameters class instance with all the agent parameters
"""
super().__init__()
self.ap = agent_parameters
@@ -175,18 +175,20 @@ class Agent(AgentInterface):
np.random.seed()

@property
-def parent(self):
+def parent(self) -> 'LevelManager':
"""
Get the parent class of the agent

:return: the current parent
"""
return self._parent

@parent.setter
-def parent(self, val):
+def parent(self, val) -> None:
"""
Change the parent class of the agent.
Additionally, updates the full name of the agent

:param val: the new parent
:return: None
"""
@@ -196,7 +198,12 @@
raise ValueError("The parent of an agent must have a name")
self.full_name_id = self.ap.full_name_id = "{}/{}".format(self._parent.name, self.name)

-def setup_logger(self):
+def setup_logger(self) -> None:
"""
Setup the logger for the agent

:return: None
"""
# dump documentation
logger_prefix = "{graph_name}.{level_name}.{agent_full_id}".\
format(graph_name=self.parent_level_manager.parent_graph_manager.name,
@@ -212,6 +219,7 @@
def set_session(self, sess) -> None:
"""
Set the deep learning framework session for all the agents in the composite agent

:return: None
"""
self.input_filter.set_session(sess)
@@ -223,6 +231,7 @@
dump_one_value_per_step: bool=False) -> Signal:
"""
Register a signal such that its statistics will be dumped and be viewable through dashboard

:param signal_name: the name of the signal as it will appear in dashboard
:param dump_one_value_per_episode: should the signal value be written for each episode?
:param dump_one_value_per_step: should the signal value be written for each step?
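
A hypothetical usage sketch for this API (the signal name and values are made up; it assumes the returned Signal object exposes add_sample, as used elsewhere in Coach):

# inside a custom agent's __init__:
self.q_values_signal = self.register_signal('Q Values',
                                            dump_one_value_per_episode=True,
                                            dump_one_value_per_step=False)

# later, e.g. while acting or training:
self.q_values_signal.add_sample(0.42)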

@@ -239,6 +248,7 @@
"""
Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
dependent on those values, by calling init_environment_dependent_modules

:param spaces: the environment spaces definition
:return: None
"""
@@ -274,6 +284,7 @@
Create all the networks of the agent.
The network creation will be done after setting the environment parameters for the agent, since they are needed
for creating the network.

:return: A list containing all the networks
"""
networks = {}
@@ -295,6 +306,7 @@
"""
Initialize any modules that depend on knowing information about the environment such as the action space or
the observation space

:return: None
"""
# initialize exploration policy
@@ -314,13 +326,19 @@

@property
def phase(self) -> RunPhase:
"""
The current running phase of the agent

:return: RunPhase
"""
return self._phase

@phase.setter
def phase(self, val: RunPhase) -> None:
"""
Change the phase of the run for the agent and all the sub components
-:param phase: the new run phase (TRAIN, TEST, etc.)

+:param val: the new run phase (TRAIN, TEST, etc.)
:return: None
"""
self.reset_evaluation_state(val)
@@ -328,6 +346,14 @@
self.exploration_policy.change_phase(val)

def reset_evaluation_state(self, val: RunPhase) -> None:
"""
Perform accumulators initialization when entering an evaluation phase, and signal dumping when exiting an
evaluation phase. Entering or exiting the evaluation phase is determined according to the new phase given
by val, and by the current phase set in self.phase.

:param val: The new phase to change to
:return: None
"""
starting_evaluation = (val == RunPhase.TEST)
ending_evaluation = (self.phase == RunPhase.TEST)

@@ -363,6 +389,7 @@
This function is a wrapper to allow having the same calls for shared or unshared memories.
It should be used instead of calling the memory directly in order to allow different algorithms to work
both with a shared and a local memory.

:param func: the name of the memory function to call
:param args: the arguments to supply to the function
:return: the return value of the function
@@ -375,7 +402,12 @@
result = getattr(self.memory, func)(*args)
return result

-def log_to_screen(self):
+def log_to_screen(self) -> None:
"""
Write an episode summary line to the terminal

:return: None
"""
# log to screen
log = OrderedDict()
log["Name"] = self.full_name_id
@@ -388,9 +420,10 @@
log["Training iteration"] = self.training_iteration
screen.log_dict(log, prefix=self.phase.value)

-def update_step_in_episode_log(self):
+def update_step_in_episode_log(self) -> None:
"""
-Writes logging messages to screen and updates the log file with all the signal values.
+Updates the in-episode log file with all the signal values from the most recent step.

:return: None
"""
# log all the signals to file
@@ -411,9 +444,12 @@
# dump
self.agent_episode_logger.dump_output_csv()

-def update_log(self):
+def update_log(self) -> None:
"""
-Writes logging messages to screen and updates the log file with all the signal values.
+Updates the episodic log file with all the signal values from the most recent episode.
+Additional signals for logging can be set by creating a new signal using self.register_signal,
+and then updating it with some internal agent values.

:return: None
"""
# log all the signals to file
@@ -438,7 +474,6 @@
self.agent_logger.create_signal_value('Shaped Evaluation Reward', np.nan, overwrite=False)
self.agent_logger.create_signal_value('Success Rate', np.nan, overwrite=False)

for signal in self.episode_signals:
self.agent_logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
self.agent_logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
@@ -452,7 +487,10 @@

def handle_episode_ended(self) -> None:
"""
-End an episode
+Make any changes needed when each episode is ended.
+This includes incrementing counters, updating full episode dependent values, updating logs, etc.
+This function is called right after each episode is ended.

:return: None
"""
self.current_episode_buffer.is_complete = True
@@ -486,9 +524,10 @@
if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
self.log_to_screen()

-def reset_internal_state(self):
+def reset_internal_state(self) -> None:
"""
-Reset all the episodic parameters
+Reset all the episodic parameters. This function is called right before each episode starts.

:return: None
"""
for signal in self.episode_signals:
@@ -516,6 +555,7 @@
def learn_from_batch(self, batch) -> Tuple[float, List, List]:
"""
Given a batch of transitions, calculates their target values and updates the network.

:param batch: A list of transitions
:return: The total loss of the training, the loss per head and the unclipped gradients
"""
@@ -524,6 +564,7 @@
def _should_update_online_weights_to_target(self):
"""
Determine if online weights should be copied to the target.

:return: boolean: True if the online weights should be copied to the target.
"""

@@ -542,9 +583,10 @@
"EnvironmentSteps or TrainingSteps. Instead it is {}".format(step_method.__class__))
return should_update

-def _should_train(self, wait_for_full_episode=False):
+def _should_train(self, wait_for_full_episode=False) -> bool:
"""
Determine if we should start a training phase according to the number of steps passed since the last training

:return: boolean: True if we should start a training phase
"""

@@ -580,11 +622,12 @@

return should_update

-def train(self):
+def train(self) -> float:
"""
Check if a training phase should be done as configured by num_consecutive_playing_steps.
If it should, then do several training steps as configured by num_consecutive_training_steps.
A single training iteration: Sample a batch, train on it and update target networks.

:return: The total training loss during the training iterations.
"""
loss = 0
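
To make the control flow described in this docstring concrete, a schematic sketch (illustrative only; the helper names are placeholders, not Coach's actual internals):

def train_schematic(should_train, sample_batch, learn_from_batch,
                    should_update_target, update_target, num_consecutive_training_steps):
    loss = 0.0
    if should_train():
        for _ in range(num_consecutive_training_steps):
            batch = sample_batch()                       # sample a batch from the memory
            batch_loss, _, _ = learn_from_batch(batch)   # (total loss, loss per head, unclipped gradients)
            loss += batch_loss
            if should_update_target():
                update_target()                          # copy the online weights to the target network
    return loss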

@@ -641,14 +684,12 @@
# run additional commands after the training is done
self.post_training_commands()

return loss

def choose_action(self, curr_state):
"""
-choose an action to act with in the current episode being played. Different behavior might be exhibited when training
-or testing.
+Choose an action to act with in the current episode being played. Different behavior might be exhibited when
+training or testing.

:param curr_state: the current state to act upon.
:return: chosen action, some action value describing the action (q-value, probability, etc.)
@@ -656,10 +697,16 @@
pass

def prepare_batch_for_inference(self, states: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
-network_name: str):
+network_name: str) -> Dict[str, np.array]:
"""
-convert curr_state into input tensors tensorflow is expecting. i.e. if we have several inputs states, stack all
+Convert curr_state into input tensors tensorflow is expecting, i.e. if we have several input states, stack all
observations together, measurements together, etc.

:param states: A list of environment states, where each one is a dict mapping from an observation name to its
corresponding observation
:param network_name: The agent network name to prepare the batch for. This is needed in order to extract only
the observation relevant for the network from the states.
:return: A dictionary containing a list of values from all the given states for each of the observations
"""
# convert to batch so we can run it through the network
states = force_list(states)
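
The stacking described above can be pictured with a small NumPy sketch (illustrative only; real Coach states may also need observation filtering and dtype handling):

import numpy as np

def stack_states(states, allowed_inputs):
    # states: list of dicts mapping observation name -> np.ndarray
    # allowed_inputs: the observation names that the chosen network actually consumes
    return {name: np.array([state[name] for state in states]) for name in allowed_inputs}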

@@ -676,7 +723,8 @@
def act(self) -> ActionInfo:
"""
Given the agent's current knowledge, decide on the next action to apply to the environment
-:return: an action and a dictionary containing any additional info from the action decision process

+:return: An ActionInfo object, which contains the action and any additional info from the action decision process
"""
if self.phase == RunPhase.TRAIN and self.ap.algorithm.num_consecutive_playing_steps.num_steps == 0:
# This agent never plays while training (e.g. behavioral cloning)
@@ -705,13 +753,20 @@

return filtered_action_info

-def run_pre_network_filter_for_inference(self, state: StateType):
+def run_pre_network_filter_for_inference(self, state: StateType) -> StateType:
"""
Run filters which were defined for being applied right before using the state for inference.

:param state: The state to run the filters on
:return: The filtered state
"""
dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False)
return self.pre_network_filter.filter(dummy_env_response)[0].next_state

def get_state_embedding(self, state: dict) -> np.ndarray:
"""
Given a state, get the corresponding state embedding from the main network

:param state: a state dict
:return: a numpy embedding vector
"""
@@ -726,6 +781,7 @@
"""
Allows agents to update the transition just before adding it to the replay buffer.
Can be useful for agents that want to tweak the reward, termination signal, etc.

:param transition: the transition to update
:return: the updated transition
"""
@@ -736,8 +792,10 @@
Given a response from the environment, distill the observation from it and store it for later use.
The response should be a dictionary containing the performed action, the new observation and measurements,
the reward, a game over flag and any additional information necessary.

:param env_response: result of call from environment.step(action)
-:return:
+:return: a boolean value which determines if the agent has decided to terminate the episode after seeing the
+given observation
"""

# filter the env_response
@@ -801,7 +859,12 @@

return transition.game_over

-def post_training_commands(self):
+def post_training_commands(self) -> None:
"""
A function which allows adding any functionality that is required to run right after the training phase ends.

:return: None
"""
pass

def get_predictions(self, states: List[Dict[str, np.ndarray]], prediction_type: PredictionType):
@@ -809,9 +872,10 @@
Get a prediction from the agent with regard to the requested prediction_type.
If the agent cannot predict this type of prediction_type, or if there is more than one possible way to do so,
raise a ValueException.
-:param states:
-:param prediction_type:
-:return:

+:param states: The states to get a prediction for
+:param prediction_type: The type of prediction to get for the states. For example, the state-value prediction.
+:return: the predicted values
"""

predictions = self.networks['main'].online_network.predict_with_prediction_type(
@@ -824,6 +888,15 @@
return list(predictions.values())[0]

def set_incoming_directive(self, action: ActionType) -> None:
"""
Allows setting a directive for the agent to follow. This is useful in hierarchy structures, where the agent
has another master agent that is controlling it. In such cases, the master agent can define the goals for the
slave agent, define its observation, possible actions, etc. The directive type is defined by the agent
in-action-space.

:param action: The action that should be set as the directive
:return:
"""
if isinstance(self.in_action_space, GoalsSpace):
self.current_hrl_goal = action
elif isinstance(self.in_action_space, AttentionActionSpace):
@@ -834,6 +907,7 @@
def save_checkpoint(self, checkpoint_id: int) -> None:
"""
Allows agents to store additional information when saving checkpoints.

:param checkpoint_id: the id of the checkpoint
:return: None
"""
@@ -842,6 +916,7 @@
def sync(self) -> None:
"""
Sync the global network parameters to local networks

:return: None
"""
for network in self.networks.values():

@@ -32,7 +32,6 @@ from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayPar
class BCAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.collect_new_data = False


class BCNetworkParameters(NetworkParameters):

@@ -33,6 +33,19 @@ class CategoricalDQNNetworkParameters(DQNNetworkParameters):


class CategoricalDQNAlgorithmParameters(DQNAlgorithmParameters):
"""
:param v_min: (float)
The minimal value that will be represented in the network output for predicting the Q value.
Corresponds to :math:`v_{min}` in the paper.

:param v_max: (float)
The maximum value that will be represented in the network output for predicting the Q value.
Corresponds to :math:`v_{max}` in the paper.

:param atoms: (int)
The number of atoms that will be used to discretize the range between v_min and v_max.
For the C51 algorithm described in the paper, the number of atoms is 51.
"""
def __init__(self):
super().__init__()
self.v_min = -10.0
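
For orientation, the discretized support implied by these parameters can be computed as follows (illustrative only; v_max = 10.0 is an assumed symmetric default, since only v_min = -10.0 is visible in this hunk):

import numpy as np

v_min, v_max, atoms = -10.0, 10.0, 51        # 51 atoms corresponds to C51
support = np.linspace(v_min, v_max, atoms)   # fixed return values the head predicts probabilities over
delta_z = (v_max - v_min) / (atoms - 1)      # spacing between adjacent atoms
# Q(s, a) is then the expectation of the predicted distribution: np.dot(probabilities[a], support)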

@@ -26,9 +26,12 @@ from rl_coach.memories.non_episodic.balanced_experience_replay import BalancedEx


class CILAlgorithmParameters(AlgorithmParameters):
"""
:param state_key_with_the_class_index: (str)
The key of the state dictionary which corresponds to the value that will be used to control the class index.
"""
def __init__(self):
super().__init__()
self.collect_new_data = False
self.state_key_with_the_class_index = 'high_level_command'



@@ -58,6 +58,47 @@ class ClippedPPONetworkParameters(NetworkParameters):


class ClippedPPOAlgorithmParameters(AlgorithmParameters):
"""
:param policy_gradient_rescaler: (PolicyGradientRescaler)
This represents how the critic will be used to update the actor. The critic value function is typically used
to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
advantage of the action, or the generalized advantage estimation (GAE) value.

:param gae_lambda: (float)
The :math:`\lambda` value is used within the GAE function in order to weight different bootstrap length
estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
n-step estimations.

:param clip_likelihood_ratio_using_epsilon: (float)
If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
implementations.

:param value_targets_mix_fraction: (float)
The targets for the value network are an exponential weighted moving average which uses this mix fraction to
define how much of the new targets will be taken into account when calculating the loss.
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.

:param estimate_state_value_using_gae: (bool)
If set to True, the state value will be estimated using the GAE technique.

:param use_kl_regularization: (bool)
If set to True, the loss function will be regularized using the KL divergence between the current and new
policy, to bound the change of the policy during the network update.

:param beta_entropy: (float)
An entropy regularization term can be added to the loss function in order to control exploration. This term
is weighted using the :math:`\beta` value defined by beta_entropy.

:param optimization_epochs: (int)
For each training phase, the collected dataset will be used for multiple epochs, which are defined by the
optimization_epochs value.

:param optimization_epochs: (Schedule)
Can be used to define a schedule over the clipping of the likelihood ratio.

"""
def __init__(self):
super().__init__()
self.num_episodes_in_experience_replay = 1000000
@@ -66,7 +107,6 @@
self.use_kl_regularization = False
self.clip_likelihood_ratio_using_epsilon = 0.2
self.estimate_state_value_using_gae = True
self.step_until_collecting_full_episodes = True
self.beta_entropy = 0.01 # should be 0 for mujoco
self.num_consecutive_playing_steps = EnvironmentSteps(2048)
self.optimization_epochs = 10
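
The role of clip_likelihood_ratio_using_epsilon can be illustrated with a minimal NumPy version of the clipped surrogate objective (illustrative only, not Coach's implementation):

import numpy as np

def clipped_surrogate_loss(ratio, advantages, epsilon=0.2):
    # ratio = pi_new(a|s) / pi_old(a|s); epsilon plays the role of clip_likelihood_ratio_using_epsilon
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
    return -np.minimum(unclipped, clipped).mean()   # negated, since the optimizer minimizes the loss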

@@ -65,6 +65,33 @@ class DDPGActorNetworkParameters(NetworkParameters):


class DDPGAlgorithmParameters(AlgorithmParameters):
"""
:param num_steps_between_copying_online_weights_to_target: (StepMethod)
The number of steps between copying the online network weights to the target network weights.

:param rate_for_copying_weights_to_target: (float)
When copying the online network weights to the target network weights, a soft update will be used, which
weights the new online network weights by rate_for_copying_weights_to_target

:param num_consecutive_playing_steps: (StepMethod)
The number of consecutive steps to act between every two training iterations

:param use_target_network_for_evaluation: (bool)
If set to True, the target network will be used for predicting the actions when choosing actions to act.
Since the target network weights change more slowly, the predicted actions will be more consistent.

:param action_penalty: (float)
The amount by which to penalize the network on high action feature (pre-activation) values.
This can prevent the action features from saturating the TanH activation function, and therefore prevent the
gradients from becoming very low.

:param clip_critic_targets: (Tuple[float, float] or None)
The range to clip the critic target to in order to prevent overestimation of the action values.

:param use_non_zero_discount_for_terminal_states: (bool)
If set to True, the discount factor will be used for terminal states to bootstrap the next predicted state
values. If set to False, the terminal states reward will be taken as the target return for the network.
"""
def __init__(self):
super().__init__()
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)

@@ -81,6 +81,35 @@ class DFPMemoryParameters(EpisodicExperienceReplayParameters):


class DFPAlgorithmParameters(AlgorithmParameters):
"""
:param num_predicted_steps_ahead: (int)
Number of future steps to predict measurements for. The future steps won't be sequential, but rather jump
in multiples of 2. For example, if num_predicted_steps_ahead = 3, then the steps will be: t+1, t+2, t+4

:param goal_vector: (List[float])
The goal vector will weight each of the measurements to form an optimization goal. The vector should have
the same length as the number of measurements, and it will be vector multiplied by the measurements.
Positive values correspond to trying to maximize the particular measurement, and negative values
correspond to trying to minimize the particular measurement.

:param future_measurements_weights: (List[float])
The future_measurements_weights weight the contribution of each of the predicted timesteps to the optimization
goal. For example, if there are 6 steps predicted ahead, and a future_measurements_weights vector with 3 values,
then only the 3 last timesteps will be taken into account, according to the weights in the
future_measurements_weights vector.

:param use_accumulated_reward_as_measurement: (bool)
If set to True, the accumulated reward from the beginning of the episode will be added as a measurement to
the measurements vector in the state. This can be useful in environments where the given measurements don't
include enough information for the particular goal the agent should achieve.

:param handling_targets_after_episode_end: (HandlingTargetsAfterEpisodeEnd)
Dictates how to handle measurements that are outside the episode length.

:param scale_measurements_targets: (Dict[str, float])
Allows rescaling the values of each of the measurements available. This can be useful when the measurements
have a different scale and you want to normalize them to the same scale.
"""
def __init__(self):
super().__init__()
self.num_predicted_steps_ahead = 6
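
Following the doubling pattern described in the num_predicted_steps_ahead docstring, the predicted time offsets can be derived as (illustrative only):

num_predicted_steps_ahead = 6
prediction_offsets = [2 ** i for i in range(num_predicted_steps_ahead)]
# -> [1, 2, 4, 8, 16, 32], i.e. measurements are predicted for t+1, t+2, t+4, t+8, t+16 and t+32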

@@ -24,6 +24,13 @@ from rl_coach.spaces import SpacesDefinition


class HACDDPGAlgorithmParameters(DDPGAlgorithmParameters):
"""
:param time_limit: (int)
The number of steps the agent is allowed to act for while trying to achieve its goal

:param sub_goal_testing_rate: (float)
The percent of episodes that will be used for testing the sub goals generated by the upper level agents.
"""
def __init__(self):
super().__init__()
self.time_limit = 40
@@ -91,7 +98,7 @@ class HACDDPGAgent(DDPGAgent):
sub_goal_is_missed = not sub_goal_reached

if sub_goal_is_missed:
transition.reward = -self.ap.algorithm.time_limit
return transition

def set_environment_parameters(self, spaces: SpacesDefinition):

@@ -24,6 +24,11 @@ from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperi


class MixedMonteCarloAlgorithmParameters(DQNAlgorithmParameters):
"""
:param monte_carlo_mixing_rate: (float)
The mixing rate is used for setting the amount of monte carlo estimate (full return) that will be mixed into
the single-step bootstrapped targets.
"""
def __init__(self):
super().__init__()
self.monte_carlo_mixing_rate = 0.1

@@ -44,6 +44,26 @@ class NStepQNetworkParameters(NetworkParameters):


class NStepQAlgorithmParameters(AlgorithmParameters):
"""
:param num_steps_between_copying_online_weights_to_target: (StepMethod)
The number of steps between copying the online network weights to the target network weights.

:param apply_gradients_every_x_episodes: (int)
The number of episodes between applying the accumulated gradients to the network. After every
num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data,
it will then accumulate it in internal accumulators, and will only apply them to the network once in every
apply_gradients_every_x_episodes episodes.

:param num_steps_between_gradient_updates: (int)
The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
called t_max. Since this algorithm is on-policy, only the steps collected between each two gradient calculations
are used in the batch.

:param targets_horizon: (str)
Should be either 'N-Step' or '1-Step', and defines the length for which to bootstrap the network values over.
Essentially, 1-Step follows the regular 1-step bootstrapping Q-learning update. For more information,
please refer to the original paper (https://arxiv.org/abs/1602.01783)
"""
def __init__(self):
super().__init__()
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)
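
For the 'N-Step' targets_horizon, the bootstrapped target described above can be sketched as follows (illustrative only; the reward list and Q value are placeholders):

def n_step_target(rewards, bootstrap_q, discount=0.99):
    # rewards: the N rewards following the state; bootstrap_q: max_a Q_target(s_{t+N}, a)
    target = bootstrap_q
    for r in reversed(rewards):
        target = r + discount * target
    return target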

@@ -43,6 +43,39 @@ class NECNetworkParameters(NetworkParameters):


class NECAlgorithmParameters(AlgorithmParameters):
"""
:param dnd_size: (int)
Defines the number of transitions that will be stored in each one of the DNDs. Note that the total number
of transitions that will be stored is dnd_size x num_actions.

:param l2_norm_added_delta: (float)
A small value that will be added when calculating the weight of each of the DND entries. This follows the
:math:`\delta` parameter defined in the paper.

:param new_value_shift_coefficient: (float)
In the case where a new embedding that was added to the DND was already present, the value that will be stored
in the DND is a mix between the existing value and the new value. The mix rate is defined by
new_value_shift_coefficient.

:param number_of_knn: (int)
The number of neighbors that will be retrieved for each DND query.

:param DND_key_error_threshold: (float)
When the DND is queried for a specific embedding, this threshold will be used to determine if the embedding
exists in the DND, since exact matches of embeddings are very rare.

:param propagate_updates_to_DND: (bool)
If set to True, when the gradients of the network are calculated, the gradients will also be
backpropagated through the keys of the DND. The keys will then be updated as well, as if they were regular
network weights.

:param n_step: (int)
The bootstrap length that will be used when calculating the state values to store in the DND.

:param bootstrap_total_return_from_old_policy: (bool)
If set to True, the bootstrap that will be used to calculate each state-action value, is the network value
when the state was first seen, and not the latest, most up-to-date network value.
"""
def __init__(self):
super().__init__()
self.dnd_size = 500000

@@ -24,6 +24,19 @@ from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperi


class PALAlgorithmParameters(DQNAlgorithmParameters):
"""
:param pal_alpha: (float)
A factor that weights the amount by which the advantage learning update will be taken into account.

:param persistent_advantage_learning: (bool)
If set to True, the persistent mode of advantage learning will be used, which encourages the agent to take
the same actions one after the other instead of changing actions.

:param monte_carlo_mixing_rate: (float)
The amount of monte carlo values to mix into the targets of the network. The monte carlo values are just the
total discounted returns, and they can help reduce the time it takes for the network to update to the newly
seen values, since it is not based on bootstrapping the current network values.
"""
def __init__(self):
super().__init__()
self.pal_alpha = 0.9

@@ -42,6 +42,27 @@ class PolicyGradientNetworkParameters(NetworkParameters):


class PolicyGradientAlgorithmParameters(AlgorithmParameters):
"""
:param policy_gradient_rescaler: (PolicyGradientRescaler)
The rescaler type to use for the policy gradient loss. For policy gradients, we calculate log probability of
the action and then multiply it by the policy gradient rescaler. The most basic rescaler is the discounted
return, but there are other rescalers that are intended for reducing the variance of the updates.

:param apply_gradients_every_x_episodes: (int)
The number of episodes between applying the accumulated gradients to the network. After every
num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data,
it will then accumulate it in internal accumulators, and will only apply them to the network once in every
apply_gradients_every_x_episodes episodes.

:param beta_entropy: (float)
A factor which defines the amount of entropy regularization to apply to the network. The entropy of the actions
will be added to the loss and scaled by the given beta factor.

:param num_steps_between_gradient_updates: (int)
The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
called t_max. Since this algorithm is on-policy, only the steps collected between each two gradient calculations
are used in the batch.
"""
def __init__(self):
super().__init__()
self.policy_gradient_rescaler = PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP
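
A minimal sketch of the loss implied by this docstring, with the rescaler taken to be the discounted return or any of the lower-variance alternatives listed above (illustrative only):

import numpy as np

def policy_gradient_loss(log_probs, rescaler):
    # log_probs: log pi(a_t|s_t) for the taken actions; rescaler: e.g. discounted returns or advantages
    return -np.mean(log_probs * rescaler)   # descending on this loss ascends the expected return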

@@ -63,6 +63,51 @@ class PPOActorNetworkParameters(NetworkParameters):


class PPOAlgorithmParameters(AlgorithmParameters):
"""
:param policy_gradient_rescaler: (PolicyGradientRescaler)
This represents how the critic will be used to update the actor. The critic value function is typically used
to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
advantage of the action, or the generalized advantage estimation (GAE) value.

:param gae_lambda: (float)
The :math:`\lambda` value is used within the GAE function in order to weight different bootstrap length
estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
n-step estimations.

:param target_kl_divergence: (float)
The target KL divergence between the current policy distribution and the new policy. PPO uses a heuristic to
bring the KL divergence to this value, by adding a penalty if the KL divergence is higher.

:param initial_kl_coefficient: (float)
The initial weight that will be given to the KL divergence between the current and the new policy in the
regularization factor.

:param high_kl_penalty_coefficient: (float)
The penalty that will be given for KL divergence values which are higher than what was defined as the target.

:param clip_likelihood_ratio_using_epsilon: (float)
If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
implementations.

:param value_targets_mix_fraction: (float)
The targets for the value network are an exponential weighted moving average which uses this mix fraction to
define how much of the new targets will be taken into account when calculating the loss.
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.

:param estimate_state_value_using_gae: (bool)
If set to True, the state value will be estimated using the GAE technique.

:param use_kl_regularization: (bool)
If set to True, the loss function will be regularized using the KL divergence between the current and new
policy, to bound the change of the policy during the network update.

:param beta_entropy: (float)
An entropy regularization term can be added to the loss function in order to control exploration. This term
is weighted using the :math:`\beta` value defined by beta_entropy.

"""
def __init__(self):
super().__init__()
self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
@@ -73,7 +118,6 @@
self.clip_likelihood_ratio_using_epsilon = None
self.value_targets_mix_fraction = 0.1
self.estimate_state_value_using_gae = True
self.step_until_collecting_full_episodes = True
self.use_kl_regularization = True
self.beta_entropy = 0.01
self.num_consecutive_playing_steps = EnvironmentSteps(5000)
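
A sketch of the kind of heuristic referred to by target_kl_divergence and high_kl_penalty_coefficient, based on the adaptive-KL rule from the PPO paper (illustrative only; Coach's exact update rule may differ):

def adapt_kl_coefficient(measured_kl, kl_coefficient, target_kl=0.01):
    if measured_kl > 1.5 * target_kl:
        kl_coefficient *= 2.0    # the policy moved too far, penalize the KL term more strongly
    elif measured_kl < target_kl / 1.5:
        kl_coefficient *= 0.5    # the policy barely moved, relax the penalty
    return kl_coefficient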

@@ -34,6 +34,14 @@ class QuantileRegressionDQNNetworkParameters(DQNNetworkParameters):


class QuantileRegressionDQNAlgorithmParameters(DQNAlgorithmParameters):
"""
:param atoms: (int)
The number of atoms to predict for each action

:param huber_loss_interval: (float)
One of the huber loss parameters, referred to as :math:`\kappa` in the paper.
It describes the interval [-k, k] in which the huber loss acts as an MSE loss.
"""
def __init__(self):
super().__init__()
self.atoms = 200
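
The interval behaviour of huber_loss_interval (kappa) can be written out directly (illustrative only; the full quantile-regression loss additionally weights this term by the quantile fractions):

import numpy as np

def huber(u, kappa=1.0):
    # quadratic inside [-kappa, kappa], linear outside
    return np.where(np.abs(u) <= kappa,
                    0.5 * u ** 2,
                    kappa * (np.abs(u) - 0.5 * kappa))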

@@ -37,6 +37,17 @@ class RainbowDQNNetworkParameters(DQNNetworkParameters):


class RainbowDQNAlgorithmParameters(CategoricalDQNAlgorithmParameters):
"""
:param n_step: (int)
The number of steps to bootstrap the network over. The first N-1 steps' actual rewards will be accumulated
using an exponentially growing discount factor, and the Nth step will be bootstrapped from the network
prediction.

:param store_transitions_only_when_episodes_are_terminated: (bool)
If set to True, the transitions will be stored in an Episode object until the episode ends, and only then
written to the memory. This is useful since we want to calculate the N-step discounted rewards before saving the
transitions into the memory, and to do so we need the entire episode first.
"""
def __init__(self):
super().__init__()
self.n_step = 3

@@ -57,7 +57,7 @@ class Architecture(object):
:param initial_feed_dict: a dictionary of extra inputs for forward pass.
:return: predictions of action or value of shape (batch_size, action_space_size) for action predictions
"""
-pass
+raise NotImplementedError

@staticmethod
def parallel_predict(sess: Any,
@@ -68,7 +68,7 @@
:param network_input_tuples: tuple of network and corresponding input
:return: list or tuple of outputs from all networks
"""
-pass
+raise NotImplementedError

def train_on_batch(self,
inputs: Dict[str, np.ndarray],
@@ -102,7 +102,7 @@
norm_unclippsed_grads (float): global norm of all gradients before any gradient clipping is applied
fetched_tensors: all values for additional_fetches
"""
-pass
+raise NotImplementedError

def get_weights(self) -> List[np.ndarray]:
"""
@@ -110,7 +110,7 @@

:return: a list of weights as ndarrays
"""
-pass
+raise NotImplementedError

def set_weights(self, weights: List[np.ndarray], rate: float=1.0) -> None:
"""
@@ -121,7 +121,7 @@
i.e. new_weight = rate * given_weight + (1 - rate) * old_weight
:return: None
"""
-pass
raise NotImplementedError
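
The update formula quoted in this docstring amounts to the following (illustrative only; the weight lists are placeholders):

def soft_update(old_weights, given_weights, rate=1.0):
    # new_weight = rate * given_weight + (1 - rate) * old_weight
    return [rate * given + (1.0 - rate) * old
            for old, given in zip(old_weights, given_weights)]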

def reset_accumulated_gradients(self) -> None:
"""
@@ -130,7 +130,7 @@
Once gradients are reset, they must be accessible by the `accumulated_gradients` property of this class,
which must return a list of numpy ndarrays. Child class must ensure that `accumulated_gradients` is set.
"""
-pass
+raise NotImplementedError

def accumulate_gradients(self,
inputs: Dict[str, np.ndarray],
@@ -166,7 +166,7 @@
norm_unclippsed_grads (float): global norm of all gradients before any gradient clipping is applied
fetched_tensors: all values for additional_fetches
"""
-pass
+raise NotImplementedError

def apply_and_reset_gradients(self, gradients: List[np.ndarray], scaler: float=1.) -> None:
"""
@@ -177,7 +177,7 @@
of an identical network (either self or another identical network)
:param scaler: A scaling factor that allows rescaling the gradients before applying them
"""
-pass
+raise NotImplementedError

def apply_gradients(self, gradients: List[np.ndarray], scaler: float=1.) -> None:
"""
@@ -188,7 +188,7 @@
of an identical network (either self or another identical network)
:param scaler: A scaling factor that allows rescaling the gradients before applying them
"""
-pass
+raise NotImplementedError

def get_variable_value(self, variable: Any) -> np.ndarray:
"""
@@ -199,7 +199,7 @@
:param variable: variable of interest
:return: value of the specified variable
"""
-pass
+raise NotImplementedError

def set_variable_value(self, assign_op: Any, value: np.ndarray, placeholder: Any):
"""
@@ -212,4 +212,4 @@
:param value: value of the specified variable used for update
:param placeholder: a placeholder for binding the value to assign_op.
"""
-pass
+raise NotImplementedError

@@ -34,7 +34,11 @@ except ImportError:

class NetworkWrapper(object):
"""
-Contains multiple networks and managers syncing and gradient updates
+The network wrapper contains multiple copies of the same network, each one with a different set of weights which is
+updated on a different time scale. The network wrapper will always contain an online network.
+It will contain an additional slow updating target network if it was requested by the user,
+and it will contain a global network shared between different workers, if Coach is run in a single-node
+multi-process distributed mode. The network wrapper contains functionality for managing these networks and syncing
+between them.
"""
def __init__(self, agent_parameters: AgentParameters, has_target: bool, has_global: bool, name: str,
@@ -98,6 +102,7 @@
def sync(self):
"""
Initializes the weights of the networks to match each other

:return:
"""
self.update_online_network()
@@ -106,6 +111,7 @@
def update_target_network(self, rate=1.0):
"""
Copy weights: online network >>> target network

:param rate: the rate of copying the weights - 1 for copying exactly
"""
if self.target_network:
@@ -114,6 +120,7 @@
def update_online_network(self, rate=1.0):
"""
Copy weights: global network >>> online network

:param rate: the rate of copying the weights - 1 for copying exactly
"""
if self.global_network:
@@ -122,6 +129,7 @@
def apply_gradients_to_global_network(self, gradients=None):
"""
Apply gradients from the online network on the global network

:param gradients: optional gradients that will be used instead of the accumulated gradients
:return:
"""
@@ -135,6 +143,7 @@
def apply_gradients_to_online_network(self, gradients=None):
"""
Apply gradients from the online network on itself

:return:
"""
if gradients is None:
@@ -144,6 +153,7 @@
def train_and_sync_networks(self, inputs, targets, additional_fetches=[], importance_weights=None):
"""
A generic training function that enables multi-threaded training using a global network if necessary.

:param inputs: The inputs for the network.
:param targets: The targets corresponding to the given inputs
:param additional_fetches: Any additional tensor the user wants to fetch
@@ -160,6 +170,7 @@
"""
Applies the gradients accumulated in the online network to the global network or to itself and syncs the
networks if necessary

:param reset_gradients: If set to True, the accumulated gradients won't be reset to 0 after applying them to
the network. This is useful when the accumulated gradients are overwritten instead
of accumulated by the accumulate_gradients function. This allows reducing time
@@ -179,6 +190,7 @@
def parallel_prediction(self, network_input_tuples: List[Tuple]):
"""
Run several network predictions in parallel. Currently this only supports running each of the networks once.

:param network_input_tuples: a list of tuples where the first element is the network (online_network,
target_network or global_network) and the second element is the inputs
:return: the outputs of all the networks in the same order as the inputs were given
@@ -188,6 +200,7 @@
def get_local_variables(self):
"""
Get all the variables that are local to the thread

:return: a list of all the variables that are local to the thread
"""
local_variables = [v for v in tf.local_variables() if self.online_network.name in v.name]
@@ -198,6 +211,7 @@
def get_global_variables(self):
"""
Get all the variables that are shared between threads

:return: a list of all the variables that are shared between threads
"""
global_variables = [v for v in tf.global_variables() if self.global_network.name in v.name]
@@ -206,6 +220,7 @@
def set_is_training(self, state: bool):
"""
Set the phase of the network between training and testing

:param state: The current state (True = Training, False = Testing)
:return: None
"""

@@ -14,7 +14,7 @@
# limitations under the License.
#

-from typing import List, Union
+from typing import List, Union, Tuple
import copy

import numpy as np
@@ -74,7 +74,12 @@ class InputEmbedder(object):
activation_function=self.activation_function,
dropout_rate=self.dropout_rate))

-def __call__(self, prev_input_placeholder=None):
+def __call__(self, prev_input_placeholder: tf.placeholder=None) -> Tuple[tf.Tensor, tf.Tensor]:
"""
Wrapper for building the module graph including scoping and loss creation
:param prev_input_placeholder: the input to the graph
:return: the input placeholder and the output of the last layer
"""
with tf.variable_scope(self.get_name()):
if prev_input_placeholder is None:
self.input = tf.placeholder("float", shape=[None] + self.input_size, name=self.get_name())
@@ -84,7 +89,13 @@

return self.input, self.output

-def _build_module(self):
+def _build_module(self) -> None:
"""
Builds the graph of the module
This method is called early on from __call__. It is expected to store the graph
in self.output.
:return: None
"""
# NOTE: for image inputs, we expect the data format to be of type uint8, so to be memory efficient. we chose not
# to implement the rescaling as an input filters.observation.observation_filter, as this would have caused the
# input to the network to be float, which is 4x more expensive in memory.
@@ -127,7 +138,11 @@
raise NotImplementedError("Inheriting embedder must define schemes matching its allowed default "
"configurations.")

-def get_name(self):
+def get_name(self) -> str:
"""
Get a formatted name for the module
:return: the formatted name
"""
return self.name

def __str__(self):

@@ -14,7 +14,7 @@
# limitations under the License.
#
import copy
-from typing import Union
+from typing import Union, Tuple

import tensorflow as tf

@@ -64,17 +64,33 @@ class Middleware(object):
activation_function=self.activation_function,
dropout_rate=self.dropout_rate))

-def __call__(self, input_layer):
+def __call__(self, input_layer: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
"""
Wrapper for building the module graph including scoping and loss creation
:param input_layer: the input to the graph
:return: the input placeholder and the output of the last layer
"""
with tf.variable_scope(self.get_name()):
self.input = input_layer
self._build_module()

return self.input, self.output

-def _build_module(self):
+def _build_module(self) -> None:
"""
Builds the graph of the module
This method is called early on from __call__. It is expected to store the graph
in self.output.
:param input_layer: the input to the graph
:return: None
"""
pass

-def get_name(self):
+def get_name(self) -> str:
"""
Get a formatted name for the module
:return: the formatted name
"""
return self.name

@property

@@ -154,7 +154,6 @@ class AlgorithmParameters(Parameters):
self.num_steps_between_copying_online_weights_to_target = TrainingSteps(0)
self.rate_for_copying_weights_to_target = 1.0
self.load_memory_from_file_path = None
self.collect_new_data = True
self.store_transitions_only_when_episodes_are_terminated = False

# HRL / HER related params
@@ -174,7 +173,38 @@


class PresetValidationParameters(Parameters):
-def __init__(self):
+def __init__(self,
+test=False,
+min_reward_threshold=0,
+max_episodes_to_achieve_reward=1,
+num_workers=1,
+reward_test_level=None,
+test_using_a_trace_test=True,
+trace_test_levels=None,
+trace_max_env_steps=5000):
"""
:param test:
A flag which specifies if the preset should be tested as part of the validation process.
:param min_reward_threshold:
The minimum reward that the agent should pass after max_episodes_to_achieve_reward episodes when the
preset is run.
:param max_episodes_to_achieve_reward:
The maximum number of episodes that the agent should train using the preset in order to achieve the
reward specified by min_reward_threshold.
:param num_workers:
The number of workers that should be used when running this preset in the test suite for validation.
:param reward_test_level:
The environment level or levels, given by a list of strings, that should be tested as part of the
reward tests suite.
:param test_using_a_trace_test:
A flag that specifies if the preset should be run as part of the trace tests suite.
:param trace_test_levels:
The environment level or levels, given by a list of strings, that should be tested as part of the
trace tests suite.
:param trace_max_env_steps:
An integer representing the maximum number of environment steps to run when running this preset as part
of the trace tests suite.
"""
super().__init__()
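
A hypothetical preset snippet using the new keyword arguments introduced above (the values are made up):

preset_validation_params = PresetValidationParameters(
    test=True,
    min_reward_threshold=150,
    max_episodes_to_achieve_reward=250,
    trace_max_env_steps=2000)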
|
||||
|
||||
# setting a seed will only work for non-parallel algorithms. Parallel algorithms add uncontrollable noise in
|
||||
@@ -182,42 +212,42 @@ class PresetValidationParameters(Parameters):
|
||||
# time from the OS.
|
||||
|
||||
# Testing parameters
|
||||
self.test = False
|
||||
self.min_reward_threshold = 0
|
||||
self.max_episodes_to_achieve_reward = 1
|
||||
self.num_workers = 1
|
||||
self.reward_test_level = None
|
||||
self.test_using_a_trace_test = True
|
||||
self.trace_test_levels = None
|
||||
self.trace_max_env_steps = 5000
|
||||
self.test = test
|
||||
self.min_reward_threshold = min_reward_threshold
|
||||
self.max_episodes_to_achieve_reward = max_episodes_to_achieve_reward
|
||||
self.num_workers = num_workers
|
||||
self.reward_test_level = reward_test_level
|
||||
self.test_using_a_trace_test = test_using_a_trace_test
|
||||
self.trace_test_levels = trace_test_levels
|
||||
self.trace_max_env_steps = trace_max_env_steps
|
||||
|
||||
|
||||
class NetworkParameters(Parameters):
|
||||
def __init__(self,
|
||||
force_cpu = False,
|
||||
async_training = False,
|
||||
shared_optimizer = True,
|
||||
scale_down_gradients_by_number_of_workers_for_sync_training = True,
|
||||
clip_gradients = None,
|
||||
gradients_clipping_method = GradientClippingMethod.ClipByGlobalNorm,
|
||||
l2_regularization = 0,
|
||||
learning_rate = 0.00025,
|
||||
learning_rate_decay_rate = 0,
|
||||
learning_rate_decay_steps = 0,
|
||||
input_embedders_parameters = {},
|
||||
embedding_merger_type = EmbeddingMergerType.Concat,
|
||||
middleware_parameters = None,
|
||||
heads_parameters = [],
|
||||
use_separate_networks_per_head = False,
|
||||
optimizer_type = 'Adam',
|
||||
optimizer_epsilon = 0.0001,
|
||||
adam_optimizer_beta1 = 0.9,
|
||||
adam_optimizer_beta2 = 0.99,
|
||||
rms_prop_optimizer_decay = 0.9,
|
||||
batch_size = 32,
|
||||
replace_mse_with_huber_loss = False,
|
||||
create_target_network = False,
|
||||
tensorflow_support = True):
|
||||
force_cpu=False,
|
||||
async_training=False,
|
||||
shared_optimizer=True,
|
||||
scale_down_gradients_by_number_of_workers_for_sync_training=True,
|
||||
clip_gradients=None,
|
||||
gradients_clipping_method=GradientClippingMethod.ClipByGlobalNorm,
|
||||
l2_regularization=0,
|
||||
learning_rate=0.00025,
|
||||
learning_rate_decay_rate=0,
|
||||
learning_rate_decay_steps=0,
|
||||
input_embedders_parameters={},
|
||||
embedding_merger_type=EmbeddingMergerType.Concat,
|
||||
middleware_parameters=None,
|
||||
heads_parameters=[],
|
||||
use_separate_networks_per_head=False,
|
||||
optimizer_type='Adam',
|
||||
optimizer_epsilon=0.0001,
|
||||
adam_optimizer_beta1=0.9,
|
||||
adam_optimizer_beta2=0.99,
|
||||
rms_prop_optimizer_decay=0.9,
|
||||
batch_size=32,
|
||||
replace_mse_with_huber_loss=False,
|
||||
create_target_network=False,
|
||||
tensorflow_support=True):
|
||||
"""
|
||||
:param force_cpu:
|
||||
Force the neural networks to run on the CPU even if a GPU is available
|
||||
@@ -240,63 +270,106 @@ class NetworkParameters(Parameters):
|
||||
gradients of the network. This will only be used if the clip_gradients value is defined as a value other
|
||||
than None.
|
||||
:param l2_regularization:
|
||||
An L2 regularization weight that will be applied to the network weights while calculating the loss function
|
||||
:param learning_rate:
|
||||
The learning rate for the network
|
||||
:param learning_rate_decay_rate:
|
||||
If this value is larger than 0, an exponential decay will be applied to the network learning rate.
|
||||
The rate of the decay is defined by this parameter, and the number of training steps the decay will be
|
||||
applied is defined by learning_rate_decay_steps. Notice that both parameters should be defined in order
|
||||
for this to work correctly.
|
||||
:param learning_rate_decay_steps:
|
||||
If the learning_rate_decay_rate of the network is larger than 0, an exponential decay will be applied to
|
||||
the network learning rate. The number of steps the decay will be applied is defined by this parameter.
|
||||
Notice that both this parameter, as well as learning_rate_decay_rate should be defined in order for the
|
||||
learning rate decay to work correctly.
|
||||
:param input_embedders_parameters:
|
||||
A dictionary mapping between input names and input embedders (InputEmbedderParameters) to use for the
|
||||
network. Each of the keys is an input name as returned from the environment in the state.
|
||||
For example, if the environment returns a state containing 'observation' and 'measurements', then
|
||||
the keys for the input embedders dictionary can be either 'observation' to use the observation as input,
|
||||
'measurements' to use the measurements as input, or both.
|
||||
The embedder type will be automatically selected according to the input type. Vector inputs will
|
||||
produce a fully connected embedder, and image inputs will produce a convolutional embedder.
|
||||
:param embedding_merger_type:
|
||||
The type of embedding merging to use, given by one of the EmbeddingMergerType enum values.
|
||||
This will be used to merge the outputs of all the input embedders into a single embedding.
|
||||
:param middleware_parameters:
|
||||
The parameters of the middleware to use, given by a MiddlewareParameters object.
|
||||
Each network will have only a single middleware embedder which will take the merged embeddings from the
|
||||
input embedders and pass them through more neural network layers.
|
||||
:param heads_parameters:
|
||||
A list of heads for the network given by their corresponding HeadParameters.
|
||||
Each network can have one or multiple network heads, where each one will take the output of the middleware
|
||||
and make some additional computation on top of it. Additionally, each head calculates a weighted loss value,
|
||||
and the loss values from all the heads will be summed later on.
|
||||
:param use_separate_networks_per_head:
|
||||
A flag that allows using different copies of the input embedders and middleware for each one of the heads.
|
||||
Regularly, the heads will have a shared input, but in the case where use_separate_networks_per_head is set
|
||||
to True, each one of the heads will get a different input.
|
||||
:param optimizer_type:
|
||||
A string specifying the optimizer type to use for updating the network. The available optimizers are
|
||||
Adam, RMSProp and LBFGS.
|
||||
:param optimizer_epsilon:
|
||||
An internal optimizer parameter used for Adam and RMSProp.
|
||||
:param adam_optimizer_beta1:
|
||||
A beta1 internal optimizer parameter used for Adam. It will be used only if Adam was selected as the
|
||||
optimizer for the network.
|
||||
:param adam_optimizer_beta2:
|
||||
A beta2 internal optimizer parameter used for Adam. It will be used only if Adam was selected as the
|
||||
optimizer for the network.
|
||||
:param rms_prop_optimizer_decay:
|
||||
The decay value for the RMSProp optimizer, which will be used only in case the RMSProp optimizer was
|
||||
selected for this network.
|
||||
:param batch_size:
|
||||
The batch size to use when updating the network.
|
||||
:param replace_mse_with_huber_loss:
A flag that specifies if the standard MSE loss should be replaced with a Huber loss, which is less sensitive to outliers.
:param create_target_network:
|
||||
If this flag is set to True, an additional copy of the network will be created and initialized with the
|
||||
same weights as the online network. It can then be queried, and its weights can be synced from the
|
||||
online network at will.
|
||||
:param tensorflow_support:
|
||||
A flag which specifies if the network is supported by the TensorFlow framework.
|
||||
"""
|
||||
super().__init__()
|
||||
self.framework = Frameworks.tensorflow
|
||||
self.sess = None
|
||||
|
||||
# hardware parameters
|
||||
self.force_cpu = False
|
||||
self.force_cpu = force_cpu
|
||||
|
||||
# distributed training options
|
||||
self.async_training = False
|
||||
self.shared_optimizer = True
|
||||
self.scale_down_gradients_by_number_of_workers_for_sync_training = True
|
||||
self.async_training = async_training
|
||||
self.shared_optimizer = shared_optimizer
|
||||
self.scale_down_gradients_by_number_of_workers_for_sync_training = scale_down_gradients_by_number_of_workers_for_sync_training
|
||||
|
||||
# regularization
|
||||
self.clip_gradients = None
|
||||
self.gradients_clipping_method = GradientClippingMethod.ClipByGlobalNorm
|
||||
self.l2_regularization = 0
|
||||
self.clip_gradients = clip_gradients
|
||||
self.gradients_clipping_method = gradients_clipping_method
|
||||
self.l2_regularization = l2_regularization
|
||||
|
||||
# learning rate
|
||||
self.learning_rate = 0.00025
|
||||
self.learning_rate_decay_rate = 0
|
||||
self.learning_rate_decay_steps = 0
|
||||
self.learning_rate = learning_rate
|
||||
self.learning_rate_decay_rate = learning_rate_decay_rate
|
||||
self.learning_rate_decay_steps = learning_rate_decay_steps
|
||||
|
||||
# structure
|
||||
self.input_embedders_parameters = {}
|
||||
self.embedding_merger_type = EmbeddingMergerType.Concat
|
||||
self.middleware_parameters = None
|
||||
self.heads_parameters = []
|
||||
self.use_separate_networks_per_head = False
|
||||
self.optimizer_type = 'Adam'
|
||||
self.optimizer_epsilon = 0.0001
|
||||
self.adam_optimizer_beta1 = 0.9
|
||||
self.adam_optimizer_beta2 = 0.99
|
||||
self.rms_prop_optimizer_decay = 0.9
|
||||
self.batch_size = 32
|
||||
self.replace_mse_with_huber_loss = False
|
||||
self.create_target_network = False
|
||||
self.input_embedders_parameters = input_embedders_parameters
|
||||
self.embedding_merger_type = embedding_merger_type
|
||||
self.middleware_parameters = middleware_parameters
|
||||
self.heads_parameters = heads_parameters
|
||||
self.use_separate_networks_per_head = use_separate_networks_per_head
|
||||
self.optimizer_type = optimizer_type
|
||||
self.optimizer_epsilon = optimizer_epsilon
|
||||
self.adam_optimizer_beta1 = adam_optimizer_beta1
|
||||
self.adam_optimizer_beta2 = adam_optimizer_beta2
|
||||
self.rms_prop_optimizer_decay = rms_prop_optimizer_decay
|
||||
self.batch_size = batch_size
|
||||
self.replace_mse_with_huber_loss = replace_mse_with_huber_loss
|
||||
self.create_target_network = create_target_network
|
||||
|
||||
# Framework support
self.tensorflow_support = True
self.tensorflow_support = tensorflow_support
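In practice these network parameters are usually adjusted on an agent's preset rather than by instantiating NetworkParameters directly. A minimal sketch, assuming a DQN-style agent whose main network is registered under the 'main' key:

```python
# Hedged sketch: tuning the network parameters held by an agent preset.
from rl_coach.agents.dqn_agent import DQNAgentParameters

agent_params = DQNAgentParameters()
network = agent_params.network_wrappers['main']
network.learning_rate = 0.0001
network.batch_size = 64
network.l2_regularization = 1e-4
network.replace_mse_with_huber_loss = True
network.create_target_network = True
```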
|
||||
|
||||
|
||||
class NetworkComponentParameters(Parameters):
|
||||
|
||||
@@ -83,91 +83,91 @@ def start_graph(graph_manager: 'GraphManager', task_parameters: 'TaskParameters'
|
||||
|
||||
|
||||
def handle_distributed_coach_tasks(graph_manager, args):
|
||||
ckpt_inside_container = "/checkpoint"
|
||||
ckpt_inside_container = "/checkpoint"
|
||||
|
||||
memory_backend_params = None
|
||||
if args.memory_backend_params:
|
||||
memory_backend_params = json.loads(args.memory_backend_params)
|
||||
memory_backend_params['run_type'] = str(args.distributed_coach_run_type)
|
||||
graph_manager.agent_params.memory.register_var('memory_backend_params', construct_memory_params(memory_backend_params))
|
||||
memory_backend_params = None
|
||||
if args.memory_backend_params:
|
||||
memory_backend_params = json.loads(args.memory_backend_params)
|
||||
memory_backend_params['run_type'] = str(args.distributed_coach_run_type)
|
||||
graph_manager.agent_params.memory.register_var('memory_backend_params', construct_memory_params(memory_backend_params))
|
||||
|
||||
data_store_params = None
|
||||
data_store_params = None
|
||||
if args.data_store_params:
|
||||
data_store_params = construct_data_store_params(json.loads(args.data_store_params))
|
||||
data_store_params.checkpoint_dir = ckpt_inside_container
|
||||
graph_manager.data_store_params = data_store_params
|
||||
|
||||
if args.distributed_coach_run_type == RunType.TRAINER:
|
||||
training_worker(
|
||||
graph_manager=graph_manager,
|
||||
checkpoint_dir=ckpt_inside_container
|
||||
)
|
||||
|
||||
if args.distributed_coach_run_type == RunType.ROLLOUT_WORKER:
|
||||
data_store = None
|
||||
if args.data_store_params:
|
||||
data_store_params = construct_data_store_params(json.loads(args.data_store_params))
|
||||
data_store_params.checkpoint_dir = ckpt_inside_container
|
||||
graph_manager.data_store_params = data_store_params
|
||||
data_store = get_data_store(data_store_params)
|
||||
wait_for_checkpoint(checkpoint_dir=ckpt_inside_container, data_store=data_store)
|
||||
|
||||
if args.distributed_coach_run_type == RunType.TRAINER:
|
||||
training_worker(
|
||||
graph_manager=graph_manager,
|
||||
checkpoint_dir=ckpt_inside_container
|
||||
)
|
||||
|
||||
if args.distributed_coach_run_type == RunType.ROLLOUT_WORKER:
|
||||
data_store = None
|
||||
if args.data_store_params:
|
||||
data_store = get_data_store(data_store_params)
|
||||
wait_for_checkpoint(checkpoint_dir=ckpt_inside_container, data_store=data_store)
|
||||
|
||||
rollout_worker(
|
||||
graph_manager=graph_manager,
|
||||
checkpoint_dir=ckpt_inside_container,
|
||||
data_store=data_store,
|
||||
num_workers=args.num_workers
|
||||
)
|
||||
rollout_worker(
|
||||
graph_manager=graph_manager,
|
||||
checkpoint_dir=ckpt_inside_container,
|
||||
data_store=data_store,
|
||||
num_workers=args.num_workers
|
||||
)
|
||||
|
||||
|
||||
def handle_distributed_coach_orchestrator(graph_manager, args):
|
||||
ckpt_inside_container = "/checkpoint"
|
||||
rollout_command = ['python3', 'rl_coach/coach.py', '--distributed_coach_run_type', str(RunType.ROLLOUT_WORKER)] + sys.argv[1:]
|
||||
trainer_command = ['python3', 'rl_coach/coach.py', '--distributed_coach_run_type', str(RunType.TRAINER)] + sys.argv[1:]
|
||||
ckpt_inside_container = "/checkpoint"
|
||||
rollout_command = ['python3', 'rl_coach/coach.py', '--distributed_coach_run_type', str(RunType.ROLLOUT_WORKER)] + sys.argv[1:]
|
||||
trainer_command = ['python3', 'rl_coach/coach.py', '--distributed_coach_run_type', str(RunType.TRAINER)] + sys.argv[1:]
|
||||
|
||||
if '--experiment_name' not in rollout_command:
|
||||
rollout_command = rollout_command + ['--experiment_name', args.experiment_name]
|
||||
if '--experiment_name' not in rollout_command:
|
||||
rollout_command = rollout_command + ['--experiment_name', args.experiment_name]
|
||||
|
||||
if '--experiment_name' not in trainer_command:
|
||||
trainer_command = trainer_command + ['--experiment_name', args.experiment_name]
|
||||
if '--experiment_name' not in trainer_command:
|
||||
trainer_command = trainer_command + ['--experiment_name', args.experiment_name]
|
||||
|
||||
memory_backend_params = None
|
||||
if args.memory_backend == "redispubsub":
|
||||
memory_backend_params = RedisPubSubMemoryBackendParameters()
|
||||
memory_backend_params = None
|
||||
if args.memory_backend == "redispubsub":
|
||||
memory_backend_params = RedisPubSubMemoryBackendParameters()
|
||||
|
||||
ds_params_instance = None
|
||||
if args.data_store == "s3":
|
||||
ds_params = DataStoreParameters("s3", "", "")
|
||||
ds_params_instance = S3DataStoreParameters(ds_params=ds_params, end_point=args.s3_end_point, bucket_name=args.s3_bucket_name,
|
||||
creds_file=args.s3_creds_file, checkpoint_dir=ckpt_inside_container)
|
||||
ds_params_instance = None
|
||||
if args.data_store == "s3":
|
||||
ds_params = DataStoreParameters("s3", "", "")
|
||||
ds_params_instance = S3DataStoreParameters(ds_params=ds_params, end_point=args.s3_end_point, bucket_name=args.s3_bucket_name,
|
||||
creds_file=args.s3_creds_file, checkpoint_dir=ckpt_inside_container)
|
||||
|
||||
worker_run_type_params = RunTypeParameters(args.image, rollout_command, run_type=str(RunType.ROLLOUT_WORKER), num_replicas=args.num_workers)
|
||||
trainer_run_type_params = RunTypeParameters(args.image, trainer_command, run_type=str(RunType.TRAINER))
|
||||
worker_run_type_params = RunTypeParameters(args.image, rollout_command, run_type=str(RunType.ROLLOUT_WORKER), num_replicas=args.num_workers)
|
||||
trainer_run_type_params = RunTypeParameters(args.image, trainer_command, run_type=str(RunType.TRAINER))
|
||||
|
||||
orchestration_params = KubernetesParameters([worker_run_type_params, trainer_run_type_params],
|
||||
kubeconfig='~/.kube/config',
|
||||
memory_backend_parameters=memory_backend_params,
|
||||
data_store_params=ds_params_instance)
|
||||
orchestrator = Kubernetes(orchestration_params)
|
||||
if not orchestrator.setup():
|
||||
print("Could not setup.")
|
||||
return
|
||||
orchestration_params = KubernetesParameters([worker_run_type_params, trainer_run_type_params],
|
||||
kubeconfig='~/.kube/config',
|
||||
memory_backend_parameters=memory_backend_params,
|
||||
data_store_params=ds_params_instance)
|
||||
orchestrator = Kubernetes(orchestration_params)
|
||||
if not orchestrator.setup():
|
||||
print("Could not setup.")
|
||||
return
|
||||
|
||||
if orchestrator.deploy_trainer():
|
||||
print("Successfully deployed trainer.")
|
||||
else:
|
||||
print("Could not deploy trainer.")
|
||||
return
|
||||
if orchestrator.deploy_trainer():
|
||||
print("Successfully deployed trainer.")
|
||||
else:
|
||||
print("Could not deploy trainer.")
|
||||
return
|
||||
|
||||
if orchestrator.deploy_worker():
|
||||
print("Successfully deployed rollout worker(s).")
|
||||
else:
|
||||
print("Could not deploy rollout worker(s).")
|
||||
return
|
||||
if orchestrator.deploy_worker():
|
||||
print("Successfully deployed rollout worker(s).")
|
||||
else:
|
||||
print("Could not deploy rollout worker(s).")
|
||||
return
|
||||
|
||||
try:
|
||||
orchestrator.trainer_logs()
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
try:
|
||||
orchestrator.trainer_logs()
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
|
||||
orchestrator.undeploy()
|
||||
orchestrator.undeploy()
|
||||
|
||||
|
||||
class CoachLauncher(object):
|
||||
@@ -192,7 +192,6 @@ class CoachLauncher(object):
|
||||
graph_manager = self.get_graph_manager_from_args(args)
|
||||
self.run_graph_manager(graph_manager, args)
|
||||
|
||||
|
||||
def get_graph_manager_from_args(self, args: argparse.Namespace) -> 'GraphManager':
|
||||
"""
|
||||
Return the graph manager according to the command line arguments given by the user.
|
||||
@@ -251,7 +250,6 @@ class CoachLauncher(object):
|
||||
|
||||
return graph_manager
|
||||
|
||||
|
||||
def display_all_presets_and_exit(self):
|
||||
# list available presets
|
||||
screen.log_title("Available Presets:")
|
||||
@@ -259,7 +257,6 @@ class CoachLauncher(object):
|
||||
print(preset)
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
def expand_preset(self, preset):
|
||||
"""
|
||||
Replace a short preset name with the full python path, and verify that it can be imported.
|
||||
@@ -287,7 +284,6 @@ class CoachLauncher(object):
|
||||
|
||||
return preset
|
||||
|
||||
|
||||
def get_config_args(self, parser: argparse.ArgumentParser) -> argparse.Namespace:
|
||||
"""
|
||||
Returns a Namespace object with all the user-specified configuration options needed to launch.
|
||||
@@ -317,7 +313,6 @@ class CoachLauncher(object):
|
||||
if args.list:
|
||||
self.display_all_presets_and_exit()
|
||||
|
||||
|
||||
# Read args from config file for distributed Coach.
|
||||
if args.distributed_coach and args.distributed_coach_run_type == RunType.ORCHESTRATOR:
|
||||
coach_config = ConfigParser({
|
||||
@@ -401,7 +396,6 @@ class CoachLauncher(object):
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def get_argument_parser(self) -> argparse.ArgumentParser:
|
||||
"""
|
||||
This returns an ArgumentParser object which defines the set of options that customers are expected to supply in order
|
||||
@@ -545,7 +539,6 @@ class CoachLauncher(object):
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def run_graph_manager(self, graph_manager: 'GraphManager', args: argparse.Namespace):
|
||||
if args.distributed_coach and not graph_manager.agent_params.algorithm.distributed_coach_synchronization_type:
|
||||
screen.error("{} algorithm is not supported using distributed Coach.".format(graph_manager.agent_params.algorithm))
|
||||
@@ -581,7 +574,6 @@ class CoachLauncher(object):
|
||||
else:
|
||||
self.start_multi_threaded(graph_manager, args)
|
||||
|
||||
|
||||
def start_single_threaded(self, graph_manager: 'GraphManager', args: argparse.Namespace):
|
||||
# Start the training or evaluation
|
||||
task_parameters = TaskParameters(
|
||||
@@ -598,7 +590,6 @@ class CoachLauncher(object):
|
||||
|
||||
start_graph(graph_manager=graph_manager, task_parameters=task_parameters)
|
||||
|
||||
|
||||
def start_multi_threaded(self, graph_manager: 'GraphManager', args: argparse.Namespace):
|
||||
total_tasks = args.num_workers
|
||||
if args.evaluation_worker:
|
||||
|
||||
@@ -260,6 +260,7 @@ class EnvResponse(object):
|
||||
"""
|
||||
An env response is a collection containing the information returned from the environment after a single action
|
||||
has been performed on it.
|
||||
|
||||
:param next_state: The new state that the environment has transitioned into. Assumed to be a dictionary where the
|
||||
observation is located at state['observation']
|
||||
:param reward: The reward received from the environment
|
||||
@@ -350,11 +351,13 @@ class ActionInfo(object):
|
||||
|
||||
|
||||
class Batch(object):
"""
A wrapper around a list of transitions that helps with extracting batches of parameters from it.
For example, one can extract a list of states corresponding to the list of transitions.
The class uses lazy evaluation in order to return each of the available parameters.
"""
def __init__(self, transitions: List[Transition]):
"""
A wrapper around a list of transitions that helps with extracting batches of parameters from it.
For example, one can extract a list of states corresponding to the list of transitions.
The class uses lazy evaluation in order to return each of the available parameters.
:param transitions: a list of transitions to extract the batch from
"""
self.transitions = transitions
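A minimal usage sketch of this wrapper, assuming the Transition keyword arguments and the batch accessor names that appear elsewhere in this file:

```python
# Hedged sketch: building a Batch from transitions and lazily extracting arrays.
import numpy as np
from rl_coach.core_types import Transition, Batch

transitions = [
    Transition(state={'observation': np.random.rand(4)},
               action=0,
               reward=1.0,
               next_state={'observation': np.random.rand(4)},
               game_over=False)
    for _ in range(32)
]

batch = Batch(transitions)
states = batch.states(['observation'])   # dict with a stacked 'observation' array
rewards = batch.rewards()                # np.ndarray of shape (32,)
game_overs = batch.game_overs()          # np.ndarray of terminal flags
```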
|
||||
@@ -370,6 +373,7 @@ class Batch(object):
|
||||
def slice(self, start, end) -> None:
|
||||
"""
|
||||
Keep a slice from the batch and discard the rest of the batch
|
||||
|
||||
:param start: the start index in the slice
|
||||
:param end: the end index in the slice
|
||||
:return: None
|
||||
@@ -396,6 +400,7 @@ class Batch(object):
|
||||
def shuffle(self) -> None:
|
||||
"""
|
||||
Shuffle all the transitions in the batch
|
||||
|
||||
:return: None
|
||||
"""
|
||||
batch_order = list(range(self.size))
|
||||
@@ -432,6 +437,7 @@ class Batch(object):
|
||||
"""
|
||||
follow the keys in fetches to extract the corresponding items from the states in the batch
|
||||
if these keys were not already extracted before. return only the values corresponding to those keys
|
||||
|
||||
:param fetches: the keys of the state dictionary to extract
|
||||
:param expand_dims: add an extra dimension to each of the value batches
|
||||
:return: a dictionary containing a batch of values corresponding to each of the given fetches keys
|
||||
@@ -452,6 +458,7 @@ class Batch(object):
|
||||
def actions(self, expand_dims=False) -> np.ndarray:
|
||||
"""
|
||||
if the actions were not converted to a batch before, extract them to a batch and then return the batch
|
||||
|
||||
:param expand_dims: add an extra dimension to the actions batch
|
||||
:return: a numpy array containing all the actions of the batch
|
||||
"""
|
||||
@@ -464,6 +471,7 @@ class Batch(object):
|
||||
def rewards(self, expand_dims=False) -> np.ndarray:
|
||||
"""
|
||||
if the rewards were not converted to a batch before, extract them to a batch and then return the batch
|
||||
|
||||
:param expand_dims: add an extra dimension to the rewards batch
|
||||
:return: a numpy array containing all the rewards of the batch
|
||||
"""
|
||||
@@ -491,6 +499,7 @@ class Batch(object):
|
||||
def game_overs(self, expand_dims=False) -> np.ndarray:
|
||||
"""
|
||||
if the game_overs were not converted to a batch before, extract them to a batch and then return the batch
|
||||
|
||||
:param expand_dims: add an extra dimension to the game_overs batch
|
||||
:return: a numpy array containing all the game over flags of the batch
|
||||
"""
|
||||
@@ -504,6 +513,7 @@ class Batch(object):
|
||||
"""
|
||||
follow the keys in fetches to extract the corresponding items from the next states in the batch
|
||||
if these keys were not already extracted before. return only the values corresponding to those keys
|
||||
|
||||
:param fetches: the keys of the state dictionary to extract
|
||||
:param expand_dims: add an extra dimension to each of the value batches
|
||||
:return: a dictionary containing a batch of values corresponding to each of the given fetches keys
|
||||
@@ -526,6 +536,7 @@ class Batch(object):
|
||||
"""
|
||||
if the goals were not converted to a batch before, extract them to a batch and then return the batch
|
||||
if the goal was not filled, this will raise an exception
|
||||
|
||||
:param expand_dims: add an extra dimension to the goals batch
|
||||
:return: a numpy array containing all the goals of the batch
|
||||
"""
|
||||
@@ -549,6 +560,7 @@ class Batch(object):
|
||||
"""
|
||||
if the given info dictionary key was not converted to a batch before, extract it to a batch and then return the
|
||||
batch. if the key is not part of the keys in the info dictionary, this will raise an exception
|
||||
|
||||
:param expand_dims: add an extra dimension to the info batch
|
||||
:return: a numpy array containing all the info values of the batch corresponding to the given key
|
||||
"""
|
||||
@@ -568,6 +580,7 @@ class Batch(object):
|
||||
def __getitem__(self, key):
|
||||
"""
|
||||
get an item from the transitions list
|
||||
|
||||
:param key: index of the transition in the batch
|
||||
:return: the transition corresponding to the given index
|
||||
"""
|
||||
@@ -576,6 +589,7 @@ class Batch(object):
|
||||
def __setitem__(self, key, item):
|
||||
"""
|
||||
set an item in the transition list
|
||||
|
||||
:param key: index of the transition in the batch
|
||||
:param item: the transition to place in the given index
|
||||
:return: None
|
||||
@@ -598,6 +612,7 @@ class TotalStepsCounter(object):
|
||||
def __getitem__(self, key: Type[StepMethod]) -> int:
|
||||
"""
|
||||
get counter value
|
||||
|
||||
:param key: counter type
|
||||
:return: the counter value
|
||||
"""
|
||||
@@ -606,6 +621,7 @@ class TotalStepsCounter(object):
|
||||
def __setitem__(self, key: StepMethod, item: int) -> None:
|
||||
"""
|
||||
set a counter value
|
||||
|
||||
:param key: counter type
|
||||
:param item: an integer representing the new counter value
|
||||
:return: None
|
||||
@@ -626,6 +642,9 @@ class GradientClippingMethod(Enum):
|
||||
|
||||
|
||||
class Episode(object):
|
||||
"""
|
||||
An Episode represents a set of sequential transitions, that end with a terminal state.
|
||||
"""
|
||||
def __init__(self, discount: float=0.99, bootstrap_total_return_from_old_policy: bool=False, n_step: int=-1):
|
||||
"""
|
||||
:param discount: the discount factor to use when calculating total returns
|
||||
@@ -634,38 +653,78 @@ class Episode(object):
|
||||
:param n_step: the number of future steps to sum the reward over before bootstrapping
|
||||
"""
|
||||
self.transitions = []
|
||||
# a num_transitions x num_transitions table with the n step return in the n'th row
|
||||
self._length = 0
|
||||
self.discount = discount
|
||||
self.bootstrap_total_return_from_old_policy = bootstrap_total_return_from_old_policy
|
||||
self.n_step = n_step
|
||||
self.is_complete = False
|
||||
|
||||
def insert(self, transition):
|
||||
def insert(self, transition: Transition) -> None:
|
||||
"""
|
||||
Insert a new transition to the episode. If the game_over flag in the transition is set to True,
|
||||
the episode will be marked as complete.
|
||||
|
||||
:param transition: The new transition to insert to the episode
|
||||
:return: None
|
||||
"""
|
||||
self.transitions.append(transition)
|
||||
self._length += 1
|
||||
if transition.game_over:
|
||||
self.is_complete = True
|
||||
|
||||
def is_empty(self):
|
||||
def is_empty(self) -> bool:
|
||||
"""
|
||||
Check if the episode is empty
|
||||
|
||||
:return: A boolean value determining if the episode is empty or not
|
||||
"""
|
||||
return self.length() == 0
|
||||
|
||||
def length(self):
|
||||
def length(self) -> int:
|
||||
"""
|
||||
Return the length of the episode, which is the number of transitions it holds.
|
||||
|
||||
:return: The number of transitions in the episode
|
||||
"""
|
||||
return self._length
|
||||
|
||||
def __len__(self):
|
||||
return self.length()
|
||||
|
||||
def get_transition(self, transition_idx):
|
||||
def get_transition(self, transition_idx: int) -> Transition:
|
||||
"""
|
||||
Get a specific transition by its index.
|
||||
|
||||
:param transition_idx: The index of the transition to get
|
||||
:return: The transition which is stored in the given index
|
||||
"""
|
||||
return self.transitions[transition_idx]
|
||||
|
||||
def get_last_transition(self):
def get_last_transition(self) -> Transition:
"""
Get the last transition in the episode, or None if there are no transitions available

:return: The last transition in the episode
"""
return self.get_transition(-1) if self.length() > 0 else None

def get_first_transition(self):
def get_first_transition(self) -> Transition:
"""
Get the first transition in the episode, or None if there are no transitions available

:return: The first transition in the episode
"""
return self.get_transition(0) if self.length() > 0 else None
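A short usage sketch of the Episode API, also illustrating the n-step bootstrapping described for update_discounted_rewards further below (Transition keyword arguments are assumed from the class name):

```python
# Hedged sketch: filling an Episode and computing its discounted returns.
from rl_coach.core_types import Transition, Episode

episode = Episode(discount=0.99, n_step=-1)   # n_step=-1 -> full-episode return
for step in range(5):
    episode.insert(Transition(state={'observation': [step]},
                              action=0,
                              reward=1.0,
                              game_over=(step == 4)))

assert episode.is_complete and episode.length() == 5
episode.update_discounted_rewards()

# For reference, the full discounted return of the first transition is
# sum(0.99 ** k * 1.0 for k in range(5)) ~= 4.901
```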
|
||||
|
||||
def update_discounted_rewards(self):
|
||||
"""
|
||||
Update the discounted returns for all the transitions in the episode.
|
||||
The returns will be calculated according to the rewards of each transition, together with the number of steps
|
||||
to bootstrap from and the discount factor, as defined by n_step and discount respectively when initializing
|
||||
the episode.
|
||||
|
||||
:return: None
|
||||
"""
|
||||
if self.n_step == -1 or self.n_step > self.length():
|
||||
curr_n_step = self.length()
|
||||
else:
|
||||
@@ -708,15 +767,17 @@ class Episode(object):
|
||||
|
||||
self.update_discounted_rewards()
|
||||
|
||||
def update_actions_probabilities(self):
|
||||
probability_product = 1
|
||||
for transition_idx, transition in enumerate(self.transitions):
|
||||
if 'action_probabilities' in transition.info.keys():
|
||||
probability_product *= transition.info['action_probabilities']
|
||||
for transition_idx, transition in enumerate(self.transitions):
|
||||
transition.info['probability_product'] = probability_product
|
||||
|
||||
def get_transitions_attribute(self, attribute_name):
|
||||
|
||||
def get_transitions_attribute(self, attribute_name: str) -> List[Any]:
|
||||
"""
|
||||
Get the values for some transition attribute from all the transitions in the episode.
|
||||
For example, this allows getting the rewards for all the transitions as a list by calling
|
||||
get_transitions_attribute('reward')
|
||||
|
||||
:param attribute_name: The name of the attribute to extract from all the transitions
|
||||
:return: A list of values from all the transitions according to the attribute given in attribute_name
|
||||
"""
|
||||
if len(self.transitions) > 0 and hasattr(self.transitions[0], attribute_name):
|
||||
return [getattr(t, attribute_name) for t in self.transitions]
|
||||
elif len(self.transitions) == 0:
|
||||
@@ -724,12 +785,6 @@ class Episode(object):
|
||||
else:
|
||||
raise ValueError("The transitions have no such attribute name")
|
||||
|
||||
def to_batch(self):
|
||||
batch = []
|
||||
for i in range(self.length()):
|
||||
batch.append(self.get_transition(i))
|
||||
return batch
|
||||
|
||||
def __getitem__(self, sliced):
|
||||
return self.transitions[sliced]
|
||||
|
||||
|
||||
@@ -69,6 +69,38 @@ class ControlSuiteEnvironment(Environment):
|
||||
target_success_rate: float=1.0, seed: Union[None, int]=None, human_control: bool=False,
|
||||
observation_type: ObservationType=ObservationType.Measurements,
|
||||
custom_reward_threshold: Union[int, float]=None, **kwargs):
|
||||
"""
|
||||
:param level: (str)
|
||||
A string representing the control suite level to run. This can also be a LevelSelection object.
|
||||
For example, cartpole:swingup.
|
||||
|
||||
:param frame_skip: (int)
|
||||
The number of frames to skip between any two actions given by the agent. The action will be repeated
|
||||
for all the skipped frames.
|
||||
|
||||
:param visualization_parameters: (VisualizationParameters)
|
||||
The parameters used for visualizing the environment, such as the render flag, storing videos etc.
|
||||
|
||||
:param target_success_rate: (float)
|
||||
Stop experiment if given target success rate was achieved.
|
||||
|
||||
:param seed: (int)
|
||||
A seed to use for the random number generator when running the environment.
|
||||
|
||||
:param human_control: (bool)
|
||||
A flag that allows controlling the environment using the keyboard keys.
|
||||
|
||||
:param observation_type: (ObservationType)
|
||||
An enum which defines which observation to use. The current options are to use:
|
||||
* Measurements only - a vector of joint torques and similar measurements
|
||||
* Image only - an image of the environment as seen by a camera attached to the simulator
|
||||
* Measurements & Image - both types of observations will be returned in the state using the keys
|
||||
'measurements' and 'pixels' respectively.
|
||||
|
||||
:param custom_reward_threshold: (float)
|
||||
Allows defining a custom reward threshold that will be used to decide when the agent succeeded in passing the environment.
|
||||
|
||||
"""
|
||||
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters, target_success_rate)
|
||||
|
||||
self.observation_type = observation_type
|
||||
|
||||
@@ -125,6 +125,36 @@ class DoomEnvironment(Environment):
|
||||
def __init__(self, level: LevelSelection, seed: int, frame_skip: int, human_control: bool,
|
||||
custom_reward_threshold: Union[int, float], visualization_parameters: VisualizationParameters,
|
||||
cameras: List[CameraTypes], target_success_rate: float=1.0, **kwargs):
|
||||
"""
|
||||
:param level: (str)
|
||||
A string representing the doom level to run. This can also be a LevelSelection object.
|
||||
This should be one of the levels defined in the DoomLevel enum. For example, HEALTH_GATHERING.
|
||||
|
||||
:param seed: (int)
|
||||
A seed to use for the random number generator when running the environment.
|
||||
|
||||
:param frame_skip: (int)
|
||||
The number of frames to skip between any two actions given by the agent. The action will be repeated
|
||||
for all the skipped frames.
|
||||
|
||||
:param human_control: (bool)
|
||||
A flag that allows controlling the environment using the keyboard keys.
|
||||
|
||||
:param custom_reward_threshold: (float)
|
||||
Allows defining a custom reward threshold that will be used to decide when the agent succeeded in passing the environment.
|
||||
|
||||
:param visualization_parameters: (VisualizationParameters)
|
||||
The parameters used for visualizing the environment, such as the render flag, storing videos etc.
|
||||
|
||||
:param cameras: (List[CameraTypes])
|
||||
A list of camera types to use as observation in the state returned from the environment.
|
||||
Each camera should be an enum from CameraTypes, and there are several options like an RGB observation,
|
||||
a depth map, a segmentation map, and a top-down map of the environment.
|
||||
|
||||
:param target_success_rate: (float)
|
||||
Stop experiment if given target success rate was achieved.
|
||||
|
||||
"""
|
||||
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters, target_success_rate)
|
||||
|
||||
self.cameras = cameras
|
||||
|
||||
@@ -176,6 +176,7 @@ class Environment(EnvironmentInterface):
|
||||
def action_space(self) -> Union[List[ActionSpace], ActionSpace]:
|
||||
"""
|
||||
Get the action space of the environment
|
||||
|
||||
:return: the action space
|
||||
"""
|
||||
return self._action_space
|
||||
@@ -184,6 +185,7 @@ class Environment(EnvironmentInterface):
|
||||
def action_space(self, val: Union[List[ActionSpace], ActionSpace]):
|
||||
"""
|
||||
Set the action space of the environment
|
||||
|
||||
:return: None
|
||||
"""
|
||||
self._action_space = val
|
||||
@@ -192,6 +194,7 @@ class Environment(EnvironmentInterface):
|
||||
def state_space(self) -> Union[List[StateSpace], StateSpace]:
|
||||
"""
|
||||
Get the state space of the environment
|
||||
|
||||
:return: the observation space
|
||||
"""
|
||||
return self._state_space
|
||||
@@ -200,6 +203,7 @@ class Environment(EnvironmentInterface):
|
||||
def state_space(self, val: Union[List[StateSpace], StateSpace]):
|
||||
"""
|
||||
Set the state space of the environment
|
||||
|
||||
:return: None
|
||||
"""
|
||||
self._state_space = val
|
||||
@@ -208,6 +212,7 @@ class Environment(EnvironmentInterface):
|
||||
def goal_space(self) -> Union[List[ObservationSpace], ObservationSpace]:
|
||||
"""
|
||||
Get the goal space of the environment

:return: the goal space
|
||||
"""
|
||||
return self._goal_space
|
||||
@@ -216,6 +221,7 @@ class Environment(EnvironmentInterface):
|
||||
def goal_space(self, val: Union[List[ObservationSpace], ObservationSpace]):
|
||||
"""
|
||||
Set the goal space of the environment
|
||||
|
||||
:return: None
|
||||
"""
|
||||
self._goal_space = val
|
||||
@@ -223,6 +229,7 @@ class Environment(EnvironmentInterface):
|
||||
def get_action_from_user(self) -> ActionType:
|
||||
"""
|
||||
Get an action from the user keyboard
|
||||
|
||||
:return: action index
|
||||
"""
|
||||
if self.wait_for_explicit_human_action:
|
||||
@@ -250,6 +257,7 @@ class Environment(EnvironmentInterface):
|
||||
def last_env_response(self) -> Union[List[EnvResponse], EnvResponse]:
|
||||
"""
|
||||
Get the last environment response
|
||||
|
||||
:return: a dictionary that contains the state, reward, etc.
|
||||
"""
|
||||
return squeeze_list(self._last_env_response)
|
||||
@@ -258,6 +266,7 @@ class Environment(EnvironmentInterface):
|
||||
def last_env_response(self, val: Union[List[EnvResponse], EnvResponse]):
|
||||
"""
|
||||
Set the last environment response
|
||||
|
||||
:param val: the last environment response
|
||||
"""
|
||||
self._last_env_response = force_list(val)
|
||||
@@ -265,6 +274,7 @@ class Environment(EnvironmentInterface):
|
||||
def step(self, action: ActionType) -> EnvResponse:
|
||||
"""
|
||||
Make a single step in the environment using the given action
|
||||
|
||||
:param action: an action to use for stepping the environment. Should follow the definition of the action space.
|
||||
:return: the environment response as returned in get_last_env_response
|
||||
"""
|
||||
@@ -317,6 +327,8 @@ class Environment(EnvironmentInterface):
|
||||
def render(self) -> None:
|
||||
"""
|
||||
Call the environment function for rendering to the screen
|
||||
|
||||
:return: None
|
||||
"""
|
||||
if self.native_rendering:
|
||||
self._render()
|
||||
@@ -326,6 +338,7 @@ class Environment(EnvironmentInterface):
|
||||
def handle_episode_ended(self) -> None:
|
||||
"""
|
||||
End an episode
|
||||
|
||||
:return: None
|
||||
"""
|
||||
self.dump_video_of_last_episode_if_needed()
|
||||
@@ -333,6 +346,7 @@ class Environment(EnvironmentInterface):
|
||||
def reset_internal_state(self, force_environment_reset=False) -> EnvResponse:
|
||||
"""
|
||||
Reset the environment and all the variable of the wrapper
|
||||
|
||||
:param force_environment_reset: forces environment reset even when the game did not end
|
||||
:return: A dictionary containing the observation, reward, done flag, action and measurements
|
||||
"""
|
||||
@@ -368,6 +382,7 @@ class Environment(EnvironmentInterface):
|
||||
def get_random_action(self) -> ActionType:
|
||||
"""
|
||||
Returns an action picked uniformly from the available actions
|
||||
|
||||
:return: a numpy array with a random action
|
||||
"""
|
||||
return self.action_space.sample()
|
||||
@@ -375,6 +390,7 @@ class Environment(EnvironmentInterface):
|
||||
def get_available_keys(self) -> List[Tuple[str, ActionType]]:
|
||||
"""
|
||||
Return a list of tuples mapping between action names and the keyboard key that triggers them
|
||||
|
||||
:return: a list of tuples mapping between action names and the keyboard key that triggers them
|
||||
"""
|
||||
available_keys = []
|
||||
@@ -391,6 +407,7 @@ class Environment(EnvironmentInterface):
|
||||
def get_goal(self) -> GoalType:
|
||||
"""
|
||||
Get the current goal that the agents needs to achieve in the environment
|
||||
|
||||
:return: The goal
|
||||
"""
|
||||
return self.goal
|
||||
@@ -398,6 +415,7 @@ class Environment(EnvironmentInterface):
|
||||
def set_goal(self, goal: GoalType) -> None:
|
||||
"""
|
||||
Set the current goal that the agent needs to achieve in the environment
|
||||
|
||||
:param goal: the goal that needs to be achieved
|
||||
:return: None
|
||||
"""
|
||||
@@ -424,14 +442,6 @@ class Environment(EnvironmentInterface):
|
||||
if self.visualization_parameters.dump_mp4:
|
||||
logger.create_mp4(self.last_episode_images[::frame_skipping], name=file_name, fps=fps)
|
||||
|
||||
def log_to_screen(self):
|
||||
# log to screen
|
||||
log = OrderedDict()
|
||||
log["Episode"] = self.episode_idx
|
||||
log["Total reward"] = np.round(self.total_reward_in_current_episode, 2)
|
||||
log["Steps"] = self.total_steps_counter
|
||||
screen.log_dict(log, prefix=self.phase.value)
|
||||
|
||||
# The following functions define the interaction with the environment.
|
||||
# Any new environment that inherits the Environment class should use these signatures.
|
||||
# Some of these functions are optional - please read their description for more details.
|
||||
@@ -439,6 +449,7 @@ class Environment(EnvironmentInterface):
|
||||
def _take_action(self, action_idx: ActionType) -> None:
|
||||
"""
|
||||
An environment dependent function that sends an action to the simulator.
|
||||
|
||||
:param action_idx: the action to perform on the environment
|
||||
:return: None
|
||||
"""
|
||||
@@ -448,6 +459,7 @@ class Environment(EnvironmentInterface):
|
||||
"""
|
||||
Updates the state from the environment.
|
||||
Should update self.observation, self.reward, self.done, self.measurements and self.info
|
||||
|
||||
:return: None
|
||||
"""
|
||||
raise NotImplementedError("")
|
||||
@@ -455,6 +467,7 @@ class Environment(EnvironmentInterface):
|
||||
def _restart_environment_episode(self, force_environment_reset=False) -> None:
|
||||
"""
|
||||
Restarts the simulator episode
|
||||
|
||||
:param force_environment_reset: Force the environment to reset even if the episode is not done yet.
|
||||
:return: None
|
||||
"""
|
||||
@@ -463,6 +476,7 @@ class Environment(EnvironmentInterface):
|
||||
def _render(self) -> None:
|
||||
"""
|
||||
Renders the environment using the native simulator renderer
|
||||
|
||||
:return: None
|
||||
"""
|
||||
pass
|
||||
@@ -471,6 +485,7 @@ class Environment(EnvironmentInterface):
|
||||
"""
|
||||
Return a numpy array containing the image that will be rendered to the screen.
|
||||
This can be different from the observation. For example, mujoco's observation is a measurements vector.
|
||||
|
||||
:return: numpy array containing the image that will be rendered to the screen
|
||||
"""
|
||||
return np.transpose(self.state['observation'], [1, 2, 0])
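The private methods documented above are the integration points a new environment is expected to override. A minimal sketch under stated assumptions (a toy counter stands in for a real simulator, and a real subclass would also provide an __init__ and matching environment parameters):

```python
# Hedged sketch: the minimal overrides a custom environment would provide,
# following the signatures documented above. The "simulator" is just a counter.
import numpy as np
from rl_coach.environments.environment import Environment

class CountingEnvironment(Environment):
    def _take_action(self, action_idx):
        # apply the action to the (toy) simulator state
        self._counter = getattr(self, '_counter', 0) + int(action_idx)

    def _update_state(self):
        # expose the simulator outcome through the fields Coach reads
        self.state = {'observation': np.array([self._counter], dtype=np.float32)}
        self.reward = 1.0 if self._counter % 2 == 0 else 0.0
        self.done = self._counter >= 10

    def _restart_environment_episode(self, force_environment_reset=False):
        self._counter = 0

    def _render(self):
        print("counter =", getattr(self, '_counter', 0))
```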
|
||||
|
||||
@@ -140,7 +140,7 @@ atari_schedule = ScheduleParameters()
|
||||
atari_schedule.improve_steps = EnvironmentSteps(50000000)
|
||||
atari_schedule.steps_between_evaluation_periods = EnvironmentSteps(250000)
|
||||
atari_schedule.evaluation_steps = EnvironmentSteps(135000)
|
||||
atari_schedule.heatup_steps = EnvironmentSteps(50000)
|
||||
atari_schedule.heatup_steps = EnvironmentSteps(1)
|
||||
|
||||
|
||||
class MaxOverFramesAndFrameskipEnvWrapper(gym.Wrapper):
|
||||
@@ -181,6 +181,41 @@ class GymEnvironment(Environment):
|
||||
target_success_rate: float=1.0, additional_simulator_parameters: Dict[str, Any] = {}, seed: Union[None, int]=None,
|
||||
human_control: bool=False, custom_reward_threshold: Union[int, float]=None,
|
||||
random_initialization_steps: int=1, max_over_num_frames: int=1, **kwargs):
|
||||
"""
|
||||
:param level: (str)
|
||||
A string representing the gym level to run. This can also be a LevelSelection object.
|
||||
For example, BreakoutDeterministic-v0
|
||||
|
||||
:param frame_skip: (int)
|
||||
The number of frames to skip between any two actions given by the agent. The action will be repeated
|
||||
for all the skipped frames.
|
||||
|
||||
:param visualization_parameters: (VisualizationParameters)
|
||||
The parameters used for visualizing the environment, such as the render flag, storing videos etc.
|
||||
|
||||
:param additional_simulator_parameters: (Dict[str, Any])
|
||||
Any additional parameters that the user can pass to the Gym environment. These parameters should be
|
||||
accepted by the __init__ function of the implemented Gym environment.
|
||||
|
||||
:param seed: (int)
|
||||
A seed to use for the random number generator when running the environment.
|
||||
|
||||
:param human_control: (bool)
|
||||
A flag that allows controlling the environment using the keyboard keys.
|
||||
|
||||
:param custom_reward_threshold: (float)
|
||||
Allows defining a custom reward threshold that will be used to decide when the agent succeeded in passing the environment.
|
||||
If not set, this value will be taken from the Gym environment definition.
|
||||
|
||||
:param random_initialization_steps: (int)
|
||||
The number of random steps that will be taken in the environment after each reset.
|
||||
This is a feature presented in the DQN paper, which improves the variability of the episodes the agent sees.
|
||||
|
||||
:param max_over_num_frames: (int)
|
||||
This value will be used for merging multiple frames into a single frame by taking the maximum value for each
|
||||
of the pixels in the frame. This is particularly used in Atari games, where the frames flicker, and objects
|
||||
can be seen in one frame but disappear in the next.
|
||||
"""
|
||||
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold,
|
||||
visualization_parameters, target_success_rate)
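In a preset, these constructor parameters are normally supplied through an environment parameters object rather than by instantiating GymEnvironment directly. A minimal sketch, assuming the GymVectorEnvironment helper and an illustrative level name:

```python
# Hedged sketch: selecting a Gym level in a preset.
from rl_coach.environments.gym_environment import GymVectorEnvironment

env_params = GymVectorEnvironment(level='CartPole-v0')
env_params.frame_skip = 1
env_params.seed = 123
# env_params is later handed to a GraphManager together with the agent parameters
```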
|
||||
|
||||
|
||||
@@ -13,3 +13,43 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from .additive_noise import AdditiveNoiseParameters, AdditiveNoise
|
||||
from .boltzmann import BoltzmannParameters, Boltzmann
|
||||
from .bootstrapped import BootstrappedParameters, Bootstrapped
|
||||
from .categorical import CategoricalParameters, Categorical
|
||||
from .continuous_entropy import ContinuousEntropyParameters, ContinuousEntropy
|
||||
from .e_greedy import EGreedyParameters, EGreedy
|
||||
from .exploration_policy import ExplorationParameters, ExplorationPolicy
|
||||
from .greedy import GreedyParameters, Greedy
|
||||
from .ou_process import OUProcessParameters, OUProcess
|
||||
from .parameter_noise import ParameterNoiseParameters, ParameterNoise
|
||||
from .truncated_normal import TruncatedNormalParameters, TruncatedNormal
|
||||
from .ucb import UCBParameters, UCB
|
||||
|
||||
__all__ = [
|
||||
'AdditiveNoiseParameters',
|
||||
'AdditiveNoise',
|
||||
'BoltzmannParameters',
|
||||
'Boltzmann',
|
||||
'BootstrappedParameters',
|
||||
'Bootstrapped',
|
||||
'CategoricalParameters',
|
||||
'Categorical',
|
||||
'ContinuousEntropyParameters',
|
||||
'ContinuousEntropy',
|
||||
'EGreedyParameters',
|
||||
'EGreedy',
|
||||
'ExplorationParameters',
|
||||
'ExplorationPolicy',
|
||||
'GreedyParameters',
|
||||
'Greedy',
|
||||
'OUProcessParameters',
|
||||
'OUProcess',
|
||||
'ParameterNoiseParameters',
|
||||
'ParameterNoise',
|
||||
'TruncatedNormalParameters',
|
||||
'TruncatedNormal',
|
||||
'UCBParameters',
|
||||
'UCB'
|
||||
]
|
||||
|
||||
@@ -37,6 +37,14 @@ class AdditiveNoiseParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class AdditiveNoise(ExplorationPolicy):
|
||||
"""
|
||||
AdditiveNoise is an exploration policy intended for continuous action spaces. It takes the action from the agent
and adds Gaussian-distributed noise to it. The amount of noise added to the action can be given in two different ways:
1. Specified by the user as a noise schedule which is taken in percentiles out of the action space size
2. Specified by the agent's action. In case the agent's action is a list with 2 values, the 1st one is assumed to
be the mean of the action, and the 2nd is assumed to be its standard deviation.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
|
||||
evaluation_noise_percentage: float):
|
||||
"""
|
||||
|
||||
@@ -36,6 +36,12 @@ class BoltzmannParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class Boltzmann(ExplorationPolicy):
|
||||
"""
|
||||
The Boltzmann exploration policy is intended for discrete action spaces. It assumes that each of the possible
|
||||
actions has some value assigned to it (such as the Q value), and uses a softmax function to convert these values
|
||||
into a distribution over the actions. It then samples the action for playing out of the calculated distribution.
|
||||
An additional temperature schedule can be given by the user, and will control the steepness of the softmax function.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, temperature_schedule: Schedule):
|
||||
"""
|
||||
:param action_space: the action space used by the environment
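The temperature-controlled softmax described above can be illustrated with plain numpy; this is only a sketch of the math, not the policy's actual implementation:

```python
# Hedged sketch of the Boltzmann selection rule: a softmax over action values,
# where a higher temperature flattens the resulting distribution.
import numpy as np

def boltzmann_probabilities(action_values, temperature):
    logits = np.asarray(action_values, dtype=np.float64) / temperature
    logits -= logits.max()              # numerical stability
    exp_logits = np.exp(logits)
    return exp_logits / exp_logits.sum()

q_values = [1.0, 2.0, 0.5]
print(boltzmann_probabilities(q_values, temperature=1.0))   # peaked on action 1
print(boltzmann_probabilities(q_values, temperature=10.0))  # close to uniform
```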
|
||||
|
||||
@@ -39,6 +39,17 @@ class BootstrappedParameters(EGreedyParameters):
|
||||
|
||||
|
||||
class Bootstrapped(EGreedy):
|
||||
"""
|
||||
Bootstrapped exploration policy is currently only used for discrete action spaces along with the
|
||||
Bootstrapped DQN agent. It assumes that there is an ensemble of network heads, where each one predicts the
|
||||
values for all the possible actions. For each episode, a single head is selected to lead the agent, according
|
||||
to its value predictions. In evaluation, the action is selected using a majority vote over all the heads'
|
||||
predictions.
|
||||
|
||||
.. note::
|
||||
This exploration policy will only work for Discrete action spaces with Bootstrapped DQN style agents,
|
||||
since it requires the agent to have a network with multiple heads.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
|
||||
architecture_num_q_heads: int,
|
||||
continuous_exploration_policy_parameters: ExplorationParameters = AdditiveNoiseParameters(),):
|
||||
|
||||
@@ -30,6 +30,12 @@ class CategoricalParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class Categorical(ExplorationPolicy):
|
||||
"""
|
||||
Categorical exploration policy is intended for discrete action spaces. It expects the action values to
|
||||
represent a probability distribution over the actions, from which a single action will be sampled.
In evaluation, the action that has the highest probability will be selected. This is particularly useful for
actor-critic schemes, where the actor's output is a probability distribution over the actions.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace):
|
||||
"""
|
||||
:param action_space: the action space used by the environment
|
||||
|
||||
@@ -24,4 +24,15 @@ class ContinuousEntropyParameters(AdditiveNoiseParameters):
|
||||
|
||||
|
||||
class ContinuousEntropy(AdditiveNoise):
|
||||
"""
|
||||
Continuous entropy is an exploration policy that is actually implemented as part of the network.
|
||||
The exploration policy class is only a placeholder for choosing this policy. The exploration policy is
|
||||
implemented by adding a regularization factor to the network loss, which regularizes the entropy of the action.
|
||||
This exploration policy is only intended for continuous action spaces, and assumes that the entire calculation
|
||||
is implemented as part of the head.
|
||||
|
||||
.. warning::
|
||||
This exploration policy expects the agent or the network to implement the exploration functionality.
|
||||
There are only a few heads that actually are relevant and implement the entropy regularization factor.
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -43,6 +43,19 @@ class EGreedyParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class EGreedy(ExplorationPolicy):
|
||||
"""
|
||||
e-greedy is an exploration policy that is intended for both discrete and continuous action spaces.
|
||||
|
||||
For discrete action spaces, it assumes that each action is assigned a value, and it selects the action with the
highest value with probability 1 - epsilon. Otherwise, it selects an action sampled uniformly out of all the
possible actions. The epsilon value is given by the user and can be given as a schedule.
In evaluation, a different epsilon value can be specified.

For continuous action spaces, it assumes that the mean action is given by the agent. With probability epsilon,
it samples a random action from within the action space bounds. Otherwise, it selects the action according to a
given continuous exploration policy, which is set to AdditiveNoise by default. In evaluation, the action is
always selected according to the given continuous exploration policy (where its phase is set to evaluation as well).
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule,
|
||||
evaluation_epsilon: float,
|
||||
continuous_exploration_policy_parameters: ExplorationParameters=AdditiveNoiseParameters()):
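A minimal sketch of configuring e-greedy exploration in a preset, assuming LinearSchedule from rl_coach.schedules and illustrative decay values:

```python
# Hedged sketch: epsilon decays linearly from 1.0 to 0.01 over 10000 steps,
# with a fixed 0.001 epsilon during evaluation.
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.schedules import LinearSchedule

exploration_params = EGreedyParameters()
exploration_params.epsilon_schedule = LinearSchedule(1.0, 0.01, 10000)
exploration_params.evaluation_epsilon = 0.001
# typically assigned to an agent via: agent_params.exploration = exploration_params
```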
|
||||
|
||||
@@ -31,6 +31,10 @@ class ExplorationParameters(Parameters):
|
||||
|
||||
|
||||
class ExplorationPolicy(object):
|
||||
"""
|
||||
An exploration policy takes the predicted actions or action values from the agent, and selects the action to
|
||||
actually apply to the environment using some predefined algorithm.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace):
|
||||
"""
|
||||
:param action_space: the action space used by the environment
|
||||
|
||||
@@ -30,6 +30,11 @@ class GreedyParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class Greedy(ExplorationPolicy):
|
||||
"""
|
||||
The Greedy exploration policy is intended for both discrete and continuous action spaces.
|
||||
For discrete action spaces, it always selects the action with the maximum value, as given by the agent.
|
||||
For continuous action spaces, it always returns the exact action, as it was given by the agent.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace):
|
||||
"""
|
||||
:param action_space: the action space used by the environment
|
||||
|
||||
@@ -40,6 +40,11 @@ class OUProcessParameters(ExplorationParameters):
|
||||
|
||||
# Ornstein-Uhlenbeck process
|
||||
class OUProcess(ExplorationPolicy):
|
||||
"""
|
||||
OUProcess exploration policy is intended for continuous action spaces, and selects the action according to
|
||||
an Ornstein-Uhlenbeck process. The Ornstein-Uhlenbeck process implements the action as a Gaussian process, where
|
||||
the samples are correlated between consecutive time steps.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, mu: float=0, theta: float=0.15, sigma: float=0.2, dt: float=0.01):
|
||||
"""
|
||||
:param action_space: the action space used by the environment
|
||||
|
||||
@@ -42,10 +42,18 @@ class ParameterNoiseParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class ParameterNoise(ExplorationPolicy):
|
||||
"""
|
||||
The ParameterNoise exploration policy is intended for both discrete and continuous action spaces.
|
||||
It applies the exploration policy by replacing all the dense network layers with noisy layers.
|
||||
The noisy layers have both weight means and weight standard deviations, and for each forward pass of the network
|
||||
the weights are sampled from a normal distribution that follows the learned weights mean and standard deviation
|
||||
values.
|
||||
|
||||
Warning: currently supported only by DQN variants
|
||||
"""
|
||||
def __init__(self, network_params: Dict[str, NetworkParameters], action_space: ActionSpace):
|
||||
"""
|
||||
:param action_space: the action space used by the environment
|
||||
:param alpha0:
|
||||
"""
|
||||
super().__init__(action_space)
|
||||
self.network_params = network_params
|
||||
|
||||
@@ -39,6 +39,16 @@ class TruncatedNormalParameters(ExplorationParameters):
|
||||
|
||||
|
||||
class TruncatedNormal(ExplorationPolicy):
|
||||
"""
|
||||
The TruncatedNormal exploration policy is intended for continuous action spaces. It samples the action from a
normal distribution, where the mean action is given by the agent, and the standard deviation can be given in
two different ways:
1. Specified by the user as a noise schedule which is taken in percentiles out of the action space size
2. Specified by the agent's action. In case the agent's action is a list with 2 values, the 1st one is assumed to
be the mean of the action, and the 2nd is assumed to be its standard deviation.
When the sampled action is outside of the action bounds given by the user, it is sampled again and again, until it
is within the bounds.
|
||||
"""
|
||||
def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
|
||||
evaluation_noise_percentage: float, clip_low: float, clip_high: float):
|
||||
"""
|
||||
|
||||
@@ -43,6 +43,15 @@ class UCBParameters(EGreedyParameters):


class UCB(EGreedy):
"""
The UCB exploration policy follows the upper confidence bound heuristic to sample actions in discrete action spaces.
It assumes that there are multiple network heads that are predicting action values, and that the standard deviation
between the heads' predictions represents the uncertainty of the agent in each of the actions.
It then updates the action value estimates to be mean(actions)+lambda*stdev(actions), where lambda is
given by the user. This exploration policy aims to take advantage of the uncertainty of the agent in its predictions,
and selects the action according to the tradeoff between how uncertain the agent is, and how large it predicts
the outcome from those actions to be.
"""
def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
architecture_num_q_heads: int, lamb: int,
continuous_exploration_policy_parameters: ExplorationParameters = AdditiveNoiseParameters()):

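The mean(actions)+lambda*stdev(actions) update can be illustrated with a small numpy sketch (not the Coach implementation; `q_heads` is a hypothetical matrix of per-head value predictions):

```python
import numpy as np

q_heads = np.random.randn(5, 4)  # 5 network heads, each predicting values for 4 actions
lamb = 1.0                       # the 'lamb' tradeoff coefficient from the constructor above

ucb_values = q_heads.mean(axis=0) + lamb * q_heads.std(axis=0)
action = int(np.argmax(ucb_values))  # prefer actions that are promising and/or uncertain
```
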
@@ -0,0 +1,14 @@
from .attention_discretization import AttentionDiscretization
from .box_discretization import BoxDiscretization
from .box_masking import BoxMasking
from .full_discrete_action_space_map import FullDiscreteActionSpaceMap
from .linear_box_to_box_map import LinearBoxToBoxMap
from .partial_discrete_action_space_map import PartialDiscreteActionSpaceMap
__all__ = [
'AttentionDiscretization',
'BoxDiscretization',
'BoxMasking',
'FullDiscreteActionSpaceMap',
'LinearBoxToBoxMap',
'PartialDiscreteActionSpaceMap'
]
@@ -25,11 +25,18 @@ from rl_coach.spaces import AttentionActionSpace, BoxActionSpace, DiscreteAction

class AttentionDiscretization(PartialDiscreteActionSpaceMap):
"""
Given a box action space, this is used to discretize the space.
The discretization is achieved by creating a grid in the space with num_bins_per_dimension bins per dimension in the
space. Each discrete action is mapped to a single sub-box in the BoxActionSpace action space.
Discretizes an **AttentionActionSpace**. The attention action space defines the actions
as choosing sub-boxes in a given box. For example, consider an image of size 100x100, where the action is choosing
a crop window of size 20x20 to attend to in the image. AttentionDiscretization allows discretizing the possible crop
windows to choose from into a finite number of options, and maps a discrete action space into those crop windows.

Warning! This will currently only work for attention spaces with 2 dimensions.
"""
def __init__(self, num_bins_per_dimension: Union[int, List[int]], force_int_bins=False):
"""
:param num_bins_per_dimension: Number of discrete bins to use for each dimension of the action space
:param force_int_bins: If set to True, all the bins will represent integer coordinates in space.
"""
# we allow specifying either a single number for all dimensions, or a single number per dimension in the target
# action space
self.num_bins_per_dimension = num_bins_per_dimension

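For the 100x100 image with 20x20 crop windows mentioned above, the discretization can be pictured roughly as follows (an illustrative sketch, not the Coach implementation):

```python
import numpy as np

bins_per_dim = 5                                  # 5x5 grid -> 25 discrete actions
corners = np.linspace(0, 100 - 20, bins_per_dim)  # top-left corners: [0, 20, 40, 60, 80]
crop_windows = [(int(y), int(x), int(y) + 20, int(x) + 20)  # (top, left, bottom, right)
                for y in corners for x in corners]

crop_windows[12]  # the crop window chosen by discrete action 12 -> (40, 40, 60, 60)
```
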
@@ -25,9 +25,12 @@ from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace

class BoxDiscretization(PartialDiscreteActionSpaceMap):
"""
Given a box action space, this is used to discretize the space.
The discretization is achieved by creating a grid in the space with num_bins_per_dimension bins per dimension in the
space. Each discrete action is mapped to a single N dimensional action in the BoxActionSpace action space.
Discretizes a continuous action space into a discrete action space, allowing the usage of
agents such as DQN for continuous environments such as MuJoCo. Given the number of bins to discretize into, the
original continuous action space is uniformly separated into the given number of bins, each mapped to a discrete
action index. Each discrete action is mapped to a single N dimensional action in the BoxActionSpace action space.
For example, if the original action space is between -1 and 1 and 5 bins were selected, the new action
space will consist of 5 actions mapped to -1, -0.5, 0, 0.5 and 1.
"""
def __init__(self, num_bins_per_dimension: Union[int, List[int]], force_int_bins=False):
"""

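The -1 to 1 example above can be sketched as follows (illustrative only):

```python
import numpy as np

bins = np.linspace(-1.0, 1.0, num=5)   # [-1. , -0.5,  0. ,  0.5,  1. ]

def discrete_to_continuous(discrete_action: int) -> float:
    # each discrete action index maps to one point on the uniform grid
    return float(bins[discrete_action])

discrete_to_continuous(3)  # -> 0.5
```
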
@@ -25,12 +25,10 @@ from rl_coach.spaces import BoxActionSpace

class BoxMasking(ActionFilter):
"""
Masks a box action space by allowing only selecting a subset of the space
For example,
- the target action space has actions of shape 1 with values between 10 and 32
- we mask the target action space so that only the actions 20 to 25 can be chosen
The actions will be between 0 and 5 and the mapping will add an offset of 20 to the incoming actions
The shape of the source and target action spaces is always the same
Masks part of the action space to force the agent to work in a defined part of the space. For example,
if the original action space is between -1 and 1, then this filter can be used in order to constrain the agent actions
to the range 0 and 1 instead. This essentially masks the range -1 and 0 from the agent.
The resulting action space will be shifted and will always start from 0 and have the size of the unmasked area.
"""
def __init__(self,
masked_target_space_low: Union[None, int, float, np.ndarray],

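Using the 10-32 / 20-25 example from the original docstring, the mapping amounts to a simple shift (an illustrative sketch, not the Coach implementation):

```python
import numpy as np

masked_low, masked_high = 20.0, 25.0   # the allowed sub-range of the target space

def agent_to_env_action(agent_action: float) -> float:
    # the agent acts in [0, 5]; shift it back into the unmasked range [20, 25]
    return float(np.clip(agent_action + masked_low, masked_low, masked_high))

agent_to_env_action(3.0)  # -> 23.0
```
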
@@ -20,7 +20,9 @@ from rl_coach.spaces import ActionSpace, DiscreteActionSpace

class FullDiscreteActionSpaceMap(PartialDiscreteActionSpaceMap):
"""
Maps all the actions in the output space to discrete actions in the action space.
Full map of two countable action spaces. This works in a similar way to the
PartialDiscreteActionSpaceMap, but maps the entire source action space into the entire target action space, without
masking any actions.
For example, if there are 10 multiselect actions in the output space, the actions 0-9 will be mapped to those
multiselect actions.
"""

@@ -25,17 +25,19 @@ from rl_coach.spaces import BoxActionSpace

class LinearBoxToBoxMap(ActionFilter):
"""
Maps a box action space to a box action space.
For example,
- the source action space has actions of shape 1 with values between -42 and -10,
- the target action space has actions of shape 1 with values between 10 and 32
The mapping will add an offset of 52 to the incoming actions and then multiply them by 22/32 to scale them to the
target action space
The shape of the source and target action spaces is always the same
A linear mapping of two box action spaces. For example, if the action space of the
environment consists of continuous actions between 0 and 1, and we want the agent to choose actions between -1 and 1,
the LinearBoxToBoxMap can be used to map the range -1 and 1 to the range 0 and 1 in a linear way. This means that the
action -1 will be mapped to 0, the action 1 will be mapped to 1, and the rest of the actions will be linearly mapped
between those values.
"""
def __init__(self,
input_space_low: Union[None, int, float, np.ndarray],
input_space_high: Union[None, int, float, np.ndarray]):
"""
:param input_space_low: the low values of the desired action space
:param input_space_high: the high values of the desired action space
"""
self.input_space_low = input_space_low
self.input_space_high = input_space_high
self.rescale = None

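The linear mapping itself is the usual rescale-and-shift formula; a sketch under the example ranges given above (not the Coach implementation):

```python
def linear_map(action, in_low, in_high, out_low, out_high):
    # rescale from the agent's range [in_low, in_high] to the environment's range [out_low, out_high]
    return out_low + (action - in_low) * (out_high - out_low) / (in_high - in_low)

linear_map(-1.0, in_low=-1.0, in_high=1.0, out_low=0.0, out_high=1.0)  # -> 0.0
linear_map(0.0, in_low=-1.0, in_high=1.0, out_low=0.0, out_high=1.0)   # -> 0.5
```
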
@@ -23,11 +23,17 @@ from rl_coach.spaces import DiscreteActionSpace, ActionSpace

class PartialDiscreteActionSpaceMap(ActionFilter):
"""
Maps the given actions from the output space to discrete actions in the action space.
For example, if there are 10 multiselect actions in the output space, the actions 0-9 will be mapped to those
multiselect actions.
Partial map of two countable action spaces. For example, consider an environment
with a MultiSelect action space (select multiple actions at the same time, such as jump and go right), with 8 actual
MultiSelect actions. If we want the agent to be able to select only 5 of those actions by their index (0-4), we can
map a discrete action space with 5 actions into the 5 selected MultiSelect actions. This will both allow the agent to
use regular discrete actions, and mask 3 of the actions from the agent.
"""
def __init__(self, target_actions: List[ActionType]=None, descriptions: List[str]=None):
"""
:param target_actions: A partial list of actions from the target space to map to.
:param descriptions: a list of descriptions of each of the actions
"""
self.target_actions = target_actions
self.descriptions = descriptions
super().__init__()

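A toy sketch of the MultiSelect example above, with hypothetical action encodings (illustrative only):

```python
# 8 MultiSelect actions exist in the environment; expose only 5 of them as discrete actions 0-4
target_actions = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 0], [1, 0, 1]]

def map_discrete_action(discrete_action: int):
    # the agent outputs an index; the filter forwards the corresponding target action
    return target_actions[discrete_action]

map_discrete_action(3)  # -> [1, 1, 0]
```
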
@@ -0,0 +1,25 @@
from .observation_clipping_filter import ObservationClippingFilter
from .observation_crop_filter import ObservationCropFilter
from .observation_move_axis_filter import ObservationMoveAxisFilter
from .observation_normalization_filter import ObservationNormalizationFilter
from .observation_reduction_by_sub_parts_name_filter import ObservationReductionBySubPartsNameFilter
from .observation_rescale_size_by_factor_filter import ObservationRescaleSizeByFactorFilter
from .observation_rescale_to_size_filter import ObservationRescaleToSizeFilter
from .observation_rgb_to_y_filter import ObservationRGBToYFilter
from .observation_squeeze_filter import ObservationSqueezeFilter
from .observation_stacking_filter import ObservationStackingFilter
from .observation_to_uint8_filter import ObservationToUInt8Filter

__all__ = [
'ObservationClippingFilter',
'ObservationCropFilter',
'ObservationMoveAxisFilter',
'ObservationNormalizationFilter',
'ObservationReductionBySubPartsNameFilter',
'ObservationRescaleSizeByFactorFilter',
'ObservationRescaleToSizeFilter',
'ObservationRGBToYFilter',
'ObservationSqueezeFilter',
'ObservationStackingFilter',
'ObservationToUInt8Filter'
]
@@ -24,7 +24,10 @@ from rl_coach.spaces import ObservationSpace

class ObservationClippingFilter(ObservationFilter):
"""
Clip the observation values using the given ranges
Clips the observation values to a given range of values.
For example, if the observation consists of measurements in an arbitrary range,
and we want to control the minimum and maximum values of these observations,
we can define a range and clip the values of the measurements.
"""
def __init__(self, clipping_low: float=-np.inf, clipping_high: float=np.inf):
"""

@@ -24,7 +24,9 @@ from rl_coach.spaces import ObservationSpace

class ObservationCropFilter(ObservationFilter):
"""
Crops the current state observation to a given shape
Crops the size of the observation to a given crop window. For example, in Atari, the
observations are images with a shape of 210x160. Usually, we will want to crop the size of the observation to a
square of 160x160 before rescaling them.
"""
def __init__(self, crop_low: np.ndarray=None, crop_high: np.ndarray=None):
"""

@@ -23,9 +23,14 @@ from rl_coach.spaces import ObservationSpace, PlanarMapsObservationSpace

class ObservationMoveAxisFilter(ObservationFilter):
"""
Move an axis of the observation to a different place.
Reorders the axes of the observation. This can be useful when the observation is an
image, and we want to move the channel axis to be the last axis instead of the first axis.
"""
def __init__(self, axis_origin: int = None, axis_target: int=None):
"""
:param axis_origin: The axis to move
:param axis_target: Where to move the selected axis to
"""
super().__init__()
self.axis_origin = axis_origin
self.axis_target = axis_target

@@ -25,8 +25,9 @@ from rl_coach.spaces import ObservationSpace

class ObservationNormalizationFilter(ObservationFilter):
"""
Normalize the observation with a running standard deviation and mean of the observations seen so far
If there is more than a single worker, the statistics of the observations are shared between all the workers
Normalizes the observation values with a running mean and standard deviation of
all the observations seen so far. The normalization is performed element-wise. Additionally, when working with
multiple workers, the statistics used for the normalization operation are accumulated over all the workers.
"""
def __init__(self, clip_min: float=-5.0, clip_max: float=5.0, name='observation_stats'):
"""

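Conceptually, the filter maintains running statistics and normalizes element-wise; a single-worker sketch (illustrative, not the shared-statistics implementation used by Coach):

```python
import numpy as np

class RunningNormalizer:
    def __init__(self, shape, clip_min=-5.0, clip_max=5.0):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = 1e-4
        self.clip_min, self.clip_max = clip_min, clip_max

    def filter(self, observation):
        # update the running mean/variance, then normalize and clip the observation
        self.count += 1
        delta = observation - self.mean
        self.mean += delta / self.count
        self.var += (delta * (observation - self.mean) - self.var) / self.count
        return np.clip((observation - self.mean) / (np.sqrt(self.var) + 1e-8),
                       self.clip_min, self.clip_max)

normalizer = RunningNormalizer(shape=(3,))
normalizer.filter(np.array([1.0, 2.0, 3.0]))
```
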
@@ -26,9 +26,11 @@ from rl_coach.spaces import ObservationSpace, VectorObservationSpace

class ObservationReductionBySubPartsNameFilter(ObservationFilter):
"""
Choose sub parts of the observation to remove or keep using their name.
This is useful when the environment has a measurements vector as observation which includes several different
Allows keeping only parts of the observation, by specifying their
name. This is useful when the environment has a measurements vector as observation which includes several different
measurements, but you want the agent to only see some of the measurements and not all.
For example, the CARLA environment extracts multiple measurements that can be used by the agent, such as
speed and location. If we want to only use the speed, it can be done using this filter.
This will currently work only for VectorObservationSpace observations
"""
class ReductionMethod(Enum):

@@ -35,7 +35,8 @@ class RescaleInterpolationType(Enum):

class ObservationRescaleSizeByFactorFilter(ObservationFilter):
"""
Scales the current state observation size by a given factor
Rescales an image observation by some factor. For example, the image size
can be reduced by a factor of 2.
Warning: this requires the input observation to be of type uint8 due to scipy requirements!
"""
def __init__(self, rescale_factor: float, rescaling_interpolation_type: RescaleInterpolationType):

@@ -37,7 +37,8 @@ class RescaleInterpolationType(Enum):

class ObservationRescaleToSizeFilter(ObservationFilter):
"""
Scales the current state observation to a given shape
Rescales an image observation to a given size. The target size does not
necessarily keep the aspect ratio of the original observation.
Warning: this requires the input observation to be of type uint8 due to scipy requirements!
"""
def __init__(self, output_observation_space: PlanarMapsObservationSpace,

@@ -21,7 +21,9 @@ from rl_coach.spaces import ObservationSpace

class ObservationRGBToYFilter(ObservationFilter):
"""
Converts the observation in the current state to gray scale (Y channel).
Converts a color image observation specified using the RGB encoding into a grayscale
image observation, by keeping only the luminance (Y) channel of the YUV encoding. This can be useful if the colors
in the original image are not relevant for solving the task at hand.
The channels axis is assumed to be the last axis
"""
def __init__(self):

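The conversion keeps only the luma component; a sketch using the standard BT.601 weights (illustrative, assuming channels-last images):

```python
import numpy as np

def rgb_to_y(observation: np.ndarray) -> np.ndarray:
    # weighted sum of the R, G and B channels gives the luminance (Y) channel
    return np.dot(observation[..., :3], [0.299, 0.587, 0.114])

rgb_to_y(np.random.randint(0, 256, (84, 84, 3)).astype(np.float32)).shape  # -> (84, 84)
```
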
@@ -23,9 +23,12 @@ from rl_coach.spaces import ObservationSpace

class ObservationSqueezeFilter(ObservationFilter):
"""
Squeezes the observation to eliminate redundant axes.
Removes redundant axes from the observation, which are axes with a dimension of 1.
"""
def __init__(self, axis: int = None):
"""
:param axis: Specifies which axis to remove. If set to None, all the axes of size 1 will be removed.
"""
super().__init__()
self.axis = axis

@@ -43,7 +43,10 @@ class LazyStack(object):

class ObservationStackingFilter(ObservationFilter):
"""
Stack the current state observation on top of several previous observations.
Stacks several observations on top of each other. For image observations this will
create a 3D blob. The stacking is done in a lazy manner in order to reduce memory consumption. To achieve this,
a LazyStack object is used in order to wrap the observations in the stack. For this reason, the
ObservationStackingFilter **must** be the last filter in the inputs filters stack.
This filter is stateful since it stores the previous step result and depends on it.
The filter adds an additional dimension to the output observation.

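Ignoring the LazyStack optimization, the behaviour can be pictured with a plain deque (an illustrative sketch, not the Coach implementation):

```python
import numpy as np
from collections import deque

stack_size = 4
stack = deque(maxlen=stack_size)

def filter_observation(observation: np.ndarray) -> np.ndarray:
    if not stack:                        # on the first step, fill the stack with copies
        stack.extend([observation] * stack_size)
    else:
        stack.append(observation)        # the oldest observation falls off automatically
    return np.stack(stack, axis=-1)      # the stacking adds a new last dimension

filter_observation(np.zeros((84, 84))).shape  # -> (84, 84, 4)
```
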
@@ -23,10 +23,15 @@ from rl_coach.spaces import ObservationSpace

class ObservationToUInt8Filter(ObservationFilter):
"""
Converts the observation values to be uint8 values between 0 and 255.
It first scales the observation values to fit in the range and then converts them to uint8.
Converts a floating point observation into an unsigned int 8 bit observation. This is
mostly useful for reducing memory consumption and is usually used for image observations. The filter will first
spread the observation values over the range 0-255 and then discretize them into integer values.
"""
def __init__(self, input_low: float, input_high: float):
"""
:param input_low: The lowest value currently present in the observation
:param input_high: The highest value currently present in the observation
"""
super().__init__()
self.input_low = input_low
self.input_high = input_high

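The rescale-then-cast step looks roughly like this (illustrative only):

```python
import numpy as np

def to_uint8(observation: np.ndarray, input_low: float, input_high: float) -> np.ndarray:
    # spread the values over 0-255, then cast down to unsigned 8-bit integers
    rescaled = (observation - input_low) / (input_high - input_low) * 255.0
    return rescaled.astype(np.uint8)

to_uint8(np.array([0.0, 0.5, 1.0]), input_low=0.0, input_high=1.0)  # -> [0, 127, 255]
```
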
@@ -0,0 +1,8 @@
from .reward_rescale_filter import RewardRescaleFilter
from .reward_clipping_filter import RewardClippingFilter
from .reward_normalization_filter import RewardNormalizationFilter
__all__ = [
'RewardRescaleFilter',
'RewardClippingFilter',
'RewardNormalizationFilter'
]
@@ -23,7 +23,8 @@ from rl_coach.spaces import RewardSpace

class RewardClippingFilter(RewardFilter):
"""
Clips the reward to some range
Clips the reward values into a given range. For example, in DQN, the Atari rewards are
clipped into the range -1 and 1 in order to control the scale of the returns.
"""
def __init__(self, clipping_low: float=-np.inf, clipping_high: float=np.inf):
"""

@@ -25,8 +25,9 @@ from rl_coach.spaces import RewardSpace

class RewardNormalizationFilter(RewardFilter):
"""
Normalize the reward with a running standard deviation and mean of the rewards seen so far
If there is more than a single worker, the statistics of the rewards are shared between all the workers
Normalizes the reward values with a running mean and standard deviation of
all the rewards seen so far. When working with multiple workers, the statistics used for the normalization operation
are accumulated over all the workers.
"""
def __init__(self, clip_min: float=-5.0, clip_max: float=5.0):
"""

@@ -21,7 +21,8 @@ from rl_coach.spaces import RewardSpace

class RewardRescaleFilter(RewardFilter):
"""
Rescales the reward by multiplying with some factor
Rescales the reward by a given factor. Rescaling the rewards of the environment has been
observed to have a large effect (negative or positive) on the behavior of the learning process.
"""
def __init__(self, rescale_factor: float):
"""

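Taken together, the reward filters above boil down to simple element-wise operations; a sketch of rescaling followed by clipping (illustrative only; the factor and range are hypothetical):

```python
import numpy as np

def filter_reward(reward: float, rescale_factor: float = 0.1,
                  clip_low: float = -1.0, clip_high: float = 1.0) -> float:
    # rescale the raw reward first, then clip it into the allowed range
    return float(np.clip(reward * rescale_factor, clip_low, clip_high))

filter_reward(25.0)  # -> 1.0
```
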
@@ -504,6 +504,8 @@ class GraphManager(object):
:return: None
"""

self.verify_graph_was_created()

# initialize the network parameters from the global network
self.sync()

@@ -0,0 +1,14 @@
from .episodic_experience_replay import EpisodicExperienceReplayParameters, EpisodicExperienceReplay
from .episodic_hindsight_experience_replay import EpisodicHindsightExperienceReplayParameters, EpisodicHindsightExperienceReplay
from .episodic_hrl_hindsight_experience_replay import EpisodicHRLHindsightExperienceReplayParameters, EpisodicHRLHindsightExperienceReplay
from .single_episode_buffer import SingleEpisodeBufferParameters, SingleEpisodeBuffer
__all__ = [
'EpisodicExperienceReplayParameters',
'EpisodicHindsightExperienceReplayParameters',
'EpisodicHRLHindsightExperienceReplayParameters',
'SingleEpisodeBufferParameters',
'EpisodicExperienceReplay',
'EpisodicHindsightExperienceReplay',
'EpisodicHRLHindsightExperienceReplay',
'SingleEpisodeBuffer'
]

@@ -0,0 +1,13 @@
from .balanced_experience_replay import BalancedExperienceReplayParameters, BalancedExperienceReplay
from .differentiable_neural_dictionary import QDND
from .experience_replay import ExperienceReplayParameters, ExperienceReplay
from .prioritized_experience_replay import PrioritizedExperienceReplayParameters, PrioritizedExperienceReplay
from .transition_collection import TransitionCollection
__all__ = [
'BalancedExperienceReplayParameters',
'BalancedExperienceReplay',
'QDND',
'ExperienceReplay',
'PrioritizedExperienceReplay',
'TransitionCollection'
]

@@ -120,6 +120,7 @@ class Space(object):
def val_matches_space_definition(self, val: Union[int, float, np.ndarray]) -> bool:
"""
Checks if the given value matches the space definition in terms of shape and values

:param val: a value to check
:return: True / False depending on if the val matches the space definition
"""
@@ -136,6 +137,7 @@ class Space(object):
def is_point_in_space_shape(self, point: np.ndarray) -> bool:
"""
Checks if a given multidimensional point is within the bounds of the shape of the space

:param point: a multidimensional point
:return: True if the point is within the shape of the space. False otherwise
"""
@@ -146,6 +148,12 @@ class Space(object):
return True

def sample(self) -> np.ndarray:
"""
Sample the defined space, either uniformly, if space bounds are defined, or normally distributed if no
bounds are defined

:return: A numpy array sampled from the space
"""
# if there are infinite bounds, we sample using gaussian noise with mean 0 and std 1
if np.any(self.low == -np.inf) or np.any(self.high == np.inf):
return np.random.normal(0, 1, self.shape)
@@ -173,6 +181,10 @@ class ObservationSpace(Space):


class VectorObservationSpace(ObservationSpace):
"""
An observation space which is defined as a vector of elements. This can be particularly useful for environments
which return measurements, such as in robotic environments.
"""
def __init__(self, shape: int, low: Union[None, int, float, np.ndarray]=-np.inf,
high: Union[None, int, float, np.ndarray]=np.inf, measurements_names: List[str]=None):
if measurements_names is None:
@@ -186,6 +198,10 @@ class VectorObservationSpace(ObservationSpace):


class PlanarMapsObservationSpace(ObservationSpace):
"""
An observation space which defines a stack of 2D observations. For example, an environment which returns
a stack of segmentation maps like in Starcraft.
"""
def __init__(self, shape: Union[np.ndarray], low: int, high: int, channels_axis: int=-1):
super().__init__(shape, low, high)
self.channels_axis = channels_axis
@@ -200,6 +216,10 @@ class PlanarMapsObservationSpace(ObservationSpace):


class ImageObservationSpace(PlanarMapsObservationSpace):
"""
An observation space which is a private case of the PlanarMapsObservationSpace, where the stack of 2D observations
represents an RGB image, or a grayscale image.
"""
def __init__(self, shape: Union[np.ndarray], high: int, channels_axis: int=-1):
# TODO: consider allowing arbitrary low values for images
super().__init__(shape, 0, high, channels_axis)
@@ -245,6 +265,7 @@ class ActionSpace(Space):
def sample_with_info(self) -> ActionInfo:
"""
Get a random action with additional "fake" info

:return: An action info instance
"""
return ActionInfo(self.sample())
@@ -252,6 +273,7 @@ class ActionSpace(Space):
def clip_action_to_space(self, action: ActionType) -> ActionType:
"""
Given an action, clip its values to fit to the action space ranges

:param action: a given action
:return: the clipped action
"""
@@ -460,6 +482,7 @@ class GoalToRewardConversion(object):
def convert_distance_to_reward(self, distance: Union[float, np.ndarray]) -> Tuple[float, bool]:
"""
Given a distance from the goal, return a reward and a flag representing if the goal was reached

:param distance: the distance from the goal
:return: the reward and a flag representing if the goal was reached
"""
@@ -543,6 +566,7 @@ class GoalsSpace(VectorObservationSpace, ActionSpace):
def goal_from_state(self, state: Dict):
"""
Given a state, extract an observation according to the goal_name

:param state: a dictionary of observations
:return: the observation corresponding to the goal_name
"""
@@ -551,6 +575,7 @@ class GoalsSpace(VectorObservationSpace, ActionSpace):
def distance_from_goal(self, goal: np.ndarray, state: dict) -> float:
"""
Given a state, check its distance from the goal

:param goal: a numpy array representing the goal
:param state: a dict representing the state
:return: the distance from the goal
@@ -574,6 +599,7 @@ class GoalsSpace(VectorObservationSpace, ActionSpace):
def get_reward_for_goal_and_state(self, goal: np.ndarray, state: dict) -> Tuple[float, bool]:
"""
Given a state, check if the goal was reached and return a reward accordingly

:param goal: a numpy array representing the goal
:param state: a dict representing the state
:return: the reward for the current goal and state pair and a boolean representing if the goal was reached
