
update of api docstrings across coach and tutorials [WIP] (#91)

* updating the documentation website
* adding the built docs
* update of api docstrings across coach and tutorials 0-2
* added some missing api documentation
* New Sphinx based documentation
This commit is contained in:
Itai Caspi
2018-11-15 15:00:13 +02:00
committed by Gal Novik
parent 524f8436a2
commit 6d40ad1650
517 changed files with 71034 additions and 12834 deletions

View File

@@ -36,25 +36,25 @@ from rl_coach.utils import last_sample
class ActorCriticAlgorithmParameters(AlgorithmParameters):
"""
:param policy_gradient_rescaler: (PolicyGradientRescaler)
The value that will be used to rescale the policy gradient
:param apply_gradients_every_x_episodes: (int)
The number of episodes to wait before applying the accumulated gradients to the network.
The training iterations only accumulate gradients without actually applying them.
:param beta_entropy: (float)
The weight that will be given to the entropy regularization which is used in order to improve exploration.
:param num_steps_between_gradient_updates: (int)
Every num_steps_between_gradient_updates transitions will be considered as a single batch and used for
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.
:param gae_lambda: (float)
If the policy gradient rescaler was defined as PolicyGradientRescaler.GAE, the generalized advantage estimation
scheme will be used, in which case the lambda value controls the decay for the different n-step lengths.
:param estimate_state_value_using_gae: (bool)
If set to True, the state value targets for the V head will be estimated using the GAE scheme.
"""
def __init__(self):
super().__init__()
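For readers new to these parameters, a minimal sketch of how a preset might tweak them is shown below. It assumes the ActorCriticAgentParameters class and the PolicyGradientRescaler enum exposed elsewhere in rl_coach; the import paths and values are illustrative and not part of this diff.

# hypothetical preset snippet, adjusting the actor-critic parameters documented above
from rl_coach.agents.actor_critic_agent import ActorCriticAgentParameters
from rl_coach.agents.policy_optimization_agent import PolicyGradientRescaler

agent_params = ActorCriticAgentParameters()
# rescale the policy gradient with GAE and keep a small entropy bonus for exploration
agent_params.algorithm.policy_gradient_rescaler = PolicyGradientRescaler.GAE
agent_params.algorithm.gae_lambda = 0.96
agent_params.algorithm.beta_entropy = 0.01
# accumulate gradients over 20-step batches and apply them once every 5 episodes
agent_params.algorithm.num_steps_between_gradient_updates = 20
agent_params.algorithm.apply_gradients_every_x_episodes = 5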

View File

@@ -39,7 +39,7 @@ from rl_coach.memories.backend.memory_impl import get_memory_backend
class Agent(AgentInterface):
def __init__(self, agent_parameters: AgentParameters, parent: Union['LevelManager', 'CompositeAgent']=None):
"""
:param agent_parameters: A Preset class instance with all the running paramaters
:param agent_parameters: An AgentParameters class instance with all the agent parameters
"""
super().__init__()
self.ap = agent_parameters
@@ -175,18 +175,20 @@ class Agent(AgentInterface):
np.random.seed()
@property
def parent(self):
def parent(self) -> 'LevelManager':
"""
Get the parent class of the agent
:return: the parent of the agent
"""
return self._parent
@parent.setter
def parent(self, val):
def parent(self, val) -> None:
"""
Change the parent class of the agent.
Additionally, updates the full name of the agent
:param val: the new parent
:return: None
"""
@@ -196,7 +198,12 @@ class Agent(AgentInterface):
raise ValueError("The parent of an agent must have a name")
self.full_name_id = self.ap.full_name_id = "{}/{}".format(self._parent.name, self.name)
def setup_logger(self):
def setup_logger(self) -> None:
"""
Setup the logger for the agent
:return: None
"""
# dump documentation
logger_prefix = "{graph_name}.{level_name}.{agent_full_id}".\
format(graph_name=self.parent_level_manager.parent_graph_manager.name,
@@ -212,6 +219,7 @@ class Agent(AgentInterface):
def set_session(self, sess) -> None:
"""
Set the deep learning framework session for all the agents in the composite agent
:return: None
"""
self.input_filter.set_session(sess)
@@ -223,6 +231,7 @@ class Agent(AgentInterface):
dump_one_value_per_step: bool=False) -> Signal:
"""
Register a signal such that its statistics will be dumped and be viewable through dashboard
:param signal_name: the name of the signal as it will appear in dashboard
:param dump_one_value_per_episode: should the signal value be written for each episode?
:param dump_one_value_per_step: should the signal value be written for each step?
@@ -239,6 +248,7 @@ class Agent(AgentInterface):
"""
Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
dependent on those values, by calling init_environment_dependent_modules
:param spaces: the environment spaces definition
:return: None
"""
@@ -274,6 +284,7 @@ class Agent(AgentInterface):
Create all the networks of the agent.
The network creation will be done after setting the environment parameters for the agent, since they are needed
for creating the network.
:return: A list containing all the networks
"""
networks = {}
@@ -295,6 +306,7 @@ class Agent(AgentInterface):
"""
Initialize any modules that depend on knowing information about the environment such as the action space or
the observation space
:return: None
"""
# initialize exploration policy
@@ -314,13 +326,19 @@ class Agent(AgentInterface):
@property
def phase(self) -> RunPhase:
"""
The current running phase of the agent
:return: RunPhase
"""
return self._phase
@phase.setter
def phase(self, val: RunPhase) -> None:
"""
Change the phase of the run for the agent and all the sub components
:param phase: the new run phase (TRAIN, TEST, etc.)
:param val: the new run phase (TRAIN, TEST, etc.)
:return: None
"""
self.reset_evaluation_state(val)
@@ -328,6 +346,14 @@ class Agent(AgentInterface):
self.exploration_policy.change_phase(val)
def reset_evaluation_state(self, val: RunPhase) -> None:
"""
Perform accumulators initialization when entering an evaluation phase, and signal dumping when exiting an
evaluation phase. Entering or exiting the evaluation phase is determined according to the new phase given
by val, and by the current phase set in self.phase.
:param val: The new phase to change to
:return: None
"""
starting_evaluation = (val == RunPhase.TEST)
ending_evaluation = (self.phase == RunPhase.TEST)
@@ -363,6 +389,7 @@ class Agent(AgentInterface):
This function is a wrapper to allow having the same calls for shared or unshared memories.
It should be used instead of calling the memory directly in order to allow different algorithms to work
both with a shared and a local memory.
:param func: the name of the memory function to call
:param args: the arguments to supply to the function
:return: the return value of the function
@@ -375,7 +402,12 @@ class Agent(AgentInterface):
result = getattr(self.memory, func)(*args)
return result
def log_to_screen(self):
def log_to_screen(self) -> None:
"""
Write an episode summary line to the terminal
:return: None
"""
# log to screen
log = OrderedDict()
log["Name"] = self.full_name_id
@@ -388,9 +420,10 @@ class Agent(AgentInterface):
log["Training iteration"] = self.training_iteration
screen.log_dict(log, prefix=self.phase.value)
def update_step_in_episode_log(self):
def update_step_in_episode_log(self) -> None:
"""
Writes logging messages to screen and updates the log file with all the signal values.
Updates the in-episode log file with all the signal values from the most recent step.
:return: None
"""
# log all the signals to file
@@ -411,9 +444,12 @@ class Agent(AgentInterface):
# dump
self.agent_episode_logger.dump_output_csv()
def update_log(self):
def update_log(self) -> None:
"""
Writes logging messages to screen and updates the log file with all the signal values.
Updates the episodic log file with all the signal values from the most recent episode.
Additional signals for logging can be set by creating a new signal using self.register_signal,
and then updating it with some internal agent values.
:return: None
"""
# log all the signals to file
@@ -438,7 +474,6 @@ class Agent(AgentInterface):
self.agent_logger.create_signal_value('Shaped Evaluation Reward', np.nan, overwrite=False)
self.agent_logger.create_signal_value('Success Rate', np.nan, overwrite=False)
for signal in self.episode_signals:
self.agent_logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
self.agent_logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
@@ -452,7 +487,10 @@ class Agent(AgentInterface):
def handle_episode_ended(self) -> None:
"""
End an episode
Make any changes needed when each episode is ended.
This includes incrementing counters, updating full episode dependent values, updating logs, etc.
This function is called right after each episode is ended.
:return: None
"""
self.current_episode_buffer.is_complete = True
@@ -486,9 +524,10 @@ class Agent(AgentInterface):
if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
self.log_to_screen()
def reset_internal_state(self):
def reset_internal_state(self) -> None:
"""
Reset all the episodic parameters
Reset all the episodic parameters. This function is called right before each episode starts.
:return: None
"""
for signal in self.episode_signals:
@@ -516,6 +555,7 @@ class Agent(AgentInterface):
def learn_from_batch(self, batch) -> Tuple[float, List, List]:
"""
Given a batch of transitions, calculates their target values and updates the network.
:param batch: A list of transitions
:return: The total loss of the training, the loss per head and the unclipped gradients
"""
@@ -524,6 +564,7 @@ class Agent(AgentInterface):
def _should_update_online_weights_to_target(self):
"""
Determine if online weights should be copied to the target.
:return: boolean: True if the online weights should be copied to the target.
"""
@@ -542,9 +583,10 @@ class Agent(AgentInterface):
"EnvironmentSteps or TrainingSteps. Instead it is {}".format(step_method.__class__))
return should_update
def _should_train(self, wait_for_full_episode=False):
def _should_train(self, wait_for_full_episode=False) -> bool:
"""
Determine if we should start a training phase according to the number of steps passed since the last training
:return: boolean: True if we should start a training phase
"""
@@ -580,11 +622,12 @@ class Agent(AgentInterface):
return should_update
def train(self):
def train(self) -> float:
"""
Check if a training phase should be done as configured by num_consecutive_playing_steps.
If it should, then do several training steps as configured by num_consecutive_training_steps.
A single training iteration: Sample a batch, train on it and update target networks.
:return: The total training loss during the training iterations.
"""
loss = 0
@@ -641,14 +684,12 @@ class Agent(AgentInterface):
# run additional commands after the training is done
self.post_training_commands()
return loss
def choose_action(self, curr_state):
"""
Choose an action to act with in the current episode being played. Different behavior might be exhibited when
training or testing.
:param curr_state: the current state to act upon.
:return: chosen action, some action value describing the action (q-value, probability, etc)
@@ -656,10 +697,16 @@ class Agent(AgentInterface):
pass
def prepare_batch_for_inference(self, states: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
network_name: str):
network_name: str) -> Dict[str, np.array]:
"""
Convert curr_state into input tensors tensorflow is expecting, i.e. if we have several input states, stack all
observations together, measurements together, etc.
:param states: A list of environment states, where each one is a dict mapping from an observation name to its
corresponding observation
:param network_name: The agent network name to prepare the batch for. This is needed in order to extract only
the observation relevant for the network from the states.
:return: A dictionary containing a list of values from all the given states for each of the observations
"""
# convert to batch so we can run it through the network
states = force_list(states)
@@ -676,7 +723,8 @@ class Agent(AgentInterface):
def act(self) -> ActionInfo:
"""
Given the agents current knowledge, decide on the next action to apply to the environment
:return: an action and a dictionary containing any additional info from the action decision process
:return: An ActionInfo object, which contains the action and any additional info from the action decision process
"""
if self.phase == RunPhase.TRAIN and self.ap.algorithm.num_consecutive_playing_steps.num_steps == 0:
# This agent never plays while training (e.g. behavioral cloning)
@@ -705,13 +753,20 @@ class Agent(AgentInterface):
return filtered_action_info
def run_pre_network_filter_for_inference(self, state: StateType):
def run_pre_network_filter_for_inference(self, state: StateType) -> StateType:
"""
Run filters which were defined to be applied right before using the state for inference.
:param state: The state to run the filters on
:return: The filtered state
"""
dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False)
return self.pre_network_filter.filter(dummy_env_response)[0].next_state
def get_state_embedding(self, state: dict) -> np.ndarray:
"""
Given a state, get the corresponding state embedding from the main network
:param state: a state dict
:return: a numpy embedding vector
"""
@@ -726,6 +781,7 @@ class Agent(AgentInterface):
"""
Allows agents to update the transition just before adding it to the replay buffer.
Can be useful for agents that want to tweak the reward, termination signal, etc.
:param transition: the transition to update
:return: the updated transition
"""
@@ -736,8 +792,10 @@ class Agent(AgentInterface):
Given a response from the environment, distill the observation from it and store it for later use.
The response should be a dictionary containing the performed action, the new observation and measurements,
the reward, a game over flag and any additional information necessary.
:param env_response: result of call from environment.step(action)
:return:
:return: a boolean value which determines if the agent has decided to terminate the episode after seeing the
given observation
"""
# filter the env_response
@@ -801,7 +859,12 @@ class Agent(AgentInterface):
return transition.game_over
def post_training_commands(self):
def post_training_commands(self) -> None:
"""
A function which allows adding any functionality that is required to run right after the training phase ends.
:return: None
"""
pass
def get_predictions(self, states: List[Dict[str, np.ndarray]], prediction_type: PredictionType):
@@ -809,9 +872,10 @@ class Agent(AgentInterface):
Get a prediction from the agent with regard to the requested prediction_type.
If the agent cannot predict this type of prediction_type, or if there is more than one possible way to do so,
raise a ValueException.
:param states:
:param prediction_type:
:return:
:param states: The states to get a prediction for
:param prediction_type: The type of prediction to get for the states. For example, the state-value prediction.
:return: the predicted values
"""
predictions = self.networks['main'].online_network.predict_with_prediction_type(
@@ -824,6 +888,15 @@ class Agent(AgentInterface):
return list(predictions.values())[0]
def set_incoming_directive(self, action: ActionType) -> None:
"""
Allows setting a directive for the agent to follow. This is useful in hierarchy structures, where the agent
has another master agent that is controlling it. In such cases, the master agent can define the goals for the
slave agent, define its observation, possible actions, etc. The directive type is defined by the agent
in-action-space.
:param action: The action that should be set as the directive
:return:
"""
if isinstance(self.in_action_space, GoalsSpace):
self.current_hrl_goal = action
elif isinstance(self.in_action_space, AttentionActionSpace):
@@ -834,6 +907,7 @@ class Agent(AgentInterface):
def save_checkpoint(self, checkpoint_id: int) -> None:
"""
Allows agents to store additional information when saving checkpoints.
:param checkpoint_id: the id of the checkpoint
:return: None
"""
@@ -842,6 +916,7 @@ class Agent(AgentInterface):
def sync(self) -> None:
"""
Sync the global network parameters to local networks
:return: None
"""
for network in self.networks.values():
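As a concrete illustration of the register_signal / update_log flow documented above, an agent subclass could track a custom per-step value roughly as follows. This is a hypothetical sketch: DQNAgent is only used as an example base class, and Signal.add_sample is assumed to be the usual way signals are updated in Coach.

from rl_coach.agents.dqn_agent import DQNAgent  # assumed import path

class MyDQNAgent(DQNAgent):
    def __init__(self, agent_parameters, parent=None):
        super().__init__(agent_parameters, parent)
        # the signal will show up in the dashboard and in this agent's csv logs
        self.my_loss_signal = self.register_signal('Custom Loss', dump_one_value_per_step=True)

    def learn_from_batch(self, batch):
        total_loss, losses, unclipped_grads = super().learn_from_batch(batch)
        # add_sample is assumed to be the Signal update method used elsewhere in Coach
        self.my_loss_signal.add_sample(total_loss)
        return total_loss, losses, unclipped_grads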

View File

@@ -32,7 +32,6 @@ from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayPar
class BCAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.collect_new_data = False
class BCNetworkParameters(NetworkParameters):

View File

@@ -33,6 +33,19 @@ class CategoricalDQNNetworkParameters(DQNNetworkParameters):
class CategoricalDQNAlgorithmParameters(DQNAlgorithmParameters):
"""
:param v_min: (float)
The minimal value that will be represented in the network output for predicting the Q value.
Corresponds to :math:`v_{min}` in the paper.
:param v_max: (float)
The maximum value that will be represented in the network output for predicting the Q value.
Corresponds to :math:`v_{max}` in the paper.
:param atoms: (int)
The number of atoms that will be used to discretize the range between v_min and v_max.
For the C51 algorithm described in the paper, the number of atoms is 51.
"""
def __init__(self):
super().__init__()
self.v_min = -10.0
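To make the roles of v_min, v_max and atoms concrete, the sketch below computes the fixed support of the value distribution the way C51 defines it (a plain numpy illustration, not code from this commit):

import numpy as np

v_min, v_max, atoms = -10.0, 10.0, 51
# the fixed support z_0..z_{N-1} over which the categorical distribution is defined
delta_z = (v_max - v_min) / (atoms - 1)
z = np.array([v_min + i * delta_z for i in range(atoms)])
# a distributional head outputs, per action, a probability vector p over these atoms,
# and the Q value is recovered as the expectation Q(s, a) = sum_i z_i * p_i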

View File

@@ -26,9 +26,12 @@ from rl_coach.memories.non_episodic.balanced_experience_replay import BalancedEx
class CILAlgorithmParameters(AlgorithmParameters):
"""
:param state_key_with_the_class_index: (str)
The key of the state dictionary which corresponds to the value that will be used to control the class index.
"""
def __init__(self):
super().__init__()
self.collect_new_data = False
self.state_key_with_the_class_index = 'high_level_command'

View File

@@ -58,6 +58,47 @@ class ClippedPPONetworkParameters(NetworkParameters):
class ClippedPPOAlgorithmParameters(AlgorithmParameters):
"""
:param policy_gradient_rescaler: (PolicyGradientRescaler)
This represents how the critic will be used to update the actor. The critic value function is typically used
to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
advantage of the action, or the generalized advantage estimation (GAE) value.
:param gae_lambda: (float)
The :math:`\lambda` value is used within the GAE function in order to weight different bootstrap length
estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
n-step estimations.
:param clip_likelihood_ratio_using_epsilon: (float)
If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
implementations.
:param value_targets_mix_fraction: (float)
The targets for the value network are an exponential weighted moving average which uses this mix fraction to
define how much of the new targets will be taken into account when calculating the loss.
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.
:param estimate_state_value_using_gae: (bool)
If set to True, the state value will be estimated using the GAE technique.
:param use_kl_regularization: (bool)
If set to True, the loss function will be regularized using the KL divergence between the current and new
policy, to bound the change of the policy during the network update.
:param beta_entropy: (float)
An entropy regularization term can be added to the loss function in order to control exploration. This term
is weighted using the :math:`\beta` value defined by beta_entropy.
:param optimization_epochs: (int)
For each training phase, the collected dataset will be used for multiple epochs, which are defined by the
optimization_epochs value.
:param clipping_decay_schedule: (Schedule)
Can be used to define a schedule over the clipping of the likelihood ratio.
"""
def __init__(self):
super().__init__()
self.num_episodes_in_experience_replay = 1000000
@@ -66,7 +107,6 @@ class ClippedPPOAlgorithmParameters(AlgorithmParameters):
self.use_kl_regularization = False
self.clip_likelihood_ratio_using_epsilon = 0.2
self.estimate_state_value_using_gae = True
self.step_until_collecting_full_episodes = True
self.beta_entropy = 0.01 # should be 0 for mujoco
self.num_consecutive_playing_steps = EnvironmentSteps(2048)
self.optimization_epochs = 10
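The clipping behavior described by clip_likelihood_ratio_using_epsilon corresponds to the standard clipped surrogate objective; a small numpy sketch of it (illustrative only, not the loss implementation used by the Clipped PPO head):

import numpy as np

def clipped_surrogate_loss(new_log_probs, old_log_probs, advantages, epsilon=0.2):
    # likelihood ratio between the new policy and the policy that collected the data
    ratio = np.exp(new_log_probs - old_log_probs)
    clipped_ratio = np.clip(ratio, 1 - epsilon, 1 + epsilon)
    # take the pessimistic (minimum) objective and negate it to obtain a loss
    return -np.mean(np.minimum(ratio * advantages, clipped_ratio * advantages))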

View File

@@ -65,6 +65,33 @@ class DDPGActorNetworkParameters(NetworkParameters):
class DDPGAlgorithmParameters(AlgorithmParameters):
"""
:param num_steps_between_copying_online_weights_to_target: (StepMethod)
The number of steps between copying the online network weights to the target network weights.
:param rate_for_copying_weights_to_target: (float)
When copying the online network weights to the target network weights, a soft update will be used, which
weights the new online network weights by rate_for_copying_weights_to_target
:param num_consecutive_playing_steps: (StepMethod)
The number of consecutive steps to act between every two training iterations
:param use_target_network_for_evaluation: (bool)
If set to True, the target network will be used for predicting the actions when choosing actions to act.
Since the target network weights change more slowly, the predicted actions will be more consistent.
:param action_penalty: (float)
The amount by which to penalize the network on high action feature (pre-activation) values.
This can prevent the action features from saturating the TanH activation function, and therefore prevent the
gradients from becoming very low.
:param clip_critic_targets: (Tuple[float, float] or None)
The range to clip the critic target to in order to prevent overestimation of the action values.
:param use_non_zero_discount_for_terminal_states: (bool)
If set to True, the discount factor will be used for terminal states to bootstrap the next predicted state
values. If set to False, the terminal states reward will be taken as the target return for the network.
"""
def __init__(self):
super().__init__()
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)
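The soft update mentioned for rate_for_copying_weights_to_target can be summarized by the sketch below (an illustration of the documented semantics, not the actual network code):

def soft_update(target_weights, online_weights, rate):
    # rate = 1.0 reduces to a hard copy of the online weights into the target network
    return [rate * w_online + (1 - rate) * w_target
            for w_online, w_target in zip(online_weights, target_weights)]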

View File

@@ -81,6 +81,35 @@ class DFPMemoryParameters(EpisodicExperienceReplayParameters):
class DFPAlgorithmParameters(AlgorithmParameters):
"""
:param num_predicted_steps_ahead: (int)
Number of future steps to predict measurements for. The future steps won't be sequential, but rather grow
in powers of 2. For example, if num_predicted_steps_ahead = 3, then the steps will be: t+1, t+2, t+4
:param goal_vector: (List[float])
The goal vector will weight each of the measurements to form an optimization goal. The vector should have
the same length as the number of measurements, and it will be vector multiplied by the measurements.
Positive values correspond to trying to maximize the particular measurement, and negative values
correspond to trying to minimize the particular measurement.
:param future_measurements_weights: (List[float])
The future_measurements_weights weight the contribution of each of the predicted timesteps to the optimization
goal. For example, if there are 6 steps predicted ahead, and a future_measurements_weights vector with 3 values,
then only the 3 last timesteps will be taken into account, according to the weights in the
future_measurements_weights vector.
:param use_accumulated_reward_as_measurement: (bool)
If set to True, the accumulated reward from the beginning of the episode will be added as a measurement to
the measurements vector in the state. This can be useful in environments where the given measurements don't
include enough information for the particular goal the agent should achieve.
:param handling_targets_after_episode_end: (HandlingTargetsAfterEpisodeEnd)
Dictates how to handle measurements that are outside the episode length.
:param scale_measurements_targets: (Dict[str, float])
Allows rescaling the values of each of the measurements available. This can be useful when the measurements
have a different scale and you want to normalize them to the same scale.
"""
def __init__(self):
super().__init__()
self.num_predicted_steps_ahead = 6
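A short sketch of how the predicted time offsets and the goal vector weighting described above combine, based only on the parameter descriptions in this docstring (the variable names are illustrative):

import numpy as np

num_predicted_steps_ahead = 3
# offsets grow in powers of two: t+1, t+2, t+4, ...
offsets = [2 ** i for i in range(num_predicted_steps_ahead)]

# weight each predicted measurement vector by the goal vector to form the optimization objective
predicted_measurements = np.random.rand(num_predicted_steps_ahead, 2)  # e.g. 2 measurements per step
goal_vector = np.array([1.0, -0.5])  # maximize the first measurement, mildly minimize the second
objective_per_step = predicted_measurements @ goal_vector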

View File

@@ -24,6 +24,13 @@ from rl_coach.spaces import SpacesDefinition
class HACDDPGAlgorithmParameters(DDPGAlgorithmParameters):
"""
:param time_limit: (int)
The number of steps the agent is allowed to act for while trying to achieve its goal
:param sub_goal_testing_rate: (float)
The percent of episodes that will be used for testing the sub goals generated by the upper level agents.
"""
def __init__(self):
super().__init__()
self.time_limit = 40
@@ -91,7 +98,7 @@ class HACDDPGAgent(DDPGAgent):
sub_goal_is_missed = not sub_goal_reached
if sub_goal_is_missed:
transition.reward = -self.ap.algorithm.time_limit
return transition
def set_environment_parameters(self, spaces: SpacesDefinition):

View File

@@ -24,6 +24,11 @@ from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperi
class MixedMonteCarloAlgorithmParameters(DQNAlgorithmParameters):
"""
:param monte_carlo_mixing_rate: (float)
The mixing rate is used for setting the amount of the monte carlo estimate (full return) that will be mixed into
the single-step bootstrapped targets.
"""
def __init__(self):
super().__init__()
self.monte_carlo_mixing_rate = 0.1
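The mixing described by monte_carlo_mixing_rate amounts to blending the one-step bootstrapped target with the full discounted return; a minimal sketch of that blend:

def mixed_target(bootstrapped_target, monte_carlo_return, mixing_rate=0.1):
    # mixing_rate = 0 gives the regular single-step target, 1 gives the pure Monte Carlo return
    return (1 - mixing_rate) * bootstrapped_target + mixing_rate * monte_carlo_return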

View File

@@ -44,6 +44,26 @@ class NStepQNetworkParameters(NetworkParameters):
class NStepQAlgorithmParameters(AlgorithmParameters):
"""
:param num_steps_between_copying_online_weights_to_target: (StepMethod)
The number of steps between copying the online network weights to the target network weights.
:param apply_gradients_every_x_episodes: (int)
The number of episodes between applying the accumulated gradients to the network. After every
num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data,
it will then accumulate it in internal accumulators, and will only apply them to the network once in every
apply_gradients_every_x_episodes episodes.
:param num_steps_between_gradient_updates: (int)
The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
called t_max. Since this algorithm is on-policy, only the steps collected between each two gradient calculations
are used in the batch.
:param targets_horizon: (str)
Should be either 'N-Step' or '1-Step', and defines the horizon over which to bootstrap the network values.
Essentially, 1-Step follows the regular 1 step bootstrapping Q learning update. For more information,
please refer to the original paper (https://arxiv.org/abs/1602.01783)
"""
def __init__(self):
super().__init__()
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)
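For reference, the N-step bootstrapped target that the 'N-Step' setting of targets_horizon refers to can be sketched as follows (a plain illustration, not the Coach implementation):

def n_step_target(rewards, bootstrap_value, discount):
    # rewards: the N rewards collected from time t onwards
    # bootstrap_value: max_a Q(s_{t+N}, a) taken from the target network
    target = bootstrap_value
    for reward in reversed(rewards):
        target = reward + discount * target
    return target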

View File

@@ -43,6 +43,39 @@ class NECNetworkParameters(NetworkParameters):
class NECAlgorithmParameters(AlgorithmParameters):
"""
:param dnd_size: (int)
Defines the number of transitions that will be stored in each one of the DNDs. Note that the total number
of transitions that will be stored is dnd_size x num_actions.
:param l2_norm_added_delta: (float)
A small value that will be added when calculating the weight of each of the DND entries. This follows the
:math:`\delta` parameter defined in the paper.
:param new_value_shift_coefficient: (float)
In the case where a new embedding that was added to the DND was already present, the value that will be stored
in the DND is a mix between the existing value and the new value. The mix rate is defined by
new_value_shift_coefficient.
:param number_of_knn: (int)
The number of neighbors that will be retrieved for each DND query.
:param DND_key_error_threshold: (float)
When the DND is queried for a specific embedding, this threshold will be used to determine if the embedding
exists in the DND, since exact matches of embeddings are very rare.
:param propagate_updates_to_DND: (bool)
If set to True, when the gradients of the network will be calculated, the gradients will also be
backpropagated through the keys of the DND. The keys will then be updated as well, as if they were regular
network weights.
:param n_step: (int)
The bootstrap length that will be used when calculating the state values to store in the DND.
:param bootstrap_total_return_from_old_policy: (bool)
If set to True, the bootstrap that will be used to calculate each state-action value, is the network value
when the state was first seen, and not the latest, most up-to-date network value.
"""
def __init__(self):
super().__init__()
self.dnd_size = 500000
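The number_of_knn and l2_norm_added_delta parameters relate to the kernel used when querying the DND; a rough numpy sketch of such a lookup, based on the NEC paper rather than on the code in this file:

import numpy as np

def dnd_lookup(query_embedding, dnd_keys, dnd_values, number_of_knn=50, delta=0.001):
    # retrieve the nearest neighbors and weight their values by an inverse-distance kernel
    distances = np.sum((dnd_keys - query_embedding) ** 2, axis=1)
    knn = np.argsort(distances)[:number_of_knn]
    kernel = 1.0 / (distances[knn] + delta)
    weights = kernel / kernel.sum()
    return np.dot(weights, dnd_values[knn])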

View File

@@ -24,6 +24,19 @@ from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperi
class PALAlgorithmParameters(DQNAlgorithmParameters):
"""
:param pal_alpha: (float)
A factor that weights the amount by which the advantage learning update will be taken into account.
:param persistent_advantage_learning: (bool)
If set to True, the persistent mode of advantage learning will be used, which encourages the agent to take
the same actions one after the other instead of changing actions.
:param monte_carlo_mixing_rate: (float)
The amount of monte carlo values to mix into the targets of the network. The monte carlo values are just the
total discounted returns, and they can help reduce the time it takes for the network to update to the newly
seen values, since it is not based on bootstrapping the current network values.
"""
def __init__(self):
super().__init__()
self.pal_alpha = 0.9

View File

@@ -42,6 +42,27 @@ class PolicyGradientNetworkParameters(NetworkParameters):
class PolicyGradientAlgorithmParameters(AlgorithmParameters):
"""
:param policy_gradient_rescaler: (PolicyGradientRescaler)
The rescaler type to use for the policy gradient loss. For policy gradients, we calculate log probability of
the action and then multiply it by the policy gradient rescaler. The most basic rescaler is the discounted
return, but there are other rescalers that are intended for reducing the variance of the updates.
:param apply_gradients_every_x_episodes: (int)
The number of episodes between applying the accumulated gradients to the network. After every
num_steps_between_gradient_updates steps, the agent will calculate the gradients for the collected data,
it will then accumulate it in internal accumulators, and will only apply them to the network once in every
apply_gradients_every_x_episodes episodes.
:param beta_entropy: (float)
A factor which defines the amount of entropy regularization to apply to the network. The entropy of the actions
will be added to the loss and scaled by the given beta factor.
:param num_steps_between_gradient_updates: (int)
The number of steps between calculating gradients for the collected data. In the A3C paper, this parameter is
called t_max. Since this algorithm is on-policy, only the steps collected between each two gradient calculations
are used in the batch.
"""
def __init__(self):
super().__init__()
self.policy_gradient_rescaler = PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP

View File

@@ -63,6 +63,51 @@ class PPOActorNetworkParameters(NetworkParameters):
class PPOAlgorithmParameters(AlgorithmParameters):
"""
:param policy_gradient_rescaler: (PolicyGradientRescaler)
This represents how the critic will be used to update the actor. The critic value function is typically used
to rescale the gradients calculated by the actor. There are several ways for doing this, such as using the
advantage of the action, or the generalized advantage estimation (GAE) value.
:param gae_lambda: (float)
The :math:`\lambda` value is used within the GAE function in order to weight different bootstrap length
estimations. Typical values are in the range 0.9-1, and define an exponential decay over the different
n-step estimations.
:param target_kl_divergence: (float)
The target kl divergence between the current policy distribution and the new policy. PPO uses a heuristic to
bring the KL divergence to this value, by adding a penalty if the kl divergence is higher.
:param initial_kl_coefficient: (float)
The initial weight that will be given to the KL divergence between the current and the new policy in the
regularization factor.
:param high_kl_penalty_coefficient: (float)
The penalty that will be given for KL divergence values which are higher than what was defined as the target.
:param clip_likelihood_ratio_using_epsilon: (float)
If not None, the likelihood ratio between the current and new policy in the PPO loss function will be
clipped to the range [1-clip_likelihood_ratio_using_epsilon, 1+clip_likelihood_ratio_using_epsilon].
This is typically used in the Clipped PPO version of PPO, and should be set to None in regular PPO
implementations.
:param value_targets_mix_fraction: (float)
The targets for the value network are an exponential weighted moving average which uses this mix fraction to
define how much of the new targets will be taken into account when calculating the loss.
This value should be set to the range (0,1], where 1 means that only the new targets will be taken into account.
:param estimate_state_value_using_gae: (bool)
If set to True, the state value will be estimated using the GAE technique.
:param use_kl_regularization: (bool)
If set to True, the loss function will be regularized using the KL divergence between the current and new
policy, to bound the change of the policy during the network update.
:param beta_entropy: (float)
An entropy regularization term can be added to the loss function in order to control exploration. This term
is weighted using the :math:`\beta` value defined by beta_entropy.
"""
def __init__(self):
super().__init__()
self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
@@ -73,7 +118,6 @@ class PPOAlgorithmParameters(AlgorithmParameters):
self.clip_likelihood_ratio_using_epsilon = None
self.value_targets_mix_fraction = 0.1
self.estimate_state_value_using_gae = True
self.step_until_collecting_full_episodes = True
self.use_kl_regularization = True
self.beta_entropy = 0.01
self.num_consecutive_playing_steps = EnvironmentSteps(5000)

View File

@@ -34,6 +34,14 @@ class QuantileRegressionDQNNetworkParameters(DQNNetworkParameters):
class QuantileRegressionDQNAlgorithmParameters(DQNAlgorithmParameters):
"""
:param atoms: (int)
The number of atoms to predict for each action
:param huber_loss_interval: (float)
One of the huber loss parameters, and is referred to as :math:`\kappa` in the paper.
It describes the interval [-k, k] in which the huber loss acts as an MSE loss.
"""
def __init__(self):
super().__init__()
self.atoms = 200
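A quick sketch of the huber_loss_interval (kappa) behavior described above: inside [-kappa, kappa] the loss is quadratic, outside it becomes linear (illustrative numpy, not the quantile head code):

import numpy as np

def huber_loss(td_errors, kappa=1.0):
    abs_errors = np.abs(td_errors)
    quadratic = np.minimum(abs_errors, kappa)
    linear = abs_errors - quadratic
    return 0.5 * quadratic ** 2 + kappa * linear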

View File

@@ -37,6 +37,17 @@ class RainbowDQNNetworkParameters(DQNNetworkParameters):
class RainbowDQNAlgorithmParameters(CategoricalDQNAlgorithmParameters):
"""
:param n_step: (int)
The number of steps to bootstrap the network over. The first N-1 steps' actual rewards will be accumulated
using an exponentially growing discount factor, and the Nth step will be bootstrapped from the network
prediction.
:param store_transitions_only_when_episodes_are_terminated: (bool)
If set to True, the transitions will be stored in an Episode object until the episode ends, and just then
written to the memory. This is useful since we want to calculate the N-step discounted rewards before saving the
transitions into the memory, and to do so we need the entire episode first.
"""
def __init__(self):
super().__init__()
self.n_step = 3

View File

@@ -57,7 +57,7 @@ class Architecture(object):
:param initial_feed_dict: a dictionary of extra inputs for forward pass.
:return: predictions of action or value of shape (batch_size, action_space_size) for action predictions
"""
pass
raise NotImplementedError
@staticmethod
def parallel_predict(sess: Any,
@@ -68,7 +68,7 @@ class Architecture(object):
:param network_input_tuples: tuple of network and corresponding input
:return: list or tuple of outputs from all networks
"""
pass
raise NotImplementedError
def train_on_batch(self,
inputs: Dict[str, np.ndarray],
@@ -102,7 +102,7 @@ class Architecture(object):
norm_unclippsed_grads (float): global norm of all gradients before any gradient clipping is applied
fetched_tensors: all values for additional_fetches
"""
pass
raise NotImplementedError
def get_weights(self) -> List[np.ndarray]:
"""
@@ -110,7 +110,7 @@ class Architecture(object):
:return: list weights as ndarray
"""
pass
raise NotImplementedError
def set_weights(self, weights: List[np.ndarray], rate: float=1.0) -> None:
"""
@@ -121,7 +121,7 @@ class Architecture(object):
i.e. new_weight = rate * given_weight + (1 - rate) * old_weight
:return: None
"""
pass
raise NotImplementedError
def reset_accumulated_gradients(self) -> None:
"""
@@ -130,7 +130,7 @@ class Architecture(object):
Once gradients are reset, they must be accessible by `accumulated_gradients` property of this class,
which must return a list of numpy ndarrays. Child class must ensure that `accumulated_gradients` is set.
"""
pass
raise NotImplementedError
def accumulate_gradients(self,
inputs: Dict[str, np.ndarray],
@@ -166,7 +166,7 @@ class Architecture(object):
norm_unclippsed_grads (float): global norm of all gradients before any gradient clipping is applied
fetched_tensors: all values for additional_fetches
"""
pass
raise NotImplementedError
def apply_and_reset_gradients(self, gradients: List[np.ndarray], scaler: float=1.) -> None:
"""
@@ -177,7 +177,7 @@ class Architecture(object):
of an identical network (either self or another identical network)
:param scaler: A scaling factor that allows rescaling the gradients before applying them
"""
pass
raise NotImplementedError
def apply_gradients(self, gradients: List[np.ndarray], scaler: float=1.) -> None:
"""
@@ -188,7 +188,7 @@ class Architecture(object):
of an identical network (either self or another identical network)
:param scaler: A scaling factor that allows rescaling the gradients before applying them
"""
pass
raise NotImplementedError
def get_variable_value(self, variable: Any) -> np.ndarray:
"""
@@ -199,7 +199,7 @@ class Architecture(object):
:param variable: variable of interest
:return: value of the specified variable
"""
pass
raise NotImplementedError
def set_variable_value(self, assign_op: Any, value: np.ndarray, placeholder: Any):
"""
@@ -212,4 +212,4 @@ class Architecture(object):
:param value: value of the specified variable used for update
:param placeholder: a placeholder for binding the value to assign_op.
"""
pass
raise NotImplementedError
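Since these methods now raise NotImplementedError instead of silently passing, any framework-specific architecture is expected to override them. A minimal, purely hypothetical subclass sketch (the weight storage here is invented for illustration):

class ListBackedArchitecture(Architecture):
    # toy example: weights are kept as a plain list of numpy arrays
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.weights = []

    def get_weights(self):
        return [w.copy() for w in self.weights]

    def set_weights(self, weights, rate=1.0):
        # soft update, matching the documented semantics: new = rate * given + (1 - rate) * old
        self.weights = [rate * new + (1 - rate) * old
                        for new, old in zip(weights, self.weights)]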

View File

@@ -34,7 +34,11 @@ except ImportError:
class NetworkWrapper(object):
"""
Contains multiple networks and managers syncing and gradient updates
The network wrapper contains multiple copies of the same network, each one with a different set of weights that is
updated on a different time scale. The network wrapper will always contain an online network.
It will contain an additional slow updating target network if it was requested by the user,
and it will contain a global network shared between different workers, if Coach is run in a single-node
multi-process distributed mode. The network wrapper contains functionality for managing these networks and syncing
between them.
"""
def __init__(self, agent_parameters: AgentParameters, has_target: bool, has_global: bool, name: str,
@@ -98,6 +102,7 @@ class NetworkWrapper(object):
def sync(self):
"""
Initializes the weights of the networks to match each other
:return:
"""
self.update_online_network()
@@ -106,6 +111,7 @@ class NetworkWrapper(object):
def update_target_network(self, rate=1.0):
"""
Copy weights: online network >>> target network
:param rate: the rate of copying the weights - 1 for copying exactly
"""
if self.target_network:
@@ -114,6 +120,7 @@ class NetworkWrapper(object):
def update_online_network(self, rate=1.0):
"""
Copy weights: global network >>> online network
:param rate: the rate of copying the weights - 1 for copying exactly
"""
if self.global_network:
@@ -122,6 +129,7 @@ class NetworkWrapper(object):
def apply_gradients_to_global_network(self, gradients=None):
"""
Apply gradients from the online network on the global network
:param gradients: optional gradients that will be used instead of the accumulated gradients
:return:
"""
@@ -135,6 +143,7 @@ class NetworkWrapper(object):
def apply_gradients_to_online_network(self, gradients=None):
"""
Apply gradients from the online network on itself
:return:
"""
if gradients is None:
@@ -144,6 +153,7 @@ class NetworkWrapper(object):
def train_and_sync_networks(self, inputs, targets, additional_fetches=[], importance_weights=None):
"""
A generic training function that enables multi-threading training using a global network if necessary.
:param inputs: The inputs for the network.
:param targets: The targets corresponding to the given inputs
:param additional_fetches: Any additional tensor the user wants to fetch
@@ -160,6 +170,7 @@ class NetworkWrapper(object):
"""
Applies the gradients accumulated in the online network to the global network or to itself and syncs the
networks if necessary
:param reset_gradients: If set to True, the accumulated gradients won't be reset to 0 after applying them to
the network. This is useful when the accumulated gradients are overwritten instead
of accumulated by the accumulate_gradients function. This allows reducing time
@@ -179,6 +190,7 @@ class NetworkWrapper(object):
def parallel_prediction(self, network_input_tuples: List[Tuple]):
"""
Run several network predictions in parallel. Currently this only supports running each of the networks once.
:param network_input_tuples: a list of tuples where the first element is the network (online_network,
target_network or global_network) and the second element is the inputs
:return: the outputs of all the networks in the same order as the inputs were given
@@ -188,6 +200,7 @@ class NetworkWrapper(object):
def get_local_variables(self):
"""
Get all the variables that are local to the thread
:return: a list of all the variables that are local to the thread
"""
local_variables = [v for v in tf.local_variables() if self.online_network.name in v.name]
@@ -198,6 +211,7 @@ class NetworkWrapper(object):
def get_global_variables(self):
"""
Get all the variables that are shared between threads
:return: a list of all the variables that are shared between threads
"""
global_variables = [v for v in tf.global_variables() if self.global_network.name in v.name]
@@ -206,6 +220,7 @@ class NetworkWrapper(object):
def set_is_training(self, state: bool):
"""
Set the phase of the network between training and testing
:param state: The current state (True = Training, False = Testing)
:return: None
"""

View File

@@ -14,7 +14,7 @@
# limitations under the License.
#
from typing import List, Union
from typing import List, Union, Tuple
import copy
import numpy as np
@@ -74,7 +74,12 @@ class InputEmbedder(object):
activation_function=self.activation_function,
dropout_rate=self.dropout_rate))
def __call__(self, prev_input_placeholder=None):
def __call__(self, prev_input_placeholder: tf.placeholder=None) -> Tuple[tf.Tensor, tf.Tensor]:
"""
Wrapper for building the module graph including scoping and loss creation
:param prev_input_placeholder: the input to the graph
:return: the input placeholder and the output of the last layer
"""
with tf.variable_scope(self.get_name()):
if prev_input_placeholder is None:
self.input = tf.placeholder("float", shape=[None] + self.input_size, name=self.get_name())
@@ -84,7 +89,13 @@ class InputEmbedder(object):
return self.input, self.output
def _build_module(self):
def _build_module(self) -> None:
"""
Builds the graph of the module
This method is called early on from __call__. It is expected to store the graph
in self.output.
:return: None
"""
# NOTE: for image inputs, we expect the data format to be of type uint8, so to be memory efficient. we chose not
# to implement the rescaling as an input filters.observation.observation_filter, as this would have caused the
# input to the network to be float, which is 4x more expensive in memory.
@@ -127,7 +138,11 @@ class InputEmbedder(object):
raise NotImplementedError("Inheriting embedder must define schemes matching its allowed default "
"configurations.")
def get_name(self):
def get_name(self) -> str:
"""
Get a formatted name for the module
:return: the formatted name
"""
return self.name
def __str__(self):

View File

@@ -14,7 +14,7 @@
# limitations under the License.
#
import copy
from typing import Union
from typing import Union, Tuple
import tensorflow as tf
@@ -64,17 +64,33 @@ class Middleware(object):
activation_function=self.activation_function,
dropout_rate=self.dropout_rate))
def __call__(self, input_layer):
def __call__(self, input_layer: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
"""
Wrapper for building the module graph including scoping and loss creation
:param input_layer: the input to the graph
:return: the input placeholder and the output of the last layer
"""
with tf.variable_scope(self.get_name()):
self.input = input_layer
self._build_module()
return self.input, self.output
def _build_module(self):
def _build_module(self) -> None:
"""
Builds the graph of the module
This method is called early on from __call__. It is expected to store the graph
in self.output.
:return: None
"""
pass
def get_name(self):
def get_name(self) -> str:
"""
Get a formatted name for the module
:return: the formatted name
"""
return self.name
@property

View File

@@ -154,7 +154,6 @@ class AlgorithmParameters(Parameters):
self.num_steps_between_copying_online_weights_to_target = TrainingSteps(0)
self.rate_for_copying_weights_to_target = 1.0
self.load_memory_from_file_path = None
self.collect_new_data = True
self.store_transitions_only_when_episodes_are_terminated = False
# HRL / HER related params
@@ -174,7 +173,38 @@ class AlgorithmParameters(Parameters):
class PresetValidationParameters(Parameters):
def __init__(self):
def __init__(self,
test=False,
min_reward_threshold=0,
max_episodes_to_achieve_reward=1,
num_workers=1,
reward_test_level=None,
test_using_a_trace_test=True,
trace_test_levels=None,
trace_max_env_steps=5000):
"""
:param test:
A flag which specifies if the preset should be tested as part of the validation process.
:param min_reward_threshold:
The minimum reward that the agent should pass after max_episodes_to_achieve_reward episodes when the
preset is run.
:param max_episodes_to_achieve_reward:
The maximum number of episodes that the agent should train using the preset in order to achieve the
reward specified by min_reward_threshold.
:param num_workers:
The number of workers that should be used when running this preset in the test suite for validation.
:param reward_test_level:
The environment level or levels, given by a list of strings, that should be tested as part of the
reward tests suite.
:param test_using_a_trace_test:
A flag that specifies if the preset should be run as part of the trace tests suite.
:param trace_test_levels:
The environment level or levels, given by a list of strings, that should be tested as part of the
trace tests suite.
:param trace_max_env_steps:
An integer representing the maximum number of environment steps to run when running this preset as part
of the trace tests suite.
"""
super().__init__()
# setting a seed will only work for non-parallel algorithms. Parallel algorithms add uncontrollable noise in
@@ -182,42 +212,42 @@ class PresetValidationParameters(Parameters):
# time from the OS.
# Testing parameters
self.test = False
self.min_reward_threshold = 0
self.max_episodes_to_achieve_reward = 1
self.num_workers = 1
self.reward_test_level = None
self.test_using_a_trace_test = True
self.trace_test_levels = None
self.trace_max_env_steps = 5000
self.test = test
self.min_reward_threshold = min_reward_threshold
self.max_episodes_to_achieve_reward = max_episodes_to_achieve_reward
self.num_workers = num_workers
self.reward_test_level = reward_test_level
self.test_using_a_trace_test = test_using_a_trace_test
self.trace_test_levels = trace_test_levels
self.trace_max_env_steps = trace_max_env_steps
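With the constructor now accepting these values directly, a preset can configure validation in a single call instead of mutating attributes afterwards; for example (the values here are arbitrary):

preset_validation_params = PresetValidationParameters(
    test=True,
    min_reward_threshold=150,
    max_episodes_to_achieve_reward=250,
    trace_max_env_steps=2000,
)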
class NetworkParameters(Parameters):
def __init__(self,
force_cpu = False,
async_training = False,
shared_optimizer = True,
scale_down_gradients_by_number_of_workers_for_sync_training = True,
clip_gradients = None,
gradients_clipping_method = GradientClippingMethod.ClipByGlobalNorm,
l2_regularization = 0,
learning_rate = 0.00025,
learning_rate_decay_rate = 0,
learning_rate_decay_steps = 0,
input_embedders_parameters = {},
embedding_merger_type = EmbeddingMergerType.Concat,
middleware_parameters = None,
heads_parameters = [],
use_separate_networks_per_head = False,
optimizer_type = 'Adam',
optimizer_epsilon = 0.0001,
adam_optimizer_beta1 = 0.9,
adam_optimizer_beta2 = 0.99,
rms_prop_optimizer_decay = 0.9,
batch_size = 32,
replace_mse_with_huber_loss = False,
create_target_network = False,
tensorflow_support = True):
force_cpu=False,
async_training=False,
shared_optimizer=True,
scale_down_gradients_by_number_of_workers_for_sync_training=True,
clip_gradients=None,
gradients_clipping_method=GradientClippingMethod.ClipByGlobalNorm,
l2_regularization=0,
learning_rate=0.00025,
learning_rate_decay_rate=0,
learning_rate_decay_steps=0,
input_embedders_parameters={},
embedding_merger_type=EmbeddingMergerType.Concat,
middleware_parameters=None,
heads_parameters=[],
use_separate_networks_per_head=False,
optimizer_type='Adam',
optimizer_epsilon=0.0001,
adam_optimizer_beta1=0.9,
adam_optimizer_beta2=0.99,
rms_prop_optimizer_decay=0.9,
batch_size=32,
replace_mse_with_huber_loss=False,
create_target_network=False,
tensorflow_support=True):
"""
:param force_cpu:
Force the neural networks to run on the CPU even if a GPU is available
@@ -240,63 +270,106 @@ class NetworkParameters(Parameters):
gradients of the network. This will only be used if the clip_gradients value is defined as a value other
than None.
:param l2_regularization:
An L2 regularization weight that will be applied to the network weights while calculating the loss function
:param learning_rate:
The learning rate for the network
:param learning_rate_decay_rate:
If this value is larger than 0, an exponential decay will be applied to the network learning rate.
The rate of the decay is defined by this parameter, and the number of training steps the decay will be
applied is defined by learning_rate_decay_steps. Notice that both parameters should be defined in order
for this to work correctly.
:param learning_rate_decay_steps:
If the learning_rate_decay_rate of the network is larger than 0, an exponential decay will be applied to
the network learning rate. The number of steps the decay will be applied is defined by this parameter.
Notice that both this parameter, as well as learning_rate_decay_rate should be defined in order for the
learning rate decay to work correctly.
:param input_embedders_parameters:
A dictionary mapping between input names and input embedders (InputEmbedderParameters) to use for the
network. Each of the keys is an input name as returned from the environment in the state.
For example, if the environment returns a state containing 'observation' and 'measurements', then
the keys for the input embedders dictionary can be either 'observation' to use the observation as input,
'measurements' to use the measurements as input, or both.
The embedder type will be automatically selected according to the input type. Vector inputs will
produce a fully connected embedder, and image inputs will produce a convolutional embedder.
:param embedding_merger_type:
The type of embedding merging to use, given by one of the EmbeddingMergerType enum values.
This will be used to merge the outputs of all the input embedders into a single embedding.
:param middleware_parameters:
The parameters of the middleware to use, given by a MiddlewareParameters object.
Each network will have only a single middleware embedder which will take the merged embeddings from the
input embedders and pass them through more neural network layers.
:param heads_parameters:
A list of heads for the network given by their corresponding HeadParameters.
Each network can have one or multiple network heads, where each one will take the output of the middleware
and make some additional computation on top of it. Additionally, each head calculates a weighted loss value,
and the loss values from all the heads will be summed later on.
:param use_separate_networks_per_head:
A flag that allows using different copies of the input embedders and middleware for each one of the heads.
Regularly, the heads will have a shared input, but in the case where use_separate_networks_per_head is set
to True, each one of the heads will get a different input.
:param optimizer_type:
A string specifying the optimizer type to use for updating the network. The available optimizers are
Adam, RMSProp and LBFGS.
:param optimizer_epsilon:
An internal optimizer parameter used for Adam and RMSProp.
:param adam_optimizer_beta1:
A beta1 internal optimizer parameter used for Adam. It will be used only if Adam was selected as the
optimizer for the network.
:param adam_optimizer_beta2:
A beta2 internal optimizer parameter used for Adam. It will be used only if Adam was selected as the
optimizer for the network.
:param rms_prop_optimizer_decay:
The decay value for the RMSProp optimizer, which will be used only in case the RMSProp optimizer was
selected for this network.
:param batch_size:
The batch size to use when updating the network.
:param replace_mse_with_huber_loss:
If set to True, a Huber loss will be used instead of the MSE loss when calculating the network loss.
:param create_target_network:
If this flag is set to True, an additional copy of the network will be created and initialized with the
same weights as the online network. It can then be queried, and its weights can be synced from the
online network at will.
:param tensorflow_support:
A flag which specifies if the network is supported by the TensorFlow framework.
"""
super().__init__()
self.framework = Frameworks.tensorflow
self.sess = None
# hardware parameters
self.force_cpu = False
self.force_cpu = force_cpu
# distributed training options
self.async_training = False
self.shared_optimizer = True
self.scale_down_gradients_by_number_of_workers_for_sync_training = True
self.async_training = async_training
self.shared_optimizer = shared_optimizer
self.scale_down_gradients_by_number_of_workers_for_sync_training = scale_down_gradients_by_number_of_workers_for_sync_training
# regularization
self.clip_gradients = None
self.gradients_clipping_method = GradientClippingMethod.ClipByGlobalNorm
self.l2_regularization = 0
self.clip_gradients = clip_gradients
self.gradients_clipping_method = gradients_clipping_method
self.l2_regularization = l2_regularization
# learning rate
self.learning_rate = 0.00025
self.learning_rate_decay_rate = 0
self.learning_rate_decay_steps = 0
self.learning_rate = learning_rate
self.learning_rate_decay_rate = learning_rate_decay_rate
self.learning_rate_decay_steps = learning_rate_decay_steps
# structure
self.input_embedders_parameters = {}
self.embedding_merger_type = EmbeddingMergerType.Concat
self.middleware_parameters = None
self.heads_parameters = []
self.use_separate_networks_per_head = False
self.optimizer_type = 'Adam'
self.optimizer_epsilon = 0.0001
self.adam_optimizer_beta1 = 0.9
self.adam_optimizer_beta2 = 0.99
self.rms_prop_optimizer_decay = 0.9
self.batch_size = 32
self.replace_mse_with_huber_loss = False
self.create_target_network = False
self.input_embedders_parameters = input_embedders_parameters
self.embedding_merger_type = embedding_merger_type
self.middleware_parameters = middleware_parameters
self.heads_parameters = heads_parameters
self.use_separate_networks_per_head = use_separate_networks_per_head
self.optimizer_type = optimizer_type
self.optimizer_epsilon = optimizer_epsilon
self.adam_optimizer_beta1 = adam_optimizer_beta1
self.adam_optimizer_beta2 = adam_optimizer_beta2
self.rms_prop_optimizer_decay = rms_prop_optimizer_decay
self.batch_size = batch_size
self.replace_mse_with_huber_loss = replace_mse_with_huber_loss
self.create_target_network = create_target_network
# Framework support
self.tensorflow_support = True
self.tensorflow_support = tensorflow_support
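Similarly, NetworkParameters (or one of its subclasses) can now be customized at construction time; a short example using the keyword arguments documented above, with arbitrary values:

network_params = NetworkParameters(
    learning_rate=0.0001,
    batch_size=64,
    clip_gradients=40.0,
    replace_mse_with_huber_loss=True,
    create_target_network=True,
)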
class NetworkComponentParameters(Parameters):

View File

@@ -83,91 +83,91 @@ def start_graph(graph_manager: 'GraphManager', task_parameters: 'TaskParameters'
def handle_distributed_coach_tasks(graph_manager, args):
    ckpt_inside_container = "/checkpoint"

    memory_backend_params = None
    if args.memory_backend_params:
        memory_backend_params = json.loads(args.memory_backend_params)
        memory_backend_params['run_type'] = str(args.distributed_coach_run_type)
        graph_manager.agent_params.memory.register_var('memory_backend_params', construct_memory_params(memory_backend_params))

    data_store_params = None
    if args.data_store_params:
        data_store_params = construct_data_store_params(json.loads(args.data_store_params))
        data_store_params.checkpoint_dir = ckpt_inside_container
        graph_manager.data_store_params = data_store_params

    if args.distributed_coach_run_type == RunType.TRAINER:
        training_worker(
            graph_manager=graph_manager,
            checkpoint_dir=ckpt_inside_container
        )

    if args.distributed_coach_run_type == RunType.ROLLOUT_WORKER:
        data_store = None
        if args.data_store_params:
            data_store = get_data_store(data_store_params)

        wait_for_checkpoint(checkpoint_dir=ckpt_inside_container, data_store=data_store)

        rollout_worker(
            graph_manager=graph_manager,
            checkpoint_dir=ckpt_inside_container,
            data_store=data_store,
            num_workers=args.num_workers
        )
def handle_distributed_coach_orchestrator(graph_manager, args):
    ckpt_inside_container = "/checkpoint"
    rollout_command = ['python3', 'rl_coach/coach.py', '--distributed_coach_run_type', str(RunType.ROLLOUT_WORKER)] + sys.argv[1:]
    trainer_command = ['python3', 'rl_coach/coach.py', '--distributed_coach_run_type', str(RunType.TRAINER)] + sys.argv[1:]

    if '--experiment_name' not in rollout_command:
        rollout_command = rollout_command + ['--experiment_name', args.experiment_name]

    if '--experiment_name' not in trainer_command:
        trainer_command = trainer_command + ['--experiment_name', args.experiment_name]

    memory_backend_params = None
    if args.memory_backend == "redispubsub":
        memory_backend_params = RedisPubSubMemoryBackendParameters()

    ds_params_instance = None
    if args.data_store == "s3":
        ds_params = DataStoreParameters("s3", "", "")
        ds_params_instance = S3DataStoreParameters(ds_params=ds_params, end_point=args.s3_end_point, bucket_name=args.s3_bucket_name,
                                                   creds_file=args.s3_creds_file, checkpoint_dir=ckpt_inside_container)

    worker_run_type_params = RunTypeParameters(args.image, rollout_command, run_type=str(RunType.ROLLOUT_WORKER), num_replicas=args.num_workers)
    trainer_run_type_params = RunTypeParameters(args.image, trainer_command, run_type=str(RunType.TRAINER))

    orchestration_params = KubernetesParameters([worker_run_type_params, trainer_run_type_params],
                                                kubeconfig='~/.kube/config',
                                                memory_backend_parameters=memory_backend_params,
                                                data_store_params=ds_params_instance)
    orchestrator = Kubernetes(orchestration_params)
    if not orchestrator.setup():
        print("Could not setup.")
        return

    if orchestrator.deploy_trainer():
        print("Successfully deployed trainer.")
    else:
        print("Could not deploy trainer.")
        return

    if orchestrator.deploy_worker():
        print("Successfully deployed rollout worker(s).")
    else:
        print("Could not deploy rollout worker(s).")
        return

    try:
        orchestrator.trainer_logs()
    except KeyboardInterrupt:
        pass

    orchestrator.undeploy()
class CoachLauncher(object):
@@ -192,7 +192,6 @@ class CoachLauncher(object):
graph_manager = self.get_graph_manager_from_args(args)
self.run_graph_manager(graph_manager, args)
def get_graph_manager_from_args(self, args: argparse.Namespace) -> 'GraphManager':
"""
Return the graph manager according to the command line arguments given by the user.
@@ -251,7 +250,6 @@ class CoachLauncher(object):
return graph_manager
def display_all_presets_and_exit(self):
# list available presets
screen.log_title("Available Presets:")
@@ -259,7 +257,6 @@ class CoachLauncher(object):
print(preset)
sys.exit(0)
def expand_preset(self, preset):
"""
Replace a short preset name with the full python path, and verify that it can be imported.
@@ -287,7 +284,6 @@ class CoachLauncher(object):
return preset
def get_config_args(self, parser: argparse.ArgumentParser) -> argparse.Namespace:
"""
Returns a Namespace object with all the user-specified configuration options needed to launch.
@@ -317,7 +313,6 @@ class CoachLauncher(object):
if args.list:
self.display_all_presets_and_exit()
# Read args from config file for distributed Coach.
if args.distributed_coach and args.distributed_coach_run_type == RunType.ORCHESTRATOR:
coach_config = ConfigParser({
@@ -401,7 +396,6 @@ class CoachLauncher(object):
return args
def get_argument_parser(self) -> argparse.ArgumentParser:
"""
This returns an ArgumentParser object which defines the set of options that customers are expected to supply in order
@@ -545,7 +539,6 @@ class CoachLauncher(object):
return parser
def run_graph_manager(self, graph_manager: 'GraphManager', args: argparse.Namespace):
if args.distributed_coach and not graph_manager.agent_params.algorithm.distributed_coach_synchronization_type:
screen.error("{} algorithm is not supported using distributed Coach.".format(graph_manager.agent_params.algorithm))
@@ -581,7 +574,6 @@ class CoachLauncher(object):
else:
self.start_multi_threaded(graph_manager, args)
def start_single_threaded(self, graph_manager: 'GraphManager', args: argparse.Namespace):
# Start the training or evaluation
task_parameters = TaskParameters(
@@ -598,7 +590,6 @@ class CoachLauncher(object):
start_graph(graph_manager=graph_manager, task_parameters=task_parameters)
def start_multi_threaded(self, graph_manager: 'GraphManager', args: argparse.Namespace):
total_tasks = args.num_workers
if args.evaluation_worker:

View File

@@ -260,6 +260,7 @@ class EnvResponse(object):
"""
An env response is a collection containing the information returned by the environment after a single action
has been performed on it.
:param next_state: The new state that the environment has transitioned into. Assumed to be a dictionary where the
observation is located at state['observation']
:param reward: The reward received from the environment
@@ -350,11 +351,13 @@ class ActionInfo(object):
class Batch(object):
"""
A wrapper around a list of transitions that helps extracting batches of parameters from it.
For example, one can extract a list of states corresponding to the list of transitions.
The class uses lazy evaluation in order to return each of the available parameters.
"""
def __init__(self, transitions: List[Transition]):
"""
A wrapper around a list of transitions that helps extracting batches of parameters from it.
For example, one can extract a list of states corresponding to the list of transitions.
The class uses lazy evaluation in order to return each of the available parameters.
:param transitions: a list of transitions to extract the batch from
"""
self.transitions = transitions
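As an illustration of the lazy batch extraction described above, here is a short sketch (not part of this diff), assuming the Transition constructor accepts state/action/reward/next_state/game_over keyword arguments:

import numpy as np
from rl_coach.core_types import Transition, Batch

transitions = [Transition(state={'observation': np.random.rand(4)}, action=0, reward=1.0,
                          next_state={'observation': np.random.rand(4)}, game_over=False)
               for _ in range(8)]
batch = Batch(transitions)
states = batch.states(['observation'])     # dict built lazily: {'observation': array of shape (8, 4)}
actions = batch.actions()                  # numpy array of shape (8,)
rewards = batch.rewards(expand_dims=True)  # numpy array with an extra dimension, shape (8, 1)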
@@ -370,6 +373,7 @@ class Batch(object):
def slice(self, start, end) -> None:
"""
Keep a slice from the batch and discard the rest of the batch
:param start: the start index in the slice
:param end: the end index in the slice
:return: None
@@ -396,6 +400,7 @@ class Batch(object):
def shuffle(self) -> None:
"""
Shuffle all the transitions in the batch
:return: None
"""
batch_order = list(range(self.size))
@@ -432,6 +437,7 @@ class Batch(object):
"""
follow the keys in fetches to extract the corresponding items from the states in the batch
if these keys were not already extracted before. return only the values corresponding to those keys
:param fetches: the keys of the state dictionary to extract
:param expand_dims: add an extra dimension to each of the value batches
:return: a dictionary containing a batch of values corresponding to each of the given fetches keys
@@ -452,6 +458,7 @@ class Batch(object):
def actions(self, expand_dims=False) -> np.ndarray:
"""
if the actions were not converted to a batch before, extract them to a batch and then return the batch
:param expand_dims: add an extra dimension to the actions batch
:return: a numpy array containing all the actions of the batch
"""
@@ -464,6 +471,7 @@ class Batch(object):
def rewards(self, expand_dims=False) -> np.ndarray:
"""
if the rewards were not converted to a batch before, extract them to a batch and then return the batch
:param expand_dims: add an extra dimension to the rewards batch
:return: a numpy array containing all the rewards of the batch
"""
@@ -491,6 +499,7 @@ class Batch(object):
def game_overs(self, expand_dims=False) -> np.ndarray:
"""
if the game_overs were not converted to a batch before, extract them to a batch and then return the batch
:param expand_dims: add an extra dimension to the game_overs batch
:return: a numpy array containing all the game over flags of the batch
"""
@@ -504,6 +513,7 @@ class Batch(object):
"""
follow the keys in fetches to extract the corresponding items from the next states in the batch
if these keys were not already extracted before. return only the values corresponding to those keys
:param fetches: the keys of the state dictionary to extract
:param expand_dims: add an extra dimension to each of the value batches
:return: a dictionary containing a batch of values corresponding to each of the given fetches keys
@@ -526,6 +536,7 @@ class Batch(object):
"""
if the goals were not converted to a batch before, extract them to a batch and then return the batch
if the goal was not filled, this will raise an exception
:param expand_dims: add an extra dimension to the goals batch
:return: a numpy array containing all the goals of the batch
"""
@@ -549,6 +560,7 @@ class Batch(object):
"""
if the given info dictionary key was not converted to a batch before, extract it to a batch and then return the
batch. if the key is not part of the keys in the info dictionary, this will raise an exception
:param expand_dims: add an extra dimension to the info batch
:return: a numpy array containing all the info values of the batch corresponding to the given key
"""
@@ -568,6 +580,7 @@ class Batch(object):
def __getitem__(self, key):
"""
get an item from the transitions list
:param key: index of the transition in the batch
:return: the transition corresponding to the given index
"""
@@ -576,6 +589,7 @@ class Batch(object):
def __setitem__(self, key, item):
"""
set an item in the transition list
:param key: index of the transition in the batch
:param item: the transition to place in the given index
:return: None
@@ -598,6 +612,7 @@ class TotalStepsCounter(object):
def __getitem__(self, key: Type[StepMethod]) -> int:
"""
get counter value
:param key: counter type
:return: the counter value
"""
@@ -606,6 +621,7 @@ class TotalStepsCounter(object):
def __setitem__(self, key: StepMethod, item: int) -> None:
"""
set a counter value
:param key: counter type
:param item: an integer representing the new counter value
:return: None
@@ -626,6 +642,9 @@ class GradientClippingMethod(Enum):
class Episode(object):
"""
An Episode represents a set of sequential transitions, that end with a terminal state.
"""
def __init__(self, discount: float=0.99, bootstrap_total_return_from_old_policy: bool=False, n_step: int=-1):
"""
:param discount: the discount factor to use when calculating total returns
@@ -634,38 +653,78 @@ class Episode(object):
:param n_step: the number of future steps to sum the reward over before bootstrapping
"""
self.transitions = []
# a num_transitions x num_transitions table with the n step return in the n'th row
self._length = 0
self.discount = discount
self.bootstrap_total_return_from_old_policy = bootstrap_total_return_from_old_policy
self.n_step = n_step
self.is_complete = False
def insert(self, transition):
def insert(self, transition: Transition) -> None:
"""
Insert a new transition to the episode. If the game_over flag in the transition is set to True,
the episode will be marked as complete.
:param transition: The new transition to insert to the episode
:return: None
"""
self.transitions.append(transition)
self._length += 1
if transition.game_over:
self.is_complete = True
def is_empty(self):
def is_empty(self) -> bool:
"""
Check if the episode is empty
:return: A boolean value determining if the episode is empty or not
"""
return self.length() == 0
def length(self):
def length(self) -> int:
"""
Return the length of the episode, which is the number of transitions it holds.
:return: The number of transitions in the episode
"""
return self._length
def __len__(self):
return self.length()
def get_transition(self, transition_idx):
def get_transition(self, transition_idx: int) -> Transition:
"""
Get a specific transition by its index.
:param transition_idx: The index of the transition to get
:return: The transition which is stored in the given index
"""
return self.transitions[transition_idx]
def get_last_transition(self):
def get_last_transition(self) -> Transition:
"""
Get the last transition in the episode, or None if there are no transitions available
:return: The last transition in the episode
"""
return self.get_transition(-1) if self.length() > 0 else None
def get_first_transition(self):
def get_first_transition(self) -> Transition:
"""
Get the first transition in the episode, or None if there are no transitions available
:return: The first transition in the episode
"""
return self.get_transition(0) if self.length() > 0 else None
def update_discounted_rewards(self):
"""
Update the discounted returns for all the transitions in the episode.
The returns will be calculated according to the rewards of each transition, together with the number of steps
to bootstrap from and the discount factor, as defined by n_step and discount respectively when initializing
the episode.
:return: None
"""
if self.n_step == -1 or self.n_step > self.length():
curr_n_step = self.length()
else:
@@ -708,15 +767,17 @@ class Episode(object):
self.update_discounted_rewards()
def update_actions_probabilities(self):
probability_product = 1
for transition_idx, transition in enumerate(self.transitions):
if 'action_probabilities' in transition.info.keys():
probability_product *= transition.info['action_probabilities']
for transition_idx, transition in enumerate(self.transitions):
transition.info['probability_product'] = probability_product
def get_transitions_attribute(self, attribute_name):
def get_transitions_attribute(self, attribute_name: str) -> List[Any]:
"""
Get the values for some transition attribute from all the transitions in the episode.
For example, this allows getting the rewards for all the transitions as a list by calling
get_transitions_attribute('reward')
:param attribute_name: The name of the attribute to extract from all the transitions
:return: A list of values from all the transitions according to the attribute given in attribute_name
"""
if len(self.transitions) > 0 and hasattr(self.transitions[0], attribute_name):
return [getattr(t, attribute_name) for t in self.transitions]
elif len(self.transitions) == 0:
@@ -724,12 +785,6 @@ class Episode(object):
else:
raise ValueError("The transitions have no such attribute name")
def to_batch(self):
batch = []
for i in range(self.length()):
batch.append(self.get_transition(i))
return batch
def __getitem__(self, sliced):
return self.transitions[sliced]
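A short, illustrative sketch (not part of this diff) of the Episode API documented above, including the n-step discounted return calculation; it assumes the same Transition keyword arguments used elsewhere in core_types:

import numpy as np
from rl_coach.core_types import Transition, Episode

episode = Episode(discount=0.99, n_step=4)
for t in range(10):
    episode.insert(Transition(state={'observation': np.zeros(4)}, action=0, reward=1.0,
                              next_state={'observation': np.zeros(4)}, game_over=(t == 9)))
assert episode.is_complete and episode.length() == 10
episode.update_discounted_rewards()                    # fills the n-step discounted total returns
rewards = episode.get_transitions_attribute('reward')  # [1.0, 1.0, ..., 1.0]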

View File

@@ -69,6 +69,38 @@ class ControlSuiteEnvironment(Environment):
target_success_rate: float=1.0, seed: Union[None, int]=None, human_control: bool=False,
observation_type: ObservationType=ObservationType.Measurements,
custom_reward_threshold: Union[int, float]=None, **kwargs):
"""
:param level: (str)
A string representing the control suite level to run. This can also be a LevelSelection object.
For example, cartpole:swingup.
:param frame_skip: (int)
The number of frames to skip between any two actions given by the agent. The action will be repeated
for all the skipped frames.
:param visualization_parameters: (VisualizationParameters)
The parameters used for visualizing the environment, such as the render flag, storing videos etc.
:param target_success_rate: (float)
Stop experiment if given target success rate was achieved.
:param seed: (int)
A seed to use for the random number generator when running the environment.
:param human_control: (bool)
A flag that allows controlling the environment using the keyboard keys.
:param observation_type: (ObservationType)
An enum which defines which observation to use. The current options are to use:
* Measurements only - a vector of joint torques and similar measurements
* Image only - an image of the environment as seen by a camera attached to the simulator
* Measurements & Image - both type of observations will be returned in the state using the keys
'measurements' and 'pixels' respectively.
:param custom_reward_threshold: (float)
Allows defining a custom reward that will be used to decide when the agent succeeded in passing the environment.
"""
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters, target_success_rate)
self.observation_type = observation_type

View File

@@ -125,6 +125,36 @@ class DoomEnvironment(Environment):
def __init__(self, level: LevelSelection, seed: int, frame_skip: int, human_control: bool,
custom_reward_threshold: Union[int, float], visualization_parameters: VisualizationParameters,
cameras: List[CameraTypes], target_success_rate: float=1.0, **kwargs):
"""
:param level: (str)
A string representing the doom level to run. This can also be a LevelSelection object.
This should be one of the levels defined in the DoomLevel enum. For example, HEALTH_GATHERING.
:param seed: (int)
A seed to use for the random number generator when running the environment.
:param frame_skip: (int)
The number of frames to skip between any two actions given by the agent. The action will be repeated
for all the skipped frames.
:param human_control: (bool)
A flag that allows controlling the environment using the keyboard keys.
:param custom_reward_threshold: (float)
Allows defining a custom reward that will be used to decide when the agent succeeded in passing the environment.
:param visualization_parameters: (VisualizationParameters)
The parameters used for visualizing the environment, such as the render flag, storing videos etc.
:param cameras: (List[CameraTypes])
A list of camera types to use as observation in the state returned from the environment.
Each camera should be an enum from CameraTypes, and there are several options like an RGB observation,
a depth map, a segmentation map, and a top down map of the environment.
:param target_success_rate: (float)
Stop experiment if given target success rate was achieved.
"""
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters, target_success_rate)
self.cameras = cameras

View File

@@ -176,6 +176,7 @@ class Environment(EnvironmentInterface):
def action_space(self) -> Union[List[ActionSpace], ActionSpace]:
"""
Get the action space of the environment
:return: the action space
"""
return self._action_space
@@ -184,6 +185,7 @@ class Environment(EnvironmentInterface):
def action_space(self, val: Union[List[ActionSpace], ActionSpace]):
"""
Set the action space of the environment
:return: None
"""
self._action_space = val
@@ -192,6 +194,7 @@ class Environment(EnvironmentInterface):
def state_space(self) -> Union[List[StateSpace], StateSpace]:
"""
Get the state space of the environment
:return: the observation space
"""
return self._state_space
@@ -200,6 +203,7 @@ class Environment(EnvironmentInterface):
def state_space(self, val: Union[List[StateSpace], StateSpace]):
"""
Set the state space of the environment
:return: None
"""
self._state_space = val
@@ -208,6 +212,7 @@ class Environment(EnvironmentInterface):
def goal_space(self) -> Union[List[ObservationSpace], ObservationSpace]:
"""
Get the state space of the environment
:return: the observation space
"""
return self._goal_space
@@ -216,6 +221,7 @@ class Environment(EnvironmentInterface):
def goal_space(self, val: Union[List[ObservationSpace], ObservationSpace]):
"""
Set the goal space of the environment
:return: None
"""
self._goal_space = val
@@ -223,6 +229,7 @@ class Environment(EnvironmentInterface):
def get_action_from_user(self) -> ActionType:
"""
Get an action from the user keyboard
:return: action index
"""
if self.wait_for_explicit_human_action:
@@ -250,6 +257,7 @@ class Environment(EnvironmentInterface):
def last_env_response(self) -> Union[List[EnvResponse], EnvResponse]:
"""
Get the last environment response
:return: a dictionary that contains the state, reward, etc.
"""
return squeeze_list(self._last_env_response)
@@ -258,6 +266,7 @@ class Environment(EnvironmentInterface):
def last_env_response(self, val: Union[List[EnvResponse], EnvResponse]):
"""
Set the last environment response
:param val: the last environment response
"""
self._last_env_response = force_list(val)
@@ -265,6 +274,7 @@ class Environment(EnvironmentInterface):
def step(self, action: ActionType) -> EnvResponse:
"""
Make a single step in the environment using the given action
:param action: an action to use for stepping the environment. Should follow the definition of the action space.
:return: the environment response as returned in get_last_env_response
"""
@@ -317,6 +327,8 @@ class Environment(EnvironmentInterface):
def render(self) -> None:
"""
Call the environment function for rendering to the screen
:return: None
"""
if self.native_rendering:
self._render()
@@ -326,6 +338,7 @@ class Environment(EnvironmentInterface):
def handle_episode_ended(self) -> None:
"""
End an episode
:return: None
"""
self.dump_video_of_last_episode_if_needed()
@@ -333,6 +346,7 @@ class Environment(EnvironmentInterface):
def reset_internal_state(self, force_environment_reset=False) -> EnvResponse:
"""
Reset the environment and all the variable of the wrapper
:param force_environment_reset: forces environment reset even when the game did not end
:return: A dictionary containing the observation, reward, done flag, action and measurements
"""
@@ -368,6 +382,7 @@ class Environment(EnvironmentInterface):
def get_random_action(self) -> ActionType:
"""
Returns an action picked uniformly from the available actions
:return: a numpy array with a random action
"""
return self.action_space.sample()
@@ -375,6 +390,7 @@ class Environment(EnvironmentInterface):
def get_available_keys(self) -> List[Tuple[str, ActionType]]:
"""
Return a list of tuples mapping between action names and the keyboard key that triggers them
:return: a list of tuples mapping between action names and the keyboard key that triggers them
"""
available_keys = []
@@ -391,6 +407,7 @@ class Environment(EnvironmentInterface):
def get_goal(self) -> GoalType:
"""
Get the current goal that the agents needs to achieve in the environment
:return: The goal
"""
return self.goal
@@ -398,6 +415,7 @@ class Environment(EnvironmentInterface):
def set_goal(self, goal: GoalType) -> None:
"""
Set the current goal that the agent needs to achieve in the environment
:param goal: the goal that needs to be achieved
:return: None
"""
@@ -424,14 +442,6 @@ class Environment(EnvironmentInterface):
if self.visualization_parameters.dump_mp4:
logger.create_mp4(self.last_episode_images[::frame_skipping], name=file_name, fps=fps)
def log_to_screen(self):
# log to screen
log = OrderedDict()
log["Episode"] = self.episode_idx
log["Total reward"] = np.round(self.total_reward_in_current_episode, 2)
log["Steps"] = self.total_steps_counter
screen.log_dict(log, prefix=self.phase.value)
# The following functions define the interaction with the environment.
# Any new environment that inherits the Environment class should use these signatures.
# Some of these functions are optional - please read their description for more details.
@@ -439,6 +449,7 @@ class Environment(EnvironmentInterface):
def _take_action(self, action_idx: ActionType) -> None:
"""
An environment dependent function that sends an action to the simulator.
:param action_idx: the action to perform on the environment
:return: None
"""
@@ -448,6 +459,7 @@ class Environment(EnvironmentInterface):
"""
Updates the state from the environment.
Should update self.observation, self.reward, self.done, self.measurements and self.info
:return: None
"""
raise NotImplementedError("")
@@ -455,6 +467,7 @@ class Environment(EnvironmentInterface):
def _restart_environment_episode(self, force_environment_reset=False) -> None:
"""
Restarts the simulator episode
:param force_environment_reset: Force the environment to reset even if the episode is not done yet.
:return: None
"""
@@ -463,6 +476,7 @@ class Environment(EnvironmentInterface):
def _render(self) -> None:
"""
Renders the environment using the native simulator renderer
:return: None
"""
pass
@@ -471,6 +485,7 @@ class Environment(EnvironmentInterface):
"""
Return a numpy array containing the image that will be rendered to the screen.
This can be different from the observation. For example, mujoco's observation is a measurements vector.
:return: numpy array containing the image that will be rendered to the screen
"""
return np.transpose(self.state['observation'], [1, 2, 0])

View File

@@ -140,7 +140,7 @@ atari_schedule = ScheduleParameters()
atari_schedule.improve_steps = EnvironmentSteps(50000000)
atari_schedule.steps_between_evaluation_periods = EnvironmentSteps(250000)
atari_schedule.evaluation_steps = EnvironmentSteps(135000)
atari_schedule.heatup_steps = EnvironmentSteps(50000)
atari_schedule.heatup_steps = EnvironmentSteps(1)
class MaxOverFramesAndFrameskipEnvWrapper(gym.Wrapper):
@@ -181,6 +181,41 @@ class GymEnvironment(Environment):
target_success_rate: float=1.0, additional_simulator_parameters: Dict[str, Any] = {}, seed: Union[None, int]=None,
human_control: bool=False, custom_reward_threshold: Union[int, float]=None,
random_initialization_steps: int=1, max_over_num_frames: int=1, **kwargs):
"""
:param level: (str)
A string representing the gym level to run. This can also be a LevelSelection object.
For example, BreakoutDeterministic-v0
:param frame_skip: (int)
The number of frames to skip between any two actions given by the agent. The action will be repeated
for all the skipped frames.
:param visualization_parameters: (VisualizationParameters)
The parameters used for visualizing the environment, such as the render flag, storing videos etc.
:param additional_simulator_parameters: (Dict[str, Any])
Any additional parameters that the user can pass to the Gym environment. These parameters should be
accepted by the __init__ function of the implemented Gym environment.
:param seed: (int)
A seed to use for the random number generator when running the environment.
:param human_control: (bool)
A flag that allows controlling the environment using the keyboard keys.
:param custom_reward_threshold: (float)
Allows defining a custom reward that will be used to decide when the agent succeeded in passing the environment.
If not set, this value will be taken from the Gym environment definition.
:param random_initialization_steps: (int)
The number of random steps that will be taken in the environment after each reset.
This is a feature presented in the DQN paper, which improves the variability of the episodes the agent sees.
:param max_over_num_frames: (int)
This value will be used for merging multiple frames into a single frame by taking the maximum value for each
of the pixels in the frame. This is particularly used in Atari games, where the frames flicker, and objects
can be seen in one frame but disappear in the next.
"""
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold,
visualization_parameters, target_success_rate)
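For reference, a minimal sketch (not part of this diff) of how these parameters are typically set in a preset, assuming the GymVectorEnvironment parameters helper from gym_environment.py:

from rl_coach.environments.gym_environment import GymVectorEnvironment

env_params = GymVectorEnvironment(level='CartPole-v0')
env_params.frame_skip = 1   # act on every frame
env_params.seed = 42        # fix the environment's random number generator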

View File

@@ -13,3 +13,43 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from .additive_noise import AdditiveNoiseParameters, AdditiveNoise
from .boltzmann import BoltzmannParameters, Boltzmann
from .bootstrapped import BootstrappedParameters, Bootstrapped
from .categorical import CategoricalParameters, Categorical
from .continuous_entropy import ContinuousEntropyParameters, ContinuousEntropy
from .e_greedy import EGreedyParameters, EGreedy
from .exploration_policy import ExplorationParameters, ExplorationPolicy
from .greedy import GreedyParameters, Greedy
from .ou_process import OUProcessParameters, OUProcess
from .parameter_noise import ParameterNoiseParameters, ParameterNoise
from .truncated_normal import TruncatedNormalParameters, TruncatedNormal
from .ucb import UCBParameters, UCB
__all__ = [
'AdditiveNoiseParameters',
'AdditiveNoise',
'BoltzmannParameters',
'Boltzmann',
'BootstrappedParameters',
'Bootstrapped',
'CategoricalParameters',
'Categorical',
'ContinuousEntropyParameters',
'ContinuousEntropy',
'EGreedyParameters',
'EGreedy',
'ExplorationParameters',
'ExplorationPolicy',
'GreedyParameters',
'Greedy',
'OUProcessParameters',
'OUProcess',
'ParameterNoiseParameters',
'ParameterNoise',
'TruncatedNormalParameters',
'TruncatedNormal',
'UCBParameters',
'UCB'
]

View File

@@ -37,6 +37,14 @@ class AdditiveNoiseParameters(ExplorationParameters):
class AdditiveNoise(ExplorationPolicy):
"""
AdditiveNoise is an exploration policy intended for continuous action spaces. It takes the action from the agent
and adds Gaussian distributed noise to it. The amount of noise added to the action can be given in two different ways:
1. Specified by the user as a noise schedule which is taken in percentiles out of the action space size
2. Specified by the agent's action. In case the agent's action is a list with 2 values, the 1st is assumed to
be the mean of the action, and the 2nd is assumed to be its standard deviation.
"""
def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
evaluation_noise_percentage: float):
"""

View File

@@ -36,6 +36,12 @@ class BoltzmannParameters(ExplorationParameters):
class Boltzmann(ExplorationPolicy):
"""
The Boltzmann exploration policy is intended for discrete action spaces. It assumes that each of the possible
actions has some value assigned to it (such as the Q value), and uses a softmax function to convert these values
into a distribution over the actions. It then samples the action for playing out of the calculated distribution.
An additional temperature schedule can be given by the user, and will control the steepness of the softmax function.
"""
def __init__(self, action_space: ActionSpace, temperature_schedule: Schedule):
"""
:param action_space: the action space used by the environment
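The softmax sampling described above can be written out as a small illustrative numpy sketch (not part of this diff), where the temperature controls the steepness of the resulting distribution:

import numpy as np

def boltzmann_sample(action_values: np.ndarray, temperature: float) -> int:
    # subtract the max for numerical stability before exponentiating
    exp_values = np.exp((action_values - np.max(action_values)) / temperature)
    probabilities = exp_values / np.sum(exp_values)
    return int(np.random.choice(len(action_values), p=probabilities))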

View File

@@ -39,6 +39,17 @@ class BootstrappedParameters(EGreedyParameters):
class Bootstrapped(EGreedy):
"""
Bootstrapped exploration policy is currently only used for discrete action spaces along with the
Bootstrapped DQN agent. It assumes that there is an ensemble of network heads, where each one predicts the
values for all the possible actions. For each episode, a single head is selected to lead the agent, according
to its value predictions. In evaluation, the action is selected using a majority vote over all the heads'
predictions.
.. note::
This exploration policy will only work for Discrete action spaces with Bootstrapped DQN style agents,
since it requires the agent to have a network with multiple heads.
"""
def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
architecture_num_q_heads: int,
continuous_exploration_policy_parameters: ExplorationParameters = AdditiveNoiseParameters(),):

View File

@@ -30,6 +30,12 @@ class CategoricalParameters(ExplorationParameters):
class Categorical(ExplorationPolicy):
"""
Categorical exploration policy is intended for discrete action spaces. It expects the action values to
represent a probability distribution over the action, from which a single action will be sampled.
In evaluation, the action that has the highest probability will be selected. This is particularly useful for
actor-critic schemes, where the actor's output is a probability distribution over the actions.
"""
def __init__(self, action_space: ActionSpace):
"""
:param action_space: the action space used by the environment

View File

@@ -24,4 +24,15 @@ class ContinuousEntropyParameters(AdditiveNoiseParameters):
class ContinuousEntropy(AdditiveNoise):
"""
Continuous entropy is an exploration policy that is actually implemented as part of the network.
The exploration policy class is only a placeholder for choosing this policy. The exploration policy is
implemented by adding a regularization factor to the network loss, which regularizes the entropy of the action.
This exploration policy is only intended for continuous action spaces, and assumes that the entire calculation
is implemented as part of the head.
.. warning::
This exploration policy expects the agent or the network to implement the exploration functionality.
There are only a few heads that actually are relevant and implement the entropy regularization factor.
"""
pass

View File

@@ -43,6 +43,19 @@ class EGreedyParameters(ExplorationParameters):
class EGreedy(ExplorationPolicy):
"""
e-greedy is an exploration policy that is intended for both discrete and continuous action spaces.
For discrete action spaces, it assumes that each action is assigned a value, and it selects the action with the
highest value with probability 1 - epsilon. Otherwise, it selects an action sampled uniformly out of all the
possible actions. The epsilon value is given by the user and can be given as a schedule.
In evaluation, a different epsilon value can be specified.
For continuous action spaces, it assumes that the mean action is given by the agent. With probability epsilon,
it samples a random action out of the action space bounds. Otherwise, it selects the action according to a
given continuous exploration policy, which is set to AdditiveNoise by default. In evaluation, the action is
always selected according to the given continuous exploration policy (where its phase is set to evaluation as well).
"""
def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule,
evaluation_epsilon: float,
continuous_exploration_policy_parameters: ExplorationParameters=AdditiveNoiseParameters()):
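The discrete-action branch described above boils down to the following selection rule, shown here as an illustrative numpy sketch (not part of this diff):

import numpy as np

def e_greedy_discrete(action_values: np.ndarray, epsilon: float) -> int:
    if np.random.rand() < epsilon:
        return np.random.randint(len(action_values))   # explore: uniform random action
    return int(np.argmax(action_values))               # exploit: highest valued action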

View File

@@ -31,6 +31,10 @@ class ExplorationParameters(Parameters):
class ExplorationPolicy(object):
"""
An exploration policy takes the predicted actions or action values from the agent, and selects the action to
actually apply to the environment using some predefined algorithm.
"""
def __init__(self, action_space: ActionSpace):
"""
:param action_space: the action space used by the environment

View File

@@ -30,6 +30,11 @@ class GreedyParameters(ExplorationParameters):
class Greedy(ExplorationPolicy):
"""
The Greedy exploration policy is intended for both discrete and continuous action spaces.
For discrete action spaces, it always selects the action with the maximum value, as given by the agent.
For continuous action spaces, it always return the exact action, as it was given by the agent.
"""
def __init__(self, action_space: ActionSpace):
"""
:param action_space: the action space used by the environment

View File

@@ -40,6 +40,11 @@ class OUProcessParameters(ExplorationParameters):
# Ornstein-Uhlenbeck process
class OUProcess(ExplorationPolicy):
"""
OUProcess exploration policy is intended for continuous action spaces, and selects the action according to
an Ornstein-Uhlenbeck process. The Ornstein-Uhlenbeck process implements the action as a Gaussian process, where
the samples are correlated between consecutive time steps.
"""
def __init__(self, action_space: ActionSpace, mu: float=0, theta: float=0.15, sigma: float=0.2, dt: float=0.01):
"""
:param action_space: the action space used by the environment

View File

@@ -42,10 +42,18 @@ class ParameterNoiseParameters(ExplorationParameters):
class ParameterNoise(ExplorationPolicy):
"""
The ParameterNoise exploration policy is intended for both discrete and continuous action spaces.
It applies the exploration policy by replacing all the dense network layers with noisy layers.
The noisy layers have both weight means and weight standard deviations, and for each forward pass of the network
the weights are sampled from a normal distribution that follows the learned weight means and standard deviations.
Warning: currently supported only by DQN variants
"""
def __init__(self, network_params: Dict[str, NetworkParameters], action_space: ActionSpace):
"""
:param action_space: the action space used by the environment
:param network_params: the parameters of the agent's networks, whose dense layers will be replaced with noisy layers
"""
super().__init__(action_space)
self.network_params = network_params

View File

@@ -39,6 +39,16 @@ class TruncatedNormalParameters(ExplorationParameters):
class TruncatedNormal(ExplorationPolicy):
"""
The TruncatedNormal exploration policy is intended for continuous action spaces. It samples the action from a
normal distribution, where the mean action is given by the agent, and the standard deviation can be given in
two different ways:
1. Specified by the user as a noise schedule which is taken in percentiles out of the action space size
2. Specified by the agent's action. In case the agent's action is a list with 2 values, the 1st is assumed to
be the mean of the action, and the 2nd is assumed to be its standard deviation.
When the sampled action is outside of the action bounds given by the user, it is sampled again and again, until it
is within the bounds.
"""
def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
evaluation_noise_percentage: float, clip_low: float, clip_high: float):
"""

View File

@@ -43,6 +43,15 @@ class UCBParameters(EGreedyParameters):
class UCB(EGreedy):
"""
UCB exploration policy is following the upper confidence bound heuristic to sample actions in discrete action spaces.
It assumes that there are multiple network heads that are predicting action values, and that the standard deviation
between the heads' predictions represents the uncertainty of the agent in each of the actions.
It then updates the action value estimates to be mean(actions) + lambda * stdev(actions), where lambda is
given by the user. This exploration policy aims to take advantage of the uncertainty of the agent in its predictions,
and selects the action according to the tradeoff between how uncertain the agent is and how large it predicts
the outcome from those actions to be.
"""
def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
architecture_num_q_heads: int, lamb: int,
continuous_exploration_policy_parameters: ExplorationParameters = AdditiveNoiseParameters()):
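The value adjustment described above, as an illustrative numpy sketch (not part of this diff):

import numpy as np

head_predictions = np.random.rand(10, 4)   # e.g. 10 bootstrapped heads x 4 actions
lamb = 0.1                                 # uncertainty bonus weight, given by the user
ucb_values = head_predictions.mean(axis=0) + lamb * head_predictions.std(axis=0)
action = int(np.argmax(ucb_values))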

View File

@@ -0,0 +1,14 @@
from .attention_discretization import AttentionDiscretization
from .box_discretization import BoxDiscretization
from .box_masking import BoxMasking
from .full_discrete_action_space_map import FullDiscreteActionSpaceMap
from .linear_box_to_box_map import LinearBoxToBoxMap
from .partial_discrete_action_space_map import PartialDiscreteActionSpaceMap
__all__ = [
'AttentionDiscretization',
'BoxDiscretization',
'BoxMasking',
'FullDiscreteActionSpaceMap',
'LinearBoxToBoxMap',
'PartialDiscreteActionSpaceMap'
]

View File

@@ -25,11 +25,18 @@ from rl_coach.spaces import AttentionActionSpace, BoxActionSpace, DiscreteAction
class AttentionDiscretization(PartialDiscreteActionSpaceMap):
"""
Given a box action space, this is used to discretize the space.
The discretization is achieved by creating a grid in the space with num_bins_per_dimension bins per dimension in the
space. Each discrete action is mapped to a single sub-box in the BoxActionSpace action space.
Discretizes an **AttentionActionSpace**. The attention action space defines the actions
as choosing sub-boxes in a given box. For example, consider an image of size 100x100, where the action is choosing
a crop window of size 20x20 to attend to in the image. AttentionDiscretization allows discretizing the possible crop
windows to choose into a finite number of options, and map a discrete action space into those crop windows.
Warning! This will currently only work for attention spaces with 2 dimensions.
"""
def __init__(self, num_bins_per_dimension: Union[int, List[int]], force_int_bins=False):
"""
:param num_bins_per_dimension: Number of discrete bins to use for each dimension of the action space
:param force_int_bins: If set to True, all the bins will represent integer coordinates in space.
"""
# we allow specifying either a single number for all dimensions, or a single number per dimension in the target
# action space
self.num_bins_per_dimension = num_bins_per_dimension

View File

@@ -25,9 +25,12 @@ from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace
class BoxDiscretization(PartialDiscreteActionSpaceMap):
"""
Given a box action space, this is used to discretize the space.
The discretization is achieved by creating a grid in the space with num_bins_per_dimension bins per dimension in the
space. Each discrete action is mapped to a single N dimensional action in the BoxActionSpace action space.
Discretizes a continuous action space into a discrete action space, allowing the usage of
agents such as DQN for continuous environments such as MuJoCo. Given the number of bins to discretize into, the
original continuous action space is uniformly separated into the given number of bins, each mapped to a discrete
action index. Each discrete action is mapped to a single N dimensional action in the BoxActionSpace action space.
For example, if the original actions space is between -1 and 1 and 5 bins were selected, the new action
space will consist of 5 actions mapped to -1, -0.5, 0, 0.5 and 1.
"""
def __init__(self, num_bins_per_dimension: Union[int, List[int]], force_int_bins=False):
"""

View File

@@ -25,12 +25,10 @@ from rl_coach.spaces import BoxActionSpace
class BoxMasking(ActionFilter):
"""
Masks a box action space by allowing only selecting a subset of the space
For example,
- the target action space has actions of shape 1 with values between 10 and 32
- we mask the target action space so that only the action 20 to 25 can be chosen
The actions will be between 0 to 5 and the mapping will add an offset of 20 to the incoming actions
The shape of the source and target action spaces is always the same
Masks part of the action space to enforce the agent to work in a defined space. For example,
if the original action space is between -1 and 1, then this filter can be used in order to constrain the agent actions
to the range 0 and 1 instead. This essentially masks the range -1 and 0 from the agent.
The resulting action space will be shifted and will always start from 0 and have the size of the unmasked area.
"""
def __init__(self,
masked_target_space_low: Union[None, int, float, np.ndarray],

View File

@@ -20,7 +20,9 @@ from rl_coach.spaces import ActionSpace, DiscreteActionSpace
class FullDiscreteActionSpaceMap(PartialDiscreteActionSpaceMap):
"""
Maps all the actions in the output space to discrete actions in the action space.
Full map of two countable action spaces. This works in a similar way to the
PartialDiscreteActionSpaceMap, but maps the entire source action space into the entire target action space, without
masking any actions.
For example, if there are 10 multiselect actions in the output space, the actions 0-9 will be mapped to those
multiselect actions.
"""

View File

@@ -25,17 +25,19 @@ from rl_coach.spaces import BoxActionSpace
class LinearBoxToBoxMap(ActionFilter):
"""
Maps a box action space to a box action space.
For example,
- the source action space has actions of shape 1 with values between -42 and -10,
- the target action space has actions of shape 1 with values between 10 and 32
The mapping will add an offset of 52 to the incoming actions and then multiply them by 22/32 to scale them to the
target action space
The shape of the source and target action spaces is always the same
A linear mapping of two box action spaces. For example, if the action space of the
environment consists of continuous actions between 0 and 1, and we want the agent to choose actions between -1 and 1,
the LinearBoxToBoxMap can be used to map the range -1 and 1 to the range 0 and 1 in a linear way. This means that the
action -1 will be mapped to 0, the action 1 will be mapped to 1, and the rest of the actions will be linearly mapped
between those values.
"""
def __init__(self,
input_space_low: Union[None, int, float, np.ndarray],
input_space_high: Union[None, int, float, np.ndarray]):
"""
:param input_space_low: the low values of the desired action space
:param input_space_high: the high values of the desired action space
"""
self.input_space_low = input_space_low
self.input_space_high = input_space_high
self.rescale = None
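The linear mapping described above amounts to a simple affine rescaling; an illustrative sketch (not part of this diff) for a 1D action:

def linear_map(action, in_low=-1.0, in_high=1.0, out_low=0.0, out_high=1.0):
    # rescale an action from [in_low, in_high] to [out_low, out_high]
    scale = (out_high - out_low) / (in_high - in_low)
    return out_low + (action - in_low) * scale

assert linear_map(-1.0) == 0.0 and linear_map(0.0) == 0.5 and linear_map(1.0) == 1.0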

View File

@@ -23,11 +23,17 @@ from rl_coach.spaces import DiscreteActionSpace, ActionSpace
class PartialDiscreteActionSpaceMap(ActionFilter):
"""
Maps the given actions from the output space to discrete actions in the action space.
For example, if there are 10 multiselect actions in the output space, the actions 0-9 will be mapped to those
multiselect actions.
Partial map of two countable action spaces. For example, consider an environment
with a MultiSelect action space (select multiple actions at the same time, such as jump and go right), with 8 actual
MultiSelect actions. If we want the agent to be able to select only 5 of those actions by their index (0-4), we can
map a discrete action space with 5 actions into the 5 selected MultiSelect actions. This will both allow the agent to
use regular discrete actions, and mask 3 of the actions from the agent.
"""
def __init__(self, target_actions: List[ActionType]=None, descriptions: List[str]=None):
"""
:param target_actions: A partial list of actions from the target space to map to.
:param descriptions: a list of descriptions of each of the actions
"""
self.target_actions = target_actions
self.descriptions = descriptions
super().__init__()

View File

@@ -0,0 +1,25 @@
from .observation_clipping_filter import ObservationClippingFilter
from .observation_crop_filter import ObservationCropFilter
from .observation_move_axis_filter import ObservationMoveAxisFilter
from .observation_normalization_filter import ObservationNormalizationFilter
from .observation_reduction_by_sub_parts_name_filter import ObservationReductionBySubPartsNameFilter
from .observation_rescale_size_by_factor_filter import ObservationRescaleSizeByFactorFilter
from .observation_rescale_to_size_filter import ObservationRescaleToSizeFilter
from .observation_rgb_to_y_filter import ObservationRGBToYFilter
from .observation_squeeze_filter import ObservationSqueezeFilter
from .observation_stacking_filter import ObservationStackingFilter
from .observation_to_uint8_filter import ObservationToUInt8Filter
__all__ = [
'ObservationClippingFilter',
'ObservationCropFilter',
'ObservationMoveAxisFilter',
'ObservationNormalizationFilter',
'ObservationReductionBySubPartsNameFilter',
'ObservationRescaleSizeByFactorFilter',
'ObservationRescaleToSizeFilter',
'ObservationRGBToYFilter',
'ObservationSqueezeFilter',
'ObservationStackingFilter',
'ObservationToUInt8Filter'
]
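As an illustration of how these observation filters are composed (a sketch, not part of this diff, modeled on an Atari-style input filter and assuming the InputFilter.add_observation_filter(input_name, filter_name, filter) API):

import numpy as np
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.observation import ObservationRescaleToSizeFilter, ObservationRGBToYFilter, \
    ObservationToUInt8Filter, ObservationStackingFilter
from rl_coach.spaces import ImageObservationSpace

input_filter = InputFilter()
input_filter.add_observation_filter('observation', 'rescaling',
                                    ObservationRescaleToSizeFilter(ImageObservationSpace(np.array([84, 84, 3]),
                                                                                         high=255)))
input_filter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
input_filter.add_observation_filter('observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
input_filter.add_observation_filter('observation', 'stacking', ObservationStackingFilter(4))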

View File

@@ -24,7 +24,10 @@ from rl_coach.spaces import ObservationSpace
class ObservationClippingFilter(ObservationFilter):
"""
Clip the observation values using the given ranges
Clips the observation values to a given range of values.
For example, if the observation consists of measurements in an arbitrary range,
and we want to control the minimum and maximum values of these observations,
we can define a range and clip the values of the measurements.
"""
def __init__(self, clipping_low: float=-np.inf, clipping_high: float=np.inf):
"""

View File

@@ -24,7 +24,9 @@ from rl_coach.spaces import ObservationSpace
class ObservationCropFilter(ObservationFilter):
"""
Crops the current state observation to a given shape
Crops the size of the observation to a given crop window. For example, in Atari, the
observations are images with a shape of 210x160. Usually, we will want to crop the size of the observation to a
square of 160x160 before rescaling them.
"""
def __init__(self, crop_low: np.ndarray=None, crop_high: np.ndarray=None):
"""

View File

@@ -23,9 +23,14 @@ from rl_coach.spaces import ObservationSpace, PlanarMapsObservationSpace
class ObservationMoveAxisFilter(ObservationFilter):
"""
Move an axis of the observation to a different place.
Reorders the axes of the observation. This can be useful when the observation is an
image, and we want to move the channel axis to be the last axis instead of the first axis.
"""
def __init__(self, axis_origin: int = None, axis_target: int=None):
"""
:param axis_origin: The axis to move
:param axis_target: Where to move the selected axis to
"""
super().__init__()
self.axis_origin = axis_origin
self.axis_target = axis_target

View File

@@ -25,8 +25,9 @@ from rl_coach.spaces import ObservationSpace
class ObservationNormalizationFilter(ObservationFilter):
"""
Normalize the observation with a running standard deviation and mean of the observations seen so far
If there is more than a single worker, the statistics of the observations are shared between all the workers
Normalizes the observation values with a running mean and standard deviation of
all the observations seen so far. The normalization is performed element-wise. Additionally, when working with
multiple workers, the statistics used for the normalization operation are accumulated over all the workers.
"""
def __init__(self, clip_min: float=-5.0, clip_max: float=5.0, name='observation_stats'):
"""

View File

@@ -26,9 +26,11 @@ from rl_coach.spaces import ObservationSpace, VectorObservationSpace
class ObservationReductionBySubPartsNameFilter(ObservationFilter):
"""
Choose sub parts of the observation to remove or keep using their name.
This is useful when the environment has a measurements vector as observation which includes several different
Allows keeping only parts of the observation, by specifying their
name. This is useful when the environment has a measurements vector as observation which includes several different
measurements, but you want the agent to only see some of the measurements and not all.
For example, the CARLA environment extracts multiple measurements that can be used by the agent, such as
speed and location. If we want to only use the speed, it can be done using this filter.
This will currently work only for VectorObservationSpace observations
"""
class ReductionMethod(Enum):

View File

@@ -35,7 +35,8 @@ class RescaleInterpolationType(Enum):
class ObservationRescaleSizeByFactorFilter(ObservationFilter):
"""
Scales the current state observation size by a given factor
Rescales an image observation by some factor. For example, the image size
can be reduced by a factor of 2.
Warning: this requires the input observation to be of type uint8 due to scipy requirements!
"""
def __init__(self, rescale_factor: float, rescaling_interpolation_type: RescaleInterpolationType):

View File

@@ -37,7 +37,8 @@ class RescaleInterpolationType(Enum):
class ObservationRescaleToSizeFilter(ObservationFilter):
"""
Scales the current state observation to a given shape
Rescales an image observation to a given size. The target size does not
necessarily keep the aspect ratio of the original observation.
Warning: this requires the input observation to be of type uint8 due to scipy requirements!
"""
def __init__(self, output_observation_space: PlanarMapsObservationSpace,

View File

@@ -21,7 +21,9 @@ from rl_coach.spaces import ObservationSpace
class ObservationRGBToYFilter(ObservationFilter):
"""
Converts the observation in the current state to gray scale (Y channel).
Converts a color image observation specified using the RGB encoding into a grayscale
image observation, by keeping only the luminance (Y) channel of the YUV encoding. This can be useful if the colors
in the original image are not relevant for solving the task at hand.
The channels axis is assumed to be the last axis
"""
def __init__(self):

View File

@@ -23,9 +23,12 @@ from rl_coach.spaces import ObservationSpace
class ObservationSqueezeFilter(ObservationFilter):
"""
Squeezes the observation so to eliminate redundant axes.
Removes redundant axes from the observation, which are axes with a dimension of 1.
"""
def __init__(self, axis: int = None):
"""
:param axis: Specifies which axis to remove. If set to None, all the axes of size 1 will be removed.
"""
super().__init__()
self.axis = axis

View File

@@ -43,7 +43,10 @@ class LazyStack(object):
class ObservationStackingFilter(ObservationFilter):
"""
Stack the current state observation on top of several previous observations.
Stacks several observations on top of each other. For image observation this will
create a 3D blob. The stacking is done in a lazy manner in order to reduce memory consumption. To achieve this,
a LazyStack object is used in order to wrap the observations in the stack. For this reason, the
ObservationStackingFilter **must** be the last filter in the inputs filters stack.
This filter is stateful since it stores the previous step result and depends on it.
The filter adds an additional dimension to the output observation.

View File

@@ -23,10 +23,15 @@ from rl_coach.spaces import ObservationSpace
class ObservationToUInt8Filter(ObservationFilter):
"""
Converts the observation values to be uint8 values between 0 and 255.
It first scales the observation values to fit in the range and then converts them to uint8.
Converts a floating point observation into an unsigned int 8 bit observation. This is
mostly useful for reducing memory consumption and is usually used for image observations. The filter will first
spread the observation values over the range 0-255 and then discretize them into integer values.
"""
def __init__(self, input_low: float, input_high: float):
"""
:param input_low: The lowest value currently present in the observation
:param input_high: The highest value currently present in the observation
"""
super().__init__()
self.input_low = input_low
self.input_high = input_high
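The rescale-then-cast operation described above, as an illustrative sketch (not part of this diff), for an observation known to lie in [input_low, input_high]:

import numpy as np

def to_uint8(observation: np.ndarray, input_low: float, input_high: float) -> np.ndarray:
    # spread the values over 0-255, then discretize them into unsigned 8 bit integers
    rescaled = (observation - input_low) / (input_high - input_low) * 255.0
    return rescaled.astype('uint8')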

View File

@@ -0,0 +1,8 @@
from .reward_rescale_filter import RewardRescaleFilter
from .reward_clipping_filter import RewardClippingFilter
from .reward_normalization_filter import RewardNormalizationFilter
__all__ = [
'RewardRescaleFilter',
'RewardClippingFilter',
'RewardNormalizationFilter'
]

View File

@@ -23,7 +23,8 @@ from rl_coach.spaces import RewardSpace
class RewardClippingFilter(RewardFilter):
"""
Clips the reward to some range
Clips the reward values into a given range. For example, in DQN, the Atari rewards are
clipped to the range [-1, 1] in order to control the scale of the returns.
"""
def __init__(self, clipping_low: float=-np.inf, clipping_high: float=np.inf):
"""

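The clipping itself is just a numpy clip; a sketch with the DQN-style [-1, 1] range mentioned above:

import numpy as np

def clip_reward(reward: float, clipping_low: float = -1.0, clipping_high: float = 1.0) -> float:
    """Clip a raw environment reward into [clipping_low, clipping_high]."""
    return float(np.clip(reward, clipping_low, clipping_high))

print(clip_reward(400.0))   # 1.0  - a large Atari score increment collapses to 1
print(clip_reward(-25.0))   # -1.0
print(clip_reward(0.5))     # 0.5
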
View File

@@ -25,8 +25,9 @@ from rl_coach.spaces import RewardSpace
class RewardNormalizationFilter(RewardFilter):
"""
Normalize the reward with a running standard deviation and mean of the rewards seen so far
If there is more than a single worker, the statistics of the rewards are shared between all the workers
Normalizes the reward values with a running mean and standard deviation of
all the rewards seen so far. When working with multiple workers, the statistics used for the normalization operation
are accumulated over all the workers.
"""
def __init__(self, clip_min: float=-5.0, clip_max: float=5.0):
"""

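A single-worker sketch of the running normalization; Coach additionally shares the running statistics across workers, which is omitted here.

import numpy as np

class RunningRewardNormalizer:
    """Illustrative single-worker running mean/std normalizer with clipping."""
    def __init__(self, clip_min: float = -5.0, clip_max: float = 5.0):
        self.count, self.mean, self.m2 = 0, 0.0, 0.0
        self.clip_min, self.clip_max = clip_min, clip_max

    def normalize(self, reward: float) -> float:
        # Welford's online update of the running mean and variance
        self.count += 1
        delta = reward - self.mean
        self.mean += delta / self.count
        self.m2 += delta * (reward - self.mean)
        std = np.sqrt(self.m2 / self.count) if self.count > 1 else 1.0
        return float(np.clip((reward - self.mean) / (std + 1e-8), self.clip_min, self.clip_max))

normalizer = RunningRewardNormalizer()
print([round(normalizer.normalize(r), 3) for r in [1.0, 10.0, -3.0, 0.5]])
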
View File

@@ -21,7 +21,8 @@ from rl_coach.spaces import RewardSpace
class RewardRescaleFilter(RewardFilter):
"""
Rescales the reward by multiplying with some factor
Rescales the reward by a given factor. Rescaling the rewards of the environment has been
observed to have a large effect (negative or positive) on the behavior of the learning process.
"""
def __init__(self, rescale_factor: float):
"""

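All three reward filters above are constructed with the signatures shown in this diff. A hedged sketch of putting them together, assuming rl_coach is installed and that the filters can be constructed standalone; the dotted import path follows the package layout of this commit and is assumed, and wiring the filters into an agent goes through Coach's input filter machinery, which is not shown here.

# Assumed import path, based on the reward filters package __init__ in this commit.
from rl_coach.filters.reward import (RewardClippingFilter,
                                     RewardNormalizationFilter,
                                     RewardRescaleFilter)

# Shrink the raw rewards by a factor of 100 (the original docstring describes the
# rescaling as a multiplication), then clip and normalize the result.
rescale = RewardRescaleFilter(rescale_factor=1 / 100.)
clip = RewardClippingFilter(clipping_low=-1.0, clipping_high=1.0)
normalize = RewardNormalizationFilter(clip_min=-5.0, clip_max=5.0)
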
View File

@@ -504,6 +504,8 @@ class GraphManager(object):
:return: None
"""
self.verify_graph_was_created()
# initialize the network parameters from the global network
self.sync()

View File

@@ -0,0 +1,14 @@
from .episodic_experience_replay import EpisodicExperienceReplayParameters, EpisodicExperienceReplay
from .episodic_hindsight_experience_replay import EpisodicHindsightExperienceReplayParameters, EpisodicHindsightExperienceReplay
from .episodic_hrl_hindsight_experience_replay import EpisodicHRLHindsightExperienceReplayParameters, EpisodicHRLHindsightExperienceReplay
from .single_episode_buffer import SingleEpisodeBufferParameters, SingleEpisodeBuffer
__all__ = [
'EpisodicExperienceReplayParameters',
'EpisodicHindsightExperienceReplayParameters',
'EpisodicHRLHindsightExperienceReplayParameters',
'SingleEpisodeBufferParameters',
'EpisodicExperienceReplay',
'EpisodicHindsightExperienceReplay',
'EpisodicHRLHindsightExperienceReplay',
'SingleEpisodeBuffer'
]

View File

@@ -0,0 +1,13 @@
from .balanced_experience_replay import BalancedExperienceReplayParameters, BalancedExperienceReplay
from .differentiable_neural_dictionary import QDND
from .experience_replay import ExperienceReplayParameters, ExperienceReplay
from .prioritized_experience_replay import PrioritizedExperienceReplayParameters, PrioritizedExperienceReplay
from .transition_collection import TransitionCollection
__all__ = [
'BalancedExperienceReplayParameters',
'BalancedExperienceReplay',
'QDND',
'ExperienceReplay',
'PrioritizedExperienceReplay',
'TransitionCollection'
]

View File

@@ -120,6 +120,7 @@ class Space(object):
def val_matches_space_definition(self, val: Union[int, float, np.ndarray]) -> bool:
"""
Checks if the given value matches the space definition in terms of shape and values
:param val: a value to check
:return: True / False depending on if the val matches the space definition
"""
@@ -136,6 +137,7 @@ class Space(object):
def is_point_in_space_shape(self, point: np.ndarray) -> bool:
"""
Checks if a given multidimensional point is within the bounds of the shape of the space
:param point: a multidimensional point
:return: True if the point is within the shape of the space. False otherwise
"""
@@ -146,6 +148,12 @@ class Space(object):
return True
def sample(self) -> np.ndarray:
"""
Sample the defined space: uniformly, if the space bounds are finite, or from a standard normal distribution
if any of the bounds are infinite
:return: A numpy array sampled from the space
"""
# if there are infinite bounds, we sample using gaussian noise with mean 0 and std 1
if np.any(self.low == -np.inf) or np.any(self.high == np.inf):
return np.random.normal(0, 1, self.shape)
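
Pulling the two branches of the docstring together, the sampling rule can be sketched as a standalone function (a simplified stand-in for Space.sample, not the class method itself):

import numpy as np

def sample_space(low: np.ndarray, high: np.ndarray, shape: tuple) -> np.ndarray:
    """Uniform sample when the bounds are finite, standard normal noise otherwise."""
    if np.any(low == -np.inf) or np.any(high == np.inf):
        return np.random.normal(0, 1, shape)
    return np.random.uniform(low, high, shape)

print(sample_space(np.array([-1.0, 0.0]), np.array([1.0, 2.0]), (2,)))   # uniform in the box
print(sample_space(np.array([-np.inf]), np.array([np.inf]), (1,)))       # unbounded -> gaussian
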
@@ -173,6 +181,10 @@ class ObservationSpace(Space):
class VectorObservationSpace(ObservationSpace):
"""
An observation space which is defined as a vector of elements. This can be particularly useful for environments
which return measurements, such as robotic environments.
"""
def __init__(self, shape: int, low: Union[None, int, float, np.ndarray]=-np.inf,
high: Union[None, int, float, np.ndarray]=np.inf, measurements_names: List[str]=None):
if measurements_names is None:
@@ -186,6 +198,10 @@ class VectorObservationSpace(ObservationSpace):
class PlanarMapsObservationSpace(ObservationSpace):
"""
An observation space which defines a stack of 2D observations. For example, an environment which returns
a stack of segmentation maps like in Starcraft.
"""
def __init__(self, shape: Union[np.ndarray], low: int, high: int, channels_axis: int=-1):
super().__init__(shape, low, high)
self.channels_axis = channels_axis
@@ -200,6 +216,10 @@ class PlanarMapsObservationSpace(ObservationSpace):
class ImageObservationSpace(PlanarMapsObservationSpace):
"""
An observation space which is a special case of the PlanarMapsObservationSpace, where the stack of 2D observations
represents an RGB image or a grayscale image.
"""
def __init__(self, shape: Union[np.ndarray], high: int, channels_axis: int=-1):
# TODO: consider allowing arbitrary low values for images
super().__init__(shape, 0, high, channels_axis)
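
Using the constructors shown above (assuming rl_coach is installed; the argument values are purely illustrative):

import numpy as np

from rl_coach.spaces import (ImageObservationSpace, PlanarMapsObservationSpace,
                             VectorObservationSpace)

# A 3-element measurement vector, e.g. joint positions of a robot.
measurements = VectorObservationSpace(3, measurements_names=['x', 'y', 'z'])

# A stack of four 64x64 planar maps, e.g. segmentation layers, with values in [0, 255].
planar_maps = PlanarMapsObservationSpace(np.array([64, 64, 4]), low=0, high=255)

# A 210x160 RGB image with pixel values up to 255 (the low bound is fixed at 0).
image = ImageObservationSpace(np.array([210, 160, 3]), high=255)
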
@@ -245,6 +265,7 @@ class ActionSpace(Space):
def sample_with_info(self) -> ActionInfo:
"""
Get a random action with additional "fake" info
:return: An action info instance
"""
return ActionInfo(self.sample())
@@ -252,6 +273,7 @@ class ActionSpace(Space):
def clip_action_to_space(self, action: ActionType) -> ActionType:
"""
Given an action, clip its values to fit to the action space ranges
:param action: a given action
:return: the clipped action
"""
@@ -460,6 +482,7 @@ class GoalToRewardConversion(object):
def convert_distance_to_reward(self, distance: Union[float, np.ndarray]) -> Tuple[float, bool]:
"""
Given a distance from the goal, return a reward and a flag representing if the goal was reached
:param distance: the distance from the goal
:return: the reward and a flag representing if the goal was reached
"""
@@ -543,6 +566,7 @@ class GoalsSpace(VectorObservationSpace, ActionSpace):
def goal_from_state(self, state: Dict):
"""
Given a state, extract an observation according to the goal_name
:param state: a dictionary of observations
:return: the observation corresponding to the goal_name
"""
@@ -551,6 +575,7 @@ class GoalsSpace(VectorObservationSpace, ActionSpace):
def distance_from_goal(self, goal: np.ndarray, state: dict) -> float:
"""
Given a state, check its distance from the goal
:param goal: a numpy array representing the goal
:param state: a dict representing the state
:return: the distance from the goal
@@ -574,6 +599,7 @@ class GoalsSpace(VectorObservationSpace, ActionSpace):
def get_reward_for_goal_and_state(self, goal: np.ndarray, state: dict) -> Tuple[float, bool]:
"""
Given a state, check if the goal was reached and return a reward accordingly
:param goal: a numpy array representing the goal
:param state: a dict representing the state
:return: the reward for the current goal and state pair and a boolean representing if the goal was reached
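
The goal-related methods above combine into a simple pattern: extract the goal observation from the state, measure its distance from the desired goal, and translate that distance into a reward plus a "goal reached" flag. A generic, self-contained sketch of that pattern (not Coach's GoalsSpace API; the observation key and the sparse 0/-1 reward scheme are illustrative assumptions):

import numpy as np

def reward_for_goal_and_state(goal: np.ndarray, state: dict, goal_name: str,
                              distance_threshold: float = 0.05) -> tuple:
    """Sparse reward: 0 when the goal observation is close enough to the goal, -1 otherwise."""
    achieved = state[goal_name]                       # goal_from_state: pick the observation
    distance = np.linalg.norm(achieved - goal)        # distance_from_goal: euclidean distance
    goal_reached = distance < distance_threshold      # convert_distance_to_reward
    reward = 0.0 if goal_reached else -1.0
    return reward, goal_reached

state = {'achieved_goal': np.array([0.1, 0.2, 0.3])}
goal = np.array([0.1, 0.2, 0.31])
print(reward_for_goal_and_state(goal, state, goal_name='achieved_goal'))  # (0.0, True)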