diff --git a/README.md b/README.md index 12b1e18..1750d02 100644 --- a/README.md +++ b/README.md @@ -253,7 +253,7 @@ dashboard ## Supported Algorithms -Coach Design +Coach Design diff --git a/benchmarks/td3/README.md b/benchmarks/td3/README.md new file mode 100644 index 0000000..aba43e3 --- /dev/null +++ b/benchmarks/td3/README.md @@ -0,0 +1,48 @@ +# Twin Delayed DDPG + +Each experiment uses 5 seeds and is trained for 1M environment steps. +The parameters used for TD3 are the same parameters as described in the [original paper](https://arxiv.org/pdf/1802.09477.pdf), and [repository](https://github.com/sfujim/TD3). + +### Ant TD3 - single worker + +```bash +coach -p Mujoco_TD3 -lvl ant +``` + +Ant TD3 + + +### Hopper TD3 - single worker + +```bash +coach -p Mujoco_TD3 -lvl hopper +``` + +Hopper TD3 + + +### Half Cheetah TD3 - single worker + +```bash +coach -p Mujoco_TD3 -lvl half_cheetah +``` + +Half Cheetah TD3 + + +### Reacher TD3 - single worker + +```bash +coach -p Mujoco_TD3 -lvl reacher +``` + +Reacher TD3 + + +### Walker2D TD3 - single worker + +```bash +coach -p Mujoco_TD3 -lvl walker2d +``` + +Walker2D TD3 diff --git a/benchmarks/td3/ant.png b/benchmarks/td3/ant.png new file mode 100644 index 0000000..159aeff Binary files /dev/null and b/benchmarks/td3/ant.png differ diff --git a/benchmarks/td3/half_cheetah.png b/benchmarks/td3/half_cheetah.png new file mode 100644 index 0000000..c951d0b Binary files /dev/null and b/benchmarks/td3/half_cheetah.png differ diff --git a/benchmarks/td3/hopper.png b/benchmarks/td3/hopper.png new file mode 100644 index 0000000..8ab2c32 Binary files /dev/null and b/benchmarks/td3/hopper.png differ diff --git a/benchmarks/td3/reacher.png b/benchmarks/td3/reacher.png new file mode 100644 index 0000000..65a23c3 Binary files /dev/null and b/benchmarks/td3/reacher.png differ diff --git a/benchmarks/td3/walker2d.png b/benchmarks/td3/walker2d.png new file mode 100644 index 0000000..117d8fa Binary files /dev/null and b/benchmarks/td3/walker2d.png differ diff --git a/docs/_images/algorithms.png b/docs/_images/algorithms.png index b3310c0..6c00f21 100644 Binary files a/docs/_images/algorithms.png and b/docs/_images/algorithms.png differ diff --git a/docs/_images/td3.png b/docs/_images/td3.png new file mode 100644 index 0000000..fc28eb4 Binary files /dev/null and b/docs/_images/td3.png differ diff --git a/docs/_modules/index.html b/docs/_modules/index.html index 993ed48..b774d7e 100644 --- a/docs/_modules/index.html +++ b/docs/_modules/index.html @@ -200,6 +200,7 @@
  • rl_coach.agents.qr_dqn_agent
  • rl_coach.agents.rainbow_dqn_agent
  • rl_coach.agents.soft_actor_critic_agent
  • + rl_coach.agents.td3_agent
  • rl_coach.agents.value_optimization_agent
  • rl_coach.architectures.architecture
  • rl_coach.architectures.network_wrapper
  • diff --git a/docs/_modules/rl_coach/agents/agent.html b/docs/_modules/rl_coach/agents/agent.html index a7c44f7..49d4a8a 100644 --- a/docs/_modules/rl_coach/agents/agent.html +++ b/docs/_modules/rl_coach/agents/agent.html @@ -278,19 +278,6 @@ if self.ap.memory.memory_backend_params.run_type != 'trainer': self.memory.set_memory_backend(self.memory_backend) - if agent_parameters.memory.load_memory_from_file_path: - if isinstance(agent_parameters.memory.load_memory_from_file_path, PickledReplayBuffer): - screen.log_title("Loading a pickled replay buffer. Pickled file path: {}" - .format(agent_parameters.memory.load_memory_from_file_path.filepath)) - self.memory.load_pickled(agent_parameters.memory.load_memory_from_file_path.filepath) - elif isinstance(agent_parameters.memory.load_memory_from_file_path, CsvDataset): - screen.log_title("Loading a replay buffer from a CSV file. CSV file path: {}" - .format(agent_parameters.memory.load_memory_from_file_path.filepath)) - self.memory.load_csv(agent_parameters.memory.load_memory_from_file_path) - else: - raise ValueError('Trying to load a replay buffer using an unsupported method - {}. ' - .format(agent_parameters.memory.load_memory_from_file_path)) - if self.shared_memory and self.is_chief: self.shared_memory_scratchpad.add(self.memory_lookup_name, self.memory) @@ -444,7 +431,39 @@ self.input_filter.set_session(sess) self.output_filter.set_session(sess) self.pre_network_filter.set_session(sess) - [network.set_session(sess) for network in self.networks.values()] + [network.set_session(sess) for network in self.networks.values()] + self.initialize_session_dependent_components() + +
    [docs] def initialize_session_dependent_components(self): + """ + Initialize components which require a session as part of their initialization. + + :return: None + """ + + # Loading a memory from a CSV file, requires an input filter to filter through the data. + # The filter needs a session before it can be used. + if self.ap.memory.load_memory_from_file_path: + self.load_memory_from_file()
    + +
    [docs] def load_memory_from_file(self): + """ + Load memory transitions from a file. + + :return: None + """ + + if isinstance(self.ap.memory.load_memory_from_file_path, PickledReplayBuffer): + screen.log_title("Loading a pickled replay buffer. Pickled file path: {}" + .format(self.ap.memory.load_memory_from_file_path.filepath)) + self.memory.load_pickled(self.ap.memory.load_memory_from_file_path.filepath) + elif isinstance(self.ap.memory.load_memory_from_file_path, CsvDataset): + screen.log_title("Loading a replay buffer from a CSV file. CSV file path: {}" + .format(self.ap.memory.load_memory_from_file_path.filepath)) + self.memory.load_csv(self.ap.memory.load_memory_from_file_path, self.input_filter) + else: + raise ValueError('Trying to load a replay buffer using an unsupported method - {}. ' + .format(self.ap.memory.load_memory_from_file_path))
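A minimal sketch of how this hook is typically driven, assuming `CsvDataset` is importable from `rl_coach.core_types` and using a made-up file path; only `load_memory_from_file_path` itself comes from the change above:

```python
# Point an agent's memory at a stored CSV dataset; once a session exists,
# initialize_session_dependent_components() -> load_memory_from_file() restores it,
# passing the data through the agent's input filter.
from rl_coach.agents.dqn_agent import DQNAgentParameters
from rl_coach.core_types import CsvDataset

agent_params = DQNAgentParameters()
agent_params.memory.load_memory_from_file_path = CsvDataset('experience.csv',  # hypothetical path
                                                             is_episodic=True)
```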
    [docs] def register_signal(self, signal_name: str, dump_one_value_per_episode: bool=True, dump_one_value_per_step: bool=False) -> Signal: @@ -868,7 +887,10 @@ """ loss = 0 if self._should_train(): - self.training_epoch += 1 + if self.ap.is_batch_rl_training: + # when training an agent for generating a dataset in batch-rl, we don't want it to be counted as part of + # the training epochs. we only care for training epochs in batch-rl anyway. + self.training_epoch += 1 for network in self.networks.values(): network.set_is_training(True) @@ -1229,7 +1251,15 @@ TimeTypes.TrainingIteration: self.training_iteration, TimeTypes.EnvironmentSteps: self.total_steps_counter, TimeTypes.WallClockTime: self.agent_logger.get_current_wall_clock_time(), - TimeTypes.Epoch: self.training_epoch}[self.parent_level_manager.parent_graph_manager.time_metric]
    + TimeTypes.Epoch: self.training_epoch}[self.parent_level_manager.parent_graph_manager.time_metric] + +
    [docs] def freeze_memory(self): + """ + Shuffle episodes in the memory and freeze it to make sure that no extra data is being pushed anymore. + :return: None + """ + self.call_memory('shuffle_episodes') + self.call_memory('freeze')
    diff --git a/docs/_modules/rl_coach/agents/categorical_dqn_agent.html b/docs/_modules/rl_coach/agents/categorical_dqn_agent.html index 371b630..6262571 100644 --- a/docs/_modules/rl_coach/agents/categorical_dqn_agent.html +++ b/docs/_modules/rl_coach/agents/categorical_dqn_agent.html @@ -196,7 +196,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # - from typing import Union import numpy as np @@ -266,13 +265,22 @@ # prediction's format is (batch,actions,atoms) def get_all_q_values_for_states(self, states: StateType): + q_values = None if self.exploration_policy.requires_action_values(): q_values = self.get_prediction(states, outputs=[self.networks['main'].online_network.output_heads[0].q_values]) - else: - q_values = None + return q_values + def get_all_q_values_for_states_and_softmax_probabilities(self, states: StateType): + actions_q_values, softmax_probabilities = None, None + if self.exploration_policy.requires_action_values(): + outputs = [self.networks['main'].online_network.output_heads[0].q_values, + self.networks['main'].online_network.output_heads[0].softmax] + actions_q_values, softmax_probabilities = self.get_prediction(states, outputs=outputs) + + return actions_q_values, softmax_probabilities + def learn_from_batch(self, batch): network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys() diff --git a/docs/_modules/rl_coach/agents/ddpg_agent.html b/docs/_modules/rl_coach/agents/ddpg_agent.html index c4f194e..cbed0ab 100644 --- a/docs/_modules/rl_coach/agents/ddpg_agent.html +++ b/docs/_modules/rl_coach/agents/ddpg_agent.html @@ -206,7 +206,7 @@ from rl_coach.agents.actor_critic_agent import ActorCriticAgent from rl_coach.agents.agent import Agent from rl_coach.architectures.embedder_parameters import InputEmbedderParameters -from rl_coach.architectures.head_parameters import DDPGActorHeadParameters, VHeadParameters +from rl_coach.architectures.head_parameters import DDPGActorHeadParameters, DDPGVHeadParameters from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters from rl_coach.base_parameters import NetworkParameters, AlgorithmParameters, \ AgentParameters, EmbedderScheme @@ -222,14 +222,17 @@ self.input_embedders_parameters = {'observation': InputEmbedderParameters(batchnorm=True), 'action': InputEmbedderParameters(scheme=EmbedderScheme.Shallow)} self.middleware_parameters = FCMiddlewareParameters() - self.heads_parameters = [VHeadParameters()] + self.heads_parameters = [DDPGVHeadParameters()] self.optimizer_type = 'Adam' self.batch_size = 64 self.async_training = False self.learning_rate = 0.001 + self.adam_optimizer_beta2 = 0.999 + self.optimizer_epsilon = 1e-8 self.create_target_network = True self.shared_optimizer = True self.scale_down_gradients_by_number_of_workers_for_sync_training = False + # self.l2_regularization = 1e-2 class DDPGActorNetworkParameters(NetworkParameters): @@ -240,6 +243,8 @@ self.heads_parameters = [DDPGActorHeadParameters()] self.optimizer_type = 'Adam' self.batch_size = 64 + self.adam_optimizer_beta2 = 0.999 + self.optimizer_epsilon = 1e-8 self.async_training = False self.learning_rate = 0.0001 self.create_target_network = True @@ -323,7 +328,7 @@ critic_inputs = copy.copy(batch.next_states(critic_keys)) critic_inputs['action'] = next_actions - q_st_plus_1 = critic.target_network.predict(critic_inputs) + q_st_plus_1 = critic.target_network.predict(critic_inputs)[0] # calculate the bootstrapped TD targets while discounting terminal states 
according to # use_non_zero_discount_for_terminal_states @@ -343,7 +348,7 @@ critic_inputs = copy.copy(batch.states(critic_keys)) critic_inputs['action'] = actions_mean action_gradients = critic.online_network.predict(critic_inputs, - outputs=critic.online_network.gradients_wrt_inputs[0]['action']) + outputs=critic.online_network.gradients_wrt_inputs[1]['action']) # train the critic critic_inputs = copy.copy(batch.states(critic_keys)) diff --git a/docs/_modules/rl_coach/agents/dfp_agent.html b/docs/_modules/rl_coach/agents/dfp_agent.html index 2bcb686..2056a75 100644 --- a/docs/_modules/rl_coach/agents/dfp_agent.html +++ b/docs/_modules/rl_coach/agents/dfp_agent.html @@ -365,7 +365,7 @@ action_values = None # choose action according to the exploration policy and the current phase (evaluating or training the agent) - action = self.exploration_policy.get_action(action_values) + action, _ = self.exploration_policy.get_action(action_values) if action_values is not None: action_values = action_values.squeeze() diff --git a/docs/_modules/rl_coach/agents/dqn_agent.html b/docs/_modules/rl_coach/agents/dqn_agent.html index 7b7eb82..c823c0f 100644 --- a/docs/_modules/rl_coach/agents/dqn_agent.html +++ b/docs/_modules/rl_coach/agents/dqn_agent.html @@ -232,6 +232,7 @@ self.batch_size = 32 self.replace_mse_with_huber_loss = True self.create_target_network = True + self.should_get_softmax_probabilities = False class DQNAgentParameters(AgentParameters): diff --git a/docs/_modules/rl_coach/agents/nec_agent.html b/docs/_modules/rl_coach/agents/nec_agent.html index ef658b1..d1a877c 100644 --- a/docs/_modules/rl_coach/agents/nec_agent.html +++ b/docs/_modules/rl_coach/agents/nec_agent.html @@ -199,7 +199,7 @@ import os import pickle -from typing import Union +from typing import Union, List import numpy as np @@ -223,6 +223,7 @@ self.middleware_parameters = FCMiddlewareParameters() self.heads_parameters = [DNDQHeadParameters()] self.optimizer_type = 'Adam' + self.should_get_softmax_probabilities = False
    [docs]class NECAlgorithmParameters(AlgorithmParameters): @@ -349,11 +350,25 @@ return super().act() - def get_all_q_values_for_states(self, states: StateType): + def get_all_q_values_for_states(self, states: StateType, additional_outputs: List = None): # we need to store the state embeddings regardless if the action is random or not - return self.get_prediction(states) + return self.get_prediction_and_update_embeddings(states) - def get_prediction(self, states): + def get_all_q_values_for_states_and_softmax_probabilities(self, states: StateType): + # get the actions q values and the state embedding + embedding, actions_q_values, softmax_probabilities = self.networks['main'].online_network.predict( + self.prepare_batch_for_inference(states, 'main'), + outputs=[self.networks['main'].online_network.state_embedding, + self.networks['main'].online_network.output_heads[0].output, + self.networks['main'].online_network.output_heads[0].softmax] + ) + if self.phase != RunPhase.TEST: + # store the state embedding for inserting it to the DND later + self.current_episode_state_embeddings.append(embedding.squeeze()) + actions_q_values = actions_q_values[0][0] + return actions_q_values, softmax_probabilities + + def get_prediction_and_update_embeddings(self, states): # get the actions q values and the state embedding embedding, actions_q_values = self.networks['main'].online_network.predict( self.prepare_batch_for_inference(states, 'main'), @@ -362,7 +377,7 @@ ) if self.phase != RunPhase.TEST: # store the state embedding for inserting it to the DND later - self.current_episode_state_embeddings.append(embedding.squeeze()) + self.current_episode_state_embeddings.append(embedding[0].squeeze()) actions_q_values = actions_q_values[0][0] return actions_q_values diff --git a/docs/_modules/rl_coach/agents/qr_dqn_agent.html b/docs/_modules/rl_coach/agents/qr_dqn_agent.html index b08cb4c..d77af62 100644 --- a/docs/_modules/rl_coach/agents/qr_dqn_agent.html +++ b/docs/_modules/rl_coach/agents/qr_dqn_agent.html @@ -196,7 +196,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # - +from copy import copy from typing import Union import numpy as np @@ -262,6 +262,17 @@ actions_q_values = None return actions_q_values + # prediction's format is (batch,actions,atoms) + def get_all_q_values_for_states_and_softmax_probabilities(self, states: StateType): + actions_q_values, softmax_probabilities = None, None + if self.exploration_policy.requires_action_values(): + outputs = copy(self.networks['main'].online_network.outputs) + outputs.append(self.networks['main'].online_network.output_heads[0].softmax) + quantile_values, softmax_probabilities = self.get_prediction(states, outputs) + actions_q_values = self.get_q_values(quantile_values) + + return actions_q_values, softmax_probabilities + def learn_from_batch(self, batch): network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys() diff --git a/docs/_modules/rl_coach/agents/td3_agent.html b/docs/_modules/rl_coach/agents/td3_agent.html new file mode 100644 index 0000000..9b68178 --- /dev/null +++ b/docs/_modules/rl_coach/agents/td3_agent.html @@ -0,0 +1,448 @@ + + + + + + + + + + + rl_coach.agents.td3_agent — Reinforcement Learning Coach 0.12.0 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Source code for rl_coach.agents.td3_agent

    +#
    +# Copyright (c) 2019 Intel Corporation
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#      http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +#
    +
    +import copy
    +from typing import Union
    +from collections import OrderedDict
    +
    +import numpy as np
    +
    +from rl_coach.agents.agent import Agent
    +from rl_coach.agents.ddpg_agent import DDPGAgent
    +from rl_coach.architectures.embedder_parameters import InputEmbedderParameters
    +from rl_coach.architectures.head_parameters import DDPGActorHeadParameters, TD3VHeadParameters
    +from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters
    +from rl_coach.base_parameters import NetworkParameters, AlgorithmParameters, \
    +    AgentParameters, EmbedderScheme
    +from rl_coach.core_types import ActionInfo, TrainingSteps, Transition
    +from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
    +from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
    +from rl_coach.spaces import BoxActionSpace, GoalsSpace
    +
    +
    +class TD3CriticNetworkParameters(NetworkParameters):
    +    def __init__(self, num_q_networks):
    +        super().__init__()
    +        self.input_embedders_parameters = {'observation': InputEmbedderParameters(),
    +                                            'action': InputEmbedderParameters(scheme=EmbedderScheme.Shallow)}
    +        self.middleware_parameters = FCMiddlewareParameters(num_streams=num_q_networks)
    +        self.heads_parameters = [TD3VHeadParameters()]
    +        self.optimizer_type = 'Adam'
    +        self.adam_optimizer_beta2 = 0.999
    +        self.optimizer_epsilon = 1e-8
    +        self.batch_size = 100
    +        self.async_training = False
    +        self.learning_rate = 0.001
    +        self.create_target_network = True
    +        self.shared_optimizer = True
    +        self.scale_down_gradients_by_number_of_workers_for_sync_training = False
    +
    +
    +class TD3ActorNetworkParameters(NetworkParameters):
    +    def __init__(self):
    +        super().__init__()
    +        self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
    +        self.middleware_parameters = FCMiddlewareParameters()
    +        self.heads_parameters = [DDPGActorHeadParameters(batchnorm=False)]
    +        self.optimizer_type = 'Adam'
    +        self.adam_optimizer_beta2 = 0.999
    +        self.optimizer_epsilon = 1e-8
    +        self.batch_size = 100
    +        self.async_training = False
    +        self.learning_rate = 0.001
    +        self.create_target_network = True
    +        self.shared_optimizer = True
    +        self.scale_down_gradients_by_number_of_workers_for_sync_training = False
    +
    +
    +
    [docs]class TD3AlgorithmParameters(AlgorithmParameters): + """ + :param num_steps_between_copying_online_weights_to_target: (StepMethod) + The number of steps between copying the online network weights to the target network weights. + + :param rate_for_copying_weights_to_target: (float) + When copying the online network weights to the target network weights, a soft update will be used, which + weight the new online network weights by rate_for_copying_weights_to_target + + :param num_consecutive_playing_steps: (StepMethod) + The number of consecutive steps to act between every two training iterations + + :param use_target_network_for_evaluation: (bool) + If set to True, the target network will be used for predicting the actions when choosing actions to act. + Since the target network weights change more slowly, the predicted actions will be more consistent. + + :param action_penalty: (float) + The amount by which to penalize the network on high action feature (pre-activation) values. + This can prevent the actions features from saturating the TanH activation function, and therefore prevent the + gradients from becoming very low. + + :param clip_critic_targets: (Tuple[float, float] or None) + The range to clip the critic target to in order to prevent overestimation of the action values. + + :param use_non_zero_discount_for_terminal_states: (bool) + If set to True, the discount factor will be used for terminal states to bootstrap the next predicted state + values. If set to False, the terminal states reward will be taken as the target return for the network. + """ + def __init__(self): + super().__init__() + self.rate_for_copying_weights_to_target = 0.005 + self.use_target_network_for_evaluation = False + self.action_penalty = 0 + self.clip_critic_targets = None # expected to be a tuple of the form (min_clip_value, max_clip_value) or None + self.use_non_zero_discount_for_terminal_states = False + self.act_for_full_episodes = True + self.update_policy_every_x_episode_steps = 2 + self.num_steps_between_copying_online_weights_to_target = TrainingSteps(self.update_policy_every_x_episode_steps) + self.policy_noise = 0.2 + self.noise_clipping = 0.5 + self.num_q_networks = 2
    + + +class TD3AgentExplorationParameters(AdditiveNoiseParameters): + def __init__(self): + super().__init__() + self.noise_as_percentage_from_action_space = False + + +class TD3AgentParameters(AgentParameters): + def __init__(self): + td3_algorithm_params = TD3AlgorithmParameters() + super().__init__(algorithm=td3_algorithm_params, + exploration=TD3AgentExplorationParameters(), + memory=EpisodicExperienceReplayParameters(), + networks=OrderedDict([("actor", TD3ActorNetworkParameters()), + ("critic", + TD3CriticNetworkParameters(td3_algorithm_params.num_q_networks))])) + + @property + def path(self): + return 'rl_coach.agents.td3_agent:TD3Agent' + + +# Twin Delayed DDPG - https://arxiv.org/pdf/1802.09477.pdf +class TD3Agent(DDPGAgent): + def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None): + super().__init__(agent_parameters, parent) + + self.q_values = self.register_signal("Q") + self.TD_targets_signal = self.register_signal("TD targets") + self.action_signal = self.register_signal("actions") + + def learn_from_batch(self, batch): + actor = self.networks['actor'] + critic = self.networks['critic'] + + actor_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys() + critic_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys() + + # TD error = r + discount*max(q_st_plus_1) - q_st + next_actions, actions_mean = actor.parallel_prediction([ + (actor.target_network, batch.next_states(actor_keys)), + (actor.online_network, batch.states(actor_keys)) + ]) + + # add noise to the next_actions + noise = np.random.normal(0, self.ap.algorithm.policy_noise, next_actions.shape).clip( + -self.ap.algorithm.noise_clipping, self.ap.algorithm.noise_clipping) + next_actions = self.spaces.action.clip_action_to_space(next_actions + noise) + + critic_inputs = copy.copy(batch.next_states(critic_keys)) + critic_inputs['action'] = next_actions + q_st_plus_1 = critic.target_network.predict(critic_inputs)[2] # output #2 is the min (Q1, Q2) + + # calculate the bootstrapped TD targets while discounting terminal states according to + # use_non_zero_discount_for_terminal_states + if self.ap.algorithm.use_non_zero_discount_for_terminal_states: + TD_targets = batch.rewards(expand_dims=True) + self.ap.algorithm.discount * q_st_plus_1 + else: + TD_targets = batch.rewards(expand_dims=True) + \ + (1.0 - batch.game_overs(expand_dims=True)) * self.ap.algorithm.discount * q_st_plus_1 + + # clip the TD targets to prevent overestimation errors + if self.ap.algorithm.clip_critic_targets: + TD_targets = np.clip(TD_targets, *self.ap.algorithm.clip_critic_targets) + + self.TD_targets_signal.add_sample(TD_targets) + + # train the critic + critic_inputs = copy.copy(batch.states(critic_keys)) + critic_inputs['action'] = batch.actions(len(batch.actions().shape) == 1) + result = critic.train_and_sync_networks(critic_inputs, TD_targets) + total_loss, losses, unclipped_grads = result[:3] + + if self.training_iteration % self.ap.algorithm.update_policy_every_x_episode_steps == 0: + # get the gradients of output #3 (=mean of Q1 network) w.r.t the action + critic_inputs = copy.copy(batch.states(critic_keys)) + critic_inputs['action'] = actions_mean + action_gradients = critic.online_network.predict(critic_inputs, + outputs=critic.online_network.gradients_wrt_inputs[3]['action']) + + # apply the gradients from the critic to the actor + initial_feed_dict = {actor.online_network.gradients_weights_ph[0]: -action_gradients} + gradients = 
actor.online_network.predict(batch.states(actor_keys), + outputs=actor.online_network.weighted_gradients[0], + initial_feed_dict=initial_feed_dict) + + if actor.has_global: + actor.apply_gradients_to_global_network(gradients) + actor.update_online_network() + else: + actor.apply_gradients_to_online_network(gradients) + + return total_loss, losses, unclipped_grads + + def train(self): + self.ap.algorithm.num_consecutive_training_steps = self.current_episode_steps_counter + return Agent.train(self) + + def update_transition_before_adding_to_replay_buffer(self, transition: Transition) -> Transition: + """ + Allows agents to update the transition just before adding it to the replay buffer. + Can be useful for agents that want to tweak the reward, termination signal, etc. + + :param transition: the transition to update + :return: the updated transition + """ + transition.game_over = False if self.current_episode_steps_counter ==\ + self.parent_level_manager.environment.env._max_episode_steps\ + else transition.game_over + + return transition +
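For context, a preset-style sketch of wiring the new TD3 agent into a graph manager; the environment level and the graph-manager plumbing are assumptions based on other Coach presets, not part of this diff:

```python
from rl_coach.agents.td3_agent import TD3AgentParameters
from rl_coach.environments.gym_environment import GymVectorEnvironment
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import SimpleSchedule

# TD3 with its default twin critics, delayed actor updates and target-policy smoothing
agent_params = TD3AgentParameters()
env_params = GymVectorEnvironment(level='Hopper-v2')  # illustrative MuJoCo level

graph_manager = BasicRLGraphManager(agent_params=agent_params,
                                    env_params=env_params,
                                    schedule_params=SimpleSchedule())
```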
    + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/_modules/rl_coach/agents/value_optimization_agent.html b/docs/_modules/rl_coach/agents/value_optimization_agent.html index cc33a18..4ce7e7c 100644 --- a/docs/_modules/rl_coach/agents/value_optimization_agent.html +++ b/docs/_modules/rl_coach/agents/value_optimization_agent.html @@ -197,7 +197,7 @@ # limitations under the License. # from collections import OrderedDict -from typing import Union +from typing import Union, List import numpy as np @@ -207,7 +207,8 @@ from rl_coach.logger import screen from rl_coach.memories.non_episodic.prioritized_experience_replay import PrioritizedExperienceReplay from rl_coach.spaces import DiscreteActionSpace -from copy import deepcopy +from copy import deepcopy, copy + ## This is an abstract agent - there is no learn_from_batch method ## @@ -218,6 +219,12 @@ self.q_values = self.register_signal("Q") self.q_value_for_action = {} + # currently we use softmax action probabilities only in batch-rl, + # but we might want to extend this later at some point. + self.should_get_softmax_probabilities = \ + hasattr(self.ap.network_wrappers['main'], 'should_get_softmax_probabilities') and \ + self.ap.network_wrappers['main'].should_get_softmax_probabilities + def init_environment_dependent_modules(self): super().init_environment_dependent_modules() if isinstance(self.spaces.action, DiscreteActionSpace): @@ -228,12 +235,21 @@ # Algorithms for which q_values are calculated from predictions will override this function def get_all_q_values_for_states(self, states: StateType): + actions_q_values = None if self.exploration_policy.requires_action_values(): actions_q_values = self.get_prediction(states) - else: - actions_q_values = None + return actions_q_values + def get_all_q_values_for_states_and_softmax_probabilities(self, states: StateType): + actions_q_values, softmax_probabilities = None, None + if self.exploration_policy.requires_action_values(): + outputs = copy(self.networks['main'].online_network.outputs) + outputs.append(self.networks['main'].online_network.output_heads[0].softmax) + + actions_q_values, softmax_probabilities = self.get_prediction(states, outputs=outputs) + return actions_q_values, softmax_probabilities + def get_prediction(self, states, outputs=None): return self.networks['main'].online_network.predict(self.prepare_batch_for_inference(states, 'main'), outputs=outputs) @@ -255,10 +271,19 @@ ).format(policy.__class__.__name__)) def choose_action(self, curr_state): - actions_q_values = self.get_all_q_values_for_states(curr_state) + if self.should_get_softmax_probabilities: + actions_q_values, softmax_probabilities = \ + self.get_all_q_values_for_states_and_softmax_probabilities(curr_state) + else: + actions_q_values = self.get_all_q_values_for_states(curr_state) # choose action according to the exploration policy and the current phase (evaluating or training the agent) - action = self.exploration_policy.get_action(actions_q_values) + action, action_probabilities = self.exploration_policy.get_action(actions_q_values) + if self.should_get_softmax_probabilities and softmax_probabilities is not None: + # override the exploration policy's generated probabilities when an action was taken + # with the agent's actual policy + action_probabilities = softmax_probabilities + self._validate_action(self.exploration_policy, action) if actions_q_values is not None: @@ -270,15 +295,18 @@ self.q_values.add_sample(actions_q_values) actions_q_values = actions_q_values.squeeze() + 
action_probabilities = action_probabilities.squeeze() for i, q_value in enumerate(actions_q_values): self.q_value_for_action[i].add_sample(q_value) action_info = ActionInfo(action=action, action_value=actions_q_values[action], - max_action_value=np.max(actions_q_values)) + max_action_value=np.max(actions_q_values), + all_action_probabilities=action_probabilities) + else: - action_info = ActionInfo(action=action) + action_info = ActionInfo(action=action, all_action_probabilities=action_probabilities) return action_info diff --git a/docs/_modules/rl_coach/base_parameters.html b/docs/_modules/rl_coach/base_parameters.html index 62144ed..60aac7f 100644 --- a/docs/_modules/rl_coach/base_parameters.html +++ b/docs/_modules/rl_coach/base_parameters.html @@ -182,6 +182,7 @@

    Source code for rl_coach.base_parameters

     #
    +#
     # Copyright (c) 2017 Intel Corporation
     #
     # Licensed under the Apache License, Version 2.0 (the "License");
    @@ -405,7 +406,8 @@
                      reward_test_level=None,
                      test_using_a_trace_test=True,
                      trace_test_levels=None,
    -                 trace_max_env_steps=5000):
    +                 trace_max_env_steps=5000,
    +                 read_csv_tries=200):
             """
             :param test:
                 A flag which specifies if the preset should be tested as part of the validation process.
    @@ -428,6 +430,8 @@
             :param trace_max_env_steps:
                 An integer representing the maximum number of environment steps to run when running this preset as part
                 of the trace tests suite.
    +        :param read_csv_tries:
    +            The number of retries to attempt for reading the experiment csv file, before declaring failure.
             """
             super().__init__()
     
    @@ -443,7 +447,8 @@
             self.reward_test_level = reward_test_level
             self.test_using_a_trace_test = test_using_a_trace_test
             self.trace_test_levels = trace_test_levels
    -        self.trace_max_env_steps = trace_max_env_steps
+        self.trace_max_env_steps = trace_max_env_steps
+        self.read_csv_tries = read_csv_tries
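Illustrative usage of the new `read_csv_tries` knob, assuming these keyword arguments belong to `PresetValidationParameters` in `rl_coach.base_parameters`; the values are arbitrary:

```python
from rl_coach.base_parameters import PresetValidationParameters

# allow more retries when the experiment csv is written slowly (e.g. on a shared filesystem)
preset_validation_params = PresetValidationParameters(test=True,
                                                      trace_max_env_steps=5000,
                                                      read_csv_tries=400)
```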
    [docs]class NetworkParameters(Parameters): diff --git a/docs/_modules/rl_coach/exploration_policies/additive_noise.html b/docs/_modules/rl_coach/exploration_policies/additive_noise.html index 1cc0bd0..44e2dc3 100644 --- a/docs/_modules/rl_coach/exploration_policies/additive_noise.html +++ b/docs/_modules/rl_coach/exploration_policies/additive_noise.html @@ -202,24 +202,27 @@ import numpy as np from rl_coach.core_types import RunPhase, ActionType -from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters +from rl_coach.exploration_policies.exploration_policy import ContinuousActionExplorationPolicy, ExplorationParameters from rl_coach.schedules import Schedule, LinearSchedule from rl_coach.spaces import ActionSpace, BoxActionSpace # TODO: consider renaming to gaussian sampling + + class AdditiveNoiseParameters(ExplorationParameters): def __init__(self): super().__init__() - self.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000) - self.evaluation_noise_percentage = 0.05 + self.noise_schedule = LinearSchedule(0.1, 0.1, 50000) + self.evaluation_noise = 0.05 + self.noise_as_percentage_from_action_space = True @property def path(self): return 'rl_coach.exploration_policies.additive_noise:AdditiveNoise' -
    [docs]class AdditiveNoise(ExplorationPolicy): +
    [docs]class AdditiveNoise(ContinuousActionExplorationPolicy): """ AdditiveNoise is an exploration policy intended for continuous action spaces. It takes the action from the agent and adds a Gaussian distributed noise to it. The amount of noise added to the action follows the noise amount that @@ -228,17 +231,19 @@ 2. Specified by the agents action. In case the agents action is a list with 2 values, the 1st one is assumed to be the mean of the action, and 2nd is assumed to be its standard deviation. """ - def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule, - evaluation_noise_percentage: float): + def __init__(self, action_space: ActionSpace, noise_schedule: Schedule, + evaluation_noise: float, noise_as_percentage_from_action_space: bool = True): """ :param action_space: the action space used by the environment - :param noise_percentage_schedule: the schedule for the noise variance percentage relative to the absolute range - of the action space - :param evaluation_noise_percentage: the noise variance percentage that will be used during evaluation phases + :param noise_schedule: the schedule for the noise + :param evaluation_noise: the noise variance that will be used during evaluation phases + :param noise_as_percentage_from_action_space: a bool deciding whether the noise is absolute or as a percentage + from the action space """ super().__init__(action_space) - self.noise_percentage_schedule = noise_percentage_schedule - self.evaluation_noise_percentage = evaluation_noise_percentage + self.noise_schedule = noise_schedule + self.evaluation_noise = evaluation_noise + self.noise_as_percentage_from_action_space = noise_as_percentage_from_action_space if not isinstance(action_space, BoxActionSpace): raise ValueError("Additive noise exploration works only for continuous controls." 
@@ -248,19 +253,20 @@ or not np.all(-np.inf < action_space.low) or not np.all(action_space.low < np.inf): raise ValueError("Additive noise exploration requires bounded actions") - # TODO: allow working with unbounded actions by defining the noise in terms of range and not percentage - def get_action(self, action_values: List[ActionType]) -> ActionType: # TODO-potential-bug consider separating internally defined stdev and externally defined stdev into 2 policies - # set the current noise percentage + # set the current noise if self.phase == RunPhase.TEST: - current_noise_precentage = self.evaluation_noise_percentage + current_noise = self.evaluation_noise else: - current_noise_precentage = self.noise_percentage_schedule.current_value + current_noise = self.noise_schedule.current_value # scale the noise to the action space range - action_values_std = current_noise_precentage * (self.action_space.high - self.action_space.low) + if self.noise_as_percentage_from_action_space: + action_values_std = current_noise * (self.action_space.high - self.action_space.low) + else: + action_values_std = current_noise # extract the mean values if isinstance(action_values, list): @@ -272,18 +278,21 @@ # step the noise schedule if self.phase is not RunPhase.TEST: - self.noise_percentage_schedule.step() + self.noise_schedule.step() # the second element of the list is assumed to be the standard deviation if isinstance(action_values, list) and len(action_values) > 1: action_values_std = action_values[1].squeeze() # add noise to the action means - action = np.random.normal(action_values_mean, action_values_std) + if self.phase is not RunPhase.TEST: + action = np.random.normal(action_values_mean, action_values_std) + else: + action = action_values_mean - return action + return np.atleast_1d(action) def get_control_param(self): - return np.ones(self.action_space.shape)*self.noise_percentage_schedule.current_value
    + return np.ones(self.action_space.shape)*self.noise_schedule.current_value
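A short sketch of configuring the renamed noise parameters; the schedule values are illustrative, only the attribute names come from this change:

```python
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.schedules import LinearSchedule

exploration_params = AdditiveNoiseParameters()
exploration_params.noise_schedule = LinearSchedule(0.2, 0.05, 100000)  # decaying noise std
exploration_params.evaluation_noise = 0.0                              # act deterministically when evaluating
# interpret the schedule as absolute standard deviations rather than a percentage
# of the action-space range (the new flag introduced above)
exploration_params.noise_as_percentage_from_action_space = False
```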
    diff --git a/docs/_modules/rl_coach/exploration_policies/boltzmann.html b/docs/_modules/rl_coach/exploration_policies/boltzmann.html index 9a5620b..09b228b 100644 --- a/docs/_modules/rl_coach/exploration_policies/boltzmann.html +++ b/docs/_modules/rl_coach/exploration_policies/boltzmann.html @@ -202,7 +202,7 @@ import numpy as np from rl_coach.core_types import RunPhase, ActionType -from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters +from rl_coach.exploration_policies.exploration_policy import DiscreteActionExplorationPolicy, ExplorationParameters from rl_coach.schedules import Schedule from rl_coach.spaces import ActionSpace @@ -217,8 +217,7 @@ return 'rl_coach.exploration_policies.boltzmann:Boltzmann' - -
    [docs]class Boltzmann(ExplorationPolicy): +
    [docs]class Boltzmann(DiscreteActionExplorationPolicy): """ The Boltzmann exploration policy is intended for discrete action spaces. It assumes that each of the possible actions has some value assigned to it (such as the Q value), and uses a softmax function to convert these values @@ -233,7 +232,7 @@ super().__init__(action_space) self.temperature_schedule = temperature_schedule - def get_action(self, action_values: List[ActionType]) -> ActionType: + def get_action(self, action_values: List[ActionType]) -> (ActionType, List[float]): if self.phase == RunPhase.TRAIN: self.temperature_schedule.step() # softmax calculation @@ -242,7 +241,8 @@ # make sure probs sum to 1 probabilities[-1] = 1 - np.sum(probabilities[:-1]) # choose actions according to the probabilities - return np.random.choice(range(self.action_space.shape), p=probabilities) + action = np.random.choice(range(self.action_space.shape), p=probabilities) + return action, probabilities def get_control_param(self): return self.temperature_schedule.current_value
    diff --git a/docs/_modules/rl_coach/exploration_policies/categorical.html b/docs/_modules/rl_coach/exploration_policies/categorical.html index 3ba4e81..2dabd88 100644 --- a/docs/_modules/rl_coach/exploration_policies/categorical.html +++ b/docs/_modules/rl_coach/exploration_policies/categorical.html @@ -202,7 +202,7 @@ import numpy as np from rl_coach.core_types import RunPhase, ActionType -from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters +from rl_coach.exploration_policies.exploration_policy import DiscreteActionExplorationPolicy, ExplorationParameters from rl_coach.spaces import ActionSpace @@ -212,7 +212,7 @@ return 'rl_coach.exploration_policies.categorical:Categorical' -
    [docs]class Categorical(ExplorationPolicy): +
    [docs]class Categorical(DiscreteActionExplorationPolicy): """ Categorical exploration policy is intended for discrete action spaces. It expects the action values to represent a probability distribution over the action, from which a single action will be sampled. @@ -225,13 +225,18 @@ """ super().__init__(action_space) - def get_action(self, action_values: List[ActionType]) -> ActionType: + def get_action(self, action_values: List[ActionType]) -> (ActionType, List[float]): if self.phase == RunPhase.TRAIN: # choose actions according to the probabilities - return np.random.choice(self.action_space.actions, p=action_values) + action = np.random.choice(self.action_space.actions, p=action_values) + return action, action_values else: # take the action with the highest probability - return np.argmax(action_values) + action = np.argmax(action_values) + one_hot_action_probabilities = np.zeros(len(self.action_space.actions)) + one_hot_action_probabilities[action] = 1 + + return action, one_hot_action_probabilities def get_control_param(self): return 0
    diff --git a/docs/_modules/rl_coach/exploration_policies/e_greedy.html b/docs/_modules/rl_coach/exploration_policies/e_greedy.html index f6ee4c1..e476897 100644 --- a/docs/_modules/rl_coach/exploration_policies/e_greedy.html +++ b/docs/_modules/rl_coach/exploration_policies/e_greedy.html @@ -203,8 +203,7 @@ from rl_coach.core_types import RunPhase, ActionType from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters -from rl_coach.exploration_policies.exploration_policy import ExplorationParameters -from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy +from rl_coach.exploration_policies.exploration_policy import ExplorationParameters, ExplorationPolicy from rl_coach.schedules import Schedule, LinearSchedule from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace from rl_coach.utils import dynamic_import_and_instantiate_module_from_params @@ -216,7 +215,7 @@ self.epsilon_schedule = LinearSchedule(0.5, 0.01, 50000) self.evaluation_epsilon = 0.05 self.continuous_exploration_policy_parameters = AdditiveNoiseParameters() - self.continuous_exploration_policy_parameters.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000) + self.continuous_exploration_policy_parameters.noise_schedule = LinearSchedule(0.1, 0.1, 50000) # for continuous control - # (see http://www.cs.ubc.ca/~van/papers/2017-TOG-deepLoco/2017-TOG-deepLoco.pdf) @@ -265,26 +264,32 @@ epsilon = self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon_schedule.current_value return self.current_random_value >= epsilon - def get_action(self, action_values: List[ActionType]) -> ActionType: + def get_action(self, action_values: List[ActionType]) -> (ActionType, List[float]): epsilon = self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon_schedule.current_value if isinstance(self.action_space, DiscreteActionSpace): - top_action = np.argmax(action_values) if self.current_random_value < epsilon: chosen_action = self.action_space.sample() + probabilities = np.full(len(self.action_space.actions), + 1. / (self.action_space.high[0] - self.action_space.low[0] + 1)) else: - chosen_action = top_action + chosen_action = np.argmax(action_values) + + # one-hot probabilities vector + probabilities = np.zeros(len(self.action_space.actions)) + probabilities[chosen_action] = 1 + + self.step_epsilon() + return chosen_action, probabilities + else: if self.current_random_value < epsilon and self.phase == RunPhase.TRAIN: chosen_action = self.action_space.sample() else: chosen_action = self.continuous_exploration_policy.get_action(action_values) - # step the epsilon schedule and generate a new random value for next time - if self.phase == RunPhase.TRAIN: - self.epsilon_schedule.step() - self.current_random_value = np.random.rand() - return chosen_action + self.step_epsilon() + return chosen_action def get_control_param(self): if isinstance(self.action_space, DiscreteActionSpace): @@ -295,7 +300,13 @@ def change_phase(self, phase): super().change_phase(phase) if isinstance(self.action_space, BoxActionSpace): - self.continuous_exploration_policy.change_phase(phase)
    + self.continuous_exploration_policy.change_phase(phase) + + def step_epsilon(self): + # step the epsilon schedule and generate a new random value for next time + if self.phase == RunPhase.TRAIN: + self.epsilon_schedule.step() + self.current_random_value = np.random.rand()
    diff --git a/docs/_modules/rl_coach/exploration_policies/exploration_policy.html b/docs/_modules/rl_coach/exploration_policies/exploration_policy.html index 912355e..f5ffd7d 100644 --- a/docs/_modules/rl_coach/exploration_policies/exploration_policy.html +++ b/docs/_modules/rl_coach/exploration_policies/exploration_policy.html @@ -201,7 +201,7 @@ from rl_coach.base_parameters import Parameters from rl_coach.core_types import RunPhase, ActionType -from rl_coach.spaces import ActionSpace +from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace, GoalsSpace class ExplorationParameters(Parameters): @@ -237,14 +237,10 @@ Given a list of values corresponding to each action, choose one actions according to the exploration policy :param action_values: A list of action values - :return: The chosen action + :return: The chosen action, + The probability of the action (if available, otherwise 1 for absolute certainty in the action) """ - if self.__class__ == ExplorationPolicy: - raise ValueError("The ExplorationPolicy class is an abstract class and should not be used directly. " - "Please set the exploration parameters to point to an inheriting class like EGreedy or " - "AdditiveNoise") - else: - raise ValueError("The get_action function should be overridden in the inheriting exploration class") + raise NotImplementedError()
    [docs] def change_phase(self, phase): """ @@ -265,6 +261,45 @@ def get_control_param(self): return 0
    + + +class DiscreteActionExplorationPolicy(ExplorationPolicy): + """ + A discrete action exploration policy. + """ + def __init__(self, action_space: ActionSpace): + """ + :param action_space: the action space used by the environment + """ + assert isinstance(action_space, DiscreteActionSpace) + super().__init__(action_space) + + def get_action(self, action_values: List[ActionType]) -> (ActionType, List): + """ + Given a list of values corresponding to each action, + choose one actions according to the exploration policy + :param action_values: A list of action values + :return: The chosen action, + The probabilities of actions to select from (if not available a one-hot vector) + """ + if self.__class__ == ExplorationPolicy: + raise ValueError("The ExplorationPolicy class is an abstract class and should not be used directly. " + "Please set the exploration parameters to point to an inheriting class like EGreedy or " + "AdditiveNoise") + else: + raise ValueError("The get_action function should be overridden in the inheriting exploration class") + + +class ContinuousActionExplorationPolicy(ExplorationPolicy): + """ + A continuous action exploration policy. + """ + def __init__(self, action_space: ActionSpace): + """ + :param action_space: the action space used by the environment + """ + assert isinstance(action_space, BoxActionSpace) or isinstance(action_space, GoalsSpace) + super().__init__(action_space) diff --git a/docs/_modules/rl_coach/exploration_policies/greedy.html b/docs/_modules/rl_coach/exploration_policies/greedy.html index 6756ad6..cf0ab20 100644 --- a/docs/_modules/rl_coach/exploration_policies/greedy.html +++ b/docs/_modules/rl_coach/exploration_policies/greedy.html @@ -202,7 +202,7 @@ import numpy as np from rl_coach.core_types import ActionType -from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters +from rl_coach.exploration_policies.exploration_policy import ExplorationParameters, ExplorationPolicy from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace @@ -224,9 +224,12 @@ """ super().__init__(action_space) - def get_action(self, action_values: List[ActionType]) -> ActionType: + def get_action(self, action_values: List[ActionType]): if type(self.action_space) == DiscreteActionSpace: - return np.argmax(action_values) + action = np.argmax(action_values) + one_hot_action_probabilities = np.zeros(len(self.action_space.actions)) + one_hot_action_probabilities[action] = 1 + return action, one_hot_action_probabilities if type(self.action_space) == BoxActionSpace: return action_values diff --git a/docs/_modules/rl_coach/exploration_policies/ou_process.html b/docs/_modules/rl_coach/exploration_policies/ou_process.html index 54010b2..52dd9d6 100644 --- a/docs/_modules/rl_coach/exploration_policies/ou_process.html +++ b/docs/_modules/rl_coach/exploration_policies/ou_process.html @@ -202,12 +202,13 @@ import numpy as np from rl_coach.core_types import RunPhase, ActionType -from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters +from rl_coach.exploration_policies.exploration_policy import ContinuousActionExplorationPolicy, ExplorationParameters from rl_coach.spaces import ActionSpace, BoxActionSpace, GoalsSpace # Based on on the description in: # https://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab + class OUProcessParameters(ExplorationParameters): def __init__(self): super().__init__() @@ -222,7 +223,7 @@ # Ornstein-Uhlenbeck 
process -
    [docs]class OUProcess(ExplorationPolicy): +
    [docs]class OUProcess(ContinuousActionExplorationPolicy): """ OUProcess exploration policy is intended for continuous action spaces, and selects the action according to an Ornstein-Uhlenbeck process. The Ornstein-Uhlenbeck process implements the action as a Gaussian process, where @@ -239,10 +240,6 @@ self.state = np.zeros(self.action_space.shape) self.dt = dt - if not (isinstance(action_space, BoxActionSpace) or isinstance(action_space, GoalsSpace)): - raise ValueError("OU process exploration works only for continuous controls." - "The given action space is of type: {}".format(action_space.__class__.__name__)) - def reset(self): self.state = np.zeros(self.action_space.shape) diff --git a/docs/_modules/rl_coach/exploration_policies/parameter_noise.html b/docs/_modules/rl_coach/exploration_policies/parameter_noise.html index d381dd2..3d9a10f 100644 --- a/docs/_modules/rl_coach/exploration_policies/parameter_noise.html +++ b/docs/_modules/rl_coach/exploration_policies/parameter_noise.html @@ -242,9 +242,13 @@ self.network_params = network_params self._replace_network_dense_layers() - def get_action(self, action_values: List[ActionType]) -> ActionType: + def get_action(self, action_values: List[ActionType]): if type(self.action_space) == DiscreteActionSpace: - return np.argmax(action_values) + action = np.argmax(action_values) + one_hot_action_probabilities = np.zeros(len(self.action_space.actions)) + one_hot_action_probabilities[action] = 1 + + return action, one_hot_action_probabilities elif type(self.action_space) == BoxActionSpace: action_values_mean = action_values[0].squeeze() action_values_std = action_values[1].squeeze() diff --git a/docs/_modules/rl_coach/exploration_policies/truncated_normal.html b/docs/_modules/rl_coach/exploration_policies/truncated_normal.html index a8a0acc..7d33198 100644 --- a/docs/_modules/rl_coach/exploration_policies/truncated_normal.html +++ b/docs/_modules/rl_coach/exploration_policies/truncated_normal.html @@ -203,7 +203,7 @@ from scipy.stats import truncnorm from rl_coach.core_types import RunPhase, ActionType -from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters +from rl_coach.exploration_policies.exploration_policy import ExplorationParameters, ContinuousActionExplorationPolicy from rl_coach.schedules import Schedule, LinearSchedule from rl_coach.spaces import ActionSpace, BoxActionSpace @@ -211,17 +211,18 @@ class TruncatedNormalParameters(ExplorationParameters): def __init__(self): super().__init__() - self.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000) - self.evaluation_noise_percentage = 0.05 + self.noise_schedule = LinearSchedule(0.1, 0.1, 50000) + self.evaluation_noise = 0.05 self.clip_low = 0 self.clip_high = 1 + self.noise_as_percentage_from_action_space = True @property def path(self): return 'rl_coach.exploration_policies.truncated_normal:TruncatedNormal' -
    [docs]class TruncatedNormal(ExplorationPolicy): +
    [docs]class TruncatedNormal(ContinuousActionExplorationPolicy): """ The TruncatedNormal exploration policy is intended for continuous action spaces. It samples the action from a normal distribution, where the mean action is given by the agent, and the standard deviation can be given in t @@ -232,17 +233,20 @@ When the sampled action is outside of the action bounds given by the user, it is sampled again and again, until it is within the bounds. """ - def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule, - evaluation_noise_percentage: float, clip_low: float, clip_high: float): + def __init__(self, action_space: ActionSpace, noise_schedule: Schedule, + evaluation_noise: float, clip_low: float, clip_high: float, + noise_as_percentage_from_action_space: bool = True): """ :param action_space: the action space used by the environment - :param noise_percentage_schedule: the schedule for the noise variance percentage relative to the absolute range - of the action space - :param evaluation_noise_percentage: the noise variance percentage that will be used during evaluation phases + :param noise_schedule: the schedule for the noise variance + :param evaluation_noise: the noise variance that will be used during evaluation phases + :param noise_as_percentage_from_action_space: whether to consider the noise as a percentage of the action space + or absolute value """ super().__init__(action_space) - self.noise_percentage_schedule = noise_percentage_schedule - self.evaluation_noise_percentage = evaluation_noise_percentage + self.noise_schedule = noise_schedule + self.evaluation_noise = evaluation_noise + self.noise_as_percentage_from_action_space = noise_as_percentage_from_action_space self.clip_low = clip_low self.clip_high = clip_high @@ -254,17 +258,21 @@ or not np.all(-np.inf < action_space.low) or not np.all(action_space.low < np.inf): raise ValueError("Additive noise exploration requires bounded actions") - # TODO: allow working with unbounded actions by defining the noise in terms of range and not percentage - def get_action(self, action_values: List[ActionType]) -> ActionType: - # set the current noise percentage + # set the current noise if self.phase == RunPhase.TEST: - current_noise_precentage = self.evaluation_noise_percentage + current_noise = self.evaluation_noise else: - current_noise_precentage = self.noise_percentage_schedule.current_value + current_noise = self.noise_schedule.current_value # scale the noise to the action space range - action_values_std = current_noise_precentage * (self.action_space.high - self.action_space.low) + if self.noise_as_percentage_from_action_space: + action_values_std = current_noise * (self.action_space.high - self.action_space.low) + else: + action_values_std = current_noise + + # scale the noise to the action space range + action_values_std = current_noise * (self.action_space.high - self.action_space.low) # extract the mean values if isinstance(action_values, list): @@ -276,7 +284,7 @@ # step the noise schedule if self.phase is not RunPhase.TEST: - self.noise_percentage_schedule.step() + self.noise_schedule.step() # the second element of the list is assumed to be the standard deviation if isinstance(action_values, list) and len(action_values) > 1: action_values_std = action_values[1].squeeze() @@ -290,7 +298,7 @@ return action def get_control_param(self): - return np.ones(self.action_space.shape)*self.noise_percentage_schedule.current_value
    + return np.ones(self.action_space.shape)*self.noise_schedule.current_value
    diff --git a/docs/_modules/rl_coach/filters/observation/observation_stacking_filter.html b/docs/_modules/rl_coach/filters/observation/observation_stacking_filter.html index 25c22f4..7e60af4 100644 --- a/docs/_modules/rl_coach/filters/observation/observation_stacking_filter.html +++ b/docs/_modules/rl_coach/filters/observation/observation_stacking_filter.html @@ -204,7 +204,7 @@ from rl_coach.core_types import ObservationType from rl_coach.filters.observation.observation_filter import ObservationFilter -from rl_coach.spaces import ObservationSpace +from rl_coach.spaces import ObservationSpace, VectorObservationSpace class LazyStack(object): @@ -246,6 +246,7 @@ self.stack_size = stack_size self.stacking_axis = stacking_axis self.stack = [] + self.input_observation_space = None if stack_size <= 0: raise ValueError("The stack shape must be a positive number") @@ -269,7 +270,6 @@ raise ValueError("The stacking axis is larger than the number of dimensions in the observation space") def filter(self, observation: ObservationType, update_internal_state: bool=True) -> ObservationType: - if len(self.stack) == 0: self.stack = deque([observation] * self.stack_size, maxlen=self.stack_size) else: @@ -277,14 +277,21 @@ self.stack.append(observation) observation = LazyStack(self.stack, self.stacking_axis) + if isinstance(self.input_observation_space, VectorObservationSpace): + # when stacking vectors, we cannot avoid copying the memory as we're flattening it all + observation = np.array(observation).flatten() + return observation def get_filtered_observation_space(self, input_observation_space: ObservationSpace) -> ObservationSpace: - if self.stacking_axis == -1: - input_observation_space.shape = np.append(input_observation_space.shape, values=[self.stack_size], axis=0) + if isinstance(input_observation_space, VectorObservationSpace): + self.input_observation_space = input_observation_space = VectorObservationSpace(input_observation_space.shape * self.stack_size) else: - input_observation_space.shape = np.insert(input_observation_space.shape, obj=self.stacking_axis, - values=[self.stack_size], axis=0) + if self.stacking_axis == -1: + input_observation_space.shape = np.append(input_observation_space.shape, values=[self.stack_size], axis=0) + else: + input_observation_space.shape = np.insert(input_observation_space.shape, obj=self.stacking_axis, + values=[self.stack_size], axis=0) return input_observation_space def reset(self) -> None: diff --git a/docs/_modules/rl_coach/memories/episodic/episodic_experience_replay.html b/docs/_modules/rl_coach/memories/episodic/episodic_experience_replay.html index 02a7056..2070f69 100644 --- a/docs/_modules/rl_coach/memories/episodic/episodic_experience_replay.html +++ b/docs/_modules/rl_coach/memories/episodic/episodic_experience_replay.html @@ -208,6 +208,7 @@ import random from rl_coach.core_types import Transition, Episode +from rl_coach.filters.filter import InputFilter from rl_coach.logger import screen from rl_coach.memories.memory import Memory, MemoryGranularity, MemoryParameters from rl_coach.utils import ReaderWriterLock, ProgressBar @@ -591,11 +592,12 @@ self.reader_writer_lock.release_writing() return mean - def load_csv(self, csv_dataset: CsvDataset) -> None: + def load_csv(self, csv_dataset: CsvDataset, input_filter: InputFilter) -> None: """ Restore the replay buffer contents from a csv file. The csv file is assumed to include a list of transitions. 
:param csv_dataset: A construct which holds the dataset parameters + :param input_filter: A filter used to filter the CSV data before feeding it to the memory. """ self.assert_not_frozen() @@ -612,18 +614,30 @@ for e_id in episode_ids: progress_bar.update(e_id) df_episode_transitions = df[df['episode_id'] == e_id] + input_filter.reset() + + if len(df_episode_transitions) < 2: + # we have to have at least 2 rows in each episode for creating a transition + continue + episode = Episode() + transitions = [] for (_, current_transition), (_, next_transition) in zip(df_episode_transitions[:-1].iterrows(), df_episode_transitions[1:].iterrows()): state = np.array([current_transition[col] for col in state_columns]) next_state = np.array([next_transition[col] for col in state_columns]) - episode.insert( + transitions.append( Transition(state={'observation': state}, action=current_transition['action'], reward=current_transition['reward'], next_state={'observation': next_state}, game_over=False, info={'all_action_probabilities': - ast.literal_eval(current_transition['all_action_probabilities'])})) + ast.literal_eval(current_transition['all_action_probabilities'])}), + ) + + transitions = input_filter.filter(transitions, deep_copy=False) + for t in transitions: + episode.insert(t) # Set the last transition to end the episode if csv_dataset.is_episodic: @@ -635,8 +649,6 @@ progress_bar.update(len(episode_ids)) progress_bar.close() - self.shuffle_episodes() - def freeze(self): """ Freezing the replay buffer does not allow any new transitions to be added to the memory. diff --git a/docs/_sources/components/agents/index.rst.txt b/docs/_sources/components/agents/index.rst.txt index 476bc7a..ca21713 100644 --- a/docs/_sources/components/agents/index.rst.txt +++ b/docs/_sources/components/agents/index.rst.txt @@ -21,6 +21,7 @@ A detailed description of those algorithms can be found by navigating to each of imitation/cil policy_optimization/cppo policy_optimization/ddpg + policy_optimization/td3 policy_optimization/sac other/dfp value_optimization/double_dqn diff --git a/docs/_sources/components/agents/policy_optimization/td3.rst.txt b/docs/_sources/components/agents/policy_optimization/td3.rst.txt new file mode 100644 index 0000000..f7b9a78 --- /dev/null +++ b/docs/_sources/components/agents/policy_optimization/td3.rst.txt @@ -0,0 +1,55 @@ +Twin Delayed Deep Deterministic Policy Gradient +================================== + +**Actions space:** Continuous + +**References:** `Addressing Function Approximation Error in Actor-Critic Methods `_ + +Network Structure +----------------- + +.. image:: /_static/img/design_imgs/td3.png + :align: center + +Algorithm Description +--------------------- +Choosing an action +++++++++++++++++++ + +Pass the current states through the actor network, and get an action mean vector :math:`\mu`. +While in training phase, use a continuous exploration policy, such as a small zero-meaned gaussian noise, +to add exploration noise to the action. When testing, use the mean vector :math:`\mu` as-is. + +Training the network +++++++++++++++++++++ + +Start by sampling a batch of transitions from the experience replay. + +* To train the two **critic networks**, use the following targets: + + :math:`y_t=r(s_t,a_t )+\gamma \cdot \min_{i=1,2} Q_{i}(s_{t+1},\mu(s_{t+1} )+[\mathcal{N}(0,\,\sigma^{2})]^{MAX\_NOISE}_{MIN\_NOISE})` + + First run the actor target network, using the next states as the inputs, and get :math:`\mu (s_{t+1} )`. 
Then, add a + clipped gaussian noise to these actions, and clip the resulting actions to the actions space. + Next, run the critic target networks using the next states and :math:`\mu (s_{t+1} )+[\mathcal{N}(0,\,\sigma^{2})]^{MAX\_NOISE}_{MIN\_NOISE}`, + and use the minimum between the two critic networks predictions in order to calculate :math:`y_t` according to the + equation above. To train the networks, use the current states and actions as the inputs, and :math:`y_t` + as the targets. + +* To train the **actor network**, use the following equation: + + :math:`\nabla_{\theta^\mu } J \approx E_{s_t \tilde{} \rho^\beta } [\nabla_a Q_{1}(s,a)|_{s=s_t,a=\mu (s_t ) } \cdot \nabla_{\theta^\mu} \mu(s)|_{s=s_t} ]` + + Use the actor's online network to get the action mean values using the current states as the inputs. + Then, use the first critic's online network in order to get the gradients of the critic output with respect to the + action mean values :math:`\nabla _a Q_{1}(s,a)|_{s=s_t,a=\mu(s_t ) }`. + Using the chain rule, calculate the gradients of the actor's output, with respect to the actor weights, + given :math:`\nabla_a Q(s,a)`. Finally, apply those gradients to the actor network. + + The actor's training is done at a slower frequency than the critic's training, in order to allow the critic to better fit the + current policy, before exercising the critic in order to train the actor. + Following the same, delayed, actor's training cadence, do a soft update of the critic and actor target networks' weights + from the online networks. + + +.. autoclass:: rl_coach.agents.td3_agent.TD3AlgorithmParameters \ No newline at end of file diff --git a/docs/_sources/selecting_an_algorithm.rst.txt b/docs/_sources/selecting_an_algorithm.rst.txt index e4027de..d039cc4 100644 --- a/docs/_sources/selecting_an_algorithm.rst.txt +++ b/docs/_sources/selecting_an_algorithm.rst.txt @@ -214,6 +214,16 @@ The algorithms are ordered by their release date in descending order. and therefore it is able to use a replay buffer in order to improve sample efficiency. +
    + + TD3 +
 + Very similar to DDPG: an actor-critic for continuous action spaces that uses a replay buffer to + improve sample efficiency. TD3 uses two critic networks to mitigate overestimation of the + Q state-action value predictions, delays the actor updates to increase stability, and + adds noise to the actions used for training the critic in order to smooth out the critic's predictions. 
    +
    PPO diff --git a/docs/_static/basic.css b/docs/_static/basic.css index 53acd09..c41d718 100644 --- a/docs/_static/basic.css +++ b/docs/_static/basic.css @@ -289,6 +289,12 @@ img.align-center, .figure.align-center, object.align-center { margin-right: auto; } +img.align-default, .figure.align-default { + display: block; + margin-left: auto; + margin-right: auto; +} + .align-left { text-align: left; } @@ -297,6 +303,10 @@ img.align-center, .figure.align-center, object.align-center { text-align: center; } +.align-default { + text-align: center; +} + .align-right { text-align: right; } @@ -368,6 +378,11 @@ table.align-center { margin-right: auto; } +table.align-default { + margin-left: auto; + margin-right: auto; +} + table caption span.caption-number { font-style: italic; } diff --git a/docs/_static/searchtools.js b/docs/_static/searchtools.js index bdc2706..6031f99 100644 --- a/docs/_static/searchtools.js +++ b/docs/_static/searchtools.js @@ -319,12 +319,13 @@ var Search = { for (var prefix in objects) { for (var name in objects[prefix]) { var fullname = (prefix ? prefix + '.' : '') + name; - if (fullname.toLowerCase().indexOf(object) > -1) { + var fullnameLower = fullname.toLowerCase() + if (fullnameLower.indexOf(object) > -1) { var score = 0; - var parts = fullname.split('.'); + var parts = fullnameLower.split('.'); // check for different match types: exact matches of full name or // "last name" (i.e. last dotted part) - if (fullname == object || parts[parts.length - 1] == object) { + if (fullnameLower == object || parts[parts.length - 1] == object) { score += Scorer.objNameMatch; // matches in last name } else if (parts[parts.length - 1].indexOf(object) > -1) { diff --git a/docs/components/additional_parameters.html b/docs/components/additional_parameters.html index 7e51f31..3070a91 100644 --- a/docs/components/additional_parameters.html +++ b/docs/components/additional_parameters.html @@ -195,7 +195,7 @@

    VisualizationParameters

    -class rl_coach.base_parameters.VisualizationParameters(print_networks_summary=False, dump_csv=True, dump_signals_to_csv_every_x_episodes=5, dump_gifs=False, dump_mp4=False, video_dump_methods=None, dump_in_episode_signals=False, dump_parameters_documentation=True, render=False, native_rendering=False, max_fps_for_human_control=10, tensorboard=False, add_rendered_image_to_env_response=False)[source]
    +class rl_coach.base_parameters.VisualizationParameters(print_networks_summary=False, dump_csv=True, dump_signals_to_csv_every_x_episodes=5, dump_gifs=False, dump_mp4=False, video_dump_methods=None, dump_in_episode_signals=False, dump_parameters_documentation=True, render=False, native_rendering=False, max_fps_for_human_control=10, tensorboard=False, add_rendered_image_to_env_response=False)[source]
    Parameters
      @@ -244,7 +244,7 @@ which will be passed to the agent and allow using those images.

      PresetValidationParameters

      -class rl_coach.base_parameters.PresetValidationParameters(test=False, min_reward_threshold=0, max_episodes_to_achieve_reward=1, num_workers=1, reward_test_level=None, test_using_a_trace_test=True, trace_test_levels=None, trace_max_env_steps=5000)[source]
      +class rl_coach.base_parameters.PresetValidationParameters(test=False, min_reward_threshold=0, max_episodes_to_achieve_reward=1, num_workers=1, reward_test_level=None, test_using_a_trace_test=True, trace_test_levels=None, trace_max_env_steps=5000, read_csv_tries=200)[source]
      Parameters
        @@ -261,6 +261,7 @@ reward tests suite.

        trace tests suite.

      • trace_max_env_steps – An integer representing the maximum number of environment steps to run when running this preset as part of the trace tests suite.

      • +
      • read_csv_tries – The number of retries to attempt for reading the experiment csv file, before declaring failure.

      @@ -271,7 +272,7 @@ of the trace tests suite.

      TaskParameters

      -class rl_coach.base_parameters.TaskParameters(framework_type: rl_coach.base_parameters.Frameworks = <Frameworks.tensorflow: 'TensorFlow'>, evaluate_only: int = None, use_cpu: bool = False, experiment_path='/tmp', seed=None, checkpoint_save_secs=None, checkpoint_restore_dir=None, checkpoint_restore_path=None, checkpoint_save_dir=None, export_onnx_graph: bool = False, apply_stop_condition: bool = False, num_gpu: int = 1)[source]
      +class rl_coach.base_parameters.TaskParameters(framework_type: rl_coach.base_parameters.Frameworks = <Frameworks.tensorflow: 'TensorFlow'>, evaluate_only: int = None, use_cpu: bool = False, experiment_path='/tmp', seed=None, checkpoint_save_secs=None, checkpoint_restore_dir=None, checkpoint_restore_path=None, checkpoint_save_dir=None, export_onnx_graph: bool = False, apply_stop_condition: bool = False, num_gpu: int = 1)[source]
      Parameters
        @@ -299,7 +300,7 @@ the dir to restore the checkpoints from

        DistributedTaskParameters

        -class rl_coach.base_parameters.DistributedTaskParameters(framework_type: rl_coach.base_parameters.Frameworks, parameters_server_hosts: str, worker_hosts: str, job_type: str, task_index: int, evaluate_only: int = None, num_tasks: int = None, num_training_tasks: int = None, use_cpu: bool = False, experiment_path=None, dnd=None, shared_memory_scratchpad=None, seed=None, checkpoint_save_secs=None, checkpoint_restore_path=None, checkpoint_save_dir=None, export_onnx_graph: bool = False, apply_stop_condition: bool = False)[source]
        +class rl_coach.base_parameters.DistributedTaskParameters(framework_type: rl_coach.base_parameters.Frameworks, parameters_server_hosts: str, worker_hosts: str, job_type: str, task_index: int, evaluate_only: int = None, num_tasks: int = None, num_training_tasks: int = None, use_cpu: bool = False, experiment_path=None, dnd=None, shared_memory_scratchpad=None, seed=None, checkpoint_save_secs=None, checkpoint_restore_path=None, checkpoint_save_dir=None, export_onnx_graph: bool = False, apply_stop_condition: bool = False)[source]
        Parameters
    diff --git a/docs/components/agents/imitation/cil.html b/docs/components/agents/imitation/cil.html index cc5ca21..2b5fedf 100644 --- a/docs/components/agents/imitation/cil.html +++ b/docs/components/agents/imitation/cil.html @@ -124,6 +124,7 @@
  • Clipped Proximal Policy Optimization
  • Deep Deterministic Policy Gradient
  • +
  • Twin Delayed Deep Deterministic Policy Gradient
  • Soft Actor-Critic
  • Direct Future Prediction
  • Double DQN
  • @@ -245,7 +246,7 @@ so that the loss for the other heads will be zeroed out.

    -class rl_coach.agents.cil_agent.CILAlgorithmParameters[source]
    +class rl_coach.agents.cil_agent.CILAlgorithmParameters[source]
    Parameters

    state_key_with_the_class_index – (str) diff --git a/docs/components/agents/index.html b/docs/components/agents/index.html index 136475d..62b3bbd 100644 --- a/docs/components/agents/index.html +++ b/docs/components/agents/index.html @@ -117,6 +117,7 @@

  • Conditional Imitation Learning
  • Clipped Proximal Policy Optimization
  • Deep Deterministic Policy Gradient
  • +
  • Twin Delayed Deep Deterministic Policy Gradient
  • Soft Actor-Critic
  • Direct Future Prediction
  • Double DQN
  • @@ -225,6 +226,7 @@ A detailed description of those algorithms can be found by navigating to each of
  • Conditional Imitation Learning
  • Clipped Proximal Policy Optimization
  • Deep Deterministic Policy Gradient
  • +
  • Twin Delayed Deep Deterministic Policy Gradient
  • Soft Actor-Critic
  • Direct Future Prediction
  • Double DQN
  • @@ -243,7 +245,7 @@ A detailed description of those algorithms can be found by navigating to each of
    -class rl_coach.base_parameters.AgentParameters(algorithm: rl_coach.base_parameters.AlgorithmParameters, exploration: ExplorationParameters, memory: MemoryParameters, networks: Dict[str, rl_coach.base_parameters.NetworkParameters], visualization: rl_coach.base_parameters.VisualizationParameters = <rl_coach.base_parameters.VisualizationParameters object>)[source]
    +class rl_coach.base_parameters.AgentParameters(algorithm: rl_coach.base_parameters.AlgorithmParameters, exploration: ExplorationParameters, memory: MemoryParameters, networks: Dict[str, rl_coach.base_parameters.NetworkParameters], visualization: rl_coach.base_parameters.VisualizationParameters = <rl_coach.base_parameters.VisualizationParameters object>)[source]
    Parameters
      @@ -270,7 +272,7 @@ used for visualization purposes, such as printing to the screen, rendering, and
      -class rl_coach.agents.agent.Agent(agent_parameters: rl_coach.base_parameters.AgentParameters, parent: Union[LevelManager, CompositeAgent] = None)[source]
      +class rl_coach.agents.agent.Agent(agent_parameters: rl_coach.base_parameters.AgentParameters, parent: Union[LevelManager, CompositeAgent] = None)[source]
      Parameters

      agent_parameters – A AgentParameters class instance with all the agent parameters

      @@ -278,7 +280,7 @@ used for visualization purposes, such as printing to the screen, rendering, and
      -act(action: Union[None, int, float, numpy.ndarray, List] = None) → rl_coach.core_types.ActionInfo[source]
      +act(action: Union[None, int, float, numpy.ndarray, List] = None) → rl_coach.core_types.ActionInfo[source]

      Given the agents current knowledge, decide on the next action to apply to the environment

      Parameters
      @@ -292,7 +294,7 @@ used for visualization purposes, such as printing to the screen, rendering, and
      -call_memory(func, args=())[source]
      +call_memory(func, args=())[source]

      This function is a wrapper to allow having the same calls for shared or unshared memories. It should be used instead of calling the memory directly in order to allow different algorithms to work both with a shared and a local memory.

      @@ -311,7 +313,7 @@ both with a shared and a local memory.

      -choose_action(curr_state)[source]
      +choose_action(curr_state)[source]

      choose an action to act with in the current episode being played. Different behavior might be exhibited when training or testing.

      @@ -326,7 +328,7 @@ training or testing.

      -collect_savers(parent_path_suffix: str) → rl_coach.saver.SaverCollection[source]
      +collect_savers(parent_path_suffix: str) → rl_coach.saver.SaverCollection[source]

      Collect all of agent’s network savers :param parent_path_suffix: path suffix of the parent of the agent (could be name of level manager or composite agent) @@ -335,7 +337,7 @@ training or testing.

      -create_networks() → Dict[str, rl_coach.architectures.network_wrapper.NetworkWrapper][source]
      +create_networks() → Dict[str, rl_coach.architectures.network_wrapper.NetworkWrapper][source]

      Create all the networks of the agent. The network creation will be done after setting the environment parameters for the agent, since they are needed for creating the network.

      @@ -346,9 +348,16 @@ for creating the network.

      +
      +
      +freeze_memory()[source]
      +

      Shuffle episodes in the memory and freeze it to make sure that no extra data is being pushed anymore. +:return: None

      +
      +
      -get_predictions(states: List[Dict[str, numpy.ndarray]], prediction_type: rl_coach.core_types.PredictionType)[source]
      +get_predictions(states: List[Dict[str, numpy.ndarray]], prediction_type: rl_coach.core_types.PredictionType)[source]

      Get a prediction from the agent with regard to the requested prediction_type. If the agent cannot predict this type of prediction_type, or if there is more than possible way to do so, raise a ValueException.

      @@ -367,7 +376,7 @@ raise a ValueException.

      -get_state_embedding(state: dict) → numpy.ndarray[source]
      +get_state_embedding(state: dict) → numpy.ndarray[source]

      Given a state, get the corresponding state embedding from the main network

      Parameters
      @@ -381,7 +390,7 @@ raise a ValueException.

      -handle_episode_ended() → None[source]
      +handle_episode_ended() → None[source]

      Make any changes needed when each episode is ended. This includes incrementing counters, updating full episode dependent values, updating logs, etc. This function is called right after each episode is ended.

      @@ -394,7 +403,7 @@ This function is called right after each episode is ended.

      -init_environment_dependent_modules() → None[source]
      +init_environment_dependent_modules() → None[source]

      Initialize any modules that depend on knowing information about the environment such as the action space or the observation space

      @@ -404,9 +413,20 @@ the observation space

      +
      +
      +initialize_session_dependent_components()[source]
      +

      Initialize components which require a session as part of their initialization.

      +
      +
      Returns
      +

      None

      +
      +
      +
      +
      -learn_from_batch(batch) → Tuple[float, List, List][source]
      +learn_from_batch(batch) → Tuple[float, List, List][source]

      Given a batch of transitions, calculates their target values and updates the network.

      Parameters
      @@ -418,9 +438,20 @@ the observation space

      +
      +
      +load_memory_from_file()[source]
      +

      Load memory transitions from a file.

      +
      +
      Returns
      +

      None

      +
      +
      +
      +
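 A heavily hedged sketch of how the new memory hooks documented above are intended to be sequenced when the replay buffer is filled from a stored dataset. Coach's graph manager normally drives these calls itself, so this is illustrative only; `agent` and `sess` are supplied by the caller: ```python # Illustrative only: `agent` is an already-constructed Agent whose memory parameters # point at a stored dataset, and `sess` is the deep learning framework session. def warm_start_from_dataset(agent, sess) -> None: agent.set_session(sess) # make the session available to the networks and filters agent.load_memory_from_file() # fill the replay buffer from the configured file agent.freeze_memory() # shuffle episodes and block any further insertions ``` 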
      -log_to_screen() → None[source]
      +log_to_screen() → None[source]

      Write an episode summary line to the terminal

      Returns
      @@ -431,7 +462,7 @@ the observation space

      -observe(env_response: rl_coach.core_types.EnvResponse) → bool[source]
      +observe(env_response: rl_coach.core_types.EnvResponse) → bool[source]

      Given a response from the environment, distill the observation from it and store it for later use. The response should be a dictionary containing the performed action, the new observation and measurements, the reward, a game over flag and any additional information necessary.

      @@ -446,9 +477,9 @@ given observation

      -
      +
      -parent
      +property parent

      Get the parent class of the agent

      Returns
      @@ -457,9 +488,9 @@ given observation

      -
      +
      -phase
      +property phase

      The current running phase of the agent

      Returns
      @@ -470,7 +501,7 @@ given observation

      -post_training_commands() → None[source]
      +post_training_commands() → None[source]

      A function which allows adding any functionality that is required to run right after the training phase ends.

      Returns
      @@ -481,7 +512,7 @@ given observation

      -prepare_batch_for_inference(states: Union[Dict[str, numpy.ndarray], List[Dict[str, numpy.ndarray]]], network_name: str) → Dict[str, numpy.array][source]
      +prepare_batch_for_inference(states: Union[Dict[str, numpy.ndarray], List[Dict[str, numpy.ndarray]]], network_name: str) → Dict[str, numpy.core.multiarray.array][source]

      Convert curr_state into input tensors tensorflow is expecting. i.e. if we have several inputs states, stack all observations together, measurements together, etc.

      @@ -501,7 +532,7 @@ the observation relevant for the network from the states.

      -register_signal(signal_name: str, dump_one_value_per_episode: bool = True, dump_one_value_per_step: bool = False) → rl_coach.utils.Signal[source]
      +register_signal(signal_name: str, dump_one_value_per_episode: bool = True, dump_one_value_per_step: bool = False) → rl_coach.utils.Signal[source]

      Register a signal such that its statistics will be dumped and be viewable through dashboard

      Parameters
      @@ -519,7 +550,7 @@ the observation relevant for the network from the states.

      -reset_evaluation_state(val: rl_coach.core_types.RunPhase) → None[source]
      +reset_evaluation_state(val: rl_coach.core_types.RunPhase) → None[source]

      Perform accumulators initialization when entering an evaluation phase, and signal dumping when exiting an evaluation phase. Entering or exiting the evaluation phase is determined according to the new phase given by val, and by the current phase set in self.phase.

      @@ -535,7 +566,7 @@ by val, and by the current phase set in self.phase.

      -reset_internal_state() → None[source]
      +reset_internal_state() → None[source]

      Reset all the episodic parameters. This function is called right before each episode starts.

      Returns
      @@ -546,7 +577,7 @@ by val, and by the current phase set in self.phase.

      -restore_checkpoint(checkpoint_dir: str) → None[source]
      +restore_checkpoint(checkpoint_dir: str) → None[source]

      Allows agents to store additional information when saving checkpoints.

      Parameters
      @@ -560,7 +591,7 @@ by val, and by the current phase set in self.phase.

      -run_off_policy_evaluation() → None
      +run_off_policy_evaluation() → None

      Run off-policy evaluation estimators to evaluate the trained policy performance against a dataset. Should only be implemented for off-policy RL algorithms.

      @@ -572,7 +603,7 @@ Should only be implemented for off-policy RL algorithms.

      -run_pre_network_filter_for_inference(state: Dict[str, numpy.ndarray], update_filter_internal_state: bool = True) → Dict[str, numpy.ndarray][source]
      +run_pre_network_filter_for_inference(state: Dict[str, numpy.ndarray], update_filter_internal_state: bool = True) → Dict[str, numpy.ndarray][source]

      Run filters which where defined for being applied right before using the state for inference.

      Parameters
      @@ -589,7 +620,7 @@ Should only be implemented for off-policy RL algorithms.

      -save_checkpoint(checkpoint_prefix: str) → None[source]
      +save_checkpoint(checkpoint_prefix: str) → None[source]

      Allows agents to store additional information when saving checkpoints.

      Parameters
      @@ -603,7 +634,7 @@ Should only be implemented for off-policy RL algorithms.

      -set_environment_parameters(spaces: rl_coach.spaces.SpacesDefinition)[source]
      +set_environment_parameters(spaces: rl_coach.spaces.SpacesDefinition)[source]

      Sets the parameters that are environment dependent. As a side effect, initializes all the components that are dependent on those values, by calling init_environment_dependent_modules

      @@ -618,7 +649,7 @@ dependent on those values, by calling init_environment_dependent_modules

      -set_incoming_directive(action: Union[int, float, numpy.ndarray, List]) → None[source]
      +set_incoming_directive(action: Union[int, float, numpy.ndarray, List]) → None[source]

      Allows setting a directive for the agent to follow. This is useful in hierarchy structures, where the agent has another master agent that is controlling it. In such cases, the master agent can define the goals for the slave agent, define it’s observation, possible actions, etc. The directive type is defined by the agent @@ -635,7 +666,7 @@ in-action-space.

      -set_session(sess) → None[source]
      +set_session(sess) → None[source]

      Set the deep learning framework session for all the agents in the composite agent

      Returns
      @@ -646,7 +677,7 @@ in-action-space.

      -setup_logger() → None[source]
      +setup_logger() → None[source]

      Setup the logger for the agent

      Returns
      @@ -657,7 +688,7 @@ in-action-space.

      -sync() → None[source]
      +sync() → None[source]

      Sync the global network parameters to local networks

      Returns
      @@ -668,7 +699,7 @@ in-action-space.

      -train() → float[source]
      +train() → float[source]

      Check if a training phase should be done as configured by num_consecutive_playing_steps. If it should, then do several training steps as configured by num_consecutive_training_steps. A single training iteration: Sample a batch, train on it and update target networks.

      @@ -681,7 +712,7 @@ A single training iteration: Sample a batch, train on it and update target netwo
      -update_log() → None[source]
      +update_log() → None[source]

      Updates the episodic log file with all the signal values from the most recent episode. Additional signals for logging can be set by the creating a new signal using self.register_signal, and then updating it with some internal agent values.

      @@ -694,7 +725,7 @@ and then updating it with some internal agent values.

      -update_step_in_episode_log() → None[source]
      +update_step_in_episode_log() → None[source]

      Updates the in-episode log file with all the signal values from the most recent step.

      Returns
      @@ -705,7 +736,7 @@ and then updating it with some internal agent values.

      -update_transition_before_adding_to_replay_buffer(transition: rl_coach.core_types.Transition) → rl_coach.core_types.Transition[source]
      +update_transition_before_adding_to_replay_buffer(transition: rl_coach.core_types.Transition) → rl_coach.core_types.Transition[source]

      Allows agents to update the transition just before adding it to the replay buffer. Can be useful for agents that want to tweak the reward, termination signal, etc.

      diff --git a/docs/components/agents/other/dfp.html b/docs/components/agents/other/dfp.html index 4d0c6ce..564ed83 100644 --- a/docs/components/agents/other/dfp.html +++ b/docs/components/agents/other/dfp.html @@ -117,6 +117,7 @@
    • Conditional Imitation Learning
    • Clipped Proximal Policy Optimization
    • Deep Deterministic Policy Gradient
    • +
    • Twin Delayed Deep Deterministic Policy Gradient
    • Soft Actor-Critic
    • Direct Future Prediction
      • Network Structure
      • @@ -249,7 +250,7 @@ measurements that were seen in time-steps
        -class rl_coach.agents.dfp_agent.DFPAlgorithmParameters[source]
        +class rl_coach.agents.dfp_agent.DFPAlgorithmParameters[source]
        Parameters
          diff --git a/docs/components/agents/policy_optimization/ac.html b/docs/components/agents/policy_optimization/ac.html index 88bf2fd..c567ecf 100644 --- a/docs/components/agents/policy_optimization/ac.html +++ b/docs/components/agents/policy_optimization/ac.html @@ -125,6 +125,7 @@
        • Conditional Imitation Learning
        • Clipped Proximal Policy Optimization
        • Deep Deterministic Policy Gradient
        • +
        • Twin Delayed Deep Deterministic Policy Gradient
        • Soft Actor-Critic
        • Direct Future Prediction
        • Double DQN
        • @@ -247,7 +248,7 @@ where \(k\) is \(L = -\mathop{\mathbb{E}} [log (\pi) \cdot A]\)

          -class rl_coach.agents.actor_critic_agent.ActorCriticAlgorithmParameters[source]
          +class rl_coach.agents.actor_critic_agent.ActorCriticAlgorithmParameters[source]
          Parameters
            diff --git a/docs/components/agents/policy_optimization/acer.html b/docs/components/agents/policy_optimization/acer.html index fcd65e4..3d39ebf 100644 --- a/docs/components/agents/policy_optimization/acer.html +++ b/docs/components/agents/policy_optimization/acer.html @@ -125,6 +125,7 @@
          • Conditional Imitation Learning
          • Clipped Proximal Policy Optimization
          • Deep Deterministic Policy Gradient
          • +
          • Twin Delayed Deep Deterministic Policy Gradient
          • Soft Actor-Critic
          • Direct Future Prediction
          • Double DQN
          • @@ -279,7 +280,7 @@ The goal of the trust region update is to the difference between the updated pol
            -class rl_coach.agents.acer_agent.ACERAlgorithmParameters[source]
            +class rl_coach.agents.acer_agent.ACERAlgorithmParameters[source]
            Parameters
              diff --git a/docs/components/agents/policy_optimization/cppo.html b/docs/components/agents/policy_optimization/cppo.html index ef8df6c..5996cf1 100644 --- a/docs/components/agents/policy_optimization/cppo.html +++ b/docs/components/agents/policy_optimization/cppo.html @@ -125,6 +125,7 @@
          • Deep Deterministic Policy Gradient
          • +
          • Twin Delayed Deep Deterministic Policy Gradient
          • Soft Actor-Critic
          • Direct Future Prediction
          • Double DQN
          • @@ -252,7 +253,7 @@ clipped surrogate loss:

            -class rl_coach.agents.clipped_ppo_agent.ClippedPPOAlgorithmParameters[source]
            +class rl_coach.agents.clipped_ppo_agent.ClippedPPOAlgorithmParameters[source]
            Parameters
              diff --git a/docs/components/agents/policy_optimization/ddpg.html b/docs/components/agents/policy_optimization/ddpg.html index 6841c0f..b0f9afc 100644 --- a/docs/components/agents/policy_optimization/ddpg.html +++ b/docs/components/agents/policy_optimization/ddpg.html @@ -37,7 +37,7 @@ - + @@ -125,6 +125,7 @@
            +
          • Twin Delayed Deep Deterministic Policy Gradient
          • Soft Actor-Critic
          • Direct Future Prediction
          • Double DQN
          • @@ -257,7 +258,7 @@ given \(\nabla_a Q(s,a)\). Fin

            After every training step, do a soft update of the critic and actor target networks’ weights from the online networks.

            -class rl_coach.agents.ddpg_agent.DDPGAlgorithmParameters[source]
            +class rl_coach.agents.ddpg_agent.DDPGAlgorithmParameters[source]
            Parameters
              @@ -297,7 +298,7 @@ values. If set to False, the terminal states reward will be taken as the target diff --git a/docs/components/agents/policy_optimization/td3.html b/docs/components/agents/policy_optimization/td3.html new file mode 100644 index 0000000..536265d --- /dev/null +++ b/docs/components/agents/policy_optimization/td3.html @@ -0,0 +1,347 @@ + + + + + + + + + + + Twin Delayed Deep Deterministic Policy Gradient — Reinforcement Learning Coach 0.12.0 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
              + + + +
              + + + + + +
              + +
              + + + + + + + + + + + + + + + + + +
              + + + + +
              +
              +
              +
              + +
              +

              Twin Delayed Deep Deterministic Policy Gradient

              +

              Actions space: Continuous

              +

              References: Addressing Function Approximation Error in Actor-Critic Methods

              +
              +

              Network Structure

              +../../../_images/td3.png +
              +
              +

              Algorithm Description

              +
              +

              Choosing an action

              +

              Pass the current states through the actor network, and get an action mean vector \(\mu\). +While in training phase, use a continuous exploration policy, such as a small zero-meaned gaussian noise, +to add exploration noise to the action. When testing, use the mean vector \(\mu\) as-is.

              +
              +
              +

              Training the network

              +

              Start by sampling a batch of transitions from the experience replay.

              +
                +
              • To train the two critic networks, use the following targets:

                +

                \(y_t=r(s_t,a_t )+\gamma \cdot \min_{i=1,2} Q_{i}(s_{t+1},\mu(s_{t+1} )+[\mathcal{N}(0,\,\sigma^{2})]^{MAX\_NOISE}_{MIN\_NOISE})\)

                +

 First run the actor target network, using the next states as the inputs, and get \(\mu (s_{t+1} )\). Then, add +clipped Gaussian noise to these actions, and clip the resulting actions to the action space. +Next, run the critic target networks using the next states and \(\mu (s_{t+1} )+[\mathcal{N}(0,\,\sigma^{2})]^{MAX\_NOISE}_{MIN\_NOISE}\), +and use the minimum of the two critic networks’ predictions in order to calculate \(y_t\) according to the +equation above. To train the networks, use the current states and actions as the inputs, and \(y_t\) +as the targets. 

                +
              • +
              • To train the actor network, use the following equation:

                +

                \(\nabla_{\theta^\mu } J \approx E_{s_t \tilde{} \rho^\beta } [\nabla_a Q_{1}(s,a)|_{s=s_t,a=\mu (s_t ) } \cdot \nabla_{\theta^\mu} \mu(s)|_{s=s_t} ]\)

                +

                Use the actor’s online network to get the action mean values using the current states as the inputs. +Then, use the first critic’s online network in order to get the gradients of the critic output with respect to the +action mean values \(\nabla _a Q_{1}(s,a)|_{s=s_t,a=\mu(s_t ) }\). +Using the chain rule, calculate the gradients of the actor’s output, with respect to the actor weights, +given \(\nabla_a Q(s,a)\). Finally, apply those gradients to the actor network.

                +

 The actor’s training is done at a slower frequency than the critic’s, so that the critic can better fit the +current policy before it is used to train the actor. +On the same delayed cadence, do a soft update of the critic and actor target networks’ weights +from the online networks (see the sketch below). 

                +
              • +
              +
              +
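 A small self-contained numpy sketch of the critic-target computation and delayed updates described above. It is illustrative only: `actor_target` and `critic_targets` are stand-in callables rather than Coach's internal network objects, and `noise_clip` plays the role of the MIN_NOISE/MAX_NOISE bounds: ```python import numpy as np def td3_critic_targets(rewards, game_overs, next_states, discount, actor_target, critic_targets, noise_std=0.2, noise_clip=0.5, action_low=-1.0, action_high=1.0): """Compute y_t = r + gamma * min_i Q_i(s', mu(s') + clipped noise), without bootstrapping on terminal states.""" next_actions = actor_target(next_states) # mu(s_{t+1}) from the actor target network noise = np.clip(np.random.normal(0.0, noise_std, size=next_actions.shape), -noise_clip, noise_clip) # clipped zero-mean Gaussian noise smoothed_actions = np.clip(next_actions + noise, action_low, action_high) # take the minimum over the two critic target networks to curb overestimation next_q = np.min([q(next_states, smoothed_actions) for q in critic_targets], axis=0) return rewards + discount * (1.0 - game_overs) * next_q # The actor and the target networks are then updated only once every few critic updates # (the "delayed" part of TD3), rather than after every training iteration. ``` 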
              +class rl_coach.agents.td3_agent.TD3AlgorithmParameters[source]
              +
              +
              Parameters
              +
                +
              • num_steps_between_copying_online_weights_to_target – (StepMethod) +The number of steps between copying the online network weights to the target network weights.

              • +
              • rate_for_copying_weights_to_target – (float) +When copying the online network weights to the target network weights, a soft update will be used, which +weight the new online network weights by rate_for_copying_weights_to_target

              • +
              • num_consecutive_playing_steps – (StepMethod) +The number of consecutive steps to act between every two training iterations

              • +
              • use_target_network_for_evaluation – (bool) +If set to True, the target network will be used for predicting the actions when choosing actions to act. +Since the target network weights change more slowly, the predicted actions will be more consistent.

              • +
              • action_penalty – (float) +The amount by which to penalize the network on high action feature (pre-activation) values. +This can prevent the actions features from saturating the TanH activation function, and therefore prevent the +gradients from becoming very low.

              • +
              • clip_critic_targets – (Tuple[float, float] or None) +The range to clip the critic target to in order to prevent overestimation of the action values.

              • +
              • use_non_zero_discount_for_terminal_states – (bool) +If set to True, the discount factor will be used for terminal states to bootstrap the next predicted state +values. If set to False, the terminal states reward will be taken as the target return for the network.

              • +
              +
              +
              +
              + +
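 A hedged preset-style sketch of adjusting the algorithm parameters listed above. `TD3AgentParameters` is assumed to accompany `TD3AlgorithmParameters` in `rl_coach.agents.td3_agent` and to expose these fields via `.algorithm`, mirroring the other Coach agents; adjust the import if the class name differs: ```python # Hedged sketch (not taken from a shipped preset): tuning the TD3AlgorithmParameters # fields documented above. from rl_coach.agents.td3_agent import TD3AgentParameters from rl_coach.core_types import EnvironmentSteps, TrainingSteps agent_params = TD3AgentParameters() # soft-update the target networks on the same delayed cadence as the actor agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(2) agent_params.algorithm.rate_for_copying_weights_to_target = 0.005 # act for one environment step between every two training iterations agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1) # keep using the online actor (rather than the slower-moving target) when choosing actions agent_params.algorithm.use_target_network_for_evaluation = False ``` 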
              +
              +
              + + +
              + +
              + + +
              +
              + +
              + +
              + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/components/agents/value_optimization/bs_dqn.html b/docs/components/agents/value_optimization/bs_dqn.html index 64d53e8..a9bb895 100644 --- a/docs/components/agents/value_optimization/bs_dqn.html +++ b/docs/components/agents/value_optimization/bs_dqn.html @@ -126,6 +126,7 @@
            • Conditional Imitation Learning
            • Clipped Proximal Policy Optimization
            • Deep Deterministic Policy Gradient
            • +
            • Twin Delayed Deep Deterministic Policy Gradient
            • Soft Actor-Critic
            • Direct Future Prediction
            • Double DQN
            • diff --git a/docs/components/agents/value_optimization/categorical_dqn.html b/docs/components/agents/value_optimization/categorical_dqn.html index 1a599c3..d276df6 100644 --- a/docs/components/agents/value_optimization/categorical_dqn.html +++ b/docs/components/agents/value_optimization/categorical_dqn.html @@ -124,6 +124,7 @@
            • Conditional Imitation Learning
            • Clipped Proximal Policy Optimization
            • Deep Deterministic Policy Gradient
            • +
            • Twin Delayed Deep Deterministic Policy Gradient
            • Soft Actor-Critic
            • Direct Future Prediction
            • Double DQN
            • @@ -244,7 +245,7 @@ probability distribution. Only the target of the actions that were actually ta
              -class rl_coach.agents.categorical_dqn_agent.CategoricalDQNAlgorithmParameters[source]
              +class rl_coach.agents.categorical_dqn_agent.CategoricalDQNAlgorithmParameters[source]
              Parameters
                diff --git a/docs/components/agents/value_optimization/double_dqn.html b/docs/components/agents/value_optimization/double_dqn.html index 9ba7c6c..1aac16d 100644 --- a/docs/components/agents/value_optimization/double_dqn.html +++ b/docs/components/agents/value_optimization/double_dqn.html @@ -117,6 +117,7 @@
              • Conditional Imitation Learning
              • Clipped Proximal Policy Optimization
              • Deep Deterministic Policy Gradient
              • +
              • Twin Delayed Deep Deterministic Policy Gradient
              • Soft Actor-Critic
              • Direct Future Prediction
              • Double DQN
                  diff --git a/docs/components/agents/value_optimization/dqn.html b/docs/components/agents/value_optimization/dqn.html index 52c866a..9d10883 100644 --- a/docs/components/agents/value_optimization/dqn.html +++ b/docs/components/agents/value_optimization/dqn.html @@ -117,6 +117,7 @@
                • Conditional Imitation Learning
                • Clipped Proximal Policy Optimization
                • Deep Deterministic Policy Gradient
                • +
                • Twin Delayed Deep Deterministic Policy Gradient
                • Soft Actor-Critic
                • Direct Future Prediction
                • Double DQN
                • @@ -243,7 +244,7 @@ Set those values as the targets for the actions that were not actually played.
                  -class rl_coach.agents.dqn_agent.DQNAlgorithmParameters[source]
                  +class rl_coach.agents.dqn_agent.DQNAlgorithmParameters[source]
                  diff --git a/docs/components/agents/value_optimization/dueling_dqn.html b/docs/components/agents/value_optimization/dueling_dqn.html index ded46ae..c25e608 100644 --- a/docs/components/agents/value_optimization/dueling_dqn.html +++ b/docs/components/agents/value_optimization/dueling_dqn.html @@ -117,6 +117,7 @@
                • Conditional Imitation Learning
                • Clipped Proximal Policy Optimization
                • Deep Deterministic Policy Gradient
                • +
                • Twin Delayed Deep Deterministic Policy Gradient
                • Soft Actor-Critic
                • Direct Future Prediction
                • Double DQN
                • diff --git a/docs/components/agents/value_optimization/mmc.html b/docs/components/agents/value_optimization/mmc.html index 58e1f24..f8afdbc 100644 --- a/docs/components/agents/value_optimization/mmc.html +++ b/docs/components/agents/value_optimization/mmc.html @@ -117,6 +117,7 @@
                • Conditional Imitation Learning
                • Clipped Proximal Policy Optimization
                • Deep Deterministic Policy Gradient
                • +
                • Twin Delayed Deep Deterministic Policy Gradient
                • Soft Actor-Critic
                • Direct Future Prediction
                • Double DQN
                • @@ -240,7 +241,7 @@ Once in every few thousand steps, copy the weights from the online network to the target network.

                  -class rl_coach.agents.mmc_agent.MixedMonteCarloAlgorithmParameters[source]
                  +class rl_coach.agents.mmc_agent.MixedMonteCarloAlgorithmParameters[source]
                  Parameters

                  monte_carlo_mixing_rate – (float) diff --git a/docs/components/agents/value_optimization/n_step.html b/docs/components/agents/value_optimization/n_step.html index 3154abf..abf03cf 100644 --- a/docs/components/agents/value_optimization/n_step.html +++ b/docs/components/agents/value_optimization/n_step.html @@ -117,6 +117,7 @@

                • Conditional Imitation Learning
                • Clipped Proximal Policy Optimization
                • Deep Deterministic Policy Gradient
                • +
                • Twin Delayed Deep Deterministic Policy Gradient
                • Soft Actor-Critic
                • Direct Future Prediction
                • Double DQN
                • @@ -242,7 +243,7 @@ where \(k\) is
                  -class rl_coach.agents.n_step_q_agent.NStepQAlgorithmParameters[source]
                  +class rl_coach.agents.n_step_q_agent.NStepQAlgorithmParameters[source]
                  Parameters
                    diff --git a/docs/components/agents/value_optimization/naf.html b/docs/components/agents/value_optimization/naf.html index 07190c1..a5f3ddf 100644 --- a/docs/components/agents/value_optimization/naf.html +++ b/docs/components/agents/value_optimization/naf.html @@ -117,6 +117,7 @@
                  • Conditional Imitation Learning
                  • Clipped Proximal Policy Optimization
                  • Deep Deterministic Policy Gradient
                  • +
                  • Twin Delayed Deep Deterministic Policy Gradient
                  • Soft Actor-Critic
                  • Direct Future Prediction
                  • Double DQN
                  • @@ -243,7 +244,7 @@ and \(y_t\) as the targets. After every training step, use a soft update in order to copy the weights from the online network to the target network.

                    -class rl_coach.agents.naf_agent.NAFAlgorithmParameters[source]
                    +class rl_coach.agents.naf_agent.NAFAlgorithmParameters[source]
                    diff --git a/docs/components/agents/value_optimization/nec.html b/docs/components/agents/value_optimization/nec.html index bbcfbea..4e953d5 100644 --- a/docs/components/agents/value_optimization/nec.html +++ b/docs/components/agents/value_optimization/nec.html @@ -117,6 +117,7 @@
                  • Conditional Imitation Learning
                  • Clipped Proximal Policy Optimization
                  • Deep Deterministic Policy Gradient
                  • +
                  • Twin Delayed Deep Deterministic Policy Gradient
                  • Soft Actor-Critic
                  • Direct Future Prediction
                  • Double DQN
                  • @@ -258,7 +259,7 @@ the network if necessary: \(y_t=\sum_{j=0}^{N-1}\gamma^j r(s_{t+j},a_{t+j} ) +\gamma^N max_a Q(s_{t+N},a)\)

                    -class rl_coach.agents.nec_agent.NECAlgorithmParameters[source]
                    +class rl_coach.agents.nec_agent.NECAlgorithmParameters[source]
                    Parameters
                      diff --git a/docs/components/agents/value_optimization/pal.html b/docs/components/agents/value_optimization/pal.html index b5d53f3..372d244 100644 --- a/docs/components/agents/value_optimization/pal.html +++ b/docs/components/agents/value_optimization/pal.html @@ -117,6 +117,7 @@
                    • Conditional Imitation Learning
                    • Clipped Proximal Policy Optimization
                    • Deep Deterministic Policy Gradient
                    • +
                    • Twin Delayed Deep Deterministic Policy Gradient
                    • Soft Actor-Critic
                    • Direct Future Prediction
                    • Double DQN
                    • @@ -251,7 +252,7 @@ has the highest predicted \(Q\)
                      -class rl_coach.agents.pal_agent.PALAlgorithmParameters[source]
                      +class rl_coach.agents.pal_agent.PALAlgorithmParameters[source]
                      Parameters
                        diff --git a/docs/components/agents/value_optimization/qr_dqn.html b/docs/components/agents/value_optimization/qr_dqn.html index 92064a1..e5d5121 100644 --- a/docs/components/agents/value_optimization/qr_dqn.html +++ b/docs/components/agents/value_optimization/qr_dqn.html @@ -117,6 +117,7 @@
                      • Conditional Imitation Learning
                      • Clipped Proximal Policy Optimization
                      • Deep Deterministic Policy Gradient
                      • +
                      • Twin Delayed Deep Deterministic Policy Gradient
                      • Soft Actor-Critic
                      • Direct Future Prediction
                      • Double DQN
                      • @@ -241,7 +242,7 @@ quantile locations. Only the targets of the actions that were actually taken are
                        -class rl_coach.agents.qr_dqn_agent.QuantileRegressionDQNAlgorithmParameters[source]
                        +class rl_coach.agents.qr_dqn_agent.QuantileRegressionDQNAlgorithmParameters[source]
                        Parameters
                          diff --git a/docs/components/agents/value_optimization/rainbow.html b/docs/components/agents/value_optimization/rainbow.html index bc6182c..2969064 100644 --- a/docs/components/agents/value_optimization/rainbow.html +++ b/docs/components/agents/value_optimization/rainbow.html @@ -117,6 +117,7 @@
                        • Conditional Imitation Learning
                        • Clipped Proximal Policy Optimization
                        • Deep Deterministic Policy Gradient
                        • +
                        • Twin Delayed Deep Deterministic Policy Gradient
                        • Soft Actor-Critic
                        • Direct Future Prediction
                        • Double DQN
                        • @@ -256,7 +257,7 @@ using the KL divergence loss that is returned from the network.

                          -class rl_coach.agents.rainbow_dqn_agent.RainbowDQNAlgorithmParameters[source]
                          +class rl_coach.agents.rainbow_dqn_agent.RainbowDQNAlgorithmParameters[source]
                          Parameters
                            diff --git a/docs/components/architectures/index.html b/docs/components/architectures/index.html index d6e8ebb..6a4c5b2 100644 --- a/docs/components/architectures/index.html +++ b/docs/components/architectures/index.html @@ -196,7 +196,7 @@ own components under a dedicated directory. For example, tensorflow components w parts that are implemented using TensorFlow.

                            -class rl_coach.base_parameters.NetworkParameters(force_cpu=False, async_training=False, shared_optimizer=True, scale_down_gradients_by_number_of_workers_for_sync_training=True, clip_gradients=None, gradients_clipping_method=<GradientClippingMethod.ClipByGlobalNorm: 0>, l2_regularization=0, learning_rate=0.00025, learning_rate_decay_rate=0, learning_rate_decay_steps=0, input_embedders_parameters={}, embedding_merger_type=<EmbeddingMergerType.Concat: 0>, middleware_parameters=None, heads_parameters=[], use_separate_networks_per_head=False, optimizer_type='Adam', optimizer_epsilon=0.0001, adam_optimizer_beta1=0.9, adam_optimizer_beta2=0.99, rms_prop_optimizer_decay=0.9, batch_size=32, replace_mse_with_huber_loss=False, create_target_network=False, tensorflow_support=True, softmax_temperature=1)[source]
                            +class rl_coach.base_parameters.NetworkParameters(force_cpu=False, async_training=False, shared_optimizer=True, scale_down_gradients_by_number_of_workers_for_sync_training=True, clip_gradients=None, gradients_clipping_method=<GradientClippingMethod.ClipByGlobalNorm: 0>, l2_regularization=0, learning_rate=0.00025, learning_rate_decay_rate=0, learning_rate_decay_steps=0, input_embedders_parameters={}, embedding_merger_type=<EmbeddingMergerType.Concat: 0>, middleware_parameters=None, heads_parameters=[], use_separate_networks_per_head=False, optimizer_type='Adam', optimizer_epsilon=0.0001, adam_optimizer_beta1=0.9, adam_optimizer_beta2=0.99, rms_prop_optimizer_decay=0.9, batch_size=32, replace_mse_with_huber_loss=False, create_target_network=False, tensorflow_support=True, softmax_temperature=1)[source]
                            Parameters
                              @@ -268,7 +268,7 @@ online network at will.

                              Architecture

                              -class rl_coach.architectures.architecture.Architecture(agent_parameters: rl_coach.base_parameters.AgentParameters, spaces: rl_coach.spaces.SpacesDefinition, name: str = '')[source]
                              +class rl_coach.architectures.architecture.Architecture(agent_parameters: rl_coach.base_parameters.AgentParameters, spaces: rl_coach.spaces.SpacesDefinition, name: str = '')[source]

                              Creates a neural network ‘architecture’, that can be trained and used for inference.

                              Parameters
                              @@ -281,7 +281,7 @@ online network at will.

                              -accumulate_gradients(inputs: Dict[str, numpy.ndarray], targets: List[numpy.ndarray], additional_fetches: list = None, importance_weights: numpy.ndarray = None, no_accumulation: bool = False) → Tuple[float, List[float], float, list][source]
                              +accumulate_gradients(inputs: Dict[str, numpy.ndarray], targets: List[numpy.ndarray], additional_fetches: list = None, importance_weights: numpy.ndarray = None, no_accumulation: bool = False) → Tuple[float, List[float], float, list][source]

                              Given a batch of inputs (i.e. states) and targets (e.g. discounted rewards), computes and accumulates the gradients for model parameters. Will run forward and backward pass to compute gradients, clip the gradient values if required and then accumulate gradients from all learners. It does not update the model weights, @@ -324,7 +324,7 @@ fetched_tensors: all values for additional_fetches

                              -apply_and_reset_gradients(gradients: List[numpy.ndarray], scaler: float = 1.0) → None[source]
                              +apply_and_reset_gradients(gradients: List[numpy.ndarray], scaler: float = 1.0) → None[source]

                              Applies the given gradients to the network weights and resets the gradient accumulations. Has the same impact as calling apply_gradients, then reset_accumulated_gradients.

                              @@ -340,7 +340,7 @@ of an identical network (either self or another identical network)

                              -apply_gradients(gradients: List[numpy.ndarray], scaler: float = 1.0) → None[source]
                              +apply_gradients(gradients: List[numpy.ndarray], scaler: float = 1.0) → None[source]

                              Applies the given gradients to the network weights. Will be performed sync or async depending on network_parameters.async_training

                              @@ -356,7 +356,7 @@ of an identical network (either self or another identical network)

                              -collect_savers(parent_path_suffix: str) → rl_coach.saver.SaverCollection[source]
                              +collect_savers(parent_path_suffix: str) → rl_coach.saver.SaverCollection[source]

                              Collection of all savers for the network (typically only one saver for network and one for ONNX export) :param parent_path_suffix: path suffix of the parent of the network

                              @@ -369,9 +369,9 @@ of an identical network (either self or another identical network)

                              -
                              +
                              -static construct(variable_scope: str, devices: List[str], *args, **kwargs) → rl_coach.architectures.architecture.Architecture[source]
                              +static construct(variable_scope: str, devices: List[str], *args, **kwargs) → rl_coach.architectures.architecture.Architecture[source]

                              Construct a network class using the provided variable scope and on requested devices :param variable_scope: string specifying variable scope under which to create network variables :param devices: list of devices (can be list of Device objects, or string for TF distributed) @@ -382,7 +382,7 @@ of an identical network (either self or another identical network)

                              -get_variable_value(variable: Any) → numpy.ndarray[source]
                              +get_variable_value(variable: Any) → numpy.ndarray[source]

                              Gets value of a specified variable. Type of variable is dependant on the framework. Example of a variable is head.kl_coefficient, which could be a symbol for evaluation or could be a string representing the value.

                              @@ -398,7 +398,7 @@ or could be a string representing the value.

                              -get_weights() → List[numpy.ndarray][source]
                              +get_weights() → List[numpy.ndarray][source]

                              Gets model weights as a list of ndarrays. It is used for synchronizing weight between two identical networks.

                              Returns
                              @@ -407,9 +407,9 @@ or could be a string representing the value.

                              -
                              +
                              -static parallel_predict(sess: Any, network_input_tuples: List[Tuple[Architecture, Dict[str, numpy.ndarray]]]) → Tuple[numpy.ndarray, ...][source]
                              +static parallel_predict(sess: Any, network_input_tuples: List[Tuple[Architecture, Dict[str, numpy.ndarray]]]) → Tuple[numpy.ndarray, ...][source]
                              Parameters
                                @@ -425,7 +425,7 @@ or could be a string representing the value.

                                -predict(inputs: Dict[str, numpy.ndarray], outputs: List[Any] = None, squeeze_output: bool = True, initial_feed_dict: Dict[Any, numpy.ndarray] = None) → Tuple[numpy.ndarray, ...][source]
                                +predict(inputs: Dict[str, numpy.ndarray], outputs: List[Any] = None, squeeze_output: bool = True, initial_feed_dict: Dict[Any, numpy.ndarray] = None) → Tuple[numpy.ndarray, ...][source]

                                Given input observations, use the model to make predictions (e.g. action or value).

                                Parameters
                                @@ -446,7 +446,7 @@ depends on the framework backend.

                                -reset_accumulated_gradients() → None[source]
                                +reset_accumulated_gradients() → None[source]

                                Sets gradient of all parameters to 0.

Once gradients are reset, they must be accessible through the accumulated_gradients property of this class, which must return a list of numpy ndarrays. The child class must ensure that accumulated_gradients is set.

                                @@ -454,7 +454,7 @@ which must return a list of numpy ndarrays. Child class must ensure that a
                                -set_variable_value(assign_op: Any, value: numpy.ndarray, placeholder: Any)[source]
                                +set_variable_value(assign_op: Any, value: numpy.ndarray, placeholder: Any)[source]

Updates the value of a specified variable. The type of assign_op is dependent on the framework and is a unique identifier for assigning a value to a variable. For example, an agent may use head.assign_kl_coefficient. There is a one to one mapping between assign_op and placeholder @@ -472,7 +472,7 @@ head.assign_kl_coefficient. There is a one to one mapping between assign_op and

                                -set_weights(weights: List[numpy.ndarray], rate: float = 1.0) → None[source]
                                +set_weights(weights: List[numpy.ndarray], rate: float = 1.0) → None[source]

                                Sets model weights for provided layer parameters.

                                Parameters
                                @@ -490,7 +490,7 @@ i.e. new_weight = rate * given_weight + (1 - rate) * old_weight
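The rate argument makes set_weights a convenient soft-update primitive. A small sketch, assuming `online` and `target` are two identical Architecture instances (for example the online and target copies held by a NetworkWrapper):

```python
def soft_update(online, target, rate: float = 0.005) -> None:
    """Blend target weights toward the online weights (sketch).

    Per the formula above: new_weight = rate * given_weight + (1 - rate) * old_weight.
    """
    target.set_weights(online.get_weights(), rate=rate)
```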

                                -train_on_batch(inputs: Dict[str, numpy.ndarray], targets: List[numpy.ndarray], scaler: float = 1.0, additional_fetches: list = None, importance_weights: numpy.ndarray = None) → Tuple[float, List[float], float, list][source]
                                +train_on_batch(inputs: Dict[str, numpy.ndarray], targets: List[numpy.ndarray], scaler: float = 1.0, additional_fetches: list = None, importance_weights: numpy.ndarray = None) → Tuple[float, List[float], float, list][source]

                                Given a batch of inputs (e.g. states) and targets (e.g. discounted rewards), takes a training step: i.e. runs a forward pass and backward pass of the network, accumulates the gradients and applies an optimization step to update the weights. @@ -535,7 +535,7 @@ fetched_tensors: all values for additional_fetches
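A sketch of unpacking the documented return tuple, assuming a single-head network whose input embedder is keyed 'observation' (both are assumptions for illustration):

```python
import numpy as np


def training_step(network, states: np.ndarray, targets: np.ndarray) -> float:
    """Run one forward/backward pass and apply an optimization step (sketch)."""
    total_loss, losses_per_head, grad_norm, fetched = network.train_on_batch(
        inputs={'observation': states},  # batched observations
        targets=[targets],               # one target array per output head
        scaler=1.0,
    )
    return total_loss
```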

                                ../../_images/distributed.png
                                -class rl_coach.architectures.network_wrapper.NetworkWrapper(agent_parameters: rl_coach.base_parameters.AgentParameters, has_target: bool, has_global: bool, name: str, spaces: rl_coach.spaces.SpacesDefinition, replicated_device=None, worker_device=None)[source]
                                +class rl_coach.architectures.network_wrapper.NetworkWrapper(agent_parameters: rl_coach.base_parameters.AgentParameters, has_target: bool, has_global: bool, name: str, spaces: rl_coach.spaces.SpacesDefinition, replicated_device=None, worker_device=None)[source]

The network wrapper contains multiple copies of the same network, each one with a different set of weights that is updated on a different time scale. The network wrapper will always contain an online network. It will contain an additional slowly updating target network if it was requested by the user, @@ -544,7 +544,7 @@ multi-process distributed mode. The network wrapper contains functionality for m between them.

                                -apply_gradients_and_sync_networks(reset_gradients=True)[source]
                                +apply_gradients_and_sync_networks(reset_gradients=True)[source]

                                Applies the gradients accumulated in the online network to the global network or to itself and syncs the networks if necessary

                                @@ -559,7 +559,7 @@ complexity for this function by around 10%

                                -apply_gradients_to_global_network(gradients=None)[source]
                                +apply_gradients_to_global_network(gradients=None)[source]

                                Apply gradients from the online network on the global network

                                Parameters
                                @@ -573,7 +573,7 @@ complexity for this function by around 10%

                                -apply_gradients_to_online_network(gradients=None)[source]
                                +apply_gradients_to_online_network(gradients=None)[source]

                                Apply gradients from the online network on itself

                                Returns
                                @@ -584,7 +584,7 @@ complexity for this function by around 10%

                                -collect_savers(parent_path_suffix: str) → rl_coach.saver.SaverCollection[source]
                                +collect_savers(parent_path_suffix: str) → rl_coach.saver.SaverCollection[source]

Collect all of the network's savers for the global or online network. Note: the global, online, and target networks are all copies of the same network, whose parameters are

                                @@ -610,7 +610,7 @@ for saving.

                                -parallel_prediction(network_input_tuples: List[Tuple])[source]
                                +parallel_prediction(network_input_tuples: List[Tuple])[source]

Run several network predictions in parallel. Currently this only supports running each of the networks once.

                                Parameters
                                @@ -625,7 +625,7 @@ target_network or global_network) and the second element is the inputs

                                -set_is_training(state: bool)[source]
                                +set_is_training(state: bool)[source]

                                Set the phase of the network between training and testing

                                Parameters
                                @@ -639,7 +639,7 @@ target_network or global_network) and the second element is the inputs

                                -sync()[source]
                                +sync()[source]

                                Initializes the weights of the networks to match each other

                                Returns
                                @@ -650,7 +650,7 @@ target_network or global_network) and the second element is the inputs

                                -train_and_sync_networks(inputs, targets, additional_fetches=[], importance_weights=None)[source]
                                +train_and_sync_networks(inputs, targets, additional_fetches=[], importance_weights=None)[source]

A generic training function that enables multi-threaded training using a global network, if necessary.

                                Parameters
                                @@ -670,7 +670,7 @@ error of this sample. If it is not given, the samples losses won’t be scaled
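A sketch combining this call with the target-network update documented a few entries below, assuming the wrapper was created with has_target=True and that its network has a single input embedder keyed 'observation' and a single output head (illustrative assumptions):

```python
import numpy as np

from rl_coach.architectures.network_wrapper import NetworkWrapper


def train_iteration(net: NetworkWrapper, states: np.ndarray, targets: np.ndarray,
                    target_update_rate: float = 0.001) -> None:
    """Train the online copy, then soft-copy its weights into the target copy (sketch)."""
    # Trains the online network, routing gradients through the global network if one exists
    net.train_and_sync_networks(inputs={'observation': states}, targets=[targets])
    # Copy weights: online network >>> target network (soft update with the given rate)
    net.update_target_network(rate=target_update_rate)
```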
                                -update_online_network(rate=1.0)[source]
                                +update_online_network(rate=1.0)[source]

                                Copy weights: global network >>> online network

                                Parameters
                                @@ -681,7 +681,7 @@ error of this sample. If it is not given, the samples losses won’t be scaled
                                -update_target_network(rate=1.0)[source]
                                +update_target_network(rate=1.0)[source]

                                Copy weights: online network >>> target network

                                Parameters
                                diff --git a/docs/components/core_types.html b/docs/components/core_types.html index d23a959..0d957af 100644 --- a/docs/components/core_types.html +++ b/docs/components/core_types.html @@ -197,7 +197,7 @@

                                ActionInfo

                                -class rl_coach.core_types.ActionInfo(action: Union[int, float, numpy.ndarray, List], all_action_probabilities: float = 0, action_value: float = 0.0, state_value: float = 0.0, max_action_value: float = None)[source]
                                +class rl_coach.core_types.ActionInfo(action: Union[int, float, numpy.ndarray, List], all_action_probabilities: float = 0, action_value: float = 0.0, state_value: float = 0.0, max_action_value: float = None)[source]

Action info is a class that holds an action and various additional details about it

                                Parameters
                                @@ -219,7 +219,7 @@ action with the maximum value
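For example, an agent choosing action 2 in a discrete action space might attach its probability and value estimates like this (the numbers are purely illustrative):

```python
from rl_coach.core_types import ActionInfo

info = ActionInfo(action=2,
                  all_action_probabilities=0.6,  # probability assigned to the chosen action
                  action_value=1.25,             # Q(s, a) estimate for the chosen action
                  state_value=1.1)               # V(s) estimate
```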

                                Batch

                                -class rl_coach.core_types.Batch(transitions: List[rl_coach.core_types.Transition])[source]
                                +class rl_coach.core_types.Batch(transitions: List[rl_coach.core_types.Transition])[source]

A wrapper around a list of transitions that helps extract batches of parameters from it. For example, one can extract a list of states corresponding to the list of transitions. The class uses lazy evaluation in order to return each of the available parameters.

                                @@ -230,7 +230,7 @@ The class uses lazy evaluation in order to return each of the available paramete
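A minimal sketch of building a Batch from Transition objects and extracting stacked parameters; the 'observation' key and array shapes are illustrative:

```python
import numpy as np

from rl_coach.core_types import Batch, Transition

transitions = [
    Transition(state={'observation': np.random.rand(4)},
               action=np.random.randint(2),
               reward=1.0,
               next_state={'observation': np.random.rand(4)},
               game_over=False)
    for _ in range(8)
]
batch = Batch(transitions)

states = batch.states(['observation'])  # dict of stacked arrays, here shape (8, 4)
actions = batch.actions()               # ndarray with one entry per transition
print(batch.size, states['observation'].shape, actions.shape)
```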
                                -actions(expand_dims=False) → numpy.ndarray[source]
                                +actions(expand_dims=False) → numpy.ndarray[source]

                                if the actions were not converted to a batch before, extract them to a batch and then return the batch

                                Parameters
                                @@ -244,7 +244,7 @@ The class uses lazy evaluation in order to return each of the available paramete
                                -game_overs(expand_dims=False) → numpy.ndarray[source]
                                +game_overs(expand_dims=False) → numpy.ndarray[source]

                                if the game_overs were not converted to a batch before, extract them to a batch and then return the batch

                                Parameters
                                @@ -258,7 +258,7 @@ The class uses lazy evaluation in order to return each of the available paramete
                                -goals(expand_dims=False) → numpy.ndarray[source]
                                +goals(expand_dims=False) → numpy.ndarray[source]

if the goals were not converted to a batch before, extract them to a batch and then return the batch. If the goal was not filled, this will raise an exception

                                @@ -273,7 +273,7 @@ if the goal was not filled, this will raise an exception

                                -info(key, expand_dims=False) → numpy.ndarray[source]
                                +info(key, expand_dims=False) → numpy.ndarray[source]

                                if the given info dictionary key was not converted to a batch before, extract it to a batch and then return the batch. if the key is not part of the keys in the info dictionary, this will raise an exception

                                @@ -288,7 +288,7 @@ batch. if the key is not part of the keys in the info dictionary, this will rais
                                -info_as_list(key) → list[source]
                                +info_as_list(key) → list[source]

get the info and store it internally as a list, if it wasn't stored before. return it as a list :param expand_dims: add an extra dimension to the info batch :return: a list containing all the info values of the batch corresponding to the given key

                                @@ -296,7 +296,7 @@ batch. if the key is not part of the keys in the info dictionary, this will rais
                                -n_step_discounted_rewards(expand_dims=False) → numpy.ndarray[source]
                                +n_step_discounted_rewards(expand_dims=False) → numpy.ndarray[source]
                                if the n_step_discounted_rewards were not converted to a batch before, extract them to a batch and then return

                                the batch

                                @@ -308,7 +308,7 @@ batch. if the key is not part of the keys in the info dictionary, this will rais
                                -next_states(fetches: List[str], expand_dims=False) → Dict[str, numpy.ndarray][source]
                                +next_states(fetches: List[str], expand_dims=False) → Dict[str, numpy.ndarray][source]

                                follow the keys in fetches to extract the corresponding items from the next states in the batch if these keys were not already extracted before. return only the values corresponding to those keys

                                @@ -326,7 +326,7 @@ if these keys were not already extracted before. return only the values correspo
                                -rewards(expand_dims=False) → numpy.ndarray[source]
                                +rewards(expand_dims=False) → numpy.ndarray[source]

                                if the rewards were not converted to a batch before, extract them to a batch and then return the batch

                                Parameters
                                @@ -340,7 +340,7 @@ if these keys were not already extracted before. return only the values correspo
                                -shuffle() → None[source]
                                +shuffle() → None[source]

                                Shuffle all the transitions in the batch

                                Returns
                                @@ -349,9 +349,9 @@ if these keys were not already extracted before. return only the values correspo
                                -
                                +
                                -size
                                +property size
                                Returns

                                the size of the batch

                                @@ -361,7 +361,7 @@ if these keys were not already extracted before. return only the values correspo
                                -slice(start, end) → None[source]
                                +slice(start, end) → None[source]

                                Keep a slice from the batch and discard the rest of the batch

                                Parameters
                                @@ -378,7 +378,7 @@ if these keys were not already extracted before. return only the values correspo
                                -states(fetches: List[str], expand_dims=False) → Dict[str, numpy.ndarray][source]
                                +states(fetches: List[str], expand_dims=False) → Dict[str, numpy.ndarray][source]

                                follow the keys in fetches to extract the corresponding items from the states in the batch if these keys were not already extracted before. return only the values corresponding to those keys

                                @@ -401,7 +401,7 @@ if these keys were not already extracted before. return only the values correspo

                                EnvResponse

                                -class rl_coach.core_types.EnvResponse(next_state: Dict[str, numpy.ndarray], reward: Union[int, float, numpy.ndarray], game_over: bool, info: Dict = None, goal: numpy.ndarray = None)[source]
                                +class rl_coach.core_types.EnvResponse(next_state: Dict[str, numpy.ndarray], reward: Union[int, float, numpy.ndarray], game_over: bool, info: Dict = None, goal: numpy.ndarray = None)[source]

An env response is a collection containing the information returned from the environment after a single action has been performed on it.

                                @@ -424,7 +424,7 @@ the execution of the action.

                                Episode

                                -class rl_coach.core_types.Episode(discount: float = 0.99, bootstrap_total_return_from_old_policy: bool = False, n_step: int = -1)[source]
                                +class rl_coach.core_types.Episode(discount: float = 0.99, bootstrap_total_return_from_old_policy: bool = False, n_step: int = -1)[source]

An Episode represents a sequence of transitions that ends with a terminal state.

                                Parameters
                                @@ -438,7 +438,7 @@ memory

                                -get_first_transition() → rl_coach.core_types.Transition[source]
                                +get_first_transition() → rl_coach.core_types.Transition[source]

                                Get the first transition in the episode, or None if there are no transitions available

                                Returns
                                @@ -449,7 +449,7 @@ memory

                                -get_last_transition() → rl_coach.core_types.Transition[source]
                                +get_last_transition() → rl_coach.core_types.Transition[source]

Get the last transition in the episode, or None if there are no transitions available

                                Returns
                                @@ -460,7 +460,7 @@ memory

                                -get_transition(transition_idx: int) → rl_coach.core_types.Transition[source]
                                +get_transition(transition_idx: int) → rl_coach.core_types.Transition[source]

                                Get a specific transition by its index.

                                Parameters
                                @@ -474,7 +474,7 @@ memory

                                -get_transitions_attribute(attribute_name: str) → List[Any][source]
                                +get_transitions_attribute(attribute_name: str) → List[Any][source]

                                Get the values for some transition attribute from all the transitions in the episode. For example, this allows getting the rewards for all the transitions as a list by calling get_transitions_attribute(‘reward’)

                                @@ -490,7 +490,7 @@ get_transitions_attribute(‘reward’)

                                -insert(transition: rl_coach.core_types.Transition) → None[source]
                                +insert(transition: rl_coach.core_types.Transition) → None[source]

Insert a new transition into the episode. If the game_over flag in the transition is set to True, the episode will be marked as complete.

                                @@ -505,7 +505,7 @@ the episode will be marked as complete.

                                -is_empty() → bool[source]
                                +is_empty() → bool[source]

                                Check if the episode is empty

                                Returns
                                @@ -516,7 +516,7 @@ the episode will be marked as complete.

                                -length() → int[source]
                                +length() → int[source]

                                Return the length of the episode, which is the number of transitions it holds.

                                Returns
                                @@ -527,7 +527,7 @@ the episode will be marked as complete.

                                -update_discounted_rewards()[source]
                                +update_discounted_rewards()[source]

                                Update the discounted returns for all the transitions in the episode. The returns will be calculated according to the rewards of each transition, together with the number of steps to bootstrap from and the discount factor, as defined by n_step and discount respectively when initializing @@ -546,7 +546,7 @@ the episode.
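A short sketch tying these Episode methods together; the state contents and reward values are illustrative:

```python
import numpy as np

from rl_coach.core_types import Episode, Transition

episode = Episode(discount=0.99, n_step=-1)  # n_step=-1 bootstraps over the full episode
for step in range(5):
    episode.insert(Transition(state={'observation': np.random.rand(4)},
                              action=0,
                              reward=1.0,
                              next_state={'observation': np.random.rand(4)},
                              game_over=(step == 4)))  # the final transition closes the episode

episode.update_discounted_rewards()                     # fill in per-transition discounted returns
rewards = episode.get_transitions_attribute('reward')   # [1.0, 1.0, 1.0, 1.0, 1.0]
print(episode.length(), rewards)
```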

                                Transition

                                -class rl_coach.core_types.Transition(state: Dict[str, numpy.ndarray] = None, action: Union[int, float, numpy.ndarray, List] = None, reward: Union[int, float, numpy.ndarray] = None, next_state: Dict[str, numpy.ndarray] = None, game_over: bool = None, info: Dict = None)[source]
                                +class rl_coach.core_types.Transition(state: Dict[str, numpy.ndarray] = None, action: Union[int, float, numpy.ndarray, List] = None, reward: Union[int, float, numpy.ndarray] = None, next_state: Dict[str, numpy.ndarray] = None, game_over: bool = None, info: Dict = None)[source]

                                A transition is a tuple containing the information of a single step of interaction between the agent and the environment. The most basic version should contain the following values: (current state, action, reward, next state, game over) diff --git a/docs/components/data_stores/index.html b/docs/components/data_stores/index.html index 4e3fc79..0f3b777 100644 --- a/docs/components/data_stores/index.html +++ b/docs/components/data_stores/index.html @@ -194,7 +194,7 @@

                                S3DataStore

                                -class rl_coach.data_stores.s3_data_store.S3DataStore(params: rl_coach.data_stores.s3_data_store.S3DataStoreParameters)[source]
                                +class rl_coach.data_stores.s3_data_store.S3DataStore(params: rl_coach.data_stores.s3_data_store.S3DataStoreParameters)[source]

                                An implementation of the data store using S3 for storing policy checkpoints when using Coach in distributed mode. The policy checkpoints are written by the trainer and read by the rollout worker.

                                @@ -209,7 +209,7 @@ The policy checkpoints are written by the trainer and read by the rollout worker

                                NFSDataStore

                                -class rl_coach.data_stores.nfs_data_store.NFSDataStore(params: rl_coach.data_stores.nfs_data_store.NFSDataStoreParameters)[source]
                                +class rl_coach.data_stores.nfs_data_store.NFSDataStore(params: rl_coach.data_stores.nfs_data_store.NFSDataStoreParameters)[source]

An implementation of the data store that uses NFS for storing policy checkpoints when using Coach in distributed mode. The policy checkpoints are written by the trainer and read by the rollout worker.

                                diff --git a/docs/components/environments/index.html b/docs/components/environments/index.html index dab4dd1..9ece82b 100644 --- a/docs/components/environments/index.html +++ b/docs/components/environments/index.html @@ -195,7 +195,7 @@

                                Environments

                                -class rl_coach.environments.environment.Environment(level: rl_coach.environments.environment.LevelSelection, seed: int, frame_skip: int, human_control: bool, custom_reward_threshold: Union[int, float], visualization_parameters: rl_coach.base_parameters.VisualizationParameters, target_success_rate: float = 1.0, **kwargs)[source]
                                +class rl_coach.environments.environment.Environment(level: rl_coach.environments.environment.LevelSelection, seed: int, frame_skip: int, human_control: bool, custom_reward_threshold: Union[int, float], visualization_parameters: rl_coach.base_parameters.VisualizationParameters, target_success_rate: float = 1.0, **kwargs)[source]
                                Parameters
                                  @@ -210,9 +210,9 @@ additional arguments which will be ignored by this class, but might be used by o
                                -
                                +
                                -action_space
                                +property action_space

                                Get the action space of the environment

                                Returns
                                @@ -223,7 +223,7 @@ additional arguments which will be ignored by this class, but might be used by o
                                -close() → None[source]
                                +close() → None[source]

                                Clean up steps.

                                Returns
                                @@ -234,7 +234,7 @@ additional arguments which will be ignored by this class, but might be used by o
                                -get_action_from_user() → Union[int, float, numpy.ndarray, List][source]
                                +get_action_from_user() → Union[int, float, numpy.ndarray, List][source]

                                Get an action from the user keyboard

                                Returns
                                @@ -245,7 +245,7 @@ additional arguments which will be ignored by this class, but might be used by o
                                -get_available_keys() → List[Tuple[str, Union[int, float, numpy.ndarray, List]]][source]
                                +get_available_keys() → List[Tuple[str, Union[int, float, numpy.ndarray, List]]][source]

                                Return a list of tuples mapping between action names and the keyboard key that triggers them

                                Returns
                                @@ -256,7 +256,7 @@ additional arguments which will be ignored by this class, but might be used by o
                                -get_goal() → Union[None, numpy.ndarray][source]
                                +get_goal() → Union[None, numpy.ndarray][source]

Get the current goal that the agent needs to achieve in the environment

                                Returns
                                @@ -267,7 +267,7 @@ additional arguments which will be ignored by this class, but might be used by o
                                -get_random_action() → Union[int, float, numpy.ndarray, List][source]
                                +get_random_action() → Union[int, float, numpy.ndarray, List][source]

                                Returns an action picked uniformly from the available actions

                                Returns
                                @@ -278,7 +278,7 @@ additional arguments which will be ignored by this class, but might be used by o
                                -get_rendered_image() → numpy.ndarray[source]
                                +get_rendered_image() → numpy.ndarray[source]

                                Return a numpy array containing the image that will be rendered to the screen. This can be different from the observation. For example, mujoco’s observation is a measurements vector.

                                @@ -288,9 +288,9 @@ This can be different from the observation. For example, mujoco’s observation
                                -
                                +
                                -goal_space
                                +property goal_space

Get the goal space of the environment

                                Returns
                                @@ -301,7 +301,7 @@ This can be different from the observation. For example, mujoco’s observation
                                -handle_episode_ended() → None[source]
                                +handle_episode_ended() → None[source]

                                End an episode

                                Returns
                                @@ -310,9 +310,9 @@ This can be different from the observation. For example, mujoco’s observation
                                -
                                +
                                -last_env_response
                                +property last_env_response

                                Get the last environment response

                                Returns
                                @@ -321,16 +321,16 @@ This can be different from the observation. For example, mujoco’s observation
                                -
                                +
                                -phase
                                +property phase

                                Get the phase of the environment :return: the current phase

                                -render() → None[source]
                                +render() → None[source]

                                Call the environment function for rendering to the screen

                                Returns
                                @@ -341,7 +341,7 @@ This can be different from the observation. For example, mujoco’s observation
                                -reset_internal_state(force_environment_reset=False) → rl_coach.core_types.EnvResponse[source]
                                +reset_internal_state(force_environment_reset=False) → rl_coach.core_types.EnvResponse[source]

Reset the environment and all the variables of the wrapper

                                Parameters
                                @@ -355,7 +355,7 @@ This can be different from the observation. For example, mujoco’s observation
                                -set_goal(goal: Union[None, numpy.ndarray]) → None[source]
                                +set_goal(goal: Union[None, numpy.ndarray]) → None[source]

                                Set the current goal that the agent needs to achieve in the environment

                                Parameters
                                @@ -367,9 +367,9 @@ This can be different from the observation. For example, mujoco’s observation
                                -
                                +
                                -state_space
                                +property state_space

                                Get the state space of the environment

                                Returns
                                @@ -380,7 +380,7 @@ This can be different from the observation. For example, mujoco’s observation
                                -step(action: Union[int, float, numpy.ndarray, List]) → rl_coach.core_types.EnvResponse[source]
                                +step(action: Union[int, float, numpy.ndarray, List]) → rl_coach.core_types.EnvResponse[source]

                                Make a single step in the environment using the given action

                                Parameters
                                @@ -400,7 +400,7 @@ This can be different from the observation. For example, mujoco’s observation
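A sketch of the basic interaction loop built from the methods above; `env` is assumed to be an already-constructed concrete subclass (e.g. a GymEnvironment), since construction parameters differ per environment and are omitted here:

```python
from rl_coach.core_types import EnvResponse


def run_random_episode(env) -> float:
    """Roll out one episode with uniformly sampled actions and return the total reward (sketch)."""
    env.reset_internal_state(force_environment_reset=True)
    total_reward, done = 0.0, False
    while not done:
        response: EnvResponse = env.step(env.get_random_action())
        total_reward += response.reward
        done = response.game_over
    return total_reward
```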

                                Website: DeepMind Control Suite

                                -class rl_coach.environments.control_suite_environment.ControlSuiteEnvironment(level: rl_coach.environments.environment.LevelSelection, frame_skip: int, visualization_parameters: rl_coach.base_parameters.VisualizationParameters, target_success_rate: float = 1.0, seed: Union[None, int] = None, human_control: bool = False, observation_type: rl_coach.environments.control_suite_environment.ObservationType = <ObservationType.Measurements: 1>, custom_reward_threshold: Union[int, float] = None, **kwargs)[source]
                                +class rl_coach.environments.control_suite_environment.ControlSuiteEnvironment(level: rl_coach.environments.environment.LevelSelection, frame_skip: int, visualization_parameters: rl_coach.base_parameters.VisualizationParameters, target_success_rate: float = 1.0, seed: Union[None, int] = None, human_control: bool = False, observation_type: rl_coach.environments.control_suite_environment.ObservationType = <ObservationType.Measurements: 1>, custom_reward_threshold: Union[int, float] = None, **kwargs)[source]
                                Parameters
                                  @@ -438,7 +438,7 @@ Allows defining a custom reward that will be used to decide when the agent succe

                                  Website: Blizzard Starcraft II

                                  -class rl_coach.environments.starcraft2_environment.StarCraft2Environment(level: rl_coach.environments.environment.LevelSelection, frame_skip: int, visualization_parameters: rl_coach.base_parameters.VisualizationParameters, target_success_rate: float = 1.0, seed: Union[None, int] = None, human_control: bool = False, custom_reward_threshold: Union[int, float] = None, screen_size: int = 84, minimap_size: int = 64, feature_minimap_maps_to_use: List = range(0, 7), feature_screen_maps_to_use: List = range(0, 17), observation_type: rl_coach.environments.starcraft2_environment.StarcraftObservationType = <StarcraftObservationType.Features: 0>, disable_fog: bool = False, auto_select_all_army: bool = True, use_full_action_space: bool = False, **kwargs)[source]
                                  +class rl_coach.environments.starcraft2_environment.StarCraft2Environment(level: rl_coach.environments.environment.LevelSelection, frame_skip: int, visualization_parameters: rl_coach.base_parameters.VisualizationParameters, target_success_rate: float = 1.0, seed: Union[None, int] = None, human_control: bool = False, custom_reward_threshold: Union[int, float] = None, screen_size: int = 84, minimap_size: int = 64, feature_minimap_maps_to_use: List = range(0, 7), feature_screen_maps_to_use: List = range(0, 17), observation_type: rl_coach.environments.starcraft2_environment.StarcraftObservationType = <StarcraftObservationType.Features: 0>, disable_fog: bool = False, auto_select_all_army: bool = True, use_full_action_space: bool = False, **kwargs)[source]
                                  @@ -448,7 +448,7 @@ Allows defining a custom reward that will be used to decide when the agent succe

                                  Website: ViZDoom

                                  -class rl_coach.environments.doom_environment.DoomEnvironment(level: rl_coach.environments.environment.LevelSelection, seed: int, frame_skip: int, human_control: bool, custom_reward_threshold: Union[int, float], visualization_parameters: rl_coach.base_parameters.VisualizationParameters, cameras: List[rl_coach.environments.doom_environment.DoomEnvironment.CameraTypes], target_success_rate: float = 1.0, **kwargs)[source]
                                  +class rl_coach.environments.doom_environment.DoomEnvironment(level: rl_coach.environments.environment.LevelSelection, seed: int, frame_skip: int, human_control: bool, custom_reward_threshold: Union[int, float], visualization_parameters: rl_coach.base_parameters.VisualizationParameters, cameras: List[rl_coach.environments.doom_environment.DoomEnvironment.CameraTypes], target_success_rate: float = 1.0, **kwargs)[source]
                                  Parameters
                                    @@ -491,7 +491,7 @@ Stop experiment if given target success rate was achieved.

                                    Website: CARLA

                                    -class rl_coach.environments.carla_environment.CarlaEnvironment(level: rl_coach.environments.environment.LevelSelection, seed: int, frame_skip: int, human_control: bool, custom_reward_threshold: Union[int, float], visualization_parameters: rl_coach.base_parameters.VisualizationParameters, server_height: int, server_width: int, camera_height: int, camera_width: int, verbose: bool, experiment_suite: carla.driving_benchmark.experiment_suites.experiment_suite.ExperimentSuite, config: str, episode_max_time: int, allow_braking: bool, quality: rl_coach.environments.carla_environment.CarlaEnvironmentParameters.Quality, cameras: List[rl_coach.environments.carla_environment.CameraTypes], weather_id: List[int], experiment_path: str, separate_actions_for_throttle_and_brake: bool, num_speedup_steps: int, max_speed: float, target_success_rate: float = 1.0, **kwargs)[source]
                                    +class rl_coach.environments.carla_environment.CarlaEnvironment(level: rl_coach.environments.environment.LevelSelection, seed: int, frame_skip: int, human_control: bool, custom_reward_threshold: Union[int, float], visualization_parameters: rl_coach.base_parameters.VisualizationParameters, server_height: int, server_width: int, camera_height: int, camera_width: int, verbose: bool, experiment_suite: carla.driving_benchmark.experiment_suites.experiment_suite.ExperimentSuite, config: str, episode_max_time: int, allow_braking: bool, quality: rl_coach.environments.carla_environment.CarlaEnvironmentParameters.Quality, cameras: List[rl_coach.environments.carla_environment.CameraTypes], weather_id: List[int], experiment_path: str, separate_actions_for_throttle_and_brake: bool, num_speedup_steps: int, max_speed: float, target_success_rate: float = 1.0, **kwargs)[source]
                                    @@ -511,7 +511,7 @@ includes a set of robotics environments.

                                  -class rl_coach.environments.gym_environment.GymEnvironment(level: rl_coach.environments.environment.LevelSelection, frame_skip: int, visualization_parameters: rl_coach.base_parameters.VisualizationParameters, target_success_rate: float = 1.0, additional_simulator_parameters: Dict[str, Any] = {}, seed: Union[None, int] = None, human_control: bool = False, custom_reward_threshold: Union[int, float] = None, random_initialization_steps: int = 1, max_over_num_frames: int = 1, observation_space_type: rl_coach.environments.gym_environment.ObservationSpaceType = None, **kwargs)[source]
                                  +class rl_coach.environments.gym_environment.GymEnvironment(level: rl_coach.environments.environment.LevelSelection, frame_skip: int, visualization_parameters: rl_coach.base_parameters.VisualizationParameters, target_success_rate: float = 1.0, additional_simulator_parameters: Dict[str, Any] = {}, seed: Union[None, int] = None, human_control: bool = False, custom_reward_threshold: Union[int, float] = None, random_initialization_steps: int = 1, max_over_num_frames: int = 1, observation_space_type: rl_coach.environments.gym_environment.ObservationSpaceType = None, **kwargs)[source]
                                  Parameters
                                    diff --git a/docs/components/exploration_policies/index.html b/docs/components/exploration_policies/index.html index ad904a2..e5483c1 100644 --- a/docs/components/exploration_policies/index.html +++ b/docs/components/exploration_policies/index.html @@ -205,7 +205,7 @@ predefined policy. This is one of the most important aspects of reinforcement le tuning to get it right. Coach supports several pre-defined exploration policies, and it can be easily extended with custom policies. Note that not all exploration policies are expected to work for both discrete and continuous action spaces.

                                    @@ -268,7 +268,7 @@ spaces.

                                    ExplorationPolicy

                                    -class rl_coach.exploration_policies.exploration_policy.ExplorationPolicy(action_space: rl_coach.spaces.ActionSpace)[source]
                                    +class rl_coach.exploration_policies.exploration_policy.ExplorationPolicy(action_space: rl_coach.spaces.ActionSpace)[source]

                                    An exploration policy takes the predicted actions or action values from the agent, and selects the action to actually apply to the environment using some predefined algorithm.

                                    @@ -278,7 +278,7 @@ actually apply to the environment using some predefined algorithm.

                                    -change_phase(phase)[source]
                                    +change_phase(phase)[source]

                                    Change between running phases of the algorithm :param phase: Either Heatup or Train :return: none

                                    @@ -286,16 +286,19 @@ actually apply to the environment using some predefined algorithm.

                                    -get_action(action_values: List[Union[int, float, numpy.ndarray, List]]) → Union[int, float, numpy.ndarray, List][source]
                                    +get_action(action_values: List[Union[int, float, numpy.ndarray, List]]) → Union[int, float, numpy.ndarray, List][source]

Given a list of values corresponding to each action, choose one action according to the exploration policy :param action_values: A list of action values
-:return: The chosen action
+:return: The chosen action, and the probability of that action (if available, otherwise 1 for absolute certainty in the action)
                                    -requires_action_values() → bool[source]
                                    +requires_action_values() → bool[source]

Allows exploration policies to define whether they require the action values for the current step. This can save a lot of computation. For example, in e-greedy, if the random value generated is smaller than epsilon, the action is completely random, and the action values don’t need to be calculated @@ -304,7 +307,7 @@ than epsilon, the action is completely random, and the action values don’t nee

                                    -reset()[source]
                                    +reset()[source]

                                    Used for resetting the exploration policy parameters when needed :return: None

                                    @@ -316,7 +319,7 @@ than epsilon, the action is completely random, and the action values don’t nee

                                    AdditiveNoise

                                    -class rl_coach.exploration_policies.additive_noise.AdditiveNoise(action_space: rl_coach.spaces.ActionSpace, noise_percentage_schedule: rl_coach.schedules.Schedule, evaluation_noise_percentage: float)[source]
                                    +class rl_coach.exploration_policies.additive_noise.AdditiveNoise(action_space: rl_coach.spaces.ActionSpace, noise_schedule: rl_coach.schedules.Schedule, evaluation_noise: float, noise_as_percentage_from_action_space: bool = True)[source]

AdditiveNoise is an exploration policy intended for continuous action spaces. It takes the action from the agent and adds Gaussian-distributed noise to it. The amount of noise added to the action follows a noise amount that can be given in two different ways: @@ -327,9 +330,10 @@ be the mean of the action, and 2nd is assumed to be its standard deviation.

                                    Parameters
                                    • action_space – the action space used by the environment

                                    • -
                                    • noise_percentage_schedule – the schedule for the noise variance percentage relative to the absolute range -of the action space

                                    • -
                                    • evaluation_noise_percentage – the noise variance percentage that will be used during evaluation phases

                                    • +
                                    • noise_schedule – the schedule for the noise

                                    • +
                                    • evaluation_noise – the noise variance that will be used during evaluation phases

                                    • +
                                    • noise_as_percentage_from_action_space – a bool deciding whether the noise is absolute or as a percentage +from the action space

                                    @@ -340,7 +344,7 @@ of the action space
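A construction sketch for the new signature; the BoxActionSpace and LinearSchedule arguments follow their common usage in Coach presets, but treat the exact values (and the 2-dimensional space) as illustrative assumptions:

```python
from rl_coach.exploration_policies.additive_noise import AdditiveNoise
from rl_coach.schedules import LinearSchedule
from rl_coach.spaces import BoxActionSpace

action_space = BoxActionSpace(2, low=-1.0, high=1.0)   # 2-dimensional continuous actions
exploration = AdditiveNoise(action_space=action_space,
                            noise_schedule=LinearSchedule(0.2, 0.05, 1000000),  # noise: 20% -> 5%
                            evaluation_noise=0.05,
                            noise_as_percentage_from_action_space=True)
```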

                                    Boltzmann

                                    -class rl_coach.exploration_policies.boltzmann.Boltzmann(action_space: rl_coach.spaces.ActionSpace, temperature_schedule: rl_coach.schedules.Schedule)[source]
                                    +class rl_coach.exploration_policies.boltzmann.Boltzmann(action_space: rl_coach.spaces.ActionSpace, temperature_schedule: rl_coach.schedules.Schedule)[source]

                                    The Boltzmann exploration policy is intended for discrete action spaces. It assumes that each of the possible actions has some value assigned to it (such as the Q value), and uses a softmax function to convert these values into a distribution over the actions. It then samples the action for playing out of the calculated distribution. @@ -360,7 +364,7 @@ An additional temperature schedule can be given by the user, and will control th

                                    Bootstrapped

                                    -class rl_coach.exploration_policies.bootstrapped.Bootstrapped(action_space: rl_coach.spaces.ActionSpace, epsilon_schedule: rl_coach.schedules.Schedule, evaluation_epsilon: float, architecture_num_q_heads: int, continuous_exploration_policy_parameters: rl_coach.exploration_policies.exploration_policy.ExplorationParameters = <rl_coach.exploration_policies.additive_noise.AdditiveNoiseParameters object>)[source]
                                    +class rl_coach.exploration_policies.bootstrapped.Bootstrapped(action_space: rl_coach.spaces.ActionSpace, epsilon_schedule: rl_coach.schedules.Schedule, evaluation_epsilon: float, architecture_num_q_heads: int, continuous_exploration_policy_parameters: rl_coach.exploration_policies.exploration_policy.ExplorationParameters = <rl_coach.exploration_policies.additive_noise.AdditiveNoiseParameters object>)[source]

                                    Bootstrapped exploration policy is currently only used for discrete action spaces along with the Bootstrapped DQN agent. It assumes that there is an ensemble of network heads, where each one predicts the values for all the possible actions. For each episode, a single head is selected to lead the agent, according @@ -390,7 +394,7 @@ if the e-greedy is used for a continuous policy

                                    Categorical

                                    -class rl_coach.exploration_policies.categorical.Categorical(action_space: rl_coach.spaces.ActionSpace)[source]
                                    +class rl_coach.exploration_policies.categorical.Categorical(action_space: rl_coach.spaces.ActionSpace)[source]

Categorical exploration policy is intended for discrete action spaces. It expects the action values to represent a probability distribution over the actions, from which a single action will be sampled. In evaluation, the action that has the highest probability will be selected. This is particularly useful for @@ -407,7 +411,7 @@ actor-critic schemes, where the actor's output is a probability distribution over

                                    ContinuousEntropy

                                    -class rl_coach.exploration_policies.continuous_entropy.ContinuousEntropy(action_space: rl_coach.spaces.ActionSpace, noise_percentage_schedule: rl_coach.schedules.Schedule, evaluation_noise_percentage: float)[source]
                                    +class rl_coach.exploration_policies.continuous_entropy.ContinuousEntropy(action_space: rl_coach.spaces.ActionSpace, noise_schedule: rl_coach.schedules.Schedule, evaluation_noise: float, noise_as_percentage_from_action_space: bool = True)[source]

                                    Continuous entropy is an exploration policy that is actually implemented as part of the network. The exploration policy class is only a placeholder for choosing this policy. The exploration policy is implemented by adding a regularization factor to the network loss, which regularizes the entropy of the action. @@ -422,9 +426,10 @@ There are only a few heads that actually are relevant and implement the entropy

                                    Parameters
                                    • action_space – the action space used by the environment

                                    • -
                                    • noise_percentage_schedule – the schedule for the noise variance percentage relative to the absolute range -of the action space

                                    • -
                                    • evaluation_noise_percentage – the noise variance percentage that will be used during evaluation phases

                                    • +
                                    • noise_schedule – the schedule for the noise

                                    • +
                                    • evaluation_noise – the noise variance that will be used during evaluation phases

                                    • +
                                    • noise_as_percentage_from_action_space – a bool deciding whether the noise is absolute or as a percentage +from the action space

                                    @@ -435,7 +440,7 @@ of the action space

                                    EGreedy

                                    -class rl_coach.exploration_policies.e_greedy.EGreedy(action_space: rl_coach.spaces.ActionSpace, epsilon_schedule: rl_coach.schedules.Schedule, evaluation_epsilon: float, continuous_exploration_policy_parameters: rl_coach.exploration_policies.exploration_policy.ExplorationParameters = <rl_coach.exploration_policies.additive_noise.AdditiveNoiseParameters object>)[source]
                                    +class rl_coach.exploration_policies.e_greedy.EGreedy(action_space: rl_coach.spaces.ActionSpace, epsilon_schedule: rl_coach.schedules.Schedule, evaluation_epsilon: float, continuous_exploration_policy_parameters: rl_coach.exploration_policies.exploration_policy.ExplorationParameters = <rl_coach.exploration_policies.additive_noise.AdditiveNoiseParameters object>)[source]

                                    e-greedy is an exploration policy that is intended for both discrete and continuous action spaces.

For discrete action spaces, it assumes that each action is assigned a value, and it selects the action with the highest value with probability 1 - epsilon. Otherwise, it selects an action sampled uniformly out of all the @@ -463,7 +468,7 @@ if the e-greedy is used for a continuous policy
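A construction sketch for a 4-action discrete space; the DiscreteActionSpace and LinearSchedule arguments follow common preset usage and are assumptions here:

```python
from rl_coach.exploration_policies.e_greedy import EGreedy
from rl_coach.schedules import LinearSchedule
from rl_coach.spaces import DiscreteActionSpace

egreedy = EGreedy(action_space=DiscreteActionSpace(4),
                  epsilon_schedule=LinearSchedule(1.0, 0.01, 10000),  # epsilon: 1.0 -> 0.01
                  evaluation_epsilon=0.0)                             # act greedily at evaluation time
```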

                                    Greedy

                                    -class rl_coach.exploration_policies.greedy.Greedy(action_space: rl_coach.spaces.ActionSpace)[source]
                                    +class rl_coach.exploration_policies.greedy.Greedy(action_space: rl_coach.spaces.ActionSpace)[source]

The Greedy exploration policy is intended for both discrete and continuous action spaces. For discrete action spaces, it always selects the action with the maximum value, as given by the agent. For continuous action spaces, it always returns the exact action, as it was given by the agent.

                                    @@ -479,7 +484,7 @@ For continuous action spaces, it always return the exact action, as it was given

                                    OUProcess

                                    -class rl_coach.exploration_policies.ou_process.OUProcess(action_space: rl_coach.spaces.ActionSpace, mu: float = 0, theta: float = 0.15, sigma: float = 0.2, dt: float = 0.01)[source]
                                    +class rl_coach.exploration_policies.ou_process.OUProcess(action_space: rl_coach.spaces.ActionSpace, mu: float = 0, theta: float = 0.15, sigma: float = 0.2, dt: float = 0.01)[source]

OUProcess exploration policy is intended for continuous action spaces, and selects the action according to an Ornstein-Uhlenbeck process. The Ornstein-Uhlenbeck process implements the action as a Gaussian process, where the samples are correlated between consecutive time steps.

                                    @@ -495,7 +500,7 @@ the samples are correlated between consequent time steps.

                                    ParameterNoise

                                    -class rl_coach.exploration_policies.parameter_noise.ParameterNoise(network_params: Dict[str, rl_coach.base_parameters.NetworkParameters], action_space: rl_coach.spaces.ActionSpace)[source]
                                    +class rl_coach.exploration_policies.parameter_noise.ParameterNoise(network_params: Dict[str, rl_coach.base_parameters.NetworkParameters], action_space: rl_coach.spaces.ActionSpace)[source]

                                    The ParameterNoise exploration policy is intended for both discrete and continuous action spaces. It applies the exploration policy by replacing all the dense network layers with noisy layers. The noisy layers have both weight means and weight standard deviations, and for each forward pass of the network @@ -514,7 +519,7 @@ values.

                                    TruncatedNormal

                                    -class rl_coach.exploration_policies.truncated_normal.TruncatedNormal(action_space: rl_coach.spaces.ActionSpace, noise_percentage_schedule: rl_coach.schedules.Schedule, evaluation_noise_percentage: float, clip_low: float, clip_high: float)[source]
                                    +class rl_coach.exploration_policies.truncated_normal.TruncatedNormal(action_space: rl_coach.spaces.ActionSpace, noise_schedule: rl_coach.schedules.Schedule, evaluation_noise: float, clip_low: float, clip_high: float, noise_as_percentage_from_action_space: bool = True)[source]

The TruncatedNormal exploration policy is intended for continuous action spaces. It samples the action from a normal distribution, where the mean action is given by the agent, and the standard deviation can be given in two different ways: @@ -527,9 +532,10 @@ is within the bounds.

                                    Parameters
                                    • action_space – the action space used by the environment

                                    • -
                                    • noise_percentage_schedule – the schedule for the noise variance percentage relative to the absolute range -of the action space

                                    • -
                                    • evaluation_noise_percentage – the noise variance percentage that will be used during evaluation phases

                                    • +
                                    • noise_schedule – the schedule for the noise variance

                                    • +
                                    • evaluation_noise – the noise variance that will be used during evaluation phases

                                    • +
                                    • noise_as_percentage_from_action_space – whether to consider the noise as a percentage of the action space +or absolute value

                                    @@ -540,7 +546,7 @@ of the action space

                                    UCB

                                    -class rl_coach.exploration_policies.ucb.UCB(action_space: rl_coach.spaces.ActionSpace, epsilon_schedule: rl_coach.schedules.Schedule, evaluation_epsilon: float, architecture_num_q_heads: int, lamb: int, continuous_exploration_policy_parameters: rl_coach.exploration_policies.exploration_policy.ExplorationParameters = <rl_coach.exploration_policies.additive_noise.AdditiveNoiseParameters object>)[source]
                                    +class rl_coach.exploration_policies.ucb.UCB(action_space: rl_coach.spaces.ActionSpace, epsilon_schedule: rl_coach.schedules.Schedule, evaluation_epsilon: float, architecture_num_q_heads: int, lamb: int, continuous_exploration_policy_parameters: rl_coach.exploration_policies.exploration_policy.ExplorationParameters = <rl_coach.exploration_policies.additive_noise.AdditiveNoiseParameters object>)[source]

UCB exploration policy follows the upper confidence bound heuristic to sample actions in discrete action spaces. It assumes that there are multiple network heads that predict action values, and that the standard deviation between the heads' predictions represents the uncertainty of the agent in each of the actions. diff --git a/docs/components/filters/input_filters.html b/docs/components/filters/input_filters.html index 5269536..9df4d37 100644 --- a/docs/components/filters/input_filters.html +++ b/docs/components/filters/input_filters.html @@ -221,7 +221,7 @@

                                    ObservationClippingFilter

                                    -class rl_coach.filters.observation.ObservationClippingFilter(clipping_low: float = -inf, clipping_high: float = inf)[source]
                                    +class rl_coach.filters.observation.ObservationClippingFilter(clipping_low: float = -inf, clipping_high: float = inf)[source]

                                    Clips the observation values to a given range of values. For example, if the observation consists of measurements in an arbitrary range, and we want to control the minimum and maximum values of these observations, @@ -241,7 +241,7 @@ we can define a range and clip the values of the measurements.
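
A minimal sketch, assuming the class can be imported from the rl_coach.filters.observation path shown above; the NumPy line only illustrates the element-wise clip the filter is configured to apply:

```python
import numpy as np
from rl_coach.filters.observation import ObservationClippingFilter

# Keep every measurement inside a known [-10, 10] range
clip_filter = ObservationClippingFilter(clipping_low=-10.0, clipping_high=10.0)

# Conceptually, each observation is clipped element-wise:
observation = np.array([-25.0, 3.5, 42.0])
print(np.clip(observation, -10.0, 10.0))   # [-10.    3.5  10. ]
```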

                                    ObservationCropFilter

                                    -class rl_coach.filters.observation.ObservationCropFilter(crop_low: numpy.ndarray = None, crop_high: numpy.ndarray = None)[source]
                                    +class rl_coach.filters.observation.ObservationCropFilter(crop_low: numpy.ndarray = None, crop_high: numpy.ndarray = None)[source]

Crops the observation to a given crop window. For example, in Atari, the observations are images with a shape of 210x160. Usually, we will want to crop the observation to a 160x160 square before rescaling it.
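
As a rough sketch of the Atari example above (the exact crop window and the per-dimension layout of crop_low/crop_high are illustrative assumptions):

```python
import numpy as np
from rl_coach.filters.observation import ObservationCropFilter

# Crop the 210x160 frame down to a 160x160 square by dropping the top 50 rows
crop_filter = ObservationCropFilter(crop_low=np.array([50, 0]),
                                    crop_high=np.array([210, 160]))
```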

                                    @@ -262,7 +262,7 @@ corresponding dimension. a negative value of -1 will be mapped to the max sizeObservationMoveAxisFilter
                                    -class rl_coach.filters.observation.ObservationMoveAxisFilter(axis_origin: int = None, axis_target: int = None)[source]
                                    +class rl_coach.filters.observation.ObservationMoveAxisFilter(axis_origin: int = None, axis_target: int = None)[source]

                                    Reorders the axes of the observation. This can be useful when the observation is an image, and we want to move the channel axis to be the last axis instead of the first axis.

                                    @@ -280,7 +280,7 @@ image, and we want to move the channel axis to be the last axis instead of the f

                                    ObservationNormalizationFilter

                                    -class rl_coach.filters.observation.ObservationNormalizationFilter(clip_min: float = -5.0, clip_max: float = 5.0, name='observation_stats')[source]
                                    +class rl_coach.filters.observation.ObservationNormalizationFilter(clip_min: float = -5.0, clip_max: float = 5.0, name='observation_stats')[source]

                                    Normalizes the observation values with a running mean and standard deviation of all the observations seen so far. The normalization is performed element-wise. Additionally, when working with multiple workers, the statistics used for the normalization operation are accumulated over all the workers.
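
Conceptually, the filter standardizes each element with the running statistics and clips the result; the sketch below only illustrates that computation with plain NumPy and the documented default clipping bounds:

```python
import numpy as np
from rl_coach.filters.observation import ObservationNormalizationFilter

normalization_filter = ObservationNormalizationFilter(clip_min=-5.0, clip_max=5.0)

# Conceptually: standardize with the running statistics, then clip the result
observations_seen_so_far = np.array([[1.0, 200.0], [3.0, 180.0], [2.0, 220.0]])
mean = observations_seen_so_far.mean(axis=0)
std = observations_seen_so_far.std(axis=0)
new_observation = np.array([2.5, 260.0])
normalized = np.clip((new_observation - mean) / std, -5.0, 5.0)
```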

                                    @@ -299,7 +299,7 @@ multiple workers, the statistics used for the normalization operation are accumu

                                    ObservationReductionBySubPartsNameFilter

                                    -class rl_coach.filters.observation.ObservationReductionBySubPartsNameFilter(part_names: List[str], reduction_method: rl_coach.filters.observation.observation_reduction_by_sub_parts_name_filter.ObservationReductionBySubPartsNameFilter.ReductionMethod)[source]
                                    +class rl_coach.filters.observation.ObservationReductionBySubPartsNameFilter(part_names: List[str], reduction_method: rl_coach.filters.observation.observation_reduction_by_sub_parts_name_filter.ObservationReductionBySubPartsNameFilter.ReductionMethod)[source]

Allows keeping only parts of the observation, by specifying their names. This is useful when the environment's observation is a measurements vector that includes several different measurements, but you want the agent to see only some of them. @@ -321,7 +321,7 @@ This will currently work only for VectorObservationSpace observations

                                    ObservationRescaleSizeByFactorFilter

                                    -class rl_coach.filters.observation.ObservationRescaleSizeByFactorFilter(rescale_factor: float)[source]
                                    +class rl_coach.filters.observation.ObservationRescaleSizeByFactorFilter(rescale_factor: float)[source]

                                    Rescales an image observation by some factor. For example, the image size can be reduced by a factor of 2.

                                    @@ -336,7 +336,7 @@ can be reduced by a factor of 2.

                                    ObservationRescaleToSizeFilter

                                    -class rl_coach.filters.observation.ObservationRescaleToSizeFilter(output_observation_space: rl_coach.spaces.PlanarMapsObservationSpace)[source]
                                    +class rl_coach.filters.observation.ObservationRescaleToSizeFilter(output_observation_space: rl_coach.spaces.PlanarMapsObservationSpace)[source]

                                    Rescales an image observation to a given size. The target size does not necessarily keep the aspect ratio of the original observation. Warning: this requires the input observation to be of type uint8 due to scipy requirements!
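
A sketch of rescaling incoming uint8 frames to an 84x84 RGB target, using the ImageObservationSpace class documented further down this page (the chosen target size is an assumption):

```python
import numpy as np
from rl_coach.spaces import ImageObservationSpace
from rl_coach.filters.observation import ObservationRescaleToSizeFilter

# Rescale incoming uint8 frames to 84x84x3
target_space = ImageObservationSpace(shape=np.array([84, 84, 3]), high=255)
rescale_filter = ObservationRescaleToSizeFilter(output_observation_space=target_space)
```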

                                    @@ -352,7 +352,7 @@ Warning: this requires the input observation to be of type uint8 due to scipy re

                                    ObservationRGBToYFilter

                                    -class rl_coach.filters.observation.ObservationRGBToYFilter[source]
                                    +class rl_coach.filters.observation.ObservationRGBToYFilter[source]

                                    Converts a color image observation specified using the RGB encoding into a grayscale image observation, by keeping only the luminance (Y) channel of the YUV encoding. This can be useful if the colors in the original image are not relevant for solving the task at hand. @@ -364,7 +364,7 @@ The channels axis is assumed to be the last axis
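
The luminance extraction can be pictured with the standard BT.601 weights below; this is only a conceptual NumPy illustration, and the exact coefficients used internally are not shown on this page:

```python
import numpy as np
from rl_coach.filters.observation import ObservationRGBToYFilter

rgb_to_y_filter = ObservationRGBToYFilter()

# Conceptually, an HxWx3 RGB frame collapses to a single luminance (Y) channel:
rgb_frame = np.random.randint(0, 256, size=(210, 160, 3)).astype(np.float32)
luminance = (0.299 * rgb_frame[..., 0] +
             0.587 * rgb_frame[..., 1] +
             0.114 * rgb_frame[..., 2])
```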

                                    ObservationSqueezeFilter

                                    -class rl_coach.filters.observation.ObservationSqueezeFilter(axis: int = None)[source]
                                    +class rl_coach.filters.observation.ObservationSqueezeFilter(axis: int = None)[source]

                                    Removes redundant axes from the observation, which are axes with a dimension of 1.

                                    Parameters
                                    @@ -378,7 +378,7 @@ The channels axis is assumed to be the last axis

                                    ObservationStackingFilter

                                    -class rl_coach.filters.observation.ObservationStackingFilter(stack_size: int, stacking_axis: int = -1)[source]
                                    +class rl_coach.filters.observation.ObservationStackingFilter(stack_size: int, stacking_axis: int = -1)[source]

Stacks several observations on top of each other. For image observations this will create a 3D blob. The stacking is done lazily in order to reduce memory consumption. To achieve this, a LazyStack object is used to wrap the observations in the stack. For this reason, the @@ -403,7 +403,7 @@ and increase the memory footprint.
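
For instance, a DQN-style stack of the last 4 frames along the channels axis could be configured as follows (a sketch; the import path follows the class path shown above):

```python
from rl_coach.filters.observation import ObservationStackingFilter

# Lazily stack the last 4 observations along the channels (last) axis
stacking_filter = ObservationStackingFilter(stack_size=4, stacking_axis=-1)
```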

                                    ObservationToUInt8Filter

                                    -class rl_coach.filters.observation.ObservationToUInt8Filter(input_low: float, input_high: float)[source]
                                    +class rl_coach.filters.observation.ObservationToUInt8Filter(input_low: float, input_high: float)[source]

Converts a floating-point observation into an unsigned 8-bit integer observation. This is mostly useful for reducing memory consumption and is usually used for image observations. The filter will first spread the observation values over the range 0-255 and then discretize them into integer values.
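
The conversion can be pictured as follows; the NumPy lines only illustrate the rescale-and-cast, and the [0, 1] input range is an assumption:

```python
import numpy as np
from rl_coach.filters.observation import ObservationToUInt8Filter

# Observations arrive as floats in [0.0, 1.0] and are converted to uint8 in [0, 255]
to_uint8_filter = ObservationToUInt8Filter(input_low=0.0, input_high=1.0)

# Conceptually:
observation = np.array([0.0, 0.25, 1.0])
as_uint8 = ((observation - 0.0) / (1.0 - 0.0) * 255).astype(np.uint8)  # [  0  63 255]
```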

                                    @@ -425,7 +425,7 @@ spread the observation values over the range 0-255 and then discretize them into

                                    RewardClippingFilter

                                    -class rl_coach.filters.reward.RewardClippingFilter(clipping_low: float = -inf, clipping_high: float = inf)[source]
                                    +class rl_coach.filters.reward.RewardClippingFilter(clipping_low: float = -inf, clipping_high: float = inf)[source]

Clips the reward values to a given range. For example, in DQN, the Atari rewards are clipped to the range -1 to 1 in order to control the scale of the returns.
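
A minimal sketch of the DQN-style setting mentioned above, assuming the class is importable from the rl_coach.filters.reward path shown in the signature:

```python
from rl_coach.filters.reward import RewardClippingFilter

# Clip every reward into [-1, 1], as is common for Atari DQN training
reward_clipping_filter = RewardClippingFilter(clipping_low=-1.0, clipping_high=1.0)
```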

                                    @@ -443,7 +443,7 @@ clipped into the range -1 and 1 in order to control the scale of the returns.

                                    RewardNormalizationFilter
                                    -class rl_coach.filters.reward.RewardNormalizationFilter(clip_min: float = -5.0, clip_max: float = 5.0)[source]
                                    +class rl_coach.filters.reward.RewardNormalizationFilter(clip_min: float = -5.0, clip_max: float = 5.0)[source]

                                    Normalizes the reward values with a running mean and standard deviation of all the rewards seen so far. When working with multiple workers, the statistics used for the normalization operation are accumulated over all the workers.

                                    @@ -462,7 +462,7 @@ are accumulated over all the workers.

                                    RewardRescaleFilter

                                    -class rl_coach.filters.reward.RewardRescaleFilter(rescale_factor: float)[source]
                                    +class rl_coach.filters.reward.RewardRescaleFilter(rescale_factor: float)[source]

                                    Rescales the reward by a given factor. Rescaling the rewards of the environment has been observed to have a large effect (negative or positive) on the behavior of the learning process.
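
A minimal sketch (the factor value is arbitrary, and whether the factor multiplies or divides the raw reward is not specified on this page):

```python
from rl_coach.filters.reward import RewardRescaleFilter

# Rescale each reward by a factor of 100, e.g. to tame very large environment rewards
reward_rescale_filter = RewardRescaleFilter(rescale_factor=100.0)
```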

                                    diff --git a/docs/components/filters/output_filters.html b/docs/components/filters/output_filters.html index 8c6fa80..73df03f 100644 --- a/docs/components/filters/output_filters.html +++ b/docs/components/filters/output_filters.html @@ -200,7 +200,7 @@

                                    Action Filters

                                    -class rl_coach.filters.action.AttentionDiscretization(num_bins_per_dimension: Union[int, List[int]], force_int_bins=False)[source]
                                    +class rl_coach.filters.action.AttentionDiscretization(num_bins_per_dimension: Union[int, List[int]], force_int_bins=False)[source]

                                    Discretizes an AttentionActionSpace. The attention action space defines the actions as choosing sub-boxes in a given box. For example, consider an image of size 100x100, where the action is choosing a crop window of size 20x20 to attend to in the image. AttentionDiscretization allows discretizing the possible crop @@ -219,7 +219,7 @@ windows to choose into a finite number of options, and map a discrete action spa ../../_images/attention_discretization.png

                                    -class rl_coach.filters.action.BoxDiscretization(num_bins_per_dimension: Union[int, List[int]], force_int_bins=False)[source]
                                    +class rl_coach.filters.action.BoxDiscretization(num_bins_per_dimension: Union[int, List[int]], force_int_bins=False)[source]

                                    Discretizes a continuous action space into a discrete action space, allowing the usage of agents such as DQN for continuous environments such as MuJoCo. Given the number of bins to discretize into, the original continuous action space is uniformly separated into the given number of bins, each mapped to a discrete @@ -242,7 +242,7 @@ instead of 0, 2.5, 5, 7.5, 10.
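
For instance, a sketch of discretizing each action dimension into 5 bins (direct construction of the filter is shown only for illustration):

```python
from rl_coach.filters.action import BoxDiscretization

# Discretize each continuous action dimension into 5 uniformly spaced bins,
# e.g. a [0, 10] range yields the candidate actions 0, 2.5, 5, 7.5, 10.
# With force_int_bins=True the bin values are rounded to integers instead.
discretization = BoxDiscretization(num_bins_per_dimension=5, force_int_bins=False)
```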

                                    ../../_images/box_discretization.png
                                    -class rl_coach.filters.action.BoxMasking(masked_target_space_low: Union[None, int, float, numpy.ndarray], masked_target_space_high: Union[None, int, float, numpy.ndarray])[source]
                                    +class rl_coach.filters.action.BoxMasking(masked_target_space_low: Union[None, int, float, numpy.ndarray], masked_target_space_high: Union[None, int, float, numpy.ndarray])[source]

Masks part of the action space to force the agent to work in a defined sub-space. For example, if the original action space is between -1 and 1, then this filter can be used to constrain the agent's actions to the range 0 to 1 instead. This essentially masks the range -1 to 0 from the agent. @@ -260,7 +260,7 @@ The resulting action space will be shifted and will always start from 0 and have ../../_images/box_masking.png

                                    -class rl_coach.filters.action.PartialDiscreteActionSpaceMap(target_actions: List[Union[int, float, numpy.ndarray, List]] = None, descriptions: List[str] = None)[source]
                                    +class rl_coach.filters.action.PartialDiscreteActionSpaceMap(target_actions: List[Union[int, float, numpy.ndarray, List]] = None, descriptions: List[str] = None)[source]

                                    Partial map of two countable action spaces. For example, consider an environment with a MultiSelect action space (select multiple actions at the same time, such as jump and go right), with 8 actual MultiSelect actions. If we want the agent to be able to select only 5 of those actions by their index (0-4), we can @@ -279,7 +279,7 @@ use regular discrete actions, and mask 3 of the actions from the agent.

                                    ../../_images/partial_discrete_action_space_map.png
                                    -class rl_coach.filters.action.FullDiscreteActionSpaceMap[source]
                                    +class rl_coach.filters.action.FullDiscreteActionSpaceMap[source]

                                    Full map of two countable action spaces. This works in a similar way to the PartialDiscreteActionSpaceMap, but maps the entire source action space into the entire target action space, without masking any actions. @@ -290,7 +290,7 @@ multiselect actions.

                                    ../../_images/full_discrete_action_space_map.png
                                    -class rl_coach.filters.action.LinearBoxToBoxMap(input_space_low: Union[None, int, float, numpy.ndarray], input_space_high: Union[None, int, float, numpy.ndarray])[source]
                                    +class rl_coach.filters.action.LinearBoxToBoxMap(input_space_low: Union[None, int, float, numpy.ndarray], input_space_high: Union[None, int, float, numpy.ndarray])[source]

                                    A linear mapping of two box action spaces. For example, if the action space of the environment consists of continuous actions between 0 and 1, and we want the agent to choose actions between -1 and 1, the LinearBoxToBoxMap can be used to map the range -1 and 1 to the range 0 and 1 in a linear way. This means that the diff --git a/docs/components/memories/index.html b/docs/components/memories/index.html index c0d828f..a46a988 100644 --- a/docs/components/memories/index.html +++ b/docs/components/memories/index.html @@ -209,7 +209,7 @@

                                    EpisodicExperienceReplay

                                    -class rl_coach.memories.episodic.EpisodicExperienceReplay(max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int] = (<MemoryGranularity.Transitions: 0>, 1000000), n_step=-1, train_to_eval_ratio: int = 1)[source]
                                    +class rl_coach.memories.episodic.EpisodicExperienceReplay(max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int] = (<MemoryGranularity.Transitions: 0>, 1000000), n_step=-1, train_to_eval_ratio: int = 1)[source]

                                    A replay buffer that stores episodes of transitions. The additional structure allows performing various calculations of total return and other values that depend on the sequential behavior of the transitions in the episode.

                                    @@ -225,7 +225,7 @@ in the episode.

                                    EpisodicHindsightExperienceReplay

                                    -class rl_coach.memories.episodic.EpisodicHindsightExperienceReplay(max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], hindsight_transitions_per_regular_transition: int, hindsight_goal_selection_method: rl_coach.memories.episodic.episodic_hindsight_experience_replay.HindsightGoalSelectionMethod, goals_space: rl_coach.spaces.GoalsSpace)[source]
                                    +class rl_coach.memories.episodic.EpisodicHindsightExperienceReplay(max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], hindsight_transitions_per_regular_transition: int, hindsight_goal_selection_method: rl_coach.memories.episodic.episodic_hindsight_experience_replay.HindsightGoalSelectionMethod, goals_space: rl_coach.spaces.GoalsSpace)[source]

                                    Implements Hindsight Experience Replay as described in the following paper: https://arxiv.org/pdf/1707.01495.pdf

                                    Parameters
                                    @@ -246,7 +246,7 @@ hindsight transitions. Should be one of HindsightGoalSelectionMethod

                                    EpisodicHRLHindsightExperienceReplay

                                    -class rl_coach.memories.episodic.EpisodicHRLHindsightExperienceReplay(max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], hindsight_transitions_per_regular_transition: int, hindsight_goal_selection_method: rl_coach.memories.episodic.episodic_hindsight_experience_replay.HindsightGoalSelectionMethod, goals_space: rl_coach.spaces.GoalsSpace)[source]
                                    +class rl_coach.memories.episodic.EpisodicHRLHindsightExperienceReplay(max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], hindsight_transitions_per_regular_transition: int, hindsight_goal_selection_method: rl_coach.memories.episodic.episodic_hindsight_experience_replay.HindsightGoalSelectionMethod, goals_space: rl_coach.spaces.GoalsSpace)[source]

                                    Implements HRL Hindsight Experience Replay as described in the following paper: https://arxiv.org/abs/1805.08180

                                    This is the memory you should use if you want a shared hindsight experience replay buffer between multiple workers

                                    @@ -269,7 +269,7 @@ hindsight transitions. Should be one of HindsightGoalSelectionMethod

                                    SingleEpisodeBuffer

                                    -class rl_coach.memories.episodic.SingleEpisodeBuffer[source]
                                    +class rl_coach.memories.episodic.SingleEpisodeBuffer[source]
                                    @@ -280,7 +280,7 @@ hindsight transitions. Should be one of HindsightGoalSelectionMethod

                                    BalancedExperienceReplay

                                    -class rl_coach.memories.non_episodic.BalancedExperienceReplay(max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], allow_duplicates_in_batch_sampling: bool = True, num_classes: int = 0, state_key_with_the_class_index: Any = 'class')[source]
                                    +class rl_coach.memories.non_episodic.BalancedExperienceReplay(max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], allow_duplicates_in_batch_sampling: bool = True, num_classes: int = 0, state_key_with_the_class_index: Any = 'class')[source]
                                    Parameters
                                      @@ -299,7 +299,7 @@ this parameter determines the key to retrieve the class index value

                                      QDND

                                      -class rl_coach.memories.non_episodic.QDND(dict_size, key_width, num_actions, new_value_shift_coefficient=0.1, key_error_threshold=0.01, learning_rate=0.01, num_neighbors=50, return_additional_data=False, override_existing_keys=False, rebuild_on_every_update=False)[source]
                                      +class rl_coach.memories.non_episodic.QDND(dict_size, key_width, num_actions, new_value_shift_coefficient=0.1, key_error_threshold=0.01, learning_rate=0.01, num_neighbors=50, return_additional_data=False, override_existing_keys=False, rebuild_on_every_update=False)[source]
                                      @@ -307,7 +307,7 @@ this parameter determines the key to retrieve the class index value

                                      ExperienceReplay

                                      -class rl_coach.memories.non_episodic.ExperienceReplay(max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], allow_duplicates_in_batch_sampling: bool = True)[source]
                                      +class rl_coach.memories.non_episodic.ExperienceReplay(max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], allow_duplicates_in_batch_sampling: bool = True)[source]

A regular replay buffer which stores transitions without any additional structure

                                      Parameters
                                      @@ -324,7 +324,7 @@ this parameter determines the key to retrieve the class index value
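
A sketch of constructing the buffer with the documented max_size tuple, assuming the import paths follow the class paths shown in the signatures:

```python
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.memories.non_episodic import ExperienceReplay

# A flat replay buffer capped at 1M transitions
memory = ExperienceReplay(max_size=(MemoryGranularity.Transitions, 1000000),
                          allow_duplicates_in_batch_sampling=True)
```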

                                      PrioritizedExperienceReplay

                                      -class rl_coach.memories.non_episodic.PrioritizedExperienceReplay(max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], alpha: float = 0.6, beta: rl_coach.schedules.Schedule = <rl_coach.schedules.ConstantSchedule object>, epsilon: float = 1e-06, allow_duplicates_in_batch_sampling: bool = True)[source]
                                      +class rl_coach.memories.non_episodic.PrioritizedExperienceReplay(max_size: Tuple[rl_coach.memories.memory.MemoryGranularity, int], alpha: float = 0.6, beta: rl_coach.schedules.Schedule = <rl_coach.schedules.ConstantSchedule object>, epsilon: float = 1e-06, allow_duplicates_in_batch_sampling: bool = True)[source]

                                      This is the proportional sampling variant of the prioritized experience replay as described in https://arxiv.org/pdf/1511.05952.pdf.

                                      @@ -345,7 +345,7 @@ in htt

                                      TransitionCollection

                                      -class rl_coach.memories.non_episodic.TransitionCollection[source]
                                      +class rl_coach.memories.non_episodic.TransitionCollection[source]

A simple Python implementation of a transition collection that non-episodic memories are constructed on top of.

                                      diff --git a/docs/components/memory_backends/index.html b/docs/components/memory_backends/index.html index 01ac55b..6832296 100644 --- a/docs/components/memory_backends/index.html +++ b/docs/components/memory_backends/index.html @@ -193,7 +193,7 @@

                                      RedisPubSubBackend

                                      -class rl_coach.memories.backend.redis.RedisPubSubBackend(params: rl_coach.memories.backend.redis.RedisPubSubMemoryBackendParameters)[source]
                                      +class rl_coach.memories.backend.redis.RedisPubSubBackend(params: rl_coach.memories.backend.redis.RedisPubSubMemoryBackendParameters)[source]

A memory backend which transfers experiences from the rollout workers to the training worker using Redis Pub/Sub when Coach is used in distributed mode.

                                      diff --git a/docs/components/orchestrators/index.html b/docs/components/orchestrators/index.html index e31a799..1536ec3 100644 --- a/docs/components/orchestrators/index.html +++ b/docs/components/orchestrators/index.html @@ -193,7 +193,7 @@

                                      Kubernetes

                                      -class rl_coach.orchestrators.kubernetes_orchestrator.Kubernetes(params: rl_coach.orchestrators.kubernetes_orchestrator.KubernetesParameters)[source]
                                      +class rl_coach.orchestrators.kubernetes_orchestrator.Kubernetes(params: rl_coach.orchestrators.kubernetes_orchestrator.KubernetesParameters)[source]

An orchestrator implementation which uses Kubernetes to deploy components such as the training and rollout workers and Redis Pub/Sub when Coach is used in distributed mode.

                                      diff --git a/docs/components/spaces.html b/docs/components/spaces.html index 4f4b278..a62653c 100644 --- a/docs/components/spaces.html +++ b/docs/components/spaces.html @@ -208,7 +208,7 @@

                                      Space

                                      -class rl_coach.spaces.Space(shape: Union[int, tuple, list, numpy.ndarray], low: Union[None, int, float, numpy.ndarray] = -inf, high: Union[None, int, float, numpy.ndarray] = inf)[source]
                                      +class rl_coach.spaces.Space(shape: Union[int, tuple, list, numpy.ndarray], low: Union[None, int, float, numpy.ndarray] = -inf, high: Union[None, int, float, numpy.ndarray] = inf)[source]

                                      A space defines a set of valid values

                                      Parameters
                                      @@ -223,7 +223,7 @@ or a single value defining the general highest values

                                      -contains(val: Union[int, float, numpy.ndarray]) → bool[source]
                                      +contains(val: Union[int, float, numpy.ndarray]) → bool[source]

                                      Checks if value is contained by this space. The shape must match and all of the values must be within the low and high bounds.

                                      @@ -238,7 +238,7 @@ all of the values must be within the low and high bounds.

                                      -is_valid_index(index: numpy.ndarray) → bool[source]
                                      +is_valid_index(index: numpy.ndarray) → bool[source]

                                      Checks if a given multidimensional index is within the bounds of the shape of the space

                                      Parameters
                                      @@ -252,7 +252,7 @@ all of the values must be within the low and high bounds.

                                      -sample() → numpy.ndarray[source]
                                      +sample() → numpy.ndarray[source]

Sample the defined space: uniformly if space bounds are defined, or normally distributed if no bounds are defined

                                      @@ -269,10 +269,10 @@ bounds are defined

                                      Observation Spaces

                                      -class rl_coach.spaces.ObservationSpace(shape: Union[int, numpy.ndarray], low: Union[None, int, float, numpy.ndarray] = -inf, high: Union[None, int, float, numpy.ndarray] = inf)[source]
                                      +class rl_coach.spaces.ObservationSpace(shape: Union[int, numpy.ndarray], low: Union[None, int, float, numpy.ndarray] = -inf, high: Union[None, int, float, numpy.ndarray] = inf)[source]
                                      -contains(val: Union[int, float, numpy.ndarray]) → bool
                                      +contains(val: Union[int, float, numpy.ndarray]) → bool

                                      Checks if value is contained by this space. The shape must match and all of the values must be within the low and high bounds.

                                      @@ -287,7 +287,7 @@ all of the values must be within the low and high bounds.

                                      -is_valid_index(index: numpy.ndarray) → bool
                                      +is_valid_index(index: numpy.ndarray) → bool

                                      Checks if a given multidimensional index is within the bounds of the shape of the space

                                      Parameters
                                      @@ -301,7 +301,7 @@ all of the values must be within the low and high bounds.

                                      -sample() → numpy.ndarray
                                      +sample() → numpy.ndarray

Sample the defined space: uniformly if space bounds are defined, or normally distributed if no bounds are defined

                                      @@ -317,7 +317,7 @@ bounds are defined

                                      VectorObservationSpace

                                      -class rl_coach.spaces.VectorObservationSpace(shape: int, low: Union[None, int, float, numpy.ndarray] = -inf, high: Union[None, int, float, numpy.ndarray] = inf, measurements_names: List[str] = None)[source]
                                      +class rl_coach.spaces.VectorObservationSpace(shape: int, low: Union[None, int, float, numpy.ndarray] = -inf, high: Union[None, int, float, numpy.ndarray] = inf, measurements_names: List[str] = None)[source]

                                      An observation space which is defined as a vector of elements. This can be particularly useful for environments which return measurements, such as in robotic environments.

                                      @@ -327,7 +327,7 @@ which return measurements, such as in robotic environments.

                                      PlanarMapsObservationSpace

                                      -class rl_coach.spaces.PlanarMapsObservationSpace(shape: numpy.ndarray, low: int, high: int, channels_axis: int = -1)[source]
                                      +class rl_coach.spaces.PlanarMapsObservationSpace(shape: numpy.ndarray, low: int, high: int, channels_axis: int = -1)[source]

An observation space which defines a stack of 2D observations. For example, an environment may return a stack of segmentation maps, as in Starcraft.

                                      @@ -337,7 +337,7 @@ a stack of segmentation maps like in Starcraft.

                                      ImageObservationSpace

                                      -class rl_coach.spaces.ImageObservationSpace(shape: numpy.ndarray, high: int, channels_axis: int = -1)[source]
                                      +class rl_coach.spaces.ImageObservationSpace(shape: numpy.ndarray, high: int, channels_axis: int = -1)[source]

An observation space which is a special case of the PlanarMapsObservationSpace, where the stack of 2D observations represents an RGB image or a grayscale image.

                                      @@ -348,10 +348,10 @@ represent a RGB image, or a grayscale image.

                                      Action Spaces

                                      -class rl_coach.spaces.ActionSpace(shape: Union[int, numpy.ndarray], low: Union[None, int, float, numpy.ndarray] = -inf, high: Union[None, int, float, numpy.ndarray] = inf, descriptions: Union[None, List, Dict] = None, default_action: Union[int, float, numpy.ndarray, List] = None)[source]
                                      +class rl_coach.spaces.ActionSpace(shape: Union[int, numpy.ndarray], low: Union[None, int, float, numpy.ndarray] = -inf, high: Union[None, int, float, numpy.ndarray] = inf, descriptions: Union[None, List, Dict] = None, default_action: Union[int, float, numpy.ndarray, List] = None)[source]
                                      -clip_action_to_space(action: Union[int, float, numpy.ndarray, List]) → Union[int, float, numpy.ndarray, List][source]
                                      +clip_action_to_space(action: Union[int, float, numpy.ndarray, List]) → Union[int, float, numpy.ndarray, List][source]

Given an action, clip its values to fit the action space ranges

                                      Parameters
                                      @@ -365,7 +365,7 @@ represent a RGB image, or a grayscale image.

                                      -contains(val: Union[int, float, numpy.ndarray]) → bool
                                      +contains(val: Union[int, float, numpy.ndarray]) → bool

                                      Checks if value is contained by this space. The shape must match and all of the values must be within the low and high bounds.

                                      @@ -380,7 +380,7 @@ all of the values must be within the low and high bounds.

                                      -is_valid_index(index: numpy.ndarray) → bool
                                      +is_valid_index(index: numpy.ndarray) → bool

                                      Checks if a given multidimensional index is within the bounds of the shape of the space

                                      Parameters
                                      @@ -394,7 +394,7 @@ all of the values must be within the low and high bounds.

                                      -sample() → numpy.ndarray
                                      +sample() → numpy.ndarray

Sample the defined space: uniformly if space bounds are defined, or normally distributed if no bounds are defined

                                      @@ -406,7 +406,7 @@ bounds are defined

                                      -sample_with_info() → rl_coach.core_types.ActionInfo[source]
                                      +sample_with_info() → rl_coach.core_types.ActionInfo[source]

                                      Get a random action with additional “fake” info

                                      Returns
                                      @@ -421,7 +421,7 @@ bounds are defined

                                      AttentionActionSpace

                                      -class rl_coach.spaces.AttentionActionSpace(shape: int, low: Union[None, int, float, numpy.ndarray] = -inf, high: Union[None, int, float, numpy.ndarray] = inf, descriptions: Union[None, List, Dict] = None, default_action: numpy.ndarray = None, forced_attention_size: Union[None, int, float, numpy.ndarray] = None)[source]
                                      +class rl_coach.spaces.AttentionActionSpace(shape: int, low: Union[None, int, float, numpy.ndarray] = -inf, high: Union[None, int, float, numpy.ndarray] = inf, descriptions: Union[None, List, Dict] = None, default_action: numpy.ndarray = None, forced_attention_size: Union[None, int, float, numpy.ndarray] = None)[source]

                                      A box selection continuous action space, meaning that the actions are defined as selecting a multidimensional box from a given range. The actions will be in the form: @@ -433,7 +433,7 @@ The actions will be in the form:

                                      BoxActionSpace

                                      -class rl_coach.spaces.BoxActionSpace(shape: Union[int, numpy.ndarray], low: Union[None, int, float, numpy.ndarray] = -inf, high: Union[None, int, float, numpy.ndarray] = inf, descriptions: Union[None, List, Dict] = None, default_action: numpy.ndarray = None)[source]
                                      +class rl_coach.spaces.BoxActionSpace(shape: Union[int, numpy.ndarray], low: Union[None, int, float, numpy.ndarray] = -inf, high: Union[None, int, float, numpy.ndarray] = inf, descriptions: Union[None, List, Dict] = None, default_action: numpy.ndarray = None)[source]

                                      A multidimensional bounded or unbounded continuous action space

                                      @@ -442,7 +442,7 @@ The actions will be in the form:

                                      DiscreteActionSpace

                                      -class rl_coach.spaces.DiscreteActionSpace(num_actions: int, descriptions: Union[None, List, Dict] = None, default_action: numpy.ndarray = None)[source]
                                      +class rl_coach.spaces.DiscreteActionSpace(num_actions: int, descriptions: Union[None, List, Dict] = None, default_action: numpy.ndarray = None)[source]

                                      A discrete action space with action indices as actions
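
A short sketch tying together the methods documented above (sample, contains and clip_action_to_space), assuming the classes are imported from rl_coach.spaces as the signatures indicate:

```python
import numpy as np
from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace

# A 2D continuous action space bounded in [-1, 1]
box_space = BoxActionSpace(shape=2, low=-1.0, high=1.0)
action = box_space.sample()                             # random action inside the bounds
box_space.contains(action)                              # -> True (sampled inside the bounds)
box_space.clip_action_to_space(np.array([2.0, -3.0]))   # -> clipped to [ 1. -1.]

# A discrete action space with 4 action indices (0..3)
discrete_space = DiscreteActionSpace(num_actions=4)
discrete_space.sample()                                 # -> an action index in [0, 3]
```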

                                      @@ -451,7 +451,7 @@ The actions will be in the form:

                                      MultiSelectActionSpace

                                      -class rl_coach.spaces.MultiSelectActionSpace(size: int, max_simultaneous_selected_actions: int = 1, descriptions: Union[None, List, Dict] = None, default_action: numpy.ndarray = None, allow_no_action_to_be_selected=True)[source]
                                      +class rl_coach.spaces.MultiSelectActionSpace(size: int, max_simultaneous_selected_actions: int = 1, descriptions: Union[None, List, Dict] = None, default_action: numpy.ndarray = None, allow_no_action_to_be_selected=True)[source]

                                      A discrete action space where multiple actions can be selected at once. The actions are encoded as multi-hot vectors

                                      @@ -460,7 +460,7 @@ The actions will be in the form:

                                      CompoundActionSpace

                                      -class rl_coach.spaces.CompoundActionSpace(sub_spaces: List[rl_coach.spaces.ActionSpace])[source]
                                      +class rl_coach.spaces.CompoundActionSpace(sub_spaces: List[rl_coach.spaces.ActionSpace])[source]

                                      An action space which consists of multiple sub-action spaces. For example, in Starcraft the agent should choose an action identifier from ~550 options (Discrete(550)), but it also needs to choose 13 different arguments for the selected action identifier, where each argument is @@ -473,7 +473,7 @@ by itself an action space. In Starcraft, the arguments are Discrete action space

                                      Goal Spaces

                                      -class rl_coach.spaces.GoalsSpace(goal_name: str, reward_type: rl_coach.spaces.GoalToRewardConversion, distance_metric: Union[rl_coach.spaces.GoalsSpace.DistanceMetric, Callable])[source]
                                      +class rl_coach.spaces.GoalsSpace(goal_name: str, reward_type: rl_coach.spaces.GoalToRewardConversion, distance_metric: Union[rl_coach.spaces.GoalsSpace.DistanceMetric, Callable])[source]

                                      A multidimensional space with a goal type definition. It also behaves as an action space, so that hierarchical agents can use it as an output action space. The class acts as a wrapper to the target space. So after setting the target space, all the values of the class @@ -491,13 +491,13 @@ returns the distance between them

                                      -class DistanceMetric[source]
                                      +class DistanceMetric[source]

                                      An enumeration.

                                      -clip_action_to_space(action: Union[int, float, numpy.ndarray, List]) → Union[int, float, numpy.ndarray, List]
                                      +clip_action_to_space(action: Union[int, float, numpy.ndarray, List]) → Union[int, float, numpy.ndarray, List]

Given an action, clip its values to fit the action space ranges

                                      Parameters
                                      @@ -511,7 +511,7 @@ returns the distance between them

                                      -contains(val: Union[int, float, numpy.ndarray]) → bool
                                      +contains(val: Union[int, float, numpy.ndarray]) → bool

                                      Checks if value is contained by this space. The shape must match and all of the values must be within the low and high bounds.

                                      @@ -526,7 +526,7 @@ all of the values must be within the low and high bounds.

                                      -distance_from_goal(goal: numpy.ndarray, state: dict) → float[source]
                                      +distance_from_goal(goal: numpy.ndarray, state: dict) → float[source]

                                      Given a state, check its distance from the goal

                                      Parameters
                                      @@ -543,7 +543,7 @@ all of the values must be within the low and high bounds.

                                      -get_reward_for_goal_and_state(goal: numpy.ndarray, state: dict) → Tuple[float, bool][source]
                                      +get_reward_for_goal_and_state(goal: numpy.ndarray, state: dict) → Tuple[float, bool][source]

                                      Given a state, check if the goal was reached and return a reward accordingly

                                      Parameters
                                      @@ -560,7 +560,7 @@ all of the values must be within the low and high bounds.

                                      -goal_from_state(state: Dict)[source]
                                      +goal_from_state(state: Dict)[source]

                                      Given a state, extract an observation according to the goal_name

                                      Parameters
                                      @@ -574,7 +574,7 @@ all of the values must be within the low and high bounds.

                                      -is_valid_index(index: numpy.ndarray) → bool
                                      +is_valid_index(index: numpy.ndarray) → bool

                                      Checks if a given multidimensional index is within the bounds of the shape of the space

                                      Parameters
                                      @@ -588,7 +588,7 @@ all of the values must be within the low and high bounds.

                                      -sample() → numpy.ndarray
                                      +sample() → numpy.ndarray

Sample the defined space: uniformly if space bounds are defined, or normally distributed if no bounds are defined

                                      @@ -600,7 +600,7 @@ bounds are defined

                                      -sample_with_info() → rl_coach.core_types.ActionInfo
                                      +sample_with_info() → rl_coach.core_types.ActionInfo

                                      Get a random action with additional “fake” info

                                      Returns
                                      diff --git a/docs/genindex.html b/docs/genindex.html index 23d1b6e..28d3d71 100644 --- a/docs/genindex.html +++ b/docs/genindex.html @@ -221,7 +221,7 @@
                                    • (rl_coach.agents.dqn_agent.DQNAgent method)
                                    -
                                  • action_space (rl_coach.environments.environment.Environment attribute) +
                                  • action_space() (rl_coach.environments.environment.Environment property)
                                  • ActionInfo (class in rl_coach.core_types)
                                  • @@ -408,6 +408,14 @@

                                    F

                                    + @@ -537,7 +551,7 @@

                                    L

                                    - + @@ -835,7 +855,7 @@
                                  • SingleEpisodeBuffer (class in rl_coach.memories.episodic)
                                  • -
                                  • size (rl_coach.core_types.Batch attribute) +
                                  • size() (rl_coach.core_types.Batch property)
                                  • slice() (rl_coach.core_types.Batch method)
                                  • @@ -845,7 +865,7 @@
                                  • StarCraft2Environment (class in rl_coach.environments.starcraft2_environment)
                                  • -
                                  • state_space (rl_coach.environments.environment.Environment attribute) +
                                  • state_space() (rl_coach.environments.environment.Environment property)
                                  • states() (rl_coach.core_types.Batch method)
                                  • @@ -866,6 +886,8 @@
                                    -
                                    -
                                  • train_and_sync_networks() (rl_coach.architectures.network_wrapper.NetworkWrapper method) -
                                    • +
                                    • train_and_sync_networks() (rl_coach.architectures.network_wrapper.NetworkWrapper method) +
                                    • train_on_batch() (rl_coach.architectures.architecture.Architecture method)
                                    • Transition (class in rl_coach.core_types) diff --git a/docs/objects.inv b/docs/objects.inv index df98ef0..c239778 100644 Binary files a/docs/objects.inv and b/docs/objects.inv differ diff --git a/docs/searchindex.js b/docs/searchindex.js index 6d20f79..35fd839 100644 --- a/docs/searchindex.js +++ b/docs/searchindex.js @@ -1 +1 @@ -Search.setIndex({docnames:["components/additional_parameters","components/agents/imitation/bc","components/agents/imitation/cil","components/agents/index","components/agents/other/dfp","components/agents/policy_optimization/ac","components/agents/policy_optimization/acer","components/agents/policy_optimization/cppo","components/agents/policy_optimization/ddpg","components/agents/policy_optimization/hac","components/agents/policy_optimization/pg","components/agents/policy_optimization/ppo","components/agents/policy_optimization/sac","components/agents/value_optimization/bs_dqn","components/agents/value_optimization/categorical_dqn","components/agents/value_optimization/double_dqn","components/agents/value_optimization/dqn","components/agents/value_optimization/dueling_dqn","components/agents/value_optimization/mmc","components/agents/value_optimization/n_step","components/agents/value_optimization/naf","components/agents/value_optimization/nec","components/agents/value_optimization/pal","components/agents/value_optimization/qr_dqn","components/agents/value_optimization/rainbow","components/architectures/index","components/core_types","components/data_stores/index","components/environments/index","components/exploration_policies/index","components/filters/index","components/filters/input_filters","components/filters/output_filters","components/memories/index","components/memory_backends/index","components/orchestrators/index","components/spaces","contributing/add_agent","contributing/add_env","dashboard","design/control_flow","design/horizontal_scaling","design/network","dist_usage","features/algorithms","features/benchmarks","features/environments","features/index","index","selecting_an_algorithm","test","usage"],envversion:{"sphinx.domains.c":1,"sphinx.domains.changeset":1,"sphinx.domains.cpp":1,"sphinx.domains.javascript":1,"sphinx.domains.math":2,"sphinx.domains.python":1,"sphinx.domains.rst":1,"sphinx.domains.std":1,"sphinx.ext.todo":1,"sphinx.ext.viewcode":1,sphinx:56},filenames:["components/additional_parameters.rst","components/agents/imitation/bc.rst","components/agents/imitation/cil.rst","components/agents/index.rst","components/agents/other/dfp.rst","components/agents/policy_optimization/ac.rst","components/agents/policy_optimization/acer.rst","components/agents/policy_optimization/cppo.rst","components/agents/policy_optimization/ddpg.rst","components/agents/policy_optimization/hac.rst","components/agents/policy_optimization/pg.rst","components/agents/policy_optimization/ppo.rst","components/agents/policy_optimization/sac.rst","components/agents/value_optimization/bs_dqn.rst","components/agents/value_optimization/categorical_dqn.rst","components/agents/value_optimization/double_dqn.rst","components/agents/value_optimization/dqn.rst","components/agents/value_optimization/dueling_dqn.rst","components/agents/value_optimization/mmc.rst","components/agents/value_optimization/n_step.rst","components/agents/value_optimization/naf.rst","components/agents/value_optimization/nec.rst","components/agents/value_optimization/pal.rst","components/agents/value_optimization/qr_dqn.rst","components/agents/value_optimization/rainbow.rst
","components/architectures/index.rst","components/core_types.rst","components/data_stores/index.rst","components/environments/index.rst","components/exploration_policies/index.rst","components/filters/index.rst","components/filters/input_filters.rst","components/filters/output_filters.rst","components/memories/index.rst","components/memory_backends/index.rst","components/orchestrators/index.rst","components/spaces.rst","contributing/add_agent.rst","contributing/add_env.rst","dashboard.rst","design/control_flow.rst","design/horizontal_scaling.rst","design/network.rst","dist_usage.rst","features/algorithms.rst","features/benchmarks.rst","features/environments.rst","features/index.rst","index.rst","selecting_an_algorithm.rst","test.rst","usage.rst"],objects:{"rl_coach.agents.acer_agent":{ACERAlgorithmParameters:[6,0,1,""]},"rl_coach.agents.actor_critic_agent":{ActorCriticAlgorithmParameters:[5,0,1,""]},"rl_coach.agents.agent":{Agent:[3,0,1,""]},"rl_coach.agents.agent.Agent":{act:[3,1,1,""],call_memory:[3,1,1,""],choose_action:[3,1,1,""],collect_savers:[3,1,1,""],create_networks:[3,1,1,""],get_predictions:[3,1,1,""],get_state_embedding:[3,1,1,""],handle_episode_ended:[3,1,1,""],init_environment_dependent_modules:[3,1,1,""],learn_from_batch:[3,1,1,""],log_to_screen:[3,1,1,""],observe:[3,1,1,""],parent:[3,2,1,""],phase:[3,2,1,""],post_training_commands:[3,1,1,""],prepare_batch_for_inference:[3,1,1,""],register_signal:[3,1,1,""],reset_evaluation_state:[3,1,1,""],reset_internal_state:[3,1,1,""],restore_checkpoint:[3,1,1,""],run_off_policy_evaluation:[3,1,1,""],run_pre_network_filter_for_inference:[3,1,1,""],save_checkpoint:[3,1,1,""],set_environment_parameters:[3,1,1,""],set_incoming_directive:[3,1,1,""],set_session:[3,1,1,""],setup_logger:[3,1,1,""],sync:[3,1,1,""],train:[3,1,1,""],update_log:[3,1,1,""],update_step_in_episode_log:[3,1,1,""],update_transition_before_adding_to_replay_buffer:[3,1,1,""]},"rl_coach.agents.bc_agent":{BCAlgorithmParameters:[1,0,1,""]},"rl_coach.agents.categorical_dqn_agent":{CategoricalDQNAlgorithmParameters:[14,0,1,""]},"rl_coach.agents.cil_agent":{CILAlgorithmParameters:[2,0,1,""]},"rl_coach.agents.clipped_ppo_agent":{ClippedPPOAlgorithmParameters:[7,0,1,""]},"rl_coach.agents.ddpg_agent":{DDPGAlgorithmParameters:[8,0,1,""]},"rl_coach.agents.dfp_agent":{DFPAlgorithmParameters:[4,0,1,""]},"rl_coach.agents.dqn_agent":{DQNAgent:[50,0,1,""],DQNAlgorithmParameters:[16,0,1,""]},"rl_coach.agents.dqn_agent.DQNAgent":{act:[50,1,1,""],call_memory:[50,1,1,""],choose_action:[50,1,1,""],collect_savers:[50,1,1,""],create_networks:[50,1,1,""],get_predictions:[50,1,1,""],get_state_embedding:[50,1,1,""],handle_episode_ended:[50,1,1,""],improve_reward_model:[50,1,1,""],init_environment_dependent_modules:[50,1,1,""],learn_from_batch:[50,1,1,""],log_to_screen:[50,1,1,""],observe:[50,1,1,""],parent:[50,2,1,""],phase:[50,2,1,""],post_training_commands:[50,1,1,""],prepare_batch_for_inference:[50,1,1,""],register_signal:[50,1,1,""],reset_evaluation_state:[50,1,1,""],reset_internal_state:[50,1,1,""],restore_checkpoint:[50,1,1,""],run_off_policy_evaluation:[50,1,1,""],run_pre_network_filter_for_inference:[50,1,1,""],save_checkpoint:[50,1,1,""],set_environment_parameters:[50,1,1,""],set_incoming_directive:[50,1,1,""],set_session:[50,1,1,""],setup_logger:[50,1,1,""],sync:[50,1,1,""],train:[50,1,1,""],update_log:[50,1,1,""],update_step_in_episode_log:[50,1,1,""],update_transition_before_adding_to_replay_buffer:[50,1,1,""]},"rl_coach.agents.mmc_agent":{MixedMonteCarloAlgorithmParameters:[18,0,1,""]
},"rl_coach.agents.n_step_q_agent":{NStepQAlgorithmParameters:[19,0,1,""]},"rl_coach.agents.naf_agent":{NAFAlgorithmParameters:[20,0,1,""]},"rl_coach.agents.nec_agent":{NECAlgorithmParameters:[21,0,1,""]},"rl_coach.agents.pal_agent":{PALAlgorithmParameters:[22,0,1,""]},"rl_coach.agents.policy_gradients_agent":{PolicyGradientAlgorithmParameters:[10,0,1,""]},"rl_coach.agents.ppo_agent":{PPOAlgorithmParameters:[11,0,1,""]},"rl_coach.agents.qr_dqn_agent":{QuantileRegressionDQNAlgorithmParameters:[23,0,1,""]},"rl_coach.agents.rainbow_dqn_agent":{RainbowDQNAlgorithmParameters:[24,0,1,""]},"rl_coach.agents.soft_actor_critic_agent":{SoftActorCriticAlgorithmParameters:[12,0,1,""]},"rl_coach.architectures.architecture":{Architecture:[25,0,1,""]},"rl_coach.architectures.architecture.Architecture":{accumulate_gradients:[25,1,1,""],apply_and_reset_gradients:[25,1,1,""],apply_gradients:[25,1,1,""],collect_savers:[25,1,1,""],construct:[25,3,1,""],get_variable_value:[25,1,1,""],get_weights:[25,1,1,""],parallel_predict:[25,3,1,""],predict:[25,1,1,""],reset_accumulated_gradients:[25,1,1,""],set_variable_value:[25,1,1,""],set_weights:[25,1,1,""],train_on_batch:[25,1,1,""]},"rl_coach.architectures.network_wrapper":{NetworkWrapper:[25,0,1,""]},"rl_coach.architectures.network_wrapper.NetworkWrapper":{apply_gradients_and_sync_networks:[25,1,1,""],apply_gradients_to_global_network:[25,1,1,""],apply_gradients_to_online_network:[25,1,1,""],collect_savers:[25,1,1,""],parallel_prediction:[25,1,1,""],set_is_training:[25,1,1,""],sync:[25,1,1,""],train_and_sync_networks:[25,1,1,""],update_online_network:[25,1,1,""],update_target_network:[25,1,1,""]},"rl_coach.base_parameters":{AgentParameters:[3,0,1,""],DistributedTaskParameters:[0,0,1,""],NetworkParameters:[25,0,1,""],PresetValidationParameters:[0,0,1,""],TaskParameters:[0,0,1,""],VisualizationParameters:[0,0,1,""]},"rl_coach.core_types":{ActionInfo:[26,0,1,""],Batch:[26,0,1,""],EnvResponse:[26,0,1,""],Episode:[26,0,1,""],Transition:[26,0,1,""]},"rl_coach.core_types.Batch":{actions:[26,1,1,""],game_overs:[26,1,1,""],goals:[26,1,1,""],info:[26,1,1,""],info_as_list:[26,1,1,""],n_step_discounted_rewards:[26,1,1,""],next_states:[26,1,1,""],rewards:[26,1,1,""],shuffle:[26,1,1,""],size:[26,2,1,""],slice:[26,1,1,""],states:[26,1,1,""]},"rl_coach.core_types.Episode":{get_first_transition:[26,1,1,""],get_last_transition:[26,1,1,""],get_transition:[26,1,1,""],get_transitions_attribute:[26,1,1,""],insert:[26,1,1,""],is_empty:[26,1,1,""],length:[26,1,1,""],update_discounted_rewards:[26,1,1,""]},"rl_coach.data_stores.nfs_data_store":{NFSDataStore:[27,0,1,""]},"rl_coach.data_stores.s3_data_store":{S3DataStore:[27,0,1,""]},"rl_coach.environments.carla_environment":{CarlaEnvironment:[28,0,1,""]},"rl_coach.environments.control_suite_environment":{ControlSuiteEnvironment:[28,0,1,""]},"rl_coach.environments.doom_environment":{DoomEnvironment:[28,0,1,""]},"rl_coach.environments.environment":{Environment:[28,0,1,""]},"rl_coach.environments.environment.Environment":{action_space:[28,2,1,""],close:[28,1,1,""],get_action_from_user:[28,1,1,""],get_available_keys:[28,1,1,""],get_goal:[28,1,1,""],get_random_action:[28,1,1,""],get_rendered_image:[28,1,1,""],goal_space:[28,2,1,""],handle_episode_ended:[28,1,1,""],last_env_response:[28,2,1,""],phase:[28,2,1,""],render:[28,1,1,""],reset_internal_state:[28,1,1,""],set_goal:[28,1,1,""],state_space:[28,2,1,""],step:[28,1,1,""]},"rl_coach.environments.gym_environment":{GymEnvironment:[28,0,1,""]},"rl_coach.environments.starcraft2_environment":{StarCraft2
Environment:[28,0,1,""]},"rl_coach.exploration_policies.additive_noise":{AdditiveNoise:[29,0,1,""]},"rl_coach.exploration_policies.boltzmann":{Boltzmann:[29,0,1,""]},"rl_coach.exploration_policies.bootstrapped":{Bootstrapped:[29,0,1,""]},"rl_coach.exploration_policies.categorical":{Categorical:[29,0,1,""]},"rl_coach.exploration_policies.continuous_entropy":{ContinuousEntropy:[29,0,1,""]},"rl_coach.exploration_policies.e_greedy":{EGreedy:[29,0,1,""]},"rl_coach.exploration_policies.exploration_policy":{ExplorationPolicy:[29,0,1,""]},"rl_coach.exploration_policies.exploration_policy.ExplorationPolicy":{change_phase:[29,1,1,""],get_action:[29,1,1,""],requires_action_values:[29,1,1,""],reset:[29,1,1,""]},"rl_coach.exploration_policies.greedy":{Greedy:[29,0,1,""]},"rl_coach.exploration_policies.ou_process":{OUProcess:[29,0,1,""]},"rl_coach.exploration_policies.parameter_noise":{ParameterNoise:[29,0,1,""]},"rl_coach.exploration_policies.truncated_normal":{TruncatedNormal:[29,0,1,""]},"rl_coach.exploration_policies.ucb":{UCB:[29,0,1,""]},"rl_coach.filters.action":{AttentionDiscretization:[32,0,1,""],BoxDiscretization:[32,0,1,""],BoxMasking:[32,0,1,""],FullDiscreteActionSpaceMap:[32,0,1,""],LinearBoxToBoxMap:[32,0,1,""],PartialDiscreteActionSpaceMap:[32,0,1,""]},"rl_coach.filters.observation":{ObservationClippingFilter:[31,0,1,""],ObservationCropFilter:[31,0,1,""],ObservationMoveAxisFilter:[31,0,1,""],ObservationNormalizationFilter:[31,0,1,""],ObservationRGBToYFilter:[31,0,1,""],ObservationReductionBySubPartsNameFilter:[31,0,1,""],ObservationRescaleSizeByFactorFilter:[31,0,1,""],ObservationRescaleToSizeFilter:[31,0,1,""],ObservationSqueezeFilter:[31,0,1,""],ObservationStackingFilter:[31,0,1,""],ObservationToUInt8Filter:[31,0,1,""]},"rl_coach.filters.reward":{RewardClippingFilter:[31,0,1,""],RewardNormalizationFilter:[31,0,1,""],RewardRescaleFilter:[31,0,1,""]},"rl_coach.memories.backend.redis":{RedisPubSubBackend:[34,0,1,""]},"rl_coach.memories.episodic":{EpisodicExperienceReplay:[33,0,1,""],EpisodicHRLHindsightExperienceReplay:[33,0,1,""],EpisodicHindsightExperienceReplay:[33,0,1,""],SingleEpisodeBuffer:[33,0,1,""]},"rl_coach.memories.non_episodic":{BalancedExperienceReplay:[33,0,1,""],ExperienceReplay:[33,0,1,""],PrioritizedExperienceReplay:[33,0,1,""],QDND:[33,0,1,""],TransitionCollection:[33,0,1,""]},"rl_coach.orchestrators.kubernetes_orchestrator":{Kubernetes:[35,0,1,""]},"rl_coach.spaces":{ActionSpace:[36,0,1,""],AttentionActionSpace:[36,0,1,""],BoxActionSpace:[36,0,1,""],CompoundActionSpace:[36,0,1,""],DiscreteActionSpace:[36,0,1,""],GoalsSpace:[36,0,1,""],ImageObservationSpace:[36,0,1,""],MultiSelectActionSpace:[36,0,1,""],ObservationSpace:[36,0,1,""],PlanarMapsObservationSpace:[36,0,1,""],Space:[36,0,1,""],VectorObservationSpace:[36,0,1,""]},"rl_coach.spaces.ActionSpace":{clip_action_to_space:[36,1,1,""],contains:[36,1,1,""],is_valid_index:[36,1,1,""],sample:[36,1,1,""],sample_with_info:[36,1,1,""]},"rl_coach.spaces.GoalsSpace":{DistanceMetric:[36,0,1,""],clip_action_to_space:[36,1,1,""],contains:[36,1,1,""],distance_from_goal:[36,1,1,""],get_reward_for_goal_and_state:[36,1,1,""],goal_from_state:[36,1,1,""],is_valid_index:[36,1,1,""],sample:[36,1,1,""],sample_with_info:[36,1,1,""]},"rl_coach.spaces.ObservationSpace":{contains:[36,1,1,""],is_valid_index:[36,1,1,""],sample:[36,1,1,""]},"rl_coach.spaces.Space":{contains:[36,1,1,""],is_valid_index:[36,1,1,""],sample:[36,1,1,""]}},objnames:{"0":["py","class","Python class"],"1":["py","method","Python method"],"2":["py","attribute","Python 
attribute"],"3":["py","staticmethod","Python static method"]},objtypes:{"0":"py:class","1":"py:method","2":"py:attribute","3":"py:staticmethod"},terms:{"100x100":32,"160x160":31,"1_0":[14,24],"1st":29,"20x20":32,"210x160":31,"2nd":29,"50k":40,"9_amd64":43,"abstract":[37,41],"boolean":[3,26,36,50],"break":39,"case":[0,3,5,21,25,26,29,36,49,50,51],"class":[0,1,2,3,4,5,6,7,8,10,11,12,14,16,18,19,20,21,22,23,24,25,26,27,28,29,31,32,33,34,35,36,37,38,40,44,50],"default":[0,29,51],"enum":[25,28,36],"export":[0,25,43],"final":[8,15,16,18,22,40],"float":[3,4,5,6,7,8,10,11,12,14,18,21,22,23,25,26,28,29,31,32,33,36,37,50],"function":[0,1,3,6,7,8,11,25,28,29,36,37,38,40,42,50],"import":[6,17,29,33,38,49,51],"int":[0,3,4,5,6,7,10,14,19,21,23,24,26,28,29,31,32,33,36,50],"long":42,"new":[0,3,7,8,11,12,21,22,25,26,32,40,41,48,49,50],"return":[0,3,8,10,11,13,18,21,22,24,25,26,28,29,31,33,36,37,38,40,49,50],"short":[0,40],"static":25,"super":[37,38],"switch":[0,39],"true":[0,3,4,5,6,7,8,11,12,21,22,24,25,26,28,29,32,33,36,50],"try":[4,45,49],"while":[0,5,6,8,9,10,11,12,25,28,39,42,49,51],AWS:43,Adding:[17,48],And:[38,49],But:[39,49],Doing:49,For:[0,1,2,3,4,7,10,13,14,15,16,19,21,22,25,26,28,29,30,31,32,36,37,38,40,41,42,43,45,50,51],Has:25,Its:50,NFS:[27,43],One:[23,49,51],That:39,The:[0,1,2,3,4,5,6,7,8,10,11,12,13,14,18,19,20,21,22,23,24,25,26,27,28,29,31,32,33,34,35,36,37,39,40,41,42,43,45,46,48,49,50,51],Then:[4,7,8,13,20,22],There:[7,11,25,29,30,37,38,42,51],These:[1,2,3,23,28,35,41,42,43],Use:[1,2,8,20,21],Used:29,Uses:49,Using:[8,13,15,16,43],Will:25,With:[29,48],__init__:[28,37,38],_index:[5,19],_render:38,_restart_environment_episod:38,_take_act:38,_update_st:38,a2c:49,a3c:[10,19,39,49],a_i:21,a_t:[4,5,6,8,12,13,14,15,16,18,19,20,22,24],a_valu:5,abl:[32,49],about:[3,26,40,50,51],abov:[8,12,25,40],abs:[19,33],absolut:29,acceler:20,accept:28,access:[25,37,43],accord:[0,3,4,5,6,8,12,13,19,25,26,29,36,39,40,42,50],accordingli:[21,36,40,51],account:[4,7,11,21,22,29],accumul:[3,4,5,6,10,19,21,24,25,31,49,50],accumulate_gradi:25,accumulated_gradi:25,accur:49,acer:[3,49],acer_ag:6,aceralgorithmparamet:6,achiev:[0,4,7,28,31,33,36,45,49,51],acquir:12,across:[10,18,39],act:[3,4,8,13,23,36,37,40,50],action:[1,2,3,14,15,16,17,18,19,22,23,24,25,26,28,29,30,33,37,38,40,42,50],action_idx:38,action_penalti:8,action_spac:[28,29],action_space_s:25,action_valu:[26,29],actioninfo:[3,36,40,50],actionspac:[29,36],actiontyp:38,activ:[8,25],actor:[3,6,7,8,11,29,42,49],actor_critic_ag:5,actorcriticag:37,actorcriticalgorithmparamet:5,actual:[4,5,14,15,16,23,24,29,32,33],adam:[7,25],adam_optimizer_beta1:25,adam_optimizer_beta2:25,adapt:[7,11],add:[8,9,20,26,29,31,38,40,43],add_rendered_image_to_env_respons:0,added:[0,4,6,7,10,11,21,29,33,37],adding:[3,11,29,37,50],addit:[3,25,26,28,29,31,33,36,38,39,40,42,48,49,50],addition:[25,28,31,37,38,40,45,46,51],additional_fetch:25,additional_simulator_paramet:[28,38],additionali:39,additive_nois:29,additivenoiseparamet:29,advanc:[24,48],advantag:[3,5,7,11,17,29],affect:[0,13,25],aforement:[15,16,22],after:[0,3,8,10,11,12,19,20,22,24,25,26,28,31,36,50,51],again:29,against:3,agent:[0,1,2,4,5,6,7,8,10,11,12,14,16,18,19,20,21,22,23,24,25,26,28,29,30,31,32,36,38,39,42,44,45,48,49,50],agent_param:41,agent_paramet:[3,25,50],agentparamet:[3,25,37],aggreg:40,ahead:[4,49],aim:29,algorithm:[3,26,29,37,39,40,41,45,47,48,50],algorithmparamet:[3,37],all:[0,3,10,13,21,22,25,26,28,29,31,32,36,37,38,39,40,41,42,43,46,50,51],all_action_prob:26,allow:[0,3,4,17,25,26,28,29,30,31,32,33,39,40,41,42,48,49,50
,51],allow_brak:28,allow_duplicates_in_batch_sampl:33,allow_no_action_to_be_select:36,along:[21,28,29,46],alpha:[6,18,22,33],alreadi:[21,26,38,49],also:[5,6,7,21,22,25,28,36,37,39,45,49,51],altern:[28,38,46],alwai:[25,29,32],amazon:43,amazonaw:43,amount:[8,10,18,22,29,40,49],analysi:39,analyz:39,ani:[3,25,26,28,32,33,37,40,41,42,43,50],anoth:[3,17,25,30,50],answer:49,api:[28,42,46,48],appear:[3,50],appli:[0,3,5,8,10,19,25,26,29,31,49,50],applic:49,apply_and_reset_gradi:25,apply_gradi:25,apply_gradients_and_sync_network:25,apply_gradients_every_x_episod:[5,10,19],apply_gradients_to_global_network:25,apply_gradients_to_online_network:25,apply_stop_condit:0,appropri:43,approx:[8,12],approxim:[12,42,49],apt:43,arbitrari:31,architectur:[3,17,37,48,50],architecture_num_q_head:29,area:32,arg:[3,25,43,50],argmax_a:[15,18,22],argument:[3,14,24,25,28,36,40,50],around:[25,26,42],arrai:[3,25,26,28,31,36,38,50],art:[3,44],artifact:43,artifici:33,arxiv:[19,33],aspect:[29,31,39],assign:[0,2,5,6,25,29],assign_kl_coeffici:25,assign_op:25,assum:[26,29,31,33,49],async:[25,41],async_train:25,asynchron:[5,19,25],atari:[16,28,31,43,51],atari_a3c:51,atari_dqn:51,ath:17,atom:[14,23,24],attach:28,attend:32,attent:32,attentionactionspac:32,attentiondiscret:32,attribut:26,attribute_nam:26,author:[28,45,46],auto_select_all_armi:28,autoclean:43,automat:[25,51],autonom:[28,46,48],autoremov:43,auxiliari:[28,46],avail:[4,25,26,28,39,41,43,48,49,51],averag:[6,7,11,25,39,40],avg:6,aws:43,axes:[31,39],axi:[31,39],axis_origin:31,axis_target:31,back:[7,41],backend:[25,41,43,48,51],background:51,backpropag:21,backward:25,balanc:2,band:39,bar:6,base1:43,base64:43,base:[7,11,12,18,20,22,28,33,37,40,43,46,49,50],base_paramet:[0,3,25,28,29],baselin:49,basic:[10,26,41,51],batch:[1,2,3,4,5,6,8,10,11,12,13,14,15,16,17,19,22,23,24,25,33,37,40,50],batch_siz:25,bc_agent:1,bcalgorithmparamet:1,becaus:40,becom:[8,41],been:[17,26,31,45,49],befor:[3,5,11,24,25,26,31,40,41,42,43,49,50],begin:[0,4,40],behav:36,behavior:[3,31,33,37,45,49,50,51],being:[3,37,48,49,50],bellman:[14,23,24],benchmark:[39,47,48,49],best:[49,51],beta1:25,beta2:25,beta:[6,8,10,33],beta_entropi:[5,6,7,10,11],better:[17,49],between:[0,1,2,3,6,7,8,10,11,12,14,18,19,21,23,24,25,26,28,29,32,33,36,37,39,40,42,48,49],bfg:[7,11],bia:[6,49],big:[11,14,24],bin:[32,43],binari:13,bind:25,binomi:13,bit:31,blizzard:46,blob:[28,31],block:48,blog:48,boilerpl:40,bolling:39,bool:[0,3,4,5,6,7,8,11,12,21,22,24,25,26,28,29,33,36,50],boost:[43,49],bootstrap:[3,5,6,7,8,11,18,19,21,22,24,26,49],bootstrap_total_return_from_old_polici:[21,26],both:[3,7,25,28,29,32,49,50],bound:[6,7,11,14,24,29,36,49],box2d:43,box:[29,32,36],boxactionspac:32,boxdiscret:32,boxmask:32,breakout:51,breakoutdeterminist:[28,51],bring:11,bucket:43,buffer:[1,2,3,6,12,13,14,15,16,19,21,22,23,24,33,40,49,50,51],build:[30,48,49],builder:43,built:[37,40],bullet:6,button:[39,51],c51:14,cach:43,calcul:[3,4,5,6,7,8,10,11,13,14,15,16,18,19,21,22,23,24,25,26,29,33,37,50],call:[0,3,10,19,25,26,28,40,50],call_memori:[3,50],callabl:36,camera:[28,38],camera_height:28,camera_width:28,cameratyp:[28,38],can:[0,2,3,5,6,7,8,11,12,22,25,26,28,29,30,31,32,36,37,38,39,40,42,46,48,50,51],cannot:[3,50],carla:[31,46],carla_environ:28,carlaenviron:28,carlaenvironmentparamet:28,carlo:[3,22],cartpol:[28,38],cartpole_a3c:51,cartpole_clippedppo:[43,51],cartpole_dqn:51,categor:[3,5,6,49],categori:[30,31],categorical_dqn_ag:14,categoricaldqnalgorithmparamet:14,caus:[31,39],cdot:[5,7,8,10,12,13,14,15,16,18,20,22,24],central:[25,39],chain:8,challeng:40,
chang:[0,3,6,7,8,11,13,17,19,22,29,40,43,50],change_phas:29,channel:[28,31],channels_axi:36,check:[0,3,26,36,50],checkpoint:[0,3,25,27,41,43,50,51],checkpoint_dir:[3,50],checkpoint_prefix:[3,50],checkpoint_restore_dir:[0,51],checkpoint_restore_path:0,checkpoint_save_dir:0,checkpoint_save_sec:0,child:25,chmod:43,choic:[37,43],choos:[3,17,22,29,30,32,36,37,40,42,49,50,51],choose_act:[3,37,40,50],chosen:[3,12,22,29,32,37,50],chunk:11,cil:49,cil_ag:2,cilalgorithmparamet:2,classic_control:43,clean:[28,37,43],cli:43,clip:[3,6,8,11,25,31,36,49],clip_action_to_spac:36,clip_critic_target:8,clip_gradi:25,clip_high:29,clip_likelihood_ratio_using_epsilon:[7,11],clip_low:29,clip_max:31,clip_min:31,clipbyglobalnorm:25,clipped_ppo_ag:7,clippedppoalgorithmparamet:7,clipping_high:31,clipping_low:31,clone:[3,49],close:28,cmake:43,coach:[0,3,25,27,28,29,30,34,35,37,40,44,45,46,49,51],code:[38,40,49],coeffici:[7,11,25,29,33],collect:[3,7,10,11,19,25,26,33,40,45,48,50,51],collect_sav:[3,25,50],color:31,com:43,combin:[24,42,48,49],comma:0,command:[40,43,51],common:[37,39,43,51],commun:41,compar:[0,11,17,49],complet:[26,29,40],complex:[25,30,40,42,49,51],compon:[3,14,24,25,29,35,37,40,48,50,51],composit:[3,50],compositeag:[3,50],comput:[25,29],concat:25,concentr:40,condit:[0,3],confid:29,config:[28,51],configur:[3,5,10,37,43,50],confus:40,connect:[12,25],connectionist:10,consecut:[8,21],consequ:[19,29],consid:[5,6,32,39],consist:[8,28,31,32,36,40,46],constant:6,constantli:51,constantschedul:33,constrain:32,construct:[12,25,33],consumpt:31,contain:[0,1,2,3,13,25,26,28,36,38,40,50,51],content:43,contin:41,continu:[1,2,5,8,9,10,20,29,30,32,36,45],continuous_entropi:29,continuous_exploration_policy_paramet:29,contribut:[4,48],control:[2,3,5,6,7,8,11,25,29,31,39,46,48,49,50],control_suite_environ:28,controlsuiteenviron:28,conveni:[39,51],converg:10,convers:30,convert:[3,26,29,31,36,40,42,50],convolut:[25,42],coordin:32,copi:[8,12,13,14,15,16,18,19,20,22,23,24,25,43],core:48,core_typ:[3,26,28,36,50,51],correct:[3,6,49],correctli:25,correl:29,correpond:26,correspond:[2,3,4,14,15,25,26,29,31,36,38,50],could:[3,25,36,43,50],count:18,countabl:32,counter:[3,50],counterpart:42,cpu:[0,25],crd:51,creat:[3,19,25,31,38,50,51],create_network:[3,50],create_target_network:25,creation:[3,50],credenti:43,critic:[3,6,7,8,11,29,42,49],crop:[31,32],crop_high:31,crop_low:31,cross:[1,14,24],csv:0,ctrl:39,cuda:43,cudnn7:43,curl:43,curr_stat:[3,37,50],current:[0,1,2,3,4,6,7,8,9,10,11,12,13,15,16,18,20,21,22,23,25,26,28,29,31,32,36,37,40,48,49,50],custom:[28,29,36,37,40],custom_reward_threshold:28,cycl:40,dai:51,dashboard:[0,3,43,48,50],data:[0,10,19,25,33,40,41,43,45,48,49,51],data_stor:[27,43],dataset:[3,7,11,49,50,51],date:[21,42,49,51],dcp:[43,51],ddpg:49,ddpg_agent:8,ddpgalgorithmparamet:8,ddqn:[18,22,49],deal:49,debug:[0,39,48],decai:[5,7,11,25],decid:[0,3,4,28,37,50],decis:[3,50],decod:43,dedic:25,deep:[0,3,5,12,13,15,17,19,20,24,50],deepmind:46,def:[37,38],default_act:36,default_input_filt:38,default_output_filt:38,defin:[0,3,5,6,7,10,11,12,19,21,22,25,26,28,29,31,32,33,36,37,38,40,41,42,45,46,50,51],definit:[3,25,28,36,38,40,50],delai:49,delta:[6,14,21,24],demonstr:[1,2,51],dens:29,densiti:18,depecr:0,depend:[0,3,6,25,31,33,36,38,43,45,49,50],deploi:[35,41],depth:28,descend:49,describ:[3,14,23,31,33,37,40,43,50],descript:[3,32,36,44,51],design:[40,43,48],desir:[32,37],destabil:10,detail:[3,26,44,46,48,51],determin:[2,3,21,26,33,50],determinist:[3,12,49],dev:43,develop:[40,45],deviat:[10,11,29,31,39],devic:25,dfp:49,dfp_agent:4,dfpal
gorithmparamet:4,dict:[3,4,25,26,28,29,36,50],dict_siz:33,dictat:4,dictionari:[2,3,25,26,28,33,36,37,50],did:28,differ:[0,1,2,3,4,5,6,7,10,11,13,17,25,28,29,31,36,37,38,39,41,42,48,49,50],differenti:17,difficult:[39,45],difficulti:51,dimens:[26,28,31,32],dimension:[11,32],dir:[0,3,50,51],direct:[3,28,50],directli:[3,5,40,42,50],directori:[0,25,37,39,43,51],disabl:51,disable_fog:28,disappear:28,disassembl:49,discard:[26,31],discount:[8,10,11,18,21,22,24,25,26,49],discret:[1,2,4,7,11,13,14,15,16,17,18,19,21,22,23,24,29,30,31,32,36,40],disentangl:40,disk:0,displai:[0,39],distanc:36,distance_from_go:36,distance_metr:36,distancemetr:36,distil:[3,50],distribut:[5,6,10,11,12,14,23,24,25,27,29,34,35,36,42,48,49,51],distributed_coach:41,distributed_coach_synchronization_typ:41,distributedcoachsynchronizationtyp:41,divereg:[7,11],diverg:[6,7,11,24],dnd:[0,21,49],dnd_key_error_threshold:21,dnd_size:21,do_action_hindsight:33,doc:43,docker:43,dockerfil:43,document:46,doe:[13,25,31],doesn:41,doing:[7,11,30],domain:42,don:[4,29,39,49],done:[0,3,7,10,11,28,31,38,50,51],doom:[28,38,43,46],doom_basic_bc:51,doom_basic_dqn:51,doom_environ:[28,38,51],doomenviron:[28,38],doomenvironmentparamet:[38,51],doominputfilt:38,doomlevel:28,doomoutputfilt:38,doubl:[3,18,24],doubli:50,down:[25,28],download:43,dpkg:43,dqn:[3,18,19,24,28,29,31,32,40,42,49],dqn_agent:[16,50],dqnagent:50,dqnalgorithmparamet:16,drive:[2,28,46,48],driving_benchmark:28,due:31,duel:[3,24],dump:[0,3,50],dump_csv:0,dump_gif:0,dump_in_episode_sign:0,dump_mp4:0,dump_one_value_per_episod:[3,50],dump_one_value_per_step:[3,50],dump_parameters_document:0,dump_signals_to_csv_every_x_episod:0,dure:[3,6,7,10,11,12,13,21,29,39,40,50,51],dynam:[39,45,49],e_greedi:29,each:[0,1,2,3,4,5,6,7,10,11,12,13,15,16,17,19,21,22,23,25,26,28,29,30,31,32,33,36,37,39,40,41,42,43,45,49,50],eas:39,easi:[38,39,48],easier:42,easili:[29,51],echo:43,effect:[0,3,6,7,19,31,40,50],effici:[6,40,49],either:[0,3,5,19,25,29,36,39,42,51],element:[3,13,25,31,36],elf:43,embbed:25,embed:[3,21,25,50],embedd:[25,42],embedding_merger_typ:25,embeddingmergertyp:25,emploi:49,empti:26,emul:6,enabl:[25,42,51],encod:[31,36],encourag:[20,22,40],end:[2,3,10,24,26,28,31,50,51],enforc:32,engin:[28,46],enough:[4,6,21],ensembl:[29,49],ensur:[6,25],enter:[3,50,51],entir:[11,18,21,24,29,32,40],entri:[21,40],entropi:[1,5,6,7,10,11,12,14,24,29,49],enumer:36,env:[26,43],env_param:38,env_respons:[3,50],enviorn:28,environ:[0,3,4,6,17,25,26,29,30,31,32,36,37,40,43,45,47,48,50],environmentparamet:[28,38],envrespons:[0,3,28,50],episod:[0,3,4,5,10,11,13,18,19,24,28,29,37,38,39,40,41,50,51],episode_max_tim:28,episodic_hindsight_experience_replai:33,epoch:[7,50],epsilon:[7,29,33],epsilon_schedul:29,equal:2,equat:[8,12,15,16,19,23],error:[25,49],escap:51,especi:17,essenti:[19,25,32,38,40,43],estim:[3,5,7,11,13,18,22,29,50],estimate_state_value_using_ga:[5,7,11],eta:[7,11],etc:[0,3,25,28,30,36,37,46,50],evalu:[0,3,12,25,26,29,40,50],evaluate_onli:0,evaluation_epsilon:29,evaluation_noise_percentag:29,even:[17,25,28,38,39,40,49],everi:[0,5,6,8,10,12,13,14,15,16,18,19,20,22,23,24,51],exact:[21,29,45],exactli:25,exampl:[2,3,4,25,26,28,29,30,31,32,36,37,38,40,42,50,51],except:[19,26],execut:[26,39,40],exhibit:[3,37,50],exist:[21,25],exit:[3,50],expand_dim:26,expect:[0,3,29,45,50],experi:[0,6,8,11,12,24,28,33,34,39,40,41,43,48,49,51],experiment_path:[0,28],experiment_suit:28,experimentsuit:28,expert:[1,2,26,49],exploit:[29,40],explor:[3,4,5,6,7,8,9,11,13,18,20,21,37,40,48,49],exploration_polici:29,explorationparamet:[3,29,37],
exponenti:[6,7,11,24,25],expor:3,export_onnx_graph:0,expos:[39,42,48],extend:[28,29,46],extens:[28,46],extent:51,extern:0,extra:[25,26,42],extract:[3,20,21,26,31,36,39,40,50],factor:[8,10,11,22,24,25,26,29,31],faithfulli:39,fake:36,fals:[0,3,8,25,26,28,29,32,33,36,38,50],far:[11,31,40,45],faster:[17,49],featur:[8,28,42,48,49],feature_minimap_maps_to_us:28,feature_screen_maps_to_us:28,fetch:[25,26],fetched_tensor:25,few:[10,13,14,15,16,18,22,23,24,29,38],field:[45,48],file:[0,3,37,40,50,51],fill:[26,38],filter:[0,3,48,50],find:[15,39,46,48],finish:[21,51],finit:32,first:[0,8,11,13,21,23,24,25,26,31,40,42],fit:36,flag:[0,3,25,26,28,50],flexibl:41,flicker:28,flow:[30,48],follow:[2,3,5,6,8,10,12,14,15,16,19,20,21,23,24,25,26,28,29,33,37,38,43,45,49,50],footprint:31,forc:[25,28,32,38],force_cpu:25,force_environment_reset:[28,38],force_int_bin:32,forced_attention_s:36,form:[4,19,36,49],format:37,formul:[5,6],forward:[25,29],found:[3,44,51],frac:[6,7,12,14,24],fraction:[7,11],frame:[0,28],frame_skip:28,framework:[0,3,25,37,48,50],framework_typ:0,free:[28,46],freeglut3:43,from:[0,1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,28,29,30,31,32,34,36,37,38,39,40,41,42,43,45,46,48,50,51],full:[3,10,18,32,50],fulldiscreteactionspacemap:32,fulli:25,func:[3,50],futur:[0,3,10,26,49],future_measurements_weight:4,gae:[5,7,11],gae_lambda:[5,7,11],game:[3,26,28,46,48,50,51],game_ov:26,gamma:[5,6,8,12,13,14,15,16,18,19,20,21,22,24],gap:[22,49],gather:41,gaussian:[11,12,29],gener:[0,5,7,11,13,25,28,29,33,36,37,43,51],general_network:37,get:[3,4,7,8,9,10,11,13,15,16,18,20,22,25,26,28,29,36,40,42,43,45,50],get_act:29,get_action_from_us:28,get_available_kei:28,get_first_transit:26,get_goal:28,get_last_env_respons:28,get_last_transit:26,get_output_head:37,get_predict:[3,50],get_random_act:28,get_rendered_imag:[28,38],get_reward_for_goal_and_st:36,get_state_embed:[3,50],get_transit:26,get_transitions_attribut:26,get_variable_valu:25,get_weight:25,gfortran:43,gif:0,git:43,github:[38,43,45,48],given:[0,1,2,3,4,5,8,10,11,25,26,28,29,31,32,33,36,37,40,50],given_weight:25,global:[3,25,42,50],global_network:25,glx:43,goal:[1,2,3,4,6,25,26,28,33,40,42,49,50],goal_from_st:36,goal_nam:36,goal_spac:28,goal_vector:4,goals_spac:33,goalsspac:[33,36],goaltorewardconvers:36,going:30,good:[38,39],gpu:[0,25],gracefulli:51,gradient:[3,5,6,7,11,19,21,25,37,49,50],gradientclippingmethod:25,gradients_clipping_method:25,granular:33,graph:0,graphmanag:40,grayscal:[31,36],greedili:40,group:39,grow:24,guidelin:49,gym:[43,46],gym_environ:[28,51],gymenviron:28,gymenvironmentparamet:38,hac:49,had:45,hand:[17,31,40,49],handl:4,handle_episode_end:[3,28,50],handling_targets_after_episode_end:4,handlingtargetsafterepisodeend:4,hard:[39,49],harder:39,has:[0,3,17,21,22,26,29,31,40,42,45,49,50],has_glob:25,has_target:25,hat:[6,7,14,24],have:[0,3,4,6,25,28,29,31,32,33,40,42,45,50],head:[1,2,3,5,6,10,13,17,20,21,25,29,37,42,50],headparamet:25,heads_paramet:25,health_gath:28,heat:6,heatup:[29,40],help:[22,26,39,40,49],here:[38,40],heurist:[11,29],hide:42,hierarch:[36,40],hierarchi:[3,40,49,50],high:[8,11,31,32,36,39],high_i:36,high_kl_penalty_coeffici:11,high_x:36,higher:11,highest:[5,6,10,22,29,31,32,36],highli:[0,38,49],hindsight:[9,33,49],hindsight_goal_selection_method:33,hindsight_transitions_per_regular_transit:33,hindsightgoalselectionmethod:33,hold:[13,25,26,33,39,40,42],horizont:[43,48,51],host:43,hostnam:0,hot:36,how:[4,7,11,29,41,43,49,51],hrl:33,html:43,http:[19,33,43],hub:43,huber:23,huber_loss_interv:23,human:[0,28],human
_control:28,hyper:[37,45],hyperparamet:37,ident:25,identifi:[25,36],ies:50,ignor:28,imag:[0,25,28,31,32,36,38,42,51],image1:43,imit:[3,26,44,49],impact:25,implement:[3,7,11,25,27,28,29,33,37,38,41,45,49,51],impli:51,implment:35,importance_weight:25,importance_weight_trunc:6,importantli:40,improv:[5,17,24,28,40,49],improve_reward_model:50,includ:[0,3,4,28,30,31,35,42,46,50,51],increas:[11,22,31,49],increment:[3,50],index:[0,2,26,28,31,32,33,36],indic:36,inf:[31,36],infer:[3,25,28,50],infinit:[0,49],info:[3,13,26,36,38,50],info_as_list:26,inform:[3,4,19,26,28,30,39,40,43,46,50],inherit:[3,37,38],init_environment_dependent_modul:[3,50],initi:[3,4,11,22,25,26,37,40,48,50],initial_feed_dict:25,initial_kl_coeffici:11,innov:49,input:[1,2,3,4,8,13,15,16,18,20,21,22,25,30,36,40,42,50],input_embedders_paramet:25,input_high:31,input_low:31,input_space_high:32,input_space_low:32,inputembedderparamet:25,inputfilt:40,insert:[21,26],inspect:0,instal:[43,51],instanc:[3,34,36,42],instanti:[3,28,40],instead:[0,3,7,19,22,25,31,32,40,49,50],instruct:51,intact:[13,45],integ:[0,31,32],integr:[38,40,41,48],intel:48,intend:[10,25,29,40],interact:[26,40,41,48,51],interest:[25,39],interfac:[28,39,41,46],intermedi:21,intern:[3,10,19,25,26,30,40,50,51],intersect:49,interv:23,intro:48,introduc:49,invers:[28,46],invok:40,involv:37,is_empti:26,is_valid_index:36,item:26,iter:[3,5,6,8,11,17,25,50],its:[0,3,14,24,25,26,29,36,40,43,49,50,51],itself:[25,36,51],job:0,job_typ:0,joint:28,json:0,jump:[4,32],jupyt:37,just:[3,11,22,24,38,40,42,50,51],kapa:23,keep:[16,26,31,51],kei:[2,21,25,26,28,33,37,39,43,49,51],key_error_threshold:33,key_width:33,keyboard:[28,51],keyword:25,kl_coeffici:25,kl_coefficient_ph:25,know:[3,49,50,51],knowledg:[3,40,50],known:[26,39,45,49],kubeconfig:35,kubernet:43,kubernetes_orchestr:35,kubernetesparamet:35,kwarg:[25,28],l2_norm_added_delta:21,l2_regular:25,lack:39,lamb:29,lambda:[5,7,11,29],lane:2,larg:[29,31,46],larger:25,last:[4,6,11,21,26,28,31],last_env_respons:28,lastli:40,later:[0,3,25,50,51],latest:[19,21,40,43],layer:[25,29,33,40,42],lazi:[26,31],lazystack:31,lbfg:25,ld_library_path:43,lead:29,learn:[0,3,4,5,6,8,9,10,12,13,14,15,16,17,20,23,24,25,26,28,29,31,39,40,42,44,45,46,49,50],learn_from_batch:[3,37,40,50],learner:25,learning_r:[25,33],learning_rate_decay_r:25,learning_rate_decay_step:25,least:[42,49],leav:[11,13],left:[2,6,12,49],length:[4,5,7,11,19,21,25,26],less:[17,49],level:[0,3,25,28,38,50,51],levelmanag:[3,40,50],levelselect:28,libatla:43,libav:43,libavformat:43,libbla:43,libboost:43,libbz2:43,libfluidsynth:43,libgl1:43,libglew:43,libgm:43,libgstream:43,libgtk2:43,libgtk:43,libjpeg:43,liblapack:43,libnotifi:43,libopen:43,libosmesa6:43,libportmidi:43,librari:[28,43,46],libsdl1:43,libsdl2:43,libsdl:43,libsm:43,libsmpeg:43,libswscal:43,libtiff:43,libwebkitgtk:43,libwildmidi:43,like:[12,28,36,40,42,43,49],likelihood:[7,11],line:[3,40,50,51],linear:32,linearboxtoboxmap:32,linearli:32,list:[0,3,4,25,26,28,29,31,32,36,37,50,51],load:[0,39,41,51],load_memory_from_file_path:51,local:[3,42,43,50],locat:[23,26,31,49],log:[0,3,5,6,10,12,50],log_to_screen:[3,50],logger:[0,3,50],look:[38,43],loop:40,loss:[1,2,3,6,7,10,11,14,15,16,23,24,25,29,37,42,50],lot:[29,39,45,49],low:[8,11,31,32,36],low_i:36,low_x:36,lower:[0,33,40],lowest:[31,32,36],lstm:42,lumin:31,lvert:[6,14,24],lvl:51,mai:[0,25,44,51],main:[3,37,40,42,44,50,51],mainli:41,major:29,make:[0,3,25,28,37,39,43,45,49,50],manag:[3,25,41,43,50],mandatori:[36,38,42],mani:[3,17,44,45],manner:[11,18,19,22,31,40],manual:43,map:[3,25,28,30,31,32,3
6,37,50],mark:26,markdown:50,mask:[13,32],masked_target_space_high:32,masked_target_space_low:32,master:[3,40,43,50],match:[2,21,25,36],mathbb:[5,6],mathop:5,max:[5,6,14,19,24,31],max_a:[13,16,21,22],max_action_valu:26,max_episodes_to_achieve_reward:0,max_fps_for_human_control:0,max_kl_diverg:6,max_over_num_fram:28,max_simultaneous_selected_act:36,max_siz:33,max_spe:28,maxim:[4,15],maximum:[0,12,14,16,21,22,26,28,29,31,33,49],mean:[0,2,7,8,9,10,11,12,20,25,29,31,32,36,39,49],meant:42,measur:[3,4,25,28,31,36,38,49,50],measurements_nam:36,mechan:[30,41,45,51],memor:49,memori:[3,24,26,31,37,40,41,43,48,49,50],memory_backend:43,memorygranular:33,memoryparamet:[3,37],merg:[25,28],mesa:43,method:[0,5,7,11,19,25,31,33],metric:[0,36,39],mid:6,middlewar:[21,25,42],middleware_paramet:25,middlewareparamet:25,midpoint:23,might:[3,10,28,37,42,50],min:[6,7,14,22,24],min_:12,min_reward_threshold:0,mind:51,minim:[2,4,14],minimap_s:28,minimum:[0,7,31],mix:[3,7,11,21,22,49],mixedmontecarloalgorithmparamet:18,mixer1:43,mixtur:[18,25],mjkei:43,mjpro150:43,mjpro150_linux:43,mkdir:43,mmc:[18,49],mmc_agent:18,mode:[22,25,27,34,35,40,41,43,51],model:[0,18,20,25,48,50,51],modif:49,modifi:6,modul:[3,37,40,41,50],modular:[37,40,42,48],monitor:41,mont:[3,22],monte_carlo_mixing_r:[18,22],more:[3,8,19,25,31,37,39,40,42,43,48,50,51],moreov:39,most:[3,10,21,25,26,29,42,45,49,50,51],mostli:[31,40],motiv:40,move:[6,7,11,31,39,45],mp4:0,mse:[2,6,15,16,23],much:[7,11,40,49],mujoco:[28,32,38,43,46],mujoco_kei:43,mujoco_pi:43,multi:[11,25,36,42],multidimension:36,multipl:[4,7,11,19,25,28,29,31,32,33,36,39,40,45,48,51],multipli:[4,10,25,31],multiselect:32,multitask:[28,46],must:[25,31,36,45],mxnet:51,n_step:[21,24,26,33],n_step_discounted_reward:26,n_step_q_ag:19,nabla:[6,8],nabla_:[8,12],nabla_a:8,naf:49,naf_ag:20,nafalgorithmparamet:20,name:[3,25,26,28,31,36,37,43,50,51],namespac:35,nasm:43,nativ:[0,28,38,46],native_rend:0,navig:3,ndarrai:[3,25,26,28,29,31,32,36,38,50],nearest:21,neat:39,nec:[0,49],nec_ag:21,necalgorithmparamet:21,necessari:[3,21,25,50],necessarili:31,need:[0,3,6,24,25,28,29,36,37,40,45,49,50,51],neg:[4,31],neighbor:21,neon_compon:37,nervanasystem:43,network:[0,3,25,29,37,40,45,48,49,50,51],network_input_tupl:25,network_nam:[3,50],network_param:29,network_paramet:25,network_wrapp:[3,25,50],networkparamet:[3,25,29,37],networkwrapp:[3,50],neural:[3,18,25,42,45],never:25,new_value_shift_coeffici:[21,33],new_weight:25,newli:[22,38,49],next:[0,3,8,15,16,20,22,23,26,28,40,50,51],next_stat:26,nfs_data_stor:27,nfsdatastoreparamet:27,nice:51,no_accumul:25,node:[25,42],nois:[8,9,20,29,40],noise_percentage_schedul:29,noisi:[10,24,29],non_episod:33,none:[0,3,7,8,11,25,26,28,29,31,32,36,38,50],norm:25,norm_unclipped_grad:25,norm_unclippsed_grad:25,normal:[3,4,10,29,30,31,36],note:[21,25,29,50],notebook:37,notic:[25,49],notori:[39,45,49],now:[7,38],nstepqalgorithmparamet:19,nth:24,num_act:[21,33,36],num_bins_per_dimens:32,num_class:33,num_consecutive_playing_step:[3,8,50],num_consecutive_training_step:[3,50],num_gpu:0,num_neighbor:33,num_predicted_steps_ahead:4,num_speedup_step:28,num_steps_between_copying_online_weights_to_target:[8,12,19],num_steps_between_gradient_upd:[5,6,10,19],num_task:0,num_training_task:0,num_transitions_to_start_replai:6,num_work:0,number:[0,2,4,5,6,8,10,12,13,14,19,21,23,24,25,26,28,29,31,32,33,39,46,50,51],number_of_knn:21,numpi:[3,25,26,28,29,31,32,36,38,50],nvidia:43,object:[0,3,24,25,28,29,31,33,40,50],observ:[0,3,4,11,25,26,28,30,38,40,50],observation_reduction_by_sub_parts_name_filt:31,obser
vation_space_s:25,observation_space_typ:28,observation_stat:31,observation_typ:28,observationspac:36,observationspacetyp:28,observationtyp:28,off:[3,6,12,41,49,50],offer:[28,46],often:[39,40,42],old:[7,11,25,49],old_weight:25,onc:[0,7,10,11,13,14,15,16,18,19,22,23,24,25,36,51],one:[0,3,6,17,21,22,25,26,28,29,30,33,36,38,39,42,49,50],ones:[38,49],onli:[0,3,4,5,6,7,10,11,13,14,16,17,19,21,23,24,25,26,28,29,31,32,38,40,49,50,51],onlin:[8,12,13,14,15,16,18,19,20,21,22,23,24,25,40,42],online_network:25,onnx:[0,25],onto:30,open:[0,28,46],openai:[43,46],opencv:43,oper:[22,25,31],optim:[3,4,6,25,44,49],optimization_epoch:7,optimizer_epsilon:25,optimizer_typ:25,option:[6,10,25,28,32,36,37,39,41,42,51],orchestr:[41,43,48],order:[0,3,5,6,7,8,10,11,12,15,16,17,19,20,21,22,23,25,26,30,31,32,39,40,42,45,49,50],org:[19,33],origin:[19,31,32,45],ornstein:[8,9,29],other:[0,2,10,17,22,25,28,30,31,33,39,40,49],otherwis:[11,13,25,28,29,36],ou_process:29,our:7,out:[2,15,16,29,30,32,39,43,48,49,51],outcom:[29,40],output:[0,4,6,8,13,14,20,21,25,29,30,31,36,37,42],output_0_0:25,output_observation_spac:31,outputfilt:40,outsid:[4,29],over:[3,7,10,11,19,21,24,25,26,29,31,32,39,40,49,50],overestim:8,overfit:11,overhead:0,overlai:39,overrid:[3,50],override_existing_kei:33,overriden:37,overview:40,overwhelm:40,overwritten:25,own:[25,37],p_j:[14,24],page:[3,45],pair:[0,36],pal:[22,49],pal_ag:22,pal_alpha:22,palalgorithmparamet:22,paper:[5,10,12,14,19,21,23,28,33,45],parallel:[6,25,39,42],parallel_predict:25,param:[3,25,26,27,28,29,34,35,37,38,50],paramet:[2,3,4,5,6,7,8,10,11,12,14,18,19,21,22,23,24,25,26,27,28,29,31,32,33,34,35,36,37,38,45,48,50,51],parameter_nois:29,parameters_server_host:0,parent:[3,25,50],parent_path_suffix:[3,25,50],parmet:3,pars:40,part:[0,13,25,26,29,31,32,41,42,45,49],part_nam:31,partial:32,partialdiscreteactionspacemap:32,particular:4,particularli:[28,29,36,45,49],pass:[0,4,8,9,20,21,25,28,29,30,38,39,40,42,51],patamet:21,patchelf:43,patchelf_0:43,path:[0,3,25,37,38,43,50,51],pattern:40,pdf:33,penal:[7,8,11],penalti:11,pendulum_hac:38,pendulum_with_go:38,pendulumwithgo:38,per:[0,3,4,36,37,40,50],percentag:29,percentil:29,perceptron:42,perform:[0,3,6,25,26,31,33,38,39,40,49,50],period:[42,51],persist:3,persistent_advantage_learn:22,perspect:14,phase:[3,6,7,8,9,11,12,25,28,29,40,50],phi:[14,24],physic:[28,46],pi_:[6,7,12],pick:[12,28],pickl:51,pickledreplaybuff:51,pip3:43,pip:43,pixel:28,place:[32,39,40],placehold:[25,29],plai:[0,3,10,13,15,16,19,29,37,39,50],plain:42,planarmap:28,planarmapsobservationspac:31,platform:[28,46],pleas:[19,45],plu:25,plugin:43,point:[31,36,40,41],polici:[1,3,4,5,6,9,12,13,19,20,21,27,37,40,41,42,43,44,48,49,50],policy_gradient_rescal:[5,7,10,11],policy_gradients_ag:10,policygradientalgorithmparamet:10,policygradientrescal:[5,7,10,11],policyoptimizationag:37,popul:40,popular:[28,46],port:0,posit:[4,31],possibl:[2,3,4,21,29,32,36,39,42,48,49,50,51],post:[30,48],post_training_command:[3,50],power:[28,46],ppo:[7,11,49],ppo_ag:11,ppoalgorithmparamet:11,pre:[8,29,30],predefin:[13,22,29,51],predict:[1,2,3,5,6,7,8,11,12,13,14,15,16,22,23,24,25,29,42,49,50],prediction_typ:[3,50],predictiontyp:[3,50],prefect:49,prefer:25,prefix:[3,50],prep:43,prepar:[3,50],prepare_batch_for_infer:[3,50],present:[17,21,25,28,31,49],preset:[0,5,37,38,40,41,43,51],press:[39,51],prevent:[8,11,40],previou:31,previous:[11,25],print:[0,3,51],print_networks_summari:0,priorit:[24,33],prioriti:[24,33],privat:36,probabilit:[5,6],probabl:[3,5,6,10,13,14,24,26,29,37,49,50],problem:49,procedur:6,process:[0,3,8
,9,25,29,30,31,32,37,39,40,42,45,48,50],produc:25,progress:25,project:[14,24],propag:7,propagate_updates_to_dnd:21,properti:[25,33,37,38,43],proport:33,provid:[25,41],proxi:40,proxim:3,pub:[34,35,43],publish:45,purpos:[0,3,10],pursuit:2,pybullet:[28,46],pygam:[0,43],pytest:43,python3:43,python:[28,33,37,43,46,48],q_i:12,qr_dqn_agent:23,quad:6,qualiti:28,quantil:[3,49],quantileregressiondqnalgorithmparamet:23,queri:[21,25,40,49],question:49,quit:39,r_i:[5,19],r_t:[4,6,7,24],rainbow:[3,37,49],rainbow_ag:37,rainbow_dqn_ag:24,rainbowag:37,rainbowagentparamet:37,rainbowalgorithmparamet:37,rainbowdqnalgorithmparamet:24,rainbowexplorationparamet:37,rainbowmemoryparamet:37,rainbownetworkparamet:37,rais:[3,26,50],ramp:[37,40],random:[0,19,28,29,36,40,45],random_initialization_step:28,randomli:[26,40],rang:[4,7,8,11,14,24,28,29,31,32,36,49],rare:21,rate:[0,6,18,21,25,28,42],rate_for_copying_weights_to_target:[6,8,12],rather:[4,12,39],ratio:[6,7,11,18,31],ratio_of_replai:6,raw:[28,46],reach:[0,11,36],read:27,readabl:40,readm:43,real:3,reason:[31,45],rebuild_on_every_upd:33,receiv:[25,26],recent:[3,24,25,49,50],recommend:38,redi:[34,35,43],redispubsub:43,redispubsubmemorybackendparamet:34,reduc:[1,2,10,11,22,25,31,40,49],reduct:31,reduction_method:31,reductionmethod:31,redund:31,refer:[2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,41,43],referenc:3,regard:[3,50],region:[6,49],regist:[3,50],register_sign:[3,50],registri:43,regress:[2,3,49],regula:[6,7,11],regular:[5,7,10,11,19,21,25,29,32,33,49],regularli:25,reinforc:[3,5,8,9,10,12,14,15,16,17,19,22,23,24,28,29,39,40,42,44,45,46,49],rel:29,relat:[25,43],relationship:49,releas:[0,48,49],relev:[3,13,29,31,50],remov:[0,31],render:[0,3,28,38],reorder:31,repeat:[28,40],replac:[29,31,33,43],replace_mse_with_huber_loss:25,replai:[1,2,3,6,8,12,13,14,15,16,19,21,22,23,24,33,40,49,50,51],replay_buff:51,replicated_devic:25,repo:38,repositori:48,repres:[0,7,11,14,24,25,26,28,29,32,36,51],represent:42,reproduc:[40,45],request:[3,25,50],requir:[3,25,27,29,31,39,42,43,49,50],requires_action_valu:29,rescal:[4,5,7,10,11,25,30,31],rescale_factor:31,research:[28,45,46],reset:[3,21,25,28,29,38,50],reset_accumulated_gradi:25,reset_evaluation_st:[3,50],reset_gradi:25,reset_internal_st:[3,28,50],resourc:[41,43],respect:[8,26,28],respons:[3,26,28,40,50],rest:[25,26,32,43],restart:38,restor:[0,3,50],restore_checkpoint:[3,50],result:[3,4,14,15,16,17,23,24,25,31,32,45,49,50,51],ret:6,retrac:6,retriev:[21,33],return_additional_data:33,reus:40,reusabl:42,reward:[0,1,2,3,4,8,10,18,19,24,25,26,28,30,36,38,39,40,49,50],reward_test_level:0,reward_typ:36,rgb:[28,31,36],rho:[6,8],rho_t:6,right:[2,3,6,12,29,32,39,49,50],rl_coach:[0,1,2,3,4,5,6,7,8,10,11,12,14,16,18,19,20,21,22,23,24,25,26,27,28,29,31,32,33,34,35,36,38,43,50,51],rms_prop_optimizer_decai:25,rmsprop:25,roboschool:[28,46],robot:[28,36,46,48],roboti:43,robust:50,rollout:[27,34,35,41,43,51],root:[39,43],rule:[8,13],run:[0,3,4,8,10,11,12,13,15,16,21,22,25,28,29,31,50,51],run_off_policy_evalu:[3,50],run_pre_network_filter_for_infer:[3,50],runphas:[3,50],runtim:43,rvert:[14,24],rvert_2:6,s3_bucket_nam:43,s3_creds_fil:43,s3_data_stor:27,s3_end_point:43,s3datastoreparamet:27,s_t:[4,5,6,8,12,13,14,15,16,18,19,20,22,24],sac:49,sai:49,same:[3,4,7,10,18,19,22,25,28,32,33,39,42,45,49,50],sampl:[1,2,3,5,6,8,10,11,12,13,14,15,16,18,19,22,23,24,25,29,33,36,40,43,50],sample_with_info:36,satur:8,save:[0,3,24,25,29,43,50,51],save_checkpoint:[3,50],saver:[3,25,50],savercollect:[3,25,50],scale:[4,10,25,31,39,43,48,51],scale_down_gra
dients_by_number_of_workers_for_sync_train:25,scale_measurements_target:4,scaler:25,schedul:[7,29,33,40,41,43,51],scheme:[5,29,40,49],schulman:11,sci:43,scienc:45,scipi:[31,43],scope:25,scratch:49,scratchpad:0,screen:[3,28,38,51],screen_siz:28,script:40,second:[0,25,39,49,51],section:[43,44,46],see:[3,28,31,43,45,46,49,50,51],seed:[0,28,45],seen:[4,21,22,28,31,40,45,49],segment:[28,36],select:[5,13,21,25,26,29,31,32,36,38,39,40,48,51],self:[3,25,37,38,50],send:[38,42],separ:[0,3,17,31,32,42,44,49],separate_actions_for_throttle_and_brak:28,seper:10,sequenti:[4,26,33],serv:[7,10,42],server:0,server_height:28,server_width:28,sess:[3,25,50],session:[3,25,50],set:[0,2,3,4,5,6,7,8,11,14,15,16,18,21,22,24,25,26,28,29,31,32,36,37,41,45,46,48,49,50,51],set_environment_paramet:[3,50],set_goal:28,set_incoming_direct:[3,50],set_is_train:25,set_sess:[3,50],set_variable_valu:25,set_weight:25,setup:[3,43,50],setup_logg:[3,50],setuptool:43,sever:[0,3,7,10,11,13,25,28,29,31,37,38,39,40,42,46,49,50,51],shape:[25,31,36],share:[0,3,25,33,42,50],shared_memory_scratchpad:0,shared_optim:25,shift:[32,40],shine:39,should:[0,3,4,7,11,13,19,22,25,26,28,31,33,36,37,38,41,50,51],should_dump:0,shouldn:13,show:45,shown:45,shuffl:26,side:[3,50],sigma:29,signal:[3,40,50],signal_nam:[3,50],significantli:17,sim:[6,12],similar:[7,17,19,26,28,32,49],simpl:[10,33,37,38,42,48,49,51],simplest:49,simplif:49,simplifi:[7,39,42],simul:[28,38,46,51],simultan:7,sinc:[3,7,8,10,19,21,22,24,25,29,31,50],singl:[3,4,5,6,7,11,13,17,18,19,25,26,28,29,32,36,39,40,42,50],size:[25,26,29,31,32,33,36],skill:49,skip:[28,40],slave:[3,50],slice:26,slow:[25,51],slower:[0,17,25],slowli:8,small:[7,21,33],smaller:29,smooth:39,soft:[3,8,11,20,49],soft_actor_critic_ag:12,softactorcriticalgorithmparamet:12,softmax:[25,29],softmax_temperatur:25,softwar:43,solut:49,solv:[31,38,46,48],some:[0,3,11,25,26,29,31,37,38,39,42,45,49,50,51],sort:23,sourc:[0,1,2,3,4,5,6,7,8,10,11,12,14,16,18,19,20,21,22,23,24,25,26,27,28,29,31,32,33,34,35,36,38,43,46,50],space:[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,28,29,30,31,32,33,40,48,50],spacesdefinit:[3,25,50],spatial:49,spawn:[41,43],special:17,specif:[0,3,13,17,21,25,26,37,40,51],specifi:[0,25,28,29,31,38,41,51],speed:[25,31,49],speedup:51,spread:[31,32],squar:31,squeeze_list:25,squeeze_output:25,src:43,stabil:[6,19,25,49],stabl:[42,49],stack:[3,30,31,36,50],stack_siz:[25,31],stacking_axi:31,stage:42,stai:45,standard:[7,10,11,13,29,31,39],starcraft2_environ:28,starcraft2environ:28,starcraft:[36,46],starcraftobservationtyp:28,start:[3,6,8,11,12,17,22,26,31,32,38,43,50],state:[1,2,3,4,5,6,7,8,9,10,11,12,13,15,16,17,18,19,20,21,22,23,24,25,26,28,31,33,36,37,38,40,42,44,49,50],state_key_with_the_class_index:[2,33],state_spac:28,state_valu:26,statist:[3,10,31,48,50],std:12,stdev:29,steep:29,step:[0,3,4,5,6,7,8,10,11,12,13,14,15,16,18,20,21,22,23,24,25,26,28,29,31,37,38,39,40,49,50,51],stepmethod:[8,12,19],stochast:[12,40,49],stop:[0,28],store:[0,3,21,24,26,28,31,33,39,40,41,43,48,50,51],store_transitions_only_when_episodes_are_termin:24,str:[0,2,3,4,19,25,26,28,29,31,32,36,50],strategi:[28,46],stream:[17,41],strict:45,string:[0,25,28],structur:[0,3,26,33,37,40,50],stuff:25,style:29,sub:[32,33,34,35,36,37,40,43,51],sub_spac:36,subset:[39,45,49],subtract:22,succeed:28,success:[0,28,49],suffer:39,suffici:26,suffix:[3,25,50],suggest:37,suit:[0,46],suitabl:[41,51],sum:[4,7,10,18,25,26],sum_:[5,12,14,18,19,21,24],summari:[0,3,50],supervis:49,suppli:[3,50],support:[0,3,25,28,29,39,42,43,44,46,48,51],sure:[0,43,
45],surrog:7,swig:43,swingup:28,symbol:25,sync:[3,25,40,41,50],synchron:[0,25,40,42],t_max:[10,19],tag:43,take:[0,3,10,11,17,21,22,25,28,29,30,38,39,40,50],taken:[1,2,4,5,6,7,8,11,12,14,17,21,22,23,24,25,26,28,29],tanh:8,tar:43,target:[0,1,2,3,4,5,6,7,8,11,12,13,14,15,16,18,19,20,21,22,23,24,25,28,31,32,36,37,40,42,50],target_act:32,target_kl_diverg:11,target_network:25,target_success_r:28,targets_horizon:19,task:[0,1,2,28,31,37,39,46],task_index:0,tau:12,techniqu:[7,11,48,49],technolog:41,teh:25,temperatur:[25,29],temperature_schedul:29,tensor:[3,25,50],tensorboard:0,tensorflow:[0,3,25,50,51],tensorflow_support:25,term:[6,7,11],termin:[3,8,26,40,50],test:[0,3,5,6,8,9,10,11,12,25,37,45,48,51],test_using_a_trace_test:0,text:6,textrm:40,than:[0,3,11,25,29,39,42,50],thei:[3,21,22,25,29,39,40,41,49,50,51],them:[4,5,10,19,25,26,28,31,36,38,39,42],therefor:[0,8,25,30,49],theta:[6,7,8,12,14,24,29],theta_:[6,7],thi:[0,3,4,5,6,7,8,10,11,13,17,19,21,24,25,26,28,29,30,31,32,33,34,36,37,38,39,40,41,42,43,45,49,50,51],thing:39,those:[0,3,8,13,15,16,17,21,26,29,32,40,42,44,49,50],thousand:[11,13,14,15,16,18,22,23,24],thread:25,three:[3,41,42,43,44],threshold:[11,21,31],through:[0,3,4,8,9,10,11,13,21,22,25,37,38,40,42,50],tild:[8,12],time:[0,4,22,25,29,32,33,39,42,49],time_limit:38,timestep:[4,10],timid:43,tmp:0,togeth:[3,19,26,40,50],toggl:39,too:11,tool:[39,43,49],top:[25,28,30,31,33,38,39,49],torqu:28,total:[0,3,10,11,18,21,22,26,33,37,39,49,50],total_loss:25,total_return:26,trace:0,trace_max_env_step:0,trace_test_level:0,tradeoff:29,train:[0,3,17,25,29,34,35,37,38,39,40,41,42,45,48,49,50],train_and_sync_network:25,train_on_batch:25,train_to_eval_ratio:33,trainer:[27,41],transfer:[28,34,46],transit:[1,2,3,4,5,6,8,10,11,12,14,15,16,19,21,22,23,24,33,37,40,41,50],transition_idx:26,tri:49,trick:45,tricki:39,trigger:[28,43],truncat:6,truncated_norm:29,trust:[6,49],ttf2:43,tune:29,tupl:[1,2,3,8,25,26,28,33,36,37],turn:[2,49],tutori:[37,38],tweak:[3,50],two:[8,10,19,25,28,29,30,31,32,36,38,41,42,51],txt:43,type:[0,3,10,17,25,28,31,36,37,40,42,48,49,50,51],typic:[7,11,25,49,51],ubuntu16:43,uhlenbeck:[8,9,29],uint8:31,unbound:36,uncertain:29,uncertainti:29,unchang:11,unclip:[3,37,50],uncorrel:19,undeploi:41,under:[3,25,37,51],underbrac:5,understand:51,unifi:7,uniformli:[28,29,32,36],union:[3,26,28,29,32,36,50],uniqu:25,unit:39,unlik:11,unmask:32,unnecessari:0,unshar:[3,50],unsign:31,unspecifi:25,unstabl:[39,45],until:[0,6,10,11,21,24,29],unus:25,unzip:43,updat:[3,6,7,8,10,11,12,13,14,15,16,17,19,20,21,22,23,24,25,26,29,37,38,39,40,42,43,49,50],update_discounted_reward:26,update_filter_internal_st:[3,50],update_log:[3,50],update_online_network:25,update_step_in_episode_log:[3,50],update_target_network:25,update_transition_before_adding_to_replay_buff:[3,50],upgrad:43,upon:[3,5,37,50],upper:[6,29],usag:[32,48],use:[0,1,2,3,4,5,6,8,9,10,12,13,15,16,20,25,26,27,28,29,31,32,33,36,37,38,40,42,43,48,49,50,51],use_accumulated_reward_as_measur:4,use_cpu:0,use_deterministic_for_evalu:12,use_full_action_spac:28,use_kl_regular:[7,11],use_non_zero_discount_for_terminal_st:8,use_separate_networks_per_head:25,use_target_network_for_evalu:8,use_trust_region_optim:6,used:[0,2,3,5,6,7,8,10,11,12,13,14,18,19,20,21,22,23,25,28,29,31,32,33,34,35,37,38,40,41,42,45,50,51],useful:[0,3,4,24,25,29,31,36,45,49,50,51],user:[25,28,29,39,40,43],userguid:43,uses:[0,1,7,11,17,26,27,29,35,40,41,43,45,49,51],using:[0,3,5,6,7,8,10,11,12,15,16,18,19,20,21,22,24,25,27,28,29,31,34,37,38,39,41,46,49,50,51],usr:43,usual:[31,40],util:[3,39,50],v_max:
14,v_min:14,val:[3,36,50],valid:[0,36],valu:[0,2,3,4,5,6,7,8,11,12,13,14,15,16,17,19,20,21,22,24,25,26,28,29,31,32,33,36,37,40,42,43,44,49,50],valuabl:39,value_targets_mix_fract:[7,11],valueexcept:[3,50],valueoptimizationag:37,van:4,vari:42,variabl:[25,28,43],variable_scop:25,varianc:[10,29,39,49],variant:[29,33,49],variou:[3,26,33,48],vector:[3,4,8,9,11,13,25,28,31,36,38,42,49,50],vectorobservationspac:31,verbos:28,veri:[0,7,8,10,17,21,39,49,51],version:[7,11,26],versu:25,vert:12,vertic:25,via:[2,13],video:[0,3,28],video_dump_method:0,view:39,viewabl:[3,50],visit:45,visual:[0,3,28,46,48],visualization_paramet:28,visualizationparamet:[3,28],vizdoom:[43,46],vote:29,wai:[3,7,11,29,32,38,40,42,48,49,50,51],wait:[5,25,41],walk:38,want:[3,4,24,25,31,32,33,50],warn:[29,31,32],wasn:26,weather_id:28,websit:[28,48],weight:[4,5,6,7,8,11,12,13,14,15,16,18,19,20,21,22,23,24,25,29,40,42,49],well:[21,25,29,36,49],went:11,were:[4,14,15,16,17,21,23,24,25,26,32,45],west:43,wget:43,what:[11,49],whatev:[3,50],when:[0,3,4,5,6,7,8,9,10,11,12,21,25,26,27,28,29,31,34,35,37,38,39,50,51],whenev:41,where:[2,3,4,5,6,7,11,13,14,17,19,21,22,24,25,26,28,29,31,32,36,39,49,50],which:[0,1,2,3,5,6,7,8,10,11,12,13,17,19,20,21,22,23,25,26,27,28,29,31,33,34,35,36,37,38,39,40,41,42,44,45,46,48,49,50,51],who:40,why:[39,40],window:[31,32],wise:31,within:[0,7,11,20,29,36,39],without:[5,11,32,33,39,49,51],won:[4,25],wont:25,work:[3,19,25,29,31,32,39,40,49,50,51],workaround:0,workdir:43,worker:[0,19,25,27,31,33,34,35,39,41,42,43,49,51],worker_devic:25,worker_host:0,wors:49,would:[25,43,49],wrap:[28,31,40,46],wrapper:[3,25,26,28,36,42,50],write:[0,3,50],written:[3,24,27,50],www:43,xdist:43,y_t:[8,12,13,15,16,18,20,21,22],year:49,yet:[17,38],you:[4,31,33,37,38,43,48,51],your:[37,38,43,51],yuv:31,z_i:[14,24],z_j:[14,24],zero:[2,15,16],zip:43,zlib1g:43},titles:["Additional Parameters","Behavioral Cloning","Conditional Imitation Learning","Agents","Direct Future Prediction","Actor-Critic","ACER","Clipped Proximal Policy Optimization","Deep Deterministic Policy Gradient","Hierarchical Actor Critic","Policy Gradient","Proximal Policy Optimization","Soft Actor-Critic","Bootstrapped DQN","Categorical DQN","Double DQN","Deep Q Networks","Dueling DQN","Mixed Monte Carlo","N-Step Q Learning","Normalized Advantage Functions","Neural Episodic Control","Persistent Advantage Learning","Quantile Regression DQN","Rainbow","Architectures","Core Types","Data Stores","Environments","Exploration Policies","Filters","Input Filters","Output Filters","Memories","Memory Backends","Orchestrators","Spaces","Adding a New Agent","Adding a New Environment","Coach Dashboard","Control Flow","Distributed Coach - Horizontal Scale-Out","Network Design","Usage - Distributed Coach","Algorithms","Benchmarks","Environments","Features","Reinforcement Learning Coach","Selecting an 
Algorithm","test","Usage"],titleterms:{"final":21,"function":20,"new":[37,38],"switch":51,Adding:[37,38],Using:38,acer:6,across:49,action:[4,5,6,7,8,9,10,11,12,13,20,21,32,36,49],actioninfo:26,actor:[5,9,12],addit:[0,51],additivenois:29,advantag:[20,22],agent:[3,37,40,51],algorithm:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,18,19,20,21,22,23,24,44,49,51],api:38,architectur:25,attentionactionspac:36,backend:34,balancedexperiencereplai:33,batch:26,behavior:1,benchmark:45,between:51,blizzard:28,boltzmann:29,bootstrap:[13,29],boxactionspac:36,build:43,can:49,carla:28,carlo:18,categor:[14,29],choos:[4,5,6,7,8,9,10,11,12,13,20,21],clip:7,clone:[1,43],coach:[38,39,41,43,48],collect:49,compar:39,compoundactionspac:36,condit:2,config:43,contain:43,continu:[7,11,12,49],continuousentropi:29,control:[21,28,40],copi:42,core:26,creat:43,critic:[5,9,12],dashboard:39,data:27,deep:[8,16,51],deepmind:28,demonstr:49,descript:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24],design:42,determinist:8,direct:4,discret:[5,6,10,49],discreteactionspac:36,distribut:[41,43],distributedtaskparamet:0,doe:49,doubl:15,dqn:[13,14,15,17,23],duel:17,dump:51,egreedi:29,environ:[28,38,46,49,51],envrespons:26,episod:[21,26,33],episodicexperiencereplai:33,episodichindsightexperiencereplai:33,episodichrlhindsightexperiencereplai:33,evalu:51,experiencereplai:33,explor:29,explorationpolici:29,featur:47,file:43,filter:[30,31,32],flag:51,flow:40,framework:51,from:49,futur:4,gener:17,gif:51,goal:36,gradient:[8,10],graph:40,greedi:29,gym:[28,38],have:49,hierarch:9,horizont:41,human:[49,51],imag:43,imageobservationspac:36,imit:[2,51],implement:43,input:31,interfac:43,keep:42,kubernet:35,learn:[2,19,22,48,51],level:40,manag:40,memori:[33,34],mix:18,mont:18,more:49,multi:51,multipl:49,multiselectactionspac:36,network:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,42],networkwrapp:25,neural:21,nfsdatastor:27,node:[49,51],non:33,normal:20,observ:[31,36],observationclippingfilt:31,observationcropfilt:31,observationmoveaxisfilt:31,observationnormalizationfilt:31,observationreductionbysubpartsnamefilt:31,observationrescalesizebyfactorfilt:31,observationrescaletosizefilt:31,observationrgbtoyfilt:31,observationsqueezefilt:31,observationstackingfilt:31,observationtouint8filt:31,openai:[28,38],optim:[7,11],orchestr:35,ouprocess:29,out:41,output:32,pain:49,parallel:49,paramet:0,parameternois:29,persist:22,plai:51,planarmapsobservationspac:36,polici:[7,8,10,11,29],predict:4,prerequisit:43,presetvalidationparamet:0,prioritizedexperiencereplai:33,process:49,proxim:[7,11],push:43,qdnd:33,quantil:23,rainbow:24,redispubsubbackend:34,regress:23,reinforc:48,render:51,repositori:43,reward:31,rewardclippingfilt:31,rewardnormalizationfilt:31,rewardrescalefilt:31,run:[39,43],s3datastor:27,sampl:49,scale:41,select:49,signal:39,simul:49,singl:51,singleepisodebuff:33,soft:12,solv:49,space:[36,49],starcraft:28,statist:39,step:19,store:[13,27],structur:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24],suit:28,support:41,sync:42,synchron:41,task:49,taskparamet:0,test:50,thread:51,through:51,track:39,train:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,18,19,20,21,22,23,24,51],transit:[13,26],transitioncollect:33,truncatednorm:29,type:[26,41],ucb:29,usag:[43,51],vectorobservationspac:36,visual:[39,51],visualizationparamet:0,vizdoom:28,you:49,your:49}}) \ No newline at end of file 
+Search.setIndex({docnames:["components/additional_parameters","components/agents/imitation/bc","components/agents/imitation/cil","components/agents/index","components/agents/other/dfp","components/agents/policy_optimization/ac","components/agents/policy_optimization/acer","components/agents/policy_optimization/cppo","components/agents/policy_optimization/ddpg","components/agents/policy_optimization/hac","components/agents/policy_optimization/pg","components/agents/policy_optimization/ppo","components/agents/policy_optimization/sac","components/agents/policy_optimization/td3","components/agents/value_optimization/bs_dqn","components/agents/value_optimization/categorical_dqn","components/agents/value_optimization/double_dqn","components/agents/value_optimization/dqn","components/agents/value_optimization/dueling_dqn","components/agents/value_optimization/mmc","components/agents/value_optimization/n_step","components/agents/value_optimization/naf","components/agents/value_optimization/nec","components/agents/value_optimization/pal","components/agents/value_optimization/qr_dqn","components/agents/value_optimization/rainbow","components/architectures/index","components/core_types","components/data_stores/index","components/environments/index","components/exploration_policies/index","components/filters/index","components/filters/input_filters","components/filters/output_filters","components/memories/index","components/memory_backends/index","components/orchestrators/index","components/spaces","contributing/add_agent","contributing/add_env","dashboard","design/control_flow","design/horizontal_scaling","design/network","dist_usage","features/algorithms","features/benchmarks","features/environments","features/index","index","selecting_an_algorithm","test","usage"],envversion:{"sphinx.domains.c":1,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":1,"sphinx.domains.javascript":1,"sphinx.domains.math":2,"sphinx.domains.python":1,"sphinx.domains.rst":1,"sphinx.domains.std":1,"sphinx.ext.todo":1,"sphinx.ext.viewcode":1,sphinx:56},filenames:["components/additional_parameters.rst","components/agents/imitation/bc.rst","components/agents/imitation/cil.rst","components/agents/index.rst","components/agents/other/dfp.rst","components/agents/policy_optimization/ac.rst","components/agents/policy_optimization/acer.rst","components/agents/policy_optimization/cppo.rst","components/agents/policy_optimization/ddpg.rst","components/agents/policy_optimization/hac.rst","components/agents/policy_optimization/pg.rst","components/agents/policy_optimization/ppo.rst","components/agents/policy_optimization/sac.rst","components/agents/policy_optimization/td3.rst","components/agents/value_optimization/bs_dqn.rst","components/agents/value_optimization/categorical_dqn.rst","components/agents/value_optimization/double_dqn.rst","components/agents/value_optimization/dqn.rst","components/agents/value_optimization/dueling_dqn.rst","components/agents/value_optimization/mmc.rst","components/agents/value_optimization/n_step.rst","components/agents/value_optimization/naf.rst","components/agents/value_optimization/nec.rst","components/agents/value_optimization/pal.rst","components/agents/value_optimization/qr_dqn.rst","components/agents/value_optimization/rainbow.rst","components/architectures/index.rst","components/core_types.rst","components/data_stores/index.rst","components/environments/index.rst","components/exploration_policies/index.rst","components/filters/index.rst","components/filters/input_filters.rst
","components/filters/output_filters.rst","components/memories/index.rst","components/memory_backends/index.rst","components/orchestrators/index.rst","components/spaces.rst","contributing/add_agent.rst","contributing/add_env.rst","dashboard.rst","design/control_flow.rst","design/horizontal_scaling.rst","design/network.rst","dist_usage.rst","features/algorithms.rst","features/benchmarks.rst","features/environments.rst","features/index.rst","index.rst","selecting_an_algorithm.rst","test.rst","usage.rst"],objects:{"rl_coach.agents.acer_agent":{ACERAlgorithmParameters:[6,0,1,""]},"rl_coach.agents.actor_critic_agent":{ActorCriticAlgorithmParameters:[5,0,1,""]},"rl_coach.agents.agent":{Agent:[3,0,1,""]},"rl_coach.agents.agent.Agent":{act:[3,1,1,""],call_memory:[3,1,1,""],choose_action:[3,1,1,""],collect_savers:[3,1,1,""],create_networks:[3,1,1,""],freeze_memory:[3,1,1,""],get_predictions:[3,1,1,""],get_state_embedding:[3,1,1,""],handle_episode_ended:[3,1,1,""],init_environment_dependent_modules:[3,1,1,""],initialize_session_dependent_components:[3,1,1,""],learn_from_batch:[3,1,1,""],load_memory_from_file:[3,1,1,""],log_to_screen:[3,1,1,""],observe:[3,1,1,""],parent:[3,1,1,""],phase:[3,1,1,""],post_training_commands:[3,1,1,""],prepare_batch_for_inference:[3,1,1,""],register_signal:[3,1,1,""],reset_evaluation_state:[3,1,1,""],reset_internal_state:[3,1,1,""],restore_checkpoint:[3,1,1,""],run_off_policy_evaluation:[3,1,1,""],run_pre_network_filter_for_inference:[3,1,1,""],save_checkpoint:[3,1,1,""],set_environment_parameters:[3,1,1,""],set_incoming_directive:[3,1,1,""],set_session:[3,1,1,""],setup_logger:[3,1,1,""],sync:[3,1,1,""],train:[3,1,1,""],update_log:[3,1,1,""],update_step_in_episode_log:[3,1,1,""],update_transition_before_adding_to_replay_buffer:[3,1,1,""]},"rl_coach.agents.bc_agent":{BCAlgorithmParameters:[1,0,1,""]},"rl_coach.agents.categorical_dqn_agent":{CategoricalDQNAlgorithmParameters:[15,0,1,""]},"rl_coach.agents.cil_agent":{CILAlgorithmParameters:[2,0,1,""]},"rl_coach.agents.clipped_ppo_agent":{ClippedPPOAlgorithmParameters:[7,0,1,""]},"rl_coach.agents.ddpg_agent":{DDPGAlgorithmParameters:[8,0,1,""]},"rl_coach.agents.dfp_agent":{DFPAlgorithmParameters:[4,0,1,""]},"rl_coach.agents.dqn_agent":{DQNAgent:[51,0,1,""],DQNAlgorithmParameters:[17,0,1,""]},"rl_coach.agents.dqn_agent.DQNAgent":{act:[51,1,1,""],call_memory:[51,1,1,""],choose_action:[51,1,1,""],collect_savers:[51,1,1,""],create_networks:[51,1,1,""],freeze_memory:[51,1,1,""],get_predictions:[51,1,1,""],get_state_embedding:[51,1,1,""],handle_episode_ended:[51,1,1,""],improve_reward_model:[51,1,1,""],init_environment_dependent_modules:[51,1,1,""],initialize_session_dependent_components:[51,1,1,""],learn_from_batch:[51,1,1,""],load_memory_from_file:[51,1,1,""],log_to_screen:[51,1,1,""],observe:[51,1,1,""],parent:[51,1,1,""],phase:[51,1,1,""],post_training_commands:[51,1,1,""],prepare_batch_for_inference:[51,1,1,""],register_signal:[51,1,1,""],reset_evaluation_state:[51,1,1,""],reset_internal_state:[51,1,1,""],restore_checkpoint:[51,1,1,""],run_off_policy_evaluation:[51,1,1,""],run_pre_network_filter_for_inference:[51,1,1,""],save_checkpoint:[51,1,1,""],set_environment_parameters:[51,1,1,""],set_incoming_directive:[51,1,1,""],set_session:[51,1,1,""],setup_logger:[51,1,1,""],sync:[51,1,1,""],train:[51,1,1,""],update_log:[51,1,1,""],update_step_in_episode_log:[51,1,1,""],update_transition_before_adding_to_replay_buffer:[51,1,1,""]},"rl_coach.agents.mmc_agent":{MixedMonteCarloAlgorithmParameters:[19,0,1,""]},"rl_coach.agents.n_step_q_a
gent":{NStepQAlgorithmParameters:[20,0,1,""]},"rl_coach.agents.naf_agent":{NAFAlgorithmParameters:[21,0,1,""]},"rl_coach.agents.nec_agent":{NECAlgorithmParameters:[22,0,1,""]},"rl_coach.agents.pal_agent":{PALAlgorithmParameters:[23,0,1,""]},"rl_coach.agents.policy_gradients_agent":{PolicyGradientAlgorithmParameters:[10,0,1,""]},"rl_coach.agents.ppo_agent":{PPOAlgorithmParameters:[11,0,1,""]},"rl_coach.agents.qr_dqn_agent":{QuantileRegressionDQNAlgorithmParameters:[24,0,1,""]},"rl_coach.agents.rainbow_dqn_agent":{RainbowDQNAlgorithmParameters:[25,0,1,""]},"rl_coach.agents.soft_actor_critic_agent":{SoftActorCriticAlgorithmParameters:[12,0,1,""]},"rl_coach.agents.td3_agent":{TD3AlgorithmParameters:[13,0,1,""]},"rl_coach.architectures.architecture":{Architecture:[26,0,1,""]},"rl_coach.architectures.architecture.Architecture":{accumulate_gradients:[26,1,1,""],apply_and_reset_gradients:[26,1,1,""],apply_gradients:[26,1,1,""],collect_savers:[26,1,1,""],construct:[26,1,1,""],get_variable_value:[26,1,1,""],get_weights:[26,1,1,""],parallel_predict:[26,1,1,""],predict:[26,1,1,""],reset_accumulated_gradients:[26,1,1,""],set_variable_value:[26,1,1,""],set_weights:[26,1,1,""],train_on_batch:[26,1,1,""]},"rl_coach.architectures.network_wrapper":{NetworkWrapper:[26,0,1,""]},"rl_coach.architectures.network_wrapper.NetworkWrapper":{apply_gradients_and_sync_networks:[26,1,1,""],apply_gradients_to_global_network:[26,1,1,""],apply_gradients_to_online_network:[26,1,1,""],collect_savers:[26,1,1,""],parallel_prediction:[26,1,1,""],set_is_training:[26,1,1,""],sync:[26,1,1,""],train_and_sync_networks:[26,1,1,""],update_online_network:[26,1,1,""],update_target_network:[26,1,1,""]},"rl_coach.base_parameters":{AgentParameters:[3,0,1,""],DistributedTaskParameters:[0,0,1,""],NetworkParameters:[26,0,1,""],PresetValidationParameters:[0,0,1,""],TaskParameters:[0,0,1,""],VisualizationParameters:[0,0,1,""]},"rl_coach.core_types":{ActionInfo:[27,0,1,""],Batch:[27,0,1,""],EnvResponse:[27,0,1,""],Episode:[27,0,1,""],Transition:[27,0,1,""]},"rl_coach.core_types.Batch":{actions:[27,1,1,""],game_overs:[27,1,1,""],goals:[27,1,1,""],info:[27,1,1,""],info_as_list:[27,1,1,""],n_step_discounted_rewards:[27,1,1,""],next_states:[27,1,1,""],rewards:[27,1,1,""],shuffle:[27,1,1,""],size:[27,1,1,""],slice:[27,1,1,""],states:[27,1,1,""]},"rl_coach.core_types.Episode":{get_first_transition:[27,1,1,""],get_last_transition:[27,1,1,""],get_transition:[27,1,1,""],get_transitions_attribute:[27,1,1,""],insert:[27,1,1,""],is_empty:[27,1,1,""],length:[27,1,1,""],update_discounted_rewards:[27,1,1,""]},"rl_coach.data_stores.nfs_data_store":{NFSDataStore:[28,0,1,""]},"rl_coach.data_stores.s3_data_store":{S3DataStore:[28,0,1,""]},"rl_coach.environments.carla_environment":{CarlaEnvironment:[29,0,1,""]},"rl_coach.environments.control_suite_environment":{ControlSuiteEnvironment:[29,0,1,""]},"rl_coach.environments.doom_environment":{DoomEnvironment:[29,0,1,""]},"rl_coach.environments.environment":{Environment:[29,0,1,""]},"rl_coach.environments.environment.Environment":{action_space:[29,1,1,""],close:[29,1,1,""],get_action_from_user:[29,1,1,""],get_available_keys:[29,1,1,""],get_goal:[29,1,1,""],get_random_action:[29,1,1,""],get_rendered_image:[29,1,1,""],goal_space:[29,1,1,""],handle_episode_ended:[29,1,1,""],last_env_response:[29,1,1,""],phase:[29,1,1,""],render:[29,1,1,""],reset_internal_state:[29,1,1,""],set_goal:[29,1,1,""],state_space:[29,1,1,""],step:[29,1,1,""]},"rl_coach.environments.gym_environment":{GymEnvironment:[29,0,1,""]},"rl_coach.environments
.starcraft2_environment":{StarCraft2Environment:[29,0,1,""]},"rl_coach.exploration_policies.additive_noise":{AdditiveNoise:[30,0,1,""]},"rl_coach.exploration_policies.boltzmann":{Boltzmann:[30,0,1,""]},"rl_coach.exploration_policies.bootstrapped":{Bootstrapped:[30,0,1,""]},"rl_coach.exploration_policies.categorical":{Categorical:[30,0,1,""]},"rl_coach.exploration_policies.continuous_entropy":{ContinuousEntropy:[30,0,1,""]},"rl_coach.exploration_policies.e_greedy":{EGreedy:[30,0,1,""]},"rl_coach.exploration_policies.exploration_policy":{ExplorationPolicy:[30,0,1,""]},"rl_coach.exploration_policies.exploration_policy.ExplorationPolicy":{change_phase:[30,1,1,""],get_action:[30,1,1,""],requires_action_values:[30,1,1,""],reset:[30,1,1,""]},"rl_coach.exploration_policies.greedy":{Greedy:[30,0,1,""]},"rl_coach.exploration_policies.ou_process":{OUProcess:[30,0,1,""]},"rl_coach.exploration_policies.parameter_noise":{ParameterNoise:[30,0,1,""]},"rl_coach.exploration_policies.truncated_normal":{TruncatedNormal:[30,0,1,""]},"rl_coach.exploration_policies.ucb":{UCB:[30,0,1,""]},"rl_coach.filters.action":{AttentionDiscretization:[33,0,1,""],BoxDiscretization:[33,0,1,""],BoxMasking:[33,0,1,""],FullDiscreteActionSpaceMap:[33,0,1,""],LinearBoxToBoxMap:[33,0,1,""],PartialDiscreteActionSpaceMap:[33,0,1,""]},"rl_coach.filters.observation":{ObservationClippingFilter:[32,0,1,""],ObservationCropFilter:[32,0,1,""],ObservationMoveAxisFilter:[32,0,1,""],ObservationNormalizationFilter:[32,0,1,""],ObservationRGBToYFilter:[32,0,1,""],ObservationReductionBySubPartsNameFilter:[32,0,1,""],ObservationRescaleSizeByFactorFilter:[32,0,1,""],ObservationRescaleToSizeFilter:[32,0,1,""],ObservationSqueezeFilter:[32,0,1,""],ObservationStackingFilter:[32,0,1,""],ObservationToUInt8Filter:[32,0,1,""]},"rl_coach.filters.reward":{RewardClippingFilter:[32,0,1,""],RewardNormalizationFilter:[32,0,1,""],RewardRescaleFilter:[32,0,1,""]},"rl_coach.memories.backend.redis":{RedisPubSubBackend:[35,0,1,""]},"rl_coach.memories.episodic":{EpisodicExperienceReplay:[34,0,1,""],EpisodicHRLHindsightExperienceReplay:[34,0,1,""],EpisodicHindsightExperienceReplay:[34,0,1,""],SingleEpisodeBuffer:[34,0,1,""]},"rl_coach.memories.non_episodic":{BalancedExperienceReplay:[34,0,1,""],ExperienceReplay:[34,0,1,""],PrioritizedExperienceReplay:[34,0,1,""],QDND:[34,0,1,""],TransitionCollection:[34,0,1,""]},"rl_coach.orchestrators.kubernetes_orchestrator":{Kubernetes:[36,0,1,""]},"rl_coach.spaces":{ActionSpace:[37,0,1,""],AttentionActionSpace:[37,0,1,""],BoxActionSpace:[37,0,1,""],CompoundActionSpace:[37,0,1,""],DiscreteActionSpace:[37,0,1,""],GoalsSpace:[37,0,1,""],ImageObservationSpace:[37,0,1,""],MultiSelectActionSpace:[37,0,1,""],ObservationSpace:[37,0,1,""],PlanarMapsObservationSpace:[37,0,1,""],Space:[37,0,1,""],VectorObservationSpace:[37,0,1,""]},"rl_coach.spaces.ActionSpace":{clip_action_to_space:[37,1,1,""],contains:[37,1,1,""],is_valid_index:[37,1,1,""],sample:[37,1,1,""],sample_with_info:[37,1,1,""]},"rl_coach.spaces.GoalsSpace":{DistanceMetric:[37,0,1,""],clip_action_to_space:[37,1,1,""],contains:[37,1,1,""],distance_from_goal:[37,1,1,""],get_reward_for_goal_and_state:[37,1,1,""],goal_from_state:[37,1,1,""],is_valid_index:[37,1,1,""],sample:[37,1,1,""],sample_with_info:[37,1,1,""]},"rl_coach.spaces.ObservationSpace":{contains:[37,1,1,""],is_valid_index:[37,1,1,""],sample:[37,1,1,""]},"rl_coach.spaces.Space":{contains:[37,1,1,""],is_valid_index:[37,1,1,""],sample:[37,1,1,""]}},objnames:{"0":["py","class","Python class"],"1":["py","method","Python 
method"]},objtypes:{"0":"py:class","1":"py:method"},terms:{"100x100":33,"160x160":32,"1_0":[15,25],"1st":30,"20x20":33,"210x160":32,"2nd":30,"50k":41,"9_amd64":44,"abstract":[38,42],"boolean":[3,27,37,51],"break":40,"case":[0,3,5,22,26,27,30,37,50,51,52],"class":[0,1,2,3,4,5,6,7,8,10,11,12,13,15,17,19,20,21,22,23,24,25,26,27,28,29,30,32,33,34,35,36,37,38,39,41,45,51],"default":[0,30,52],"enum":[26,29,37],"export":[0,26,44],"final":[8,13,16,17,19,23,41],"float":[3,4,5,6,7,8,10,11,12,13,15,19,22,23,24,26,27,29,30,32,33,34,37,38,51],"function":[0,1,3,6,7,8,11,13,26,29,30,37,38,39,41,43,51],"import":[6,18,30,34,39,50,52],"int":[0,3,4,5,6,7,10,15,20,22,24,25,27,29,30,32,33,34,37,51],"long":43,"new":[0,3,7,8,11,12,13,22,23,26,27,33,41,42,49,50,51],"return":[0,3,8,10,11,13,14,19,22,23,25,26,27,29,30,32,34,37,38,39,41,50,51],"short":[0,41],"static":26,"super":[38,39],"switch":[0,40],"true":[0,3,4,5,6,7,8,11,12,13,22,23,25,26,27,29,30,33,34,37,51],"try":[4,46,50],"while":[0,5,6,8,9,10,11,12,13,26,29,40,43,50,52],AWS:44,Adding:[18,49],And:[39,50],But:[40,50],Doing:50,For:[0,1,2,3,4,7,10,14,15,16,17,20,22,23,26,27,29,30,31,32,33,37,38,39,41,42,43,44,46,51,52],Has:26,Its:51,NFS:[28,44],One:[24,50,52],That:40,The:[0,1,2,3,4,5,6,7,8,10,11,12,13,14,15,19,20,21,22,23,24,25,26,27,28,29,30,32,33,34,35,36,37,38,40,41,42,43,44,46,47,49,50,51,52],Then:[4,7,8,13,14,21,23],There:[7,11,26,30,31,38,39,43,52],These:[1,2,3,24,29,36,42,43,44],Use:[1,2,8,13,21,22],Used:30,Uses:50,Using:[8,13,14,16,17,44],Will:26,With:[30,49],__init__:[29,38,39],_index:[5,20],_nois:13,_render:39,_restart_environment_episod:39,_take_act:39,_update_st:39,a2c:50,a3c:[10,20,40,50],a_i:22,a_t:[4,5,6,8,12,13,14,15,16,17,19,20,21,23,25],a_valu:5,abl:[33,50],about:[3,27,41,51,52],abov:[8,12,13,26,41],abs:[20,34],absolut:30,acceler:21,accept:29,access:[26,38,44],accord:[0,3,4,5,6,8,12,13,14,20,26,27,30,37,40,41,43,51],accordingli:[22,37,41,52],account:[4,7,11,22,23,30],accumul:[3,4,5,6,10,20,22,25,26,32,50,51],accumulate_gradi:26,accumulated_gradi:26,accur:50,acer:[3,50],acer_ag:6,aceralgorithmparamet:6,achiev:[0,4,7,29,32,34,37,46,50,52],acquir:12,across:[10,19,40],act:[3,4,8,13,14,24,37,38,41,51],action:[1,2,3,15,16,17,18,19,20,23,24,25,26,27,29,30,31,34,38,39,41,43,51],action_idx:39,action_penalti:[8,13],action_spac:[29,30],action_space_s:26,action_valu:[27,30],actioninfo:[3,37,41,51],actionspac:[30,37],actiontyp:39,activ:[8,13,26],actor:[3,6,7,8,11,13,30,43,50],actor_critic_ag:5,actorcriticag:38,actorcriticalgorithmparamet:5,actual:[4,5,15,16,17,24,25,30,33,34],adam:[7,26],adam_optimizer_beta1:26,adam_optimizer_beta2:26,adapt:[7,11],add:[8,9,13,21,27,30,32,39,41,44,50],add_rendered_image_to_env_respons:0,added:[0,4,6,7,10,11,22,30,34,38],adding:[3,11,30,38,51],addit:[3,26,27,29,30,32,34,37,39,40,41,43,49,50,51],addition:[26,29,32,38,39,41,46,47,52],additional_fetch:26,additional_simulator_paramet:[29,39],additionali:40,additive_nois:30,additivenoiseparamet:30,address:13,advanc:[25,49],advantag:[3,5,7,11,18,30],affect:[0,14,26],aforement:[16,17,23],after:[0,3,8,10,11,12,20,21,23,25,26,27,29,32,37,51,52],again:30,against:3,agent:[0,1,2,4,5,6,7,8,10,11,12,13,15,17,19,20,21,22,23,24,25,26,27,29,30,31,32,33,37,39,40,43,45,46,49,50,51],agent_param:42,agent_paramet:[3,26,51],agentparamet:[3,26,38],aggreg:41,ahead:[4,50],aim:30,algorithm:[3,27,30,38,40,41,42,46,48,49,51],algorithmparamet:[3,38],all:[0,3,10,14,22,23,26,27,29,30,32,33,37,38,39,40,41,42,43,44,47,51,52],all_action_prob:27,allow:[0,3,4,13,18,26,27,29,30,31,32,33,34,40,41,42,43,49,50,51,5
2],allow_brak:29,allow_duplicates_in_batch_sampl:34,allow_no_action_to_be_select:37,along:[22,29,30,47],alpha:[6,19,23,34],alreadi:[22,27,39,50],also:[5,6,7,22,23,26,29,37,38,40,46,50,52],altern:[29,39,47],alwai:[26,30,33],amazon:44,amazonaw:44,amount:[8,10,13,19,23,30,41,50],analysi:40,analyz:40,ani:[3,26,27,29,33,34,38,41,42,43,44,51],anoth:[3,18,26,31,51],answer:50,anymor:[3,51],api:[29,43,47,49],appear:[3,51],appli:[0,3,5,8,10,13,20,26,27,30,32,50,51],applic:50,apply_and_reset_gradi:26,apply_gradi:26,apply_gradients_and_sync_network:26,apply_gradients_every_x_episod:[5,10,20],apply_gradients_to_global_network:26,apply_gradients_to_online_network:26,apply_stop_condit:0,appropri:44,approx:[8,12,13],approxim:[12,13,43,50],apt:44,arbitrari:32,architectur:[3,18,38,49,51],architecture_num_q_head:30,area:33,arg:[3,26,44,51],argmax_a:[16,19,23],argument:[3,15,25,26,29,37,41,51],around:[26,27,43],arrai:[3,26,27,29,32,37,39,51],art:[3,45],artifact:44,artifici:34,arxiv:[20,34],aspect:[30,32,40],assign:[0,2,5,6,26,30],assign_kl_coeffici:26,assign_op:26,assum:[27,30,32,34,50],async:[26,42],async_train:26,asynchron:[5,20,26],atari:[17,29,32,44,52],atari_a3c:52,atari_dqn:52,ath:18,atom:[15,24,25],attach:29,attempt:0,attend:33,attent:33,attentionactionspac:33,attentiondiscret:33,attribut:27,attribute_nam:27,author:[29,46,47],auto_select_all_armi:29,autoclean:44,automat:[26,52],autonom:[29,47,49],autoremov:44,auxiliari:[29,47],avail:[4,26,27,29,30,40,42,44,49,50,52],averag:[6,7,11,26,40,41],avg:6,aws:44,axes:[32,40],axi:[32,40],axis_origin:32,axis_target:32,back:[7,42],backend:[26,42,44,49,52],background:52,backpropag:22,backward:26,balanc:2,band:40,bar:6,base1:44,base64:44,base:[7,11,12,19,21,23,29,34,38,41,44,47,50,51],base_paramet:[0,3,26,29,30],baselin:50,basic:[10,27,42,52],batch:[1,2,3,4,5,6,8,10,11,12,13,14,15,16,17,18,20,23,24,25,26,34,38,41,51],batch_siz:26,bc_agent:1,bcalgorithmparamet:1,becaus:41,becom:[8,13,42],been:[18,27,32,46,50],befor:[0,3,5,11,13,25,26,27,32,41,42,43,44,50,51],begin:[0,4,41],behav:37,behavior:[3,32,34,38,46,50,51,52],being:[3,38,49,50,51],bellman:[15,24,25],benchmark:[40,48,49,50],best:[50,52],beta1:26,beta2:26,beta:[6,8,10,13,34],beta_entropi:[5,6,7,10,11],better:[13,18,50],between:[0,1,2,3,6,7,8,10,11,12,13,15,19,20,22,24,25,26,27,29,30,33,34,37,38,40,41,43,49,50],bfg:[7,11],bia:[6,50],big:[11,15,25],bin:[33,44],binari:14,bind:26,binomi:14,bit:32,blizzard:47,blob:[29,32],block:49,blog:49,boilerpl:41,bolling:40,bool:[0,3,4,5,6,7,8,11,12,13,22,23,25,26,27,29,30,34,37,51],boost:[44,50],bootstrap:[3,5,6,7,8,11,13,19,20,22,23,25,27,50],bootstrap_total_return_from_old_polici:[22,27],both:[3,7,26,29,30,33,50,51],bound:[6,7,11,15,25,30,37,50],box2d:44,box:[30,33,37],boxactionspac:33,boxdiscret:33,boxmask:33,breakout:52,breakoutdeterminist:[29,52],bring:11,bucket:44,buffer:[1,2,3,6,12,14,15,16,17,20,22,23,24,25,34,41,50,51,52],build:[31,49,50],builder:44,built:[38,41],bullet:6,button:[40,52],c51:15,cach:44,cadenc:13,calcul:[3,4,5,6,7,8,10,11,13,14,15,16,17,19,20,22,23,24,25,26,27,30,34,38,51],call:[0,3,10,20,26,27,29,41,51],call_memori:[3,51],callabl:37,camera:[29,39],camera_height:29,camera_width:29,cameratyp:[29,39],can:[0,2,3,5,6,7,8,11,12,13,23,26,27,29,30,31,32,33,37,38,39,40,41,43,47,49,51,52],cannot:[3,51],carla:[32,47],carla_environ:29,carlaenviron:29,carlaenvironmentparamet:29,carlo:[3,23],cartpol:[29,39],cartpole_a3c:52,cartpole_clippedppo:[44,52],cartpole_dqn:52,categor:[3,5,6,50],categori:[31,32],categorical_dqn_ag:15,categoricaldqnalgorithmparamet:15,caus:[32,40],
cdot:[5,7,8,10,12,13,14,15,16,17,19,21,23,25],central:[26,40],certainti:30,chain:[8,13],challeng:41,chang:[0,3,6,7,8,11,13,14,18,20,23,30,41,44,51],change_phas:30,channel:[29,32],channels_axi:37,check:[0,3,27,37,51],checkpoint:[0,3,26,28,42,44,51,52],checkpoint_dir:[3,51],checkpoint_prefix:[3,51],checkpoint_restore_dir:[0,52],checkpoint_restore_path:0,checkpoint_save_dir:0,checkpoint_save_sec:0,child:26,chmod:44,choic:[38,44],choos:[3,18,23,30,31,33,37,38,41,43,50,51,52],choose_act:[3,38,41,51],chosen:[3,12,23,30,33,38,51],chunk:11,cil:50,cil_ag:2,cilalgorithmparamet:2,classic_control:44,clean:[29,38,44],cli:44,clip:[3,6,8,11,13,26,32,37,50],clip_action_to_spac:37,clip_critic_target:[8,13],clip_gradi:26,clip_high:30,clip_likelihood_ratio_using_epsilon:[7,11],clip_low:30,clip_max:32,clip_min:32,clipbyglobalnorm:26,clipped_ppo_ag:7,clippedppoalgorithmparamet:7,clipping_high:32,clipping_low:32,clone:[3,50],close:29,cmake:44,coach:[0,3,26,28,29,30,31,35,36,38,41,45,46,47,50,52],code:[39,41,50],coeffici:[7,11,26,30,34],collect:[3,7,10,11,20,26,27,34,41,46,49,51,52],collect_sav:[3,26,51],color:32,com:44,combin:[25,43,49,50],comma:0,command:[41,44,52],common:[38,40,44,52],commun:42,compar:[0,11,18,50],complet:[27,30,41],complex:[26,31,41,43,50,52],compon:[3,15,25,26,30,36,38,41,49,51,52],composit:[3,51],compositeag:[3,51],comput:[26,30],concat:26,concentr:41,condit:[0,3],confid:30,config:[29,52],configur:[3,5,10,38,44,51],confus:41,connect:[12,26],connectionist:10,consecut:[8,13,22],consequ:[20,30],consid:[5,6,30,33,40],consist:[8,13,29,32,33,37,41,47],constant:6,constantli:52,constantschedul:34,constrain:33,construct:[12,26,34],consumpt:32,contain:[0,1,2,3,14,26,27,29,37,39,41,51,52],content:44,contin:42,continu:[1,2,5,8,9,10,13,21,30,31,33,37,46],continuous_entropi:30,continuous_exploration_policy_paramet:30,contribut:[4,49],control:[2,3,5,6,7,8,11,26,30,32,40,47,49,50,51],control_suite_environ:29,controlsuiteenviron:29,conveni:[40,52],converg:10,convers:31,convert:[3,27,30,32,37,41,43,51],convolut:[26,43],coordin:33,copi:[8,12,13,14,15,16,17,19,20,21,23,24,25,26,44],core:[3,49,51],core_typ:[3,27,29,37,51,52],correct:[3,6,50],correctli:26,correl:30,correpond:27,correspond:[2,3,4,15,16,26,27,30,32,37,39,51],could:[3,26,37,44,51],count:19,countabl:33,counter:[3,51],counterpart:43,cpu:[0,26],crd:52,creat:[3,20,26,32,39,51,52],create_network:[3,51],create_target_network:26,creation:[3,51],credenti:44,critic:[3,6,7,8,11,13,30,43,50],crop:[32,33],crop_high:32,crop_low:32,cross:[1,15,25],csv:0,ctrl:40,cuda:44,cudnn7:44,curl:44,curr_stat:[3,38,51],current:[0,1,2,3,4,6,7,8,9,10,11,12,13,14,16,17,19,21,22,23,24,26,27,29,30,32,33,37,38,41,49,50,51],custom:[29,30,37,38,41],custom_reward_threshold:29,cycl:41,dai:52,dashboard:[0,3,44,49,51],data:[0,3,10,20,26,34,41,42,44,46,49,50,51,52],data_stor:[28,44],dataset:[3,7,11,50,51,52],date:[22,43,50,52],dcp:[44,52],ddpg:50,ddpg_agent:8,ddpgalgorithmparamet:8,ddqn:[19,23,50],deal:50,debug:[0,40,49],decai:[5,7,11,26],decid:[0,3,4,29,30,38,51],decis:[3,51],declar:0,decod:44,dedic:26,deep:[0,3,5,12,14,16,18,20,21,25,51],deepmind:47,def:[38,39],default_act:37,default_input_filt:39,default_output_filt:39,defin:[0,3,5,6,7,10,11,12,20,22,23,26,27,29,30,32,33,34,37,38,39,41,42,43,46,47,51,52],definit:[3,26,29,37,39,41,51],delai:[3,50],delta:[6,15,22,25],demonstr:[1,2,52],dens:30,densiti:19,depecr:0,depend:[0,3,6,26,32,34,37,39,44,46,50,51],deploi:[36,42],depth:29,descend:50,describ:[3,15,24,32,34,38,41,44,51],descript:[3,33,37,45,52],design:[41,44,49],desir:[33,38],destabi
l:10,detail:[3,27,45,47,49,52],determin:[2,3,22,27,34,51],determinist:[3,12,50],dev:44,develop:[41,46],deviat:[10,11,30,32,40],devic:26,dfp:50,dfp_agent:4,dfpalgorithmparamet:4,dict:[3,4,26,27,29,30,37,51],dict_siz:34,dictat:4,dictionari:[2,3,26,27,29,34,37,38,51],did:29,differ:[0,1,2,3,4,5,6,7,10,11,14,18,26,29,30,32,37,38,39,40,42,43,49,50,51],differenti:18,difficult:[40,46],difficulti:52,dimens:[27,29,32,33],dimension:[11,33],dir:[0,3,51,52],direct:[3,29,51],directli:[3,5,41,43,51],directori:[0,26,38,40,44,52],disabl:52,disable_fog:29,disappear:29,disassembl:50,discard:[27,32],discount:[8,10,11,13,19,22,23,25,26,27,50],discret:[1,2,4,7,11,14,15,16,17,18,19,20,22,23,24,25,30,31,32,33,37,41],disentangl:41,disk:0,displai:[0,40],distanc:37,distance_from_go:37,distance_metr:37,distancemetr:37,distil:[3,51],distribut:[5,6,10,11,12,15,24,25,26,28,30,35,36,37,43,49,50,52],distributed_coach:42,distributed_coach_synchronization_typ:42,distributedcoachsynchronizationtyp:42,divereg:[7,11],diverg:[6,7,11,25],dnd:[0,22,50],dnd_key_error_threshold:22,dnd_size:22,do_action_hindsight:34,doc:44,docker:44,dockerfil:44,document:47,doe:[14,26,32],doesn:42,doing:[7,11,31],domain:43,don:[4,30,40,50],done:[0,3,7,10,11,13,29,32,39,51,52],doom:[29,39,44,47],doom_basic_bc:52,doom_basic_dqn:52,doom_environ:[29,39,52],doomenviron:[29,39],doomenvironmentparamet:[39,52],doominputfilt:39,doomlevel:29,doomoutputfilt:39,doubl:[3,19,25],doubli:51,down:[26,29,50],download:44,dpkg:44,dqn:[3,19,20,25,29,30,32,33,41,43,50],dqn_agent:[17,51],dqnagent:51,dqnalgorithmparamet:17,drive:[2,29,47,49],driving_benchmark:29,due:32,duel:[3,25],dump:[0,3,51],dump_csv:0,dump_gif:0,dump_in_episode_sign:0,dump_mp4:0,dump_one_value_per_episod:[3,51],dump_one_value_per_step:[3,51],dump_parameters_document:0,dump_signals_to_csv_every_x_episod:0,dure:[3,6,7,10,11,12,14,22,30,40,41,51,52],dynam:[40,46,50],e_greedi:30,each:[0,1,2,3,4,5,6,7,10,11,12,14,16,17,18,20,22,23,24,26,27,29,30,31,32,33,34,37,38,40,41,42,43,44,46,50,51],eas:40,easi:[39,40,49],easier:43,easili:[30,52],echo:44,effect:[0,3,6,7,20,32,41,51],effici:[6,41,50],either:[0,3,5,20,26,30,37,40,43,52],element:[3,14,26,32,37],elf:44,embbed:26,embed:[3,22,26,51],embedd:[26,43],embedding_merger_typ:26,embeddingmergertyp:26,emploi:50,empti:27,emul:6,enabl:[26,43,52],encod:[32,37],encourag:[21,23,41],end:[2,3,10,25,27,29,32,51,52],enforc:33,engin:[29,47],enough:[4,6,22],ensembl:[30,50],ensur:[6,26],enter:[3,51,52],entir:[11,19,22,25,30,33,41],entri:[22,41],entropi:[1,5,6,7,10,11,12,15,25,30,50],enumer:37,env:[27,44],env_param:39,env_respons:[3,51],enviorn:29,environ:[0,3,4,6,18,26,27,30,31,32,33,37,38,41,44,46,48,49,51],environmentparamet:[29,39],envrespons:[0,3,29,51],episod:[0,3,4,5,10,11,14,19,20,25,29,30,38,39,40,41,42,51,52],episode_max_tim:29,episodic_hindsight_experience_replai:34,epoch:[7,51],epsilon:[7,30,34],epsilon_schedul:30,equal:2,equat:[8,12,13,16,17,20,24],error:[13,26,50],escap:52,especi:18,essenti:[20,26,33,39,41,44],estim:[3,5,7,11,14,19,23,30,51],estimate_state_value_using_ga:[5,7,11],eta:[7,11],etc:[0,3,26,29,31,37,38,47,51],evalu:[0,3,12,26,27,30,41,51],evaluate_onli:0,evaluation_epsilon:30,evaluation_nois:30,even:[18,26,29,39,40,41,50],everi:[0,5,6,8,10,12,13,14,15,16,17,19,20,21,23,24,25,52],exact:[22,30,46],exactli:26,exampl:[2,3,4,26,27,29,30,31,32,33,37,38,39,41,43,51,52],except:[20,27],execut:[27,40,41],exercis:13,exhibit:[3,38,51],exist:[22,26],exit:[3,51],expand_dim:27,expect:[0,3,30,46,51],experi:[0,6,8,11,12,13,25,29,34,35,40,41,42,44,49,50,52],experiment_path
:[0,29],experiment_suit:29,experimentsuit:29,expert:[1,2,27,50],exploit:[30,41],explor:[3,4,5,6,7,8,9,11,13,14,19,21,22,38,41,49,50],exploration_polici:30,explorationparamet:[3,30,38],exponenti:[6,7,11,25,26],expor:3,export_onnx_graph:0,expos:[40,43,49],extend:[29,30,47],extens:[29,47],extent:52,extern:0,extra:[3,26,27,43,51],extract:[3,21,22,27,32,37,40,41,51],factor:[8,10,11,13,23,25,26,27,30,32],failur:0,faithfulli:40,fake:37,fals:[0,3,8,13,26,27,29,30,33,34,37,39,51],far:[11,32,41,46],faster:[18,50],featur:[8,13,29,43,49,50],feature_minimap_maps_to_us:29,feature_screen_maps_to_us:29,fetch:[26,27],fetched_tensor:26,few:[10,14,15,16,17,19,23,24,25,30,39],field:[46,49],file:[0,3,38,41,51,52],fill:[27,39],filter:[0,3,49,51],find:[16,40,47,49],finish:[22,52],finit:33,first:[0,8,11,13,14,22,24,25,26,27,32,41,43],fit:[13,37],flag:[0,3,26,27,29,51],flexibl:42,flicker:29,flow:[31,49],follow:[2,3,5,6,8,10,12,13,15,16,17,20,21,22,24,25,26,27,29,30,34,38,39,44,46,50,51],footprint:32,forc:[26,29,33,39],force_cpu:26,force_environment_reset:[29,39],force_int_bin:33,forced_attention_s:37,form:[4,20,37,50],format:38,formul:[5,6],forward:[26,30],found:[3,45,52],frac:[6,7,12,15,25],fraction:[7,11],frame:[0,29],frame_skip:29,framework:[0,3,26,38,49,51],framework_typ:0,free:[29,47],freeglut3:44,freez:[3,51],freeze_memori:[3,51],frequenc:13,from:[0,1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,29,30,31,32,33,35,37,38,39,40,41,42,43,44,46,47,49,51,52],full:[3,10,19,33,51],fulldiscreteactionspacemap:33,fulli:26,func:[3,51],futur:[0,3,10,27,50],future_measurements_weight:4,gae:[5,7,11],gae_lambda:[5,7,11],game:[3,27,29,47,49,51,52],game_ov:27,gamma:[5,6,8,12,13,14,15,16,17,19,20,21,22,23,25],gap:[23,50],gather:42,gaussian:[11,12,13,30],gener:[0,5,7,11,14,26,29,30,34,37,38,44,52],general_network:38,get:[3,4,7,8,9,10,11,13,14,16,17,19,21,23,26,27,29,30,37,41,43,44,46,51],get_act:30,get_action_from_us:29,get_available_kei:29,get_first_transit:27,get_goal:29,get_last_env_respons:29,get_last_transit:27,get_output_head:38,get_predict:[3,51],get_random_act:29,get_rendered_imag:[29,39],get_reward_for_goal_and_st:37,get_state_embed:[3,51],get_transit:27,get_transitions_attribut:27,get_variable_valu:26,get_weight:26,gfortran:44,gif:0,git:44,github:[39,44,46,49],given:[0,1,2,3,4,5,8,10,11,13,26,27,29,30,32,33,34,37,38,41,51],given_weight:26,global:[3,26,43,51],global_network:26,glx:44,goal:[1,2,3,4,6,26,27,29,34,41,43,50,51],goal_from_st:37,goal_nam:37,goal_spac:29,goal_vector:4,goals_spac:34,goalsspac:[34,37],goaltorewardconvers:37,going:31,good:[39,40],gpu:[0,26],gracefulli:52,gradient:[3,5,6,7,11,20,22,26,38,50,51],gradientclippingmethod:26,gradients_clipping_method:26,granular:34,graph:0,graphmanag:41,grayscal:[32,37],greedili:41,group:40,grow:25,guidelin:50,gym:[44,47],gym_environ:[29,52],gymenviron:29,gymenvironmentparamet:39,hac:50,had:46,hand:[18,32,41,50],handl:4,handle_episode_end:[3,29,51],handling_targets_after_episode_end:4,handlingtargetsafterepisodeend:4,hard:[40,50],harder:40,has:[0,3,18,22,23,27,30,32,41,43,46,50,51],has_glob:26,has_target:26,hat:[6,7,15,25],have:[0,3,4,6,26,29,30,32,33,34,41,43,46,51],head:[1,2,3,5,6,10,14,18,21,22,26,30,38,43,51],headparamet:26,heads_paramet:26,health_gath:29,heat:6,heatup:[30,41],help:[23,27,40,41,50],here:[39,41],heurist:[11,30],hide:43,hierarch:[37,41],hierarchi:[3,41,50,51],high:[8,11,13,32,33,37,40],high_i:37,high_kl_penalty_coeffici:11,high_x:37,higher:11,highest:[5,6,10,23,30,32,33,37],highli:[0,39,50],hindsight:[9,34,50],hindsight_goal_se
lection_method:34,hindsight_transitions_per_regular_transit:34,hindsightgoalselectionmethod:34,hold:[14,26,27,34,40,41,43],horizont:[44,49,52],host:44,hostnam:0,hot:37,how:[4,7,11,30,42,44,50,52],hrl:34,html:44,http:[20,34,44],hub:44,huber:24,huber_loss_interv:24,human:[0,29],human_control:29,hyper:[38,46],hyperparamet:38,ident:26,identifi:[26,37],ies:51,ignor:29,imag:[0,26,29,32,33,37,39,43,52],image1:44,imit:[3,27,45,50],impact:26,implement:[3,7,11,26,28,29,30,34,38,39,42,46,50,52],impli:52,implment:36,importance_weight:26,importance_weight_trunc:6,importantli:41,improv:[5,18,25,29,41,50],improve_reward_model:51,includ:[0,3,4,29,31,32,36,43,47,51,52],increas:[11,23,32,50],increment:[3,51],index:[0,2,27,29,32,33,34,37],indic:37,inf:[32,37],infer:[3,26,29,51],infinit:[0,50],info:[3,14,27,37,39,51],info_as_list:27,inform:[3,4,20,27,29,31,40,41,44,47,51],inherit:[3,38,39],init_environment_dependent_modul:[3,51],initi:[3,4,11,23,26,27,38,41,49,51],initial_feed_dict:26,initial_kl_coeffici:11,initialize_session_dependent_compon:[3,51],innov:50,input:[1,2,3,4,8,13,14,16,17,19,21,22,23,26,31,37,41,43,51],input_embedders_paramet:26,input_high:32,input_low:32,input_space_high:33,input_space_low:33,inputembedderparamet:26,inputfilt:41,insert:[22,27],inspect:0,instal:[44,52],instanc:[3,35,37,43],instanti:[3,29,41],instead:[0,3,7,20,23,26,32,33,41,50,51],instruct:52,intact:[14,46],integ:[0,32,33],integr:[39,41,42,49],intel:49,intend:[10,26,30,41],interact:[27,41,42,49,52],interest:[26,40],interfac:[29,40,42,47],intermedi:22,intern:[3,10,20,26,27,31,41,51,52],intersect:50,interv:24,intro:49,introduc:50,invers:[29,47],invok:41,involv:38,is_empti:27,is_valid_index:37,item:27,iter:[3,5,6,8,11,13,18,26,51],its:[0,3,15,25,26,27,30,37,41,44,50,51,52],itself:[26,37,52],job:0,job_typ:0,joint:29,json:0,jump:[4,33],jupyt:38,just:[3,11,23,25,39,41,43,51,52],kapa:24,keep:[17,27,32,52],kei:[2,22,26,27,29,34,38,40,44,50,52],key_error_threshold:34,key_width:34,keyboard:[29,52],keyword:26,kl_coeffici:26,kl_coefficient_ph:26,know:[3,50,51,52],knowledg:[3,41,51],known:[27,40,46,50],kubeconfig:36,kubernet:44,kubernetes_orchestr:36,kubernetesparamet:36,kwarg:[26,29],l2_norm_added_delta:22,l2_regular:26,lack:40,lamb:30,lambda:[5,7,11,30],lane:2,larg:[30,32,47],larger:26,last:[4,6,11,22,27,29,32],last_env_respons:29,lastli:41,later:[0,3,26,51,52],latest:[20,22,41,44],layer:[26,30,34,41,43],lazi:[27,32],lazystack:32,lbfg:26,ld_library_path:44,lead:30,learn:[0,3,4,5,6,8,9,10,12,14,15,16,17,18,21,24,25,26,27,29,30,32,40,41,43,45,46,47,50,51],learn_from_batch:[3,38,41,51],learner:26,learning_r:[26,34],learning_rate_decay_r:26,learning_rate_decay_step:26,least:[43,50],leav:[11,14],left:[2,6,12,50],length:[4,5,7,11,20,22,26,27],less:[18,50],level:[0,3,26,29,39,51,52],levelmanag:[3,41,51],levelselect:29,libatla:44,libav:44,libavformat:44,libbla:44,libboost:44,libbz2:44,libfluidsynth:44,libgl1:44,libglew:44,libgm:44,libgstream:44,libgtk2:44,libgtk:44,libjpeg:44,liblapack:44,libnotifi:44,libopen:44,libosmesa6:44,libportmidi:44,librari:[29,44,47],libsdl1:44,libsdl2:44,libsdl:44,libsm:44,libsmpeg:44,libswscal:44,libtiff:44,libwebkitgtk:44,libwildmidi:44,like:[12,29,37,41,43,44,50],likelihood:[7,11],line:[3,41,51,52],linear:33,linearboxtoboxmap:33,linearli:33,list:[0,3,4,26,27,29,30,32,33,37,38,51,52],load:[0,3,40,42,51,52],load_memory_from_fil:[3,51],load_memory_from_file_path:52,local:[3,43,44,51],locat:[24,27,32,50],log:[0,3,5,6,10,12,51],log_to_screen:[3,51],logger:[0,3,51],look:[39,44],loop:41,loss:[1,2,3,6,7,10,11,15,16,17,24,25,2
6,30,38,43,51],lot:[30,40,46,50],low:[8,11,13,32,33,37],low_i:37,low_x:37,lower:[0,34,41],lowest:[32,33,37],lstm:43,lumin:32,lvert:[6,15,25],lvl:52,mai:[0,26,45,52],main:[3,38,41,43,45,51,52],mainli:42,major:30,make:[0,3,26,29,38,40,44,46,50,51],manag:[3,26,42,44,51],mandatori:[37,39,43],mani:[3,18,45,46],manner:[11,19,20,23,32,41],manual:44,map:[3,26,29,31,32,33,37,38,51],mark:27,markdown:51,mask:[14,33],masked_target_space_high:33,masked_target_space_low:33,master:[3,41,44,51],match:[2,22,26,37],mathbb:[5,6],mathcal:13,mathop:5,max:[5,6,13,15,20,25,32],max_a:[14,17,22,23],max_action_valu:27,max_episodes_to_achieve_reward:0,max_fps_for_human_control:0,max_kl_diverg:6,max_over_num_fram:29,max_simultaneous_selected_act:37,max_siz:34,max_spe:29,maxim:[4,16],maximum:[0,12,15,17,22,23,27,29,30,32,34,50],mean:[0,2,7,8,9,10,11,12,13,21,26,30,32,33,37,40,50],meant:43,measur:[3,4,26,29,32,37,39,50,51],measurements_nam:37,mechan:[31,42,46,52],memor:50,memori:[3,25,27,32,38,41,42,44,49,50,51],memory_backend:44,memorygranular:34,memoryparamet:[3,38],merg:[26,29],mesa:44,method:[0,5,7,11,13,20,26,32,34],metric:[0,37,40],mid:6,middlewar:[22,26,43],middleware_paramet:26,middlewareparamet:26,midpoint:24,might:[3,10,29,38,43,51],min:[6,7,13,15,23,25],min_:[12,13],min_reward_threshold:0,mind:52,minim:[2,4,15],minimap_s:29,minimum:[0,7,13,32],mitig:50,mix:[3,7,11,22,23,50],mixedmontecarloalgorithmparamet:19,mixer1:44,mixtur:[19,26],mjkei:44,mjpro150:44,mjpro150_linux:44,mkdir:44,mmc:[19,50],mmc_agent:19,mode:[23,26,28,35,36,41,42,44,52],model:[0,19,21,26,49,51,52],modif:50,modifi:6,modul:[3,38,41,42,51],modular:[38,41,43,49],monitor:42,mont:[3,23],monte_carlo_mixing_r:[19,23],more:[3,8,13,20,26,32,38,40,41,43,44,49,51,52],moreov:40,most:[3,10,22,26,27,30,43,46,50,51,52],mostli:[32,41],motiv:41,move:[6,7,11,32,40,46],mp4:0,mse:[2,6,16,17,24],much:[7,11,41,50],mujoco:[29,33,39,44,47],mujoco_kei:44,mujoco_pi:44,multi:[11,26,37,43],multiarrai:[3,51],multidimension:37,multipl:[4,7,11,20,26,29,30,32,33,34,37,40,41,46,49,52],multipli:[4,10,26,32],multiselect:33,multitask:[29,47],must:[26,32,37,46],mxnet:52,n_step:[22,25,27,34],n_step_discounted_reward:27,n_step_q_ag:20,nabla:[6,8,13],nabla_:[8,12,13],nabla_a:[8,13],naf:50,naf_ag:21,nafalgorithmparamet:21,name:[3,26,27,29,32,37,38,44,51,52],namespac:36,nasm:44,nativ:[0,29,39,47],native_rend:0,navig:3,ndarrai:[3,26,27,29,30,32,33,37,39,51],nearest:22,neat:40,nec:[0,50],nec_ag:22,necalgorithmparamet:22,necessari:[3,22,26,51],necessarili:32,need:[0,3,6,25,26,29,30,37,38,41,46,50,51,52],neg:[4,32],neighbor:22,neon_compon:38,nervanasystem:44,network:[0,3,26,30,38,41,46,49,50,51,52],network_input_tupl:26,network_nam:[3,51],network_param:30,network_paramet:26,network_wrapp:[3,26,51],networkparamet:[3,26,30,38],networkwrapp:[3,51],neural:[3,19,26,43,46],never:26,new_value_shift_coeffici:[22,34],new_weight:26,newli:[23,39,50],next:[0,3,8,13,16,17,21,23,24,27,29,41,51,52],next_stat:27,nfs_data_stor:28,nfsdatastoreparamet:28,nice:52,no_accumul:26,node:[26,43],nois:[8,9,13,21,30,41,50],noise_as_percentage_from_action_spac:30,noise_schedul:30,noisi:[10,25,30],non_episod:34,none:[0,3,7,8,11,13,26,27,29,30,32,33,37,39,51],norm:26,norm_unclipped_grad:26,norm_unclippsed_grad:26,normal:[3,4,10,30,31,32,37],note:[22,26,30,51],notebook:38,notic:[26,50],notori:[40,46,50],now:[7,39],nstepqalgorithmparamet:20,nth:25,num_act:[22,34,37],num_bins_per_dimens:33,num_class:34,num_consecutive_playing_step:[3,8,13,51],num_consecutive_training_step:[3,51],num_gpu:0,num_neighbor:34,num_predicted_st
eps_ahead:4,num_speedup_step:29,num_steps_between_copying_online_weights_to_target:[8,12,13,20],num_steps_between_gradient_upd:[5,6,10,20],num_task:0,num_training_task:0,num_transitions_to_start_replai:6,num_work:0,number:[0,2,4,5,6,8,10,12,13,14,15,20,22,24,25,26,27,29,30,32,33,34,40,47,51,52],number_of_knn:22,numpi:[3,26,27,29,30,32,33,37,39,51],nvidia:44,object:[0,3,25,26,29,30,32,34,41,51],observ:[0,3,4,11,26,27,29,31,39,41,51],observation_reduction_by_sub_parts_name_filt:32,observation_space_s:26,observation_space_typ:29,observation_stat:32,observation_typ:29,observationspac:37,observationspacetyp:29,observationtyp:29,off:[3,6,12,42,50,51],offer:[29,47],often:[40,41,43],old:[7,11,26,50],old_weight:26,onc:[0,7,10,11,14,15,16,17,19,20,23,24,25,26,37,52],one:[0,3,6,18,22,23,26,27,29,30,31,34,37,39,40,43,50,51],ones:[39,50],onli:[0,3,4,5,6,7,10,11,14,15,17,18,20,22,24,25,26,27,29,30,32,33,39,41,50,51,52],onlin:[8,12,13,14,15,16,17,19,20,21,22,23,24,25,26,41,43],online_network:26,onnx:[0,26],onto:31,open:[0,29,47],openai:[44,47],opencv:44,oper:[23,26,32],optim:[3,4,6,26,45,50],optimization_epoch:7,optimizer_epsilon:26,optimizer_typ:26,option:[6,10,26,29,33,37,38,40,42,43,52],orchestr:[42,44,49],order:[0,3,5,6,7,8,10,11,12,13,16,17,18,20,21,22,23,24,26,27,31,32,33,40,41,43,46,50,51],org:[20,34],origin:[20,32,33,46],ornstein:[8,9,30],other:[0,2,10,18,23,26,29,31,32,34,40,41,50],otherwis:[11,14,26,29,30,37],ou_process:30,our:7,out:[2,16,17,30,31,33,40,44,49,50,52],outcom:[30,41],output:[0,4,6,8,13,14,15,21,22,26,30,31,32,37,38,43],output_0_0:26,output_observation_spac:32,outputfilt:41,outsid:[4,30],over:[3,7,10,11,20,22,25,26,27,30,32,33,40,41,50,51],overestim:[8,13,50],overfit:11,overhead:0,overlai:40,overrid:[3,51],override_existing_kei:34,overriden:38,overview:41,overwhelm:41,overwritten:26,own:[26,38],p_j:[15,25],page:[3,46],pair:[0,37],pal:[23,50],pal_ag:23,pal_alpha:23,palalgorithmparamet:23,paper:[5,10,12,15,20,22,24,29,34,46],parallel:[6,26,40,43],parallel_predict:26,param:[3,26,27,28,29,30,35,36,38,39,51],paramet:[2,3,4,5,6,7,8,10,11,12,13,15,19,20,22,23,24,25,26,27,28,29,30,32,33,34,35,36,37,38,39,46,49,51,52],parameter_nois:30,parameters_server_host:0,parent:[3,26,51],parent_path_suffix:[3,26,51],parmet:3,pars:41,part:[0,3,14,26,27,30,32,33,42,43,46,50,51],part_nam:32,partial:33,partialdiscreteactionspacemap:33,particular:4,particularli:[29,30,37,46,50],pass:[0,4,8,9,13,21,22,26,29,30,31,39,40,41,43,52],patamet:22,patchelf:44,patchelf_0:44,path:[0,3,26,38,39,44,51,52],pattern:41,pdf:34,penal:[7,8,11,13],penalti:11,pendulum_hac:39,pendulum_with_go:39,pendulumwithgo:39,per:[0,3,4,37,38,41,51],percentag:30,percentil:30,perceptron:43,perform:[0,3,6,26,27,32,34,39,40,41,50,51],period:[43,52],persist:3,persistent_advantage_learn:23,perspect:15,phase:[3,6,7,8,9,11,12,13,26,29,30,41,51],phi:[15,25],physic:[29,47],pi_:[6,7,12],pick:[12,29],pickl:52,pickledreplaybuff:52,pip3:44,pip:44,pixel:29,place:[33,40,41],placehold:[26,30],plai:[0,3,10,14,16,17,20,30,38,40,51],plain:43,planarmap:29,planarmapsobservationspac:32,platform:[29,47],pleas:[20,46],plu:26,plugin:44,point:[32,37,41,42],polici:[1,3,4,5,6,9,12,14,20,21,22,28,38,41,42,43,44,45,49,50,51],policy_gradient_rescal:[5,7,10,11],policy_gradients_ag:10,policygradientalgorithmparamet:10,policygradientrescal:[5,7,10,11],policyoptimizationag:38,popul:41,popular:[29,47],port:0,posit:[4,32],possibl:[2,3,4,22,30,33,37,40,43,49,50,51,52],post:[31,49],post_training_command:[3,51],power:[29,47],ppo:[7,11,50],ppo_ag:11,ppoalgorithmparamet:11,pre:[8,13
,30,31],predefin:[14,23,30,52],predict:[1,2,3,5,6,7,8,11,12,13,14,15,16,17,23,24,25,26,30,43,50,51],prediction_typ:[3,51],predictiontyp:[3,51],prefect:50,prefer:26,prefix:[3,51],prep:44,prepar:[3,51],prepare_batch_for_infer:[3,51],present:[18,22,26,29,32,50],preset:[0,5,38,39,41,42,44,52],press:[40,52],prevent:[8,11,13,41],previou:32,previous:[11,26],print:[0,3,52],print_networks_summari:0,priorit:[25,34],prioriti:[25,34],privat:37,probabilit:[5,6],probabl:[3,5,6,10,14,15,25,27,30,38,50,51],problem:50,procedur:6,process:[0,3,8,9,26,30,31,32,33,38,40,41,43,46,49,51],produc:26,progress:26,project:[15,25],propag:7,propagate_updates_to_dnd:22,properti:[3,26,27,29,34,38,39,44,51],proport:34,provid:[26,42],proxi:41,proxim:3,pub:[35,36,44],publish:46,purpos:[0,3,10],pursuit:2,push:[3,51],pybullet:[29,47],pygam:[0,44],pytest:44,python3:44,python:[29,34,38,44,47,49],q_i:12,qr_dqn_agent:24,quad:6,qualiti:29,quantil:[3,50],quantileregressiondqnalgorithmparamet:24,queri:[22,26,41,50],question:50,quit:40,r_i:[5,20],r_t:[4,6,7,25],rainbow:[3,38,50],rainbow_ag:38,rainbow_dqn_ag:25,rainbowag:38,rainbowagentparamet:38,rainbowalgorithmparamet:38,rainbowdqnalgorithmparamet:25,rainbowexplorationparamet:38,rainbowmemoryparamet:38,rainbownetworkparamet:38,rais:[3,27,51],ramp:[38,41],random:[0,20,29,30,37,41,46],random_initialization_step:29,randomli:[27,41],rang:[4,7,8,11,13,15,25,29,32,33,37,50],rare:22,rate:[0,6,19,22,26,29,43],rate_for_copying_weights_to_target:[6,8,12,13],rather:[4,12,40],ratio:[6,7,11,19,32],ratio_of_replai:6,raw:[29,47],reach:[0,11,37],read:[0,28],read_csv_tri:0,readabl:41,readm:44,real:3,reason:[32,46],rebuild_on_every_upd:34,receiv:[26,27],recent:[3,25,26,50,51],recommend:39,redi:[35,36,44],redispubsub:44,redispubsubmemorybackendparamet:35,reduc:[1,2,10,11,23,26,32,41,50],reduct:32,reduction_method:32,reductionmethod:32,redund:32,refer:[2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,42,44],referenc:3,regard:[3,51],region:[6,50],regist:[3,51],register_sign:[3,51],registri:44,regress:[2,3,50],regula:[6,7,11],regular:[5,7,10,11,20,22,26,30,33,34,50],regularli:26,reinforc:[3,5,8,9,10,12,15,16,17,18,20,23,24,25,29,30,40,41,43,45,46,47,50],relat:[26,44],relationship:50,releas:[0,49,50],relev:[3,14,30,32,51],remov:[0,32],render:[0,3,29,39],reorder:32,repeat:[29,41],replac:[30,32,34,44],replace_mse_with_huber_loss:26,replai:[1,2,3,6,8,12,13,14,15,16,17,20,22,23,24,25,34,41,50,51,52],replay_buff:52,replicated_devic:26,repo:39,repositori:49,repres:[0,7,11,15,25,26,27,29,30,33,37,52],represent:43,reproduc:[41,46],request:[3,26,51],requir:[3,26,28,30,32,40,43,44,50,51],requires_action_valu:30,rescal:[4,5,7,10,11,26,31,32],rescale_factor:32,research:[29,46,47],reset:[3,22,26,29,30,39,51],reset_accumulated_gradi:26,reset_evaluation_st:[3,51],reset_gradi:26,reset_internal_st:[3,29,51],resourc:[42,44],respect:[8,13,27,29],respons:[3,27,29,41,51],rest:[26,27,33,44],restart:39,restor:[0,3,51],restore_checkpoint:[3,51],result:[3,4,13,15,16,17,18,24,25,26,32,33,46,50,51,52],ret:6,retrac:6,retri:0,retriev:[22,34],return_additional_data:34,reus:41,reusabl:43,reward:[0,1,2,3,4,8,10,13,19,20,25,26,27,29,31,37,39,40,41,50,51],reward_test_level:0,reward_typ:37,rgb:[29,32,37],rho:[6,8,13],rho_t:6,right:[2,3,6,12,30,33,40,50,51],rl_coach:[0,1,2,3,4,5,6,7,8,10,11,12,13,15,17,19,20,21,22,23,24,25,26,27,28,29,30,32,33,34,35,36,37,39,44,51,52],rms_prop_optimizer_decai:26,rmsprop:26,roboschool:[29,47],robot:[29,37,47,49],roboti:44,robust:51,rollout:[28,35,36,42,44,52],root:[40,44],rule:[8,13,14],run:[0,3,
4,8,10,11,12,13,14,16,17,22,23,26,29,30,32,51,52],run_off_policy_evalu:[3,51],run_pre_network_filter_for_infer:[3,51],runphas:[3,51],runtim:44,rvert:[15,25],rvert_2:6,s3_bucket_nam:44,s3_creds_fil:44,s3_data_stor:28,s3_end_point:44,s3datastoreparamet:28,s_t:[4,5,6,8,12,13,14,15,16,17,19,20,21,23,25],sac:50,sai:50,same:[3,4,7,10,13,19,20,23,26,29,33,34,40,43,46,50,51],sampl:[1,2,3,5,6,8,10,11,12,13,14,15,16,17,19,20,23,24,25,26,30,34,37,41,44,51],sample_with_info:37,satur:[8,13],save:[0,3,25,26,30,44,51,52],save_checkpoint:[3,51],saver:[3,26,51],savercollect:[3,26,51],scale:[4,10,26,32,40,44,49,52],scale_down_gradients_by_number_of_workers_for_sync_train:26,scale_measurements_target:4,scaler:26,schedul:[7,30,34,41,42,44,52],scheme:[5,30,41,50],schulman:11,sci:44,scienc:46,scipi:[32,44],scope:26,scratch:50,scratchpad:0,screen:[3,29,39,52],screen_siz:29,script:41,second:[0,26,40,50,52],section:[44,45,47],see:[3,29,32,44,46,47,50,51,52],seed:[0,29,46],seen:[4,22,23,29,32,41,46,50],segment:[29,37],select:[5,14,22,26,27,30,32,33,37,39,40,41,49,52],self:[3,26,38,39,51],send:[39,43],separ:[0,3,18,32,33,43,45,50],separate_actions_for_throttle_and_brak:29,seper:10,sequenti:[4,27,34],serv:[7,10,43],server:0,server_height:29,server_width:29,sess:[3,26,51],session:[3,26,51],set:[0,2,3,4,5,6,7,8,11,13,15,16,17,19,22,23,25,26,27,29,30,32,33,37,38,42,46,47,49,50,51,52],set_environment_paramet:[3,51],set_goal:29,set_incoming_direct:[3,51],set_is_train:26,set_sess:[3,51],set_variable_valu:26,set_weight:26,setup:[3,44,51],setup_logg:[3,51],setuptool:44,sever:[0,3,7,10,11,14,26,29,30,32,38,39,40,41,43,47,50,51,52],shape:[26,32,37],share:[0,3,26,34,43,51],shared_memory_scratchpad:0,shared_optim:26,shift:[33,41],shine:40,should:[0,3,4,7,11,14,20,23,26,27,29,32,34,37,38,39,42,51,52],should_dump:0,shouldn:14,show:46,shown:46,shuffl:[3,27,51],side:[3,51],sigma:[13,30],signal:[3,41,51],signal_nam:[3,51],significantli:18,sim:[6,12],similar:[7,18,20,27,29,33,50],simpl:[10,34,38,39,43,49,50,52],simplest:50,simplif:50,simplifi:[7,40,43],simul:[29,39,47,52],simultan:7,sinc:[3,7,8,10,13,20,22,23,25,26,30,32,51],singl:[3,4,5,6,7,11,14,18,19,20,26,27,29,30,33,37,40,41,43,51],size:[26,27,30,32,33,34,37],skill:50,skip:[29,41],slave:[3,51],slice:27,slow:[26,50,52],slower:[0,13,18,26],slowli:[8,13],small:[7,13,22,34],smaller:30,smooth:[40,50],soft:[3,8,11,13,21,50],soft_actor_critic_ag:12,softactorcriticalgorithmparamet:12,softmax:[26,30],softmax_temperatur:26,softwar:44,solut:50,solv:[32,39,47,49],some:[0,3,11,26,27,30,32,38,39,40,43,46,50,51,52],sort:24,sourc:[0,1,2,3,4,5,6,7,8,10,11,12,13,15,17,19,20,21,22,23,24,25,26,27,28,29,30,32,33,34,35,36,37,39,44,47,51],space:[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,29,30,31,32,33,34,41,49,51],spacesdefinit:[3,26,51],spatial:50,spawn:[42,44],special:18,specif:[0,3,14,18,22,26,27,38,41,52],specifi:[0,26,29,30,32,39,42,52],speed:[26,32,50],speedup:52,spread:[32,33],squar:32,squeeze_list:26,squeeze_output:26,src:44,stabil:[6,20,26,50],stabl:[43,50],stack:[3,31,32,37,51],stack_siz:[26,32],stacking_axi:32,stage:43,stai:46,standard:[7,10,11,14,30,32,40],starcraft2_environ:29,starcraft2environ:29,starcraft:[37,47],starcraftobservationtyp:29,start:[3,6,8,11,12,13,18,23,27,32,33,39,44,51],state:[1,2,3,4,5,6,7,8,9,10,11,12,13,14,16,17,18,19,20,21,22,23,24,25,26,27,29,32,34,37,38,39,41,43,45,50,51],state_key_with_the_class_index:[2,34],state_spac:29,state_valu:27,statist:[3,10,32,49,51],std:12,stdev:30,steep:30,step:[0,3,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,
24,25,26,27,29,30,32,38,39,40,41,50,51,52],stepmethod:[8,12,13,20],stochast:[12,41,50],stop:[0,29],store:[0,3,22,25,27,29,32,34,40,41,42,44,49,51,52],store_transitions_only_when_episodes_are_termin:25,str:[0,2,3,4,20,26,27,29,30,32,33,37,51],strategi:[29,47],stream:[18,42],strict:46,string:[0,26,29],structur:[0,3,27,34,38,41,51],stuff:26,style:30,sub:[33,34,35,36,37,38,41,44,52],sub_spac:37,subset:[40,46,50],subtract:23,succeed:29,success:[0,29,50],suffer:40,suffici:27,suffix:[3,26,51],suggest:38,suit:[0,47],suitabl:[42,52],sum:[4,7,10,19,26,27],sum_:[5,12,15,19,20,22,25],summari:[0,3,51],supervis:50,suppli:[3,51],support:[0,3,26,29,30,40,43,44,45,47,49,52],sure:[0,3,44,46,51],surrog:7,swig:44,swingup:29,symbol:26,sync:[3,26,41,42,51],synchron:[0,26,41,43],t_max:[10,20],tag:44,take:[0,3,10,11,18,22,23,26,29,30,31,39,40,41,51],taken:[1,2,4,5,6,7,8,11,12,13,15,18,22,23,24,25,26,27,29,30],tanh:[8,13],tar:44,target:[0,1,2,3,4,5,6,7,8,11,12,13,14,15,16,17,19,20,21,22,23,24,25,26,29,32,33,37,38,41,43,51],target_act:33,target_kl_diverg:11,target_network:26,target_success_r:29,targets_horizon:20,task:[0,1,2,29,32,38,40,47],task_index:0,tau:12,td3:50,td3_agent:13,td3algorithmparamet:13,techniqu:[7,11,49,50],technolog:42,teh:26,temperatur:[26,30],temperature_schedul:30,tensor:[3,26,51],tensorboard:0,tensorflow:[0,3,26,51,52],tensorflow_support:26,term:[6,7,11],termin:[3,8,13,27,41,51],test:[0,3,5,6,8,9,10,11,12,13,26,38,46,49,52],test_using_a_trace_test:0,text:6,textrm:41,than:[0,3,11,13,26,30,40,43,51],thei:[3,22,23,26,30,40,41,42,50,51,52],them:[4,5,10,20,26,27,29,32,37,39,40,43],therefor:[0,8,13,26,31,50],theta:[6,7,8,12,13,15,25,30],theta_:[6,7],thi:[0,3,4,5,6,7,8,10,11,13,14,18,20,22,25,26,27,29,30,31,32,33,34,35,37,38,39,40,41,42,43,44,46,50,51,52],thing:40,those:[0,3,8,13,14,16,17,18,22,27,30,33,41,43,45,50,51],thousand:[11,14,15,16,17,19,23,24,25],thread:26,three:[3,42,43,44,45],threshold:[11,22,32],through:[0,3,4,8,9,10,11,13,14,22,23,26,38,39,41,43,51],tild:[8,12,13],time:[0,4,23,26,30,33,34,40,43,50],time_limit:39,timestep:[4,10],timid:44,tmp:0,togeth:[3,20,27,41,51],toggl:40,too:11,tool:[40,44,50],top:[26,29,31,32,34,39,40,50],torqu:29,total:[0,3,10,11,19,22,23,27,34,38,40,50,51],total_loss:26,total_return:27,trace:0,trace_max_env_step:0,trace_test_level:0,tradeoff:30,train:[0,3,18,26,30,35,36,38,39,40,41,42,43,46,49,50,51],train_and_sync_network:26,train_on_batch:26,train_to_eval_ratio:34,trainer:[28,42],transfer:[29,35,47],transit:[1,2,3,4,5,6,8,10,11,12,13,15,16,17,20,22,23,24,25,34,38,41,42,51],transition_idx:27,tri:50,trick:46,tricki:40,trigger:[29,44],truncat:6,truncated_norm:30,trust:[6,50],ttf2:44,tune:30,tupl:[1,2,3,8,13,26,27,29,34,37,38],turn:[2,50],tutori:[38,39],tweak:[3,51],twin:3,two:[8,10,13,20,26,29,30,31,32,33,37,39,42,43,50,52],txt:44,type:[0,3,10,18,26,29,32,37,38,41,43,49,50,51,52],typic:[7,11,26,50,52],ubuntu16:44,uhlenbeck:[8,9,30],uint8:32,unbound:37,uncertain:30,uncertainti:30,unchang:11,unclip:[3,38,51],uncorrel:20,undeploi:42,under:[3,26,38,52],underbrac:5,understand:52,unifi:7,uniformli:[29,30,33,37],union:[3,27,29,30,33,37,51],uniqu:26,unit:40,unlik:11,unmask:33,unnecessari:0,unshar:[3,51],unsign:32,unspecifi:26,unstabl:[40,46],until:[0,6,10,11,22,25,30],unus:26,unzip:44,updat:[3,6,7,8,10,11,12,13,14,15,16,17,18,20,21,22,23,24,25,26,27,30,38,39,40,41,43,44,50,51],update_discounted_reward:27,update_filter_internal_st:[3,51],update_log:[3,51],update_online_network:26,update_step_in_episode_log:[3,51],update_target_network:26,update_transition_before_adding_to_re
play_buff:[3,51],upgrad:44,upon:[3,5,38,51],upper:[6,30],usag:[33,49],use:[0,1,2,3,4,5,6,8,9,10,12,13,14,16,17,21,26,27,28,29,30,32,33,34,37,38,39,41,43,44,49,50,51,52],use_accumulated_reward_as_measur:4,use_cpu:0,use_deterministic_for_evalu:12,use_full_action_spac:29,use_kl_regular:[7,11],use_non_zero_discount_for_terminal_st:[8,13],use_separate_networks_per_head:26,use_target_network_for_evalu:[8,13],use_trust_region_optim:6,used:[0,2,3,5,6,7,8,10,11,12,13,14,15,19,20,21,22,23,24,26,29,30,32,33,34,35,36,38,39,41,42,43,46,51,52],useful:[0,3,4,25,26,30,32,37,46,50,51,52],user:[26,29,30,40,41,44],userguid:44,uses:[0,1,7,11,18,27,28,30,36,41,42,44,46,50,52],using:[0,3,5,6,7,8,10,11,12,13,16,17,19,20,21,22,23,25,26,28,29,30,32,35,38,39,40,42,47,50,51,52],usr:44,usual:[32,41],util:[3,40,51],v_max:15,v_min:15,val:[3,37,51],valid:[0,37],valu:[0,2,3,4,5,6,7,8,11,12,13,14,15,16,17,18,20,21,22,23,25,26,27,29,30,32,33,34,37,38,41,43,44,45,50,51],valuabl:40,value_targets_mix_fract:[7,11],valueexcept:[3,51],valueoptimizationag:38,van:4,vari:43,variabl:[26,29,44],variable_scop:26,varianc:[10,30,40,50],variant:[30,34,50],variou:[3,27,34,49],vector:[3,4,8,9,11,13,14,26,29,32,37,39,43,50,51],vectorobservationspac:32,verbos:29,veri:[0,7,8,10,13,18,22,40,50,52],version:[7,11,27],versu:26,vert:12,vertic:26,via:[2,14],video:[0,3,29],video_dump_method:0,view:40,viewabl:[3,51],visit:46,visual:[0,3,29,47,49],visualization_paramet:29,visualizationparamet:[3,29],vizdoom:[44,47],vote:30,wai:[3,7,11,30,33,39,41,43,49,50,51,52],wait:[5,26,42],walk:39,want:[3,4,25,26,32,33,34,51],warn:[30,32,33],wasn:27,weather_id:29,websit:[29,49],weight:[4,5,6,7,8,11,12,13,14,15,16,17,19,20,21,22,23,24,25,26,30,41,43,50],well:[22,26,30,37,50],went:11,were:[4,15,16,17,18,22,24,25,26,27,33,46],west:44,wget:44,what:[11,50],whatev:[3,51],when:[0,3,4,5,6,7,8,9,10,11,12,13,22,26,27,28,29,30,32,35,36,38,39,40,51,52],whenev:42,where:[2,3,4,5,6,7,11,14,15,18,20,22,23,25,26,27,29,30,32,33,37,40,50,51],whether:30,which:[0,1,2,3,5,6,7,8,10,11,12,13,14,18,20,21,22,23,24,26,27,28,29,30,32,34,35,36,37,38,39,40,41,42,43,45,46,47,49,50,51,52],who:41,why:[40,41],window:[32,33],wise:32,within:[0,7,11,21,30,37,40],without:[5,11,33,34,40,50,52],won:[4,26],wont:26,work:[3,20,26,30,32,33,40,41,50,51,52],workaround:0,workdir:44,worker:[0,20,26,28,32,34,35,36,40,42,43,44,50,52],worker_devic:26,worker_host:0,wors:50,would:[26,44,50],wrap:[29,32,41,47],wrapper:[3,26,27,29,37,43,51],write:[0,3,51],written:[3,25,28,51],www:44,xdist:44,y_t:[8,12,13,14,16,17,19,21,22,23],year:50,yet:[18,39],you:[4,32,34,38,39,44,49,52],your:[38,39,44,52],yuv:32,z_i:[15,25],z_j:[15,25],zero:[2,13,16,17],zip:44,zlib1g:44},titles:["Additional Parameters","Behavioral Cloning","Conditional Imitation Learning","Agents","Direct Future Prediction","Actor-Critic","ACER","Clipped Proximal Policy Optimization","Deep Deterministic Policy Gradient","Hierarchical Actor Critic","Policy Gradient","Proximal Policy Optimization","Soft Actor-Critic","Twin Delayed Deep Deterministic Policy Gradient","Bootstrapped DQN","Categorical DQN","Double DQN","Deep Q Networks","Dueling DQN","Mixed Monte Carlo","N-Step Q Learning","Normalized Advantage Functions","Neural Episodic Control","Persistent Advantage Learning","Quantile Regression DQN","Rainbow","Architectures","Core Types","Data Stores","Environments","Exploration Policies","Filters","Input Filters","Output Filters","Memories","Memory Backends","Orchestrators","Spaces","Adding a New Agent","Adding a New Environment","Coach Dashboard","Control 
Flow","Distributed Coach - Horizontal Scale-Out","Network Design","Usage - Distributed Coach","Algorithms","Benchmarks","Environments","Features","Reinforcement Learning Coach","Selecting an Algorithm","test","Usage"],titleterms:{"final":22,"function":21,"new":[38,39],"switch":52,Adding:[38,39],Using:39,acer:6,across:50,action:[4,5,6,7,8,9,10,11,12,13,14,21,22,33,37,50],actioninfo:27,actor:[5,9,12],addit:[0,52],additivenois:30,advantag:[21,23],agent:[3,38,41,52],algorithm:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,19,20,21,22,23,24,25,45,50,52],api:39,architectur:26,attentionactionspac:37,backend:35,balancedexperiencereplai:34,batch:27,behavior:1,benchmark:46,between:52,blizzard:29,boltzmann:30,bootstrap:[14,30],boxactionspac:37,build:44,can:50,carla:29,carlo:19,categor:[15,30],choos:[4,5,6,7,8,9,10,11,12,13,14,21,22],clip:7,clone:[1,44],coach:[39,40,42,44,49],collect:50,compar:40,compoundactionspac:37,condit:2,config:44,contain:44,continu:[7,11,12,50],continuousentropi:30,control:[22,29,41],copi:43,core:27,creat:44,critic:[5,9,12],dashboard:40,data:28,deep:[8,13,17,52],deepmind:29,delai:13,demonstr:50,descript:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25],design:43,determinist:[8,13],direct:4,discret:[5,6,10,50],discreteactionspac:37,distribut:[42,44],distributedtaskparamet:0,doe:50,doubl:16,dqn:[14,15,16,18,24],duel:18,dump:52,egreedi:30,environ:[29,39,47,50,52],envrespons:27,episod:[22,27,34],episodicexperiencereplai:34,episodichindsightexperiencereplai:34,episodichrlhindsightexperiencereplai:34,evalu:52,experiencereplai:34,explor:30,explorationpolici:30,featur:48,file:44,filter:[31,32,33],flag:52,flow:41,framework:52,from:50,futur:4,gener:18,gif:52,goal:37,gradient:[8,10,13],graph:41,greedi:30,gym:[29,39],have:50,hierarch:9,horizont:42,human:[50,52],imag:44,imageobservationspac:37,imit:[2,52],implement:44,input:32,interfac:44,keep:43,kubernet:36,learn:[2,20,23,49,52],level:41,manag:41,memori:[34,35],mix:19,mont:19,more:50,multi:52,multipl:50,multiselectactionspac:37,network:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,43],networkwrapp:26,neural:22,nfsdatastor:28,node:[50,52],non:34,normal:21,observ:[32,37],observationclippingfilt:32,observationcropfilt:32,observationmoveaxisfilt:32,observationnormalizationfilt:32,observationreductionbysubpartsnamefilt:32,observationrescalesizebyfactorfilt:32,observationrescaletosizefilt:32,observationrgbtoyfilt:32,observationsqueezefilt:32,observationstackingfilt:32,observationtouint8filt:32,openai:[29,39],optim:[7,11],orchestr:36,ouprocess:30,out:42,output:33,pain:50,parallel:50,paramet:0,parameternois:30,persist:23,plai:52,planarmapsobservationspac:37,polici:[7,8,10,11,13,30],predict:4,prerequisit:44,presetvalidationparamet:0,prioritizedexperiencereplai:34,process:50,proxim:[7,11],push:44,qdnd:34,quantil:24,rainbow:25,redispubsubbackend:35,regress:24,reinforc:49,render:52,repositori:44,reward:32,rewardclippingfilt:32,rewardnormalizationfilt:32,rewardrescalefilt:32,run:[40,44],s3datastor:28,sampl:50,scale:42,select:50,signal:40,simul:50,singl:52,singleepisodebuff:34,soft:12,solv:50,space:[37,50],starcraft:29,statist:40,step:20,store:[14,28],structur:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25],suit:29,support:42,sync:43,synchron:42,task:50,taskparamet:0,test:51,thread:52,through:52,track:40,train:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,19,20,21,22,23,24,25,52],transit:[14,27],transitioncollect:34,truncatednorm:30,twin:13,type:[27,42],ucb:30,usag:[44,52],vectorobservationspac:37,visual:[40,52],
visualizationparamet:0,vizdoom:29,you:50,your:50}}) \ No newline at end of file diff --git a/docs/selecting_an_algorithm.html b/docs/selecting_an_algorithm.html index b5844f3..f38ace9 100644 --- a/docs/selecting_an_algorithm.html +++ b/docs/selecting_an_algorithm.html @@ -391,6 +391,16 @@ $(document).ready(function() { and therefore it is able to use a replay buffer in order to improve sample efficiency. +
                                      + + TD3 +
+ Very similar to DDPG, i.e. an actor-critic for continuous action spaces that uses a replay buffer in
+ order to improve sample efficiency. TD3 uses two critic networks to mitigate the overestimation of the
+ Q state-action value, delays the actor updates relative to the critic updates in order to increase
+ stability, and adds noise to the target actions when training the critic in order to smooth out the
+ critic's predictions.
                                      +
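To make the comparison with DDPG concrete, here is a minimal sketch of how the TD3 critic target described above can be computed. This is not Coach's TD3 implementation; the batch keys and function names are illustrative, and the actor would additionally be updated only once every few critic updates.

```python
import numpy as np

def td3_critic_target(batch, target_actor, target_critic_1, target_critic_2,
                      gamma=0.99, noise_std=0.2, noise_clip=0.5, action_bound=1.0):
    """Illustrative sketch of the TD3 target: target-policy smoothing + clipped double-Q."""
    # Target-policy smoothing: perturb the target actor's action with clipped noise.
    next_actions = target_actor(batch['next_states'])
    noise = np.clip(np.random.normal(0.0, noise_std, size=next_actions.shape),
                    -noise_clip, noise_clip)
    smoothed_actions = np.clip(next_actions + noise, -action_bound, action_bound)

    # Clipped double-Q: take the minimum over the two target critics to fight overestimation.
    q1 = target_critic_1(batch['next_states'], smoothed_actions)
    q2 = target_critic_2(batch['next_states'], smoothed_actions)
    min_q = np.minimum(q1, q2)

    # One-step bootstrapped target; terminal transitions do not bootstrap.
    return batch['rewards'] + gamma * (1.0 - batch['game_overs']) * min_q
```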
                                      PPO diff --git a/docs/test.html b/docs/test.html index 20146f7..0798236 100644 --- a/docs/test.html +++ b/docs/test.html @@ -190,10 +190,10 @@
                                      -class rl_coach.agents.dqn_agent.DQNAgent(agent_parameters, parent: Union[LevelManager, CompositeAgent] = None)[source]
                                      +class rl_coach.agents.dqn_agent.DQNAgent(agent_parameters, parent: Union[LevelManager, CompositeAgent] = None)[source]
                                      -act(action: Union[None, int, float, numpy.ndarray, List] = None) → rl_coach.core_types.ActionInfo
                                      +act(action: Union[None, int, float, numpy.ndarray, List] = None) → rl_coach.core_types.ActionInfo

Given the agent's current knowledge, decide on the next action to apply to the environment

                                      Parameters
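As a rough usage illustration of where act() sits in the interaction cycle (schematic only; in Coach the graph manager and level manager normally drive this loop, and `env` / `agent` here stand for an already-constructed Environment and DQNAgent):

```python
# Schematic act/observe cycle, not how presets are normally run.
env.reset_internal_state()
for _ in range(1000):
    action_info = agent.act()                     # ActionInfo holding the chosen action
    env_response = env.step(action_info.action)   # apply the action to the environment
    agent.observe(env_response)                   # distill and store the new observation
    if env_response.game_over:
        agent.handle_episode_ended()
        env.reset_internal_state()
```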
                                      @@ -207,7 +207,7 @@
                                      -call_memory(func, args=())
                                      +call_memory(func, args=())

                                      This function is a wrapper to allow having the same calls for shared or unshared memories. It should be used instead of calling the memory directly in order to allow different algorithms to work both with a shared and a local memory.
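For instance, a hypothetical agent subclass would query the memory through this wrapper rather than reaching into `self.memory` directly (the memory method name used below, `num_transitions`, is the usual experience-replay one and is shown only for illustration):

```python
from rl_coach.agents.dqn_agent import DQNAgent

class LoggingDQNAgent(DQNAgent):
    """Hypothetical subclass used only to illustrate call_memory."""

    def post_training_commands(self) -> None:
        # Going through the wrapper keeps the same code working with either a
        # shared memory (scratchpad) or a local replay buffer.
        num_transitions = self.call_memory('num_transitions')
        print('transitions stored so far: {}'.format(num_transitions))
        super().post_training_commands()
```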

                                      @@ -226,7 +226,7 @@ both with a shared and a local memory.

                                      -choose_action(curr_state)
                                      +choose_action(curr_state)

Choose an action to act with in the current episode being played. Different behavior might be exhibited when training or testing.

                                      @@ -241,7 +241,7 @@ training or testing.

                                      -collect_savers(parent_path_suffix: str) → rl_coach.saver.SaverCollection
                                      +collect_savers(parent_path_suffix: str) → rl_coach.saver.SaverCollection

                                      Collect all of agent’s network savers :param parent_path_suffix: path suffix of the parent of the agent (could be name of level manager or composite agent) @@ -250,7 +250,7 @@ training or testing.

                                      -create_networks() → Dict[str, rl_coach.architectures.network_wrapper.NetworkWrapper]
                                      +create_networks() → Dict[str, rl_coach.architectures.network_wrapper.NetworkWrapper]

                                      Create all the networks of the agent. The network creation will be done after setting the environment parameters for the agent, since they are needed for creating the network.

                                      @@ -261,9 +261,16 @@ for creating the network.

                                      +
                                      +
                                      +freeze_memory()
                                      +

                                      Shuffle episodes in the memory and freeze it to make sure that no extra data is being pushed anymore. +:return: None

                                      +
                                      +
                                      -get_predictions(states: List[Dict[str, numpy.ndarray]], prediction_type: rl_coach.core_types.PredictionType)
                                      +get_predictions(states: List[Dict[str, numpy.ndarray]], prediction_type: rl_coach.core_types.PredictionType)

Get a prediction from the agent with regard to the requested prediction_type. If the agent cannot produce this prediction_type, or if there is more than one possible way to do so, raise a ValueException.

                                      @@ -282,7 +289,7 @@ raise a ValueException.

                                      -get_state_embedding(state: dict) → numpy.ndarray
                                      +get_state_embedding(state: dict) → numpy.ndarray

                                      Given a state, get the corresponding state embedding from the main network

                                      Parameters
                                      @@ -296,7 +303,7 @@ raise a ValueException.

                                      -handle_episode_ended() → None
                                      +handle_episode_ended() → None

                                      Make any changes needed when each episode is ended. This includes incrementing counters, updating full episode dependent values, updating logs, etc. This function is called right after each episode is ended.

                                      @@ -309,7 +316,7 @@ This function is called right after each episode is ended.

                                      -improve_reward_model(epochs: int)
                                      +improve_reward_model(epochs: int)

                                      Train a reward model to be used by the doubly-robust estimator

                                      Parameters
                                      @@ -323,7 +330,7 @@ This function is called right after each episode is ended.

                                      -init_environment_dependent_modules()
                                      +init_environment_dependent_modules()

                                      Initialize any modules that depend on knowing information about the environment such as the action space or the observation space

                                      @@ -333,9 +340,20 @@ the observation space

                                      +
                                      +
                                      +initialize_session_dependent_components()
                                      +

                                      Initialize components which require a session as part of their initialization.

                                      +
                                      +
                                      Returns
                                      +

                                      None

                                      +
                                      +
                                      +
                                      +
                                      -learn_from_batch(batch)[source]
                                      +learn_from_batch(batch)[source]

                                      Given a batch of transitions, calculates their target values and updates the network.

                                      Parameters
                                      @@ -347,9 +365,20 @@ the observation space

                                      +
                                      +
                                      +load_memory_from_file()
                                      +

                                      Load memory transitions from a file.

                                      +
                                      +
                                      Returns
                                      +

                                      None

                                      +
                                      +
                                      +
                                      +
                                      -log_to_screen() → None
                                      +log_to_screen() → None

                                      Write an episode summary line to the terminal

                                      Returns
                                      @@ -360,7 +389,7 @@ the observation space

                                      -observe(env_response: rl_coach.core_types.EnvResponse) → bool
                                      +observe(env_response: rl_coach.core_types.EnvResponse) → bool

                                      Given a response from the environment, distill the observation from it and store it for later use. The response should be a dictionary containing the performed action, the new observation and measurements, the reward, a game over flag and any additional information necessary.

                                      @@ -375,9 +404,9 @@ given observation

                                      -
                                      +
                                      -parent
                                      +property parent

                                      Get the parent class of the agent

                                      Returns
                                      @@ -386,9 +415,9 @@ given observation

                                      -
                                      +
                                      -phase
                                      +property phase

                                      The current running phase of the agent

                                      Returns
                                      @@ -399,7 +428,7 @@ given observation

                                      -post_training_commands() → None
                                      +post_training_commands() → None

                                      A function which allows adding any functionality that is required to run right after the training phase ends.

                                      Returns
                                      @@ -410,7 +439,7 @@ given observation

                                      -prepare_batch_for_inference(states: Union[Dict[str, numpy.ndarray], List[Dict[str, numpy.ndarray]]], network_name: str) → Dict[str, numpy.array]
                                      +prepare_batch_for_inference(states: Union[Dict[str, numpy.ndarray], List[Dict[str, numpy.ndarray]]], network_name: str) → Dict[str, numpy.core.multiarray.array]

Convert curr_state into the input tensors that TensorFlow expects, i.e. if we have several input states, stack all the observations together, the measurements together, etc.

                                      @@ -430,7 +459,7 @@ the observation relevant for the network from the states.
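The stacking behavior that prepare_batch_for_inference describes can be sketched in a few lines of plain NumPy; the function and key names below are invented for the example and are not Coach's internal implementation.

```python
import numpy as np

def stack_states_for_inference(states):
    """Stack one or more state dicts into batched arrays, keyed by input name."""
    if isinstance(states, dict):   # a single state is treated as a batch of one
        states = [states]
    return {key: np.stack([np.asarray(s[key]) for s in states])
            for key in states[0].keys()}

batch = stack_states_for_inference([
    {'observation': np.zeros(4), 'measurements': np.zeros(2)},
    {'observation': np.ones(4), 'measurements': np.ones(2)},
])
print(batch['observation'].shape)  # (2, 4)
```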

                                    • -register_signal(signal_name: str, dump_one_value_per_episode: bool = True, dump_one_value_per_step: bool = False) → rl_coach.utils.Signal
                                      +register_signal(signal_name: str, dump_one_value_per_episode: bool = True, dump_one_value_per_step: bool = False) → rl_coach.utils.Signal

Register a signal such that its statistics will be dumped and will be viewable through the dashboard

                                      Parameters
                                      @@ -448,7 +477,7 @@ the observation relevant for the network from the states.
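A typical usage pattern, based on the calls made by the TD3 agent introduced later in this diff, is to register the signal once in the agent's constructor and feed it samples during training; the subclass below is hypothetical and only sketches that pattern.

```python
from rl_coach.agents.ddpg_agent import DDPGAgent

class MyDDPGAgent(DDPGAgent):  # hypothetical subclass, for illustration only
    def __init__(self, agent_parameters, parent=None):
        super().__init__(agent_parameters, parent)
        # dumped once per episode by default; shows up in the experiment CSVs and dashboard
        self.critic_loss_signal = self.register_signal("critic loss")

    def learn_from_batch(self, batch):
        total_loss, losses, unclipped_grads = super().learn_from_batch(batch)
        self.critic_loss_signal.add_sample(total_loss)  # record the value for this update
        return total_loss, losses, unclipped_grads
```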

                                      -reset_evaluation_state(val: rl_coach.core_types.RunPhase) → None
                                      +reset_evaluation_state(val: rl_coach.core_types.RunPhase) → None

Initialize the accumulators when entering an evaluation phase, and dump the accumulated signals when exiting one. Whether the evaluation phase is being entered or exited is determined from the new phase given by val and from the current phase set in self.phase.

                                      @@ -464,7 +493,7 @@ by val, and by the current phase set in self.phase.

                                      -reset_internal_state() → None
                                      +reset_internal_state() → None

                                      Reset all the episodic parameters. This function is called right before each episode starts.

                                      Returns
                                      @@ -475,7 +504,7 @@ by val, and by the current phase set in self.phase.

                                      -restore_checkpoint(checkpoint_dir: str) → None
                                      +restore_checkpoint(checkpoint_dir: str) → None

Allows agents to restore additional information that was stored when saving checkpoints.

                                      Parameters
                                      @@ -489,7 +518,7 @@ by val, and by the current phase set in self.phase.

                                      -run_off_policy_evaluation()
                                      +run_off_policy_evaluation()

Run the off-policy evaluation estimators to get a prediction of the current policy's performance, based on an evaluation dataset that was collected by one or more other policies. Returns: None

                                      @@ -497,7 +526,7 @@ an evaluation dataset, which was collected by another policy(ies).
                                      -run_pre_network_filter_for_inference(state: Dict[str, numpy.ndarray], update_filter_internal_state: bool = True) → Dict[str, numpy.ndarray]
                                      +run_pre_network_filter_for_inference(state: Dict[str, numpy.ndarray], update_filter_internal_state: bool = True) → Dict[str, numpy.ndarray]

Run the filters that were defined to be applied right before the state is used for inference.

                                      Parameters
                                      @@ -514,7 +543,7 @@ an evaluation dataset, which was collected by another policy(ies).
                                      -save_checkpoint(checkpoint_prefix: str) → None
                                      +save_checkpoint(checkpoint_prefix: str) → None

                                      Allows agents to store additional information when saving checkpoints.

                                      Parameters
                                      @@ -528,7 +557,7 @@ an evaluation dataset, which was collected by another policy(ies).
                                      -set_environment_parameters(spaces: rl_coach.spaces.SpacesDefinition)
                                      +set_environment_parameters(spaces: rl_coach.spaces.SpacesDefinition)

Sets the parameters that are environment dependent. As a side effect, initializes all the components that depend on those values, by calling init_environment_dependent_modules.

                                      @@ -543,7 +572,7 @@ dependent on those values, by calling init_environment_dependent_modules

                                      -set_incoming_directive(action: Union[int, float, numpy.ndarray, List]) → None
                                      +set_incoming_directive(action: Union[int, float, numpy.ndarray, List]) → None

Allows setting a directive for the agent to follow. This is useful in hierarchical structures, where the agent is controlled by another master agent. In such cases, the master agent can define the goals for the slave agent, define its observation, possible actions, etc. The directive type is defined by the agent @@ -560,7 +589,7 @@ in-action-space.

                                      -set_session(sess) → None
                                      +set_session(sess) → None

                                      Set the deep learning framework session for all the agents in the composite agent

                                      Returns
                                      @@ -571,7 +600,7 @@ in-action-space.

                                      -setup_logger() → None
                                      +setup_logger() → None

                                      Setup the logger for the agent

                                      Returns
                                      @@ -582,7 +611,7 @@ in-action-space.

                                      -sync() → None
                                      +sync() → None

                                      Sync the global network parameters to local networks

                                      Returns
                                      @@ -593,7 +622,7 @@ in-action-space.

                                      -train() → float
                                      +train() → float

                                      Check if a training phase should be done as configured by num_consecutive_playing_steps. If it should, then do several training steps as configured by num_consecutive_training_steps. A single training iteration: Sample a batch, train on it and update target networks.

                                      @@ -606,7 +635,7 @@ A single training iteration: Sample a batch, train on it and update target netwo
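The play/train cadence described above can be summarized, stripped of Coach's step-type and phase handling, as the following framework-free sketch; all names are illustrative.

```python
def maybe_train(steps_since_last_training, num_consecutive_playing_steps,
                num_consecutive_training_steps, sample_batch, train_on_batch,
                update_target_networks):
    """Return the mean loss if a training phase ran, otherwise None (sketch only)."""
    if steps_since_last_training < num_consecutive_playing_steps:
        return None                       # keep acting; not time to train yet
    losses = []
    for _ in range(num_consecutive_training_steps):
        batch = sample_batch()            # sample a batch from the replay buffer
        losses.append(train_on_batch(batch))
        update_target_networks()          # e.g. a soft update with some rate tau
    return sum(losses) / len(losses)
```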
                                      -update_log() → None
                                      +update_log() → None

Updates the episodic log file with all the signal values from the most recent episode. Additional signals for logging can be added by creating a new signal using self.register_signal, and then updating it with some internal agent values.

                                      @@ -619,7 +648,7 @@ and then updating it with some internal agent values.

                                      -update_step_in_episode_log() → None
                                      +update_step_in_episode_log() → None

                                      Updates the in-episode log file with all the signal values from the most recent step.

                                      Returns
                                      @@ -630,7 +659,7 @@ and then updating it with some internal agent values.

                                      -update_transition_before_adding_to_replay_buffer(transition: rl_coach.core_types.Transition) → rl_coach.core_types.Transition
                                      +update_transition_before_adding_to_replay_buffer(transition: rl_coach.core_types.Transition) → rl_coach.core_types.Transition

                                      Allows agents to update the transition just before adding it to the replay buffer. Can be useful for agents that want to tweak the reward, termination signal, etc.
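One common tweak, and the one the TD3 agent added later in this diff applies, is to clear the termination flag when an episode ended only because of the environment's time limit, so that the value target keeps bootstrapping from the next state. A standalone sketch of that idea, with a minimal stand-in for Coach's Transition type:

```python
from dataclasses import dataclass

@dataclass
class Transition:  # minimal stand-in for rl_coach.core_types.Transition
    reward: float
    game_over: bool

def update_transition_before_adding_to_replay_buffer(transition, current_step, max_episode_steps):
    # If the episode ended only because the time limit was hit, do not treat it as a
    # true terminal state, so the value target still bootstraps from the next state.
    if transition.game_over and current_step == max_episode_steps:
        transition.game_over = False
    return transition
```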

                                      diff --git a/docs_raw/source/_static/img/algorithms.png b/docs_raw/source/_static/img/algorithms.png index b3310c0..6c00f21 100644 Binary files a/docs_raw/source/_static/img/algorithms.png and b/docs_raw/source/_static/img/algorithms.png differ diff --git a/docs_raw/source/_static/img/design_imgs/td3.png b/docs_raw/source/_static/img/design_imgs/td3.png new file mode 100644 index 0000000..fc28eb4 Binary files /dev/null and b/docs_raw/source/_static/img/design_imgs/td3.png differ diff --git a/docs_raw/source/algorithms.xml b/docs_raw/source/algorithms.xml new file mode 100644 index 0000000..e6f68c9 --- /dev/null +++ b/docs_raw/source/algorithms.xml @@ -0,0 +1 @@ +7V1bk5s2FP41O9M+JIMQ18e9NduZbCdp0jR5ysgg26SAXMC73v76CnMVEphdkLGzbB5sDhJgne/7dCQdkQt4HezeRWizvicu9i9Uxd1dwJsLVQVAt+hHannKLIZuZ4ZV5Ll5ocrwyfsP50Ylt249F8dMwYQQP/E2rNEhYYidhLGhKCKPbLEl8dm7btAKc4ZPDvJ569+em6wzq6Urlf0Oe6t1cWeg5GcCVBTODfEaueSxZoK3F/A6IiTJvgW7a+ynjVe0S1bvt5az5YNFOEz6VPh+fW+gL8GXH+p7HP5156+j3dc3mp4/XPJU/GLs0gbID0mUrMmKhMi/raxXEdmGLk4vq9Cjqsx7QjbUCKjxB06Sp9ybaJsQalongZ+fxTsv+ZpWf6vnR99qZ252+ZX3B0/FQZhET7VK6eG3+rmq2v6oqBcnEfmn9F16j+wXpz+ztSVzU0y2kYM7mi8ttYckilY46SgItdLjlCqYBJg+I60YYR8l3gP7JCjH7Kosl1e9jCL0VCuwIV6YxLUrf0gNtEBOP6jl2MvJpxsNiDTLm1ZXefole4LiqPZTKtMeds+AYP6jH5C/zZuBg2QFuNR/j2svwZ82aO+ZRyo7LLiWnu9fE59E+7pwqaf/SiDUzhj7v7QGCZOaPftLaxSMTXGEIifHs8qhSi1R9YCjBO+6ccWjoKigs+4ChXQ8Vsqj5i5R1jXVgYXoiJBT894LnKNwzpj14Rn60FceBqrDIB+rHAHJIsbRA701CYdxkWEQT0DHwoulkE0NErsIW0uHYSEYiXRU9N7qDO0MnnW2zpNOlcY5wDX6zLn+nIM9OTe0Rx7kYyjo9AyfPu7Vgn5ZJfsmyQxp78Sgwfh3S4oTb+K9Py9pAaBvdtXJ4iq/h5ttWv02WGDXxVFxVfrU2YXZm1Fz7QHkUd9SF3Df9x6mvo4tV2Oob4zEfMVqMB9AnvpAhzz3LVnc146Fi3vPdX38SBtrRkQNETaDBxoBC/Cg8njQZOFBHzk4PgMfADYILrnG+EAQBJfkHd0JhgRSQhEpb24+vKMnL52EtrOq3GHkDmEndULScD/j8ZCEuOHa3IR8bxXSQ4c6jfYZ8Cp1qecg/zI/EezlowVvbCzCQG4EgABDacg2FNBUEyBEXsSm/ewR24jxmdkzPrOmjM/MVsr3ZbgtInirbiyiUgNwGGMxvZtF/9gG9EZkm9AQL87uGm6D78hJB24xX/6AltQLyutjXNNeKPwgQNDHLJfYcCQN+lRWQVRL0NGropkWXZaEzBOxQ0TF6ikq9pSiYo0fRwhV5jMK754VN7w+/utqM4aA8C2vAYZo3gfKkgB7loABEmD3lAAwaWBhcxrwlfe673ubuC26rvkOxZtszXPp7VIMXLE8HZ2IYzDPZId30OSHd1C4xCGLdXBm3QDWaT1ZZ05JOmBO6WNQ83Dl70M+ZjxcOXwKHwPjHKIr0DlNcziUskSh1GXbWOpPFK56DtM47L2aeRlz4nkZYxjvwSvXdtB3VJXNSh87u6UctOdw06wD2S2ws7yk7BZ+0JfNz1BbgNFrXF/XtIkX2IsLzylHKrDYcFyzBSlHotUWaEjzjtraj/deXVFEfblDQgclrRMjLx2EncGYy26k9QHeyaZglVsaAeHYOX9nsKxpaKwPBLGRcGkZyPOCiGg/uRcaCbGmIMMSiDIsAZCW8WHNMeqQbK++6V5Z8DdZvld7wtcJLSjOC4Rt05S6LVgcMARhkaz1QW0eyg6Sib7zlNCYVCba8/9mmThBmYAaO7SdXCaKCHKWiZfJhN5TJrRJp7qhKC10lomTlYlmNGEIZsCOKxPDtpi8epnouyCmK2JcHEkm2lfEZpk4QZloRhOTy4Q2RxPH2R0+dP/ni9bPnr07XIFd5eWsn2nGDMH5BQWTvqBAH7a7YxoIqqeDQa1vZqY+iQyqjSVQ07QPyGBneUkYPJtVgiNAaShCDkLAMhurPRl081qNoGqMTo7PCp4kUBcG5WXlkN9uBOZwvmux0WSFQrC7/Lij/mF7jKbpyMzT6ciKPJgT7chgMatTdkwHginV7CovqSMT5V7JkLrxZiAGPVeXaM4SCUEzOXB6kTybASevdSewu1vvm49hqFJCOQ2wotY3lHu22AJ2MsTSDoitAbrKSxJbCa8ZEqrcvRceELXXvuHUMliZsy3BdtNipuEo20314YMOAERg+DjOZYqrxNsSOPWes2amnyhIMREu4vQj+0UH8diZ/vvTpeFr7MyZJdh3KdztLG1vjioJfoEX/jIBBGkrKB8FFVRxhV9nfLKRoAobkaAIoeZREdqeZVG68fMNlWfly0X6EM1OrfH6pkWrL/n9gCM0KLAbgXUxyVTfjCdI+i/W/MZvzh6v0RK+le7YDWexEaQhyBHXRB21tA0xZ7P+cJIjkhzPR9rEKNh12Nhh1Xx35EgDkuZuyOJF9G0DEq3xXM8tf+TdlvSwelV+Vrz6Dwfg7f8= \ No newline at end of file diff --git a/docs_raw/source/components/agents/index.rst b/docs_raw/source/components/agents/index.rst index 476bc7a..ca21713 100644 --- a/docs_raw/source/components/agents/index.rst +++ b/docs_raw/source/components/agents/index.rst @@ -21,6 +21,7 @@ A detailed description of those algorithms can be found by navigating to each of 
imitation/cil policy_optimization/cppo policy_optimization/ddpg + policy_optimization/td3 policy_optimization/sac other/dfp value_optimization/double_dqn diff --git a/docs_raw/source/components/agents/policy_optimization/td3.rst b/docs_raw/source/components/agents/policy_optimization/td3.rst new file mode 100644 index 0000000..f7b9a78 --- /dev/null +++ b/docs_raw/source/components/agents/policy_optimization/td3.rst @@ -0,0 +1,55 @@ +Twin Delayed Deep Deterministic Policy Gradient +================================== + +**Actions space:** Continuous + +**References:** `Addressing Function Approximation Error in Actor-Critic Methods `_ + +Network Structure +----------------- + +.. image:: /_static/img/design_imgs/td3.png + :align: center + +Algorithm Description +--------------------- +Choosing an action +++++++++++++++++++ + +Pass the current states through the actor network, and get an action mean vector :math:`\mu`. +While in training phase, use a continuous exploration policy, such as a small zero-meaned gaussian noise, +to add exploration noise to the action. When testing, use the mean vector :math:`\mu` as-is. + +Training the network +++++++++++++++++++++ + +Start by sampling a batch of transitions from the experience replay. + +* To train the two **critic networks**, use the following targets: + + :math:`y_t=r(s_t,a_t )+\gamma \cdot \min_{i=1,2} Q_{i}(s_{t+1},\mu(s_{t+1} )+[\mathcal{N}(0,\,\sigma^{2})]^{MAX\_NOISE}_{MIN\_NOISE})` + + First run the actor target network, using the next states as the inputs, and get :math:`\mu (s_{t+1} )`. Then, add a + clipped gaussian noise to these actions, and clip the resulting actions to the actions space. + Next, run the critic target networks using the next states and :math:`\mu (s_{t+1} )+[\mathcal{N}(0,\,\sigma^{2})]^{MAX\_NOISE}_{MIN\_NOISE}`, + and use the minimum between the two critic networks predictions in order to calculate :math:`y_t` according to the + equation above. To train the networks, use the current states and actions as the inputs, and :math:`y_t` + as the targets. + +* To train the **actor network**, use the following equation: + + :math:`\nabla_{\theta^\mu } J \approx E_{s_t \tilde{} \rho^\beta } [\nabla_a Q_{1}(s,a)|_{s=s_t,a=\mu (s_t ) } \cdot \nabla_{\theta^\mu} \mu(s)|_{s=s_t} ]` + + Use the actor's online network to get the action mean values using the current states as the inputs. + Then, use the first critic's online network in order to get the gradients of the critic output with respect to the + action mean values :math:`\nabla _a Q_{1}(s,a)|_{s=s_t,a=\mu(s_t ) }`. + Using the chain rule, calculate the gradients of the actor's output, with respect to the actor weights, + given :math:`\nabla_a Q(s,a)`. Finally, apply those gradients to the actor network. + + The actor's training is done at a slower frequency than the critic's training, in order to allow the critic to better fit the + current policy, before exercising the critic in order to train the actor. + Following the same, delayed, actor's training cadence, do a soft update of the critic and actor target networks' weights + from the online networks. + + +.. 
autoclass:: rl_coach.agents.td3_agent.TD3AlgorithmParameters \ No newline at end of file diff --git a/docs_raw/source/diagrams.xml b/docs_raw/source/diagrams.xml index b923c63..15f067f 100644 --- a/docs_raw/source/diagrams.xml +++ b/docs_raw/source/diagrams.xml @@ -1 +1 @@ -7V1bd6M6sv41eUwWQlwfc+l073O6e7J3nzln5uksbBOH2Rg8GCed/etHwgiDSgZsSxjSyszqbcvmYr6qUtWnqtIVvl/9/JwF65dv6SKMr0xj8fMKP1yZJkK2Rf5DR953Iy4qB5ZZtCi/tB/4Ef0VloNGObqNFuGm8cU8TeM8WjcH52mShPO8MRZkWfrW/NpzGjevug6WIRj4MQ9iOPp/0SJ/2Y16trEf/xJGyxd2ZWSUn8yC+Z/LLN0m5fWuTPxc/O0+XgXsXOX3Ny/BIn2rDeFPV/g+S9N892r18z6M6bNlj2133OOBT6v7zsIk73UAxrtDXoN4G7J7Lu4sf2dPo/g9IT0CXeG7t5coD3+sgzn99I3gT8Ze8lVcfryMgw19+gZ5PU9X0bx8vcmz9M/wPo3TrDgrduZeOHuuPmHPGZOR5yiOa99cBKH3PKfjaZKXwmIa5fva94zij4wHcbRMyFgcPuf0bTYvj3LIO/iIyqf2GmZ5+LM2VD6yz2G6CvPsnXyFfWp5JX7vTG59bzfwthcXiwnFS01UsFM+36AU0WV19j1M5EWJlBg1x1MI2iLYvFTHtSL4yaH/64NgqQPkQlmwiMI9akmahDKAdeUAS/Bp4GoajgFxtQS4Osw2nIOrb2hcc0lIuk0NpTYaIIlFSCJLBpIIIBmQeWoYy+q7D4brSsYPmS34yQDMt25sDjLHv/Hw/s8GAFYaUwdQCn5mtyaGyeKW+hrk3ZxCQwGp41XN7fQJxcEsjO8q76D2EO+K/4nR6vn4yVPP3v9Br0QeYfn2nwfnXcN9vL1rAy0PsmWYNwU5XDQ8JghjDRVbAAoby8I4yKPXpp8lQqq8wlMakburpKSyvUxGKpeRnWOTbrN5WB5W93zAmWxe3rDBnWv3IMC5COrBe+1ra/qFzTE3jY3We7N998wDkN1+ALgl7gDyYvcr99pSYd1LgdweCqSnsp5TGeeU4L5TmckL9CmmEKl0NocHr9WQNgRLethgNnWuCidrMDqOKGbwZcxoGkUlKCK/J4oWP1OdpIzOLw+jpFAP8zOgQBsxFhlVGf4lnB23m5AMJGH+lmZ/FnwX/YVZuIho3GCQ6CFKE/JiEREAotl293as8cSAgQN2mkCKZkdlcQJkz/5e4Ji/0H/Dn2vyk4MSuXUaR/N3Bu0mWK1j+qXnLF1VR5SAEwg10t1IuwMibQGk51kY5BS0IKkraJQ8U3zT2b/CneYmFMydekd5BX+eZntBqQ7efTlIivBisYjocBDvzxrM0m0ODtOyAWTDwgPKBpyUK+0OaMxLUC10vIJ5m0TPabaK33n9r76xKQAz7+lYnCbL4peQH0I+IfdHrUP4HGzjxtRQykhxGxstFTVwL2EwYNi0NxgUbyIVm4jJwyYqIN4JAXn2OTMRi+IFszOL6jtJ+EYnmOSVyle4WacJNTBjhXwE9KLRJHt8a0BRYB5JBxtMfkrOUYiNZ8+eWu0hl0OMdJiTBxSS8Tv6YKJ5EN+WH6yixYJeRigEezEx+hItfdmQ4nvlLxQsTp6r0KbA1xMTIRJA1JRwY7YbCSOM7aZIIIfDujchbHAn4uN1SWwwf8Nd3C7yAE3Nie15TC3qkQHwS8i1+THlGvmc/PBs4oXk2jYUyzWM2LJtEYe/BEUwVoxRPyxKCu/LIT47kZVktllXIjP52blVmWTPzphfdDdcMD0jIacmY362e+H9Qnzw7VqjLcGlRj3QNlWh7QK05y9pummhRc6HtFzT+6jONc+hmIJwufKKpAMKF6jS2SbMXmFYO03dvGSc5AmQ9BQByc4xEX/yZ5TX3Eny7p/lPUj2NHfu2lU9gqo7nywPdyTeJx90WK59mvfJe4UOL2KSvE/+hm3U7n3y98V9/2zvs1JZrQWHtABNQA242Mni2cBT1cDFatSAD6ourgaQNOOWKz7E3D5slGX7XOaiI6BBmcGWPr1ruojBMCpTxWuyd2piLOZqW/yzs2IbonWk/YAkTsealzYgPQwIJyomNB+ipTAp5gOyNM1UiN3CJlvyjJLqg/k2K37z/sPXIIuCWazXPY9Y90SGAcFWtvBpQpYmWCwqSImByKKETg9Uod+CrPgorT6vr5FriPuyrvtq1DrEomVRKRD3SA//sP5APZxhlq2xpOSO2kdAxunxjHXj1P5cq+kzDLR2igyvo87Fbz/g7AAHf5A4X5Lwj1bQ3ZMFnTvTUMunnaLNL78cfwBoDtD5203JygNjSTLNUucb5EPyWbXNjLl0m6+LvFh2+Fi9hQt6fvaQGW8Yxm2/olkcGZ1pI27WbgqILBvpnE1unsEYYBhgcjaFJxA4WxIl0zAllw88eAvjeUNaGEECQI0hCNdXJXsQrqNNuigSQFjSx3JXmcvCJM0eHcce8YyC7QDYkSr6GcNws8kfNfLqa/TRKlyl5aW0OvfkEQTNb5TxCLhHgfKv4DF4o/IYLF4mzFPTUAcLpEDIf/Ewh6lHTbQXhXY/6vlHwvwjqtx2BSohY/qxLtYubzT194raKIgKNsV9FKRU88L4lIsS9n7EBoQI66xZq69DhaOXoWx8sd5eFowQiyKAD2KML5lniiyrC1ZVaadWj84oI/If1eWl2OaoPEjgXVmGUETOTVNBlt963iE5KEtQ+50mVKsOTC/kmkV3hyvaQTif0zJ/xnTrOaX/nCKWgCHmFKZ0ohYQcVSU86fPAPZG6wdNUBwNuOVeDnBIPw4E3WgCAUmNuPjMfeQIaCdhJy4ZbfFsSDvFYZAle+0sDbJ03/CDF5NV7HoFq2B1AJmKaGIHUi7b9WKXNlg6S7UATpvc3ibXvVxPZmdSqTbnloiVD2wsXrzlN+Xg9PRy/kw+LxtDeupsN4ZpyNRlihVLeEYdYxKj5DgeshzbwB7vUHhISsTJN4jZbw6hOhXMs9pXMGzn3AOsjn7hMIqX3C/cgaRcnKZ0Ef85paRssl1d1Zt1FEv8NIbKo5Wg2d00qbtB11GADDgCBl5VGYgjCp+cmGK2iF4bWDr/3tLdkO4opNclOrc0rKb+c/UpeVVmdezOslnTrpy7sTglKJTj5MbqH9WGi+uyUS1Ox4oT3zwCeaLG2IrEyYXuvt5upRUu03fB9heXc+1d6NqLl2Q0gm0IWublEFS5MP6L7/dhCyZmdVtXuXBpXNvSYzXR9i6nidCTDunrgvz6EH7NsDymANzO3AVVi9wuXOQu50nWOV63FT4NZC60x4KSeGWdC10XYEaJFfYDSzCaBrf+1GrI7hkit04RcSkHLuSIjuawgixnrFh5g8XYY0R/
ZnlBAW9GBsuv0Nv9V5jn7yXgwTZPyVCa5S/pku568JXG/3WBPUCZtUrKmeQXm4rr5Jfbt6inN6vVu6U8DHOg5IyHrVSX48J8lJHwj5bFeWygAqr/roQeP9u4ijqmwZvu4AAdnv88+gBWYXb41/Oeb/OAs1lGb1rNiy5B97sidRvXYpRlc2JSbcd9tLrxZ3IUNS4Aks2qFHrrztEH4A7tBLeEJVP6HnRWo9U6S3VH16tTXFR+6wujk65TFYV4H6RUS6UVFfmN5risKC9Qzqn13y7f+R+cSZnL0mUUwW889gCzq8EMf0um5AYzvshlUbVSVjXHv+ha2UfPjBPsdD8k3eD36M0xodTVY9j5ffKq12bJj0heNfg6NgPDanhLtAuCxXqknIUlJHvJE1vTAoJVkARLQVXa+HytMSjlvhVaa1sD0XwsRScvtq+zM/fC2XMfnVwEofc8l6F8SNLCC8GIAw0JOiFjUdYB4hmak1CDdT5x+BrGSrVP7uw4Dt0DnScEK57qdK9HOKNG9zxzhp1e8+HCDr2FdbruyUGJy6tEgox+T5DQz1p7nLeFpij7W5UfTFSXOvkfyQ0eh6LzvTkQ4xIGSBIjU7xW9D4oIYjSkIoOXVqt6FNTdFA3NKyiX8ybnpaiY7DlrQtRUqfoQ+aAa0VXo+gYAUX3BlT0Hs3nddjcHTabopJrZWEzcQMBbDpulhE3m6Iaa1Vxc8W16Gm2CybORmIBvaFsmkU6cP4Ams6HZBgP6E8jUyv6SYHzsIo+5EqxVvSBAudhFV0zZL1QAoEzNgcMnJFmyKav6CBwxnjAwBldjCHzHRcHvRQ9RETV3QvP6Dafj48swRIzYnnE8nV9SJIsTF6jLE1WWuNVaDxyMZQlOG0gNpXIV3rNlp2Cm81P9qYFUROzZTI6FCKk2TIp+mfzqXqmI9A+ZWyZqdmynjDxjpHASCrzrU3Nln0ATedzObE5oG/N9ForejtKfK3FwIqu2bLpK7rD7wU0rKJrtqwXSnx9GFF0iJI6Rdds2fQV3XWAorsDKvrFAudpsWWOARgOLMxIUMWWmTBU1mzZNDXeQYB5xYIcUnVsGYbxOqBdjNcofNN49rMNnBNgCfrDIt9VhSaM60GhpEazP5eNEeCyh8UT+t6aFJVBiiKBb66OFMUi51yHUN2kKHIGDKGwaLVYh1AT03SebkPugCEUvth+adNSdECKDqvookBXK/q0FB2QosMquq6m74USIEWRA1FSpuiWaDFaK/q0FB2Qooj1Bx9C0S3RSvkgil490f5tpPo2C9ynDzlKwmZ+02rXgEEzFkZb/F5jp2FmAsw+1VnMQeAbfgNbxagCPXRZW64GFSKw5XLKKS24lH27FOGpW38dSMUGVLMrSPJ1BasWciwpJEGiZL2lXSYJFLmAypqsA1XJjxRr6nOo+aLUbNEWwzL2WUEWDGnTba5x69Y3H5DHAyPXYz130AbGrRgcalMM7ffR+yhUpmckPYkJ5F5DMOAeq32bEiOwp4vRs7f7CU16kQWj3nS2CbNX8jTShAU5s0wYWVHgrzeFJNDACrnrn7vIiIutsvAtyBaSTna0c0CE67H4a/HTBnYTkNuiNk2FM5Ek42ViXkIFLrxoczYproItitvHa7kOtl4/s716NfeOxW5ZtiXLbnWfSqLdsmEQH8xPNFnaxEgyMVVn6MuYGBtaFLm7kXXuvTDhzch6yMvpVs82RS5cX1MofTuy6n4agSvRSvK86Kl3HIQRp0vytIcJhc4rNGwFT1EI2/Qm9ssCdUKQ9V9pUEdylB0rUPaWnQe1rvf2cAQ7yFSDl9D1HsUOI/I89yJngqD5sKSeDZotAI1ZybH4qtg3wP4jp+5XKDqZwVsmmf5qD+aGSNmavoxWAX3glYP4lQrcU7qJCv8WP8zSPE9XrZJYeZ28k5lTC3EXbNbhnKLzHP2k5vGuuOQtGzXYCHn9kufrTeEsP5L/L6P8ZTu7IZaUvPlOaYIk+PG+ycPVhgzM02D+Qv6bBW/0gQSbgkp8jFZL8m8QL9OMHL7a3KyTpZxpyOP6CO43kW6sRQvoODl8nO0rnoU+lMeJuFmobeXrXIPGKuQbBs1Dsmeh4tBjdwMD7S9N13FarZXHLRjwB1ydu1kXYqG6luRpSLIxUUn2cfsBEiQZaUmekiT3dTJHJ8mecpssCFS0JI9XkqXHuANJMjJYkrs6UVZN12hRlirKfVfX5dM1jqUlZUqSYk3V6FW7AKgzeioWpLQoKxNlZ7KibCmfvx0tylMSZelLq4OJsuupFmVXi/KURNmfqiibHQ5GjyM8s+sIVhhz4AgJ6uJpdZmSuniTVRdLOZ8Gc53/t3hpGn9b59Eq+qtMeubkfYzFD62SJX8XJ4NHE8GCTSTK7JNSJ+bqxamR2iDXEtggZyRTtsVPp04nEcpbLUe2DXL16tSURNm9XI6qq1d/piQp/khW5I83ejbLYlJn9PTqz5RE2Z3s/G07xx7hsp0D1Qk/zD8WNznQYQa2ePjMQcMMvV4zVjPlimbckWRzAjPVSXVg3kxJpzpcvV4zKVGWnm8xlChbLKBVJ8qwW8lTGkfzd03bHT2fDkzb6aWDsRohNR4+NA+O7d34NtEaY/cvn6fg2TeeTXxwc/ev27zAgfqro80acPLdkuc7aNYcPibmjjjfrLEM/ppZe/j9+0BG7Mxq4yGNmG/wy0IetGGiqkMpJoxVDNVQ+v2Paw2UqBaPd2o9wb7grjKgTADUXZrm5LEF6zWBpdYVZNfjo8CQH9SoAvXDdjeqyBDBasmAFXaIfNDaJ1yiRRZfzzyspYQ017dv9xooCBSoGyJOEKxaVgcUbNr5/ZMGSgCU5XSbPnUwwa1Lvl//yMM1nLV+v/4aBlkSJUs9o/XAFRlmN66+CFdbBq4COuP2qwauD3A2tJwAOEvQ7AG3BK79gYMdPL7fPmrgegBX9XdrA06dJYVpf/dBHtJuJHPa70tHBieqo8+xOYPaUV/Anzw+aeB6AGfzbRpFtLAydfQhpVLx+Z/L7RI0jH2sqs+XyrAYrNGLXLRlsRQcTYCJZvdHwe77omZu/kiqG8GCYWeNC2aZkMpqXHxdcz4eUZ6egFqA+W4T0D5XRBx32r8HNyTh/eapJPY09CGTdDvPCdKmcZ9FOe2dq6fxzmncwpx5E03jbCld/iwOaaaHh6fPGrkeyGG+7J4EQEMyuT6kku7jaF0sdxlPT3/TIPYB0eVBdLEAROYiyAcR0koaur6WE65NDqh/Jlv11G7jCNzGegRkGoIIyDQuVstlGrrqb0qSwrJgLx6KoCbDg42ymORwrGw6rUecHSubbPFMi/IkRNkYC+1ztCibCKkWZV2WOClRHklrk+NF2e44QoIoawZzUqI8mgrbY0XZ4WIuBaIMWcXPJJ59+RYkwXIiWxMPzUPxKGLBQqCqMp/KxainXgdUWU3jj68aLpg34dhcpi5GBgRMmDkhBS9IG37RQIn0ii1xt8HENsmSDxMkBr9t4zy6DnT3ADEd6PIZ8ELAWAKmfMBgntmPMH6+XsfBu4YLZiFx25wPawQRTB/
7lGzC1Yz8QI0VSDVidTitiiXKvJWCFcwY+3J7r5dI+kxhJvQ1WHu+IZZI2AawNeget7p8S+Rr8Bo2KEywzI62VkiTa5bIoQEDi48cYKYpmL5U+RosQajuHH798UXjBBXLdrpxYjttyccJkhnfbncJNhqpHv67wAiq0ynIY/z9/o4ORMEVrWCBdXfMZdxoZ6QbYN8y+Qplz4f4itxIGZv1mkjixg+/IhN+eGHg3IR11ryhuQg/9MplS1tom8uq9Q3uMn0TdH1+I1+PFRnLz881EWSQHrZhXNQJG7s2Ck6wopYnmW3WBVq8FXuLiJiYxtOnP7SF6zGFGSAjTWDgfGUGzpdn4PosAlp+w/hdGzcGfTof2ALSn/oUZhHBJswOCuyRxg+LjN9I1roNvoOcV3ZEOmwrmTE/cERXMYTh2jfINyzXKf9tns217Zvap+bJZrjtIp7VdhGJFroyMXz2vaEJrr7p21wRpM9O0sWbcKieZHFNUXuiAr9y4vyiJ85+MBoOHxuIgHREFLMUICW2ftaxgczYgBUoNabHsdQKgtJUH3U0lrQM3HpE1/SI+caUnntiKIKwZ96QoBwh17FcA5zXcm/2H3rMS1ExCUpsE6t1T73ujWSb++N1D2O/9YhO3aMKw2ufd3NyrS4CpgCz8ytQMwx9zbvwJXiN0ox2wTHu41S3EOvprjiIkyRRq0VlrTeqBjraYg6W09swgSy9Unn7bNi4mqcLe1sbx2nvxO2PtBO36/BdbmR34mbN0GuG8bdVlOtNBQ7YPqtH9yh12cZM9+pF77/p/ol9kHP5fgWiWYttKC0fOJh2/EcQJTMyF2jwusFDfLcv30CiPgWK+u2ZWG/mMR5vpLmA4ItcFOnF7qevnrr+jbP/422Q7dwg8OnR7g2yXcFpaqusLReRF2Td/fXwPX387d/P//PydPc12nxK//Ff1yYLO7W7LltBit//8LO6WfruvXx3UHk4leD75Llq9eQQGcDvPN3ddcy0Wo/o7jrGX/H0rmMWTx6y2WIoFbNgOv/tvV576eVZWLADkqDMSZE/fwBO3S9leFtKHs3elNI371d1u3qkle3sSNquyJdml4/vAmmDbQWOtMcef8WT7bGNwc4VyrpA/vz/u5d0nv8Zz/57M/+x9D7/+GN2XUUL9Vq4XyajAdhegYIcNsc2umhXMzGeTC2VGmRtdHmje2B26jbIwMftIYDjs6gg+ezsvrrcKnZ/iwpuRZqHS95maZrXv160u0gXIf3GfwA= \ No newline at end of file +7V1bd6M6sv41eUwWQlwfc+l07zPdPdm7Z87MnJeziE0cZmPwYJxL//qRMMKgkgHbEoa0MrN627K5mK+qVPWpqnSBb5dvn7Ng9fwtnYfxhWnM3y7w3YVpIux75D905H074tj+dmCRRfPyS7uBH9HPsBw0ytFNNA/XjS/maRrn0ao5OEuTJJzljbEgy9LX5tee0rh51VWwCMHAj1kQw9F/RPP8eTvq2cZu/EsYLZ7ZlZFRfvIYzP5cZOkmKa93YeKn4m/78TJg5yq/v34O5ulrbQh/usC3WZrm21fLt9swps+WPbbtcfd7Pq3uOwuTvNcBGG8PeQniTcjuubiz/J09jeL3hPQIdIFvXp+jPPyxCmb001eCPxl7zpdx+fEiDtb06Rvk9SxdRrPy9TrP0j/D2zROs+Ks2Jl54eNT9Ql7zpiMPEVxXPvmPAi9pxkdT5O8FBbTKN/XvmcUf2Q8iKNFQsbi8Cmnb7NZeZRD3sFHVD61lzDLw7faUPnIPofpMsyzd/IV9qnllfi9M7llEv+6ExeLCcVzTVSwUz7foBTRRXX2HUzkRYmUGDXHUwjaPFg/V8e1IvjJof/rg2CpA+RCWTCPwh1qSZqEMoB15QBL8GngahqOAXG1BLg6zDacgqtvaFxzSUi6TQ2lNhogiUVIIksGkgggGZB5ahjL6rt3hutKxg+ZLfjJAMy3rmwOMse/8vDuzwYAVhpTB1AKfma3JobJ/Jr6GuTdjEJDAanjVc3t9AnFwWMY31TeQe0h3hT/E6PV8/GTp569/5NeiTzC8u2/9s67hnt/fdMGWh5kizBvCnI4b3hMEMYaKrYAFDaWhXGQRy9NP0uEVHmFhzQid1dJSWV7mYxULiM7xzrdZLOwPKzu+YAz2by8YYM71/ZBgHMR1IP32tdW9AvrQ24aG633ZvvuiQcgu/0AcEvcAeTF9lfutKXCupcCuT0USE9lPacyzinBfacykxfoY0whUulsDg9eqyFtCJb0sMFs6lwVTtZgdBxRzODLmNE0ikpQRH5PFC1+pjpKGZ1fHkZJoR7mZ0CBNmIsMqoy/Es4O27WIRlIwvw1zf4s+C76C7NwHtG4wSDRQ5Qm5MU8IgBEj5vt27HGEwMGDthpAimaHZXFCZA9+3uBY/5M/w3fVuQnByVyqzSOZu8M2nWwXMX0S09ZuqyOKAEnEGqku5F2B0TaAkjPsjDIKWhBUlfQKHmi+KaP/w63mptQMLfqHeUV/Hma7QSlOnj75SApwov5PKLDQbw7a/CYbnJwmJYNIBsWHlA24KRcaXdAY16CaqHjFcybJHpKs2X8zut/9Y11AZh5S8fiNFkUv4T8EPIJuT9qHcKnYBM3poZSRorbWGupqIF7DoMBw6adwaB4E6lYR0we1lEB8VYIyLPPmYmYFy+YnZlX30nCVzrBJC9UvsL1Kk2ogRkr5COgF40m2eNbA4oC80g62GDyU3KOQmw8e/bUag+5HGKkw4w8oJCM39AHE82C+Lr8YBnN5/QyQiHYiYnRl2jpy4YU3yt/oWBx8lSFNgW+npgIkQCipoQbs91IGGFsN0UCORzWvQlhgzsRH69LYoP5G+7idpEHaGpObE9jalGPDIBfQq7NjynXyOfkh2cTzyTXtqFYrmHElm2KOPw5KIKxYoz6YVFSeF8O8dmJrCSP61UlMpOfnVuVSfbsjPlFd8MF0zMScmoy5me7F97PxAffrDTaElxq1ANtUxXaLkB79pym6xZa5HRIyzW9j+pc8xyKKQiXK69IOqBwgSp9XIfZCwxrp6mb54yTPAGSniIg2Tkm4k++RXnNnSTv/lXeg2RPc+uuXdQjqLrzyfJwR+J98kGH5drHeZ+8V+jwIibJ++Rv2Ebt3id/X9z3T/Y+K5XVWrBPC9AE1ICLnSyeDTxWDVysRg34oOrsagBJM2654kPM7cNGWbbPZS46AhqUGWzp07umixgMozJVvCZ7xybGYq62xT85K7YhWgfaD0jidKx5aQPSw4BwomJC8yFaCpNiPiBL00yF2C5ssiXPKKk+mG2y4jfvPnwJsih4jPW65wHrnsgwINjKFj5NyNIE83kFKTEQWZTQ6YEq9GuQFR+l1ef1NXINcV/WdVeNWodYtCwqBeIe6eEf1h+ohzPMsjWWlNxR+wjIOD6esa6c2p9rNX2GgdZOkeF11Ln47QecHODgDxLnSxL+0Qq6e7Sgc2caavm0U7T55ZfDDwDNATp/uylZeWAsSaZZ6nyDfEg+q7aZMZdu8lWRF8sOH6u3
cEbPzx4y4w3DuO1XNIsjozNtxM3aTQGRZSOdk8nNExgDDANMzqbwBAJnS6JkGqbk/IEHb2E8b0gLI0gAqDEE4eqiZA/CVbRO50UCCEv6WGwrc1mYpNmjw9gjnlGwHQA7UkU/YxhuNvmjRl59jT5ahsu0vJRW5548gqD5jTIeAfcoUP4VPAZvVB6DxcuEeWwa6mCBFAj5zx7mMPWoifa80O57Pf9ImH9ElduuQCVkTD/W2drljab+XlEbBVHBpriPgpRqXhifclHCzo9YgxBhlTVr9XWocPAylI3P1tvLghFiUQTwQYzxOfNMkWV1waoq7dTq0RllRP6jurwU2xyVBwm8K8sQisipaSrI8lvPOyQHZQlqv9OEatWe6YVcs+jucEE7COczWubPmG49p/SfU8QSMMScwpRO1AIijopy/vQJwN5o/aAJioMBt9zzAQ7px4GgG00gIKkRF5+5jxwB7STsxCWjLZ4Naac4DLJkp52lQZbuG37wYrKKXa9gFawOIFMRTexAymWzmm/TBktnqRbAaZPb2+S65+vJ7Ewq1ebUErHygY3Fi7f8phwcn17On8nnZWNIT53txjANmTpPsWIJz6hjTGKUHMdDlmMb2OMdCg9JiTj5BjG7zSFUp4J5VvsKhu2ceoDV0S8cRvGS+4U7kJSL05Qu4j+llJRNNsuLerOOYomfxlB5tBQ0u5smdTfoOgqQAUfAwKsqA3FE4ZMTU8zm0UsDS+c/G7ob0g2F9LJE55qG1dR/rj4lr8qsju1Z1ivalXM7FqcEhXKc3Fj9o9pwcV02qsXpUHHim0cgT9QYW5E4udDd19uttMJl+i7Y/uJ8rr0LXXvxkoxGsA1ByzwfgioXxn/x/T5swcSsbusqFy6Na1t6qCba3vk0EXrSIX1dkF8fwq8ZlscUgNuZu6BqkduFi9zlPMk6x+u2wseBzIX2WFASr6xzoesCzCixwn5gCUbT4NafWg3ZHUPk1ikiLuXAhRzRwRxWkOWMFStvsBi7j+jPLC8o4M3IYPkVerv/DvP8vQQ82OQpGUqz/Dld0F0PvtL4vy6weyizVkk5kfxiU3Gd/HL7FvX0ZrV6t5SHYQ6UnPGwlepyXJiPMhL+0bI4jw1UQPXfldDjZxtXUcc0eNMdHKDD858HH8AqzPb/et7zbR5wMsvoTat50TnoflekbuNajLJsTkyq7bgPVjf+TI6ixgVAslmVQm/dOfgA3KGd4JawZErfg85qtFxlqe7oenGMi8pvfWF00nWqohDvg5RqqbSiIr/RHJcV5QXKObb+2+U7/4MzKXNZuowi+I2HHmB2NZjhb8mU3GDGF7ksqlbKqub4Z10r++iZcYKd7oekG/wevTkmlLp6CDu/S1712iz5AcmrBl/HZmBYDW+JdkGwWI+Uk7CEZC95YitaQLAMkmAhqEobn681BqXctUJrbWsgmo+l6OTZ9nV2Zl74+NRHJ+dB6D3NZCgfkrTwQjDiQEOCTshYlHWAeIbmKNRgnU8cvoSxUu2TOzuOQ/dA5wnBiqc63esRzqjRPc98xE6v+XBuh97cOl735KDE5VUiQUa/J0joZ609TttCU5T9rcoPJqpLnfyP5AaPQ9H53hyIcQkDJImRKV4reh+UEERpSEWHLq1W9KkpOqgbGlbRz+ZNT0vRMdjy1oUoqVP0IXPAtaKrUXSMgKJ7Ayp6j+bzOmzuDptNUcm1srCZuIEANh03y4ibTVGNtaq4ueJa9DTbBRNnI7GA3lA2zSIdOH8ATedDMowH9KeRqRX9qMB5WEUfcqVYK/pAgfOwiq4Zsl4ogcAZmwMGzkgzZNNXdBA4Yzxg4IzOxpD5jouDXooeIqLq7plndJvPx0eWYIkZsTxi+bo+JEkWJi9RliZLrfEqNB65GMoSnDYQm0rkK71my47BzeYne9OCqInZMhkdChHSbJkU/bP5VD3TEWifMrbM1GxZT5h4x0hgJJX51qZmyz6ApvO5nNgc0Ldmeq0VvR0lvtZiYEXXbNn0Fd3h9wIaVtE1W9YLJb4+jCg6REmdomu2bPqK7jpA0d0BFf1sgfO02DLHAAwHFmYkqGLLTBgqa7ZsmhrvIMC8YkEOqTq2DMN4HdAuxksUvmo8+9kGzgmwBP1hke+qQhPG9aBQUqPZn8vGCHDZw+IJfW9NisogRZHAN1dHimKRc65DqG5SFDkDhlBYtFqsQ6iJaTpPtyF3wBAKn22/tGkpOiBFh1V0UaCrFX1aig5I0WEVXVfT90IJkKLIgSgpU3RLtBitFX1aig5IUcT6gw+h6JZopXwQRa+eaP82Un2bBe7ShxwlYTO/abVrwKAZC6Mtfq+x4zAzAWaf6izmIPANv4GtYlSBHrqsLVeDChHYcjnllBZcyr5eiPDUrb/2pGIDqtkVJPm6glULOZYUkiBRstrQLpMEilxAZU3WgarkR4o19TnUfFFqtmiLYRn7rCALhrTpJte4deubD8jjgZHrsZ47aAPjVgz2tSmG9vvgfRQq0zOSnsQEcq8hGHCP1b5NiRHY08Xo2dv9iCa9yIJRb/q4DrMX8jTShAU5j5kwsqLAX64LSaCBFXJXb9vIiIutsvA1yOaSTnawc0CE6774a/HTBnYTkNuiNk2FM5Ek42ViXkIFLrxoczYproItitvHa7n2tl4/sb16NfeOxW5ZtiXLbnWfSqLdsmEQH8yONFnaxEgyMVVn6POYGBtaFLm7kXXuvTDhzch6yMvxVs82RS5cX1MofTuy6n4agSvRSvK86Km3HIQRpwvytIcJhU4rNGwFT1EI2/QmdssCdUKQ9V9pUEdylB0rUPaWnQe1rvf2cAQ7yFSD59D1HsUOI/I8dyJngqB5v6SeDJotAI1ZybH4qtg3wP4jx+5XKDqZwVsmmf5qD+aGSNmKvoyWAX3glYP4lQrcQ7qOCv8W3z2meZ4uWyWx8jp5JzOnFuImWK/CGUXnKXqj5vGmuOQ1GzXYCHn9nOerdeEs35P/L6L8efN4RSwpefOd0gRJ8ON9nYfLNRmYpcHsmfw3C17pAwnWBZV4Hy0X5N8gXqQZOXy5vlolCznTkMf1EdxtIt1YixbQcXL4ONtXPAt9KI8TcbNQ28rXqQaNVcg3DJqHZM9CxaGH7gYG2l+aruO0WiuPWzDgD7g4dbMuxEJ1LcnTkGRjopLs4/YDJEgy0pI8JUnu62SOTpI95TZZEKhoSR6vJEuPcQeSZGSwJHd1oqyartGiLFWU+66uy6drHEtLypQkxZqq0at2AVBn9FQsSGlRVibKzmRF2VI+fztalKckytKXVgcTZddTLcquFuUpibI/VVE2OxyMHkd4ZtcRrDBmzxES1MXT6jIldfEmqy6Wcj4N5jr/b/HSNP66yqNl9LNMeubkfYzFD62SJX8XJ4NHE8GCTSTK7JNSJ+bqxamR2iDXEtggZyRTtsVPp04nEcpbLUe2DXL16tSURNk9X46qq1d/piQp/khW5A83ejbLYlJn9PTqz5RE2Z3s/G07hx7hsp0D1Qk/zD8WNznQYQa2ePjMQcMMvV4zVjPlimbckWRzAjPVSXVg3kx
JpzpcvV4zKVGWnm8xlChbLKBVJ8qwW8lDGkezd03bHTyfDkzb6aWDsRohNR4+NA+O7V35NtEaY/svn6fg2VeeTXxwc/uv27zAnvqrg80acPLdkufba9YcPibmjjjdrLEM/ppZu/v9+0BG7MRq4yGNmG/wy0IetGGiqkMpJoxVDNVQ+v2PSw2UqBaPd2o9wb7grjKgTADUTZrm5LEFqxWBpdYVZNvjo8CQH9SoAvXDdjeqyBDBasmAFXaIvNPaJ1yiRRZfzzyspYQ017dvtxooCBSoGyJOEKxaVgcUbNr5/ZMGSgCU5XSbPnUwwa1Lvl/+yMMVnLV+v/waBlkSJQs9o/XAFRlmN66+CFdbBq4COuP6qwauD3A2tJwAOEvQ7AG3BK79gYMdPL5f32vgegBX9XdrA06dJYVpf7dBHtJuJDPa70tHBkeqo8+xOYPaUV/An9w/aOB6AGfzbRpFtLAydfQhpVLx+Z/L7RI0jH2sqs+XyrAYrNGLXLRlsRQcTYCJZvdHwe77omZu/kiqG8GCYWeNC2aZkMpqXHxdcz4eUZ6egFqA+W4T0D5XRBx32r8HNyTh/eapJPY09CGTdD3LCdKmcZtFOe2dq6fxzmncwpx5E03jbCld/iwOaaa7u4fPGrkeyGG+7J4EQEMyuT6kkm7jaFUsdxkPD3/VIPYB0eVBdLEAROYiyAcR0koaur6WE65NDqh/Jlv11G7jCNzGegRkGoIIyDTOVstlGrrqb0qSwrJgzx6KoCbDg42ymGR/rGw6rUecHCubbPFMi/IkRNkYC+1zsCibCKkWZV2WOClRHklrk8NF2e44QoIoawZzUqI8mgrbQ0XZ4WIuBaIMWcXPJJ59/hYkwWIiWxMPzUPxKGLBQqCqMp/KxainXgdUWU3jj68aLpg34dhcpi5GBgRMmDkhBS9IG37RQIn0ii1xt8HENsmSDxMkBr9t4jy6DHT3ADEd6PIZ8ELAWAKmfMBgntmPMH66XMXBu4YLZiFx25wPawQRTB/7lKzD5SP5gRorkGrE6nBaFUuUeSsFK5gx9uX6Vi+R9JnCTOhrsPZ8QyyRsA1ga9Ddb3T5lsjX4DVsUJhgmR1trZAmlyyRQwMGFh85wExTMH2p8jVYglDdOfz644vGCSqW7XTjxHbako8TJDO+XW8TbDRSPfx3gRFUp1OQx/j77Q0diIILWsEC6+6Yy7jWzkg3wL5l8hXKng/xFbmRMjbrNZHEjR9+RSZ8/8LAqQnrrHlDcxF+6JXLlrbQNpdV6xvcZfom6Pr8Rr4eKzKWn59rIsgg3W3CuKgTNrZtFJxgSS1P8rheFWjxVuw1ImJiGg+f/tAWrscUZoCMNIGB85UZOF+egeuzCGj5DeN3aVwZ9Ol8YAtIf+pDmEUEmzDbK7AHGj8sMn4jWes2+A5yXtkRab+tZMZ8zxFdxRCGa18h37Bcp/y3eTbXtq9qn5pHm+G2i3hW20UkWujKxPDZ94YmuPqmb3NFkD47SRdvwqF6lMU1Re2JCvzKifOLnjj7wWg4fGwgAtIRUcxSgJTY+lnHBjJjA1ag1Jgex1IrCEpTfdTRWNIycOsRXdMj5htTeu6RoQjCnnlFgnKEXMdyDXBey73afegxL0XFJCixTazWPfW6N5Jt7g/XPYz91iM6dY8qDK993tXRtboImALMzq9AzTD0NW/C5+AlSjPaBce4jVPdQqynu+IgTpJErRaVtd6oGuhoizlYTm/DBLL0SuXts2Hjap4u7G1tHKe9E7c/0k7crsN3uZHdiZs1Q68Zxt+WUa43Fdhj+6we3aPUZRsz3asXvf+m+yf2Qc7l+xWIZi22obR84GDa8R9BlDySuUCD1w0e4rt9+QYS9SlQ1G/PxHozj/F4I80FBF/kokgvdj9+9dT1r5zdH2+DbOcKgU8Pdm+Q7QpOU1tlbbmIvCDr5ufd9/T+t/88/e354eZrtP6U/vN/Lk0Wdmp3XbaCFL//7q26WfruvXzXy5Wveuicm82w+J2nu7uOmVbrEd1dx/grHt91zOLJQzZbDKViFkznv77Vay+9PAsLdkASlDkp8uf3wKn7pQxvS8mj2ZlS+ub9om5XD7SynR1J2xX5/PaYzwnossc22FbgQHvs8Vc82h7bGOxcoawL5Nv/3zyns/zP+PEv69mPhff5xx+Pl1W0UK+F0xkNvcyxjc7a1UyMJ1NLpQZZG13e6O6ZnY4yyBO0qCD57OS+utwqdn+LCm5FnYdrfwujIHh/uV78w/tp/Uz+D8V/EVrUvxED9mtYVGA+BRLe4uBisDWaMEvMEVhU83SLKsZToov7K9rNk6wjZwo7WYI2jTxD3kN7BpGoKfiJOUf8FT2XczQO6E/Oeaa+yZ3qaDtK3mZpmte/XrQNSuch/cZ/AQ== \ No newline at end of file diff --git a/docs_raw/source/selecting_an_algorithm.rst b/docs_raw/source/selecting_an_algorithm.rst index e4027de..d039cc4 100644 --- a/docs_raw/source/selecting_an_algorithm.rst +++ b/docs_raw/source/selecting_an_algorithm.rst @@ -214,6 +214,16 @@ The algorithms are ordered by their release date in descending order. and therefore it is able to use a replay buffer in order to improve sample efficiency. +
                                      + + TD3 +
+ Very similar to DDPG: an actor-critic for continuous action spaces that uses a replay buffer to + improve sample efficiency. TD3 uses two critic networks to mitigate overestimation of the Q + state-action values, delays the actor updates to increase stability, and adds clipped noise to the + actions used for training the critic, to smooth out the critic's predictions.
                                      +
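The critic target that this entry alludes to (clipped double-Q with target policy smoothing) can be written out as a short NumPy sketch; the function below is illustrative only and does not use Coach's API.

```python
import numpy as np

def td3_critic_targets(rewards, game_overs, next_states, target_actor, target_critics,
                       discount=0.99, policy_noise=0.2, noise_clip=0.5,
                       action_low=-1.0, action_high=1.0):
    """Illustrative TD3 targets: y = r + (1 - done) * gamma * min(Q1', Q2')."""
    next_actions = target_actor(next_states)
    # target policy smoothing: add clipped Gaussian noise, then clip to the action space
    noise = np.clip(np.random.normal(0.0, policy_noise, next_actions.shape),
                    -noise_clip, noise_clip)
    next_actions = np.clip(next_actions + noise, action_low, action_high)
    # clipped double-Q: take the minimum of the two target critics' predictions
    q_next = np.minimum(target_critics[0](next_states, next_actions),
                        target_critics[1](next_states, next_actions))
    return rewards + (1.0 - game_overs) * discount * q_next
```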
                                      PPO diff --git a/img/algorithms.png b/img/algorithms.png deleted file mode 100644 index b3310c0..0000000 Binary files a/img/algorithms.png and /dev/null differ diff --git a/rl_coach/agents/ddpg_agent.py b/rl_coach/agents/ddpg_agent.py index 0288f00..be58319 100644 --- a/rl_coach/agents/ddpg_agent.py +++ b/rl_coach/agents/ddpg_agent.py @@ -41,14 +41,15 @@ class DDPGCriticNetworkParameters(NetworkParameters): self.middleware_parameters = FCMiddlewareParameters() self.heads_parameters = [DDPGVHeadParameters()] self.optimizer_type = 'Adam' - self.adam_optimizer_beta2 = 0.999 - self.optimizer_epsilon = 1e-8 self.batch_size = 64 self.async_training = False self.learning_rate = 0.001 + self.adam_optimizer_beta2 = 0.999 + self.optimizer_epsilon = 1e-8 self.create_target_network = True self.shared_optimizer = True self.scale_down_gradients_by_number_of_workers_for_sync_training = False + # self.l2_regularization = 1e-2 class DDPGActorNetworkParameters(NetworkParameters): @@ -58,9 +59,9 @@ class DDPGActorNetworkParameters(NetworkParameters): self.middleware_parameters = FCMiddlewareParameters(batchnorm=True) self.heads_parameters = [DDPGActorHeadParameters()] self.optimizer_type = 'Adam' + self.batch_size = 64 self.adam_optimizer_beta2 = 0.999 self.optimizer_epsilon = 1e-8 - self.batch_size = 64 self.async_training = False self.learning_rate = 0.0001 self.create_target_network = True @@ -217,4 +218,4 @@ class DDPGAgent(ActorCriticAgent): action_info = ActionInfo(action=action, action_value=q_value) - return action_info + return action_info \ No newline at end of file diff --git a/rl_coach/agents/ddqn_bcq_agent.py b/rl_coach/agents/ddqn_bcq_agent.py index ee9fb39..1e4237c 100644 --- a/rl_coach/agents/ddqn_bcq_agent.py +++ b/rl_coach/agents/ddqn_bcq_agent.py @@ -90,7 +90,7 @@ class DDQNBCQAgent(DQNAgent): if self.ap.algorithm.action_drop_method_parameters.use_state_embedding_instead_of_state: return self.networks['reward_model'].online_network.predict( states, - outputs=[self.networks['reward_model'].online_network.state_embedding]) + outputs=[self.networks['reward_model'].online_network.state_embedding[0]]) else: return states['observation'] self.embedding = to_embedding @@ -189,7 +189,7 @@ class DDQNBCQAgent(DQNAgent): if self.ap.algorithm.action_drop_method_parameters.use_state_embedding_instead_of_state: self.knn_trees = [AnnoyDictionary( dict_size=knn_size, - key_width=int(self.networks['reward_model'].online_network.state_embedding.shape[-1]), + key_width=int(self.networks['reward_model'].online_network.state_embedding[0].shape[-1]), batch_size=knn_size) for _ in range(len(self.spaces.action.actions))] else: diff --git a/rl_coach/agents/nec_agent.py b/rl_coach/agents/nec_agent.py index 3e381b3..a184001 100644 --- a/rl_coach/agents/nec_agent.py +++ b/rl_coach/agents/nec_agent.py @@ -194,7 +194,7 @@ class NECAgent(ValueOptimizationAgent): ) if self.phase != RunPhase.TEST: # store the state embedding for inserting it to the DND later - self.current_episode_state_embeddings.append(embedding.squeeze()) + self.current_episode_state_embeddings.append(embedding[0].squeeze()) actions_q_values = actions_q_values[0][0] return actions_q_values diff --git a/rl_coach/agents/td3_agent.py b/rl_coach/agents/td3_agent.py new file mode 100644 index 0000000..44dbf3a --- /dev/null +++ b/rl_coach/agents/td3_agent.py @@ -0,0 +1,223 @@ +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except 
in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import copy +from typing import Union +from collections import OrderedDict + +import numpy as np + +from rl_coach.agents.agent import Agent +from rl_coach.agents.ddpg_agent import DDPGAgent +from rl_coach.architectures.embedder_parameters import InputEmbedderParameters +from rl_coach.architectures.head_parameters import DDPGActorHeadParameters, TD3VHeadParameters +from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters +from rl_coach.base_parameters import NetworkParameters, AlgorithmParameters, \ + AgentParameters, EmbedderScheme +from rl_coach.core_types import ActionInfo, TrainingSteps, Transition +from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters +from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters +from rl_coach.spaces import BoxActionSpace, GoalsSpace + + +class TD3CriticNetworkParameters(NetworkParameters): + def __init__(self, num_q_networks): + super().__init__() + self.input_embedders_parameters = {'observation': InputEmbedderParameters(), + 'action': InputEmbedderParameters(scheme=EmbedderScheme.Shallow)} + self.middleware_parameters = FCMiddlewareParameters(num_streams=num_q_networks) + self.heads_parameters = [TD3VHeadParameters()] + self.optimizer_type = 'Adam' + self.adam_optimizer_beta2 = 0.999 + self.optimizer_epsilon = 1e-8 + self.batch_size = 100 + self.async_training = False + self.learning_rate = 0.001 + self.create_target_network = True + self.shared_optimizer = True + self.scale_down_gradients_by_number_of_workers_for_sync_training = False + + +class TD3ActorNetworkParameters(NetworkParameters): + def __init__(self): + super().__init__() + self.input_embedders_parameters = {'observation': InputEmbedderParameters()} + self.middleware_parameters = FCMiddlewareParameters() + self.heads_parameters = [DDPGActorHeadParameters(batchnorm=False)] + self.optimizer_type = 'Adam' + self.adam_optimizer_beta2 = 0.999 + self.optimizer_epsilon = 1e-8 + self.batch_size = 100 + self.async_training = False + self.learning_rate = 0.001 + self.create_target_network = True + self.shared_optimizer = True + self.scale_down_gradients_by_number_of_workers_for_sync_training = False + + +class TD3AlgorithmParameters(AlgorithmParameters): + """ + :param num_steps_between_copying_online_weights_to_target: (StepMethod) + The number of steps between copying the online network weights to the target network weights. + + :param rate_for_copying_weights_to_target: (float) + When copying the online network weights to the target network weights, a soft update will be used, which + weight the new online network weights by rate_for_copying_weights_to_target + + :param num_consecutive_playing_steps: (StepMethod) + The number of consecutive steps to act between every two training iterations + + :param use_target_network_for_evaluation: (bool) + If set to True, the target network will be used for predicting the actions when choosing actions to act. + Since the target network weights change more slowly, the predicted actions will be more consistent. 
+ + :param action_penalty: (float) + The amount by which to penalize the network on high action feature (pre-activation) values. + This can prevent the actions features from saturating the TanH activation function, and therefore prevent the + gradients from becoming very low. + + :param clip_critic_targets: (Tuple[float, float] or None) + The range to clip the critic target to in order to prevent overestimation of the action values. + + :param use_non_zero_discount_for_terminal_states: (bool) + If set to True, the discount factor will be used for terminal states to bootstrap the next predicted state + values. If set to False, the terminal states reward will be taken as the target return for the network. + """ + def __init__(self): + super().__init__() + self.rate_for_copying_weights_to_target = 0.005 + self.use_target_network_for_evaluation = False + self.action_penalty = 0 + self.clip_critic_targets = None # expected to be a tuple of the form (min_clip_value, max_clip_value) or None + self.use_non_zero_discount_for_terminal_states = False + self.act_for_full_episodes = True + self.update_policy_every_x_episode_steps = 2 + self.num_steps_between_copying_online_weights_to_target = TrainingSteps(self.update_policy_every_x_episode_steps) + self.policy_noise = 0.2 + self.noise_clipping = 0.5 + self.num_q_networks = 2 + + +class TD3AgentExplorationParameters(AdditiveNoiseParameters): + def __init__(self): + super().__init__() + self.noise_as_percentage_from_action_space = False + + +class TD3AgentParameters(AgentParameters): + def __init__(self): + td3_algorithm_params = TD3AlgorithmParameters() + super().__init__(algorithm=td3_algorithm_params, + exploration=TD3AgentExplorationParameters(), + memory=EpisodicExperienceReplayParameters(), + networks=OrderedDict([("actor", TD3ActorNetworkParameters()), + ("critic", + TD3CriticNetworkParameters(td3_algorithm_params.num_q_networks))])) + + @property + def path(self): + return 'rl_coach.agents.td3_agent:TD3Agent' + + +# Twin Delayed DDPG - https://arxiv.org/pdf/1802.09477.pdf +class TD3Agent(DDPGAgent): + def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None): + super().__init__(agent_parameters, parent) + + self.q_values = self.register_signal("Q") + self.TD_targets_signal = self.register_signal("TD targets") + self.action_signal = self.register_signal("actions") + + def learn_from_batch(self, batch): + actor = self.networks['actor'] + critic = self.networks['critic'] + + actor_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys() + critic_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys() + + # TD error = r + discount*max(q_st_plus_1) - q_st + next_actions, actions_mean = actor.parallel_prediction([ + (actor.target_network, batch.next_states(actor_keys)), + (actor.online_network, batch.states(actor_keys)) + ]) + + # add noise to the next_actions + noise = np.random.normal(0, self.ap.algorithm.policy_noise, next_actions.shape).clip( + -self.ap.algorithm.noise_clipping, self.ap.algorithm.noise_clipping) + next_actions = self.spaces.action.clip_action_to_space(next_actions + noise) + + critic_inputs = copy.copy(batch.next_states(critic_keys)) + critic_inputs['action'] = next_actions + q_st_plus_1 = critic.target_network.predict(critic_inputs)[2] # output #2 is the min (Q1, Q2) + + # calculate the bootstrapped TD targets while discounting terminal states according to + # use_non_zero_discount_for_terminal_states + if 
self.ap.algorithm.use_non_zero_discount_for_terminal_states: + TD_targets = batch.rewards(expand_dims=True) + self.ap.algorithm.discount * q_st_plus_1 + else: + TD_targets = batch.rewards(expand_dims=True) + \ + (1.0 - batch.game_overs(expand_dims=True)) * self.ap.algorithm.discount * q_st_plus_1 + + # clip the TD targets to prevent overestimation errors + if self.ap.algorithm.clip_critic_targets: + TD_targets = np.clip(TD_targets, *self.ap.algorithm.clip_critic_targets) + + self.TD_targets_signal.add_sample(TD_targets) + + # train the critic + critic_inputs = copy.copy(batch.states(critic_keys)) + critic_inputs['action'] = batch.actions(len(batch.actions().shape) == 1) + result = critic.train_and_sync_networks(critic_inputs, TD_targets) + total_loss, losses, unclipped_grads = result[:3] + + if self.training_iteration % self.ap.algorithm.update_policy_every_x_episode_steps == 0: + # get the gradients of output #3 (=mean of Q1 network) w.r.t the action + critic_inputs = copy.copy(batch.states(critic_keys)) + critic_inputs['action'] = actions_mean + action_gradients = critic.online_network.predict(critic_inputs, + outputs=critic.online_network.gradients_wrt_inputs[3]['action']) + + # apply the gradients from the critic to the actor + initial_feed_dict = {actor.online_network.gradients_weights_ph[0]: -action_gradients} + gradients = actor.online_network.predict(batch.states(actor_keys), + outputs=actor.online_network.weighted_gradients[0], + initial_feed_dict=initial_feed_dict) + + if actor.has_global: + actor.apply_gradients_to_global_network(gradients) + actor.update_online_network() + else: + actor.apply_gradients_to_online_network(gradients) + + return total_loss, losses, unclipped_grads + + def train(self): + self.ap.algorithm.num_consecutive_training_steps = self.current_episode_steps_counter + return Agent.train(self) + + def update_transition_before_adding_to_replay_buffer(self, transition: Transition) -> Transition: + """ + Allows agents to update the transition just before adding it to the replay buffer. + Can be useful for agents that want to tweak the reward, termination signal, etc. 
+ + :param transition: the transition to update + :return: the updated transition + """ + transition.game_over = False if self.current_episode_steps_counter ==\ + self.parent_level_manager.environment.env._max_episode_steps\ + else transition.game_over + + return transition \ No newline at end of file diff --git a/rl_coach/architectures/head_parameters.py b/rl_coach/architectures/head_parameters.py index 8879647..981251b 100644 --- a/rl_coach/architectures/head_parameters.py +++ b/rl_coach/architectures/head_parameters.py @@ -221,3 +221,14 @@ class SACQHeadParameters(HeadParameters): super().__init__(parameterized_class_name='SACQHead', activation_function=activation_function, name=name, dense_layer=dense_layer) self.network_layers_sizes = layers_sizes + + +class TD3VHeadParameters(HeadParameters): + def __init__(self, activation_function: str ='relu', name: str='td3_v_head_params', + num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0, + loss_weight: float = 1.0, dense_layer=None, initializer='xavier'): + super().__init__(parameterized_class_name="TD3VHead", activation_function=activation_function, name=name, + dense_layer=dense_layer, num_output_head_copies=num_output_head_copies, + rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor, + loss_weight=loss_weight) + self.initializer = initializer \ No newline at end of file diff --git a/rl_coach/architectures/middleware_parameters.py b/rl_coach/architectures/middleware_parameters.py index 73bb4bd..0bdbe09 100644 --- a/rl_coach/architectures/middleware_parameters.py +++ b/rl_coach/architectures/middleware_parameters.py @@ -41,10 +41,11 @@ class FCMiddlewareParameters(MiddlewareParameters): def __init__(self, activation_function='relu', scheme: Union[List, MiddlewareScheme] = MiddlewareScheme.Medium, batchnorm: bool = False, dropout_rate: float = 0.0, - name="middleware_fc_embedder", dense_layer=None, is_training=False): + name="middleware_fc_embedder", dense_layer=None, is_training=False, num_streams=1): super().__init__(parameterized_class_name="FCMiddleware", activation_function=activation_function, scheme=scheme, batchnorm=batchnorm, dropout_rate=dropout_rate, name=name, dense_layer=dense_layer, is_training=is_training) + self.num_streams = num_streams class LSTMMiddlewareParameters(MiddlewareParameters): diff --git a/rl_coach/architectures/tensorflow_components/architecture.py b/rl_coach/architectures/tensorflow_components/architecture.py index 648381f..9075936 100644 --- a/rl_coach/architectures/tensorflow_components/architecture.py +++ b/rl_coach/architectures/tensorflow_components/architecture.py @@ -203,7 +203,6 @@ class TensorFlowArchitecture(Architecture): self._create_gradient_accumulators() # gradients of the outputs w.r.t. 
the inputs - # at the moment, this is only used by ddpg self.gradients_wrt_inputs = [{name: tf.gradients(output, input_ph) for name, input_ph in self.inputs.items()} for output in self.outputs] self.gradients_weights_ph = [tf.placeholder('float32', self.outputs[i].shape, 'output_gradient_weights') diff --git a/rl_coach/architectures/tensorflow_components/heads/__init__.py b/rl_coach/architectures/tensorflow_components/heads/__init__.py index 91c1b79..03c237a 100644 --- a/rl_coach/architectures/tensorflow_components/heads/__init__.py +++ b/rl_coach/architectures/tensorflow_components/heads/__init__.py @@ -16,6 +16,7 @@ from .sac_head import SACPolicyHead from .sac_q_head import SACQHead from .classification_head import ClassificationHead from .cil_head import RegressionHead +from .td3_v_head import TD3VHead from .ddpg_v_head import DDPGVHead __all__ = [ @@ -37,5 +38,6 @@ __all__ = [ 'SACQHead', 'ClassificationHead', 'RegressionHead', + 'TD3VHead' 'DDPGVHead' ] diff --git a/rl_coach/architectures/tensorflow_components/heads/head.py b/rl_coach/architectures/tensorflow_components/heads/head.py index d898362..d997442 100644 --- a/rl_coach/architectures/tensorflow_components/heads/head.py +++ b/rl_coach/architectures/tensorflow_components/heads/head.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from typing import Type import numpy as np import tensorflow as tf @@ -22,7 +21,7 @@ from rl_coach.architectures.tensorflow_components.layers import Dense, convert_l from rl_coach.base_parameters import AgentParameters from rl_coach.spaces import SpacesDefinition from rl_coach.utils import force_list - +from rl_coach.architectures.tensorflow_components.utils import squeeze_tensor # Used to initialize weights for policy and value output layers def normalized_columns_initializer(std=1.0): @@ -72,8 +71,9 @@ class Head(object): :param input_layer: the input to the graph :return: the output of the last layer and the target placeholder """ + with tf.variable_scope(self.get_name(), initializer=tf.contrib.layers.xavier_initializer()): - self._build_module(input_layer) + self._build_module(squeeze_tensor(input_layer)) self.output = force_list(self.output) self.target = force_list(self.target) diff --git a/rl_coach/architectures/tensorflow_components/heads/td3_v_head.py b/rl_coach/architectures/tensorflow_components/heads/td3_v_head.py new file mode 100644 index 0000000..86457ec --- /dev/null +++ b/rl_coach/architectures/tensorflow_components/heads/td3_v_head.py @@ -0,0 +1,67 @@ +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import tensorflow as tf + +from rl_coach.architectures.tensorflow_components.layers import Dense +from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer +from rl_coach.base_parameters import AgentParameters +from rl_coach.core_types import VStateValue +from rl_coach.spaces import SpacesDefinition + + +class TD3VHead(Head): + def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str, + head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu', + dense_layer=Dense, initializer='xavier'): + super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function, + dense_layer=dense_layer) + self.name = 'td3_v_values_head' + self.return_type = VStateValue + self.loss_type = [] + self.initializer = initializer + self.loss = [] + self.output = [] + + def _build_module(self, input_layer): + # Standard V Network + q_outputs = [] + self.target = tf.placeholder(tf.float32, shape=(None, 1), name="q_networks_min_placeholder") + + for i in range(input_layer.shape[0]): # assuming that the actual size is 2, as there are two critic networks + if self.initializer == 'normalized_columns': + q_outputs.append(self.dense_layer(1)(input_layer[i], name='q_output_{}'.format(i + 1), + kernel_initializer=normalized_columns_initializer(1.0))) + elif self.initializer == 'xavier' or self.initializer is None: + q_outputs.append(self.dense_layer(1)(input_layer[i], name='q_output_{}'.format(i + 1))) + + self.output.append(q_outputs[i]) + self.loss.append(tf.reduce_mean((self.target-q_outputs[i])**2)) + + self.output.append(tf.reduce_min(q_outputs, axis=0)) + self.output.append(tf.reduce_mean(self.output[0])) + self.loss = sum(self.loss) + tf.losses.add_loss(self.loss) + + def __str__(self): + result = [ + "Q1 Action-Value Stream", + "\tDense (num outputs = 1)", + "Q2 Action-Value Stream", + "\tDense (num outputs = 1)", + "Min (Q1, Q2)" + ] + return '\n'.join(result) diff --git a/rl_coach/architectures/tensorflow_components/middlewares/fc_middleware.py b/rl_coach/architectures/tensorflow_components/middlewares/fc_middleware.py index 816674a..61d340b 100644 --- a/rl_coach/architectures/tensorflow_components/middlewares/fc_middleware.py +++ b/rl_coach/architectures/tensorflow_components/middlewares/fc_middleware.py @@ -28,23 +28,28 @@ class FCMiddleware(Middleware): def __init__(self, activation_function=tf.nn.relu, scheme: MiddlewareScheme = MiddlewareScheme.Medium, batchnorm: bool = False, dropout_rate: float = 0.0, - name="middleware_fc_embedder", dense_layer=Dense, is_training=False): + name="middleware_fc_embedder", dense_layer=Dense, is_training=False, num_streams: int = 1): super().__init__(activation_function=activation_function, batchnorm=batchnorm, dropout_rate=dropout_rate, scheme=scheme, name=name, dense_layer=dense_layer, is_training=is_training) self.return_type = Middleware_FC_Embedding - self.layers = [] + + assert(isinstance(num_streams, int) and num_streams >= 1) + self.num_streams = num_streams def _build_module(self): - self.layers.append(self.input) + self.output = [] - for idx, layer_params in enumerate(self.layers_params): - self.layers.extend(force_list( - layer_params(self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx), - is_training=self.is_training) - )) + for stream_idx in range(self.num_streams): + layers = [self.input] - self.output = self.layers[-1] + for idx, layer_params in 
enumerate(self.layers_params): + layers.extend(force_list( + layer_params(layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, + idx + stream_idx * len(self.layers_params)), + is_training=self.is_training) + )) + self.output.append((layers[-1])) @property def schemes(self): @@ -72,3 +77,15 @@ class FCMiddleware(Middleware): ] } + def __str__(self): + stream = [str(l) for l in self.layers_params] + if self.layers_params: + if self.num_streams > 1: + stream = [''] + ['\t' + l for l in stream] + result = stream * self.num_streams + result[0::len(stream)] = ['Stream {}'.format(i) for i in range(self.num_streams)] + else: + result = stream + return '\n'.join(result) + else: + return 'No layers' diff --git a/rl_coach/architectures/tensorflow_components/utils.py b/rl_coach/architectures/tensorflow_components/utils.py index 749a0ab..45f6d01 100644 --- a/rl_coach/architectures/tensorflow_components/utils.py +++ b/rl_coach/architectures/tensorflow_components/utils.py @@ -38,3 +38,10 @@ def get_activation_function(activation_function_string: str): "Activation function must be one of the following {}. instead it was: {}" \ .format(activation_functions.keys(), activation_function_string) return activation_functions[activation_function_string] + + +def squeeze_tensor(tensor): + if tensor.shape[0] == 1: + return tensor[0] + else: + return tensor \ No newline at end of file diff --git a/rl_coach/exploration_policies/additive_noise.py b/rl_coach/exploration_policies/additive_noise.py index e6ccbad..8194718 100644 --- a/rl_coach/exploration_policies/additive_noise.py +++ b/rl_coach/exploration_policies/additive_noise.py @@ -17,7 +17,6 @@ from typing import List import numpy as np -import scipy.stats from rl_coach.core_types import RunPhase, ActionType from rl_coach.exploration_policies.exploration_policy import ContinuousActionExplorationPolicy, ExplorationParameters @@ -31,8 +30,9 @@ from rl_coach.spaces import ActionSpace, BoxActionSpace class AdditiveNoiseParameters(ExplorationParameters): def __init__(self): super().__init__() - self.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000) - self.evaluation_noise_percentage = 0.05 + self.noise_schedule = LinearSchedule(0.1, 0.1, 50000) + self.evaluation_noise = 0.05 + self.noise_as_percentage_from_action_space = True @property def path(self): @@ -48,17 +48,19 @@ class AdditiveNoise(ContinuousActionExplorationPolicy): 2. Specified by the agents action. In case the agents action is a list with 2 values, the 1st one is assumed to be the mean of the action, and 2nd is assumed to be its standard deviation. 
""" - def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule, - evaluation_noise_percentage: float): + def __init__(self, action_space: ActionSpace, noise_schedule: Schedule, + evaluation_noise: float, noise_as_percentage_from_action_space: bool = True): """ :param action_space: the action space used by the environment - :param noise_percentage_schedule: the schedule for the noise variance percentage relative to the absolute range - of the action space - :param evaluation_noise_percentage: the noise variance percentage that will be used during evaluation phases + :param noise_schedule: the schedule for the noise + :param evaluation_noise: the noise variance that will be used during evaluation phases + :param noise_as_percentage_from_action_space: a bool deciding whether the noise is absolute or as a percentage + from the action space """ super().__init__(action_space) - self.noise_percentage_schedule = noise_percentage_schedule - self.evaluation_noise_percentage = evaluation_noise_percentage + self.noise_schedule = noise_schedule + self.evaluation_noise = evaluation_noise + self.noise_as_percentage_from_action_space = noise_as_percentage_from_action_space if not isinstance(action_space, BoxActionSpace): raise ValueError("Additive noise exploration works only for continuous controls." @@ -68,19 +70,20 @@ class AdditiveNoise(ContinuousActionExplorationPolicy): or not np.all(-np.inf < action_space.low) or not np.all(action_space.low < np.inf): raise ValueError("Additive noise exploration requires bounded actions") - # TODO: allow working with unbounded actions by defining the noise in terms of range and not percentage - def get_action(self, action_values: List[ActionType]) -> ActionType: # TODO-potential-bug consider separating internally defined stdev and externally defined stdev into 2 policies - # set the current noise percentage + # set the current noise if self.phase == RunPhase.TEST: - current_noise_precentage = self.evaluation_noise_percentage + current_noise = self.evaluation_noise else: - current_noise_precentage = self.noise_percentage_schedule.current_value + current_noise = self.noise_schedule.current_value # scale the noise to the action space range - action_values_std = current_noise_precentage * (self.action_space.high - self.action_space.low) + if self.noise_as_percentage_from_action_space: + action_values_std = current_noise * (self.action_space.high - self.action_space.low) + else: + action_values_std = current_noise # extract the mean values if isinstance(action_values, list): @@ -92,15 +95,18 @@ class AdditiveNoise(ContinuousActionExplorationPolicy): # step the noise schedule if self.phase is not RunPhase.TEST: - self.noise_percentage_schedule.step() + self.noise_schedule.step() # the second element of the list is assumed to be the standard deviation if isinstance(action_values, list) and len(action_values) > 1: action_values_std = action_values[1].squeeze() # add noise to the action means - action = np.random.normal(action_values_mean, action_values_std) + if self.phase is not RunPhase.TEST: + action = np.random.normal(action_values_mean, action_values_std) + else: + action = action_values_mean - return action + return np.atleast_1d(action) def get_control_param(self): - return np.ones(self.action_space.shape)*self.noise_percentage_schedule.current_value + return np.ones(self.action_space.shape)*self.noise_schedule.current_value diff --git a/rl_coach/exploration_policies/e_greedy.py b/rl_coach/exploration_policies/e_greedy.py index 
fde73b3..b9cb885 100644 --- a/rl_coach/exploration_policies/e_greedy.py +++ b/rl_coach/exploration_policies/e_greedy.py @@ -32,7 +32,7 @@ class EGreedyParameters(ExplorationParameters): self.epsilon_schedule = LinearSchedule(0.5, 0.01, 50000) self.evaluation_epsilon = 0.05 self.continuous_exploration_policy_parameters = AdditiveNoiseParameters() - self.continuous_exploration_policy_parameters.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000) + self.continuous_exploration_policy_parameters.noise_schedule = LinearSchedule(0.1, 0.1, 50000) # for continuous control - # (see http://www.cs.ubc.ca/~van/papers/2017-TOG-deepLoco/2017-TOG-deepLoco.pdf) diff --git a/rl_coach/exploration_policies/truncated_normal.py b/rl_coach/exploration_policies/truncated_normal.py index 91848ed..7d859ee 100644 --- a/rl_coach/exploration_policies/truncated_normal.py +++ b/rl_coach/exploration_policies/truncated_normal.py @@ -28,10 +28,11 @@ from rl_coach.spaces import ActionSpace, BoxActionSpace class TruncatedNormalParameters(ExplorationParameters): def __init__(self): super().__init__() - self.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000) - self.evaluation_noise_percentage = 0.05 + self.noise_schedule = LinearSchedule(0.1, 0.1, 50000) + self.evaluation_noise = 0.05 self.clip_low = 0 self.clip_high = 1 + self.noise_as_percentage_from_action_space = True @property def path(self): @@ -49,17 +50,20 @@ class TruncatedNormal(ContinuousActionExplorationPolicy): When the sampled action is outside of the action bounds given by the user, it is sampled again and again, until it is within the bounds. """ - def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule, - evaluation_noise_percentage: float, clip_low: float, clip_high: float): + def __init__(self, action_space: ActionSpace, noise_schedule: Schedule, + evaluation_noise: float, clip_low: float, clip_high: float, + noise_as_percentage_from_action_space: bool = True): """ :param action_space: the action space used by the environment - :param noise_percentage_schedule: the schedule for the noise variance percentage relative to the absolute range - of the action space - :param evaluation_noise_percentage: the noise variance percentage that will be used during evaluation phases + :param noise_schedule: the schedule for the noise variance + :param evaluation_noise: the noise variance that will be used during evaluation phases + :param noise_as_percentage_from_action_space: whether to consider the noise as a percentage of the action space + or absolute value """ super().__init__(action_space) - self.noise_percentage_schedule = noise_percentage_schedule - self.evaluation_noise_percentage = evaluation_noise_percentage + self.noise_schedule = noise_schedule + self.evaluation_noise = evaluation_noise + self.noise_as_percentage_from_action_space = noise_as_percentage_from_action_space self.clip_low = clip_low self.clip_high = clip_high @@ -71,17 +75,21 @@ class TruncatedNormal(ContinuousActionExplorationPolicy): or not np.all(-np.inf < action_space.low) or not np.all(action_space.low < np.inf): raise ValueError("Additive noise exploration requires bounded actions") - # TODO: allow working with unbounded actions by defining the noise in terms of range and not percentage - def get_action(self, action_values: List[ActionType]) -> ActionType: - # set the current noise percentage + # set the current noise if self.phase == RunPhase.TEST: - current_noise_precentage = self.evaluation_noise_percentage + current_noise = self.evaluation_noise 
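Both exploration policies read their noise level from a schedule object such as LinearSchedule(0.1, 0.1, 50000) or ConstantSchedule(0.05). Conceptually, a linear schedule interpolates from an initial to a final value over a fixed number of steps; the toy class below illustrates that behaviour and is not Coach's Schedule implementation:

```python
class ToyLinearSchedule:
    """Illustrative linear decay from `initial` to `final` over `num_steps` calls;
    Coach's LinearSchedule follows the same idea."""
    def __init__(self, initial, final, num_steps):
        self.initial, self.final, self.num_steps = initial, final, num_steps
        self.t = 0

    @property
    def current_value(self):
        frac = min(self.t / self.num_steps, 1.0)
        return self.initial + frac * (self.final - self.initial)

    def step(self):
        self.t += 1

sched = ToyLinearSchedule(0.4, 0.05, 100000)
for _ in range(50000):
    sched.step()
print(round(sched.current_value, 3))   # halfway between 0.4 and 0.05 -> 0.225
```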
else: - current_noise_precentage = self.noise_percentage_schedule.current_value + current_noise = self.noise_schedule.current_value # scale the noise to the action space range - action_values_std = current_noise_precentage * (self.action_space.high - self.action_space.low) + if self.noise_as_percentage_from_action_space: + action_values_std = current_noise * (self.action_space.high - self.action_space.low) + else: + action_values_std = current_noise # extract the mean values if isinstance(action_values, list): @@ -93,7 +101,7 @@ class TruncatedNormal(ContinuousActionExplorationPolicy): # step the noise schedule if self.phase is not RunPhase.TEST: - self.noise_percentage_schedule.step() + self.noise_schedule.step() # the second element of the list is assumed to be the standard deviation if isinstance(action_values, list) and len(action_values) > 1: action_values_std = action_values[1].squeeze() @@ -107,4 +115,4 @@ class TruncatedNormal(ContinuousActionExplorationPolicy): return action def get_control_param(self): - return np.ones(self.action_space.shape)*self.noise_percentage_schedule.current_value + return np.ones(self.action_space.shape)*self.noise_schedule.current_value diff --git a/rl_coach/presets/CARLA_CIL.py b/rl_coach/presets/CARLA_CIL.py index a2151c2..9c7342c 100644 --- a/rl_coach/presets/CARLA_CIL.py +++ b/rl_coach/presets/CARLA_CIL.py @@ -123,8 +123,8 @@ agent_params.input_filter.add_observation_filter( # no exploration is used agent_params.exploration = AdditiveNoiseParameters() -agent_params.exploration.noise_percentage_schedule = ConstantSchedule(0) -agent_params.exploration.evaluation_noise_percentage = 0 +agent_params.exploration.noise_schedule = ConstantSchedule(0) +agent_params.exploration.evaluation_noise = 0 # no playing during the training phase agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(0) diff --git a/rl_coach/presets/CartPole_Dueling_DDQN.py b/rl_coach/presets/CartPole_Dueling_DDQN.py index 861fdc5..97fd040 100644 --- a/rl_coach/presets/CartPole_Dueling_DDQN.py +++ b/rl_coach/presets/CartPole_Dueling_DDQN.py @@ -53,7 +53,7 @@ env_params = GymVectorEnvironment(level='CartPole-v0') preset_validation_params = PresetValidationParameters() preset_validation_params.test = True preset_validation_params.min_reward_threshold = 150 -preset_validation_params.max_episodes_to_achieve_reward = 250 +preset_validation_params.max_episodes_to_achieve_reward = 300 graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params, schedule_params=schedule_params, vis_params=VisualizationParameters(), diff --git a/rl_coach/presets/Fetch_DDPG_HER_baselines.py b/rl_coach/presets/Fetch_DDPG_HER_baselines.py index c2f17d7..24579ff 100644 --- a/rl_coach/presets/Fetch_DDPG_HER_baselines.py +++ b/rl_coach/presets/Fetch_DDPG_HER_baselines.py @@ -87,9 +87,9 @@ agent_params.memory.shared_memory = True agent_params.exploration = EGreedyParameters() agent_params.exploration.epsilon_schedule = ConstantSchedule(0.3) agent_params.exploration.evaluation_epsilon = 0 -# they actually take the noise_percentage_schedule to be 0.2 * max_abs_range which is 0.1 * total_range +# they actually take the noise_schedule to 
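TruncatedNormal draws exploration noise from a Gaussian constrained to [clip_low, clip_high]; per the class docstring, out-of-bounds draws are rejected and re-sampled until they land inside the bounds. A rough equivalent using scipy's truncnorm, shown only as an illustration of the sampling behaviour rather than the Coach implementation:

```python
import numpy as np
from scipy.stats import truncnorm

def truncated_normal_action(mean, std, clip_low, clip_high, rng=None):
    """Sample an action from a normal distribution truncated to [clip_low, clip_high];
    truncnorm expects the bounds in units of standard deviations from the mean."""
    a = (clip_low - mean) / std
    b = (clip_high - mean) / std
    return truncnorm.rvs(a, b, loc=mean, scale=std, random_state=rng)

print(truncated_normal_action(mean=0.6, std=0.2, clip_low=0.0, clip_high=1.0))
```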
be 0.2 * max_abs_range which is 0.1 * total_range +agent_params.exploration.continuous_exploration_policy_parameters.noise_schedule = ConstantSchedule(0.1) +agent_params.exploration.continuous_exploration_policy_parameters.evaluation_noise = 0 agent_params.input_filter = InputFilter() agent_params.input_filter.add_observation_filter('observation', 'clipping', ObservationClippingFilter(-200, 200)) diff --git a/rl_coach/presets/Mujoco_DDPG.py b/rl_coach/presets/Mujoco_DDPG.py index 03a95a8..4f317fb 100644 --- a/rl_coach/presets/Mujoco_DDPG.py +++ b/rl_coach/presets/Mujoco_DDPG.py @@ -15,7 +15,7 @@ schedule_params = ScheduleParameters() schedule_params.improve_steps = EnvironmentSteps(2000000) schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20) schedule_params.evaluation_steps = EnvironmentEpisodes(1) -schedule_params.heatup_steps = EnvironmentSteps(1000) +schedule_params.heatup_steps = EnvironmentSteps(10000) ######### # Agent # @@ -38,7 +38,7 @@ env_params = GymVectorEnvironment(level=SingleLevelSelection(mujoco_v2)) preset_validation_params = PresetValidationParameters() preset_validation_params.test = True preset_validation_params.min_reward_threshold = 400 -preset_validation_params.max_episodes_to_achieve_reward = 1000 +preset_validation_params.max_episodes_to_achieve_reward = 3000 preset_validation_params.reward_test_level = 'inverted_pendulum' preset_validation_params.trace_test_levels = ['inverted_pendulum', 'hopper'] diff --git a/rl_coach/presets/Mujoco_TD3.py b/rl_coach/presets/Mujoco_TD3.py new file mode 100644 index 0000000..613b436 --- /dev/null +++ b/rl_coach/presets/Mujoco_TD3.py @@ -0,0 +1,49 @@ +from rl_coach.agents.td3_agent import TD3AgentParameters +from rl_coach.architectures.layers import Dense +from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters, EmbedderScheme +from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps +from rl_coach.environments.environment import SingleLevelSelection +from rl_coach.environments.gym_environment import GymVectorEnvironment, mujoco_v2 +from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager +from rl_coach.graph_managers.graph_manager import ScheduleParameters + +#################### +# Graph Scheduling # +#################### + +schedule_params = ScheduleParameters() +schedule_params.improve_steps = EnvironmentSteps(1000000) +schedule_params.steps_between_evaluation_periods = EnvironmentSteps(5000) +schedule_params.evaluation_steps = EnvironmentEpisodes(10) +schedule_params.heatup_steps = EnvironmentSteps(10000) + +######### +# Agent # +######### +agent_params = TD3AgentParameters() +agent_params.network_wrappers['actor'].input_embedders_parameters['observation'].scheme = [Dense(400)] +agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense(300)] + +agent_params.network_wrappers['critic'].input_embedders_parameters['observation'].scheme = EmbedderScheme.Empty +agent_params.network_wrappers['critic'].input_embedders_parameters['action'].scheme = EmbedderScheme.Empty +agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense(400), Dense(300)] + + +############### +# Environment # +############### +env_params = GymVectorEnvironment(level=SingleLevelSelection(mujoco_v2)) + +######## +# Test # +######## +preset_validation_params = PresetValidationParameters() +preset_validation_params.test = True +preset_validation_params.min_reward_threshold = 500 +preset_validation_params.max_episodes_to_achieve_reward = 
1100 +preset_validation_params.reward_test_level = 'hopper' +preset_validation_params.trace_test_levels = ['inverted_pendulum', 'hopper'] + +graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params, + schedule_params=schedule_params, vis_params=VisualizationParameters(), + preset_validation_params=preset_validation_params) diff --git a/rl_coach/presets/Starcraft_CollectMinerals_A3C.py b/rl_coach/presets/Starcraft_CollectMinerals_A3C.py index 6819ef5..9bc1401 100644 --- a/rl_coach/presets/Starcraft_CollectMinerals_A3C.py +++ b/rl_coach/presets/Starcraft_CollectMinerals_A3C.py @@ -37,9 +37,9 @@ agent_params.network_wrappers['main'].input_embedders_parameters = { } agent_params.exploration = AdditiveNoiseParameters() -agent_params.exploration.noise_percentage_schedule = ConstantSchedule(0.05) -# agent_params.exploration.noise_percentage_schedule = LinearSchedule(0.4, 0.05, 100000) -agent_params.exploration.evaluation_noise_percentage = 0.05 +agent_params.exploration.noise_schedule = ConstantSchedule(0.05) +# agent_params.exploration.noise_schedule = LinearSchedule(0.4, 0.05, 100000) +agent_params.exploration.evaluation_noise = 0.05 agent_params.network_wrappers['main'].batch_size = 64 agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5 diff --git a/rl_coach/run_multiple_seeds.py b/rl_coach/run_multiple_seeds.py index b8ddc5e..6a6ddde 100644 --- a/rl_coach/run_multiple_seeds.py +++ b/rl_coach/run_multiple_seeds.py @@ -53,10 +53,14 @@ if __name__ == "__main__": "the preset name, followed by the environment level", default='', type=str) - parser.add_argument('-sd', '--level_as_sub_dir', + parser.add_argument('-lsd', '--level_as_sub_dir', help="(flag) Store each level in it's own sub directory where the root directory name matches " "the preset name", action='store_true') + parser.add_argument('-ssd', '--seed_as_sub_dir', + help="(flag) Store each seed in it's own sub directory where the root directory name matches " + "the preset name", + action='store_true') parser.add_argument('-ew', '--evaluation_worker', help="(flag) Start an additional worker that will only do evaluation", action='store_true') @@ -108,6 +112,8 @@ if __name__ == "__main__": command.append("-c") if args.evaluation_worker: command.append("-ew") + if args.seed_as_sub_dir: + seed = '' if level is not None: command.extend(['-lvl', '{}'.format(level)]) if level_as_sub_dir:
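The Mujoco_TD3 preset exposes a graph_manager like the other presets, so besides the coach CLI it can also be driven from Python. The sketch below assumes the usual Coach GraphManager workflow (select a level, create_graph with TaskParameters, then improve); the level-selection call and the experiment path are assumptions chosen for illustration:

```python
# Minimal sketch of driving the new TD3 preset from Python rather than the CLI.
# Assumes the standard GraphManager workflow; level selection and paths are illustrative.
from rl_coach.base_parameters import TaskParameters
from rl_coach.presets.Mujoco_TD3 import graph_manager

graph_manager.env_params.level.select('hopper')   # pick a MuJoCo level, as the -lvl flag does
task_parameters = TaskParameters(experiment_path='./experiments/mujoco_td3_hopper')
graph_manager.create_graph(task_parameters)       # build the networks and the environment
graph_manager.improve()                           # run heatup, training and evaluation
```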